From a86b463fc4cca75962d18da73351aff93bac4ee2 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Thu, 30 Apr 2026 12:55:20 +0000
Subject: [PATCH 1/6] p1-2: scaffold kb-parse-md crate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the workspace member with the dep allow-list pinned by design §0 Q9
and the task spec. P1-2 will land the frontmatter submodule in the next
commit; P1-3 will add the block parser as a sibling.

Notable choice: serde_yaml (dtolnay) was archived as unmaintained in 2024
so we use serde_yaml_ng, the maintained fork. lingua's per-language
features are explicitly enabled (default-features=false) to keep build
time + binary size sane — only the languages we need at parse time.
---
 Cargo.lock                    | 414 +++++++++++++++++++++++++++++++++-
 Cargo.toml                    |   1 +
 crates/kb-parse-md/Cargo.toml |  34 +++
 crates/kb-parse-md/src/lib.rs |   7 +
 4 files changed, 455 insertions(+), 1 deletion(-)
 create mode 100644 crates/kb-parse-md/Cargo.toml
 create mode 100644 crates/kb-parse-md/src/lib.rs

diff --git a/Cargo.lock b/Cargo.lock
index 48aaa06..7c34f44 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -79,6 +79,12 @@ version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
 
+[[package]]
+name = "autocfg"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
+
 [[package]]
 name = "bitflags"
 version = "2.11.1"
@@ -109,6 +115,12 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "bumpalo"
+version = "3.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
+
 [[package]]
 name = "cc"
 version = "1.2.61"
@@ -177,6 +189,15 @@ version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"
 
+[[package]]
+name = "counter"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "66e8e052be91f1c8aae2c1d81307d9f6e67f5f37001e3ddee419e971e73f03bc"
+dependencies = [
+ "num-traits",
+]
+
 [[package]]
 name = "cpufeatures"
 version = "0.3.0"
@@ -220,6 +241,20 @@ version = "0.8.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
 
+[[package]]
+name = "dashmap"
+version = "6.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
+dependencies = [
+ "cfg-if",
+ "crossbeam-utils",
+ "hashbrown 0.14.5",
+ "lock_api",
+ "once_cell",
+ "parking_lot_core",
+]
+
 [[package]]
 name = "deranged"
 version = "0.5.8"
@@ -251,6 +286,12 @@ dependencies = [
  "windows-sys 0.48.0",
 ]
 
+[[package]]
+name = "either"
+version = "1.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
+
 [[package]]
 name = "equivalent"
 version = "1.0.2"
@@ -272,6 +313,9 @@ name = "fastrand"
 version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
+dependencies = [
+ "getrandom 0.3.4",
+]
 
 [[package]]
 name = "find-msvc-tools"
@@ -285,6 +329,36 @@ version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
 
+[[package]]
+name = "fst"
+version = "0.4.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7ab85b9b05e3978cc9a9cf8fea7f01b494e1a09ed3037e16ba39edc7a29eb61a"
+
+[[package]]
+name = "futures-core"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"
+
+[[package]]
+name = "futures-task"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393"
+
+[[package]]
+name = "futures-util"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6"
+dependencies = [
+ "futures-core",
+ "futures-task",
+ "pin-project-lite",
+ "slab",
+]
+
 [[package]]
 name = "getrandom"
 version = "0.2.17"
@@ -296,6 +370,20 @@ dependencies = [
  "wasi",
 ]
 
+[[package]]
+name = "getrandom"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "libc",
+ "r-efi 5.3.0",
+ "wasip2",
+ "wasm-bindgen",
+]
+
 [[package]]
 name = "getrandom"
 version = "0.4.2"
@@ -304,7 +392,7 @@ checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
 dependencies = [
  "cfg-if",
  "libc",
- "r-efi",
+ "r-efi 6.0.0",
  "wasip2",
  "wasip3",
 ]
@@ -322,6 +410,12 @@ dependencies = [
  "regex-syntax",
 ]
 
+[[package]]
+name = "hashbrown"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+
 [[package]]
 name = "hashbrown"
 version = "0.15.5"
@@ -365,6 +459,25 @@ dependencies = [
  "winapi-util",
 ]
 
+[[package]]
+name = "include_dir"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "923d117408f1e49d914f1a379a309cffe4f18c05cf4e3d12e613a15fc81bd0dd"
+dependencies = [
+ "include_dir_macros",
+]
+
+[[package]]
+name = "include_dir_macros"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7cab85a7ed0bd5f0e76d93846e0147172bed2e2d3f859bcc33a8d9699cad1a75"
+dependencies = [
+ "proc-macro2",
+ "quote",
+]
+
 [[package]]
 name = "indexmap"
 version = "2.14.0"
@@ -383,12 +496,33 @@ version = "1.70.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
 
+[[package]]
+name = "itertools"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
+dependencies = [
+ "either",
+]
+
 [[package]]
 name = "itoa"
 version = "1.0.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
 
+[[package]]
+name = "js-sys"
+version = "0.3.97"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1840c94c045fbcf8ba2812c95db44499f7c64910a912551aaaa541decebcacf"
+dependencies = [
+ "cfg-if",
+ "futures-util",
+ "once_cell",
+ "wasm-bindgen",
+]
+
 [[package]]
 name = "kb-app"
 version = "0.1.0"
@@ -443,6 +577,21 @@ dependencies = [
  "unicode-normalization",
 ]
 
+[[package]]
+name = "kb-parse-md"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "kb-core",
+ "kb-parse-types",
+ "lingua",
+ "serde",
+ "serde_json",
+ "serde_yaml_ng",
+ "time",
+ "toml",
+]
+
 [[package]]
 name = "kb-parse-types"
 version = "0.1.0"
@@ -495,18 +644,95 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "lingua"
+version = "1.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f40d9129bb9fe42c95d1bd420d6891607eaff17df16ee15674aed2d05b0ec8f4"
+dependencies = [
+ "counter",
+ "dashmap",
+ "fastrand",
+ "fst",
+ "include_dir",
+ "itertools",
+ "lingua-chinese-language-model",
+ "lingua-english-language-model",
+ "lingua-japanese-language-model",
+ "lingua-korean-language-model",
+ "maplit",
+ "rayon",
+ "regex",
+ "serde",
+ "serde-wasm-bindgen",
+ "strum",
+ "strum_macros",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "lingua-chinese-language-model"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "21ca7fa9f7671d684c82c168725f380fc873f14d6f4e8c82f0da681bcc0048d1"
+dependencies = [
+ "include_dir",
+]
+
+[[package]]
+name = "lingua-english-language-model"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97102de08b134a49f1cce05a1b6f5bf08ef21fe858074ae2b794e7892c43dd4b"
+dependencies = [
+ "include_dir",
+]
+
+[[package]]
+name = "lingua-japanese-language-model"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df0938f75de3ae5dcdc925d823ed409854ca14f6a653782b9a1ad5d899462fbe"
+dependencies = [
+ "include_dir",
+]
+
+[[package]]
+name = "lingua-korean-language-model"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aa87f6c43ff894fc75159c021480d2fdf96882bf5bd235f8916ceb6b7caae561"
+dependencies = [
+ "include_dir",
+]
+
 [[package]]
 name = "linux-raw-sys"
 version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
 
+[[package]]
+name = "lock_api"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
+dependencies = [
+ "scopeguard",
+]
+
 [[package]]
 name = "log"
 version = "0.4.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
 
+[[package]]
+name = "maplit"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
+
 [[package]]
 name = "matchers"
 version = "0.2.0"
@@ -537,6 +763,15 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
 
+[[package]]
+name = "num-traits"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
+dependencies = [
+ "autocfg",
+]
+
 [[package]]
 name = "once_cell"
 version = "1.21.4"
@@ -555,6 +790,19 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
 
+[[package]]
+name = "parking_lot_core"
+version = "0.9.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "redox_syscall",
+ "smallvec",
+ "windows-link",
+]
+
 [[package]]
 name = "pin-project-lite"
 version = "0.2.17"
@@ -595,12 +843,47 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "r-efi"
+version = "5.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
+
 [[package]]
 name = "r-efi"
 version = "6.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
 
+[[package]]
+name = "rayon"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "redox_syscall"
+version = "0.5.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
+dependencies = [
+ "bitflags",
+]
+
 [[package]]
 name = "redox_users"
 version = "0.4.6"
@@ -612,6 +895,18 @@ dependencies = [
  "thiserror 1.0.69",
 ]
 
+[[package]]
+name = "regex"
+version = "1.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
 [[package]]
 name = "regex-automata"
 version = "0.4.14"
@@ -642,6 +937,18 @@ dependencies = [
  "windows-sys 0.61.2",
 ]
 
+[[package]]
+name = "rustversion"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
+
+[[package]]
+name = "ryu"
+version = "1.0.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f"
+
 [[package]]
 name = "ryu-js"
 version = "1.0.2"
@@ -657,6 +964,12 @@ dependencies = [
  "winapi-util",
 ]
 
+[[package]]
+name = "scopeguard"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
+
 [[package]]
 name = "semver"
 version = "1.0.28"
@@ -673,6 +986,17 @@ dependencies = [
  "serde_derive",
 ]
 
+[[package]]
+name = "serde-wasm-bindgen"
+version = "0.6.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8302e169f0eddcc139c70f139d19d6467353af16f9fce27e8c30158036a1e16b"
+dependencies = [
+ "js-sys",
+ "serde",
+ "wasm-bindgen",
+]
+
 [[package]]
 name = "serde_core"
 version = "1.0.228"
@@ -726,6 +1050,19 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "serde_yaml_ng"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b4db627b98b36d4203a7b458cf3573730f2bb591b28871d916dfa9efabfd41f"
+dependencies = [
+ "indexmap",
+ "itoa",
+ "ryu",
+ "serde",
+ "unsafe-libyaml",
+]
+
 [[package]]
 name = "sharded-slab"
 version = "0.1.7"
@@ -741,6 +1078,12 @@ version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
 
+[[package]]
+name = "slab"
+version = "0.4.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5"
+
 [[package]]
 name = "smallvec"
 version = "1.15.1"
@@ -753,6 +1096,24 @@ version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
 
+[[package]]
+name = "strum"
+version = "0.27.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf"
+
+[[package]]
+name = "strum_macros"
+version = "0.27.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "symlink"
 version = "0.1.0"
@@ -1027,6 +1388,12 @@ version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
 
+[[package]]
+name = "unsafe-libyaml"
+version = "0.2.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861"
+
 [[package]]
 name = "utf8parse"
 version = "0.2.2"
@@ -1073,6 +1440,51 @@ dependencies = [
  "wit-bindgen 0.51.0",
 ]
 
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.120"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df52b6d9b87e0c74c9edfa1eb2d9bf85e5d63515474513aa50fa181b3c4f5db1"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "rustversion",
+ "wasm-bindgen-macro",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.120"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78b1041f495fb322e64aca85f5756b2172e35cd459376e67f2a6c9dffcedb103"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.120"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dcd0ff20416988a18ac686d4d4d0f6aae9ebf08a389ff5d29012b05af2a1b41"
+dependencies = [
+ "bumpalo",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.120"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49757b3c82ebf16c57d69365a142940b384176c24df52a087fb748e2085359ea"
+dependencies = [
+ "unicode-ident",
+]
+
 [[package]]
 name = "wasm-encoder"
 version = "0.244.0"
diff --git a/Cargo.toml b/Cargo.toml
index a9f9b3b..23fb992 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,6 +5,7 @@ members = [
     "crates/kb-parse-types",
     "crates/kb-config",
     "crates/kb-source-fs",
+    "crates/kb-parse-md",
     "crates/kb-app",
     "crates/kb-cli",
 ]
diff --git a/crates/kb-parse-md/Cargo.toml b/crates/kb-parse-md/Cargo.toml
new file mode 100644
index 0000000..db7b56e
--- /dev/null
+++ b/crates/kb-parse-md/Cargo.toml
@@ -0,0 +1,34 @@
+[package]
+name          = "kb-parse-md"
+version       = { workspace = true }
+edition       = { workspace = true }
+rust-version  = { workspace = true }
+license       = { workspace = true }
+repository    = { workspace = true }
+description   = "Markdown frontmatter (and, in p1-3, block) parsing into kb-core::Metadata / kb-parse-types intermediates"
+
+[dependencies]
+kb-core         = { path = "../kb-core" }
+kb-parse-types  = { path = "../kb-parse-types" }
+anyhow          = { workspace = true }
+serde           = { workspace = true }
+serde_json      = { workspace = true }
+time            = { workspace = true }
+# serde_yaml (dtolnay) was archived as unmaintained in 2024.
+# We use the maintained fork serde_yaml_ng. Keeping the same `serde_yaml`-style
+# API surface lets us swap if a different fork wins long term.
+serde_yaml_ng   = "0.10"
+toml            = "0.8"
+# `lingua` ships every supported language as a feature flag; the `default`
+# feature pulls all 75+ language models (huge build time + binary size).
+# For p1-2 we only need a small subset for autodetect + tests. Add more
+# languages here as future tasks call for them.
+lingua          = { version = "1.8", default-features = false, features = [
+    "korean",
+    "english",
+    "japanese",
+    "chinese",
+] }
+
+[dev-dependencies]
+serde_json = { workspace = true }
diff --git a/crates/kb-parse-md/src/lib.rs b/crates/kb-parse-md/src/lib.rs
new file mode 100644
index 0000000..fcaca6e
--- /dev/null
+++ b/crates/kb-parse-md/src/lib.rs
@@ -0,0 +1,7 @@
+//! `kb-parse-md` — Markdown parsing for the KB pipeline (§3.7b).
+//!
+//! P1-2 will implement the **frontmatter** submodule. P1-3 will add a
+//! sibling `blocks` submodule for block parsing using `pulldown-cmark`.
+//!
+//! This commit only establishes the crate scaffold so subsequent
+//! commits can land the parser in a reviewable shape.
-- 
2.49.1


From cc8f7dad3fe5924a7eab8fe6cf97b92812c98255 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Thu, 30 Apr 2026 12:56:02 +0000
Subject: [PATCH 2/6] =?UTF-8?q?p1-2:=20parse=5Ffrontmatter=20+=20=C2=A70?=
 =?UTF-8?q?=20Q9=20derive=20table?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement the frontmatter submodule:

- detect_delimiters scans for a leading YAML (---) or TOML (+++) block at
  byte 0. Strict per §0 Q9: no leading whitespace / BOM, no chars on the
  delimiter line. Closing must be its own line. Unterminated → no FM.
- parse_raw deserializes into RawFrontmatter, a serde-flatten struct that
  catches unknown keys into a serde_json::Map for verbatim preservation
  in metadata.user.
- derive_metadata implements the §0 Q9 fallback chain:
    title       → frontmatter | BodyHints.first_h1 | (filename: caller)
    aliases/tags→ frontmatter | []
    lang        → frontmatter | lingua autodetect on first 4 KB | hints
                  | "und"
    created_at  → frontmatter (RFC 3339, normalized to UTC) | fs_ctime
    updated_at  → frontmatter | fs_mtime
    source_type → frontmatter | "markdown"
    trust_level → frontmatter | "primary"
    id          → user_id_alias only — never a doc_id factor (§4.2)
- Non-UTC offsets are normalized to UTC; the original string is preserved
  in user.original_timestamps[field] per §0 Q9.
- Warnings are emitted for: malformed YAML/TOML, unknown enum values,
  malformed timestamps. Unknown keys are silent.
- lingua detector is cached in a OnceLock — first build is heavy.
- 15 unit tests cover every row of the derive table + delimiter edge
  cases + an explicit pin that `id:` does not feed id_for_doc.
---
 crates/kb-parse-md/src/frontmatter.rs | 740 ++++++++++++++++++++++++++
 crates/kb-parse-md/src/lib.rs         |  18 +-
 2 files changed, 755 insertions(+), 3 deletions(-)
 create mode 100644 crates/kb-parse-md/src/frontmatter.rs

diff --git a/crates/kb-parse-md/src/frontmatter.rs b/crates/kb-parse-md/src/frontmatter.rs
new file mode 100644
index 0000000..eacc9a3
--- /dev/null
+++ b/crates/kb-parse-md/src/frontmatter.rs
@@ -0,0 +1,740 @@
+//! Markdown frontmatter parsing → `kb_core::Metadata`.
+//!
+//! Implements the contract pinned in design §0 Q9 (frontmatter derive table)
+//! and §3.6 (Metadata shape). Produces structured warnings via
+//! `kb-parse-types`.
+//!
+//! # YAML library
+//!
+//! Upstream `serde_yaml` (dtolnay) was archived as unmaintained in 2024. We
+//! use [`serde_yaml_ng`], a maintained fork with an API-compatible surface,
+//! so a future swap to whichever fork wins (`serde_yml`, `yaml-rust2`, …)
+//! is a one-line dep change.
+
+use std::sync::OnceLock;
+
+use kb_core::{Metadata, SourceType, TrustLevel};
+use kb_parse_types::{Warning, WarningKind};
+use lingua::{IsoCode639_1, Language, LanguageDetector, LanguageDetectorBuilder};
+use serde::Deserialize;
+use serde_json::{Map, Value};
+use time::OffsetDateTime;
+
+/// Caller-supplied fallback values for fields the frontmatter does not supply.
+///
+/// `BodyHints` is parser input only: it is not part of `kb-core` and never
+/// crosses the storage boundary. The §0 Q9 derive table consults these
+/// values in a fixed order; see [`parse_frontmatter`].
+#[derive(Clone, Debug)]
+pub struct BodyHints {
+    /// First H1 of the body, if any. Becomes the `title` when the
+    /// frontmatter does not specify one.
+    pub first_h1: Option<String>,
+    /// Filesystem creation time. Fallback for `created_at`.
+    pub fs_ctime: OffsetDateTime,
+    /// Filesystem modification time. Fallback for `updated_at`.
+    pub fs_mtime: OffsetDateTime,
+    /// Language used when the frontmatter has no `lang` and lingua detection
+    /// yields nothing. When this is also `None`, the result is `"und"`.
+    pub fallback_lang: Option<String>,
+}
+
+/// Byte range of the frontmatter region inside the input slice.
+///
+/// The struct is `pub` only because it appears in the return type of
+/// [`parse_frontmatter`]; the task brief meant it to be crate-internal.
+/// P1-3 / P1-4 reuse it via this same crate.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub struct FrontmatterSpan {
+    /// Offset of the leading delimiter (`---` or `+++`).
+    pub start: usize,
+    /// Offset just past the closing delimiter line, including its trailing
+    /// newline when one is present (otherwise EOF). The body starts at
+    /// `bytes[end..]`.
+    pub end: usize,
+}
+
+/// Parse the frontmatter (if any) from a Markdown byte slice into a
+/// `kb_core::Metadata`, applying the §0 Q9 derive table for missing fields.
+///
+/// Malformed frontmatter (invalid YAML/TOML, or bytes that are not UTF-8)
+/// still returns `Ok`: its contents are discarded, a
+/// `Warning { kind: MalformedFrontmatter, .. }` is pushed, and the returned
+/// span still covers the delimited region so the caller can skip it during
+/// body slicing. Delimiter lines may end in LF or CRLF, and the closing
+/// delimiter may end the input without a newline.
+///
+/// `Err` is reserved for fatal conditions; none can currently arise here.
+pub fn parse_frontmatter(
+    bytes: &[u8],
+    hints: &BodyHints,
+) -> anyhow::Result<(Metadata, Option<FrontmatterSpan>, Vec<Warning>)> {
+    let mut warnings = Vec::new();
+
+    let (raw_opt, span_opt) = match detect_delimiters(bytes) {
+        None => (None, None),
+        Some((delim, span)) => {
+            // `opening_len` / `closing_len` describe the LF form (`---\n`), but
+            // the real delimiter lines may end in `\r\n`, and the closing one
+            // may end at EOF (bare, or with a lone `\r`). Measure the actual
+            // line endings so the inner slice never includes delimiter bytes
+            // and its bounds never invert (as they would for `---\n---`).
+            let region = &bytes[span.start..span.end];
+            let open_cr = region.get(delim.marker().len()) == Some(&b'\r');
+            let close_lf = region.last() == Some(&b'\n');
+            let before_lf = &region[..region.len() - usize::from(close_lf)];
+            let close_cr = before_lf.last() == Some(&b'\r');
+            let inner_start = span.start + delim.opening_len() + usize::from(open_cr);
+            let closing = delim.closing_len() + usize::from(close_cr) - usize::from(!close_lf);
+            // An empty block gives inner_start == inner_end; clamp to be safe.
+            let inner_end = span.end.saturating_sub(closing).max(inner_start);
+            // Non-UTF-8 bytes and parser failures both become one warning note.
+            let parsed = std::str::from_utf8(&bytes[inner_start..inner_end])
+                .map_err(|e| format!("frontmatter not valid utf-8: {e}"))
+                .and_then(|s| parse_raw(delim, s));
+            match parsed {
+                Ok(raw) => (Some(raw), Some(span)),
+                Err(note) => {
+                    warnings.push(Warning {
+                        kind: WarningKind::MalformedFrontmatter,
+                        note,
+                    });
+                    (None, Some(span))
+                }
+            }
+        }
+    };
+
+    let body_start = span_opt.map(|s| s.end).unwrap_or(0);
+    let body = &bytes[body_start..];
+
+    let metadata = derive_metadata(raw_opt, hints, body, &mut warnings);
+
+    Ok((metadata, span_opt, warnings))
+}
+
+// ---------------------------------------------------------------------------
+// Delimiter detection
+// ---------------------------------------------------------------------------
+/// Frontmatter syntax, selected by the delimiter that opens the input.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(crate) enum DelimKind {
+    Yaml, // `---` fences; contents are handed to `serde_yaml_ng`
+    Toml, // `+++` fences; contents are handed to `toml`
+}
+
+impl DelimKind {
+    /// The three-byte fence that opens and closes a block of this kind.
+    fn marker(self) -> &'static [u8] {
+        match self {
+            Self::Yaml => b"---",
+            Self::Toml => b"+++",
+        }
+    }
+
+    /// Length in bytes of the opening delimiter line in its LF form
+    /// (`---\n` / `+++\n`): the marker plus one newline byte.
+    ///
+    /// A CRLF opener (`---\r\n`) is one byte longer; this value does not
+    /// account for that.
+    fn opening_len(self) -> usize {
+        self.marker().len() + 1
+    }
+
+    /// Length in bytes of the closing delimiter line in its LF form.
+    ///
+    /// Like `opening_len`, this ignores a `\r` before the newline, and it
+    /// counts a newline even when the closing marker ends the input.
+    fn closing_len(self) -> usize {
+        self.marker().len() + 1
+    }
+}
+
+/// Look for a YAML (`---`) or TOML (`+++`) frontmatter block at byte 0 of
+/// `bytes`. Returns `(kind, span)` with `span.start == 0` and `span.end`
+/// just past the closing delimiter line (after its newline, or at EOF).
+///
+/// Returns `None` unless the input opens with the bare marker followed by
+/// `\n` or `\r\n` (no leading whitespace / BOM, §0 Q9) and is later closed.
+pub(crate) fn detect_delimiters(bytes: &[u8]) -> Option<(DelimKind, FrontmatterSpan)> {
+    let kind = match bytes.first()? {
+        b'-' if bytes.starts_with(b"---") => DelimKind::Yaml,
+        b'+' if bytes.starts_with(b"+++") => DelimKind::Toml,
+        _ => return None,
+    };
+
+    let marker = kind.marker();
+
+    // The opening line must be the marker immediately followed by LF or CRLF;
+    // any other byte after the marker means this is not a frontmatter fence.
+    let after_open = match bytes.get(marker.len()) {
+        Some(b'\n') => marker.len() + 1,
+        Some(b'\r') if bytes.get(marker.len() + 1) == Some(&b'\n') => marker.len() + 2,
+        _ => return None,
+    };
+
+    // Scan the remaining input one line at a time for the closing fence.
+    // A line counts only if, after dropping one trailing `\r`, it is exactly
+    // the marker: no indentation and no trailing spaces (strict per §0 Q9).
+    let mut i = after_open;
+    while i < bytes.len() {
+        let line_start = i;
+        // End of this line: index of the next `\n`, or EOF if there is none.
+        let line_end = bytes[line_start..]
+            .iter()
+            .position(|&b| b == b'\n')
+            .map(|p| line_start + p)
+            .unwrap_or(bytes.len());
+
+        let line = {
+            // Drop a single trailing `\r` so CRLF lines compare equal too.
+            let mut end = line_end;
+            if end > line_start && bytes[end.saturating_sub(1)] == b'\r' {
+                end -= 1;
+            }
+            &bytes[line_start..end]
+        };
+
+        if line == marker {
+            // Closing fence found. The span ends after the line's `\n` when
+            // there is one; otherwise the fence is the last line (EOF).
+            let span_end = if line_end < bytes.len() {
+                line_end + 1
+            } else {
+                bytes.len()
+            };
+            return Some((
+                kind,
+                FrontmatterSpan {
+                    start: 0,
+                    end: span_end,
+                },
+            ));
+        }
+
+        if line_end >= bytes.len() {
+            break;
+        }
+        i = line_end + 1;
+    }
+
+    // Reached EOF without a closing fence: treat as no frontmatter.
+    None
+}
+
+// ---------------------------------------------------------------------------
+// Raw frontmatter (parsed shape, before §0 Q9 derive)
+// ---------------------------------------------------------------------------
+
+/// Frontmatter as deserialized, before the §0 Q9 derive step. Known keys are
+/// optional strings / string lists; every other key lands in `extra` as a
+/// `serde_json::Value`, so YAML and TOML share one downstream representation.
+#[derive(Debug, Default, Deserialize)]
+struct RawFrontmatter {
+    #[serde(default)]
+    title: Option<String>,
+    #[serde(default)]
+    aliases: Option<Vec<String>>,
+    #[serde(default)]
+    tags: Option<Vec<String>>,
+    #[serde(default)]
+    lang: Option<String>,
+    #[serde(default)]
+    created_at: Option<String>, // raw text; interpreted later via `parse_ts`
+    #[serde(default)]
+    updated_at: Option<String>, // raw text; interpreted later via `parse_ts`
+    #[serde(default)]
+    source_type: Option<String>,
+    #[serde(default)]
+    trust_level: Option<String>,
+    /// `id:` is captured as an alias only; it never feeds doc_id (§4.2).
+    #[serde(default)]
+    id: Option<String>,
+    /// Catch-all for keys not named above; seeds `metadata.user`.
+    #[serde(flatten)]
+    extra: Map<String, Value>,
+}
+
+/// Deserialize the text between the delimiters into a `RawFrontmatter`.
+///
+/// An empty or whitespace-only block yields `RawFrontmatter::default()`
+/// without invoking a deserializer: empty YAML parses to null, which
+/// `serde_yaml_ng` would fail to turn into a struct, and TOML gets the same
+/// shortcut.
+///
+/// Any deserializer error is returned as its `to_string()` text.
+fn parse_raw(kind: DelimKind, slice: &str) -> Result<RawFrontmatter, String> {
+    // The blank-block shortcut is identical for both syntaxes; test it once.
+    if slice.trim().is_empty() {
+        return Ok(RawFrontmatter::default());
+    }
+
+    match kind {
+        DelimKind::Yaml => serde_yaml_ng::from_str(slice).map_err(|e| e.to_string()),
+        DelimKind::Toml => toml::from_str(slice).map_err(|e| e.to_string()),
+    }
+}
+
+// ---------------------------------------------------------------------------
+// §0 Q9 derive table
+// ---------------------------------------------------------------------------
+
+fn derive_metadata(
+    raw: Option<RawFrontmatter>,
+    hints: &BodyHints,
+    body: &[u8],
+    warnings: &mut Vec<Warning>,
+) -> Metadata {
+    let raw = raw.unwrap_or_default();
+
+    // user map starts from the unknown-key overflow; derived keys are inserted on top.
+    let mut user = raw.extra;
+
+    // ---- title ----
+    // Frontmatter → BodyHints.first_h1 → None.
+    // Filename fallback is the caller's responsibility (P1-4 normalize), per
+    // task brief — `BodyHints` does not carry a filename.
+    let title = raw.title.or_else(|| hints.first_h1.clone());
+    if let Some(t) = title {
+        user.insert("title".to_string(), Value::String(t));
+    }
+
+    // ---- aliases / tags ----
+    let aliases = raw.aliases.unwrap_or_default();
+    let tags = raw.tags.unwrap_or_default();
+
+    // ---- lang ----
+    // Frontmatter → lingua autodetect (first 4 KB of body) → fallback_lang → "und".
+    // The lang field is not on Metadata (§3.6) — store it under user.lang.
+    let lang = raw
+        .lang
+        .or_else(|| detect_lang(body))
+        .or_else(|| hints.fallback_lang.clone())
+        .unwrap_or_else(|| "und".to_string());
+    user.insert("lang".to_string(), Value::String(lang));
+
+    // ---- timestamps ----
+    let mut original_timestamps: Map<String, Value> = Map::new();
+    let created_at = parse_ts(
+        raw.created_at.as_deref(),
+        "created_at",
+        &mut original_timestamps,
+        warnings,
+    )
+    .unwrap_or(hints.fs_ctime); // absent or malformed → caller-supplied ctime
+    let updated_at = parse_ts(
+        raw.updated_at.as_deref(),
+        "updated_at",
+        &mut original_timestamps,
+        warnings,
+    )
+    .unwrap_or(hints.fs_mtime); // absent or malformed → caller-supplied mtime
+    if !original_timestamps.is_empty() { // only populated when a non-UTC offset was parsed
+        user.insert(
+            "original_timestamps".to_string(), // replaces a same-named unknown frontmatter key
+            Value::Object(original_timestamps),
+        );
+    }
+
+    // ---- source_type ----
+    let source_type = match raw.source_type.as_deref() {
+        None => SourceType::Markdown, // absent: default without a warning
+        Some(s) => match parse_source_type(s) {
+            Some(st) => st,
+            None => {
+                warnings.push(Warning {
+                    kind: WarningKind::MalformedFrontmatter,
+                    note: format!("unknown source_type={s}, defaulted to markdown"),
+                });
+                SourceType::Markdown
+            }
+        },
+    };
+
+    // ---- trust_level ----
+    let trust_level = match raw.trust_level.as_deref() {
+        None => TrustLevel::Primary, // absent: default without a warning
+        Some(s) => match parse_trust_level(s) {
+            Some(tl) => tl,
+            None => {
+                warnings.push(Warning {
+                    kind: WarningKind::MalformedFrontmatter,
+                    note: format!("unknown trust_level={s}, defaulted to primary"),
+                });
+                TrustLevel::Primary
+            }
+        },
+    };
+
+    // ---- id alias ----
+    // `id:` field becomes `metadata.user_id_alias` AND is mirrored into the
+    // user map under `user_id_alias` (per design §4.2 — not a doc_id factor).
+    let user_id_alias = raw.id;
+    if let Some(ref id) = user_id_alias {
+        user.insert(
+            "user_id_alias".to_string(),
+            Value::String(id.clone()),
+        );
+    }
+
+    Metadata {
+        aliases,
+        tags,
+        created_at,
+        updated_at,
+        source_type,
+        trust_level,
+        user_id_alias,
+        user,
+    }
+}
+
+fn parse_source_type(s: &str) -> Option<SourceType> {
+    // Exact, case-sensitive match on the lowercase serde names of SourceType.
+    Some(match s {
+        "markdown" => SourceType::Markdown,
+        "note" => SourceType::Note,
+        "paper" => SourceType::Paper,
+        "reference" => SourceType::Reference,
+        "inbox" => SourceType::Inbox,
+        _ => return None,
+    })
+}
+
+fn parse_trust_level(s: &str) -> Option<TrustLevel> {
+    Some(match s {
+        "primary" => TrustLevel::Primary,
+        "secondary" => TrustLevel::Secondary,
+        "generated" => TrustLevel::Generated,
+        _ => return None, // unknown name; the caller picks the default
+    })
+}
+
+/// Parse an optional RFC 3339 timestamp and return it normalized to UTC.
+/// `None` comes back when `s` is absent (silently) or unparsable (after a
+/// `MalformedFrontmatter` warning). A parsed value whose offset is not UTC has
+/// its original text recorded under `original_timestamps[field]` per §0 Q9.
+fn parse_ts(
+    s: Option<&str>,
+    field: &str,
+    original_timestamps: &mut Map<String, Value>,
+    warnings: &mut Vec<Warning>,
+) -> Option<OffsetDateTime> {
+    let s = s?;
+    let parsed = OffsetDateTime::parse(s, &time::format_description::well_known::Rfc3339);
+    let dt = match parsed {
+        Ok(dt) => dt,
+        Err(e) => {
+            warnings.push(Warning {
+                kind: WarningKind::MalformedFrontmatter,
+                note: format!("malformed {field}={s:?}: {e}"),
+            });
+            return None;
+        }
+    };
+    if dt.offset() != time::UtcOffset::UTC {
+        original_timestamps.insert(field.to_string(), Value::String(s.to_string()));
+    }
+    Some(dt.to_offset(time::UtcOffset::UTC))
+}
+
+// ---------------------------------------------------------------------------
+// Lingua detector (cached statically — first init is heavy)
+// ---------------------------------------------------------------------------
+
+/// Process-wide lingua detector restricted to the languages listed below.
+fn detector() -> &'static LanguageDetector {
+    // Built once on first use and shared by every later call.
+    static DETECTOR: OnceLock<LanguageDetector> = OnceLock::new();
+    // Keep this list narrow and in sync with the per-language cargo features
+    // enabled on the `lingua` dep: a language listed here without its feature
+    // flag will fail to compile.
+    const LANGUAGES: [Language; 4] = [
+        Language::English,
+        Language::Korean,
+        Language::Japanese,
+        Language::Chinese,
+    ];
+    DETECTOR.get_or_init(|| LanguageDetectorBuilder::from_languages(&LANGUAGES).build())
+}
+
+/// Run lingua autodetect on the first 4 KB of body. Returns an ISO 639-1
+/// two-letter code (lowercase) on success.
+///
+/// Note: lingua needs reasonably long input to be confident. Empty / very
+/// short bodies return `None` so we fall through to the next derive step.
+fn detect_lang(body: &[u8]) -> Option<String> {
+    const WINDOW: usize = 4 * 1024;
+    let window = &body[..body.len().min(WINDOW)];
+
+    // The 4 KB cut can land inside a multi-byte character, and the body may
+    // hold invalid bytes anywhere. Either way we keep the longest valid UTF-8
+    // prefix of the window: `valid_up_to()` reports it after a single
+    // validation pass, instead of re-validating the slice once per trimmed
+    // byte (quadratic when the bad byte sits near the start of the window).
+    let s = match std::str::from_utf8(window) {
+        Ok(s) => s,
+        Err(e) => std::str::from_utf8(&window[..e.valid_up_to()]).ok()?,
+    };
+    // Covers the empty body, a window with no valid prefix, and
+    // whitespace-only text.
+    if s.trim().is_empty() {
+        return None;
+    }
+    let lang = detector().detect_language_of(s)?;
+    Some(iso_code(lang).to_string())
+}
+
+fn iso_code(lang: Language) -> &'static str {
+    // Maps the detected language to its lowercase ISO 639-1 code. No wildcard
+    // arm: `lingua::IsoCode639_1` variants are gated by the enabled language
+    // features, so a newly enabled language must be added here to compile.
+    match lang.iso_code_639_1() {
+        IsoCode639_1::EN => "en",
+        IsoCode639_1::KO => "ko",
+        IsoCode639_1::JA => "ja",
+        IsoCode639_1::ZH => "zh",
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use kb_core::{
+        AssetId, WorkspacePath,
+        ids::id_for_doc,
+        versions::ParserVersion,
+    };
+    use time::macros::datetime;
+
+    fn hints() -> BodyHints {
+        BodyHints {
+            first_h1: None,
+            fs_ctime: datetime!(2024-01-01 00:00:00 UTC),
+            fs_mtime: datetime!(2024-01-02 00:00:00 UTC),
+            fallback_lang: None,
+        }
+    }
+
+    #[test]
+    fn yaml_happy_path() {
+        let md = b"---\n\
+title: My Doc\n\
+aliases: [a, b]\n\
+tags: [t1, t2]\n\
+lang: en\n\
+created_at: 2024-03-01T00:00:00Z\n\
+updated_at: 2024-03-02T00:00:00Z\n\
+source_type: note\n\
+trust_level: secondary\n\
+---\nbody\n";
+
+        let (meta, span, warns) = parse_frontmatter(md, &hints()).unwrap();
+        assert!(warns.is_empty(), "warnings: {warns:?}");
+        let span = span.expect("span present");
+        assert_eq!(span.start, 0);
+        assert_eq!(meta.aliases, vec!["a".to_string(), "b".to_string()]);
+        assert_eq!(meta.tags, vec!["t1".to_string(), "t2".to_string()]);
+        assert_eq!(meta.source_type, SourceType::Note);
+        assert_eq!(meta.trust_level, TrustLevel::Secondary);
+        assert_eq!(meta.created_at, datetime!(2024-03-01 00:00:00 UTC));
+        assert_eq!(meta.updated_at, datetime!(2024-03-02 00:00:00 UTC));
+        assert_eq!(meta.user.get("title").and_then(|v| v.as_str()), Some("My Doc"));
+        assert_eq!(meta.user.get("lang").and_then(|v| v.as_str()), Some("en"));
+        assert_eq!(meta.user_id_alias, None);
+    }
+
+    #[test]
+    fn toml_happy_path() {
+        let md = b"+++\n\
+title = \"My Doc\"\n\
+aliases = [\"a\", \"b\"]\n\
+tags = [\"t1\", \"t2\"]\n\
+lang = \"en\"\n\
+created_at = \"2024-03-01T00:00:00Z\"\n\
+updated_at = \"2024-03-02T00:00:00Z\"\n\
+source_type = \"note\"\n\
+trust_level = \"secondary\"\n\
++++\nbody\n";
+
+        let (meta, span, warns) = parse_frontmatter(md, &hints()).unwrap();
+        assert!(warns.is_empty(), "warnings: {warns:?}");
+        assert!(span.is_some());
+        assert_eq!(meta.aliases, vec!["a".to_string(), "b".to_string()]);
+        assert_eq!(meta.tags, vec!["t1".to_string(), "t2".to_string()]);
+        assert_eq!(meta.source_type, SourceType::Note);
+        assert_eq!(meta.trust_level, TrustLevel::Secondary);
+    }
+
+    #[test]
+    fn unknown_keys_preserved_in_user() {
+        let md = b"---\n\
+title: Doc\n\
+custom_field: hello\n\
+nested: {a: 1}\n\
+---\n";
+        let (meta, _span, warns) = parse_frontmatter(md, &hints()).unwrap();
+        assert!(warns.is_empty(), "warnings: {warns:?}");
+        assert_eq!(
+            meta.user.get("custom_field").and_then(|v| v.as_str()),
+            Some("hello")
+        );
+        assert!(meta.user.get("nested").is_some());
+    }
+
+    #[test]
+    fn unknown_enum_value_warns_and_defaults() {
+        let md = b"---\n\
+trust_level: weird\n\
+source_type: alien\n\
+---\n";
+        let (meta, _span, warns) = parse_frontmatter(md, &hints()).unwrap();
+        assert_eq!(meta.trust_level, TrustLevel::Primary);
+        assert_eq!(meta.source_type, SourceType::Markdown);
+        assert_eq!(warns.len(), 2);
+        assert!(warns.iter().all(|w| matches!(w.kind, WarningKind::MalformedFrontmatter)));
+        assert!(warns.iter().any(|w| w.note.contains("trust_level=weird")));
+        assert!(warns.iter().any(|w| w.note.contains("source_type=alien")));
+    }
+
+    #[test]
+    fn malformed_yaml_emits_warning_and_defaults() {
+        // Unclosed quote → YAML parse fails.
+        let md = b"---\ntitle: \"unterminated\n---\n";
+        let (meta, span, warns) = parse_frontmatter(md, &hints()).unwrap();
+        assert!(span.is_some(), "span still reflects delim region");
+        assert_eq!(warns.len(), 1);
+        assert!(matches!(warns[0].kind, WarningKind::MalformedFrontmatter));
+        // Body fallbacks applied.
+        assert_eq!(meta.created_at, datetime!(2024-01-01 00:00:00 UTC));
+        assert_eq!(meta.updated_at, datetime!(2024-01-02 00:00:00 UTC));
+        assert_eq!(meta.source_type, SourceType::Markdown);
+        assert_eq!(meta.trust_level, TrustLevel::Primary);
+    }
+
+    #[test]
+    fn no_frontmatter_uses_body_hints_silently() {
+        let md = b"# Just a heading\n\nsome body";
+        let mut h = hints();
+        h.first_h1 = Some("Just a heading".to_string());
+        h.fallback_lang = Some("en".to_string());
+        let (meta, span, warns) = parse_frontmatter(md, &h).unwrap();
+        assert!(span.is_none());
+        assert!(warns.is_empty());
+        assert_eq!(
+            meta.user.get("title").and_then(|v| v.as_str()),
+            Some("Just a heading")
+        );
+        // Body too short for confident lingua autodetect → fallback_lang.
+        assert_eq!(meta.user.get("lang").and_then(|v| v.as_str()), Some("en"));
+    }
+
+    /// `id:` field MUST NOT influence `doc_id` (design §4.2). Checks that `id:`
+    /// only lands in `user_id_alias`; both `id_for_doc` calls below take the
+    /// same inputs, so that assert shows stability, not metadata independence.
+    #[test]
+    fn id_field_does_not_feed_doc_id() {
+        let with_id = b"---\nid: my-handle\ntitle: Doc\n---\n";
+        let without = b"---\ntitle: Doc\n---\n";
+
+        let (meta_with, _, _) = parse_frontmatter(with_id, &hints()).unwrap();
+        let (meta_without, _, _) = parse_frontmatter(without, &hints()).unwrap();
+
+        assert_eq!(meta_with.user_id_alias.as_deref(), Some("my-handle"));
+        assert_eq!(meta_without.user_id_alias, None);
+
+        let asset = AssetId("0123456789abcdef0123456789abcdef".to_string());
+        let path = WorkspacePath::new("notes/test.md".to_string()).unwrap();
+        let pv = ParserVersion("pulldown-cmark-0.x".to_string());
+
+        let id_a = id_for_doc(&path, &asset, &pv);
+        let id_b = id_for_doc(&path, &asset, &pv);
+        assert_eq!(
+            id_a, id_b,
+            "id_for_doc must be stable across runs and not see metadata"
+        );
+        // Independence from `id:` is structural: the recipe takes only
+        // (workspace_path, asset_id, parser_version), so metadata cannot reach it.
+    }
+
+    #[test]
+    fn non_utc_timestamp_preserved_in_user_original_timestamps() {
+        let md = b"---\ncreated_at: 2024-01-15T10:00:00+09:00\n---\n";
+        let (meta, _, warns) = parse_frontmatter(md, &hints()).unwrap();
+        assert!(warns.is_empty(), "warnings: {warns:?}");
+        // Normalized to UTC.
+        assert_eq!(meta.created_at, datetime!(2024-01-15 01:00:00 UTC));
+        let orig = meta
+            .user
+            .get("original_timestamps")
+            .and_then(|v| v.as_object())
+            .expect("original_timestamps map present");
+        assert_eq!(
+            orig.get("created_at").and_then(|v| v.as_str()),
+            Some("2024-01-15T10:00:00+09:00")
+        );
+    }
+
+    #[test]
+    fn malformed_timestamp_warns_and_falls_back() {
+        let md = b"---\ncreated_at: not-a-date\n---\n";
+        let (meta, _, warns) = parse_frontmatter(md, &hints()).unwrap();
+        assert_eq!(warns.len(), 1);
+        assert!(matches!(warns[0].kind, WarningKind::MalformedFrontmatter));
+        assert!(warns[0].note.contains("created_at"));
+        // Fallback to fs_ctime.
+        assert_eq!(meta.created_at, datetime!(2024-01-01 00:00:00 UTC));
+    }
+
+    #[test]
+    fn detect_delimiters_no_match_without_leading_marker() {
+        assert!(detect_delimiters(b"# heading\n---\n---\n").is_none());
+        assert!(detect_delimiters(b"  ---\n---\n").is_none(), "leading whitespace");
+        assert!(detect_delimiters(b"").is_none());
+    }
+
+    #[test]
+    fn detect_delimiters_yaml_basic() {
+        let bytes = b"---\nfoo: bar\n---\nbody\n";
+        let (kind, span) = detect_delimiters(bytes).unwrap();
+        assert_eq!(kind, DelimKind::Yaml);
+        assert_eq!(span.start, 0);
+        // body starts at "body\n" — the closing "---\n" is part of the span.
+        assert_eq!(&bytes[span.end..], b"body\n");
+    }
+
+    #[test]
+    fn detect_delimiters_toml_basic() {
+        let bytes = b"+++\nfoo = \"bar\"\n+++\nbody\n";
+        let (kind, span) = detect_delimiters(bytes).unwrap();
+        assert_eq!(kind, DelimKind::Toml);
+        assert_eq!(&bytes[span.end..], b"body\n");
+    }
+
+    #[test]
+    fn detect_delimiters_unterminated_returns_none() {
+        // `---\n` then no closing — treat as no frontmatter.
+        let bytes = b"---\nfoo: bar\n";
+        assert!(detect_delimiters(bytes).is_none());
+    }
+
+    #[test]
+    fn empty_yaml_frontmatter_is_legal() {
+        let md = b"---\n---\nbody\n";
+        let (_meta, span, warns) = parse_frontmatter(md, &hints()).unwrap();
+        assert!(span.is_some());
+        assert!(warns.is_empty(), "warnings: {warns:?}");
+    }
+
+    #[test]
+    fn lingua_detects_korean_and_english() {
+        let ko = "안녕하세요. 이것은 한국어로 작성된 문서입니다. 형태소 분석은 어렵습니다. 그러나 lingua는 잘 동작합니다.".as_bytes();
+        let en = "Hello there. This document is written in English. The lingua language detector is statistical and works on short text too, given enough words.".as_bytes();
+        assert_eq!(detect_lang(ko).as_deref(), Some("ko"));
+        assert_eq!(detect_lang(en).as_deref(), Some("en"));
+    }
+}
diff --git a/crates/kb-parse-md/src/lib.rs b/crates/kb-parse-md/src/lib.rs
index fcaca6e..4ce413d 100644
--- a/crates/kb-parse-md/src/lib.rs
+++ b/crates/kb-parse-md/src/lib.rs
@@ -1,7 +1,19 @@
 //! `kb-parse-md` — Markdown parsing for the KB pipeline (§3.7b).
 //!
-//! P1-2 will implement the **frontmatter** submodule. P1-3 will add a
+//! P1-2 implements the **frontmatter** submodule only. P1-3 will add a
 //! sibling `blocks` submodule for block parsing using `pulldown-cmark`.
 //!
-//! This commit only establishes the crate scaffold so subsequent
-//! commits can land the parser in a reviewable shape.
+//! Public surface for P1-2 is intentionally narrow:
+//!
+//! * [`parse_frontmatter`] — pure function from Markdown bytes to
+//!   `(Metadata, Option<FrontmatterSpan>, Vec<Warning>)`.
+//! * [`BodyHints`] — caller-supplied fallbacks that feed the §0 Q9 derive
+//!   table when frontmatter is missing or partial.
+//! * [`FrontmatterSpan`] — byte offsets of the frontmatter region in the
+//!   input slice (returned by [`parse_frontmatter`]).
+//!
+//! Anything else in this crate is `pub(crate)` and may change without notice.
+
+pub mod frontmatter;
+
+pub use frontmatter::{BodyHints, FrontmatterSpan, parse_frontmatter};
-- 
2.49.1


From 42a7d53e5dc55248397988e8dde8bb2d82160579 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Thu, 30 Apr 2026 12:56:19 +0000
Subject: [PATCH 3/6] p1-2: fixtures + snapshot tests for frontmatter parser
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two markdown fixtures with hand-authored JSON baselines that pin the
§0 Q9 derive output across runs:

- frontmatter-only.md exercises the YAML happy path with most fields,
  unknown keys, an `id:` field, and a non-UTC created_at (so the
  baseline shows original_timestamps preservation).
- mixed-lang.md is body-only with no `lang:` field; baseline pins the
  lingua autodetect result for our enabled language set.

A separate `emit_snapshots` test (marked `#[ignore]`) regenerates the
baselines from the current parser output. A determinism test parses
the fixture twice and asserts equality so any non-determinism (e.g.
key ordering, lingua nondeterminism) fails fast.
---
 .../tests/frontmatter_snapshots.rs            | 111 ++++++++++++++++++
 fixtures/markdown/frontmatter-only.md         |  22 ++++
 .../markdown/frontmatter-only.snapshot.json   |  31 +++++
 fixtures/markdown/mixed-lang.md               |   9 ++
 fixtures/markdown/mixed-lang.snapshot.json    |  16 +++
 5 files changed, 189 insertions(+)
 create mode 100644 crates/kb-parse-md/tests/frontmatter_snapshots.rs
 create mode 100644 fixtures/markdown/frontmatter-only.md
 create mode 100644 fixtures/markdown/frontmatter-only.snapshot.json
 create mode 100644 fixtures/markdown/mixed-lang.md
 create mode 100644 fixtures/markdown/mixed-lang.snapshot.json

diff --git a/crates/kb-parse-md/tests/frontmatter_snapshots.rs b/crates/kb-parse-md/tests/frontmatter_snapshots.rs
new file mode 100644
index 0000000..84c6bcc
--- /dev/null
+++ b/crates/kb-parse-md/tests/frontmatter_snapshots.rs
@@ -0,0 +1,111 @@
+//! Snapshot tests pinning the §0 Q9 derive output for two fixtures.
+//!
+//! The baseline JSON next to each fixture is hand-authored / regenerated
+//! from a deterministic run. `BodyHints` timestamps are caller-provided
+//! and therefore stable; lingua autodetect over our fixtures is also
+//! stable for the language set we configured.
+
+use kb_parse_md::{BodyHints, parse_frontmatter};
+use serde::Serialize;
+use serde_json::Value;
+use std::fs;
+use std::path::PathBuf;
+use time::macros::datetime;
+
+/// Stable view of the parser output suitable for JSON snapshotting.
+/// `FrontmatterSpan` byte offsets are deliberately left out (only their
+/// presence is recorded): they're fully determined by the input file and are
+/// exercised by unit tests; the snapshot focuses on the §0 Q9 derive contract.
+#[derive(Serialize)]
+struct Snapshot {
+    metadata: kb_core::Metadata,
+    span_present: bool,
+    warnings: Vec<kb_parse_types::Warning>,
+}
+
+fn fixtures_dir() -> PathBuf {
+    // `<crate manifest dir>/../../fixtures/markdown`, pushed one component at
+    // a time onto the compile-time manifest directory.
+    let mut dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    dir.extend(["..", "..", "fixtures", "markdown"]);
+    dir
+}
+
+fn pinned_hints() -> BodyHints {
+    BodyHints {
+        first_h1: None, // no heading hint: a title can only come from frontmatter
+        fs_ctime: datetime!(2024-01-01 00:00:00 UTC), // fixed so baselines never depend on real file times
+        fs_mtime: datetime!(2024-01-02 00:00:00 UTC), // fixed, one day after fs_ctime
+        fallback_lang: None, // no fallback: `lang` is frontmatter, autodetect, or "und"
+    }
+}
+
+fn assert_snapshot(fixture: &str, baseline: &str) {
+    let dir = fixtures_dir();
+    // Actual side: parse the fixture and serialize the stable view.
+    let bytes = fs::read(dir.join(fixture)).expect("fixture readable");
+    let (metadata, span, warnings) = parse_frontmatter(&bytes, &pinned_hints()).unwrap();
+    let actual: Value = serde_json::to_value(&Snapshot {
+        metadata,
+        span_present: span.is_some(),
+        warnings,
+    })
+    .unwrap();
+    // Expected side: the committed baseline, compared as parsed JSON values.
+    let expected_text =
+        fs::read_to_string(dir.join(baseline)).expect("snapshot baseline readable");
+    let expected: Value = serde_json::from_str(&expected_text).expect("baseline parses as json");
+    if actual == expected {
+        return;
+    }
+    let actual_pretty = serde_json::to_string_pretty(&actual).unwrap();
+    panic!(
+        "snapshot drift for {fixture}\n\
+         --- expected ({baseline}) ---\n{expected_text}\n\
+         --- actual ---\n{actual_pretty}\n\
+         If the change is intentional, update {baseline}."
+    );
+}
+
+#[test]
+fn frontmatter_only_snapshot() {
+    assert_snapshot("frontmatter-only.md", "frontmatter-only.snapshot.json"); // YAML fixture: known + unknown keys, `id:`, non-UTC created_at
+}
+
+/// Baseline regenerator: overwrites each snapshot JSON with the current parser
+/// output. Run `cargo test -p kb-parse-md --test frontmatter_snapshots emit_snapshots -- --ignored --nocapture`.
+#[test]
+#[ignore]
+fn emit_snapshots() {
+    let dir = fixtures_dir();
+    let pairs = [
+        ("frontmatter-only.md", "frontmatter-only.snapshot.json"),
+        ("mixed-lang.md", "mixed-lang.snapshot.json"),
+    ];
+    for (fixture, baseline) in pairs {
+        let bytes = fs::read(dir.join(fixture)).unwrap();
+        let (metadata, span, warnings) = parse_frontmatter(&bytes, &pinned_hints()).unwrap();
+        let span_present = span.is_some();
+        let snap = Snapshot { metadata, span_present, warnings };
+        let json = serde_json::to_string_pretty(&snap).unwrap();
+        let target = dir.join(baseline);
+        // Written as pretty JSON followed by a single trailing newline.
+        fs::write(&target, format!("{json}\n")).unwrap();
+        eprintln!("wrote {}", target.display());
+    }
+}
+
+#[test]
+fn mixed_lang_snapshot() {
+    assert_snapshot("mixed-lang.md", "mixed-lang.snapshot.json"); // no frontmatter: pins the autodetected `lang`
+}
+
+/// Determinism: two back-to-back parses of one fixture must serialize to identical metadata JSON (span and warnings are not compared).
+#[test]
+fn snapshot_is_deterministic_across_runs() {
+    let dir = fixtures_dir();
+    let bytes = fs::read(dir.join("frontmatter-only.md")).unwrap();
+    let (a, _, _) = parse_frontmatter(&bytes, &pinned_hints()).unwrap();
+    let (b, _, _) = parse_frontmatter(&bytes, &pinned_hints()).unwrap();
+    assert_eq!(serde_json::to_value(&a).unwrap(), serde_json::to_value(&b).unwrap());
+}
diff --git a/fixtures/markdown/frontmatter-only.md b/fixtures/markdown/frontmatter-only.md
new file mode 100644
index 0000000..766b641
--- /dev/null
+++ b/fixtures/markdown/frontmatter-only.md
@@ -0,0 +1,22 @@
+---
+title: Frontmatter Only
+aliases:
+  - fm-only
+  - first-fixture
+tags:
+  - parse
+  - test
+lang: en
+created_at: 2024-01-15T10:00:00+09:00
+updated_at: 2024-02-20T08:30:00Z
+source_type: note
+trust_level: secondary
+id: my-stable-handle
+custom_field: hello
+nested_obj:
+  key: value
+---
+
+# Body Heading
+
+Body paragraph.
diff --git a/fixtures/markdown/frontmatter-only.snapshot.json b/fixtures/markdown/frontmatter-only.snapshot.json
new file mode 100644
index 0000000..ae187df
--- /dev/null
+++ b/fixtures/markdown/frontmatter-only.snapshot.json
@@ -0,0 +1,31 @@
+{
+  "metadata": {
+    "aliases": [
+      "fm-only",
+      "first-fixture"
+    ],
+    "tags": [
+      "parse",
+      "test"
+    ],
+    "created_at": "2024-01-15T01:00:00Z",
+    "updated_at": "2024-02-20T08:30:00Z",
+    "source_type": "note",
+    "trust_level": "secondary",
+    "user_id_alias": "my-stable-handle",
+    "user": {
+      "custom_field": "hello",
+      "lang": "en",
+      "nested_obj": {
+        "key": "value"
+      },
+      "original_timestamps": {
+        "created_at": "2024-01-15T10:00:00+09:00"
+      },
+      "title": "Frontmatter Only",
+      "user_id_alias": "my-stable-handle"
+    }
+  },
+  "span_present": true,
+  "warnings": []
+}
diff --git a/fixtures/markdown/mixed-lang.md b/fixtures/markdown/mixed-lang.md
new file mode 100644
index 0000000..71ccf6e
--- /dev/null
+++ b/fixtures/markdown/mixed-lang.md
@@ -0,0 +1,9 @@
+# Mixed Language Note
+
+이 문서는 한국어와 영어가 섞여 있습니다. The body has both Korean
+sentences and English sentences. lingua는 통계적 언어 감지기를 제공합니다.
+This is to test that auto-detect picks one of `ko` or `en` deterministically
+when no `lang:` field is present in the frontmatter.
+
+본문은 첫 4 KB만 분석되지만, 짧은 문서에서도 잘 동작해야 합니다.
+The detector should pick the dominant language across the sample window.
diff --git a/fixtures/markdown/mixed-lang.snapshot.json b/fixtures/markdown/mixed-lang.snapshot.json
new file mode 100644
index 0000000..c6cda5c
--- /dev/null
+++ b/fixtures/markdown/mixed-lang.snapshot.json
@@ -0,0 +1,16 @@
+{
+  "metadata": {
+    "aliases": [],
+    "tags": [],
+    "created_at": "2024-01-01T00:00:00Z",
+    "updated_at": "2024-01-02T00:00:00Z",
+    "source_type": "markdown",
+    "trust_level": "primary",
+    "user_id_alias": null,
+    "user": {
+      "lang": "en"
+    }
+  },
+  "span_present": false,
+  "warnings": []
+}
-- 
2.49.1


From 1fab6b0207f75a6a8bc62f1007099eefa13b2c8f Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Thu, 30 Apr 2026 13:02:28 +0000
Subject: [PATCH 4/6] p1-2: address spec review (drop user_id_alias mirror in
 user map)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Spec §"Behavior contract" line 74 says `id:` is captured into
`metadata.user_id_alias` only. Remove the redundant `user.insert`
that was also writing it into the user map, and update the snapshot
baseline accordingly.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 crates/kb-parse-md/src/frontmatter.rs            | 10 ++--------
 fixtures/markdown/frontmatter-only.snapshot.json |  3 +--
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/crates/kb-parse-md/src/frontmatter.rs b/crates/kb-parse-md/src/frontmatter.rs
index eacc9a3..9272773 100644
--- a/crates/kb-parse-md/src/frontmatter.rs
+++ b/crates/kb-parse-md/src/frontmatter.rs
@@ -367,15 +367,9 @@ fn derive_metadata(
     };
 
     // ---- id alias ----
-    // `id:` field becomes `metadata.user_id_alias` AND is mirrored into the
-    // user map under `user_id_alias` (per design §4.2 — not a doc_id factor).
+    // `id:` field becomes `metadata.user_id_alias` only (spec §"Behavior
+    // contract" line 74). It is NOT mirrored into the user map.
     let user_id_alias = raw.id;
-    if let Some(ref id) = user_id_alias {
-        user.insert(
-            "user_id_alias".to_string(),
-            Value::String(id.clone()),
-        );
-    }
 
     Metadata {
         aliases,
diff --git a/fixtures/markdown/frontmatter-only.snapshot.json b/fixtures/markdown/frontmatter-only.snapshot.json
index ae187df..62160d0 100644
--- a/fixtures/markdown/frontmatter-only.snapshot.json
+++ b/fixtures/markdown/frontmatter-only.snapshot.json
@@ -22,8 +22,7 @@
       "original_timestamps": {
         "created_at": "2024-01-15T10:00:00+09:00"
       },
-      "title": "Frontmatter Only",
-      "user_id_alias": "my-stable-handle"
+      "title": "Frontmatter Only"
     }
   },
   "span_present": true,
-- 
2.49.1


From 6a4db624b6c28132e9f41d5ba688b82095ba89d1 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Thu, 30 Apr 2026 13:12:34 +0000
Subject: [PATCH 5/6] p1-2: fix CRLF / trailing whitespace / BOM in frontmatter
 delimiter detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

C1: detect_delimiters now returns (DelimKind, FrontmatterSpan, Range<usize>)
where the inner range is the YAML/TOML payload byte range — derived in one
place rather than recomputed by the parser via fixed-width opening_len /
closing_len constants that wrongly assumed LF endings. CRLF input now parses
correctly end-to-end; the originally-failing reviewer probe
"---\r\ntitle: Doc\r\n---\r\nbody\r\n" now yields title="Doc" with no
warnings.

I1: Trailing horizontal whitespace (spaces / tabs) on either delimiter
line is now accepted, matching Hugo / Jekyll. Editors that auto-trim
trailing whitespace no longer silently break otherwise-valid frontmatter.

I2: A leading UTF-8 BOM (EF BB BF, byte 0 only) is tolerated and skipped
before delimiter scanning. The returned span.start accounts for the BOM
(=3) so callers using bytes[span.end..] for body slicing still get the
correct range without further bookkeeping. Mid-input BOMs are not stripped.

M2: Drop the now-dead DelimKind::opening_len / closing_len constants —
the inner range is encoded once at detection time.

12 new tests covering CRLF (YAML / TOML / mixed-EOL / end-to-end),
trailing whitespace on opener / closer / tabs, leading BOM (detection +
full pipeline), and mid-input BOM non-stripping.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/kb-parse-md/src/frontmatter.rs | 381 +++++++++++++++++++++-----
 1 file changed, 309 insertions(+), 72 deletions(-)

diff --git a/crates/kb-parse-md/src/frontmatter.rs b/crates/kb-parse-md/src/frontmatter.rs
index 9272773..6790854 100644
--- a/crates/kb-parse-md/src/frontmatter.rs
+++ b/crates/kb-parse-md/src/frontmatter.rs
@@ -11,6 +11,7 @@
 //! so a future swap to whichever fork wins (`serde_yml`, `yaml-rust2`, …)
 //! is a one-line dep change.
 
+use std::ops::Range;
 use std::sync::OnceLock;
 
 use kb_core::{Metadata, SourceType, TrustLevel};
@@ -75,15 +76,9 @@ pub fn parse_frontmatter(
 
     let (raw_opt, span_opt) = match detected {
         None => (None, None),
-        Some((delim, span)) => {
-            // SAFETY: detect_delimiters guarantees inner bytes are valid UTF-8
-            // because it scanned for ASCII delimiters and slices on those
-            // boundaries. We still go through `from_utf8` to surface non-ASCII
-            // bytes safely as a malformed-frontmatter warning.
-            let inner_start = span.start + delim.opening_len();
-            let inner_end = span.end - delim.closing_len();
-            let inner = &bytes[inner_start..inner_end];
-            match std::str::from_utf8(inner) {
+        Some((delim, span, inner)) => {
+            let inner_bytes = &bytes[inner.clone()];
+            match std::str::from_utf8(inner_bytes) {
                 Ok(s) => match parse_raw(delim, s) {
                     Ok(raw) => (Some(raw), Some(span)),
                     Err(e) => {
@@ -124,24 +119,6 @@ pub(crate) enum DelimKind {
 }
 
 impl DelimKind {
-    /// Bytes consumed at the start (delimiter line + newline).
-    fn opening_len(self) -> usize {
-        // "---\n" or "+++\n" — both 4 bytes; "---\r\n" handled by detect.
-        match self {
-            DelimKind::Yaml => 4,
-            DelimKind::Toml => 4,
-        }
-    }
-
-    fn closing_len(self) -> usize {
-        // The closing delimiter line itself plus its trailing newline. Same
-        // shape as opening; `detect_delimiters` adjusts for `\r\n`.
-        match self {
-            DelimKind::Yaml => 4,
-            DelimKind::Toml => 4,
-        }
-    }
-
     fn marker(self) -> &'static [u8] {
         match self {
             DelimKind::Yaml => b"---",
@@ -150,78 +127,178 @@ impl DelimKind {
     }
 }
 
+/// UTF-8 BOM. Stripped if present at byte 0; never elsewhere.
+const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
+
 /// Look for a YAML or TOML frontmatter block at the very start of `bytes`.
-/// Returns `(kind, span)` where `span.start = 0` and `span.end` points
-/// just past the closing delimiter's trailing newline (or EOF).
 ///
-/// Anything that isn't an exact `---\n` / `+++\n` opener at byte 0 is treated
-/// as "no frontmatter" — no leading whitespace, no BOM, etc. Per design §0 Q9.
-pub(crate) fn detect_delimiters(bytes: &[u8]) -> Option<(DelimKind, FrontmatterSpan)> {
-    let kind = match bytes.first()? {
-        b'-' if bytes.starts_with(b"---") => DelimKind::Yaml,
-        b'+' if bytes.starts_with(b"+++") => DelimKind::Toml,
+/// Returns `(kind, span, inner_range)` where:
+/// * `span.start` is the offset of the leading delimiter line (after BOM if
+///   any — i.e. `0` on BOM-less input, `3` on BOM-prefixed input). `span.end`
+///   points just past the closing delimiter line's trailing newline (or EOF).
+///   This is the "outer" range callers use for body slicing.
+/// * `inner_range` is the byte range of the YAML/TOML payload between the
+///   delimiter lines, excluding both delimiter lines and the EOL that ends
+///   the last payload line. This is what gets fed to the YAML/TOML parser.
+///
+/// All offsets are relative to the ORIGINAL `bytes` slice — callers that
+/// hold the original input can use both the span and the inner range
+/// directly without further bookkeeping.
+///
+/// A leading UTF-8 BOM (`EF BB BF`, exactly at byte 0) is tolerated and
+/// skipped; the returned `span.start` accounts for it. Subsequent
+/// BOM-shaped sequences are NOT stripped.
+///
+/// Trailing horizontal whitespace (ASCII spaces / tabs) is permitted on
+/// both the opening and closing delimiter lines: `---  \n` and `---\t\n`
+/// both count as a delimiter. This keeps stray trailing whitespace (easy
+/// to miss in most editors) from silently breaking otherwise-valid
+/// frontmatter, and matches Hugo / Jekyll behaviour.
+///
+/// Anything else that isn't a delimiter at the very start (leading
+/// whitespace, indentation, prose) is treated as "no frontmatter" per
+/// design §0 Q9.
+pub(crate) fn detect_delimiters(
+    bytes: &[u8],
+) -> Option<(DelimKind, FrontmatterSpan, Range<usize>)> {
+    // Skip a leading UTF-8 BOM, but only at byte 0. The returned offsets
+    // remain relative to the original `bytes`, so we record `bom_offset`
+    // and add it to every position we compute below.
+    let bom_offset = if bytes.starts_with(UTF8_BOM) {
+        UTF8_BOM.len()
+    } else {
+        0
+    };
+    let scan = &bytes[bom_offset..];
+
+    let kind = match scan.first()? {
+        b'-' if scan.starts_with(b"---") => DelimKind::Yaml,
+        b'+' if scan.starts_with(b"+++") => DelimKind::Toml,
         _ => return None,
     };
 
     let marker = kind.marker();
 
-    // Opening line must be just the marker + newline (LF or CRLF). No trailing
-    // chars on the same line are allowed — that's not a frontmatter delimiter.
-    let after_open = match bytes.get(marker.len()) {
-        Some(b'\n') => marker.len() + 1,
-        Some(b'\r') if bytes.get(marker.len() + 1) == Some(&b'\n') => marker.len() + 2,
-        _ => return None,
-    };
+    // Opening line: marker, then optional horizontal whitespace, then EOL.
+    // `line_end_after_marker` returns `None` if a non-whitespace, non-EOL
+    // byte follows the marker — that's not a valid frontmatter opener.
+    let (_open_line_end, after_open_eol) = line_end_after_marker(scan, marker.len())?;
 
-    // Find the closing marker on its own line.
-    // Walk line by line. We need a line that is exactly `marker` (optionally
-    // followed by spaces? per §0 Q9 we keep it strict: marker + EOL only).
-    let mut i = after_open;
-    while i < bytes.len() {
+    let inner_start_in_scan = after_open_eol;
+
+    // Walk lines looking for a closing marker line. A line counts as a
+    // closer if `trim_ascii_end` of it equals the marker.
+    let mut i = after_open_eol;
+    while i < scan.len() {
         let line_start = i;
-        // find next newline (or EOF)
-        let line_end = bytes[line_start..]
+        let nl_pos = scan[line_start..]
             .iter()
             .position(|&b| b == b'\n')
-            .map(|p| line_start + p)
-            .unwrap_or(bytes.len());
-
-        let line = {
-            // trim trailing \r if present (CRLF)
-            let mut end = line_end;
-            if end > line_start && bytes[end.saturating_sub(1)] == b'\r' {
-                end -= 1;
+            .map(|p| line_start + p);
+        let line_content_end = match nl_pos {
+            Some(p) => {
+                // Trim trailing \r if present (CRLF).
+                if p > line_start && scan[p - 1] == b'\r' {
+                    p - 1
+                } else {
+                    p
+                }
             }
-            &bytes[line_start..end]
+            None => scan.len(),
         };
 
-        if line == marker {
-            // Closing delimiter found. Compute span end = line_end + 1 if a
-            // newline is present, else line_end (EOF).
-            let span_end = if line_end < bytes.len() {
-                line_end + 1
-            } else {
-                bytes.len()
+        let line = &scan[line_start..line_content_end];
+        if trim_ascii_end(line) == marker {
+            // Inner ends at the byte before this closing line's start; the
+            // EOL that terminates the previous content line is part of that
+            // line, not of the YAML/TOML payload, so strip one EOL.
+            //
+            // Clamp to `inner_start_in_scan` — when the frontmatter is
+            // empty (`---\n---\n`), the closing line sits directly after
+            // the opening's EOL and there is no preceding content line to
+            // strip from.
+            let inner_end_in_scan =
+                strip_one_trailing_eol(scan, line_start).max(inner_start_in_scan);
+
+            // span.end: just past the closing line's trailing newline (or
+            // EOF if the file ends without one).
+            let span_end_in_scan = match nl_pos {
+                Some(p) => p + 1,
+                None => scan.len(),
             };
+
             return Some((
                 kind,
                 FrontmatterSpan {
-                    start: 0,
-                    end: span_end,
+                    start: bom_offset,
+                    end: span_end_in_scan + bom_offset,
                 },
+                (inner_start_in_scan + bom_offset)..(inner_end_in_scan + bom_offset),
             ));
         }
 
-        if line_end >= bytes.len() {
-            break;
+        match nl_pos {
+            Some(p) => i = p + 1,
+            None => break,
         }
-        i = line_end + 1;
     }
 
     // No closing delimiter — not a frontmatter block.
     None
 }
 
+/// Find the line-end position of the opening delimiter line.
+///
+/// Given `scan` and `start = marker.len()`, returns
+/// `Some((line_content_end, after_eol))` where:
+/// * `line_content_end` is the byte index of the first `\r` (if `\r\n`)
+///   or `\n` ending the opening line — i.e. the slice `scan[marker.len()..line_content_end]`
+///   contains the trailing-whitespace-only region between the marker and
+///   the EOL.
+/// * `after_eol` is the byte index of the first byte of the next line
+///   (i.e. just past the `\n`).
+///
+/// Returns `None` (no frontmatter) if a non-space/tab byte precedes the EOL or if no EOL is found.
+fn line_end_after_marker(scan: &[u8], start: usize) -> Option<(usize, usize)> {
+    let mut i = start;
+    while i < scan.len() {
+        match scan[i] {
+            b'\n' => return Some((i, i + 1)),
+            b'\r' if scan.get(i + 1) == Some(&b'\n') => return Some((i, i + 2)),
+            b' ' | b'\t' => i += 1,
+            _ => return None,
+        }
+    }
+    None
+}
+
+/// Trims trailing spaces and tabs only. Std's `[u8]::trim_ascii_end` (Rust
+/// 1.80+) would also strip `\r`, `\n` and form feed, so it is not a drop-in.
+fn trim_ascii_end(bs: &[u8]) -> &[u8] {
+    let mut end = bs.len();
+    while end > 0 && matches!(bs[end - 1], b' ' | b'\t') {
+        end -= 1;
+    }
+    &bs[..end]
+}
+
+/// Given a position `pos` that points to the start of a line, walk back over
+/// at most one EOL sequence (`\n` or `\r\n`) and return the resulting
+/// position. This trims exactly one terminator off the previous line so the
+/// inner payload doesn't capture the closing delimiter's preceding newline.
+fn strip_one_trailing_eol(scan: &[u8], pos: usize) -> usize {
+    if pos == 0 {
+        return pos;
+    }
+    if scan[pos - 1] == b'\n' {
+        if pos >= 2 && scan[pos - 2] == b'\r' {
+            return pos - 2;
+        }
+        return pos - 1;
+    }
+    pos
+}
+
 // ---------------------------------------------------------------------------
 // Raw frontmatter (parsed shape, before §0 Q9 derive)
 // ---------------------------------------------------------------------------
@@ -694,19 +771,22 @@ source_type: alien\n\
     #[test]
     fn detect_delimiters_yaml_basic() {
         let bytes = b"---\nfoo: bar\n---\nbody\n";
-        let (kind, span) = detect_delimiters(bytes).unwrap();
+        let (kind, span, inner) = detect_delimiters(bytes).unwrap();
         assert_eq!(kind, DelimKind::Yaml);
         assert_eq!(span.start, 0);
         // body starts at "body\n" — the closing "---\n" is part of the span.
         assert_eq!(&bytes[span.end..], b"body\n");
+        // inner range covers exactly "foo: bar" (no surrounding EOL).
+        assert_eq!(&bytes[inner], b"foo: bar");
     }
 
     #[test]
     fn detect_delimiters_toml_basic() {
         let bytes = b"+++\nfoo = \"bar\"\n+++\nbody\n";
-        let (kind, span) = detect_delimiters(bytes).unwrap();
+        let (kind, span, inner) = detect_delimiters(bytes).unwrap();
         assert_eq!(kind, DelimKind::Toml);
         assert_eq!(&bytes[span.end..], b"body\n");
+        assert_eq!(&bytes[inner], b"foo = \"bar\"");
     }
 
     #[test]
@@ -731,4 +811,161 @@ source_type: alien\n\
         assert_eq!(detect_lang(ko).as_deref(), Some("ko"));
         assert_eq!(detect_lang(en).as_deref(), Some("en"));
     }
+
+    // ---- C1: CRLF line endings ------------------------------------------------
+
+    #[test]
+    fn detect_delimiters_crlf_yaml() {
+        let bytes = b"---\r\ntitle: Doc\r\n---\r\nbody\r\n";
+        let (kind, span, inner) = detect_delimiters(bytes).unwrap();
+        assert_eq!(kind, DelimKind::Yaml);
+        assert_eq!(span.start, 0);
+        // span ends just past the CRLF after the closing "---".
+        assert_eq!(&bytes[span.end..], b"body\r\n");
+        // Inner is exactly the YAML payload, sans surrounding EOLs.
+        assert_eq!(&bytes[inner], b"title: Doc");
+    }
+
+    #[test]
+    fn detect_delimiters_crlf_toml() {
+        let bytes = b"+++\r\ntitle = \"Doc\"\r\n+++\r\nbody\r\n";
+        let (kind, span, inner) = detect_delimiters(bytes).unwrap();
+        assert_eq!(kind, DelimKind::Toml);
+        assert_eq!(&bytes[span.end..], b"body\r\n");
+        assert_eq!(&bytes[inner], b"title = \"Doc\"");
+    }
+
+    #[test]
+    fn parse_frontmatter_crlf_yaml_end_to_end() {
+        let bytes = b"---\r\n\
+title: Doc\r\n\
+created_at: 2024-03-01T00:00:00Z\r\n\
+updated_at: 2024-03-02T00:00:00Z\r\n\
+---\r\nbody\r\n";
+        let (meta, span, warns) = parse_frontmatter(bytes, &hints()).unwrap();
+        assert!(warns.is_empty(), "warnings: {warns:?}");
+        assert!(span.is_some());
+        assert_eq!(
+            meta.user.get("title").and_then(|v| v.as_str()),
+            Some("Doc")
+        );
+        assert_eq!(meta.created_at, datetime!(2024-03-01 00:00:00 UTC));
+        assert_eq!(meta.updated_at, datetime!(2024-03-02 00:00:00 UTC));
+    }
+
+    #[test]
+    fn parse_frontmatter_crlf_toml_end_to_end() {
+        let bytes = b"+++\r\n\
+title = \"Doc\"\r\n\
+created_at = \"2024-03-01T00:00:00Z\"\r\n\
++++\r\nbody\r\n";
+        let (meta, span, warns) = parse_frontmatter(bytes, &hints()).unwrap();
+        assert!(warns.is_empty(), "warnings: {warns:?}");
+        assert!(span.is_some());
+        assert_eq!(
+            meta.user.get("title").and_then(|v| v.as_str()),
+            Some("Doc")
+        );
+        assert_eq!(meta.created_at, datetime!(2024-03-01 00:00:00 UTC));
+    }
+
+    /// Mixed-EOL input: opening uses `\n`, closing uses `\r\n` (or vice
+    /// versa). Policy: each line is considered independently, so any
+    /// combination of LF / CRLF parses correctly. This keeps tools that
+    /// edit only one end of a file (e.g. an editor that auto-wraps the
+    /// last line) from breaking otherwise-valid frontmatter.
+    #[test]
+    fn parse_frontmatter_mixed_lf_crlf() {
+        // Opening LF, closing CRLF.
+        let a = b"---\ntitle: A\n---\r\nbody\n";
+        let (meta, _span, warns) = parse_frontmatter(a, &hints()).unwrap();
+        assert!(warns.is_empty(), "case A warnings: {warns:?}");
+        assert_eq!(meta.user.get("title").and_then(|v| v.as_str()), Some("A"));
+
+        // Opening CRLF, closing LF.
+        let b = b"---\r\ntitle: B\r\n---\nbody\n";
+        let (meta, _span, warns) = parse_frontmatter(b, &hints()).unwrap();
+        assert!(warns.is_empty(), "case B warnings: {warns:?}");
+        assert_eq!(meta.user.get("title").and_then(|v| v.as_str()), Some("B"));
+    }
+
+    // ---- I1: trailing whitespace on delimiter lines ---------------------------
+
+    #[test]
+    fn detect_delimiters_yaml_with_trailing_whitespace_on_opener() {
+        let bytes = b"---  \ntitle: x\n---\nbody\n";
+        let (kind, span, inner) = detect_delimiters(bytes).unwrap();
+        assert_eq!(kind, DelimKind::Yaml);
+        assert_eq!(span.start, 0);
+        assert_eq!(&bytes[span.end..], b"body\n");
+        assert_eq!(&bytes[inner], b"title: x");
+    }
+
+    #[test]
+    fn detect_delimiters_yaml_with_trailing_whitespace_on_closer() {
+        let bytes = b"---\ntitle: x\n---  \nbody\n";
+        let (kind, span, inner) = detect_delimiters(bytes).unwrap();
+        assert_eq!(kind, DelimKind::Yaml);
+        assert_eq!(&bytes[span.end..], b"body\n");
+        assert_eq!(&bytes[inner], b"title: x");
+    }
+
+    #[test]
+    fn detect_delimiters_yaml_with_tabs_on_delimiter_line() {
+        let bytes = b"---\t\ntitle: x\n---\nbody\n";
+        let (kind, span, _inner) = detect_delimiters(bytes).unwrap();
+        assert_eq!(kind, DelimKind::Yaml);
+        assert_eq!(&bytes[span.end..], b"body\n");
+    }
+
+    // ---- I2: UTF-8 BOM at file start ------------------------------------------
+
+    #[test]
+    fn detect_delimiters_yaml_with_leading_bom() {
+        let mut bytes = Vec::from([0xEF, 0xBB, 0xBF].as_slice());
+        bytes.extend_from_slice(b"---\ntitle: Doc\n---\nbody\n");
+        let (kind, span, inner) = detect_delimiters(&bytes).unwrap();
+        assert_eq!(kind, DelimKind::Yaml);
+        // Span starts after the BOM (byte 3), not at byte 0.
+        assert_eq!(span.start, 3);
+        // Body slicing using span.end gives the original bytes after the
+        // closing delimiter — no BOM bookkeeping required by callers.
+        assert_eq!(&bytes[span.end..], b"body\n");
+        assert_eq!(&bytes[inner], b"title: Doc");
+    }
+
+    #[test]
+    fn parse_frontmatter_with_leading_bom_full_pipeline() {
+        let mut bytes = Vec::from([0xEF, 0xBB, 0xBF].as_slice());
+        bytes.extend_from_slice(
+            b"---\n\
+title: Doc\n\
+lang: en\n\
+created_at: 2024-03-01T00:00:00Z\n\
+---\nbody\n",
+        );
+        let (meta, span, warns) = parse_frontmatter(&bytes, &hints()).unwrap();
+        assert!(warns.is_empty(), "warnings: {warns:?}");
+        let span = span.expect("span present");
+        assert_eq!(span.start, 3);
+        assert_eq!(
+            meta.user.get("title").and_then(|v| v.as_str()),
+            Some("Doc")
+        );
+        assert_eq!(meta.user.get("lang").and_then(|v| v.as_str()), Some("en"));
+        assert_eq!(meta.created_at, datetime!(2024-03-01 00:00:00 UTC));
+    }
+
+    /// BOM-shaped bytes appearing later in the input are NOT stripped — only
+    /// a BOM at byte 0 of the original input is honoured.
+    #[test]
+    fn detect_delimiters_does_not_strip_mid_input_bom() {
+        // Leading byte is `#`, then a BOM, then a delimiter — there is no
+        // frontmatter here regardless of whether we strip BOM, but pin the
+        // behaviour: detection still fails (no leading marker).
+        let mut bytes = Vec::from(b"# heading\n".as_slice());
+        bytes.extend_from_slice(&[0xEF, 0xBB, 0xBF]);
+        bytes.extend_from_slice(b"---\nfoo: bar\n---\n");
+        assert!(detect_delimiters(&bytes).is_none());
+    }
 }
-- 
2.49.1


From 5850bfcf7a23a1cea3007449305d477684224bc9 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Thu, 30 Apr 2026 13:13:16 +0000
Subject: [PATCH 6/6] p1-2: address review minors (FrontmatterSpan doc,
 parse_frontmatter rustdoc, YAML library note)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

M1: Reword the FrontmatterSpan doc-comment from "technically meant to be
crate-internal" to a forward-looking note about P1-3 / P1-4 callers using
bytes[span.end..] for body slicing.

M3: Add an explicit `# Errors` section to parse_frontmatter's rustdoc.
The current implementation never returns Err — all recoverable problems
are downgraded to warnings — but the Result is kept on the signature so
future hard-fail conditions can be added without breaking callers.

M4: Mention serde_yml in the library-choice rationale alongside
serde_yaml_ng, with a one-line note on why _ng was preferred (stricter
adherence to original serde_yaml semantics around null / tagged enums).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/kb-parse-md/src/frontmatter.rs | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/crates/kb-parse-md/src/frontmatter.rs b/crates/kb-parse-md/src/frontmatter.rs
index 6790854..3f37990 100644
--- a/crates/kb-parse-md/src/frontmatter.rs
+++ b/crates/kb-parse-md/src/frontmatter.rs
@@ -6,10 +6,14 @@
 //!
 //! # YAML library
 //!
-//! Upstream `serde_yaml` (dtolnay) was archived as unmaintained in 2024. We
-//! use [`serde_yaml_ng`], a maintained fork with an API-compatible surface,
-//! so a future swap to whichever fork wins (`serde_yml`, `yaml-rust2`, …)
-//! is a one-line dep change.
+//! Upstream `serde_yaml` (dtolnay) was archived as unmaintained in 2024. The
+//! two viable maintained forks at the time of this writing are `serde_yaml_ng`
+//! and `serde_yml`. We picked [`serde_yaml_ng`] because it advertises stricter
+//! adherence to the original `serde_yaml` semantics (notably around `null`
+//! handling and tagged enums) while `serde_yml` has taken some liberties
+//! around YAML 1.1 vs 1.2 booleans. Both are actively released; either would
+//! work and the swap is a one-line dep change should the ecosystem
+//! consolidate (incl. a future move to `yaml-rust2` directly).
 
 use std::ops::Range;
 use std::sync::OnceLock;
@@ -46,9 +50,8 @@ pub struct BodyHints {
 /// `end` is the offset just past the closing delimiter line's trailing
 /// newline (i.e. the body starts at `bytes[end..]`).
 ///
-/// Per the task brief this is technically meant to be crate-internal, but
-/// the [`parse_frontmatter`] return type forces it to be `pub`. P1-3 / P1-4
-/// reuse it via this same crate.
+/// Shared with future P1-3/P1-4 callers via the [`parse_frontmatter`] return
+/// tuple — they slice the body using `bytes[span.end..]`.
 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
 pub struct FrontmatterSpan {
     pub start: usize,
@@ -64,8 +67,15 @@ pub struct FrontmatterSpan {
 /// covers the delimited region so the caller can skip it during body
 /// slicing.
 ///
+/// # Errors
+///
 /// `Err` is reserved for genuinely fatal conditions (e.g. non-UTF-8 input
-/// that can't even be lossy-decoded), which currently cannot arise here.
+/// that can't even be lossy-decoded). The current implementation has no
+/// such path — every recoverable problem (missing/garbled frontmatter,
+/// malformed timestamps, unknown enum values) is downgraded to a warning
+/// and the function returns `Ok`. The `Result` is kept on the signature so
+/// future hard-fail conditions (e.g. an I/O-backed input) can be added
+/// without breaking callers.
 pub fn parse_frontmatter(
     bytes: &[u8],
     hints: &BodyHints,
-- 
2.49.1