From ca0eb2f9cb58cc0f11265d1299bcc8f92df3cbb8 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Thu, 30 Apr 2026 08:52:15 +0000
Subject: [PATCH] p0-1: address review (reject # in WorkspacePath, parse error
 context)

- WorkspacePath: add `WorkspacePath::new(s)` validating constructor that
  rejects any string containing `#` (collides with the W3C-Media-Fragments
  separator that Citation URIs depend on). Doc-comment on the type now
  explains the invariant.
- normalize::to_posix changes signature to `Result<WorkspacePath, CoreError>`
  and now flows through `WorkspacePath::new`, so a path with `#` is rejected
  at construction rather than at every reader. Only one caller existed
  outside tests, so the signature change is contained.
- Citation::parse uses `WorkspacePath::new` on the path side. With multiple
  `#` separators in the input, `rsplit_once` would otherwise leave a `#` on
  the path; the new constructor closes the hole.
- Citation::parse + parse_hms_ms: every `bail!` / `anyhow!` site now quotes
  the offending substring (and the full input where useful) so error
  messages identify what went wrong without re-deriving from context.
- New tests: `to_posix_rejects_hash_in_path`,
  `parse_path_with_hash_rejected_at_to_posix_layer`. Existing
  `to_posix(...).0` call sites updated for the Result signature.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/kb-core/src/asset.rs     | 21 ++++++++-
 crates/kb-core/src/citation.rs  | 79 +++++++++++++++++++++++++--------
 crates/kb-core/src/ids.rs       |  2 +-
 crates/kb-core/src/normalize.rs | 28 +++++++++---
 4 files changed, 104 insertions(+), 26 deletions(-)

diff --git a/crates/kb-core/src/asset.rs b/crates/kb-core/src/asset.rs
index f25e7ba..8532175 100644
--- a/crates/kb-core/src/asset.rs
+++ b/crates/kb-core/src/asset.rs
@@ -5,6 +5,7 @@ use std::path::PathBuf;
 use serde::{Deserialize, Serialize};
 use time::OffsetDateTime;
 
+use crate::errors::CoreError;
 use crate::ids::AssetId;
 use crate::media::{Checksum, MediaType};
 
@@ -17,10 +18,28 @@ pub enum SourceUri {
 }
 
 /// POSIX-relative path inside the workspace root (§6.6, §4.1). Always
-/// produced via `crate::normalize::to_posix`.
+/// produced via `crate::normalize::to_posix` (filesystem side) or
+/// `WorkspacePath::new` (parse side). The inner string is forbidden from
+/// containing the `#` character: a workspace path must never collide with
+/// the W3C-Media-Fragments separator that `Citation` URIs rely on, so the
+/// invariant is enforced at construction rather than at every call site.
 #[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
 pub struct WorkspacePath(pub String);
 
+impl WorkspacePath {
+    /// Construct a `WorkspacePath` from a string, rejecting any input that
+    /// contains `#`. Use this on the parser side (e.g. `Citation::parse`)
+    /// where the input does not flow through `to_posix`.
+    pub fn new(s: String) -> Result<Self, CoreError> {
+        if s.contains('#') {
+            return Err(CoreError::Malformed(format!(
+                "workspace path must not contain '#': {s:?}"
+            )));
+        }
+        Ok(Self(s))
+    }
+}
+
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 #[serde(rename_all = "lowercase", tag = "kind")]
 pub enum AssetStorage {
diff --git a/crates/kb-core/src/citation.rs b/crates/kb-core/src/citation.rs
index 13299bd..80adb67 100644
--- a/crates/kb-core/src/citation.rs
+++ b/crates/kb-core/src/citation.rs
@@ -93,13 +93,20 @@ impl Citation {
             Some(t) => t,
             None => bail!("citation has no '#' fragment: {s:?}"),
         };
-        let path = WorkspacePath(path_str.to_owned());
+        // `WorkspacePath::new` rejects any remaining `#` on the path side
+        // (e.g. the input had multiple `#` separators), closing the
+        // hash-in-path concern at construction rather than at every reader.
+        let path = WorkspacePath::new(path_str.to_owned())?;
 
         if let Some(rest) = frag.strip_prefix("L") {
             // line range: `L<a>` or `L<a>-L<b>`
             if let Some((a, b)) = rest.split_once("-L") {
-                let start: u32 = a.parse().map_err(|_| anyhow::anyhow!("bad line start"))?;
-                let end: u32 = b.parse().map_err(|_| anyhow::anyhow!("bad line end"))?;
+                let start: u32 = a
+                    .parse()
+                    .map_err(|_| anyhow::anyhow!("bad line start in {a:?} (input {s:?})"))?;
+                let end: u32 = b
+                    .parse()
+                    .map_err(|_| anyhow::anyhow!("bad line end in {b:?} (input {s:?})"))?;
                 return Ok(Citation::Line {
                     path,
                     start,
@@ -107,7 +114,9 @@ impl Citation {
                     section: None,
                 });
             }
-            let n: u32 = rest.parse().map_err(|_| anyhow::anyhow!("bad line number"))?;
+            let n: u32 = rest
+                .parse()
+                .map_err(|_| anyhow::anyhow!("bad line number in {rest:?} (input {s:?})"))?;
             return Ok(Citation::Line {
                 path,
                 start: n,
@@ -116,7 +125,9 @@ impl Citation {
             });
         }
         if let Some(rest) = frag.strip_prefix("p=") {
-            let page: u32 = rest.parse().map_err(|_| anyhow::anyhow!("bad page number"))?;
+            let page: u32 = rest
+                .parse()
+                .map_err(|_| anyhow::anyhow!("bad page number in {rest:?} (input {s:?})"))?;
             return Ok(Citation::Page {
                 path,
                 page,
@@ -126,12 +137,20 @@ impl Citation {
         if let Some(rest) = frag.strip_prefix("xywh=") {
             let parts: Vec<&str> = rest.split(',').collect();
             if parts.len() != 4 {
-                bail!("xywh= expects 4 comma-separated values: {rest:?}");
+                bail!("xywh= expects 4 comma-separated values, got {rest:?} (input {s:?})");
             }
-            let x: u32 = parts[0].parse().map_err(|_| anyhow::anyhow!("bad xywh.x"))?;
-            let y: u32 = parts[1].parse().map_err(|_| anyhow::anyhow!("bad xywh.y"))?;
-            let w: u32 = parts[2].parse().map_err(|_| anyhow::anyhow!("bad xywh.w"))?;
-            let h: u32 = parts[3].parse().map_err(|_| anyhow::anyhow!("bad xywh.h"))?;
+            let x: u32 = parts[0]
+                .parse()
+                .map_err(|_| anyhow::anyhow!("bad xywh.x in {:?} (input {s:?})", parts[0]))?;
+            let y: u32 = parts[1]
+                .parse()
+                .map_err(|_| anyhow::anyhow!("bad xywh.y in {:?} (input {s:?})", parts[1]))?;
+            let w: u32 = parts[2]
+                .parse()
+                .map_err(|_| anyhow::anyhow!("bad xywh.w in {:?} (input {s:?})", parts[2]))?;
+            let h: u32 = parts[3]
+                .parse()
+                .map_err(|_| anyhow::anyhow!("bad xywh.h in {:?} (input {s:?})", parts[3]))?;
             return Ok(Citation::Region { path, x, y, w, h });
         }
         if frag == "caption" {
@@ -145,13 +164,13 @@ impl Citation {
             let (range, speaker) = match rest.split_once('&') {
                 Some((r, kv)) => match kv.strip_prefix("speaker=") {
                     Some(sp) => (r, Some(sp.to_owned())),
-                    None => bail!("unknown time-fragment param: {kv:?}"),
+                    None => bail!("unknown time-fragment param {kv:?} (input {s:?})"),
                 },
                 None => (rest, None),
             };
             let (s_str, e_str) = match range.split_once(',') {
                 Some(t) => t,
-                None => bail!("time fragment expects '<start>,<end>': {range:?}"),
+                None => bail!("time fragment expects '<start>,<end>', got {range:?} (input {s:?})"),
             };
             let start_ms = parse_hms_ms(s_str)?;
             let end_ms = parse_hms_ms(e_str)?;
@@ -162,7 +181,7 @@ impl Citation {
                 speaker,
             });
         }
-        bail!("unrecognised citation fragment: {frag:?}")
+        bail!("unrecognised citation fragment {frag:?} (input {s:?})")
     }
 }
 
@@ -181,22 +200,32 @@ fn parse_hms_ms(s: &str) -> Result<u64> {
     if parts.len() != 3 {
         bail!("time component expects hh:mm:ss.mmm, got {s:?}");
     }
-    let h: u64 = parts[0].parse().map_err(|_| anyhow::anyhow!("bad hours"))?;
-    let m: u64 = parts[1].parse().map_err(|_| anyhow::anyhow!("bad minutes"))?;
+    let h: u64 = parts[0]
+        .parse()
+        .map_err(|_| anyhow::anyhow!("bad hours in {:?} (input {s:?})", parts[0]))?;
+    let m: u64 = parts[1]
+        .parse()
+        .map_err(|_| anyhow::anyhow!("bad minutes in {:?} (input {s:?})", parts[1]))?;
     let (sec, ms) = match parts[2].split_once('.') {
         Some((s_part, ms_part)) => {
-            let sec: u64 = s_part.parse().map_err(|_| anyhow::anyhow!("bad seconds"))?;
+            let sec: u64 = s_part
+                .parse()
+                .map_err(|_| anyhow::anyhow!("bad seconds in {s_part:?} (input {s:?})"))?;
             // Pad/truncate to exactly 3 digits.
             let mut ms_str = ms_part.to_owned();
             while ms_str.len() < 3 {
                 ms_str.push('0');
             }
             ms_str.truncate(3);
-            let ms: u64 = ms_str.parse().map_err(|_| anyhow::anyhow!("bad milliseconds"))?;
+            let ms: u64 = ms_str
+                .parse()
+                .map_err(|_| anyhow::anyhow!("bad milliseconds in {ms_part:?} (input {s:?})"))?;
             (sec, ms)
         }
         None => {
-            let sec: u64 = parts[2].parse().map_err(|_| anyhow::anyhow!("bad seconds"))?;
+            let sec: u64 = parts[2]
+                .parse()
+                .map_err(|_| anyhow::anyhow!("bad seconds in {:?} (input {s:?})", parts[2]))?;
             (sec, 0)
         }
     };
@@ -208,7 +237,7 @@ mod tests {
     use super::*;
 
     fn p(s: &str) -> WorkspacePath {
-        WorkspacePath(s.to_owned())
+        WorkspacePath::new(s.to_owned()).expect("test paths must not contain '#'")
     }
 
     #[test]
@@ -313,4 +342,16 @@ mod tests {
     fn parse_rejects_unknown_fragment() {
         assert!(Citation::parse("a.md#mystery=1").is_err());
     }
+
+    /// `rsplit_once('#')` would otherwise leave a `#` on the path side when
+    /// the input contains multiple `#` separators (e.g. someone embeds a
+    /// fake fragment in the path). The `WorkspacePath::new` constructor
+    /// closes that hole at construction time.
+    #[test]
+    fn parse_path_with_hash_rejected_at_to_posix_layer() {
+        // `notes/x#evil.md#L7` — rsplit_once strips `#L7`, leaving
+        // `notes/x#evil.md` on the path side. WorkspacePath::new must reject.
+        let r = Citation::parse("notes/x#evil.md#L7");
+        assert!(r.is_err(), "path with embedded '#' must be rejected");
+    }
 }
diff --git a/crates/kb-core/src/ids.rs b/crates/kb-core/src/ids.rs
index a9992c4..9e09a5a 100644
--- a/crates/kb-core/src/ids.rs
+++ b/crates/kb-core/src/ids.rs
@@ -309,7 +309,7 @@ mod tests {
     #[test]
     fn id_for_doc_pinned() {
         let asset = AssetId("6cb0ef0eb89c63b8b6e76ec53dca6e7d".to_string());
-        let path = WorkspacePath("notes/test.md".to_string());
+        let path = WorkspacePath::new("notes/test.md".to_string()).unwrap();
         let pv = ParserVersion("pulldown-cmark-0.x".to_string());
         let id = id_for_doc(&path, &asset, &pv);
         assert_eq!(id.0, "8547fe58cb42d593fd761d77242401db");
diff --git a/crates/kb-core/src/normalize.rs b/crates/kb-core/src/normalize.rs
index d23858f..c4c20b9 100644
--- a/crates/kb-core/src/normalize.rs
+++ b/crates/kb-core/src/normalize.rs
@@ -5,6 +5,7 @@ use std::path::{Component, Path};
 use unicode_normalization::UnicodeNormalization;
 
 use crate::asset::WorkspacePath;
+use crate::errors::CoreError;
 
 /// NFC-normalize a UTF-8 string (§4.1).
 pub fn nfc(input: &str) -> String {
@@ -16,7 +17,11 @@ pub fn nfc(input: &str) -> String {
 /// - strip a leading `./`
 /// - collapse repeated slashes
 /// - NFC-normalize
-pub fn to_posix(path: &Path) -> WorkspacePath {
+///
+/// Returns `Err(CoreError::Malformed(..))` if the resulting POSIX form
+/// contains `#`, since `WorkspacePath` is forbidden from colliding with
+/// the W3C-Media-Fragments separator that `Citation` URIs depend on.
+pub fn to_posix(path: &Path) -> Result<WorkspacePath, CoreError> {
     let mut out = String::new();
     let mut first = true;
     for comp in path.components() {
@@ -52,7 +57,7 @@ pub fn to_posix(path: &Path) -> WorkspacePath {
     if out.is_empty() {
         out.push('.');
     }
-    WorkspacePath(nfc(&out))
+    WorkspacePath::new(nfc(&out))
 }
 
 #[cfg(test)]
@@ -64,7 +69,7 @@ mod tests {
         let p = Path::new("./a//b.md");
         // `Path::components` already collapses `//` on POSIX; the test
         // doc-fixed example asserts the final string is `a/b.md`.
-        assert_eq!(to_posix(p).0, "a/b.md");
+        assert_eq!(to_posix(p).unwrap().0, "a/b.md");
     }
 
     #[test]
@@ -74,8 +79,11 @@ mod tests {
         // collapse, so the WorkspacePath comes out NFC regardless of input.
         let nfd = "\u{1100}\u{1161}.md";
         let nfc_str = "\u{AC00}.md";
-        assert_eq!(to_posix(Path::new(nfd)).0, to_posix(Path::new(nfc_str)).0);
-        assert_eq!(to_posix(Path::new(nfd)).0, "\u{AC00}.md");
+        assert_eq!(
+            to_posix(Path::new(nfd)).unwrap().0,
+            to_posix(Path::new(nfc_str)).unwrap().0
+        );
+        assert_eq!(to_posix(Path::new(nfd)).unwrap().0, "\u{AC00}.md");
     }
 
     #[test]
@@ -83,4 +91,14 @@ mod tests {
         let s = "\u{AC00}";
         assert_eq!(nfc(s), s);
     }
+
+    #[test]
+    fn to_posix_rejects_hash_in_path() {
+        // `#` collides with the W3C-Media-Fragments separator used by
+        // `Citation`; the WorkspacePath invariant rejects it at construction.
+        let p = Path::new("notes/has#hash.md");
+        let err = to_posix(p).expect_err("# in path must be rejected");
+        let msg = format!("{err}");
+        assert!(msg.contains('#'), "error message should mention '#': {msg}");
+    }
 }