From ca0eb2f9cb58cc0f11265d1299bcc8f92df3cbb8 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 08:52:15 +0000 Subject: [PATCH] p0-1: address review (reject # in WorkspacePath, parse error context) - WorkspacePath: add `WorkspacePath::new(s)` validating constructor that rejects any string containing `#` (collides with the W3C-Media-Fragments separator that Citation URIs depend on). Doc-comment on the type now explains the invariant. - normalize::to_posix changes signature to `Result` and now flows through `WorkspacePath::new`, so a path with `#` is rejected at construction rather than at every reader. Only one caller existed outside tests, so the signature change is contained. - Citation::parse uses `WorkspacePath::new` on the path side. With multiple `#` separators in the input, `rsplit_once` would otherwise leave a `#` on the path; the new constructor closes the hole. - Citation::parse + parse_hms_ms: every `bail!` / `anyhow!` site now quotes the offending substring (and the full input where useful) so error messages identify what went wrong without re-deriving from context. - New tests: `to_posix_rejects_hash_in_path`, `parse_path_with_hash_rejected_at_to_posix_layer`. Existing `to_posix(...).0` call sites updated for the Result signature. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kb-core/src/asset.rs | 21 ++++++++- crates/kb-core/src/citation.rs | 79 +++++++++++++++++++++++++-------- crates/kb-core/src/ids.rs | 2 +- crates/kb-core/src/normalize.rs | 28 +++++++++--- 4 files changed, 104 insertions(+), 26 deletions(-) diff --git a/crates/kb-core/src/asset.rs b/crates/kb-core/src/asset.rs index f25e7ba..8532175 100644 --- a/crates/kb-core/src/asset.rs +++ b/crates/kb-core/src/asset.rs @@ -5,6 +5,7 @@ use std::path::PathBuf; use serde::{Deserialize, Serialize}; use time::OffsetDateTime; +use crate::errors::CoreError; use crate::ids::AssetId; use crate::media::{Checksum, MediaType}; @@ -17,10 +18,28 @@ pub enum SourceUri { } /// POSIX-relative path inside the workspace root (§6.6, §4.1). Always -/// produced via `crate::normalize::to_posix`. +/// produced via `crate::normalize::to_posix` (filesystem side) or +/// `WorkspacePath::new` (parse side). The inner string is forbidden from +/// containing the `#` character: a workspace path must never collide with +/// the W3C-Media-Fragments separator that `Citation` URIs rely on, so the +/// invariant is enforced at construction rather than at every call site. #[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] pub struct WorkspacePath(pub String); +impl WorkspacePath { + /// Construct a `WorkspacePath` from a string, rejecting any input that + /// contains `#`. Use this on the parser side (e.g. `Citation::parse`) + /// where the input does not flow through `to_posix`. + pub fn new(s: String) -> Result { + if s.contains('#') { + return Err(CoreError::Malformed(format!( + "workspace path must not contain '#': {s:?}" + ))); + } + Ok(Self(s)) + } +} + #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "lowercase", tag = "kind")] pub enum AssetStorage { diff --git a/crates/kb-core/src/citation.rs b/crates/kb-core/src/citation.rs index 13299bd..80adb67 100644 --- a/crates/kb-core/src/citation.rs +++ b/crates/kb-core/src/citation.rs @@ -93,13 +93,20 @@ impl Citation { Some(t) => t, None => bail!("citation has no '#' fragment: {s:?}"), }; - let path = WorkspacePath(path_str.to_owned()); + // `WorkspacePath::new` rejects any remaining `#` on the path side + // (e.g. the input had multiple `#` separators), closing the + // hash-in-path concern at construction rather than at every reader. + let path = WorkspacePath::new(path_str.to_owned())?; if let Some(rest) = frag.strip_prefix("L") { // line range: `L` or `L-L` if let Some((a, b)) = rest.split_once("-L") { - let start: u32 = a.parse().map_err(|_| anyhow::anyhow!("bad line start"))?; - let end: u32 = b.parse().map_err(|_| anyhow::anyhow!("bad line end"))?; + let start: u32 = a + .parse() + .map_err(|_| anyhow::anyhow!("bad line start in {a:?} (input {s:?})"))?; + let end: u32 = b + .parse() + .map_err(|_| anyhow::anyhow!("bad line end in {b:?} (input {s:?})"))?; return Ok(Citation::Line { path, start, @@ -107,7 +114,9 @@ impl Citation { section: None, }); } - let n: u32 = rest.parse().map_err(|_| anyhow::anyhow!("bad line number"))?; + let n: u32 = rest + .parse() + .map_err(|_| anyhow::anyhow!("bad line number in {rest:?} (input {s:?})"))?; return Ok(Citation::Line { path, start: n, @@ -116,7 +125,9 @@ impl Citation { }); } if let Some(rest) = frag.strip_prefix("p=") { - let page: u32 = rest.parse().map_err(|_| anyhow::anyhow!("bad page number"))?; + let page: u32 = rest + .parse() + .map_err(|_| anyhow::anyhow!("bad page number in {rest:?} (input {s:?})"))?; return Ok(Citation::Page { path, page, @@ -126,12 +137,20 @@ impl Citation { if let Some(rest) = frag.strip_prefix("xywh=") { let parts: Vec<&str> = rest.split(',').collect(); if parts.len() != 4 { - bail!("xywh= expects 4 comma-separated values: {rest:?}"); + bail!("xywh= expects 4 comma-separated values, got {rest:?} (input {s:?})"); } - let x: u32 = parts[0].parse().map_err(|_| anyhow::anyhow!("bad xywh.x"))?; - let y: u32 = parts[1].parse().map_err(|_| anyhow::anyhow!("bad xywh.y"))?; - let w: u32 = parts[2].parse().map_err(|_| anyhow::anyhow!("bad xywh.w"))?; - let h: u32 = parts[3].parse().map_err(|_| anyhow::anyhow!("bad xywh.h"))?; + let x: u32 = parts[0] + .parse() + .map_err(|_| anyhow::anyhow!("bad xywh.x in {:?} (input {s:?})", parts[0]))?; + let y: u32 = parts[1] + .parse() + .map_err(|_| anyhow::anyhow!("bad xywh.y in {:?} (input {s:?})", parts[1]))?; + let w: u32 = parts[2] + .parse() + .map_err(|_| anyhow::anyhow!("bad xywh.w in {:?} (input {s:?})", parts[2]))?; + let h: u32 = parts[3] + .parse() + .map_err(|_| anyhow::anyhow!("bad xywh.h in {:?} (input {s:?})", parts[3]))?; return Ok(Citation::Region { path, x, y, w, h }); } if frag == "caption" { @@ -145,13 +164,13 @@ impl Citation { let (range, speaker) = match rest.split_once('&') { Some((r, kv)) => match kv.strip_prefix("speaker=") { Some(sp) => (r, Some(sp.to_owned())), - None => bail!("unknown time-fragment param: {kv:?}"), + None => bail!("unknown time-fragment param {kv:?} (input {s:?})"), }, None => (rest, None), }; let (s_str, e_str) = match range.split_once(',') { Some(t) => t, - None => bail!("time fragment expects ',': {range:?}"), + None => bail!("time fragment expects ',', got {range:?} (input {s:?})"), }; let start_ms = parse_hms_ms(s_str)?; let end_ms = parse_hms_ms(e_str)?; @@ -162,7 +181,7 @@ impl Citation { speaker, }); } - bail!("unrecognised citation fragment: {frag:?}") + bail!("unrecognised citation fragment {frag:?} (input {s:?})") } } @@ -181,22 +200,32 @@ fn parse_hms_ms(s: &str) -> Result { if parts.len() != 3 { bail!("time component expects hh:mm:ss.mmm, got {s:?}"); } - let h: u64 = parts[0].parse().map_err(|_| anyhow::anyhow!("bad hours"))?; - let m: u64 = parts[1].parse().map_err(|_| anyhow::anyhow!("bad minutes"))?; + let h: u64 = parts[0] + .parse() + .map_err(|_| anyhow::anyhow!("bad hours in {:?} (input {s:?})", parts[0]))?; + let m: u64 = parts[1] + .parse() + .map_err(|_| anyhow::anyhow!("bad minutes in {:?} (input {s:?})", parts[1]))?; let (sec, ms) = match parts[2].split_once('.') { Some((s_part, ms_part)) => { - let sec: u64 = s_part.parse().map_err(|_| anyhow::anyhow!("bad seconds"))?; + let sec: u64 = s_part + .parse() + .map_err(|_| anyhow::anyhow!("bad seconds in {s_part:?} (input {s:?})"))?; // Pad/truncate to exactly 3 digits. let mut ms_str = ms_part.to_owned(); while ms_str.len() < 3 { ms_str.push('0'); } ms_str.truncate(3); - let ms: u64 = ms_str.parse().map_err(|_| anyhow::anyhow!("bad milliseconds"))?; + let ms: u64 = ms_str + .parse() + .map_err(|_| anyhow::anyhow!("bad milliseconds in {ms_part:?} (input {s:?})"))?; (sec, ms) } None => { - let sec: u64 = parts[2].parse().map_err(|_| anyhow::anyhow!("bad seconds"))?; + let sec: u64 = parts[2] + .parse() + .map_err(|_| anyhow::anyhow!("bad seconds in {:?} (input {s:?})", parts[2]))?; (sec, 0) } }; @@ -208,7 +237,7 @@ mod tests { use super::*; fn p(s: &str) -> WorkspacePath { - WorkspacePath(s.to_owned()) + WorkspacePath::new(s.to_owned()).expect("test paths must not contain '#'") } #[test] @@ -313,4 +342,16 @@ mod tests { fn parse_rejects_unknown_fragment() { assert!(Citation::parse("a.md#mystery=1").is_err()); } + + /// `rsplit_once('#')` would otherwise leave a `#` on the path side when + /// the input contains multiple `#` separators (e.g. someone embeds a + /// fake fragment in the path). The `WorkspacePath::new` constructor + /// closes that hole at construction time. + #[test] + fn parse_path_with_hash_rejected_at_to_posix_layer() { + // `notes/x#evil.md#L7` — rsplit_once strips `#L7`, leaving + // `notes/x#evil.md` on the path side. WorkspacePath::new must reject. + let r = Citation::parse("notes/x#evil.md#L7"); + assert!(r.is_err(), "path with embedded '#' must be rejected"); + } } diff --git a/crates/kb-core/src/ids.rs b/crates/kb-core/src/ids.rs index a9992c4..9e09a5a 100644 --- a/crates/kb-core/src/ids.rs +++ b/crates/kb-core/src/ids.rs @@ -309,7 +309,7 @@ mod tests { #[test] fn id_for_doc_pinned() { let asset = AssetId("6cb0ef0eb89c63b8b6e76ec53dca6e7d".to_string()); - let path = WorkspacePath("notes/test.md".to_string()); + let path = WorkspacePath::new("notes/test.md".to_string()).unwrap(); let pv = ParserVersion("pulldown-cmark-0.x".to_string()); let id = id_for_doc(&path, &asset, &pv); assert_eq!(id.0, "8547fe58cb42d593fd761d77242401db"); diff --git a/crates/kb-core/src/normalize.rs b/crates/kb-core/src/normalize.rs index d23858f..c4c20b9 100644 --- a/crates/kb-core/src/normalize.rs +++ b/crates/kb-core/src/normalize.rs @@ -5,6 +5,7 @@ use std::path::{Component, Path}; use unicode_normalization::UnicodeNormalization; use crate::asset::WorkspacePath; +use crate::errors::CoreError; /// NFC-normalize a UTF-8 string (§4.1). pub fn nfc(input: &str) -> String { @@ -16,7 +17,11 @@ pub fn nfc(input: &str) -> String { /// - strip a leading `./` /// - collapse repeated slashes /// - NFC-normalize -pub fn to_posix(path: &Path) -> WorkspacePath { +/// +/// Returns `Err(CoreError::Malformed(..))` if the resulting POSIX form +/// contains `#`, since `WorkspacePath` is forbidden from colliding with +/// the W3C-Media-Fragments separator that `Citation` URIs depend on. +pub fn to_posix(path: &Path) -> Result { let mut out = String::new(); let mut first = true; for comp in path.components() { @@ -52,7 +57,7 @@ pub fn to_posix(path: &Path) -> WorkspacePath { if out.is_empty() { out.push('.'); } - WorkspacePath(nfc(&out)) + WorkspacePath::new(nfc(&out)) } #[cfg(test)] @@ -64,7 +69,7 @@ mod tests { let p = Path::new("./a//b.md"); // `Path::components` already collapses `//` on POSIX; the test // doc-fixed example asserts the final string is `a/b.md`. - assert_eq!(to_posix(p).0, "a/b.md"); + assert_eq!(to_posix(p).unwrap().0, "a/b.md"); } #[test] @@ -74,8 +79,11 @@ mod tests { // collapse, so the WorkspacePath comes out NFC regardless of input. let nfd = "\u{1100}\u{1161}.md"; let nfc_str = "\u{AC00}.md"; - assert_eq!(to_posix(Path::new(nfd)).0, to_posix(Path::new(nfc_str)).0); - assert_eq!(to_posix(Path::new(nfd)).0, "\u{AC00}.md"); + assert_eq!( + to_posix(Path::new(nfd)).unwrap().0, + to_posix(Path::new(nfc_str)).unwrap().0 + ); + assert_eq!(to_posix(Path::new(nfd)).unwrap().0, "\u{AC00}.md"); } #[test] @@ -83,4 +91,14 @@ mod tests { let s = "\u{AC00}"; assert_eq!(nfc(s), s); } + + #[test] + fn to_posix_rejects_hash_in_path() { + // `#` collides with the W3C-Media-Fragments separator used by + // `Citation`; the WorkspacePath invariant rejects it at construction. + let p = Path::new("notes/has#hash.md"); + let err = to_posix(p).expect_err("# in path must be rejected"); + let msg = format!("{err}"); + assert!(msg.contains('#'), "error message should mention '#': {msg}"); + } }