p0-1: address review (reject # in WorkspacePath, parse error context)

- WorkspacePath: add `WorkspacePath::new(s)` validating constructor that
  rejects any string containing `#` (collides with the W3C-Media-Fragments
  separator that Citation URIs depend on). Doc-comment on the type now
  explains the invariant.
- normalize::to_posix changes signature to `Result<WorkspacePath, CoreError>`
  and now flows through `WorkspacePath::new`, so a path with `#` is rejected
  at construction rather than at every reader. Only one caller existed
  outside tests, so the signature change is contained.
- Citation::parse uses `WorkspacePath::new` on the path side. With multiple
  `#` separators in the input, `rsplit_once` would otherwise leave a `#` on
  the path; the new constructor closes the hole.
- Citation::parse + parse_hms_ms: every `bail!` / `anyhow!` site now quotes
  the offending substring (and the full input where useful) so error
  messages identify what went wrong without re-deriving from context.
- New tests: `to_posix_rejects_hash_in_path`,
  `parse_path_with_hash_rejected_at_to_posix_layer`. Existing
  `to_posix(...).0` call sites updated for the Result signature.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-30 08:52:15 +00:00
parent 286e62734d
commit ca0eb2f9cb
4 changed files with 104 additions and 26 deletions

View File

@@ -5,6 +5,7 @@ use std::path::PathBuf;
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use crate::errors::CoreError;
use crate::ids::AssetId;
use crate::media::{Checksum, MediaType};
@@ -17,10 +18,28 @@ pub enum SourceUri {
}
/// POSIX-relative path inside the workspace root (§6.6, §4.1). Always
/// produced via `crate::normalize::to_posix`.
/// produced via `crate::normalize::to_posix` (filesystem side) or
/// `WorkspacePath::new` (parse side). The inner string is forbidden from
/// containing the `#` character: a workspace path must never collide with
/// the W3C-Media-Fragments separator that `Citation` URIs rely on, so the
/// invariant is enforced at construction rather than at every call site.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct WorkspacePath(pub String);
impl WorkspacePath {
/// Construct a `WorkspacePath` from a string, rejecting any input that
/// contains `#`. Use this on the parser side (e.g. `Citation::parse`)
/// where the input does not flow through `to_posix`.
pub fn new(s: String) -> Result<Self, CoreError> {
if s.contains('#') {
return Err(CoreError::Malformed(format!(
"workspace path must not contain '#': {s:?}"
)));
}
Ok(Self(s))
}
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase", tag = "kind")]
pub enum AssetStorage {

View File

@@ -93,13 +93,20 @@ impl Citation {
Some(t) => t,
None => bail!("citation has no '#' fragment: {s:?}"),
};
let path = WorkspacePath(path_str.to_owned());
// `WorkspacePath::new` rejects any remaining `#` on the path side
// (e.g. the input had multiple `#` separators), closing the
// hash-in-path concern at construction rather than at every reader.
let path = WorkspacePath::new(path_str.to_owned())?;
if let Some(rest) = frag.strip_prefix("L") {
// line range: `L<a>` or `L<a>-L<b>`
if let Some((a, b)) = rest.split_once("-L") {
let start: u32 = a.parse().map_err(|_| anyhow::anyhow!("bad line start"))?;
let end: u32 = b.parse().map_err(|_| anyhow::anyhow!("bad line end"))?;
let start: u32 = a
.parse()
.map_err(|_| anyhow::anyhow!("bad line start in {a:?} (input {s:?})"))?;
let end: u32 = b
.parse()
.map_err(|_| anyhow::anyhow!("bad line end in {b:?} (input {s:?})"))?;
return Ok(Citation::Line {
path,
start,
@@ -107,7 +114,9 @@ impl Citation {
section: None,
});
}
let n: u32 = rest.parse().map_err(|_| anyhow::anyhow!("bad line number"))?;
let n: u32 = rest
.parse()
.map_err(|_| anyhow::anyhow!("bad line number in {rest:?} (input {s:?})"))?;
return Ok(Citation::Line {
path,
start: n,
@@ -116,7 +125,9 @@ impl Citation {
});
}
if let Some(rest) = frag.strip_prefix("p=") {
let page: u32 = rest.parse().map_err(|_| anyhow::anyhow!("bad page number"))?;
let page: u32 = rest
.parse()
.map_err(|_| anyhow::anyhow!("bad page number in {rest:?} (input {s:?})"))?;
return Ok(Citation::Page {
path,
page,
@@ -126,12 +137,20 @@ impl Citation {
if let Some(rest) = frag.strip_prefix("xywh=") {
let parts: Vec<&str> = rest.split(',').collect();
if parts.len() != 4 {
bail!("xywh= expects 4 comma-separated values: {rest:?}");
bail!("xywh= expects 4 comma-separated values, got {rest:?} (input {s:?})");
}
let x: u32 = parts[0].parse().map_err(|_| anyhow::anyhow!("bad xywh.x"))?;
let y: u32 = parts[1].parse().map_err(|_| anyhow::anyhow!("bad xywh.y"))?;
let w: u32 = parts[2].parse().map_err(|_| anyhow::anyhow!("bad xywh.w"))?;
let h: u32 = parts[3].parse().map_err(|_| anyhow::anyhow!("bad xywh.h"))?;
let x: u32 = parts[0]
.parse()
.map_err(|_| anyhow::anyhow!("bad xywh.x in {:?} (input {s:?})", parts[0]))?;
let y: u32 = parts[1]
.parse()
.map_err(|_| anyhow::anyhow!("bad xywh.y in {:?} (input {s:?})", parts[1]))?;
let w: u32 = parts[2]
.parse()
.map_err(|_| anyhow::anyhow!("bad xywh.w in {:?} (input {s:?})", parts[2]))?;
let h: u32 = parts[3]
.parse()
.map_err(|_| anyhow::anyhow!("bad xywh.h in {:?} (input {s:?})", parts[3]))?;
return Ok(Citation::Region { path, x, y, w, h });
}
if frag == "caption" {
@@ -145,13 +164,13 @@ impl Citation {
let (range, speaker) = match rest.split_once('&') {
Some((r, kv)) => match kv.strip_prefix("speaker=") {
Some(sp) => (r, Some(sp.to_owned())),
None => bail!("unknown time-fragment param: {kv:?}"),
None => bail!("unknown time-fragment param {kv:?} (input {s:?})"),
},
None => (rest, None),
};
let (s_str, e_str) = match range.split_once(',') {
Some(t) => t,
None => bail!("time fragment expects '<start>,<end>': {range:?}"),
None => bail!("time fragment expects '<start>,<end>', got {range:?} (input {s:?})"),
};
let start_ms = parse_hms_ms(s_str)?;
let end_ms = parse_hms_ms(e_str)?;
@@ -162,7 +181,7 @@ impl Citation {
speaker,
});
}
bail!("unrecognised citation fragment: {frag:?}")
bail!("unrecognised citation fragment {frag:?} (input {s:?})")
}
}
@@ -181,22 +200,32 @@ fn parse_hms_ms(s: &str) -> Result<u64> {
if parts.len() != 3 {
bail!("time component expects hh:mm:ss.mmm, got {s:?}");
}
let h: u64 = parts[0].parse().map_err(|_| anyhow::anyhow!("bad hours"))?;
let m: u64 = parts[1].parse().map_err(|_| anyhow::anyhow!("bad minutes"))?;
let h: u64 = parts[0]
.parse()
.map_err(|_| anyhow::anyhow!("bad hours in {:?} (input {s:?})", parts[0]))?;
let m: u64 = parts[1]
.parse()
.map_err(|_| anyhow::anyhow!("bad minutes in {:?} (input {s:?})", parts[1]))?;
let (sec, ms) = match parts[2].split_once('.') {
Some((s_part, ms_part)) => {
let sec: u64 = s_part.parse().map_err(|_| anyhow::anyhow!("bad seconds"))?;
let sec: u64 = s_part
.parse()
.map_err(|_| anyhow::anyhow!("bad seconds in {s_part:?} (input {s:?})"))?;
// Pad/truncate to exactly 3 digits.
let mut ms_str = ms_part.to_owned();
while ms_str.len() < 3 {
ms_str.push('0');
}
ms_str.truncate(3);
let ms: u64 = ms_str.parse().map_err(|_| anyhow::anyhow!("bad milliseconds"))?;
let ms: u64 = ms_str
.parse()
.map_err(|_| anyhow::anyhow!("bad milliseconds in {ms_part:?} (input {s:?})"))?;
(sec, ms)
}
None => {
let sec: u64 = parts[2].parse().map_err(|_| anyhow::anyhow!("bad seconds"))?;
let sec: u64 = parts[2]
.parse()
.map_err(|_| anyhow::anyhow!("bad seconds in {:?} (input {s:?})", parts[2]))?;
(sec, 0)
}
};
@@ -208,7 +237,7 @@ mod tests {
use super::*;
fn p(s: &str) -> WorkspacePath {
WorkspacePath(s.to_owned())
WorkspacePath::new(s.to_owned()).expect("test paths must not contain '#'")
}
#[test]
@@ -313,4 +342,16 @@ mod tests {
fn parse_rejects_unknown_fragment() {
assert!(Citation::parse("a.md#mystery=1").is_err());
}
/// `rsplit_once('#')` would otherwise leave a `#` on the path side when
/// the input contains multiple `#` separators (e.g. someone embeds a
/// fake fragment in the path). The `WorkspacePath::new` constructor
/// closes that hole at construction time.
#[test]
fn parse_path_with_hash_rejected_at_to_posix_layer() {
// `notes/x#evil.md#L7` — rsplit_once strips `#L7`, leaving
// `notes/x#evil.md` on the path side. WorkspacePath::new must reject.
let r = Citation::parse("notes/x#evil.md#L7");
assert!(r.is_err(), "path with embedded '#' must be rejected");
}
}

View File

@@ -309,7 +309,7 @@ mod tests {
#[test]
fn id_for_doc_pinned() {
let asset = AssetId("6cb0ef0eb89c63b8b6e76ec53dca6e7d".to_string());
let path = WorkspacePath("notes/test.md".to_string());
let path = WorkspacePath::new("notes/test.md".to_string()).unwrap();
let pv = ParserVersion("pulldown-cmark-0.x".to_string());
let id = id_for_doc(&path, &asset, &pv);
assert_eq!(id.0, "8547fe58cb42d593fd761d77242401db");

View File

@@ -5,6 +5,7 @@ use std::path::{Component, Path};
use unicode_normalization::UnicodeNormalization;
use crate::asset::WorkspacePath;
use crate::errors::CoreError;
/// NFC-normalize a UTF-8 string (§4.1).
pub fn nfc(input: &str) -> String {
@@ -16,7 +17,11 @@ pub fn nfc(input: &str) -> String {
/// - strip a leading `./`
/// - collapse repeated slashes
/// - NFC-normalize
pub fn to_posix(path: &Path) -> WorkspacePath {
///
/// Returns `Err(CoreError::Malformed(..))` if the resulting POSIX form
/// contains `#`, since `WorkspacePath` is forbidden from colliding with
/// the W3C-Media-Fragments separator that `Citation` URIs depend on.
pub fn to_posix(path: &Path) -> Result<WorkspacePath, CoreError> {
let mut out = String::new();
let mut first = true;
for comp in path.components() {
@@ -52,7 +57,7 @@ pub fn to_posix(path: &Path) -> WorkspacePath {
if out.is_empty() {
out.push('.');
}
WorkspacePath(nfc(&out))
WorkspacePath::new(nfc(&out))
}
#[cfg(test)]
@@ -64,7 +69,7 @@ mod tests {
let p = Path::new("./a//b.md");
// `Path::components` already collapses `//` on POSIX; the test
// doc-fixed example asserts the final string is `a/b.md`.
assert_eq!(to_posix(p).0, "a/b.md");
assert_eq!(to_posix(p).unwrap().0, "a/b.md");
}
#[test]
@@ -74,8 +79,11 @@ mod tests {
// collapse, so the WorkspacePath comes out NFC regardless of input.
let nfd = "\u{1100}\u{1161}.md";
let nfc_str = "\u{AC00}.md";
assert_eq!(to_posix(Path::new(nfd)).0, to_posix(Path::new(nfc_str)).0);
assert_eq!(to_posix(Path::new(nfd)).0, "\u{AC00}.md");
assert_eq!(
to_posix(Path::new(nfd)).unwrap().0,
to_posix(Path::new(nfc_str)).unwrap().0
);
assert_eq!(to_posix(Path::new(nfd)).unwrap().0, "\u{AC00}.md");
}
#[test]
@@ -83,4 +91,14 @@ mod tests {
let s = "\u{AC00}";
assert_eq!(nfc(s), s);
}
#[test]
fn to_posix_rejects_hash_in_path() {
// `#` collides with the W3C-Media-Fragments separator used by
// `Citation`; the WorkspacePath invariant rejects it at construction.
let p = Path::new("notes/has#hash.md");
let err = to_posix(p).expect_err("# in path must be rejected");
let msg = format!("{err}");
assert!(msg.contains('#'), "error message should mention '#': {msg}");
}
}