p0-1: address review (reject # in WorkspacePath, parse error context)
- WorkspacePath: add `WorkspacePath::new(s)` validating constructor that rejects any string containing `#` (collides with the W3C-Media-Fragments separator that Citation URIs depend on). Doc-comment on the type now explains the invariant. - normalize::to_posix changes signature to `Result<WorkspacePath, CoreError>` and now flows through `WorkspacePath::new`, so a path with `#` is rejected at construction rather than at every reader. Only one caller existed outside tests, so the signature change is contained. - Citation::parse uses `WorkspacePath::new` on the path side. With multiple `#` separators in the input, `rsplit_once` would otherwise leave a `#` on the path; the new constructor closes the hole. - Citation::parse + parse_hms_ms: every `bail!` / `anyhow!` site now quotes the offending substring (and the full input where useful) so error messages identify what went wrong without re-deriving from context. - New tests: `to_posix_rejects_hash_in_path`, `parse_path_with_hash_rejected_at_to_posix_layer`. Existing `to_posix(...).0` call sites updated for the Result signature. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -5,6 +5,7 @@ use std::path::PathBuf;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::errors::CoreError;
|
||||
use crate::ids::AssetId;
|
||||
use crate::media::{Checksum, MediaType};
|
||||
|
||||
@@ -17,10 +18,28 @@ pub enum SourceUri {
|
||||
}
|
||||
|
||||
/// POSIX-relative path inside the workspace root (§6.6, §4.1). Always
|
||||
/// produced via `crate::normalize::to_posix`.
|
||||
/// produced via `crate::normalize::to_posix` (filesystem side) or
|
||||
/// `WorkspacePath::new` (parse side). The inner string is forbidden from
|
||||
/// containing the `#` character: a workspace path must never collide with
|
||||
/// the W3C-Media-Fragments separator that `Citation` URIs rely on, so the
|
||||
/// invariant is enforced at construction rather than at every call site.
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
||||
pub struct WorkspacePath(pub String);
|
||||
|
||||
impl WorkspacePath {
|
||||
/// Construct a `WorkspacePath` from a string, rejecting any input that
|
||||
/// contains `#`. Use this on the parser side (e.g. `Citation::parse`)
|
||||
/// where the input does not flow through `to_posix`.
|
||||
pub fn new(s: String) -> Result<Self, CoreError> {
|
||||
if s.contains('#') {
|
||||
return Err(CoreError::Malformed(format!(
|
||||
"workspace path must not contain '#': {s:?}"
|
||||
)));
|
||||
}
|
||||
Ok(Self(s))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase", tag = "kind")]
|
||||
pub enum AssetStorage {
|
||||
|
||||
@@ -93,13 +93,20 @@ impl Citation {
|
||||
Some(t) => t,
|
||||
None => bail!("citation has no '#' fragment: {s:?}"),
|
||||
};
|
||||
let path = WorkspacePath(path_str.to_owned());
|
||||
// `WorkspacePath::new` rejects any remaining `#` on the path side
|
||||
// (e.g. the input had multiple `#` separators), closing the
|
||||
// hash-in-path concern at construction rather than at every reader.
|
||||
let path = WorkspacePath::new(path_str.to_owned())?;
|
||||
|
||||
if let Some(rest) = frag.strip_prefix("L") {
|
||||
// line range: `L<a>` or `L<a>-L<b>`
|
||||
if let Some((a, b)) = rest.split_once("-L") {
|
||||
let start: u32 = a.parse().map_err(|_| anyhow::anyhow!("bad line start"))?;
|
||||
let end: u32 = b.parse().map_err(|_| anyhow::anyhow!("bad line end"))?;
|
||||
let start: u32 = a
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad line start in {a:?} (input {s:?})"))?;
|
||||
let end: u32 = b
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad line end in {b:?} (input {s:?})"))?;
|
||||
return Ok(Citation::Line {
|
||||
path,
|
||||
start,
|
||||
@@ -107,7 +114,9 @@ impl Citation {
|
||||
section: None,
|
||||
});
|
||||
}
|
||||
let n: u32 = rest.parse().map_err(|_| anyhow::anyhow!("bad line number"))?;
|
||||
let n: u32 = rest
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad line number in {rest:?} (input {s:?})"))?;
|
||||
return Ok(Citation::Line {
|
||||
path,
|
||||
start: n,
|
||||
@@ -116,7 +125,9 @@ impl Citation {
|
||||
});
|
||||
}
|
||||
if let Some(rest) = frag.strip_prefix("p=") {
|
||||
let page: u32 = rest.parse().map_err(|_| anyhow::anyhow!("bad page number"))?;
|
||||
let page: u32 = rest
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad page number in {rest:?} (input {s:?})"))?;
|
||||
return Ok(Citation::Page {
|
||||
path,
|
||||
page,
|
||||
@@ -126,12 +137,20 @@ impl Citation {
|
||||
if let Some(rest) = frag.strip_prefix("xywh=") {
|
||||
let parts: Vec<&str> = rest.split(',').collect();
|
||||
if parts.len() != 4 {
|
||||
bail!("xywh= expects 4 comma-separated values: {rest:?}");
|
||||
bail!("xywh= expects 4 comma-separated values, got {rest:?} (input {s:?})");
|
||||
}
|
||||
let x: u32 = parts[0].parse().map_err(|_| anyhow::anyhow!("bad xywh.x"))?;
|
||||
let y: u32 = parts[1].parse().map_err(|_| anyhow::anyhow!("bad xywh.y"))?;
|
||||
let w: u32 = parts[2].parse().map_err(|_| anyhow::anyhow!("bad xywh.w"))?;
|
||||
let h: u32 = parts[3].parse().map_err(|_| anyhow::anyhow!("bad xywh.h"))?;
|
||||
let x: u32 = parts[0]
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad xywh.x in {:?} (input {s:?})", parts[0]))?;
|
||||
let y: u32 = parts[1]
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad xywh.y in {:?} (input {s:?})", parts[1]))?;
|
||||
let w: u32 = parts[2]
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad xywh.w in {:?} (input {s:?})", parts[2]))?;
|
||||
let h: u32 = parts[3]
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad xywh.h in {:?} (input {s:?})", parts[3]))?;
|
||||
return Ok(Citation::Region { path, x, y, w, h });
|
||||
}
|
||||
if frag == "caption" {
|
||||
@@ -145,13 +164,13 @@ impl Citation {
|
||||
let (range, speaker) = match rest.split_once('&') {
|
||||
Some((r, kv)) => match kv.strip_prefix("speaker=") {
|
||||
Some(sp) => (r, Some(sp.to_owned())),
|
||||
None => bail!("unknown time-fragment param: {kv:?}"),
|
||||
None => bail!("unknown time-fragment param {kv:?} (input {s:?})"),
|
||||
},
|
||||
None => (rest, None),
|
||||
};
|
||||
let (s_str, e_str) = match range.split_once(',') {
|
||||
Some(t) => t,
|
||||
None => bail!("time fragment expects '<start>,<end>': {range:?}"),
|
||||
None => bail!("time fragment expects '<start>,<end>', got {range:?} (input {s:?})"),
|
||||
};
|
||||
let start_ms = parse_hms_ms(s_str)?;
|
||||
let end_ms = parse_hms_ms(e_str)?;
|
||||
@@ -162,7 +181,7 @@ impl Citation {
|
||||
speaker,
|
||||
});
|
||||
}
|
||||
bail!("unrecognised citation fragment: {frag:?}")
|
||||
bail!("unrecognised citation fragment {frag:?} (input {s:?})")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -181,22 +200,32 @@ fn parse_hms_ms(s: &str) -> Result<u64> {
|
||||
if parts.len() != 3 {
|
||||
bail!("time component expects hh:mm:ss.mmm, got {s:?}");
|
||||
}
|
||||
let h: u64 = parts[0].parse().map_err(|_| anyhow::anyhow!("bad hours"))?;
|
||||
let m: u64 = parts[1].parse().map_err(|_| anyhow::anyhow!("bad minutes"))?;
|
||||
let h: u64 = parts[0]
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad hours in {:?} (input {s:?})", parts[0]))?;
|
||||
let m: u64 = parts[1]
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad minutes in {:?} (input {s:?})", parts[1]))?;
|
||||
let (sec, ms) = match parts[2].split_once('.') {
|
||||
Some((s_part, ms_part)) => {
|
||||
let sec: u64 = s_part.parse().map_err(|_| anyhow::anyhow!("bad seconds"))?;
|
||||
let sec: u64 = s_part
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad seconds in {s_part:?} (input {s:?})"))?;
|
||||
// Pad/truncate to exactly 3 digits.
|
||||
let mut ms_str = ms_part.to_owned();
|
||||
while ms_str.len() < 3 {
|
||||
ms_str.push('0');
|
||||
}
|
||||
ms_str.truncate(3);
|
||||
let ms: u64 = ms_str.parse().map_err(|_| anyhow::anyhow!("bad milliseconds"))?;
|
||||
let ms: u64 = ms_str
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad milliseconds in {ms_part:?} (input {s:?})"))?;
|
||||
(sec, ms)
|
||||
}
|
||||
None => {
|
||||
let sec: u64 = parts[2].parse().map_err(|_| anyhow::anyhow!("bad seconds"))?;
|
||||
let sec: u64 = parts[2]
|
||||
.parse()
|
||||
.map_err(|_| anyhow::anyhow!("bad seconds in {:?} (input {s:?})", parts[2]))?;
|
||||
(sec, 0)
|
||||
}
|
||||
};
|
||||
@@ -208,7 +237,7 @@ mod tests {
|
||||
use super::*;
|
||||
|
||||
fn p(s: &str) -> WorkspacePath {
|
||||
WorkspacePath(s.to_owned())
|
||||
WorkspacePath::new(s.to_owned()).expect("test paths must not contain '#'")
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -313,4 +342,16 @@ mod tests {
|
||||
fn parse_rejects_unknown_fragment() {
|
||||
assert!(Citation::parse("a.md#mystery=1").is_err());
|
||||
}
|
||||
|
||||
/// `rsplit_once('#')` would otherwise leave a `#` on the path side when
|
||||
/// the input contains multiple `#` separators (e.g. someone embeds a
|
||||
/// fake fragment in the path). The `WorkspacePath::new` constructor
|
||||
/// closes that hole at construction time.
|
||||
#[test]
|
||||
fn parse_path_with_hash_rejected_at_to_posix_layer() {
|
||||
// `notes/x#evil.md#L7` — rsplit_once strips `#L7`, leaving
|
||||
// `notes/x#evil.md` on the path side. WorkspacePath::new must reject.
|
||||
let r = Citation::parse("notes/x#evil.md#L7");
|
||||
assert!(r.is_err(), "path with embedded '#' must be rejected");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -309,7 +309,7 @@ mod tests {
|
||||
#[test]
|
||||
fn id_for_doc_pinned() {
|
||||
let asset = AssetId("6cb0ef0eb89c63b8b6e76ec53dca6e7d".to_string());
|
||||
let path = WorkspacePath("notes/test.md".to_string());
|
||||
let path = WorkspacePath::new("notes/test.md".to_string()).unwrap();
|
||||
let pv = ParserVersion("pulldown-cmark-0.x".to_string());
|
||||
let id = id_for_doc(&path, &asset, &pv);
|
||||
assert_eq!(id.0, "8547fe58cb42d593fd761d77242401db");
|
||||
|
||||
@@ -5,6 +5,7 @@ use std::path::{Component, Path};
|
||||
use unicode_normalization::UnicodeNormalization;
|
||||
|
||||
use crate::asset::WorkspacePath;
|
||||
use crate::errors::CoreError;
|
||||
|
||||
/// NFC-normalize a UTF-8 string (§4.1).
|
||||
pub fn nfc(input: &str) -> String {
|
||||
@@ -16,7 +17,11 @@ pub fn nfc(input: &str) -> String {
|
||||
/// - strip a leading `./`
|
||||
/// - collapse repeated slashes
|
||||
/// - NFC-normalize
|
||||
pub fn to_posix(path: &Path) -> WorkspacePath {
|
||||
///
|
||||
/// Returns `Err(CoreError::Malformed(..))` if the resulting POSIX form
|
||||
/// contains `#`, since `WorkspacePath` is forbidden from colliding with
|
||||
/// the W3C-Media-Fragments separator that `Citation` URIs depend on.
|
||||
pub fn to_posix(path: &Path) -> Result<WorkspacePath, CoreError> {
|
||||
let mut out = String::new();
|
||||
let mut first = true;
|
||||
for comp in path.components() {
|
||||
@@ -52,7 +57,7 @@ pub fn to_posix(path: &Path) -> WorkspacePath {
|
||||
if out.is_empty() {
|
||||
out.push('.');
|
||||
}
|
||||
WorkspacePath(nfc(&out))
|
||||
WorkspacePath::new(nfc(&out))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -64,7 +69,7 @@ mod tests {
|
||||
let p = Path::new("./a//b.md");
|
||||
// `Path::components` already collapses `//` on POSIX; the test
|
||||
// doc-fixed example asserts the final string is `a/b.md`.
|
||||
assert_eq!(to_posix(p).0, "a/b.md");
|
||||
assert_eq!(to_posix(p).unwrap().0, "a/b.md");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -74,8 +79,11 @@ mod tests {
|
||||
// collapse, so the WorkspacePath comes out NFC regardless of input.
|
||||
let nfd = "\u{1100}\u{1161}.md";
|
||||
let nfc_str = "\u{AC00}.md";
|
||||
assert_eq!(to_posix(Path::new(nfd)).0, to_posix(Path::new(nfc_str)).0);
|
||||
assert_eq!(to_posix(Path::new(nfd)).0, "\u{AC00}.md");
|
||||
assert_eq!(
|
||||
to_posix(Path::new(nfd)).unwrap().0,
|
||||
to_posix(Path::new(nfc_str)).unwrap().0
|
||||
);
|
||||
assert_eq!(to_posix(Path::new(nfd)).unwrap().0, "\u{AC00}.md");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -83,4 +91,14 @@ mod tests {
|
||||
let s = "\u{AC00}";
|
||||
assert_eq!(nfc(s), s);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn to_posix_rejects_hash_in_path() {
|
||||
// `#` collides with the W3C-Media-Fragments separator used by
|
||||
// `Citation`; the WorkspacePath invariant rejects it at construction.
|
||||
let p = Path::new("notes/has#hash.md");
|
||||
let err = to_posix(p).expect_err("# in path must be rejected");
|
||||
let msg = format!("{err}");
|
||||
assert!(msg.contains('#'), "error message should mention '#': {msg}");
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user