diff --git a/Cargo.lock b/Cargo.lock index ec3d4e7..0ce2a30 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -577,6 +577,21 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "kb-normalize" +version = "0.1.0" +dependencies = [ + "anyhow", + "kb-core", + "kb-parse-md", + "kb-parse-types", + "serde", + "serde_json", + "time", + "tracing", + "unicode-normalization", +] + [[package]] name = "kb-parse-md" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 23fb992..b5d4b57 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ members = [ "crates/kb-config", "crates/kb-source-fs", "crates/kb-parse-md", + "crates/kb-normalize", "crates/kb-app", "crates/kb-cli", ] diff --git a/crates/kb-core/src/document.rs b/crates/kb-core/src/document.rs index e0bb295..477656b 100644 --- a/crates/kb-core/src/document.rs +++ b/crates/kb-core/src/document.rs @@ -100,11 +100,11 @@ pub struct AudioRefBlock { #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "lowercase", tag = "kind")] pub enum Inline { - Text(String), - Code(String), + Text { text: String }, + Code { code: String }, Link { text: String, href: String }, - Strong(Vec), - Emph(Vec), + Strong { children: Vec }, + Emph { children: Vec }, } #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] @@ -175,3 +175,37 @@ pub struct TranscriptSegment { pub speaker: Option, pub confidence: Option, } + +#[cfg(test)] +mod tests { + use super::*; + + /// Each `Inline` variant must serialize and deserialize cleanly under + /// the internally-tagged representation. Newtype-with-primitive variants + /// (`Text(String)`, `Code(String)`, `Strong(Vec<…>)`, `Emph(Vec<…>)`) + /// previously failed at serde runtime because `tag = "kind"` cannot + /// describe a newtype carrying a non-struct value. The struct-variant + /// shape used here is the §9 schema migration. 
+ #[test] + fn inline_serde_round_trip() { + let cases = vec![ + Inline::Text { text: "hi".into() }, + Inline::Code { code: "x".into() }, + Inline::Link { + text: "t".into(), + href: "h".into(), + }, + Inline::Strong { + children: vec![Inline::Text { text: "bold".into() }], + }, + Inline::Emph { + children: vec![Inline::Text { text: "em".into() }], + }, + ]; + for c in cases { + let s = serde_json::to_string(&c).expect("serialize"); + let back: Inline = serde_json::from_str(&s).expect("deserialize"); + assert_eq!(c, back); + } + } +} diff --git a/crates/kb-normalize/Cargo.toml b/crates/kb-normalize/Cargo.toml new file mode 100644 index 0000000..6d61e35 --- /dev/null +++ b/crates/kb-normalize/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "kb-normalize" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "Lift parser output (kb-parse-types) into kb-core::CanonicalDocument with deterministic IDs (§3.4, §4.2, §4.3)" + +[dependencies] +kb-core = { path = "../kb-core" } +kb-parse-types = { path = "../kb-parse-types" } +serde = { workspace = true } +serde_json = { workspace = true } +unicode-normalization = "0.1" +time = { workspace = true } +anyhow = { workspace = true } +tracing = { workspace = true } + +[dev-dependencies] +# kb-parse-md is permitted as a *dev*-dependency only — used by the +# integration snapshot test to drive a fixture through the real parser. +# Forbidden as a regular dep per design §8 (kb-normalize must not depend +# on any specific parser); `cargo tree -p kb-normalize --depth 1 -e normal` +# (`-e normal` is required: the default edge set includes dev-deps) confirms this. +kb-parse-md = { path = "../kb-parse-md" } +serde_json = { workspace = true } diff --git a/crates/kb-normalize/src/lib.rs b/crates/kb-normalize/src/lib.rs new file mode 100644 index 0000000..9e08c76 --- /dev/null +++ b/crates/kb-normalize/src/lib.rs @@ -0,0 +1,843 @@ +//! 
`kb-normalize` — lift parser output (`kb-parse-types`) into a +//! [`kb_core::CanonicalDocument`] with deterministic IDs. +//! +//! Per design §3.4 (CanonicalDocument / Block), §4.2 (ID recipe), §4.3 +//! (ordinal rule), §3.6 (Provenance), §8 (module boundaries). +//! +//! Public surface: +//! +//! * [`build_canonical_document`] — assemble a `CanonicalDocument` from +//! `(RawAsset, Metadata, Vec<ParsedBlock>, ParserVersion, Vec<Warning>)`. +//! * [`id_for_doc`], [`id_for_block`] — re-exports of the canonical +//! ID-recipe functions in `kb-core::ids` (§4.2). `kb-core` is the only +//! implementation; `kb-normalize` is the canonical *entry point* per +//! design §8. +//! +//! This crate must NOT depend on any parser implementation crate +//! (`kb-parse-md`, `kb-parse-pdf`, …). All parser output flows in via +//! the shared `kb-parse-types` crate. + +use std::collections::HashMap; + +use anyhow::Result; +use kb_core::{ + Block, BlockId, CanonicalDocument, CodeBlock, CommonBlock, DocumentId, HeadingBlock, + ImageRefBlock, Inline, Lang, ListBlock, Metadata, ParserVersion, Provenance, ProvenanceEvent, + ProvenanceKind, RawAsset, TableBlock, TextBlock, +}; +use kb_parse_types::{ParsedBlock, ParsedPayload, Warning, WarningKind}; +use time::OffsetDateTime; +use unicode_normalization::UnicodeNormalization; + +pub use kb_core::{id_for_block, id_for_doc}; + +/// Build a [`CanonicalDocument`] from the raw asset, frontmatter +/// metadata, parser blocks, parser version, and any warnings. +/// +/// Behavior contract (per design §3.4 / §4.2 / §4.3 / §3.6): +/// +/// * `doc_id = id_for_doc(workspace_path, asset_id, parser_version)` — +/// `workspace_path` is consumed verbatim from `asset` (already NFC + +/// POSIX per `kb_core::normalize::to_posix`). +/// * `block_id = id_for_block(doc_id, kind, heading_path, ordinal, +/// source_span)` — `ordinal` is **0-based, scoped to (heading_path, +/// block_kind), in document order** per §4.3. 
+/// * `title` and `lang` are lifted from `metadata.user["title"]` / +/// `metadata.user["lang"]` (where P1-2 stashes them) into the dedicated +/// `CanonicalDocument` fields, and removed from the user map to avoid +/// duplication. Both keys are lifted only if present and stringy; +/// non-stringy values (e.g. `Number`, `Array`) and missing keys +/// silently default to empty title / empty `Lang`. P1-2's frontmatter +/// parser only writes these keys when the source value parses as a +/// string, so the non-stringy branches are defense-in-depth. +/// * `provenance` is seeded with `Discovered` (from `asset.discovered_at`), +/// `Parsed`, `Normalized` events, and one `Warning` event per upstream +/// warning. The two normalize-side events share one `now_utc()` reading +/// so the timestamp jitter inside a single call is bounded — event +/// ordering is preserved by `Vec` position. +/// * `schema_version` and `doc_version` are pinned to `1` (initial). +pub fn build_canonical_document( + asset: &RawAsset, + metadata: Metadata, + blocks: Vec<ParsedBlock>, + parser_version: &ParserVersion, + warnings: Vec<Warning>, +) -> Result<CanonicalDocument> { + let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, parser_version); + + // Lift title / lang from `metadata.user` (P1-2 stashed them there + // because `Metadata` does not carry them directly). Strip after + // lifting so the wire form does not duplicate the data. + let mut metadata = metadata; + let title = metadata + .user + .remove("title") + .and_then(|v| v.as_str().map(String::from)) + .unwrap_or_default(); + let lang = metadata + .user + .remove("lang") + .and_then(|v| v.as_str().map(|s| Lang(s.to_string()))) + .unwrap_or_else(|| Lang(String::new())); + + // §4.3 ordinal rule — per (heading_path, block_kind), 0-based, + // document order. A separate counter is kept for each grouping key. + let mut counters: HashMap<(Vec<String>, &'static str), u32> = HashMap::new(); + // Some lift paths (e.g. 
AudioRef pre-P8) drop the block entirely and + // synthesize a Warning so the wire form never carries an invalid + // `AssetId`. These warnings originate at the lift stage and are + // attributed to `kb-normalize` (not to whatever upstream emitter the + // bare `WarningKind` would resolve to via `warning_agent`). They are + // tracked separately so the agent string is correct in Provenance. + let mut lift_warnings: Vec<Warning> = Vec::new(); + let lifted_blocks: Vec<Block> = blocks + .into_iter() + .filter_map(|pb| lift_block(&doc_id, pb, &mut counters, &mut lift_warnings)) + .collect(); + + tracing::debug!( + target: "kb-normalize", + "built canonical document doc_id={} blocks={}", + doc_id.0, + lifted_blocks.len() + ); + + // Provenance — share `now` between the parse + normalize stages so + // the per-call timestamp jitter is bounded. + let now = OffsetDateTime::now_utc(); + let mut events: Vec<ProvenanceEvent> = + Vec::with_capacity(3 + warnings.len() + lift_warnings.len()); + events.push(ProvenanceEvent { + at: asset.discovered_at, + agent: "kb-source-fs".to_string(), + kind: ProvenanceKind::Discovered, + note: None, + }); + events.push(ProvenanceEvent { + at: now, + agent: "kb-parse-md".to_string(), + kind: ProvenanceKind::Parsed, + note: Some(format!("parser_version={}", parser_version.0)), + }); + events.push(ProvenanceEvent { + at: now, + agent: "kb-normalize".to_string(), + kind: ProvenanceKind::Normalized, + note: None, + }); + // {:?} on WarningKind renders camel-case variant name; intentional + // for human-readable Provenance trace. + for w in warnings { + events.push(ProvenanceEvent { + at: now, + agent: warning_agent(&w.kind).to_string(), + kind: ProvenanceKind::Warning, + note: Some(format!("{:?}: {}", w.kind, w.note)), + }); + } + // Lift-stage warnings (currently only AudioRef-deferred drops) are + // unconditionally attributed to `kb-normalize`. 
+ for w in lift_warnings { + events.push(ProvenanceEvent { + at: now, + agent: "kb-normalize".to_string(), + kind: ProvenanceKind::Warning, + note: Some(format!("{:?}: {}", w.kind, w.note)), + }); + } + let provenance = Provenance { events }; + + Ok(CanonicalDocument { + doc_id, + source_asset_id: asset.asset_id.clone(), + workspace_path: asset.workspace_path.clone(), + title, + lang, + blocks: lifted_blocks, + metadata, + provenance, + parser_version: parser_version.clone(), + schema_version: 1, + doc_version: 1, + }) +} + +/// Resolve a `WarningKind` to the upstream agent that emitted it. Used +/// to fill `ProvenanceEvent::agent` for the warning's event entry. +/// +/// `ExtractFailed` is emitted today by `kb-parse-md`'s panic-recovery +/// guard around `parse_blocks` — see `crates/kb-parse-md/src/blocks.rs`. +/// If a future stage (e.g. `kb-normalize` itself, an extractor, …) starts +/// emitting `ExtractFailed`, this mapping needs to grow context (perhaps +/// a separate `WarningSource` field on `Warning`) so attribution stays +/// honest. For now, all `ExtractFailed` warnings observed by +/// `build_canonical_document` originated in the parser. +fn warning_agent(kind: &WarningKind) -> &'static str { + match kind { + WarningKind::MalformedFrontmatter | WarningKind::EncodingFallback => "kb-parse-md", + WarningKind::MalformedTable => "kb-parse-md", + WarningKind::ExtractFailed => "kb-parse-md", + } +} + +/// Map a `ParsedPayload` variant to the lowercase, no-spaces string used +/// as `block_kind` in the §4.2 ID tuple. +fn payload_kind(payload: &ParsedPayload) -> &'static str { + match payload { + ParsedPayload::Heading { .. } => "heading", + ParsedPayload::Paragraph { .. } => "paragraph", + ParsedPayload::List { .. } => "list", + ParsedPayload::Code { .. } => "code", + ParsedPayload::Table { .. } => "table", + ParsedPayload::Quote { .. } => "quote", + ParsedPayload::ImageRef { .. } => "imageref", + ParsedPayload::AudioRef { .. 
} => "audioref", + } +} + +fn next_ordinal( + counters: &mut HashMap<(Vec<String>, &'static str), u32>, + heading_path: &[String], + kind: &'static str, +) -> u32 { + let key = (heading_path.to_vec(), kind); + let entry = counters.entry(key).or_insert(0); + let ordinal = *entry; + *entry += 1; + ordinal +} + +fn lift_block( + doc_id: &DocumentId, + pb: ParsedBlock, + counters: &mut HashMap<(Vec<String>, &'static str), u32>, + warnings: &mut Vec<Warning>, +) -> Option<Block> { + let kind = payload_kind(&pb.payload); + // Task spec line 73: "All input strings normalized to NFC before + // hashing." `pulldown-cmark` does not NFC heading text, and + // `serde_json_canonicalizer` v0.3 does not normalize strings either, + // so we must NFC-normalize `heading_path` here before it feeds both + // the §4.2 ID recipe AND the on-disk `CommonBlock.heading_path` (so + // wire form matches ID input). Without this, NFD `\u{1100}\u{1161}` + // and NFC `\u{AC00}` (both render as 가) would produce different + // `block_id`s for what is logically the same heading. + let heading_path_nfc: Vec<String> = + pb.heading_path.iter().map(|s| s.nfc().collect()).collect(); + let ordinal = next_ordinal(counters, &heading_path_nfc, kind); + let block_id: BlockId = + id_for_block(doc_id, kind, &heading_path_nfc, ordinal, &pb.source_span); + let common = CommonBlock { + block_id, + heading_path: heading_path_nfc, + source_span: pb.source_span, + }; + let block = match pb.payload { + ParsedPayload::Heading { level, text } => Block::Heading(HeadingBlock { + common, + level, + text, + }), + ParsedPayload::Paragraph { text, inlines } => Block::Paragraph(TextBlock { + common, + text, + inlines, + }), + ParsedPayload::List { ordered, items } => Block::List(ListBlock { + common: common.clone(), + ordered, + items: items + .into_iter() + .map(|item_inlines| TextBlock { + // All list items currently inherit the parent's + // CommonBlock (incl. block_id). Per-item IDs would + // require a §4.2 recipe extension. 
Spec (§3.4) + // defines `ListBlock.items: Vec<TextBlock>` and + // does not allocate per-item BlockIds. Re-using the + // parent's common keeps the wire form deterministic + // while letting the inline tree carry the item + // content. + common: common.clone(), + text: flatten_inlines(&item_inlines), + inlines: item_inlines, + }) + .collect(), + }), + ParsedPayload::Code { lang, code } => Block::Code(CodeBlock { common, lang, code }), + ParsedPayload::Table { headers, rows } => Block::Table(TableBlock { + common, + headers, + rows, + }), + ParsedPayload::Quote { text, inlines } => Block::Quote(TextBlock { + common, + text, + inlines, + }), + ParsedPayload::ImageRef { src, alt } => Block::ImageRef(ImageRefBlock { + common, + asset_id: None, + src, + alt, + ocr: None, + caption: None, + }), + // TODO(P8): audio extractor will resolve workspace assets and + // produce real AssetIds. This skip-and-warn shim is a + // placeholder. `AssetId::from_str` requires a 32-hex string, so + // synthesizing `AssetId(String::new())` would break the + // invariant — instead we drop the block and surface a Warning + // (attributed to `kb-normalize` per §3.6 since this is the + // lift-stage decision). + ParsedPayload::AudioRef { src } => { + warnings.push(Warning { + kind: WarningKind::ExtractFailed, + note: format!( + "audio-ref AssetId resolution deferred to P8 — block dropped (src={src})" + ), + }); + return None; + } + }; + Some(block) +} + +/// Flatten a `Vec<Inline>` into a plain text string. Used by list-item +/// `TextBlock.text` since `ParsedPayload::List` only carries inline trees +/// per item. +fn flatten_inlines(inlines: &[Inline]) -> String { + let mut out = String::new(); + for i in inlines { + flatten_inline(i, &mut out); + } + out +} + +fn flatten_inline(i: &Inline, out: &mut String) { + match i { + Inline::Text { text } => out.push_str(text), + Inline::Code { code } => out.push_str(code), + Inline::Link { text, .. 
} => out.push_str(text), + Inline::Strong { children } | Inline::Emph { children } => { + for c in children { + flatten_inline(c, out); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use kb_core::{ + AssetId, AssetStorage, Checksum, MediaType, SourceSpan, SourceType, SourceUri, + TrustLevel, WorkspacePath, normalize::to_posix, + }; + use serde_json::Value; + use std::path::{Path, PathBuf}; + use time::OffsetDateTime; + + fn fixture_asset() -> RawAsset { + let workspace_path = WorkspacePath::new("notes/example.md".into()).unwrap(); + RawAsset { + asset_id: AssetId("a".repeat(32)), + source_uri: SourceUri::File(PathBuf::from("/tmp/example.md")), + workspace_path, + media_type: MediaType::Markdown, + byte_len: 0, + checksum: Checksum("0".repeat(64)), + // Pin a fixed timestamp so determinism tests can compare + // outputs across runs without timestamp jitter outside the + // fields we explicitly strip. + discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + stored: AssetStorage::Reference { + path: PathBuf::from("/tmp/example.md"), + sha: Checksum("0".repeat(64)), + }, + } + } + + fn fixture_metadata() -> Metadata { + let mut user = serde_json::Map::new(); + user.insert("title".into(), Value::String("Example".into())); + user.insert("lang".into(), Value::String("en".into())); + user.insert("custom".into(), Value::Bool(true)); + Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Markdown, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user, + } + } + + fn parser_version() -> ParserVersion { + ParserVersion("kb-normalize-test-0".into()) + } + + /// Fixed 5-block input used by both the ordinal-scoping pinning test + /// and the determinism stress test (so the latter exercises the + /// `lift_block` path, not just the empty-blocks path). 
+ fn fixture_blocks_five() -> Vec { + let h1_a = vec!["A".to_string()]; + let h1_b = vec!["B".to_string()]; + vec![ + ParsedBlock { + kind: kb_parse_types::ParsedBlockKind::Paragraph, + heading_path: h1_a.clone(), + source_span: SourceSpan::Line { start: 1, end: 1 }, + payload: ParsedPayload::Paragraph { + text: "p1".into(), + inlines: vec![], + }, + }, + ParsedBlock { + kind: kb_parse_types::ParsedBlockKind::Paragraph, + heading_path: h1_a.clone(), + source_span: SourceSpan::Line { start: 2, end: 2 }, + payload: ParsedPayload::Paragraph { + text: "p2".into(), + inlines: vec![], + }, + }, + ParsedBlock { + kind: kb_parse_types::ParsedBlockKind::Paragraph, + heading_path: h1_a.clone(), + source_span: SourceSpan::Line { start: 3, end: 3 }, + payload: ParsedPayload::Paragraph { + text: "p3".into(), + inlines: vec![], + }, + }, + ParsedBlock { + kind: kb_parse_types::ParsedBlockKind::Code, + heading_path: h1_a, + source_span: SourceSpan::Line { start: 4, end: 5 }, + payload: ParsedPayload::Code { + lang: None, + code: "x".into(), + }, + }, + ParsedBlock { + kind: kb_parse_types::ParsedBlockKind::Paragraph, + heading_path: h1_b, + source_span: SourceSpan::Line { start: 6, end: 6 }, + payload: ParsedPayload::Paragraph { + text: "q1".into(), + inlines: vec![], + }, + }, + ] + } + + /// `id_for_doc` is deterministic across 1000 invocations on the same + /// input — a regression in canonical JSON or BLAKE3 would surface + /// here immediately. + #[test] + fn id_for_doc_deterministic_1000() { + let path = WorkspacePath::new("a/b.md".into()).unwrap(); + let asset = AssetId("0123456789abcdef0123456789abcdef".into()); + let pv = ParserVersion("v1".into()); + let first = id_for_doc(&path, &asset, &pv); + for _ in 0..1000 { + assert_eq!(id_for_doc(&path, &asset, &pv), first); + } + } + + /// NFC vs NFD inputs for the same Korean glyph must produce the + /// same `doc_id` because `to_posix` runs NFC normalization. 
+ #[test] + fn nfc_nfd_korean_path_same_id() { + let nfd = to_posix(Path::new("\u{1100}\u{1161}.md")).unwrap(); + let nfc = to_posix(Path::new("\u{AC00}.md")).unwrap(); + let asset = AssetId("0123456789abcdef0123456789abcdef".into()); + let pv = parser_version(); + assert_eq!(id_for_doc(&nfd, &asset, &pv), id_for_doc(&nfc, &asset, &pv)); + } + + /// `./a/b.md` and `a/b.md` must collapse to the same POSIX form + /// before `id_for_doc`. + #[test] + fn posix_curdir_collapses_to_same_id() { + let a = to_posix(Path::new("./a/b.md")).unwrap(); + let b = to_posix(Path::new("a/b.md")).unwrap(); + let asset = AssetId("0123456789abcdef0123456789abcdef".into()); + let pv = parser_version(); + assert_eq!(id_for_doc(&a, &asset, &pv), id_for_doc(&b, &asset, &pv)); + } + + /// Ordinals are scoped to (heading_path, block_kind) per §4.3: + /// three paragraphs under H1 → 0/1/2; a code block under the same + /// H1 starts a fresh counter at 0; a paragraph under a different + /// H1 also starts a fresh counter at 0. + #[test] + fn block_ordinals_scoped_per_heading_and_kind() { + let h1_a = vec!["A".to_string()]; + let h1_b = vec!["B".to_string()]; + let blocks = fixture_blocks_five(); + let asset = fixture_asset(); + let metadata = fixture_metadata(); + let pv = parser_version(); + let doc = + build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap(); + + // Compute the expected IDs out-of-band so the test pins both + // the (heading_path, kind) ordinal grouping AND the value of + // each block_id under the recipe. 
+ let p1 = id_for_block( + &doc.doc_id, + "paragraph", + &h1_a, + 0, + &SourceSpan::Line { start: 1, end: 1 }, + ); + let p2 = id_for_block( + &doc.doc_id, + "paragraph", + &h1_a, + 1, + &SourceSpan::Line { start: 2, end: 2 }, + ); + let p3 = id_for_block( + &doc.doc_id, + "paragraph", + &h1_a, + 2, + &SourceSpan::Line { start: 3, end: 3 }, + ); + let c0 = id_for_block( + &doc.doc_id, + "code", + &h1_a, + 0, + &SourceSpan::Line { start: 4, end: 5 }, + ); + let q0 = id_for_block( + &doc.doc_id, + "paragraph", + &h1_b, + 0, + &SourceSpan::Line { start: 6, end: 6 }, + ); + + let ids: Vec<&BlockId> = doc + .blocks + .iter() + .map(|b| match b { + Block::Paragraph(t) | Block::Quote(t) => &t.common.block_id, + Block::Heading(h) => &h.common.block_id, + Block::List(l) => &l.common.block_id, + Block::Code(c) => &c.common.block_id, + Block::Table(t) => &t.common.block_id, + Block::ImageRef(i) => &i.common.block_id, + Block::AudioRef(a) => &a.common.block_id, + }) + .collect(); + assert_eq!(ids, vec![&p1, &p2, &p3, &c0, &q0]); + } + + /// Provenance events appear in the documented order: `Discovered` + /// (from the asset), `Parsed`, then `Normalized`. Warnings (none in + /// this test) would follow. 
+ #[test] + fn provenance_contains_stage_events_in_order() { + let asset = fixture_asset(); + let metadata = fixture_metadata(); + let pv = parser_version(); + let doc = + build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); + let kinds: Vec<_> = doc.provenance.events.iter().map(|e| e.kind).collect(); + assert_eq!( + kinds, + vec![ + ProvenanceKind::Discovered, + ProvenanceKind::Parsed, + ProvenanceKind::Normalized, + ] + ); + let events = &doc.provenance.events; + assert_eq!(events[0].at, asset.discovered_at); + assert_eq!(events[0].agent, "kb-source-fs"); + assert_eq!(events[1].agent, "kb-parse-md"); + assert_eq!(events[2].agent, "kb-normalize"); + // Pin the implementation invariant that Parsed and Normalized + // share the single `now_utc()` reading inside one call. + assert_eq!(events[1].at, events[2].at, "Parsed and Normalized share now_utc"); + } + + /// Warnings carried into `build_canonical_document` are emitted as + /// `ProvenanceKind::Warning` events with the upstream agent. + #[test] + fn provenance_includes_warnings() { + let asset = fixture_asset(); + let metadata = fixture_metadata(); + let pv = parser_version(); + let warnings = vec![Warning { + kind: WarningKind::MalformedFrontmatter, + note: "missing closing fence".into(), + }]; + let doc = + build_canonical_document(&asset, metadata, vec![], &pv, warnings).unwrap(); + assert_eq!(doc.provenance.events.len(), 4); + let last = doc.provenance.events.last().unwrap(); + assert_eq!(last.kind, ProvenanceKind::Warning); + assert_eq!(last.agent, "kb-parse-md"); + assert!(last.note.as_deref().unwrap().contains("missing closing fence")); + } + + /// `metadata.user["title"]` and `metadata.user["lang"]` are lifted + /// to the dedicated `CanonicalDocument` fields and stripped from + /// the user map (so the wire form does not duplicate the data). + /// Other user keys survive intact. 
+ #[test] + fn lifts_title_and_lang_from_user_map() { + let asset = fixture_asset(); + let metadata = fixture_metadata(); + let pv = parser_version(); + let doc = + build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); + assert_eq!(doc.title, "Example"); + assert_eq!(doc.lang, Lang("en".into())); + assert!(!doc.metadata.user.contains_key("title")); + assert!(!doc.metadata.user.contains_key("lang")); + assert!(doc.metadata.user.contains_key("custom")); + } + + /// Determinism property: 1000 iterations of `build_canonical_document` + /// over identical inputs produce byte-identical JSON, modulo the two + /// non-deterministic `now_utc()` timestamps for the Parsed/Normalized + /// events. We strip those timestamps before comparing. Must finish + /// within 1 second. + #[test] + fn determinism_1000_iterations_under_1s() { + let asset = fixture_asset(); + let metadata = fixture_metadata(); + let pv = parser_version(); + + // Helper: serialize and replace every now_utc-derived timestamp + // (Parsed, Normalized, and any Warning events) with a sentinel so + // the comparison only checks the deterministic fields. + fn strip_dynamic_at(doc: &CanonicalDocument) -> Value { + let mut v = serde_json::to_value(doc).unwrap(); + if let Some(events) = v + .get_mut("provenance") + .and_then(|p| p.get_mut("events")) + .and_then(|e| e.as_array_mut()) + { + for (i, ev) in events.iter_mut().enumerate() { + // index 0 is Discovered (deterministic — pinned in + // the fixture). Strip everything after. + if i > 0 + && let Some(obj) = ev.as_object_mut() + { + obj.insert("at".into(), Value::String("".into())); + } + } + } + v + } + + // Use the same 5-block fixture as the ordinal-scoping test so + // determinism is exercised on a non-empty `lift_block` path + // (block_id hashing, NFC normalization, ordinal counters), not + // just an empty Vec. 
+ let baseline = build_canonical_document( + &asset, + metadata.clone(), + fixture_blocks_five(), + &pv, + vec![], + ) + .unwrap(); + let baseline_json = serde_json::to_string(&strip_dynamic_at(&baseline)).unwrap(); + + let start = std::time::Instant::now(); + for _ in 0..1000 { + let next = build_canonical_document( + &asset, + metadata.clone(), + fixture_blocks_five(), + &pv, + vec![], + ) + .unwrap(); + let next_json = serde_json::to_string(&strip_dynamic_at(&next)).unwrap(); + assert_eq!(baseline_json, next_json); + } + assert!( + start.elapsed() < std::time::Duration::from_secs(1), + "1000 iterations took {:?}", + start.elapsed() + ); + } + + /// I1 regression — `WarningKind::ExtractFailed` is emitted by + /// `kb-parse-md` (panic-recovery in `blocks.rs`), so the resulting + /// `ProvenanceEvent::agent` must read `"kb-parse-md"`. A regression + /// to `"kb-normalize"` would mis-attribute parse panics and break + /// stage-filtered debugging. + #[test] + fn provenance_with_extract_failed_warning_attributes_to_kb_parse_md() { + let asset = fixture_asset(); + let metadata = fixture_metadata(); + let pv = parser_version(); + let warnings = vec![Warning { + kind: WarningKind::ExtractFailed, + note: "pulldown-cmark panicked; body discarded".into(), + }]; + let doc = + build_canonical_document(&asset, metadata, vec![], &pv, warnings).unwrap(); + let warning_event = doc + .provenance + .events + .iter() + .find(|e| e.kind == ProvenanceKind::Warning) + .expect("warning event present"); + assert_eq!(warning_event.agent, "kb-parse-md"); + assert!( + warning_event + .note + .as_deref() + .unwrap() + .contains("ExtractFailed") + ); + } + + /// I2 regression — `ParsedPayload::AudioRef` is dropped (not lifted + /// into a `Block::AudioRef` with a synthesized empty `AssetId`, + /// which would violate `AssetId::from_str`'s 32-hex invariant). A + /// `Warning` is surfaced in Provenance, attributed to + /// `"kb-normalize"` because the decision is made at the lift stage. 
+ #[test] + fn audio_ref_block_skipped_with_warning() { + let span = SourceSpan::Line { start: 1, end: 1 }; + let blocks = vec![ParsedBlock { + kind: kb_parse_types::ParsedBlockKind::AudioRef, + heading_path: vec![], + source_span: span, + payload: ParsedPayload::AudioRef { + src: "voice.m4a".into(), + }, + }]; + let asset = fixture_asset(); + let metadata = fixture_metadata(); + let pv = parser_version(); + let doc = + build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap(); + + // No AudioRef block in the canonical output. + assert!( + !doc.blocks + .iter() + .any(|b| matches!(b, Block::AudioRef(_))), + "AudioRef block should be skipped pre-P8" + ); + + // Exactly one Warning event mentioning the AudioRef src. + let warning_events: Vec<_> = doc + .provenance + .events + .iter() + .filter(|e| e.kind == ProvenanceKind::Warning) + .collect(); + assert_eq!(warning_events.len(), 1); + let w = warning_events[0]; + assert_eq!(w.agent, "kb-normalize"); + assert!(w.note.as_deref().unwrap().contains("voice.m4a")); + } + + /// I3 regression — heading-path strings are NFC-normalized before + /// feeding into `id_for_block`, so canonically-equivalent NFD and + /// NFC inputs produce the same `block_id`. Mirrors + /// `nfc_nfd_korean_path_same_id` for `doc_id`. 
+ #[test] + fn nfc_nfd_korean_heading_path_same_block_id() { + let span = SourceSpan::Line { start: 1, end: 1 }; + let nfd_heading = "\u{1100}\u{1161}".to_string(); // 가 (NFD) + let nfc_heading = "\u{AC00}".to_string(); // 가 (NFC) + let mk_block = |heading: String| ParsedBlock { + kind: kb_parse_types::ParsedBlockKind::Paragraph, + heading_path: vec![heading], + source_span: span.clone(), + payload: ParsedPayload::Paragraph { + text: "p".into(), + inlines: vec![], + }, + }; + let asset = fixture_asset(); + let pv = parser_version(); + let doc_nfd = build_canonical_document( + &asset, + fixture_metadata(), + vec![mk_block(nfd_heading)], + &pv, + vec![], + ) + .unwrap(); + let doc_nfc = build_canonical_document( + &asset, + fixture_metadata(), + vec![mk_block(nfc_heading)], + &pv, + vec![], + ) + .unwrap(); + let id_nfd = match &doc_nfd.blocks[0] { + Block::Paragraph(t) => &t.common.block_id, + _ => panic!("expected Paragraph"), + }; + let id_nfc = match &doc_nfc.blocks[0] { + Block::Paragraph(t) => &t.common.block_id, + _ => panic!("expected Paragraph"), + }; + assert_eq!(id_nfd, id_nfc, "NFD and NFC heading paths must hash equal"); + } + + /// M7 — `metadata.user["title"] = ""` is stringy and lifts to an + /// empty `CanonicalDocument.title`. This pins the policy: an + /// explicit empty string is *not* dropped, it's lifted as-is. + #[test] + fn title_empty_string_in_user_map_falls_back_to_default() { + let asset = fixture_asset(); + let mut metadata = fixture_metadata(); + metadata + .user + .insert("title".into(), Value::String(String::new())); + let pv = parser_version(); + let doc = + build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); + assert_eq!(doc.title, ""); + } + + /// M7 — `metadata.user["title"] = 42` is non-stringy and silently + /// drops; the fallback default (empty title) is used. 
+ #[test] + fn title_non_string_in_user_map_silently_drops() { + let asset = fixture_asset(); + let mut metadata = fixture_metadata(); + metadata + .user + .insert("title".into(), Value::Number(42.into())); + let pv = parser_version(); + let doc = + build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); + assert_eq!(doc.title, ""); + } + + /// M7 — non-stringy `lang` (e.g. an array) silently drops. This is + /// defensive: P1-2 frontmatter validates the shape upstream, but we + /// don't trust it. + #[test] + fn lang_invalid_shape_silently_drops() { + let asset = fixture_asset(); + let mut metadata = fixture_metadata(); + metadata.user.insert("lang".into(), Value::Array(vec![])); + let pv = parser_version(); + let doc = + build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); + assert_eq!(doc.lang, Lang(String::new())); + } +} diff --git a/crates/kb-normalize/tests/normalize_snapshot.rs b/crates/kb-normalize/tests/normalize_snapshot.rs new file mode 100644 index 0000000..ec10ddf --- /dev/null +++ b/crates/kb-normalize/tests/normalize_snapshot.rs @@ -0,0 +1,160 @@ +//! Snapshot test pinning the full `CanonicalDocument` JSON for the +//! `code-and-table.md` fixture. +//! +//! This is an integration test (it lives under `tests/`) and depends on +//! `kb-parse-md` only as a dev-dep so the production crate's regular +//! deps still satisfy the §8 boundary (`cargo tree -p kb-normalize +//! --depth 1 -e normal` does not list any parser implementation). +//! +//! Non-deterministic fields are stripped before comparison: +//! +//! * `provenance.events[*].at` — each invocation calls `now_utc()` for +//! the Parsed/Normalized/Warning events. The Discovered event uses +//! the asset's pinned `discovered_at`, so we keep that one and replace +//! only indices ≥ 1. 
+ +use std::path::PathBuf; + +use kb_core::{ + AssetId, AssetStorage, Checksum, MediaType, ParserVersion, RawAsset, SourceUri, + WorkspacePath, +}; +use kb_normalize::build_canonical_document; +use kb_parse_md::{BodyHints, parse_blocks, parse_frontmatter}; +use serde_json::Value; +use time::OffsetDateTime; + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("..") + .join("..") + .join("fixtures") + .join("markdown") +} + +fn fixed_asset(workspace_path: &str) -> RawAsset { + let wp = WorkspacePath::new(workspace_path.into()).unwrap(); + RawAsset { + asset_id: AssetId("a".repeat(32)), + source_uri: SourceUri::File(PathBuf::from("/tmp/code-and-table.md")), + workspace_path: wp, + media_type: MediaType::Markdown, + byte_len: 0, + checksum: Checksum("0".repeat(64)), + // Pin discovered_at so the Discovered provenance event is + // deterministic across runs. + discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + stored: AssetStorage::Reference { + path: PathBuf::from("/tmp/code-and-table.md"), + sha: Checksum("0".repeat(64)), + }, + } +} + +fn strip_dynamic(mut v: Value) -> Value { + if let Some(events) = v + .get_mut("provenance") + .and_then(|p| p.get_mut("events")) + .and_then(|e| e.as_array_mut()) + { + for (i, ev) in events.iter_mut().enumerate() { + if i > 0 + && let Some(obj) = ev.as_object_mut() + { + obj.insert("at".into(), Value::String("".into())); + } + } + } + v +} + +#[test] +fn code_and_table_canonical_snapshot() { + let dir = fixtures_dir(); + let bytes = std::fs::read(dir.join("code-and-table.md")).expect("fixture readable"); + + // Frontmatter parse — code-and-table.md has none, so we provide + // BodyHints with deterministic timestamps so the lifted Metadata + // is reproducible. The body offset is 1 (no frontmatter prefix). + // + // We pin `first_h1` so the BodyHints → user.title → CanonicalDocument.title + // lift chain is exercised end-to-end (see `assert_eq!` on + // `doc.title` below). 
Without this, `code-and-table.md`'s lack of + // frontmatter title would leave `title == ""` and the chain would + // be uncovered by the snapshot. + let asset = fixed_asset("notes/code-and-table.md"); + let hints = BodyHints { + first_h1: Some("Code And Table".into()), + fs_ctime: asset.discovered_at, + fs_mtime: asset.discovered_at, + fallback_lang: Some("en".into()), + }; + let (metadata, fm_span, _fm_warns) = + parse_frontmatter(&bytes, &hints).expect("frontmatter parses"); + + // No frontmatter → body starts at line 1. With frontmatter, line + // count of the prelude is computed from the byte span; this fixture + // has none, so the constant 1 is fine. + let body_offset_lines: u32 = match fm_span { + // Defensive: count the newlines in the prelude. The fixture + // hits the `None` branch so this code path is not exercised + // by the test, but kept for completeness. + Some(span) => bytes[..span.end].iter().filter(|b| **b == b'\n').count() as u32 + 1, + None => 1, + }; + let (blocks, parse_warns) = + parse_blocks(&bytes, body_offset_lines).expect("blocks parse"); + + let parser_version = ParserVersion("kb-normalize-snapshot-test-0".into()); + let mut metadata = metadata; + // The `created_at` / `updated_at` lifted from BodyHints are pinned + // to `discovered_at` above, so they are already deterministic. + metadata.aliases.sort(); + metadata.tags.sort(); + + let doc = build_canonical_document( + &asset, + metadata, + blocks, + &parser_version, + parse_warns, + ) + .expect("build_canonical_document"); + + // Assert the BodyHints → first_h1 → user.title → CanonicalDocument.title + // lift chain end-to-end. Pinned in the snapshot too, but the explicit + // assertion makes a future drift fail with a clearer message. 
+ assert_eq!(doc.title, "Code And Table"); + + let actual = strip_dynamic(serde_json::to_value(&doc).unwrap()); + + let baseline_path = dir.join("code-and-table.canonical.snapshot.json"); + let baseline_text = match std::fs::read_to_string(&baseline_path) { + Ok(s) => s, + Err(_) if std::env::var("UPDATE_SNAPSHOTS").is_ok() => { + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap(); + return; + } + Err(e) => panic!( + "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}", + baseline_path.display() + ), + }; + let expected: Value = + serde_json::from_str(&baseline_text).expect("baseline parses as json"); + + if actual != expected { + if std::env::var("UPDATE_SNAPSHOTS").is_ok() { + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap(); + eprintln!("updated baseline {}", baseline_path.display()); + return; + } + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + panic!( + "canonical snapshot drift\n--- expected ({}) ---\n{baseline_text}\n--- actual ---\n{pretty}\nIf intentional, re-run with UPDATE_SNAPSHOTS=1.", + baseline_path.display() + ); + } +} diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs index 93bdc17..e2d7daa 100644 --- a/crates/kb-parse-md/src/blocks.rs +++ b/crates/kb-parse-md/src/blocks.rs @@ -300,12 +300,12 @@ impl InlineBuf { fn push_text(&mut self, s: &str) { self.text.push_str(s); - self.push_inline(Inline::Text(s.to_string())); + self.push_inline(Inline::Text { text: s.to_string() }); } fn push_code(&mut self, s: &str) { self.text.push_str(s); - self.push_inline(Inline::Code(s.to_string())); + self.push_inline(Inline::Code { code: s.to_string() }); } fn open_strong(&mut self) { @@ -313,7 +313,7 @@ impl InlineBuf { } fn close_strong(&mut self) { if let Some(InlineFrame::Strong(kids)) = self.stack.pop() { - self.push_inline(Inline::Strong(kids)); + 
self.push_inline(Inline::Strong { children: kids }); } } @@ -322,7 +322,7 @@ impl InlineBuf { } fn close_emph(&mut self) { if let Some(InlineFrame::Emph(kids)) = self.stack.pop() { - self.push_inline(Inline::Emph(kids)); + self.push_inline(Inline::Emph { children: kids }); } } @@ -361,8 +361,8 @@ impl InlineBuf { // If formatting tags were unbalanced we close them defensively. while self.stack.len() > 1 { match self.stack.pop().unwrap() { - InlineFrame::Strong(kids) => self.push_inline(Inline::Strong(kids)), - InlineFrame::Emph(kids) => self.push_inline(Inline::Emph(kids)), + InlineFrame::Strong(kids) => self.push_inline(Inline::Strong { children: kids }), + InlineFrame::Emph(kids) => self.push_inline(Inline::Emph { children: kids }), InlineFrame::Link { href, text, kids } => { let flat = if !text.is_empty() { text @@ -475,10 +475,11 @@ fn flatten_inlines_to_text(inlines: &[Inline]) -> String { fn flatten_one(i: &Inline, out: &mut String) { match i { - Inline::Text(s) | Inline::Code(s) => out.push_str(s), + Inline::Text { text } => out.push_str(text), + Inline::Code { code } => out.push_str(code), Inline::Link { text, .. } => out.push_str(text), - Inline::Strong(v) | Inline::Emph(v) => { - for c in v { + Inline::Strong { children } | Inline::Emph { children } => { + for c in children { flatten_one(c, out); } } @@ -823,7 +824,7 @@ impl<'a> WalkState<'a> { text.push('\n'); } text.push_str(t); - inlines.push(Inline::Text(t.clone())); + inlines.push(Inline::Text { text: t.clone() }); } _ => {} } @@ -921,7 +922,7 @@ impl<'a> WalkState<'a> { source_span: self.span_for(&range), payload: ParsedPayload::Paragraph { text: raw.clone(), - inlines: vec![Inline::Text(raw)], + inlines: vec![Inline::Text { text: raw }], }, } } else { @@ -1477,7 +1478,7 @@ mod tests { assert!( matches!( inl, - Inline::Text(_) | Inline::Code(_) | Inline::Link { .. } | Inline::Strong(_) | Inline::Emph(_) + Inline::Text { .. } | Inline::Code { .. } | Inline::Link { .. } | Inline::Strong { .. 
} | Inline::Emph { .. } ), "unexpected inline kind: {:?}", inl @@ -1736,11 +1737,11 @@ mod tests { match &blocks[0].payload { ParsedPayload::Paragraph { inlines, .. } => { let kinds: Vec<&'static str> = inlines.iter().map(|i| match i { - Inline::Text(_) => "Text", - Inline::Code(_) => "Code", + Inline::Text { .. } => "Text", + Inline::Code { .. } => "Code", Inline::Link { .. } => "Link", - Inline::Strong(_) => "Strong", - Inline::Emph(_) => "Emph", + Inline::Strong { .. } => "Strong", + Inline::Emph { .. } => "Emph", }).collect(); assert!(kinds.contains(&"Strong")); assert!(kinds.contains(&"Emph")); diff --git a/crates/kb-parse-md/src/frontmatter.rs b/crates/kb-parse-md/src/frontmatter.rs index 3f37990..93230da 100644 --- a/crates/kb-parse-md/src/frontmatter.rs +++ b/crates/kb-parse-md/src/frontmatter.rs @@ -379,8 +379,12 @@ fn derive_metadata( // ---- title ---- // Frontmatter → BodyHints.first_h1 → None. - // Filename fallback is the caller's responsibility (P1-4 normalize), per - // task brief — `BodyHints` does not carry a filename. + // Filename fallback for title is deferred to a later phase (P1-7 or + // kb-app integration); the parse_frontmatter -> build_canonical_document + // pipeline does not currently know the workspace_path filename component + // for fallback. CanonicalDocument.title may be empty for files without + // frontmatter title and without an H1; downstream display layer should + // fall back to filename via WorkspacePath inspection. let title = raw.title.or_else(|| hints.first_h1.clone()); if let Some(t) = title { user.insert("title".to_string(), Value::String(t)); diff --git a/crates/kb-parse-md/tests/blocks_snapshots.rs b/crates/kb-parse-md/tests/blocks_snapshots.rs index 483286e..1c569ee 100644 --- a/crates/kb-parse-md/tests/blocks_snapshots.rs +++ b/crates/kb-parse-md/tests/blocks_snapshots.rs @@ -4,19 +4,19 @@ //! below. `body_offset_lines = 1` is used for both fixtures (no //! frontmatter, body starts at file line 1). //! -//! 
Note on snapshot shape: `kb_core::Inline` carries a `serde(tag = "kind")` -//! enum representation that cannot serialize newtype variants holding a -//! primitive (`Inline::Text(String)` etc.) — that's a serde limitation, not -//! ours, and is fixed up in a later kb-core task. To keep the snapshot -//! human-readable (and stable across that future fix), we project each -//! `ParsedBlock` into a `BlockView` that flattens inline content to plain -//! strings before serialization. This still pins the *contract* that -//! matters for P1-3: heading paths, source spans, payload kinds, payload -//! text content, table headers/rows, and code lang/body. +//! Note: kb-parse-md's snapshot tests use the `#[ignore]` regenerator +//! pattern (run `cargo test ... -- --ignored` to refresh baselines), +//! whereas `kb-normalize`'s integration test uses an `UPDATE_SNAPSHOTS=1` +//! env-var pattern. Migrating kb-parse-md to the env-var style is out of +//! scope; both styles are intentional for now. +//! +//! Following the kb_core::Inline schema migration (struct-variant shape), +//! `ParsedBlock` now serializes directly through serde — no projection +//! shim is required. Inlines surface as structured objects, e.g. +//! `[{"kind":"text","text":"…"},{"kind":"code","code":"…"}]`. 
-use kb_core::{Inline, SourceSpan}; use kb_parse_md::parse_blocks; -use kb_parse_types::{ParsedBlock, ParsedPayload, Warning}; +use kb_parse_types::{ParsedBlock, Warning}; use serde::Serialize; use serde_json::Value; use std::fs; @@ -24,130 +24,10 @@ use std::path::PathBuf; #[derive(Serialize)] struct Snapshot { - blocks: Vec<BlockView>, + blocks: Vec<ParsedBlock>, warnings: Vec<Warning>, } -#[derive(Serialize)] -struct BlockView { - kind: String, - heading_path: Vec<String>, - source_span: SourceSpan, - payload: PayloadView, -} - -#[derive(Serialize)] -#[serde(tag = "kind", rename_all = "lowercase")] -enum PayloadView { - Heading { - level: u8, - text: String, - }, - Paragraph { - text: String, - inlines_flat: String, - }, - List { - ordered: bool, - items_flat: Vec<String>, - }, - Code { - lang: Option<String>, - code: String, - }, - Table { - headers: Vec<String>, - rows: Vec<Vec<String>>, - }, - Quote { - text: String, - inlines_flat: String, - }, - ImageRef { - src: String, - alt: String, - }, - AudioRef { - src: String, - }, -} - -fn flatten_inline(i: &Inline, out: &mut String) { - match i { - Inline::Text(s) | Inline::Code(s) => out.push_str(s), - Inline::Link { text, href } => { - out.push('['); - out.push_str(text); - out.push_str("]("); - out.push_str(href); - out.push(')'); - } - Inline::Strong(v) => { - out.push_str("**"); - for c in v { - flatten_inline(c, out); - } - out.push_str("**"); - } - Inline::Emph(v) => { - out.push('*'); - for c in v { - flatten_inline(c, out); - } - out.push('*'); - } - } -} - -fn flatten(inlines: &[Inline]) -> String { - let mut out = String::new(); - for i in inlines { - flatten_inline(i, &mut out); - } - out -} - -fn block_to_view(b: &ParsedBlock) -> BlockView { - let kind = format!("{:?}", b.kind).to_lowercase(); - let payload = match &b.payload { - ParsedPayload::Heading { level, text } => PayloadView::Heading { - level: *level, - text: text.clone(), - }, - ParsedPayload::Paragraph { text, inlines } => PayloadView::Paragraph { - text: text.clone(), - inlines_flat: flatten(inlines), - }, - 
ParsedPayload::List { ordered, items } => PayloadView::List { - ordered: *ordered, - items_flat: items.iter().map(|it| flatten(it)).collect(), - }, - ParsedPayload::Code { lang, code } => PayloadView::Code { - lang: lang.clone(), - code: code.clone(), - }, - ParsedPayload::Table { headers, rows } => PayloadView::Table { - headers: headers.clone(), - rows: rows.clone(), - }, - ParsedPayload::Quote { text, inlines } => PayloadView::Quote { - text: text.clone(), - inlines_flat: flatten(inlines), - }, - ParsedPayload::ImageRef { src, alt } => PayloadView::ImageRef { - src: src.clone(), - alt: alt.clone(), - }, - ParsedPayload::AudioRef { src } => PayloadView::AudioRef { src: src.clone() }, - }; - BlockView { - kind, - heading_path: b.heading_path.clone(), - source_span: b.source_span.clone(), - payload, - } -} - fn fixtures_dir() -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("..") @@ -162,7 +42,7 @@ fn assert_snapshot(fixture: &str, baseline: &str) { let (blocks, warns) = parse_blocks(&bytes, 1).unwrap(); let snap = Snapshot { - blocks: blocks.iter().map(block_to_view).collect(), + blocks, warnings: warns, }; let actual: Value = serde_json::to_value(&snap).unwrap(); @@ -211,7 +91,7 @@ fn emit_blocks_snapshots() { let bytes = fs::read(dir.join(fixture)).unwrap(); let (blocks, warns) = parse_blocks(&bytes, 1).unwrap(); let snap = Snapshot { - blocks: blocks.iter().map(block_to_view).collect(), + blocks, warnings: warns, }; let json = serde_json::to_string_pretty(&snap).unwrap(); @@ -227,14 +107,10 @@ fn snapshot_is_deterministic_across_runs() { let bytes = fs::read(dir.join("nested-headings.md")).unwrap(); let (a_blocks, a_warns) = parse_blocks(&bytes, 1).unwrap(); let (b_blocks, b_warns) = parse_blocks(&bytes, 1).unwrap(); - // Compare via the view (which is fully serializable) and via the - // structural equality on `ParsedBlock` itself (no serde involved). 
assert_eq!(a_blocks, b_blocks); assert_eq!(a_warns, b_warns); - let av: Vec<_> = a_blocks.iter().map(block_to_view).collect(); - let bv: Vec<_> = b_blocks.iter().map(block_to_view).collect(); assert_eq!( - serde_json::to_value(&av).unwrap(), - serde_json::to_value(&bv).unwrap() + serde_json::to_value(&a_blocks).unwrap(), + serde_json::to_value(&b_blocks).unwrap() ); } diff --git a/fixtures/markdown/code-and-table.canonical.snapshot.json b/fixtures/markdown/code-and-table.canonical.snapshot.json new file mode 100644 index 0000000..25628fd --- /dev/null +++ b/fixtures/markdown/code-and-table.canonical.snapshot.json @@ -0,0 +1,102 @@ +{ + "blocks": [ + { + "common": { + "block_id": "dd1528c6e84d8a66087cbf6faafd67c6", + "heading_path": [], + "source_span": { + "end": 1, + "kind": "line", + "start": 1 + } + }, + "kind": "heading", + "level": 1, + "text": "Code And Table" + }, + { + "code": "fn main() {\n println!(\"hi\");\n}", + "common": { + "block_id": "68ea34aca04b83413dd8556126ae4584", + "heading_path": [ + "Code And Table" + ], + "source_span": { + "end": 7, + "kind": "line", + "start": 3 + } + }, + "kind": "code", + "lang": "rust" + }, + { + "common": { + "block_id": "b50a8e941b11f1834ae17adba9e08118", + "heading_path": [ + "Code And Table" + ], + "source_span": { + "end": 12, + "kind": "line", + "start": 9 + } + }, + "headers": [ + "col a", + "col b" + ], + "kind": "table", + "rows": [ + [ + "1", + "2" + ], + [ + "3", + "4" + ] + ] + } + ], + "doc_id": "6a9ef317c9c097ff3f6aeb317559bd83", + "doc_version": 1, + "lang": "en", + "metadata": { + "aliases": [], + "created_at": "2023-11-14T22:13:20Z", + "source_type": "markdown", + "tags": [], + "trust_level": "primary", + "updated_at": "2023-11-14T22:13:20Z", + "user": {}, + "user_id_alias": null + }, + "parser_version": "kb-normalize-snapshot-test-0", + "provenance": { + "events": [ + { + "agent": "kb-source-fs", + "at": "2023-11-14T22:13:20Z", + "kind": "discovered", + "note": null + }, + { + "agent": "kb-parse-md", 
+ "at": "", + "kind": "parsed", + "note": "parser_version=kb-normalize-snapshot-test-0" + }, + { + "agent": "kb-normalize", + "at": "", + "kind": "normalized", + "note": null + } + ] + }, + "schema_version": 1, + "source_asset_id": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "title": "Code And Table", + "workspace_path": "notes/code-and-table.md" +} diff --git a/fixtures/markdown/nested-headings.blocks.snapshot.json b/fixtures/markdown/nested-headings.blocks.snapshot.json index 032c3e6..edaa6b9 100644 --- a/fixtures/markdown/nested-headings.blocks.snapshot.json +++ b/fixtures/markdown/nested-headings.blocks.snapshot.json @@ -27,7 +27,12 @@ "payload": { "kind": "paragraph", "text": "intro", - "inlines_flat": "intro" + "inlines": [ + { + "kind": "text", + "text": "intro" + } + ] } }, { @@ -60,7 +65,12 @@ "payload": { "kind": "paragraph", "text": "body of A", - "inlines_flat": "body of A" + "inlines": [ + { + "kind": "text", + "text": "body of A" + } + ] } }, { @@ -95,7 +105,12 @@ "payload": { "kind": "paragraph", "text": "deeper", - "inlines_flat": "deeper" + "inlines": [ + { + "kind": "text", + "text": "deeper" + } + ] } }, { @@ -128,7 +143,12 @@ "payload": { "kind": "paragraph", "text": "body of B", - "inlines_flat": "body of B" + "inlines": [ + { + "kind": "text", + "text": "body of B" + } + ] } } ],