Merge pull request 'feat(p1-4): kb-normalize + kb-core Inline schema hotfix' (#9) from feat/p1-4-normalize into main

Reviewed-on: altair823-org/kb#9
2026-04-30 16:23:16 +00:00
parent 8ce44af95a 557275c04e
commit 4665910370
11 changed files with 1249 additions and 166 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -577,6 +577,21 @@ dependencies = [
 "unicode-normalization",
 ]

+[[package]]
+name = "kb-normalize"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "kb-core",
+ "kb-parse-md",
+ "kb-parse-types",
+ "serde",
+ "serde_json",
+ "time",
+ "tracing",
+ "unicode-normalization",
+]
+
 [[package]]
 name = "kb-parse-md"
 version = "0.1.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,6 +6,7 @@ members = [
    "crates/kb-config",
    "crates/kb-source-fs",
    "crates/kb-parse-md",
+    "crates/kb-normalize",
    "crates/kb-app",
    "crates/kb-cli",
 ]
--- a/crates/kb-core/src/document.rs
+++ b/crates/kb-core/src/document.rs
@@ -100,11 +100,11 @@ pub struct AudioRefBlock {
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 #[serde(rename_all = "lowercase", tag = "kind")]
 pub enum Inline {
-    Text(String),
-    Code(String),
+    Text { text: String },
+    Code { code: String },
    Link { text: String, href: String },
-    Strong(Vec<Inline>),
-    Emph(Vec<Inline>),
+    Strong { children: Vec<Inline> },
+    Emph { children: Vec<Inline> },
 }

 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
@@ -175,3 +175,37 @@ pub struct TranscriptSegment {
    pub speaker: Option<String>,
    pub confidence: Option<f32>,
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Round-trips one sample of every `Inline` variant through JSON and
+    /// checks that the decoded value equals the input.
+    ///
+    /// The enum is internally tagged (`tag = "kind"`). The earlier
+    /// newtype shapes (`Text(String)`, `Code(String)`, `Strong(Vec<…>)`,
+    /// `Emph(Vec<…>)`) failed at serde runtime because an internal tag
+    /// cannot be attached to a newtype wrapping a non-struct value. The
+    /// struct-variant shape exercised here is the §9 schema migration.
+    #[test]
+    fn inline_serde_round_trip() {
+        let bold = Inline::Text { text: "bold".into() };
+        let em = Inline::Text { text: "em".into() };
+        let samples = [
+            Inline::Text { text: "hi".into() },
+            Inline::Code { code: "x".into() },
+            Inline::Link {
+                text: "t".into(),
+                href: "h".into(),
+            },
+            Inline::Strong {
+                children: vec![bold],
+            },
+            Inline::Emph {
+                children: vec![em],
+            },
+        ];
+        for original in &samples {
+            let encoded = serde_json::to_string(original).expect("serialize");
+            let decoded: Inline = serde_json::from_str(&encoded).expect("deserialize");
+            assert_eq!(original, &decoded);
+        }
+    }
+}
--- a/crates/kb-normalize/Cargo.toml
+++ b/crates/kb-normalize/Cargo.toml
@@ -0,0 +1,27 @@
+[package]
+name          = "kb-normalize"
+version       = { workspace = true }
+edition       = { workspace = true }
+rust-version  = { workspace = true }
+license       = { workspace = true }
+repository    = { workspace = true }
+description   = "Lift parser output (kb-parse-types) into kb-core::CanonicalDocument with deterministic IDs (§3.4, §4.2, §4.3)"
+
+[dependencies]
+kb-core         = { path = "../kb-core" }
+kb-parse-types  = { path = "../kb-parse-types" }
+serde                     = { workspace = true }
+serde_json                = { workspace = true }
+unicode-normalization     = "0.1"
+time                      = { workspace = true }
+anyhow                    = { workspace = true }
+tracing                   = { workspace = true }
+
+[dev-dependencies]
+# kb-parse-md is permitted as a *dev*-dependency only — used by the
+# integration snapshot test to drive a fixture through the real parser.
+# Forbidden as a regular dep per design §8 (kb-normalize must not depend
+# on any specific parser). `cargo tree` lists dev-dependencies by default,
+# so verify with `cargo tree -p kb-normalize -e normal --depth 1`.
+kb-parse-md = { path = "../kb-parse-md" }
+serde_json  = { workspace = true }
--- a/crates/kb-normalize/src/lib.rs
+++ b/crates/kb-normalize/src/lib.rs
@@ -0,0 +1,843 @@
+//! `kb-normalize` — lift parser output (`kb-parse-types`) into a
+//! [`kb_core::CanonicalDocument`] with deterministic IDs.
+//!
+//! Per design §3.4 (CanonicalDocument / Block), §4.2 (ID recipe), §4.3
+//! (ordinal rule), §3.6 (Provenance), §8 (module boundaries).
+//!
+//! Public surface:
+//!
+//! * [`build_canonical_document`] — assemble a `CanonicalDocument` from
+//!   `(RawAsset, Metadata, Vec<ParsedBlock>, ParserVersion, Vec<Warning>)`.
+//! * [`id_for_doc`], [`id_for_block`] — re-exports of the canonical
+//!   ID-recipe functions in `kb-core::ids` (§4.2). `kb-core` is the only
+//!   implementation; `kb-normalize` is the canonical *entry point* per
+//!   design §8.
+//!
+//! This crate must NOT depend on any parser implementation crate
+//! (`kb-parse-md`, `kb-parse-pdf`, …). All parser output flows in via
+//! the shared `kb-parse-types` crate.
+
+use std::collections::HashMap;
+
+use anyhow::Result;
+use kb_core::{
+    Block, BlockId, CanonicalDocument, CodeBlock, CommonBlock, DocumentId, HeadingBlock,
+    ImageRefBlock, Inline, Lang, ListBlock, Metadata, ParserVersion, Provenance, ProvenanceEvent,
+    ProvenanceKind, RawAsset, TableBlock, TextBlock,
+};
+use kb_parse_types::{ParsedBlock, ParsedPayload, Warning, WarningKind};
+use time::OffsetDateTime;
+use unicode_normalization::UnicodeNormalization;
+
+pub use kb_core::{id_for_block, id_for_doc};
+
+/// Assemble a [`CanonicalDocument`] out of a discovered asset, its
+/// frontmatter metadata, the parser's blocks, the parser version, and
+/// the warnings collected upstream.
+///
+/// Contract (design §3.4 / §4.2 / §4.3 / §3.6):
+///
+/// * **Document ID** — `id_for_doc(workspace_path, asset_id,
+///   parser_version)`; `asset.workspace_path` is passed through
+///   unchanged.
+/// * **Block IDs** — every block goes through `lift_block`, which
+///   numbers blocks with a 0-based ordinal per `(heading_path,
+///   block_kind)` in document order (§4.3) and feeds that ordinal to
+///   `id_for_block`. Blocks for which `lift_block` returns `None` are
+///   left out of the document.
+/// * **Title / lang** — `metadata.user["title"]` and
+///   `metadata.user["lang"]` are always removed from the user map. A
+///   string value is copied into the dedicated field; a missing key or
+///   a non-string value yields an empty title / empty `Lang`.
+/// * **Provenance** — events are emitted in this order: `Discovered`
+///   (timestamped with `asset.discovered_at`), `Parsed`, `Normalized`,
+///   one `Warning` per entry of `warnings` (agent resolved through
+///   `warning_agent`), then one `Warning` per lift-stage warning (agent
+///   `kb-normalize`). Every event except `Discovered` carries the same
+///   single `now_utc()` reading; ordering is carried by `Vec` position.
+/// * **Versions** — `schema_version` and `doc_version` are both `1`.
+///
+/// The body has no failing path today: it always returns `Ok`.
+pub fn build_canonical_document(
+    asset: &RawAsset,
+    metadata: Metadata,
+    blocks: Vec<ParsedBlock>,
+    parser_version: &ParserVersion,
+    warnings: Vec<Warning>,
+) -> Result<CanonicalDocument> {
+    let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, parser_version);
+
+    // `title` / `lang` live in `metadata.user` on the way in. Pull them
+    // out (removing the keys either way) so the wire form carries each
+    // value exactly once.
+    let mut metadata = metadata;
+    let title = metadata
+        .user
+        .remove("title")
+        .as_ref()
+        .and_then(|raw| raw.as_str())
+        .map(str::to_owned)
+        .unwrap_or_default();
+    let lang = Lang(
+        metadata
+            .user
+            .remove("lang")
+            .as_ref()
+            .and_then(|raw| raw.as_str())
+            .map(str::to_owned)
+            .unwrap_or_default(),
+    );
+
+    // One ordinal counter per (heading_path, block_kind) key — §4.3.
+    let mut counters: HashMap<(Vec<String>, &'static str), u32> = HashMap::new();
+    // Warnings raised while lifting (e.g. a dropped AudioRef) are kept
+    // apart from the upstream ones so they can be attributed to
+    // `kb-normalize` instead of whatever `warning_agent` would answer
+    // for their `WarningKind`.
+    let mut lift_warnings: Vec<Warning> = Vec::new();
+    let mut lifted_blocks: Vec<Block> = Vec::new();
+    for parsed in blocks {
+        if let Some(block) = lift_block(&doc_id, parsed, &mut counters, &mut lift_warnings) {
+            lifted_blocks.push(block);
+        }
+    }
+
+    tracing::debug!(
+        target: "kb-normalize",
+        "built canonical document doc_id={} blocks={}",
+        doc_id.0,
+        lifted_blocks.len()
+    );
+
+    // A single clock reading is shared by every event produced here.
+    let now = OffsetDateTime::now_utc();
+    // `{:?}` on `WarningKind` prints the bare variant name, which keeps
+    // the Provenance note human-readable.
+    let warning_event = |agent: &str, w: &Warning| ProvenanceEvent {
+        at: now,
+        agent: agent.to_string(),
+        kind: ProvenanceKind::Warning,
+        note: Some(format!("{:?}: {}", w.kind, w.note)),
+    };
+
+    let mut events = Vec::with_capacity(3 + warnings.len() + lift_warnings.len());
+    events.extend([
+        ProvenanceEvent {
+            at: asset.discovered_at,
+            agent: "kb-source-fs".to_string(),
+            kind: ProvenanceKind::Discovered,
+            note: None,
+        },
+        ProvenanceEvent {
+            at: now,
+            agent: "kb-parse-md".to_string(),
+            kind: ProvenanceKind::Parsed,
+            note: Some(format!("parser_version={}", parser_version.0)),
+        },
+        ProvenanceEvent {
+            at: now,
+            agent: "kb-normalize".to_string(),
+            kind: ProvenanceKind::Normalized,
+            note: None,
+        },
+    ]);
+    // Upstream warnings first, attributed to the stage that raised them…
+    events.extend(
+        warnings
+            .iter()
+            .map(|w| warning_event(warning_agent(&w.kind), w)),
+    );
+    // …then the lift-stage warnings, always attributed to this crate.
+    events.extend(
+        lift_warnings
+            .iter()
+            .map(|w| warning_event("kb-normalize", w)),
+    );
+
+    Ok(CanonicalDocument {
+        doc_id,
+        source_asset_id: asset.asset_id.clone(),
+        workspace_path: asset.workspace_path.clone(),
+        title,
+        lang,
+        blocks: lifted_blocks,
+        metadata,
+        provenance: Provenance { events },
+        parser_version: parser_version.clone(),
+        schema_version: 1,
+        doc_version: 1,
+    })
+}
+
+/// Name the upstream agent responsible for a given `WarningKind`; the
+/// result fills `ProvenanceEvent::agent` on the warning's event.
+///
+/// Every kind currently maps to `"kb-parse-md"`. `ExtractFailed` is
+/// emitted today by `kb-parse-md`'s panic-recovery guard around
+/// `parse_blocks` (see `crates/kb-parse-md/src/blocks.rs`). Should
+/// another stage start emitting `ExtractFailed` through the upstream
+/// warning list, this lookup needs more context (for instance a
+/// `WarningSource` field on `Warning`) to keep attribution honest.
+///
+/// The match lists every variant explicitly, with no wildcard arm, so
+/// adding a `WarningKind` variant forces a decision here at compile
+/// time.
+fn warning_agent(kind: &WarningKind) -> &'static str {
+    match kind {
+        WarningKind::MalformedFrontmatter
+        | WarningKind::EncodingFallback
+        | WarningKind::MalformedTable
+        | WarningKind::ExtractFailed => "kb-parse-md",
+    }
+}
+
+/// Return the `block_kind` string of the §4.2 ID tuple for a payload:
+/// the variant name in lowercase with no separators.
+fn payload_kind(payload: &ParsedPayload) -> &'static str {
+    use ParsedPayload as P;
+
+    match payload {
+        P::Heading { .. } => "heading",
+        P::Paragraph { .. } => "paragraph",
+        P::List { .. } => "list",
+        P::Code { .. } => "code",
+        P::Table { .. } => "table",
+        P::Quote { .. } => "quote",
+        P::ImageRef { .. } => "imageref",
+        P::AudioRef { .. } => "audioref",
+    }
+}
+
+/// Hand out the next 0-based ordinal for the `(heading_path, kind)`
+/// group and advance that group's counter by one.
+///
+/// A group seen for the first time starts at `0`. The heading path is
+/// copied into the map key on every call.
+fn next_ordinal(
+    counters: &mut HashMap<(Vec<String>, &'static str), u32>,
+    heading_path: &[String],
+    kind: &'static str,
+) -> u32 {
+    let slot = counters.entry((heading_path.to_vec(), kind)).or_default();
+    let current = *slot;
+    *slot = current + 1;
+    current
+}
+
+/// Lift one `ParsedBlock` into a canonical `Block`, assigning its
+/// deterministic `block_id` on the way.
+///
+/// * `doc_id` — owning document, first input of the §4.2 block-ID recipe.
+/// * `pb` — the parsed block; consumed.
+/// * `counters` — per-`(heading_path, kind)` ordinal counters; the
+///   counter for this block's group is advanced on every call,
+///   including calls that end up dropping the block.
+/// * `warnings` — sink for lift-stage warnings.
+///
+/// Returns `None` only for `ParsedPayload::AudioRef`, which is dropped
+/// with an `ExtractFailed` warning pushed onto `warnings`.
+fn lift_block(
+    doc_id: &DocumentId,
+    pb: ParsedBlock,
+    counters: &mut HashMap<(Vec<String>, &'static str), u32>,
+    warnings: &mut Vec<Warning>,
+) -> Option<Block> {
+    let ParsedBlock {
+        heading_path,
+        source_span,
+        payload,
+        ..
+    } = pb;
+    let kind = payload_kind(&payload);
+
+    // Task spec line 73: "All input strings normalized to NFC before
+    // hashing." Neither `pulldown-cmark` nor `serde_json_canonicalizer`
+    // v0.3 normalizes strings, so the heading path is NFC-normalized
+    // here. The same normalized path feeds the §4.2 ID recipe and is
+    // stored in `CommonBlock.heading_path`, so the wire form matches the
+    // ID input. Otherwise NFD `\u{1100}\u{1161}` and NFC `\u{AC00}`
+    // (both render as 가) would hash to different `block_id`s for what
+    // is logically one heading.
+    let heading_path: Vec<String> = heading_path
+        .iter()
+        .map(|segment| segment.nfc().collect::<String>())
+        .collect();
+    let ordinal = next_ordinal(counters, &heading_path, kind);
+    let common = CommonBlock {
+        block_id: id_for_block(doc_id, kind, &heading_path, ordinal, &source_span),
+        heading_path,
+        source_span,
+    };
+
+    Some(match payload {
+        ParsedPayload::Heading { level, text } => Block::Heading(HeadingBlock {
+            common,
+            level,
+            text,
+        }),
+        ParsedPayload::Paragraph { text, inlines } => Block::Paragraph(TextBlock {
+            common,
+            text,
+            inlines,
+        }),
+        ParsedPayload::List { ordered, items } => {
+            // List items share the parent's `CommonBlock`, `block_id`
+            // included: §3.4 defines `ListBlock.items: Vec<TextBlock>`
+            // without per-item BlockIds, and minting them would need a
+            // §4.2 recipe extension. The item content is carried by the
+            // inline tree, with `text` as its flattened form.
+            let items = items
+                .into_iter()
+                .map(|inlines| TextBlock {
+                    common: common.clone(),
+                    text: flatten_inlines(&inlines),
+                    inlines,
+                })
+                .collect();
+            Block::List(ListBlock {
+                common,
+                ordered,
+                items,
+            })
+        }
+        ParsedPayload::Code { lang, code } => Block::Code(CodeBlock { common, lang, code }),
+        ParsedPayload::Table { headers, rows } => Block::Table(TableBlock {
+            common,
+            headers,
+            rows,
+        }),
+        ParsedPayload::Quote { text, inlines } => Block::Quote(TextBlock {
+            common,
+            text,
+            inlines,
+        }),
+        ParsedPayload::ImageRef { src, alt } => Block::ImageRef(ImageRefBlock {
+            common,
+            asset_id: None,
+            src,
+            alt,
+            ocr: None,
+            caption: None,
+        }),
+        // TODO(P8): the audio extractor will resolve workspace assets
+        // and produce real AssetIds. Until then this is a skip-and-warn
+        // shim: `AssetId::from_str` requires a 32-hex string, so a
+        // synthesized `AssetId(String::new())` would break that
+        // invariant. The block is dropped and a Warning is surfaced
+        // instead (attributed to `kb-normalize` per §3.6, since the
+        // decision is taken at the lift stage).
+        ParsedPayload::AudioRef { src } => {
+            warnings.push(Warning {
+                kind: WarningKind::ExtractFailed,
+                note: format!(
+                    "audio-ref AssetId resolution deferred to P8 — block dropped (src={src})"
+                ),
+            });
+            return None;
+        }
+    })
+}
+
+/// Concatenate the plain text of a sequence of `Inline` nodes, in
+/// order, into one `String`. Supplies `TextBlock.text` for list items,
+/// because `ParsedPayload::List` only carries an inline tree per item.
+fn flatten_inlines(inlines: &[Inline]) -> String {
+    inlines.iter().fold(String::new(), |mut text, node| {
+        flatten_inline(node, &mut text);
+        text
+    })
+}
+
+/// Append the plain text of one `Inline` node to `out`.
+///
+/// `Text` and `Code` contribute their string, `Link` contributes its
+/// visible text (the `href` is ignored), and `Strong` / `Emph` recurse
+/// into their children in order.
+fn flatten_inline(i: &Inline, out: &mut String) {
+    let leaf: &str = match i {
+        Inline::Strong { children } | Inline::Emph { children } => {
+            children.iter().for_each(|child| flatten_inline(child, out));
+            return;
+        }
+        Inline::Text { text } => text,
+        Inline::Code { code } => code,
+        Inline::Link { text, .. } => text,
+    };
+    out.push_str(leaf);
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use kb_core::{
+        AssetId, AssetStorage, Checksum, MediaType, SourceSpan, SourceType, SourceUri,
+        TrustLevel, WorkspacePath, normalize::to_posix,
+    };
+    use serde_json::Value;
+    use std::path::{Path, PathBuf};
+    use time::OffsetDateTime;
+
+    /// A Markdown `RawAsset` with fixed IDs, checksums and timestamp.
+    fn fixture_asset() -> RawAsset {
+        let on_disk = PathBuf::from("/tmp/example.md");
+        let zero_sum = || Checksum("0".repeat(64));
+        RawAsset {
+            asset_id: AssetId("a".repeat(32)),
+            source_uri: SourceUri::File(on_disk.clone()),
+            workspace_path: WorkspacePath::new("notes/example.md".into()).unwrap(),
+            media_type: MediaType::Markdown,
+            byte_len: 0,
+            checksum: zero_sum(),
+            // Fixed timestamp: determinism tests compare outputs across
+            // runs and only strip the fields they name explicitly.
+            discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
+            stored: AssetStorage::Reference {
+                path: on_disk,
+                sha: zero_sum(),
+            },
+        }
+    }
+
+    /// `Metadata` whose user map holds a string `title`, a string
+    /// `lang`, and one unrelated `custom` key.
+    fn fixture_metadata() -> Metadata {
+        let pinned = OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap();
+        let user: serde_json::Map<String, Value> = [
+            ("title", Value::from("Example")),
+            ("lang", Value::from("en")),
+            ("custom", Value::from(true)),
+        ]
+        .into_iter()
+        .map(|(key, value)| (key.to_string(), value))
+        .collect();
+        Metadata {
+            aliases: vec![],
+            tags: vec![],
+            created_at: pinned,
+            updated_at: pinned,
+            source_type: SourceType::Markdown,
+            trust_level: TrustLevel::Primary,
+            user_id_alias: None,
+            user,
+        }
+    }
+
+    /// Parser version string shared by the tests in this module.
+    fn parser_version() -> ParserVersion {
+        ParserVersion(String::from("kb-normalize-test-0"))
+    }
+
+    /// Five parsed blocks: paragraphs `p1`..`p3` and one code block
+    /// under heading `A`, then paragraph `q1` under heading `B`. Shared
+    /// by the ordinal-scoping test and the determinism stress test, so
+    /// the latter runs through `lift_block` rather than an empty input.
+    fn fixture_blocks_five() -> Vec<ParsedBlock> {
+        let paragraph = |section: &str, line, text: &str| ParsedBlock {
+            kind: kb_parse_types::ParsedBlockKind::Paragraph,
+            heading_path: vec![section.to_string()],
+            source_span: SourceSpan::Line {
+                start: line,
+                end: line,
+            },
+            payload: ParsedPayload::Paragraph {
+                text: text.into(),
+                inlines: vec![],
+            },
+        };
+        let code = ParsedBlock {
+            kind: kb_parse_types::ParsedBlockKind::Code,
+            heading_path: vec!["A".to_string()],
+            source_span: SourceSpan::Line { start: 4, end: 5 },
+            payload: ParsedPayload::Code {
+                lang: None,
+                code: "x".into(),
+            },
+        };
+        vec![
+            paragraph("A", 1, "p1"),
+            paragraph("A", 2, "p2"),
+            paragraph("A", 3, "p3"),
+            code,
+            paragraph("B", 6, "q1"),
+        ]
+    }
+
+    /// Calling `id_for_doc` 1000 more times on the same input must keep
+    /// returning the first result; a regression in canonical JSON or
+    /// BLAKE3 would show up here immediately.
+    #[test]
+    fn id_for_doc_deterministic_1000() {
+        let path = WorkspacePath::new("a/b.md".into()).unwrap();
+        let asset = AssetId("0123456789abcdef0123456789abcdef".into());
+        let pv = ParserVersion("v1".into());
+        let first = id_for_doc(&path, &asset, &pv);
+        for again in std::iter::repeat_with(|| id_for_doc(&path, &asset, &pv)).take(1000) {
+            assert_eq!(again, first);
+        }
+    }
+
+    /// The same Korean glyph spelled in NFD and in NFC must yield one
+    /// `doc_id`, because `to_posix` NFC-normalizes the path.
+    #[test]
+    fn nfc_nfd_korean_path_same_id() {
+        let asset = AssetId("0123456789abcdef0123456789abcdef".into());
+        let pv = parser_version();
+        let id_of = |raw: &str| {
+            let posix = to_posix(Path::new(raw)).unwrap();
+            id_for_doc(&posix, &asset, &pv)
+        };
+        let from_nfd = id_of("\u{1100}\u{1161}.md");
+        let from_nfc = id_of("\u{AC00}.md");
+        assert_eq!(from_nfd, from_nfc);
+    }
+
+    /// A leading `./` must not change the ID: `./a/b.md` and `a/b.md`
+    /// collapse to the same POSIX form before `id_for_doc`.
+    #[test]
+    fn posix_curdir_collapses_to_same_id() {
+        let asset = AssetId("0123456789abcdef0123456789abcdef".into());
+        let pv = parser_version();
+        let id_of = |raw: &str| {
+            let posix = to_posix(Path::new(raw)).unwrap();
+            id_for_doc(&posix, &asset, &pv)
+        };
+        let with_curdir = id_of("./a/b.md");
+        let plain = id_of("a/b.md");
+        assert_eq!(with_curdir, plain);
+    }
+
+    /// §4.3: ordinals are scoped to (heading_path, block_kind). Three
+    /// paragraphs under heading `A` get 0/1/2, the code block under `A`
+    /// starts its own counter at 0, and the paragraph under heading `B`
+    /// also starts at 0.
+    #[test]
+    fn block_ordinals_scoped_per_heading_and_kind() {
+        let asset = fixture_asset();
+        let pv = parser_version();
+        let doc = build_canonical_document(
+            &asset,
+            fixture_metadata(),
+            fixture_blocks_five(),
+            &pv,
+            vec![],
+        )
+        .unwrap();
+
+        // Expected IDs are computed independently of `lift_block`, so
+        // the test pins both the ordinal grouping and the resulting
+        // `block_id` values under the recipe.
+        let section_a = vec!["A".to_string()];
+        let section_b = vec!["B".to_string()];
+        let expect = |kind: &str, path: &[String], ordinal, start, end| {
+            id_for_block(
+                &doc.doc_id,
+                kind,
+                path,
+                ordinal,
+                &SourceSpan::Line { start, end },
+            )
+        };
+        let expected = vec![
+            expect("paragraph", &section_a, 0, 1, 1),
+            expect("paragraph", &section_a, 1, 2, 2),
+            expect("paragraph", &section_a, 2, 3, 3),
+            expect("code", &section_a, 0, 4, 5),
+            expect("paragraph", &section_b, 0, 6, 6),
+        ];
+
+        let actual: Vec<&BlockId> = doc
+            .blocks
+            .iter()
+            .map(|block| {
+                let common = match block {
+                    Block::Paragraph(t) | Block::Quote(t) => &t.common,
+                    Block::Heading(h) => &h.common,
+                    Block::List(l) => &l.common,
+                    Block::Code(c) => &c.common,
+                    Block::Table(t) => &t.common,
+                    Block::ImageRef(i) => &i.common,
+                    Block::AudioRef(a) => &a.common,
+                };
+                &common.block_id
+            })
+            .collect();
+        assert_eq!(actual, expected.iter().collect::<Vec<_>>());
+    }
+
+    /// With no warnings, provenance holds exactly `Discovered` (taken
+    /// from the asset), `Parsed`, `Normalized`, in that order and with
+    /// the documented agents.
+    #[test]
+    fn provenance_contains_stage_events_in_order() {
+        let asset = fixture_asset();
+        let pv = parser_version();
+        let doc =
+            build_canonical_document(&asset, fixture_metadata(), vec![], &pv, vec![]).unwrap();
+        let events = &doc.provenance.events;
+
+        let kinds: Vec<_> = events.iter().map(|event| event.kind).collect();
+        assert_eq!(
+            kinds,
+            vec![
+                ProvenanceKind::Discovered,
+                ProvenanceKind::Parsed,
+                ProvenanceKind::Normalized,
+            ]
+        );
+
+        let agents: Vec<&str> = events.iter().map(|event| event.agent.as_str()).collect();
+        assert_eq!(agents, ["kb-source-fs", "kb-parse-md", "kb-normalize"]);
+
+        assert_eq!(events[0].at, asset.discovered_at);
+        // Implementation invariant: Parsed and Normalized are stamped
+        // from one `now_utc()` reading within a single call.
+        assert_eq!(events[1].at, events[2].at, "Parsed and Normalized share now_utc");
+    }
+
+    /// An upstream warning handed to `build_canonical_document` becomes
+    /// a trailing `ProvenanceKind::Warning` event carrying the upstream
+    /// agent and the warning's note.
+    #[test]
+    fn provenance_includes_warnings() {
+        let asset = fixture_asset();
+        let pv = parser_version();
+        let upstream = Warning {
+            kind: WarningKind::MalformedFrontmatter,
+            note: "missing closing fence".into(),
+        };
+        let doc =
+            build_canonical_document(&asset, fixture_metadata(), vec![], &pv, vec![upstream])
+                .unwrap();
+
+        let events = &doc.provenance.events;
+        assert_eq!(events.len(), 4);
+        let tail = events.last().unwrap();
+        assert_eq!(tail.kind, ProvenanceKind::Warning);
+        assert_eq!(tail.agent, "kb-parse-md");
+        let note = tail.note.as_deref().unwrap();
+        assert!(note.contains("missing closing fence"));
+    }
+
+    /// `title` and `lang` move out of `metadata.user` into the dedicated
+    /// `CanonicalDocument` fields (so the wire form does not carry them
+    /// twice), while unrelated user keys stay in place.
+    #[test]
+    fn lifts_title_and_lang_from_user_map() {
+        let asset = fixture_asset();
+        let pv = parser_version();
+        let doc =
+            build_canonical_document(&asset, fixture_metadata(), vec![], &pv, vec![]).unwrap();
+
+        assert_eq!(doc.title, "Example");
+        assert_eq!(doc.lang, Lang("en".into()));
+
+        let user = &doc.metadata.user;
+        for lifted in ["title", "lang"] {
+            assert!(!user.contains_key(lifted));
+        }
+        assert!(user.contains_key("custom"));
+    }
+
+    /// Determinism property: 1000 further calls to
+    /// `build_canonical_document` on identical inputs serialize to the
+    /// same JSON as a baseline call, once the `now_utc()`-derived
+    /// timestamps are masked. The loop must also finish within 1 second.
+    #[test]
+    fn determinism_1000_iterations_under_1s() {
+        // Serialize `doc` and overwrite `at` on every provenance event
+        // after the first with a sentinel. Event 0 is `Discovered`,
+        // whose timestamp is pinned by the fixture; all later events
+        // (Parsed, Normalized and any Warning) are stamped from
+        // `now_utc()` and would differ between runs.
+        fn strip_dynamic_at(doc: &CanonicalDocument) -> Value {
+            let mut json = serde_json::to_value(doc).unwrap();
+            let events = json
+                .get_mut("provenance")
+                .and_then(|provenance| provenance.get_mut("events"))
+                .and_then(|events| events.as_array_mut());
+            if let Some(events) = events {
+                for event in events.iter_mut().skip(1) {
+                    if let Some(fields) = event.as_object_mut() {
+                        fields.insert("at".into(), Value::String("<stripped>".into()));
+                    }
+                }
+            }
+            json
+        }
+
+        let asset = fixture_asset();
+        let metadata = fixture_metadata();
+        let pv = parser_version();
+
+        // The 5-block fixture keeps the `lift_block` path (block_id
+        // hashing, NFC normalization, ordinal counters) inside the
+        // loop instead of exercising an empty Vec.
+        let render = || {
+            let doc = build_canonical_document(
+                &asset,
+                metadata.clone(),
+                fixture_blocks_five(),
+                &pv,
+                vec![],
+            )
+            .unwrap();
+            serde_json::to_string(&strip_dynamic_at(&doc)).unwrap()
+        };
+
+        let baseline_json = render();
+
+        let start = std::time::Instant::now();
+        for _ in 0..1000 {
+            let next_json = render();
+            assert_eq!(baseline_json, next_json);
+        }
+        let took = start.elapsed();
+        assert!(
+            took < std::time::Duration::from_secs(1),
+            "1000 iterations took {:?}",
+            took
+        );
+    }
+
+    /// I1 regression — an upstream `WarningKind::ExtractFailed` comes
+    /// from `kb-parse-md` (panic recovery in `blocks.rs`), so its
+    /// `ProvenanceEvent::agent` must be `"kb-parse-md"`. Attributing it
+    /// to `"kb-normalize"` would misreport parse panics and break
+    /// stage-filtered debugging.
+    #[test]
+    fn provenance_with_extract_failed_warning_attributes_to_kb_parse_md() {
+        let asset = fixture_asset();
+        let pv = parser_version();
+        let upstream = Warning {
+            kind: WarningKind::ExtractFailed,
+            note: "pulldown-cmark panicked; body discarded".into(),
+        };
+        let doc =
+            build_canonical_document(&asset, fixture_metadata(), vec![], &pv, vec![upstream])
+                .unwrap();
+
+        let mut warning_events = doc
+            .provenance
+            .events
+            .iter()
+            .filter(|event| event.kind == ProvenanceKind::Warning);
+        let warning_event = warning_events.next().expect("warning event present");
+
+        assert_eq!(warning_event.agent, "kb-parse-md");
+        let note = warning_event.note.as_deref().unwrap();
+        assert!(note.contains("ExtractFailed"));
+    }
+
+    /// I2 regression — a `ParsedPayload::AudioRef` block is dropped
+    /// rather than lifted into a `Block::AudioRef` carrying a
+    /// synthesized empty `AssetId` (which would violate the 32-hex
+    /// invariant of `AssetId::from_str`). The drop surfaces as a
+    /// Provenance `Warning` attributed to `"kb-normalize"`, since the
+    /// lift stage is where the decision is made.
+    #[test]
+    fn audio_ref_block_skipped_with_warning() {
+        let audio = ParsedBlock {
+            kind: kb_parse_types::ParsedBlockKind::AudioRef,
+            heading_path: vec![],
+            source_span: SourceSpan::Line { start: 1, end: 1 },
+            payload: ParsedPayload::AudioRef {
+                src: "voice.m4a".into(),
+            },
+        };
+        let raw = fixture_asset();
+        let meta = fixture_metadata();
+        let pv = parser_version();
+        let doc =
+            build_canonical_document(&raw, meta, vec![audio], &pv, vec![]).unwrap();
+
+        // Every block in the canonical output is something other than AudioRef.
+        assert!(
+            doc.blocks
+                .iter()
+                .all(|block| !matches!(block, Block::AudioRef(_))),
+            "AudioRef block should be skipped pre-P8"
+        );
+
+        // A single Warning event is expected, and its note names the src.
+        let warnings: Vec<_> = doc
+            .provenance
+            .events
+            .iter()
+            .filter(|ev| ev.kind == ProvenanceKind::Warning)
+            .collect();
+        assert_eq!(warnings.len(), 1);
+        let skipped = warnings[0];
+        assert_eq!(skipped.agent, "kb-normalize");
+        assert!(skipped.note.as_deref().unwrap().contains("voice.m4a"));
+    }
+
+    /// I3 regression — heading-path strings are NFC-normalized before
+    /// they reach `id_for_block`, so an NFD heading and its canonically
+    /// equivalent NFC form must produce the same `block_id`. This is
+    /// the `block_id` counterpart of `nfc_nfd_korean_path_same_id`,
+    /// which covers `doc_id`.
+    #[test]
+    fn nfc_nfd_korean_heading_path_same_block_id() {
+        let raw = fixture_asset();
+        let pv = parser_version();
+
+        // Builds a document whose only block is a one-character paragraph
+        // filed under `heading`; the heading is the only input that varies.
+        let build_under = |heading: &str| {
+            let paragraph = ParsedBlock {
+                kind: kb_parse_types::ParsedBlockKind::Paragraph,
+                heading_path: vec![heading.to_string()],
+                source_span: SourceSpan::Line { start: 1, end: 1 },
+                payload: ParsedPayload::Paragraph {
+                    text: "p".into(),
+                    inlines: vec![],
+                },
+            };
+            let meta = fixture_metadata();
+            build_canonical_document(&raw, meta, vec![paragraph], &pv, vec![]).unwrap()
+        };
+
+        let nfd_heading = "\u{1100}\u{1161}"; // 가 (NFD)
+        let nfc_heading = "\u{AC00}"; // 가 (NFC)
+
+        let doc_nfd = build_under(nfd_heading);
+        let doc_nfc = build_under(nfc_heading);
+
+        // Both documents must expose the paragraph as their first block.
+        let Block::Paragraph(para_nfd) = &doc_nfd.blocks[0] else {
+            panic!("expected Paragraph");
+        };
+        let Block::Paragraph(para_nfc) = &doc_nfc.blocks[0] else {
+            panic!("expected Paragraph");
+        };
+
+        assert_eq!(
+            para_nfd.common.block_id,
+            para_nfc.common.block_id,
+            "NFD and NFC heading paths must hash equal"
+        );
+    }
+
+    /// M7 — an explicit `metadata.user["title"] = ""` is a string value and
+    /// is meant to be lifted as-is, not dropped, so `title` ends up `""`.
+    /// NOTE(review): the name says "falls back to default", and the default
+    /// title is also `""`, so this assertion cannot tell the two apart.
+    #[test]
+    fn title_empty_string_in_user_map_falls_back_to_default() {
+        let raw = fixture_asset();
+        let mut meta = fixture_metadata();
+        let empty_title = Value::String(String::new());
+        meta.user.insert("title".into(), empty_title);
+
+        let pv = parser_version();
+        let doc = build_canonical_document(&raw, meta, vec![], &pv, vec![]).unwrap();
+        assert_eq!(doc.title, "");
+    }
+
+    /// M7 — a non-string `metadata.user["title"]` (here the number 42)
+    /// is silently dropped: the build still succeeds and `title` takes
+    /// the fallback default, the empty string.
+    #[test]
+    fn title_non_string_in_user_map_silently_drops() {
+        let raw = fixture_asset();
+        let mut meta = fixture_metadata();
+        let numeric_title = Value::Number(42.into());
+        meta.user.insert("title".into(), numeric_title);
+
+        let pv = parser_version();
+        let doc = build_canonical_document(&raw, meta, vec![], &pv, vec![]).unwrap();
+        assert_eq!(doc.title, "");
+    }
+
+    /// M7 — a `lang` entry that is not a string (an empty array here) is
+    /// silently dropped, leaving `doc.lang` as the empty `Lang`. P1-2
+    /// frontmatter validates the shape upstream; this is a defensive check.
+    #[test]
+    fn lang_invalid_shape_silently_drops() {
+        let raw = fixture_asset();
+        let mut meta = fixture_metadata();
+        let array_lang = Value::Array(Vec::new());
+        meta.user.insert("lang".into(), array_lang);
+        let pv = parser_version();
+        let doc = build_canonical_document(&raw, meta, vec![], &pv, vec![]).unwrap();
+        assert_eq!(doc.lang, Lang(String::new()));
+    }
+}
--- a/crates/kb-normalize/tests/normalize_snapshot.rs
+++ b/crates/kb-normalize/tests/normalize_snapshot.rs
@@ -0,0 +1,160 @@
+//! Snapshot test pinning the full `CanonicalDocument` JSON for the
+//! `code-and-table.md` fixture.
+//!
+//! This is an integration test (it lives under `tests/`) and depends on
+//! `kb-parse-md` only as a dev-dep so the production crate's regular
+//! deps still satisfy the §8 boundary (`cargo tree -p kb-normalize
+//! --depth 1` without `-e dev` does not list any parser implementation).
+//!
+//! Non-deterministic fields are stripped before comparison:
+//!
+//! * `provenance.events[*].at` — each invocation calls `now_utc()` for
+//!   the Parsed/Normalized/Warning events. The Discovered event uses
+//!   the asset's pinned `discovered_at`, so we keep that one and replace
+//!   only indices ≥ 1.
+
+use std::path::PathBuf;
+
+use kb_core::{
+    AssetId, AssetStorage, Checksum, MediaType, ParserVersion, RawAsset, SourceUri,
+    WorkspacePath,
+};
+use kb_normalize::build_canonical_document;
+use kb_parse_md::{BodyHints, parse_blocks, parse_frontmatter};
+use serde_json::Value;
+use time::OffsetDateTime;
+
+/// Path of the shared markdown fixtures, relative to this crate's manifest dir.
+fn fixtures_dir() -> PathBuf {
+    let mut dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    // Two levels up from the manifest directory, then `fixtures/markdown`.
+    dir.extend(["..", "..", "fixtures", "markdown"]);
+    dir
+}
+
+/// Builds a `RawAsset` for `workspace_path` with every other field fixed;
+/// the pinned `discovered_at` keeps the Discovered provenance event stable.
+fn fixed_asset(workspace_path: &str) -> RawAsset {
+    let on_disk = PathBuf::from("/tmp/code-and-table.md");
+    RawAsset {
+        asset_id: AssetId("a".repeat(32)),
+        source_uri: SourceUri::File(on_disk.clone()),
+        workspace_path: WorkspacePath::new(workspace_path.into()).unwrap(),
+        media_type: MediaType::Markdown,
+        byte_len: 0,
+        checksum: Checksum("0".repeat(64)),
+        discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
+        stored: AssetStorage::Reference {
+            path: on_disk,
+            sha: Checksum("0".repeat(64)),
+        },
+    }
+}
+
+/// Masks the non-deterministic `at` timestamps in a serialized document:
+/// every entry of `provenance.events` after index 0 that is a JSON object
+/// gets `"at": "<stripped>"`. If `provenance.events` is missing or is not
+/// an array, `v` is returned unchanged.
+fn strip_dynamic(mut v: Value) -> Value {
+    let events = v
+        .pointer_mut("/provenance/events")
+        .and_then(Value::as_array_mut);
+    if let Some(events) = events {
+        // Index 0 is kept as-is; non-object entries are skipped.
+        for fields in events.iter_mut().skip(1).filter_map(Value::as_object_mut) {
+            fields.insert("at".into(), Value::String("<stripped>".into()));
+        }
+    }
+    v
+}
+
+#[test]
+fn code_and_table_canonical_snapshot() {
+    let dir = fixtures_dir();
+    let bytes = std::fs::read(dir.join("code-and-table.md")).expect("fixture readable");
+
+    // `code-and-table.md` carries no frontmatter, so these hints are what the
+    // lifted Metadata is derived from; both timestamps reuse the asset's
+    // pinned `discovered_at` to keep it reproducible. `first_h1` is pinned so
+    // the BodyHints → user.title → CanonicalDocument.title lift chain is
+    // exercised end-to-end (without it the title would stay `""`).
+    let asset = fixed_asset("notes/code-and-table.md");
+    let hints = BodyHints {
+        first_h1: Some("Code And Table".into()),
+        fs_ctime: asset.discovered_at,
+        fs_mtime: asset.discovered_at,
+        fallback_lang: Some("en".into()),
+    };
+    // Frontmatter warnings are not forwarded to the builder in this test.
+    let (mut metadata, fm_span, _fm_warns) =
+        parse_frontmatter(&bytes, &hints).expect("frontmatter parses");
+
+    // 1-indexed line on which the body starts. This fixture has no
+    // frontmatter and takes the `None` arm; the `Some` arm (newlines in the
+    // prelude, plus one) is kept for completeness and is not exercised here.
+    let body_offset_lines: u32 = match fm_span {
+        Some(span) => {
+            let newlines = bytes[..span.end].iter().filter(|&&b| b == b'\n').count();
+            u32::try_from(newlines).expect("prelude newline count fits in u32") + 1
+        }
+        None => 1,
+    };
+    let (blocks, parse_warns) =
+        parse_blocks(&bytes, body_offset_lines).expect("blocks parse");
+
+    // `created_at` / `updated_at` come from the pinned BodyHints timestamps.
+    // Aliases and tags are sorted, presumably to pin their snapshot order.
+    metadata.aliases.sort();
+    metadata.tags.sort();
+
+    let parser_version = ParserVersion("kb-normalize-snapshot-test-0".into());
+    let doc = build_canonical_document(
+        &asset,
+        metadata,
+        blocks,
+        &parser_version,
+        parse_warns,
+    )
+    .expect("build_canonical_document");
+
+    // Explicit check of the title lift; the snapshot pins it as well, but
+    // this assertion fails with a clearer message if it ever drifts.
+    assert_eq!(doc.title, "Code And Table");
+
+    let actual = strip_dynamic(serde_json::to_value(&doc).unwrap());
+    let baseline_path = dir.join("code-and-table.canonical.snapshot.json");
+
+    // Any value of UPDATE_SNAPSHOTS (that is valid Unicode) opts in to rewriting.
+    let update_requested = std::env::var("UPDATE_SNAPSHOTS").is_ok();
+    let write_baseline = || {
+        let pretty = serde_json::to_string_pretty(&actual).unwrap();
+        std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
+    };
+
+    let baseline_text = match std::fs::read_to_string(&baseline_path) {
+        Ok(text) => text,
+        Err(_) if update_requested => {
+            write_baseline();
+            return;
+        }
+        Err(e) => panic!(
+            "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
+            baseline_path.display()
+        ),
+    };
+    let expected: Value =
+        serde_json::from_str(&baseline_text).expect("baseline parses as json");
+    if actual == expected {
+        return;
+    }
+    if update_requested {
+        write_baseline();
+        eprintln!("updated baseline {}", baseline_path.display());
+        return;
+    }
+    let pretty = serde_json::to_string_pretty(&actual).unwrap();
+    panic!(
+        "canonical snapshot drift\n--- expected ({}) ---\n{baseline_text}\n--- actual ---\n{pretty}\nIf intentional, re-run with UPDATE_SNAPSHOTS=1.",
+        baseline_path.display()
+    );
+}
--- a/crates/kb-parse-md/src/blocks.rs
+++ b/crates/kb-parse-md/src/blocks.rs
@@ -300,12 +300,12 @@ impl InlineBuf {

    fn push_text(&mut self, s: &str) {
        self.text.push_str(s);
-        self.push_inline(Inline::Text(s.to_string()));
+        self.push_inline(Inline::Text { text: s.to_string() });
    }

    fn push_code(&mut self, s: &str) {
        self.text.push_str(s);
-        self.push_inline(Inline::Code(s.to_string()));
+        self.push_inline(Inline::Code { code: s.to_string() });
    }

    fn open_strong(&mut self) {
@@ -313,7 +313,7 @@ impl InlineBuf {
    }
    fn close_strong(&mut self) {
        if let Some(InlineFrame::Strong(kids)) = self.stack.pop() {
-            self.push_inline(Inline::Strong(kids));
+            self.push_inline(Inline::Strong { children: kids });
        }
    }

@@ -322,7 +322,7 @@ impl InlineBuf {
    }
    fn close_emph(&mut self) {
        if let Some(InlineFrame::Emph(kids)) = self.stack.pop() {
-            self.push_inline(Inline::Emph(kids));
+            self.push_inline(Inline::Emph { children: kids });
        }
    }

@@ -361,8 +361,8 @@ impl InlineBuf {
        // If formatting tags were unbalanced we close them defensively.
        while self.stack.len() > 1 {
            match self.stack.pop().unwrap() {
-                InlineFrame::Strong(kids) => self.push_inline(Inline::Strong(kids)),
-                InlineFrame::Emph(kids) => self.push_inline(Inline::Emph(kids)),
+                InlineFrame::Strong(kids) => self.push_inline(Inline::Strong { children: kids }),
+                InlineFrame::Emph(kids) => self.push_inline(Inline::Emph { children: kids }),
                InlineFrame::Link { href, text, kids } => {
                    let flat = if !text.is_empty() {
                        text
@@ -475,10 +475,11 @@ fn flatten_inlines_to_text(inlines: &[Inline]) -> String {

 fn flatten_one(i: &Inline, out: &mut String) {
    match i {
-        Inline::Text(s) | Inline::Code(s) => out.push_str(s),
+        Inline::Text { text } => out.push_str(text),
+        Inline::Code { code } => out.push_str(code),
        Inline::Link { text, .. } => out.push_str(text),
-        Inline::Strong(v) | Inline::Emph(v) => {
-            for c in v {
+        Inline::Strong { children } | Inline::Emph { children } => {
+            for c in children {
                flatten_one(c, out);
            }
        }
@@ -823,7 +824,7 @@ impl<'a> WalkState<'a> {
                                    text.push('\n');
                                }
                                text.push_str(t);
-                                inlines.push(Inline::Text(t.clone()));
+                                inlines.push(Inline::Text { text: t.clone() });
                            }
                            _ => {}
                        }
@@ -921,7 +922,7 @@ impl<'a> WalkState<'a> {
                            source_span: self.span_for(&range),
                            payload: ParsedPayload::Paragraph {
                                text: raw.clone(),
-                                inlines: vec![Inline::Text(raw)],
+                                inlines: vec![Inline::Text { text: raw }],
                            },
                        }
                    } else {
@@ -1477,7 +1478,7 @@ mod tests {
                    assert!(
                        matches!(
                            inl,
-                            Inline::Text(_) | Inline::Code(_) | Inline::Link { .. } | Inline::Strong(_) | Inline::Emph(_)
+                            Inline::Text { .. } | Inline::Code { .. } | Inline::Link { .. } | Inline::Strong { .. } | Inline::Emph { .. }
                        ),
                        "unexpected inline kind: {:?}",
                        inl
@@ -1736,11 +1737,11 @@ mod tests {
        match &blocks[0].payload {
            ParsedPayload::Paragraph { inlines, .. } => {
                let kinds: Vec<&'static str> = inlines.iter().map(|i| match i {
-                    Inline::Text(_) => "Text",
-                    Inline::Code(_) => "Code",
+                    Inline::Text { .. } => "Text",
+                    Inline::Code { .. } => "Code",
                    Inline::Link { .. } => "Link",
-                    Inline::Strong(_) => "Strong",
-                    Inline::Emph(_) => "Emph",
+                    Inline::Strong { .. } => "Strong",
+                    Inline::Emph { .. } => "Emph",
                }).collect();
                assert!(kinds.contains(&"Strong"));
                assert!(kinds.contains(&"Emph"));
--- a/crates/kb-parse-md/src/frontmatter.rs
+++ b/crates/kb-parse-md/src/frontmatter.rs
@@ -379,8 +379,12 @@ fn derive_metadata(

    // ---- title ----
    // Frontmatter → BodyHints.first_h1 → None.
-    // Filename fallback is the caller's responsibility (P1-4 normalize), per
-    // task brief — `BodyHints` does not carry a filename.
+    // Filename fallback for title is deferred to a later phase (P1-7 or
+    // kb-app integration); the parse_frontmatter -> build_canonical_document
+    // pipeline does not currently know the workspace_path filename component
+    // for fallback. CanonicalDocument.title may be empty for files without
+    // frontmatter title and without an H1; downstream display layer should
+    // fall back to filename via WorkspacePath inspection.
    let title = raw.title.or_else(|| hints.first_h1.clone());
    if let Some(t) = title {
        user.insert("title".to_string(), Value::String(t));
--- a/crates/kb-parse-md/tests/blocks_snapshots.rs
+++ b/crates/kb-parse-md/tests/blocks_snapshots.rs
@@ -4,19 +4,19 @@
 //! below. `body_offset_lines = 1` is used for both fixtures (no
 //! frontmatter, body starts at file line 1).
 //!
-//! Note on snapshot shape: `kb_core::Inline` carries a `serde(tag = "kind")`
-//! enum representation that cannot serialize newtype variants holding a
-//! primitive (`Inline::Text(String)` etc.) — that's a serde limitation, not
-//! ours, and is fixed up in a later kb-core task. To keep the snapshot
-//! human-readable (and stable across that future fix), we project each
-//! `ParsedBlock` into a `BlockView` that flattens inline content to plain
-//! strings before serialization. This still pins the *contract* that
-//! matters for P1-3: heading paths, source spans, payload kinds, payload
-//! text content, table headers/rows, and code lang/body.
+//! Note: kb-parse-md's snapshot tests use the `#[ignore]` regenerator
+//! pattern (run `cargo test ... -- --ignored` to refresh baselines),
+//! whereas `kb-normalize`'s integration test uses an `UPDATE_SNAPSHOTS=1`
+//! env-var pattern. Migrating kb-parse-md to the env-var style is out of
+//! scope; both styles are intentional for now.
+//!
+//! Following the kb_core::Inline schema migration (struct-variant shape),
+//! `ParsedBlock` now serializes directly through serde — no projection
+//! shim is required. Inlines surface as structured objects, e.g.
+//! `[{"kind":"text","text":"…"},{"kind":"code","code":"…"}]`.

-use kb_core::{Inline, SourceSpan};
 use kb_parse_md::parse_blocks;
-use kb_parse_types::{ParsedBlock, ParsedPayload, Warning};
+use kb_parse_types::{ParsedBlock, Warning};
 use serde::Serialize;
 use serde_json::Value;
 use std::fs;
@@ -24,130 +24,10 @@ use std::path::PathBuf;

 #[derive(Serialize)]
 struct Snapshot {
-    blocks: Vec<BlockView>,
+    blocks: Vec<ParsedBlock>,
    warnings: Vec<Warning>,
 }

-#[derive(Serialize)]
-struct BlockView {
-    kind: String,
-    heading_path: Vec<String>,
-    source_span: SourceSpan,
-    payload: PayloadView,
-}
-
-#[derive(Serialize)]
-#[serde(tag = "kind", rename_all = "lowercase")]
-enum PayloadView {
-    Heading {
-        level: u8,
-        text: String,
-    },
-    Paragraph {
-        text: String,
-        inlines_flat: String,
-    },
-    List {
-        ordered: bool,
-        items_flat: Vec<String>,
-    },
-    Code {
-        lang: Option<String>,
-        code: String,
-    },
-    Table {
-        headers: Vec<String>,
-        rows: Vec<Vec<String>>,
-    },
-    Quote {
-        text: String,
-        inlines_flat: String,
-    },
-    ImageRef {
-        src: String,
-        alt: String,
-    },
-    AudioRef {
-        src: String,
-    },
-}
-
-fn flatten_inline(i: &Inline, out: &mut String) {
-    match i {
-        Inline::Text(s) | Inline::Code(s) => out.push_str(s),
-        Inline::Link { text, href } => {
-            out.push('[');
-            out.push_str(text);
-            out.push_str("](");
-            out.push_str(href);
-            out.push(')');
-        }
-        Inline::Strong(v) => {
-            out.push_str("**");
-            for c in v {
-                flatten_inline(c, out);
-            }
-            out.push_str("**");
-        }
-        Inline::Emph(v) => {
-            out.push('*');
-            for c in v {
-                flatten_inline(c, out);
-            }
-            out.push('*');
-        }
-    }
-}
-
-fn flatten(inlines: &[Inline]) -> String {
-    let mut out = String::new();
-    for i in inlines {
-        flatten_inline(i, &mut out);
-    }
-    out
-}
-
-fn block_to_view(b: &ParsedBlock) -> BlockView {
-    let kind = format!("{:?}", b.kind).to_lowercase();
-    let payload = match &b.payload {
-        ParsedPayload::Heading { level, text } => PayloadView::Heading {
-            level: *level,
-            text: text.clone(),
-        },
-        ParsedPayload::Paragraph { text, inlines } => PayloadView::Paragraph {
-            text: text.clone(),
-            inlines_flat: flatten(inlines),
-        },
-        ParsedPayload::List { ordered, items } => PayloadView::List {
-            ordered: *ordered,
-            items_flat: items.iter().map(|it| flatten(it)).collect(),
-        },
-        ParsedPayload::Code { lang, code } => PayloadView::Code {
-            lang: lang.clone(),
-            code: code.clone(),
-        },
-        ParsedPayload::Table { headers, rows } => PayloadView::Table {
-            headers: headers.clone(),
-            rows: rows.clone(),
-        },
-        ParsedPayload::Quote { text, inlines } => PayloadView::Quote {
-            text: text.clone(),
-            inlines_flat: flatten(inlines),
-        },
-        ParsedPayload::ImageRef { src, alt } => PayloadView::ImageRef {
-            src: src.clone(),
-            alt: alt.clone(),
-        },
-        ParsedPayload::AudioRef { src } => PayloadView::AudioRef { src: src.clone() },
-    };
-    BlockView {
-        kind,
-        heading_path: b.heading_path.clone(),
-        source_span: b.source_span.clone(),
-        payload,
-    }
-}
-
 fn fixtures_dir() -> PathBuf {
    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("..")
@@ -162,7 +42,7 @@ fn assert_snapshot(fixture: &str, baseline: &str) {

    let (blocks, warns) = parse_blocks(&bytes, 1).unwrap();
    let snap = Snapshot {
-        blocks: blocks.iter().map(block_to_view).collect(),
+        blocks,
        warnings: warns,
    };
    let actual: Value = serde_json::to_value(&snap).unwrap();
@@ -211,7 +91,7 @@ fn emit_blocks_snapshots() {
        let bytes = fs::read(dir.join(fixture)).unwrap();
        let (blocks, warns) = parse_blocks(&bytes, 1).unwrap();
        let snap = Snapshot {
-            blocks: blocks.iter().map(block_to_view).collect(),
+            blocks,
            warnings: warns,
        };
        let json = serde_json::to_string_pretty(&snap).unwrap();
@@ -227,14 +107,10 @@ fn snapshot_is_deterministic_across_runs() {
    let bytes = fs::read(dir.join("nested-headings.md")).unwrap();
    let (a_blocks, a_warns) = parse_blocks(&bytes, 1).unwrap();
    let (b_blocks, b_warns) = parse_blocks(&bytes, 1).unwrap();
-    // Compare via the view (which is fully serializable) and via the
-    // structural equality on `ParsedBlock` itself (no serde involved).
    assert_eq!(a_blocks, b_blocks);
    assert_eq!(a_warns, b_warns);
-    let av: Vec<_> = a_blocks.iter().map(block_to_view).collect();
-    let bv: Vec<_> = b_blocks.iter().map(block_to_view).collect();
    assert_eq!(
-        serde_json::to_value(&av).unwrap(),
-        serde_json::to_value(&bv).unwrap()
+        serde_json::to_value(&a_blocks).unwrap(),
+        serde_json::to_value(&b_blocks).unwrap()
    );
 }
--- a/fixtures/markdown/code-and-table.canonical.snapshot.json
+++ b/fixtures/markdown/code-and-table.canonical.snapshot.json
@@ -0,0 +1,102 @@
+{
+  "blocks": [
+    {
+      "common": {
+        "block_id": "dd1528c6e84d8a66087cbf6faafd67c6",
+        "heading_path": [],
+        "source_span": {
+          "end": 1,
+          "kind": "line",
+          "start": 1
+        }
+      },
+      "kind": "heading",
+      "level": 1,
+      "text": "Code And Table"
+    },
+    {
+      "code": "fn main() {\n    println!(\"hi\");\n}",
+      "common": {
+        "block_id": "68ea34aca04b83413dd8556126ae4584",
+        "heading_path": [
+          "Code And Table"
+        ],
+        "source_span": {
+          "end": 7,
+          "kind": "line",
+          "start": 3
+        }
+      },
+      "kind": "code",
+      "lang": "rust"
+    },
+    {
+      "common": {
+        "block_id": "b50a8e941b11f1834ae17adba9e08118",
+        "heading_path": [
+          "Code And Table"
+        ],
+        "source_span": {
+          "end": 12,
+          "kind": "line",
+          "start": 9
+        }
+      },
+      "headers": [
+        "col a",
+        "col b"
+      ],
+      "kind": "table",
+      "rows": [
+        [
+          "1",
+          "2"
+        ],
+        [
+          "3",
+          "4"
+        ]
+      ]
+    }
+  ],
+  "doc_id": "6a9ef317c9c097ff3f6aeb317559bd83",
+  "doc_version": 1,
+  "lang": "en",
+  "metadata": {
+    "aliases": [],
+    "created_at": "2023-11-14T22:13:20Z",
+    "source_type": "markdown",
+    "tags": [],
+    "trust_level": "primary",
+    "updated_at": "2023-11-14T22:13:20Z",
+    "user": {},
+    "user_id_alias": null
+  },
+  "parser_version": "kb-normalize-snapshot-test-0",
+  "provenance": {
+    "events": [
+      {
+        "agent": "kb-source-fs",
+        "at": "2023-11-14T22:13:20Z",
+        "kind": "discovered",
+        "note": null
+      },
+      {
+        "agent": "kb-parse-md",
+        "at": "<stripped>",
+        "kind": "parsed",
+        "note": "parser_version=kb-normalize-snapshot-test-0"
+      },
+      {
+        "agent": "kb-normalize",
+        "at": "<stripped>",
+        "kind": "normalized",
+        "note": null
+      }
+    ]
+  },
+  "schema_version": 1,
+  "source_asset_id": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+  "title": "Code And Table",
+  "workspace_path": "notes/code-and-table.md"
+}
--- a/fixtures/markdown/nested-headings.blocks.snapshot.json
+++ b/fixtures/markdown/nested-headings.blocks.snapshot.json
@@ -27,7 +27,12 @@
      "payload": {
        "kind": "paragraph",
        "text": "intro",
-        "inlines_flat": "intro"
+        "inlines": [
+          {
+            "kind": "text",
+            "text": "intro"
+          }
+        ]
      }
    },
    {
@@ -60,7 +65,12 @@
      "payload": {
        "kind": "paragraph",
        "text": "body of A",
-        "inlines_flat": "body of A"
+        "inlines": [
+          {
+            "kind": "text",
+            "text": "body of A"
+          }
+        ]
      }
    },
    {
@@ -95,7 +105,12 @@
      "payload": {
        "kind": "paragraph",
        "text": "deeper",
-        "inlines_flat": "deeper"
+        "inlines": [
+          {
+            "kind": "text",
+            "text": "deeper"
+          }
+        ]
      }
    },
    {
@@ -128,7 +143,12 @@
      "payload": {
        "kind": "paragraph",
        "text": "body of B",
-        "inlines_flat": "body of B"
+        "inlines": [
+          {
+            "kind": "text",
+            "text": "body of B"
+          }
+        ]
      }
    }
  ],