diff --git a/Cargo.lock b/Cargo.lock index 62a0784..0ce2a30 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -582,14 +582,11 @@ name = "kb-normalize" version = "0.1.0" dependencies = [ "anyhow", - "blake3", - "kb-config", "kb-core", "kb-parse-md", "kb-parse-types", "serde", "serde_json", - "serde_json_canonicalizer", "time", "tracing", "unicode-normalization", diff --git a/crates/kb-normalize/Cargo.toml b/crates/kb-normalize/Cargo.toml index feaf5c7..6d61e35 100644 --- a/crates/kb-normalize/Cargo.toml +++ b/crates/kb-normalize/Cargo.toml @@ -10,11 +10,8 @@ description = "Lift parser output (kb-parse-types) into kb-core::CanonicalDocu [dependencies] kb-core = { path = "../kb-core" } kb-parse-types = { path = "../kb-parse-types" } -kb-config = { path = "../kb-config" } serde = { workspace = true } serde_json = { workspace = true } -serde_json_canonicalizer = "0.3" -blake3 = { workspace = true } unicode-normalization = "0.1" time = { workspace = true } anyhow = { workspace = true } diff --git a/crates/kb-normalize/src/lib.rs b/crates/kb-normalize/src/lib.rs index 3bf1e11..9e08c76 100644 --- a/crates/kb-normalize/src/lib.rs +++ b/crates/kb-normalize/src/lib.rs @@ -21,12 +21,13 @@ use std::collections::HashMap; use anyhow::Result; use kb_core::{ - AudioRefBlock, Block, BlockId, CanonicalDocument, CodeBlock, CommonBlock, DocumentId, - HeadingBlock, ImageRefBlock, Inline, Lang, ListBlock, Metadata, ParserVersion, Provenance, - ProvenanceEvent, ProvenanceKind, RawAsset, TableBlock, TextBlock, + Block, BlockId, CanonicalDocument, CodeBlock, CommonBlock, DocumentId, HeadingBlock, + ImageRefBlock, Inline, Lang, ListBlock, Metadata, ParserVersion, Provenance, ProvenanceEvent, + ProvenanceKind, RawAsset, TableBlock, TextBlock, }; use kb_parse_types::{ParsedBlock, ParsedPayload, Warning, WarningKind}; use time::OffsetDateTime; +use unicode_normalization::UnicodeNormalization; pub use kb_core::{id_for_block, id_for_doc}; @@ -44,7 +45,11 @@ pub use kb_core::{id_for_block, id_for_doc}; /// * `title` and `lang` are lifted from `metadata.user["title"]` / /// `metadata.user["lang"]` (where P1-2 stashes them) into the dedicated /// `CanonicalDocument` fields, and removed from the user map to avoid -/// duplication. Missing keys default to empty string / empty `Lang`. +/// duplication. Both keys are lifted only if present and stringy; +/// non-stringy values (e.g. `Number`, `Array`) and missing keys +/// silently default to empty title / empty `Lang`. P1-2's frontmatter +/// parser only writes these keys when the source value parses as a +/// string, so the non-stringy branches are defense-in-depth. /// * `provenance` is seeded with `Discovered` (from `asset.discovered_at`), /// `Parsed`, `Normalized` events, and one `Warning` event per upstream /// warning. The two normalize-side events share one `now_utc()` reading @@ -78,15 +83,30 @@ pub fn build_canonical_document( // §4.3 ordinal rule — per (heading_path, block_kind), 0-based, // document order. A separate counter is kept for each grouping key. let mut counters: HashMap<(Vec, &'static str), u32> = HashMap::new(); + // Some lift paths (e.g. AudioRef pre-P8) drop the block entirely and + // synthesize a Warning so the wire form never carries an invalid + // `AssetId`. These warnings originate at the lift stage and are + // attributed to `kb-normalize` (not to whatever upstream emitter the + // bare `WarningKind` would resolve to via `warning_agent`). They are + // tracked separately so the agent string is correct in Provenance. + let mut lift_warnings: Vec = Vec::new(); let lifted_blocks: Vec = blocks .into_iter() - .map(|pb| lift_block(&doc_id, pb, &mut counters)) + .filter_map(|pb| lift_block(&doc_id, pb, &mut counters, &mut lift_warnings)) .collect(); + tracing::debug!( + target: "kb-normalize", + "built canonical document doc_id={} blocks={}", + doc_id.0, + lifted_blocks.len() + ); + // Provenance — share `now` between the parse + normalize stages so // the per-call timestamp jitter is bounded. let now = OffsetDateTime::now_utc(); - let mut events: Vec = Vec::with_capacity(3 + warnings.len()); + let mut events: Vec = + Vec::with_capacity(3 + warnings.len() + lift_warnings.len()); events.push(ProvenanceEvent { at: asset.discovered_at, agent: "kb-source-fs".to_string(), @@ -105,6 +125,8 @@ pub fn build_canonical_document( kind: ProvenanceKind::Normalized, note: None, }); + // {:?} on WarningKind renders camel-case variant name; intentional + // for human-readable Provenance trace. for w in warnings { events.push(ProvenanceEvent { at: now, @@ -113,6 +135,16 @@ pub fn build_canonical_document( note: Some(format!("{:?}: {}", w.kind, w.note)), }); } + // Lift-stage warnings (currently only AudioRef-deferred drops) are + // unconditionally attributed to `kb-normalize`. + for w in lift_warnings { + events.push(ProvenanceEvent { + at: now, + agent: "kb-normalize".to_string(), + kind: ProvenanceKind::Warning, + note: Some(format!("{:?}: {}", w.kind, w.note)), + }); + } let provenance = Provenance { events }; Ok(CanonicalDocument { @@ -132,11 +164,19 @@ pub fn build_canonical_document( /// Resolve a `WarningKind` to the upstream agent that emitted it. Used /// to fill `ProvenanceEvent::agent` for the warning's event entry. +/// +/// `ExtractFailed` is emitted today by `kb-parse-md`'s panic-recovery +/// guard around `parse_blocks` — see `crates/kb-parse-md/src/blocks.rs`. +/// If a future stage (e.g. `kb-normalize` itself, an extractor, …) starts +/// emitting `ExtractFailed`, this mapping needs to grow context (perhaps +/// a separate `WarningSource` field on `Warning`) so attribution stays +/// honest. For now, all `ExtractFailed` warnings observed by +/// `build_canonical_document` originated in the parser. fn warning_agent(kind: &WarningKind) -> &'static str { match kind { WarningKind::MalformedFrontmatter | WarningKind::EncodingFallback => "kb-parse-md", WarningKind::MalformedTable => "kb-parse-md", - WarningKind::ExtractFailed => "kb-normalize", + WarningKind::ExtractFailed => "kb-parse-md", } } @@ -171,16 +211,28 @@ fn lift_block( doc_id: &DocumentId, pb: ParsedBlock, counters: &mut HashMap<(Vec, &'static str), u32>, -) -> Block { + warnings: &mut Vec, +) -> Option { let kind = payload_kind(&pb.payload); - let ordinal = next_ordinal(counters, &pb.heading_path, kind); - let block_id: BlockId = id_for_block(doc_id, kind, &pb.heading_path, ordinal, &pb.source_span); + // Task spec line 73: "All input strings normalized to NFC before + // hashing." `pulldown-cmark` does not NFC heading text, and + // `serde_json_canonicalizer` v0.3 does not normalize strings either, + // so we must NFC-normalize `heading_path` here before it feeds both + // the §4.2 ID recipe AND the on-disk `CommonBlock.heading_path` (so + // wire form matches ID input). Without this, NFD `\u{1100}\u{1161}` + // and NFC `\u{AC00}` (both render as 가) would produce different + // `block_id`s for what is logically the same heading. + let heading_path_nfc: Vec = + pb.heading_path.iter().map(|s| s.nfc().collect()).collect(); + let ordinal = next_ordinal(counters, &heading_path_nfc, kind); + let block_id: BlockId = + id_for_block(doc_id, kind, &heading_path_nfc, ordinal, &pb.source_span); let common = CommonBlock { block_id, - heading_path: pb.heading_path, + heading_path: heading_path_nfc, source_span: pb.source_span, }; - match pb.payload { + let block = match pb.payload { ParsedPayload::Heading { level, text } => Block::Heading(HeadingBlock { common, level, @@ -197,11 +249,14 @@ fn lift_block( items: items .into_iter() .map(|item_inlines| TextBlock { - // List items inherit the parent list's CommonBlock; spec - // (§3.4) defines `ListBlock.items: Vec` and + // All list items currently inherit the parent's + // CommonBlock (incl. block_id). Per-item IDs would + // require a §4.2 recipe extension. Spec (§3.4) + // defines `ListBlock.items: Vec` and // does not allocate per-item BlockIds. Re-using the // parent's common keeps the wire form deterministic - // while letting the inline tree carry the item content. + // while letting the inline tree carry the item + // content. common: common.clone(), text: flatten_inlines(&item_inlines), inlines: item_inlines, @@ -227,17 +282,24 @@ fn lift_block( ocr: None, caption: None, }), - // P1-4 does not extract audio metadata from disk — `asset_id` - // and `duration_ms` placeholders are filled in by the audio - // extractor (P8). For now we synthesize a minimal record so - // the document is well-typed. - ParsedPayload::AudioRef { src: _ } => Block::AudioRef(AudioRefBlock { - common, - asset_id: kb_core::AssetId(String::new()), - duration_ms: 0, - transcript: None, - }), - } + // TODO(P8): audio extractor will resolve workspace assets and + // produce real AssetIds. This skip-and-warn shim is a + // placeholder. `AssetId::from_str` requires a 32-hex string, so + // synthesizing `AssetId(String::new())` would break the + // invariant — instead we drop the block and surface a Warning + // (attributed to `kb-normalize` per §3.6 since this is the + // lift-stage decision). + ParsedPayload::AudioRef { src } => { + warnings.push(Warning { + kind: WarningKind::ExtractFailed, + note: format!( + "audio-ref AssetId resolution deferred to P8 — block dropped (src={src})" + ), + }); + return None; + } + }; + Some(block) } /// Flatten a `Vec` into a plain text string. Used by list-item @@ -316,6 +378,61 @@ mod tests { ParserVersion("kb-normalize-test-0".into()) } + /// Fixed 5-block input used by both the ordinal-scoping pinning test + /// and the determinism stress test (so the latter exercises the + /// `lift_block` path, not just the empty-blocks path). + fn fixture_blocks_five() -> Vec { + let h1_a = vec!["A".to_string()]; + let h1_b = vec!["B".to_string()]; + vec![ + ParsedBlock { + kind: kb_parse_types::ParsedBlockKind::Paragraph, + heading_path: h1_a.clone(), + source_span: SourceSpan::Line { start: 1, end: 1 }, + payload: ParsedPayload::Paragraph { + text: "p1".into(), + inlines: vec![], + }, + }, + ParsedBlock { + kind: kb_parse_types::ParsedBlockKind::Paragraph, + heading_path: h1_a.clone(), + source_span: SourceSpan::Line { start: 2, end: 2 }, + payload: ParsedPayload::Paragraph { + text: "p2".into(), + inlines: vec![], + }, + }, + ParsedBlock { + kind: kb_parse_types::ParsedBlockKind::Paragraph, + heading_path: h1_a.clone(), + source_span: SourceSpan::Line { start: 3, end: 3 }, + payload: ParsedPayload::Paragraph { + text: "p3".into(), + inlines: vec![], + }, + }, + ParsedBlock { + kind: kb_parse_types::ParsedBlockKind::Code, + heading_path: h1_a, + source_span: SourceSpan::Line { start: 4, end: 5 }, + payload: ParsedPayload::Code { + lang: None, + code: "x".into(), + }, + }, + ParsedBlock { + kind: kb_parse_types::ParsedBlockKind::Paragraph, + heading_path: h1_b, + source_span: SourceSpan::Line { start: 6, end: 6 }, + payload: ParsedPayload::Paragraph { + text: "q1".into(), + inlines: vec![], + }, + }, + ] + } + /// `id_for_doc` is deterministic across 1000 invocations on the same /// input — a regression in canonical JSON or BLAKE3 would surface /// here immediately. @@ -358,56 +475,9 @@ mod tests { /// H1 also starts a fresh counter at 0. #[test] fn block_ordinals_scoped_per_heading_and_kind() { - let span = SourceSpan::Line { start: 1, end: 1 }; let h1_a = vec!["A".to_string()]; let h1_b = vec!["B".to_string()]; - let blocks = vec![ - ParsedBlock { - kind: kb_parse_types::ParsedBlockKind::Paragraph, - heading_path: h1_a.clone(), - source_span: span.clone(), - payload: ParsedPayload::Paragraph { - text: "p1".into(), - inlines: vec![], - }, - }, - ParsedBlock { - kind: kb_parse_types::ParsedBlockKind::Paragraph, - heading_path: h1_a.clone(), - source_span: SourceSpan::Line { start: 2, end: 2 }, - payload: ParsedPayload::Paragraph { - text: "p2".into(), - inlines: vec![], - }, - }, - ParsedBlock { - kind: kb_parse_types::ParsedBlockKind::Paragraph, - heading_path: h1_a.clone(), - source_span: SourceSpan::Line { start: 3, end: 3 }, - payload: ParsedPayload::Paragraph { - text: "p3".into(), - inlines: vec![], - }, - }, - ParsedBlock { - kind: kb_parse_types::ParsedBlockKind::Code, - heading_path: h1_a.clone(), - source_span: SourceSpan::Line { start: 4, end: 5 }, - payload: ParsedPayload::Code { - lang: None, - code: "x".into(), - }, - }, - ParsedBlock { - kind: kb_parse_types::ParsedBlockKind::Paragraph, - heading_path: h1_b.clone(), - source_span: SourceSpan::Line { start: 6, end: 6 }, - payload: ParsedPayload::Paragraph { - text: "q1".into(), - inlines: vec![], - }, - }, - ]; + let blocks = fixture_blocks_five(); let asset = fixture_asset(); let metadata = fixture_metadata(); let pv = parser_version(); @@ -417,7 +487,13 @@ mod tests { // Compute the expected IDs out-of-band so the test pins both // the (heading_path, kind) ordinal grouping AND the value of // each block_id under the recipe. - let p1 = id_for_block(&doc.doc_id, "paragraph", &h1_a, 0, &span); + let p1 = id_for_block( + &doc.doc_id, + "paragraph", + &h1_a, + 0, + &SourceSpan::Line { start: 1, end: 1 }, + ); let p2 = id_for_block( &doc.doc_id, "paragraph", @@ -482,10 +558,14 @@ mod tests { ProvenanceKind::Normalized, ] ); - assert_eq!(doc.provenance.events[0].at, asset.discovered_at); - assert_eq!(doc.provenance.events[0].agent, "kb-source-fs"); - assert_eq!(doc.provenance.events[1].agent, "kb-parse-md"); - assert_eq!(doc.provenance.events[2].agent, "kb-normalize"); + let events = &doc.provenance.events; + assert_eq!(events[0].at, asset.discovered_at); + assert_eq!(events[0].agent, "kb-source-fs"); + assert_eq!(events[1].agent, "kb-parse-md"); + assert_eq!(events[2].agent, "kb-normalize"); + // Pin the implementation invariant that Parsed and Normalized + // share the single `now_utc()` reading inside one call. + assert_eq!(events[1].at, events[2].at, "Parsed and Normalized share now_utc"); } /// Warnings carried into `build_canonical_document` are emitted as @@ -560,10 +640,14 @@ mod tests { v } + // Use the same 5-block fixture as the ordinal-scoping test so + // determinism is exercised on a non-empty `lift_block` path + // (block_id hashing, NFC normalization, ordinal counters), not + // just an empty Vec. let baseline = build_canonical_document( &asset, metadata.clone(), - vec![], + fixture_blocks_five(), &pv, vec![], ) @@ -575,7 +659,7 @@ mod tests { let next = build_canonical_document( &asset, metadata.clone(), - vec![], + fixture_blocks_five(), &pv, vec![], ) @@ -589,4 +673,171 @@ mod tests { start.elapsed() ); } + + /// I1 regression — `WarningKind::ExtractFailed` is emitted by + /// `kb-parse-md` (panic-recovery in `blocks.rs`), so the resulting + /// `ProvenanceEvent::agent` must read `"kb-parse-md"`. A regression + /// to `"kb-normalize"` would mis-attribute parse panics and break + /// stage-filtered debugging. + #[test] + fn provenance_with_extract_failed_warning_attributes_to_kb_parse_md() { + let asset = fixture_asset(); + let metadata = fixture_metadata(); + let pv = parser_version(); + let warnings = vec![Warning { + kind: WarningKind::ExtractFailed, + note: "pulldown-cmark panicked; body discarded".into(), + }]; + let doc = + build_canonical_document(&asset, metadata, vec![], &pv, warnings).unwrap(); + let warning_event = doc + .provenance + .events + .iter() + .find(|e| e.kind == ProvenanceKind::Warning) + .expect("warning event present"); + assert_eq!(warning_event.agent, "kb-parse-md"); + assert!( + warning_event + .note + .as_deref() + .unwrap() + .contains("ExtractFailed") + ); + } + + /// I2 regression — `ParsedPayload::AudioRef` is dropped (not lifted + /// into a `Block::AudioRef` with a synthesized empty `AssetId`, + /// which would violate `AssetId::from_str`'s 32-hex invariant). A + /// `Warning` is surfaced in Provenance, attributed to + /// `"kb-normalize"` because the decision is made at the lift stage. + #[test] + fn audio_ref_block_skipped_with_warning() { + let span = SourceSpan::Line { start: 1, end: 1 }; + let blocks = vec![ParsedBlock { + kind: kb_parse_types::ParsedBlockKind::AudioRef, + heading_path: vec![], + source_span: span, + payload: ParsedPayload::AudioRef { + src: "voice.m4a".into(), + }, + }]; + let asset = fixture_asset(); + let metadata = fixture_metadata(); + let pv = parser_version(); + let doc = + build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap(); + + // No AudioRef block in the canonical output. + assert!( + !doc.blocks + .iter() + .any(|b| matches!(b, Block::AudioRef(_))), + "AudioRef block should be skipped pre-P8" + ); + + // Exactly one Warning event mentioning the AudioRef src. + let warning_events: Vec<_> = doc + .provenance + .events + .iter() + .filter(|e| e.kind == ProvenanceKind::Warning) + .collect(); + assert_eq!(warning_events.len(), 1); + let w = warning_events[0]; + assert_eq!(w.agent, "kb-normalize"); + assert!(w.note.as_deref().unwrap().contains("voice.m4a")); + } + + /// I3 regression — heading-path strings are NFC-normalized before + /// feeding into `id_for_block`, so canonically-equivalent NFD and + /// NFC inputs produce the same `block_id`. Mirrors + /// `nfc_nfd_korean_path_same_id` for `doc_id`. + #[test] + fn nfc_nfd_korean_heading_path_same_block_id() { + let span = SourceSpan::Line { start: 1, end: 1 }; + let nfd_heading = "\u{1100}\u{1161}".to_string(); // 가 (NFD) + let nfc_heading = "\u{AC00}".to_string(); // 가 (NFC) + let mk_block = |heading: String| ParsedBlock { + kind: kb_parse_types::ParsedBlockKind::Paragraph, + heading_path: vec![heading], + source_span: span.clone(), + payload: ParsedPayload::Paragraph { + text: "p".into(), + inlines: vec![], + }, + }; + let asset = fixture_asset(); + let pv = parser_version(); + let doc_nfd = build_canonical_document( + &asset, + fixture_metadata(), + vec![mk_block(nfd_heading)], + &pv, + vec![], + ) + .unwrap(); + let doc_nfc = build_canonical_document( + &asset, + fixture_metadata(), + vec![mk_block(nfc_heading)], + &pv, + vec![], + ) + .unwrap(); + let id_nfd = match &doc_nfd.blocks[0] { + Block::Paragraph(t) => &t.common.block_id, + _ => panic!("expected Paragraph"), + }; + let id_nfc = match &doc_nfc.blocks[0] { + Block::Paragraph(t) => &t.common.block_id, + _ => panic!("expected Paragraph"), + }; + assert_eq!(id_nfd, id_nfc, "NFD and NFC heading paths must hash equal"); + } + + /// M7 — `metadata.user["title"] = ""` is stringy and lifts to an + /// empty `CanonicalDocument.title`. This pins the policy: an + /// explicit empty string is *not* dropped, it's lifted as-is. + #[test] + fn title_empty_string_in_user_map_falls_back_to_default() { + let asset = fixture_asset(); + let mut metadata = fixture_metadata(); + metadata + .user + .insert("title".into(), Value::String(String::new())); + let pv = parser_version(); + let doc = + build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); + assert_eq!(doc.title, ""); + } + + /// M7 — `metadata.user["title"] = 42` is non-stringy and silently + /// drops; the fallback default (empty title) is used. + #[test] + fn title_non_string_in_user_map_silently_drops() { + let asset = fixture_asset(); + let mut metadata = fixture_metadata(); + metadata + .user + .insert("title".into(), Value::Number(42.into())); + let pv = parser_version(); + let doc = + build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); + assert_eq!(doc.title, ""); + } + + /// M7 — non-stringy `lang` (e.g. an array) silently drops. This is + /// defensive: P1-2 frontmatter validates the shape upstream, but we + /// don't trust it. + #[test] + fn lang_invalid_shape_silently_drops() { + let asset = fixture_asset(); + let mut metadata = fixture_metadata(); + metadata.user.insert("lang".into(), Value::Array(vec![])); + let pv = parser_version(); + let doc = + build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); + assert_eq!(doc.lang, Lang(String::new())); + } } diff --git a/crates/kb-normalize/tests/normalize_snapshot.rs b/crates/kb-normalize/tests/normalize_snapshot.rs index 890ddd8..ec10ddf 100644 --- a/crates/kb-normalize/tests/normalize_snapshot.rs +++ b/crates/kb-normalize/tests/normalize_snapshot.rs @@ -76,9 +76,15 @@ fn code_and_table_canonical_snapshot() { // Frontmatter parse — code-and-table.md has none, so we provide // BodyHints with deterministic timestamps so the lifted Metadata // is reproducible. The body offset is 1 (no frontmatter prefix). + // + // We pin `first_h1` so the BodyHints → user.title → CanonicalDocument.title + // lift chain is exercised end-to-end (see `assert_eq!` on + // `doc.title` below). Without this, `code-and-table.md`'s lack of + // frontmatter title would leave `title == ""` and the chain would + // be uncovered by the snapshot. let asset = fixed_asset("notes/code-and-table.md"); let hints = BodyHints { - first_h1: None, + first_h1: Some("Code And Table".into()), fs_ctime: asset.discovered_at, fs_mtime: asset.discovered_at, fallback_lang: Some("en".into()), @@ -115,6 +121,11 @@ fn code_and_table_canonical_snapshot() { ) .expect("build_canonical_document"); + // Assert the BodyHints → first_h1 → user.title → CanonicalDocument.title + // lift chain end-to-end. Pinned in the snapshot too, but the explicit + // assertion makes a future drift fail with a clearer message. + assert_eq!(doc.title, "Code And Table"); + let actual = strip_dynamic(serde_json::to_value(&doc).unwrap()); let baseline_path = dir.join("code-and-table.canonical.snapshot.json"); diff --git a/fixtures/markdown/code-and-table.canonical.snapshot.json b/fixtures/markdown/code-and-table.canonical.snapshot.json index 9fb909c..25628fd 100644 --- a/fixtures/markdown/code-and-table.canonical.snapshot.json +++ b/fixtures/markdown/code-and-table.canonical.snapshot.json @@ -97,6 +97,6 @@ }, "schema_version": 1, "source_asset_id": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", - "title": "", + "title": "Code And Table", "workspace_path": "notes/code-and-table.md" }