//! `kb-normalize` — lift parser output (`kb-parse-types`) into a //! [`kebab_core::CanonicalDocument`] with deterministic IDs. //! //! Per design §3.4 (CanonicalDocument / Block), §4.2 (ID recipe), §4.3 //! (ordinal rule), §3.6 (Provenance), §8 (module boundaries). //! //! Public surface: //! //! * [`build_canonical_document`] — assemble a `CanonicalDocument` from //! `(RawAsset, Metadata, Vec, ParserVersion, Vec)`. //! * [`id_for_doc`], [`id_for_block`] — re-exports of the canonical //! ID-recipe functions in `kb-core::ids` (§4.2). `kb-core` is the only //! implementation; `kb-normalize` is the canonical *entry point* per //! design §8. //! //! This crate must NOT depend on any parser implementation crate //! (`kb-parse-md`, `kb-parse-pdf`, …). All parser output flows in via //! the shared `kb-parse-types` crate. use std::collections::HashMap; use std::path::Path; use anyhow::Result; use kebab_core::{ Block, BlockId, CanonicalDocument, CodeBlock, CommonBlock, DocumentId, HeadingBlock, ImageRefBlock, Inline, Lang, ListBlock, Metadata, ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, RawAsset, TableBlock, TextBlock, }; use kebab_parse_types::{ParsedBlock, ParsedPayload, Warning, WarningKind}; use time::OffsetDateTime; use unicode_normalization::UnicodeNormalization; pub use kebab_core::{id_for_block, id_for_doc}; /// Build a [`CanonicalDocument`] from the raw asset, frontmatter /// metadata, parser blocks, parser version, and any warnings. /// /// Behavior contract (per design §3.4 / §4.2 / §4.3 / §3.6): /// /// * `doc_id = id_for_doc(workspace_path, asset_id, parser_version)` — /// `workspace_path` is consumed verbatim from `asset` (already NFC + /// POSIX per `kebab_core::normalize::to_posix`). /// * `block_id = id_for_block(doc_id, kind, heading_path, ordinal, /// source_span)` — `ordinal` is **0-based, scoped to (heading_path, /// block_kind), in document order** per §4.3. /// * `title` and `lang` are lifted from `metadata.user["title"]` / /// `metadata.user["lang"]` (where P1-2 stashes them) into the dedicated /// `CanonicalDocument` fields, and removed from the user map to avoid /// duplication. Both keys are lifted only if present and stringy; /// non-stringy values (e.g. `Number`, `Array`) and missing keys /// silently default to empty title / empty `Lang`. P1-2's frontmatter /// parser only writes these keys when the source value parses as a /// string, so the non-stringy branches are defense-in-depth. /// * `provenance` is seeded with `Discovered` (from `asset.discovered_at`), /// `Parsed`, `Normalized` events, and one `Warning` event per upstream /// warning. The two normalize-side events share one `now_utc()` reading /// so the timestamp jitter inside a single call is bounded — event /// ordering is preserved by `Vec` position. /// * `schema_version` and `doc_version` are pinned to `1` (initial). pub fn build_canonical_document( asset: &RawAsset, metadata: Metadata, blocks: Vec, parser_version: &ParserVersion, warnings: Vec, ) -> Result { let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, parser_version); // Lift title / lang from `metadata.user` (P1-2 stashed them there // because `Metadata` does not carry them directly). Strip after // lifting so the wire form does not duplicate the data. let mut metadata = metadata; let title = metadata .user .remove("title") .and_then(|v| v.as_str().map(String::from)) .unwrap_or_default(); let lang = metadata .user .remove("lang") .and_then(|v| v.as_str().map(|s| Lang(s.to_string()))) .unwrap_or_else(|| Lang(String::new())); // §4.3 ordinal rule — per (heading_path, block_kind), 0-based, // document order. A separate counter is kept for each grouping key. let mut counters: HashMap<(Vec, &'static str), u32> = HashMap::new(); // Some lift paths (e.g. AudioRef pre-P8) drop the block entirely and // synthesize a Warning so the wire form never carries an invalid // `AssetId`. These warnings originate at the lift stage and are // attributed to `kb-normalize` (not to whatever upstream emitter the // bare `WarningKind` would resolve to via `warning_agent`). They are // tracked separately so the agent string is correct in Provenance. let mut lift_warnings: Vec = Vec::new(); let lifted_blocks: Vec = blocks .into_iter() .filter_map(|pb| lift_block(&doc_id, pb, &mut counters, &mut lift_warnings)) .collect(); // p9-fb-07: title fallback chain. `title` so far holds the // frontmatter `title` (step 1). If empty / whitespace, walk the // lifted blocks for an H1 → H2 → first paragraph excerpt → file // stem. NFC-normalize the chosen string so the on-wire title is // canonically equivalent to whatever the user stored, regardless // of source NFD/NFC form. let file_stem = workspace_path_stem(&asset.workspace_path.0); let title = derive_title(&title, &lifted_blocks, &file_stem); tracing::debug!( target: "kebab-normalize", "built canonical document doc_id={} blocks={}", doc_id.0, lifted_blocks.len() ); // Provenance — share `now` between the parse + normalize stages so // the per-call timestamp jitter is bounded. let now = OffsetDateTime::now_utc(); let mut events: Vec = Vec::with_capacity(3 + warnings.len() + lift_warnings.len()); events.push(ProvenanceEvent { at: asset.discovered_at, agent: "kb-source-fs".to_string(), kind: ProvenanceKind::Discovered, note: None, }); events.push(ProvenanceEvent { at: now, agent: "kb-parse-md".to_string(), kind: ProvenanceKind::Parsed, note: Some(format!("parser_version={}", parser_version.0)), }); events.push(ProvenanceEvent { at: now, agent: "kb-normalize".to_string(), kind: ProvenanceKind::Normalized, note: None, }); // {:?} on WarningKind renders camel-case variant name; intentional // for human-readable Provenance trace. for w in warnings { events.push(ProvenanceEvent { at: now, agent: warning_agent(&w.kind).to_string(), kind: ProvenanceKind::Warning, note: Some(format!("{:?}: {}", w.kind, w.note)), }); } // Lift-stage warnings (currently only AudioRef-deferred drops) are // unconditionally attributed to `kb-normalize`. for w in lift_warnings { events.push(ProvenanceEvent { at: now, agent: "kb-normalize".to_string(), kind: ProvenanceKind::Warning, note: Some(format!("{:?}: {}", w.kind, w.note)), }); } let provenance = Provenance { events }; Ok(CanonicalDocument { doc_id, source_asset_id: asset.asset_id.clone(), workspace_path: asset.workspace_path.clone(), title, lang, blocks: lifted_blocks, metadata, provenance, parser_version: parser_version.clone(), schema_version: 1, doc_version: 1, last_chunker_version: None, last_embedding_version: None, }) } /// Resolve a `WarningKind` to the upstream agent that emitted it. Used /// to fill `ProvenanceEvent::agent` for the warning's event entry. /// /// `ExtractFailed` is emitted today by `kb-parse-md`'s panic-recovery /// guard around `parse_blocks` — see `crates/kb-parse-md/src/blocks.rs`. /// If a future stage (e.g. `kb-normalize` itself, an extractor, …) starts /// emitting `ExtractFailed`, this mapping needs to grow context (perhaps /// a separate `WarningSource` field on `Warning`) so attribution stays /// honest. For now, all `ExtractFailed` warnings observed by /// `build_canonical_document` originated in the parser. fn warning_agent(kind: &WarningKind) -> &'static str { match kind { WarningKind::MalformedFrontmatter | WarningKind::EncodingFallback => "kb-parse-md", WarningKind::MalformedTable => "kb-parse-md", WarningKind::ExtractFailed => "kb-parse-md", } } /// Map a `ParsedPayload` variant to the lowercase, no-spaces string used /// as `block_kind` in the §4.2 ID tuple. fn payload_kind(payload: &ParsedPayload) -> &'static str { match payload { ParsedPayload::Heading { .. } => "heading", ParsedPayload::Paragraph { .. } => "paragraph", ParsedPayload::List { .. } => "list", ParsedPayload::Code { .. } => "code", ParsedPayload::Table { .. } => "table", ParsedPayload::Quote { .. } => "quote", ParsedPayload::ImageRef { .. } => "imageref", ParsedPayload::AudioRef { .. } => "audioref", } } fn next_ordinal( counters: &mut HashMap<(Vec, &'static str), u32>, heading_path: &[String], kind: &'static str, ) -> u32 { let key = (heading_path.to_vec(), kind); let entry = counters.entry(key).or_insert(0); let ordinal = *entry; *entry += 1; ordinal } fn lift_block( doc_id: &DocumentId, pb: ParsedBlock, counters: &mut HashMap<(Vec, &'static str), u32>, warnings: &mut Vec, ) -> Option { let kind = payload_kind(&pb.payload); // Task spec line 73: "All input strings normalized to NFC before // hashing." `pulldown-cmark` does not NFC heading text, and // `serde_json_canonicalizer` v0.3 does not normalize strings either, // so we must NFC-normalize `heading_path` here before it feeds both // the §4.2 ID recipe AND the on-disk `CommonBlock.heading_path` (so // wire form matches ID input). Without this, NFD `\u{1100}\u{1161}` // and NFC `\u{AC00}` (both render as 가) would produce different // `block_id`s for what is logically the same heading. let heading_path_nfc: Vec = pb.heading_path.iter().map(|s| s.nfc().collect()).collect(); let ordinal = next_ordinal(counters, &heading_path_nfc, kind); let block_id: BlockId = id_for_block(doc_id, kind, &heading_path_nfc, ordinal, &pb.source_span); let common = CommonBlock { block_id, heading_path: heading_path_nfc, source_span: pb.source_span, }; let block = match pb.payload { ParsedPayload::Heading { level, text } => Block::Heading(HeadingBlock { common, level, text, }), ParsedPayload::Paragraph { text, inlines } => Block::Paragraph(TextBlock { common, text, inlines, }), ParsedPayload::List { ordered, items } => Block::List(ListBlock { common: common.clone(), ordered, items: items .into_iter() .map(|item_inlines| TextBlock { // All list items currently inherit the parent's // CommonBlock (incl. block_id). Per-item IDs would // require a §4.2 recipe extension. Spec (§3.4) // defines `ListBlock.items: Vec` and // does not allocate per-item BlockIds. Re-using the // parent's common keeps the wire form deterministic // while letting the inline tree carry the item // content. common: common.clone(), text: flatten_inlines(&item_inlines), inlines: item_inlines, }) .collect(), }), ParsedPayload::Code { lang, code } => Block::Code(CodeBlock { common, lang, code }), ParsedPayload::Table { headers, rows } => Block::Table(TableBlock { common, headers, rows, }), ParsedPayload::Quote { text, inlines } => Block::Quote(TextBlock { common, text, inlines, }), ParsedPayload::ImageRef { src, alt } => Block::ImageRef(ImageRefBlock { common, asset_id: None, src, alt, ocr: None, caption: None, }), // TODO(P8): audio extractor will resolve workspace assets and // produce real AssetIds. This skip-and-warn shim is a // placeholder. `AssetId::from_str` requires a 32-hex string, so // synthesizing `AssetId(String::new())` would break the // invariant — instead we drop the block and surface a Warning // (attributed to `kb-normalize` per §3.6 since this is the // lift-stage decision). ParsedPayload::AudioRef { src } => { warnings.push(Warning { kind: WarningKind::ExtractFailed, note: format!( "audio-ref AssetId resolution deferred to P8 — block dropped (src={src})" ), }); return None; } }; Some(block) } /// Flatten a `Vec` into a plain text string. Used by list-item /// `TextBlock.text` since `ParsedPayload::List` only carries inline trees /// per item. fn flatten_inlines(inlines: &[Inline]) -> String { let mut out = String::new(); for i in inlines { flatten_inline(i, &mut out); } out } fn flatten_inline(i: &Inline, out: &mut String) { match i { Inline::Text { text } => out.push_str(text), Inline::Code { code } => out.push_str(code), Inline::Link { text, .. } => out.push_str(text), Inline::Strong { children } | Inline::Emph { children } => { for c in children { flatten_inline(c, out); } } } } /// p9-fb-07: derive a usable title from the frontmatter, lifted blocks, /// and the source filename, using a documented fallback chain. /// /// Priority (first non-blank wins): /// /// 1. `frontmatter_title` — verbatim, after trimming whitespace. /// 2. First `Heading` block at level 1 with non-blank text. /// 3. First `Heading` block at level 2 with non-blank text. /// 4. First `Paragraph` block (NOT `Quote`, `List`, `Code`, `Table`, /// `ImageRef`, `AudioRef`) with non-blank text — first 80 chars. /// 5. `file_stem` (filename minus extension — returned verbatim, no /// case transformation; whatever the on-disk filename is becomes /// the title text). /// /// The chosen string is NFC-normalized so the on-wire title is /// canonically equivalent to the source content. Never returns an /// empty string — if every step is blank (e.g. an empty file), the /// `file_stem` fallback ensures a non-empty result. If `file_stem` is /// also blank (pathological), returns `"untitled"` as a last resort. pub fn derive_title(frontmatter_title: &str, blocks: &[Block], file_stem: &str) -> String { let trimmed = frontmatter_title.trim(); if !trimmed.is_empty() { return trimmed.nfc().collect(); } if let Some(text) = first_heading_text(blocks, 1) { return text; } if let Some(text) = first_heading_text(blocks, 2) { return text; } if let Some(excerpt) = first_paragraph_excerpt(blocks, 80) { return excerpt; } // `file_stem` originates from `WorkspacePath`, which `to_posix` // already NFC-normalizes (§6.6). No second NFC pass needed — pass // through verbatim after a defensive `trim`. let stem = file_stem.trim(); if !stem.is_empty() { return stem.to_string(); } "untitled".to_string() } fn first_heading_text(blocks: &[Block], level: u8) -> Option { blocks.iter().find_map(|b| match b { Block::Heading(h) if h.level == level => { let trimmed = h.text.trim(); if trimmed.is_empty() { None } else { Some(trimmed.nfc().collect()) } } _ => None, }) } fn first_paragraph_excerpt(blocks: &[Block], max_chars: usize) -> Option { blocks.iter().find_map(|b| match b { Block::Paragraph(t) => { let trimmed = t.text.trim(); if trimmed.is_empty() { None } else { let nfc: String = trimmed.nfc().collect(); Some(nfc.chars().take(max_chars).collect()) } } _ => None, }) } /// Extract the filename stem (no extension) from a workspace path /// string. Returns the empty string if no filename can be derived /// (e.g. trailing slash). Multi-extension cases (`foo.tar.gz`) follow /// `Path::file_stem` semantics — only the last extension is stripped. fn workspace_path_stem(workspace_path: &str) -> String { Path::new(workspace_path) .file_stem() .and_then(|s| s.to_str()) .unwrap_or("") .to_string() } #[cfg(test)] mod tests { use super::*; use kebab_core::{ AssetId, AssetStorage, Checksum, MediaType, SourceSpan, SourceType, SourceUri, TrustLevel, WorkspacePath, normalize::to_posix, }; use serde_json::Value; use std::path::{Path, PathBuf}; use time::OffsetDateTime; fn fixture_asset() -> RawAsset { let workspace_path = WorkspacePath::new("notes/example.md".into()).unwrap(); RawAsset { asset_id: AssetId("a".repeat(32)), source_uri: SourceUri::File(PathBuf::from("/tmp/example.md")), workspace_path, media_type: MediaType::Markdown, byte_len: 0, checksum: Checksum("0".repeat(64)), // Pin a fixed timestamp so determinism tests can compare // outputs across runs without timestamp jitter outside the // fields we explicitly strip. discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), stored: AssetStorage::Reference { path: PathBuf::from("/tmp/example.md"), sha: Checksum("0".repeat(64)), }, } } fn fixture_metadata() -> Metadata { let mut user = serde_json::Map::new(); user.insert("title".into(), Value::String("Example".into())); user.insert("lang".into(), Value::String("en".into())); user.insert("custom".into(), Value::Bool(true)); Metadata { aliases: vec![], tags: vec![], created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), source_type: SourceType::Markdown, trust_level: TrustLevel::Primary, user_id_alias: None, user, repo: None, git_branch: None, git_commit: None, code_lang: None, } } fn parser_version() -> ParserVersion { ParserVersion("kb-normalize-test-0".into()) } /// Fixed 5-block input used by both the ordinal-scoping pinning test /// and the determinism stress test (so the latter exercises the /// `lift_block` path, not just the empty-blocks path). fn fixture_blocks_five() -> Vec { let h1_a = vec!["A".to_string()]; let h1_b = vec!["B".to_string()]; vec![ ParsedBlock { kind: kebab_parse_types::ParsedBlockKind::Paragraph, heading_path: h1_a.clone(), source_span: SourceSpan::Line { start: 1, end: 1 }, payload: ParsedPayload::Paragraph { text: "p1".into(), inlines: vec![], }, }, ParsedBlock { kind: kebab_parse_types::ParsedBlockKind::Paragraph, heading_path: h1_a.clone(), source_span: SourceSpan::Line { start: 2, end: 2 }, payload: ParsedPayload::Paragraph { text: "p2".into(), inlines: vec![], }, }, ParsedBlock { kind: kebab_parse_types::ParsedBlockKind::Paragraph, heading_path: h1_a.clone(), source_span: SourceSpan::Line { start: 3, end: 3 }, payload: ParsedPayload::Paragraph { text: "p3".into(), inlines: vec![], }, }, ParsedBlock { kind: kebab_parse_types::ParsedBlockKind::Code, heading_path: h1_a, source_span: SourceSpan::Line { start: 4, end: 5 }, payload: ParsedPayload::Code { lang: None, code: "x".into(), }, }, ParsedBlock { kind: kebab_parse_types::ParsedBlockKind::Paragraph, heading_path: h1_b, source_span: SourceSpan::Line { start: 6, end: 6 }, payload: ParsedPayload::Paragraph { text: "q1".into(), inlines: vec![], }, }, ] } /// `id_for_doc` is deterministic across 1000 invocations on the same /// input — a regression in canonical JSON or BLAKE3 would surface /// here immediately. #[test] fn id_for_doc_deterministic_1000() { let path = WorkspacePath::new("a/b.md".into()).unwrap(); let asset = AssetId("0123456789abcdef0123456789abcdef".into()); let pv = ParserVersion("v1".into()); let first = id_for_doc(&path, &asset, &pv); for _ in 0..1000 { assert_eq!(id_for_doc(&path, &asset, &pv), first); } } /// NFC vs NFD inputs for the same Korean glyph must produce the /// same `doc_id` because `to_posix` runs NFC normalization. #[test] fn nfc_nfd_korean_path_same_id() { let nfd = to_posix(Path::new("\u{1100}\u{1161}.md")).unwrap(); let nfc = to_posix(Path::new("\u{AC00}.md")).unwrap(); let asset = AssetId("0123456789abcdef0123456789abcdef".into()); let pv = parser_version(); assert_eq!(id_for_doc(&nfd, &asset, &pv), id_for_doc(&nfc, &asset, &pv)); } /// `./a/b.md` and `a/b.md` must collapse to the same POSIX form /// before `id_for_doc`. #[test] fn posix_curdir_collapses_to_same_id() { let a = to_posix(Path::new("./a/b.md")).unwrap(); let b = to_posix(Path::new("a/b.md")).unwrap(); let asset = AssetId("0123456789abcdef0123456789abcdef".into()); let pv = parser_version(); assert_eq!(id_for_doc(&a, &asset, &pv), id_for_doc(&b, &asset, &pv)); } /// Ordinals are scoped to (heading_path, block_kind) per §4.3: /// three paragraphs under H1 → 0/1/2; a code block under the same /// H1 starts a fresh counter at 0; a paragraph under a different /// H1 also starts a fresh counter at 0. #[test] fn block_ordinals_scoped_per_heading_and_kind() { let h1_a = vec!["A".to_string()]; let h1_b = vec!["B".to_string()]; let blocks = fixture_blocks_five(); let asset = fixture_asset(); let metadata = fixture_metadata(); let pv = parser_version(); let doc = build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap(); // Compute the expected IDs out-of-band so the test pins both // the (heading_path, kind) ordinal grouping AND the value of // each block_id under the recipe. let p1 = id_for_block( &doc.doc_id, "paragraph", &h1_a, 0, &SourceSpan::Line { start: 1, end: 1 }, ); let p2 = id_for_block( &doc.doc_id, "paragraph", &h1_a, 1, &SourceSpan::Line { start: 2, end: 2 }, ); let p3 = id_for_block( &doc.doc_id, "paragraph", &h1_a, 2, &SourceSpan::Line { start: 3, end: 3 }, ); let c0 = id_for_block( &doc.doc_id, "code", &h1_a, 0, &SourceSpan::Line { start: 4, end: 5 }, ); let q0 = id_for_block( &doc.doc_id, "paragraph", &h1_b, 0, &SourceSpan::Line { start: 6, end: 6 }, ); let ids: Vec<&BlockId> = doc .blocks .iter() .map(|b| match b { Block::Paragraph(t) | Block::Quote(t) => &t.common.block_id, Block::Heading(h) => &h.common.block_id, Block::List(l) => &l.common.block_id, Block::Code(c) => &c.common.block_id, Block::Table(t) => &t.common.block_id, Block::ImageRef(i) => &i.common.block_id, Block::AudioRef(a) => &a.common.block_id, }) .collect(); assert_eq!(ids, vec![&p1, &p2, &p3, &c0, &q0]); } /// Provenance events appear in the documented order: `Discovered` /// (from the asset), `Parsed`, then `Normalized`. Warnings (none in /// this test) would follow. #[test] fn provenance_contains_stage_events_in_order() { let asset = fixture_asset(); let metadata = fixture_metadata(); let pv = parser_version(); let doc = build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); let kinds: Vec<_> = doc.provenance.events.iter().map(|e| e.kind).collect(); assert_eq!( kinds, vec![ ProvenanceKind::Discovered, ProvenanceKind::Parsed, ProvenanceKind::Normalized, ] ); let events = &doc.provenance.events; assert_eq!(events[0].at, asset.discovered_at); assert_eq!(events[0].agent, "kb-source-fs"); assert_eq!(events[1].agent, "kb-parse-md"); assert_eq!(events[2].agent, "kb-normalize"); // Pin the implementation invariant that Parsed and Normalized // share the single `now_utc()` reading inside one call. assert_eq!(events[1].at, events[2].at, "Parsed and Normalized share now_utc"); } /// Warnings carried into `build_canonical_document` are emitted as /// `ProvenanceKind::Warning` events with the upstream agent. #[test] fn provenance_includes_warnings() { let asset = fixture_asset(); let metadata = fixture_metadata(); let pv = parser_version(); let warnings = vec![Warning { kind: WarningKind::MalformedFrontmatter, note: "missing closing fence".into(), }]; let doc = build_canonical_document(&asset, metadata, vec![], &pv, warnings).unwrap(); assert_eq!(doc.provenance.events.len(), 4); let last = doc.provenance.events.last().unwrap(); assert_eq!(last.kind, ProvenanceKind::Warning); assert_eq!(last.agent, "kb-parse-md"); assert!(last.note.as_deref().unwrap().contains("missing closing fence")); } /// `metadata.user["title"]` and `metadata.user["lang"]` are lifted /// to the dedicated `CanonicalDocument` fields and stripped from /// the user map (so the wire form does not duplicate the data). /// Other user keys survive intact. #[test] fn lifts_title_and_lang_from_user_map() { let asset = fixture_asset(); let metadata = fixture_metadata(); let pv = parser_version(); let doc = build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); assert_eq!(doc.title, "Example"); assert_eq!(doc.lang, Lang("en".into())); assert!(!doc.metadata.user.contains_key("title")); assert!(!doc.metadata.user.contains_key("lang")); assert!(doc.metadata.user.contains_key("custom")); } /// Determinism property: 1000 iterations of `build_canonical_document` /// over identical inputs produce byte-identical JSON, modulo the two /// non-deterministic `now_utc()` timestamps for the Parsed/Normalized /// events. We strip those timestamps before comparing. Must finish /// within 1 second. #[test] fn determinism_1000_iterations_under_1s() { let asset = fixture_asset(); let metadata = fixture_metadata(); let pv = parser_version(); // Helper: serialize and replace the two now_utc-derived timestamps // (Parsed + Normalized + any Warning events) with a sentinel so // the comparison only checks the deterministic fields. fn strip_dynamic_at(doc: &CanonicalDocument) -> Value { let mut v = serde_json::to_value(doc).unwrap(); if let Some(events) = v .get_mut("provenance") .and_then(|p| p.get_mut("events")) .and_then(|e| e.as_array_mut()) { for (i, ev) in events.iter_mut().enumerate() { // index 0 is Discovered (deterministic — pinned in // the fixture). Strip everything after. if i > 0 && let Some(obj) = ev.as_object_mut() { obj.insert("at".into(), Value::String("".into())); } } } v } // Use the same 5-block fixture as the ordinal-scoping test so // determinism is exercised on a non-empty `lift_block` path // (block_id hashing, NFC normalization, ordinal counters), not // just an empty Vec. let baseline = build_canonical_document( &asset, metadata.clone(), fixture_blocks_five(), &pv, vec![], ) .unwrap(); let baseline_json = serde_json::to_string(&strip_dynamic_at(&baseline)).unwrap(); let start = std::time::Instant::now(); for _ in 0..1000 { let next = build_canonical_document( &asset, metadata.clone(), fixture_blocks_five(), &pv, vec![], ) .unwrap(); let next_json = serde_json::to_string(&strip_dynamic_at(&next)).unwrap(); assert_eq!(baseline_json, next_json); } assert!( start.elapsed() < std::time::Duration::from_secs(1), "1000 iterations took {:?}", start.elapsed() ); } /// I1 regression — `WarningKind::ExtractFailed` is emitted by /// `kb-parse-md` (panic-recovery in `blocks.rs`), so the resulting /// `ProvenanceEvent::agent` must read `"kb-parse-md"`. A regression /// to `"kb-normalize"` would mis-attribute parse panics and break /// stage-filtered debugging. #[test] fn provenance_with_extract_failed_warning_attributes_to_kb_parse_md() { let asset = fixture_asset(); let metadata = fixture_metadata(); let pv = parser_version(); let warnings = vec![Warning { kind: WarningKind::ExtractFailed, note: "pulldown-cmark panicked; body discarded".into(), }]; let doc = build_canonical_document(&asset, metadata, vec![], &pv, warnings).unwrap(); let warning_event = doc .provenance .events .iter() .find(|e| e.kind == ProvenanceKind::Warning) .expect("warning event present"); assert_eq!(warning_event.agent, "kb-parse-md"); assert!( warning_event .note .as_deref() .unwrap() .contains("ExtractFailed") ); } /// I2 regression — `ParsedPayload::AudioRef` is dropped (not lifted /// into a `Block::AudioRef` with a synthesized empty `AssetId`, /// which would violate `AssetId::from_str`'s 32-hex invariant). A /// `Warning` is surfaced in Provenance, attributed to /// `"kb-normalize"` because the decision is made at the lift stage. #[test] fn audio_ref_block_skipped_with_warning() { let span = SourceSpan::Line { start: 1, end: 1 }; let blocks = vec![ParsedBlock { kind: kebab_parse_types::ParsedBlockKind::AudioRef, heading_path: vec![], source_span: span, payload: ParsedPayload::AudioRef { src: "voice.m4a".into(), }, }]; let asset = fixture_asset(); let metadata = fixture_metadata(); let pv = parser_version(); let doc = build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap(); // No AudioRef block in the canonical output. assert!( !doc.blocks .iter() .any(|b| matches!(b, Block::AudioRef(_))), "AudioRef block should be skipped pre-P8" ); // Exactly one Warning event mentioning the AudioRef src. let warning_events: Vec<_> = doc .provenance .events .iter() .filter(|e| e.kind == ProvenanceKind::Warning) .collect(); assert_eq!(warning_events.len(), 1); let w = warning_events[0]; assert_eq!(w.agent, "kb-normalize"); assert!(w.note.as_deref().unwrap().contains("voice.m4a")); } /// I3 regression — heading-path strings are NFC-normalized before /// feeding into `id_for_block`, so canonically-equivalent NFD and /// NFC inputs produce the same `block_id`. Mirrors /// `nfc_nfd_korean_path_same_id` for `doc_id`. #[test] fn nfc_nfd_korean_heading_path_same_block_id() { let span = SourceSpan::Line { start: 1, end: 1 }; let nfd_heading = "\u{1100}\u{1161}".to_string(); // 가 (NFD) let nfc_heading = "\u{AC00}".to_string(); // 가 (NFC) let mk_block = |heading: String| ParsedBlock { kind: kebab_parse_types::ParsedBlockKind::Paragraph, heading_path: vec![heading], source_span: span.clone(), payload: ParsedPayload::Paragraph { text: "p".into(), inlines: vec![], }, }; let asset = fixture_asset(); let pv = parser_version(); let doc_nfd = build_canonical_document( &asset, fixture_metadata(), vec![mk_block(nfd_heading)], &pv, vec![], ) .unwrap(); let doc_nfc = build_canonical_document( &asset, fixture_metadata(), vec![mk_block(nfc_heading)], &pv, vec![], ) .unwrap(); let id_nfd = match &doc_nfd.blocks[0] { Block::Paragraph(t) => &t.common.block_id, _ => panic!("expected Paragraph"), }; let id_nfc = match &doc_nfc.blocks[0] { Block::Paragraph(t) => &t.common.block_id, _ => panic!("expected Paragraph"), }; assert_eq!(id_nfd, id_nfc, "NFD and NFC heading paths must hash equal"); } /// M7 (revised by p9-fb-07) — `metadata.user["title"] = ""` lifts /// as an empty string but the new derive_title fallback chain /// promotes the file stem so the resulting title is non-empty. /// spec p9-fb-07: "빈 문자열 반환 금지". #[test] fn title_empty_string_in_user_map_falls_back_to_file_stem() { let asset = fixture_asset(); let mut metadata = fixture_metadata(); metadata .user .insert("title".into(), Value::String(String::new())); let pv = parser_version(); let doc = build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); // workspace_path = "notes/example.md" → stem "example". assert_eq!(doc.title, "example"); } /// M7 (revised by p9-fb-07) — `metadata.user["title"] = 42` is /// non-stringy and silently drops at the lift stage; derive_title /// then falls back through the chain to the file stem. /// spec p9-fb-07: "빈 문자열 반환 금지". #[test] fn title_non_string_in_user_map_falls_back_to_file_stem() { let asset = fixture_asset(); let mut metadata = fixture_metadata(); metadata .user .insert("title".into(), Value::Number(42.into())); let pv = parser_version(); let doc = build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); assert_eq!(doc.title, "example"); } /// M7 — non-stringy `lang` (e.g. an array) silently drops. This is /// defensive: P1-2 frontmatter validates the shape upstream, but we /// don't trust it. #[test] fn lang_invalid_shape_silently_drops() { let asset = fixture_asset(); let mut metadata = fixture_metadata(); metadata.user.insert("lang".into(), Value::Array(vec![])); let pv = parser_version(); let doc = build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); assert_eq!(doc.lang, Lang(String::new())); } // ── p9-fb-07: derive_title fallback chain ─────────────────────────── fn span() -> SourceSpan { SourceSpan::Line { start: 1, end: 1 } } fn common_for_test() -> CommonBlock { CommonBlock { block_id: BlockId("0".repeat(32)), heading_path: vec![], source_span: span(), } } fn heading(level: u8, text: &str) -> Block { Block::Heading(HeadingBlock { common: common_for_test(), level, text: text.to_string(), }) } fn paragraph(text: &str) -> Block { Block::Paragraph(TextBlock { common: common_for_test(), text: text.to_string(), inlines: vec![], }) } /// Step 1 — frontmatter title wins, NFC-normalized. #[test] fn derive_title_uses_frontmatter_first() { let blocks = vec![heading(1, "H1 Title"), paragraph("body")]; assert_eq!( derive_title("Frontmatter Title", &blocks, "fallback-stem"), "Frontmatter Title" ); } /// Whitespace-only frontmatter title falls through to the next step. #[test] fn derive_title_blank_frontmatter_falls_through_to_h1() { let blocks = vec![heading(1, "First H1")]; assert_eq!(derive_title(" ", &blocks, "stem"), "First H1"); } /// Step 2 — first H1 wins when frontmatter empty. #[test] fn derive_title_uses_h1_when_no_frontmatter() { let blocks = vec![paragraph("intro"), heading(1, "Real Title"), heading(2, "Sub")]; assert_eq!(derive_title("", &blocks, "stem"), "Real Title"); } /// Step 3 — first H2 wins when no H1. #[test] fn derive_title_uses_h2_when_no_h1() { let blocks = vec![heading(2, "First H2"), heading(2, "Second H2"), heading(1, "")]; assert_eq!(derive_title("", &blocks, "stem"), "First H2"); } /// Step 4 — first non-blank Paragraph wins; truncated to 80 chars. /// Quotes / Lists / Code / Tables / ImageRefs do not qualify. #[test] fn derive_title_uses_first_paragraph_excerpt() { let blocks = vec![ Block::Quote(TextBlock { common: common_for_test(), text: "blockquote should be skipped".into(), inlines: vec![], }), Block::Code(CodeBlock { common: common_for_test(), lang: None, code: "code should be skipped".into(), }), paragraph("This paragraph wins. Long text that would exceed eighty characters once concatenated end-to-end here."), ]; let title = derive_title("", &blocks, "stem"); assert_eq!(title.chars().count(), 80); assert!(title.starts_with("This paragraph wins.")); } /// Step 5 — file stem is the final fallback when there are no /// usable blocks (e.g. table-only doc with no paragraphs). #[test] fn derive_title_falls_back_to_file_stem() { let blocks = vec![Block::Table(TableBlock { common: common_for_test(), headers: vec!["a".into()], rows: vec![vec!["1".into()]], })]; assert_eq!(derive_title("", &blocks, "table-only-doc"), "table-only-doc"); } /// Step 5 sentinel — empty file_stem AND no usable blocks falls back /// to the literal `"untitled"`. Pathological case (workspace_path /// with no filename component). #[test] fn derive_title_returns_untitled_when_everything_blank() { assert_eq!(derive_title("", &[], ""), "untitled"); assert_eq!(derive_title(" ", &[], " "), "untitled"); } /// Korean H1 in NFD form is normalized to NFC before being chosen /// as the title. Mirrors the heading_path NFC pin elsewhere. #[test] fn derive_title_nfc_normalizes_korean_h1() { let nfd = "\u{1100}\u{1161}".to_string(); // 가 (NFD) let nfc = "\u{AC00}".to_string(); // 가 (NFC) let blocks = vec![heading(1, &nfd)]; assert_eq!(derive_title("", &blocks, "stem"), nfc); } /// `build_canonical_document` integrates the derive_title chain — /// when frontmatter title is empty, the first H1 is used. #[test] fn build_canonical_document_falls_back_to_first_h1() { let asset = fixture_asset(); let mut metadata = fixture_metadata(); metadata.user.remove("title"); let blocks = vec![ParsedBlock { kind: kebab_parse_types::ParsedBlockKind::Heading, heading_path: vec![], source_span: span(), payload: ParsedPayload::Heading { level: 1, text: "Lifted From H1".into(), }, }]; let pv = parser_version(); let doc = build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap(); assert_eq!(doc.title, "Lifted From H1"); } /// `build_canonical_document` integrates the file_stem fallback — /// no frontmatter title, no headings, no paragraphs → filename /// (stripped of extension). #[test] fn build_canonical_document_falls_back_to_file_stem() { let asset = fixture_asset(); // workspace_path = "notes/example.md" → stem "example" let mut metadata = fixture_metadata(); metadata.user.remove("title"); let pv = parser_version(); let doc = build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap(); assert_eq!(doc.title, "example"); } }