Files
kebab/crates/kebab-normalize/src/lib.rs
th-kim0823 bf4ebf8d2a feat(p10-1a-1): add Metadata.repo / git_branch / git_commit / code_lang
Four optional, serde-skipped-when-None fields added to `Metadata` for
code ingest context. All 11 downstream construction sites patched with
`repo: None, git_branch: None, git_commit: None, code_lang: None`.
Full workspace check (`--tests`) and per-crate test suite pass clean.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 15:44:18 +09:00

1098 lines
42 KiB
Rust

//! `kb-normalize` — lift parser output (`kb-parse-types`) into a
//! [`kebab_core::CanonicalDocument`] with deterministic IDs.
//!
//! Per design §3.4 (CanonicalDocument / Block), §4.2 (ID recipe), §4.3
//! (ordinal rule), §3.6 (Provenance), §8 (module boundaries).
//!
//! Public surface:
//!
//! * [`build_canonical_document`] — assemble a `CanonicalDocument` from
//! `(RawAsset, Metadata, Vec<ParsedBlock>, ParserVersion, Vec<Warning>)`.
//! * [`id_for_doc`], [`id_for_block`] — re-exports of the canonical
//! ID-recipe functions in `kb-core::ids` (§4.2). `kb-core` is the only
//! implementation; `kb-normalize` is the canonical *entry point* per
//! design §8.
//!
//! This crate must NOT depend on any parser implementation crate
//! (`kb-parse-md`, `kb-parse-pdf`, …). All parser output flows in via
//! the shared `kb-parse-types` crate.
use std::collections::HashMap;
use std::path::Path;
use anyhow::Result;
use kebab_core::{
Block, BlockId, CanonicalDocument, CodeBlock, CommonBlock, DocumentId, HeadingBlock,
ImageRefBlock, Inline, Lang, ListBlock, Metadata, ParserVersion, Provenance, ProvenanceEvent,
ProvenanceKind, RawAsset, TableBlock, TextBlock,
};
use kebab_parse_types::{ParsedBlock, ParsedPayload, Warning, WarningKind};
use time::OffsetDateTime;
use unicode_normalization::UnicodeNormalization;
pub use kebab_core::{id_for_block, id_for_doc};
/// Assemble a [`CanonicalDocument`] out of a discovered asset, its
/// frontmatter metadata, the parser's blocks, the parser version and
/// any warnings raised upstream.
///
/// Contract (design §3.4 / §4.2 / §4.3 / §3.6):
///
/// * **Document ID** — `id_for_doc(workspace_path, asset_id,
///   parser_version)`. `workspace_path` is taken from `asset` as-is; it
///   is expected to be NFC + POSIX already (per
///   `kebab_core::normalize::to_posix`).
/// * **Block IDs** — every parsed block goes through `lift_block`,
///   which assigns a 0-based ordinal per `(heading_path, block_kind)`
///   group in document order (§4.3) and feeds it to `id_for_block`.
/// * **Title / lang** — `metadata.user["title"]` and
///   `metadata.user["lang"]` are always removed from the user map so
///   the wire form never carries them twice. A string `lang` becomes
///   the document [`Lang`]; a missing or non-string value yields an
///   empty `Lang`. A string `title` seeds [`derive_title`]; a missing,
///   non-string or blank title falls through that function's fallback
///   chain (H1 → H2 → first paragraph excerpt → file stem), so the
///   resulting title is not left empty.
/// * **Provenance** — events are appended in this order: `Discovered`
///   (stamped with `asset.discovered_at`), `Parsed`, `Normalized`, one
///   `Warning` per upstream warning, then one `Warning` per lift-stage
///   warning. Every event except `Discovered` shares a single
///   `now_utc()` reading; ordering is carried by `Vec` position.
/// * **Versions** — `schema_version` and `doc_version` start at `1`;
///   `last_chunker_version` / `last_embedding_version` start as `None`.
///
/// The body has no failing path today, so the result is always `Ok`.
pub fn build_canonical_document(
    asset: &RawAsset,
    metadata: Metadata,
    blocks: Vec<ParsedBlock>,
    parser_version: &ParserVersion,
    warnings: Vec<Warning>,
) -> Result<CanonicalDocument> {
    let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, parser_version);

    // `Metadata` has no dedicated title / lang fields, so both arrive in
    // the free-form user map. Take them out unconditionally (even when
    // the value turns out not to be a string) so they never show up
    // twice on the wire.
    let mut metadata = metadata;
    let frontmatter_title = match metadata.user.remove("title") {
        Some(value) => value.as_str().map(String::from).unwrap_or_default(),
        None => String::new(),
    };
    let lang = match metadata.user.remove("lang") {
        Some(value) => Lang(value.as_str().map(String::from).unwrap_or_default()),
        None => Lang(String::new()),
    };

    // §4.3: one 0-based counter per (heading_path, block_kind) pair,
    // advanced in document order.
    let mut counters: HashMap<(Vec<String>, &'static str), u32> = HashMap::new();
    // Warnings raised while lifting (currently only dropped AudioRef
    // blocks) are collected apart from the upstream ones because they
    // must be attributed to `kb-normalize`, not to whatever agent
    // `warning_agent` would pick for the bare `WarningKind`.
    let mut lift_warnings: Vec<Warning> = Vec::new();
    let mut lifted_blocks: Vec<Block> = Vec::with_capacity(blocks.len());
    for parsed in blocks {
        if let Some(block) = lift_block(&doc_id, parsed, &mut counters, &mut lift_warnings) {
            lifted_blocks.push(block);
        }
    }

    // p9-fb-07: resolve the final title through the fallback chain
    // (frontmatter → H1 → H2 → paragraph excerpt → file stem).
    let file_stem = workspace_path_stem(&asset.workspace_path.0);
    let title = derive_title(&frontmatter_title, &lifted_blocks, &file_stem);

    tracing::debug!(
        target: "kebab-normalize",
        "built canonical document doc_id={} blocks={}",
        doc_id.0,
        lifted_blocks.len()
    );

    // One clock reading for every event produced by this call keeps
    // the timestamps inside a single document consistent.
    let now = OffsetDateTime::now_utc();
    // `{:?}` on `WarningKind` prints the variant name, which is the
    // intended human-readable form in the Provenance trace.
    let warning_event = |agent: &str, warning: &Warning| ProvenanceEvent {
        at: now,
        agent: agent.to_string(),
        kind: ProvenanceKind::Warning,
        note: Some(format!("{:?}: {}", warning.kind, warning.note)),
    };

    let mut events: Vec<ProvenanceEvent> =
        Vec::with_capacity(3 + warnings.len() + lift_warnings.len());
    events.extend([
        ProvenanceEvent {
            at: asset.discovered_at,
            agent: "kb-source-fs".to_string(),
            kind: ProvenanceKind::Discovered,
            note: None,
        },
        ProvenanceEvent {
            at: now,
            agent: "kb-parse-md".to_string(),
            kind: ProvenanceKind::Parsed,
            note: Some(format!("parser_version={}", parser_version.0)),
        },
        ProvenanceEvent {
            at: now,
            agent: "kb-normalize".to_string(),
            kind: ProvenanceKind::Normalized,
            note: None,
        },
    ]);
    // Upstream warnings keep the agent that emitted them ...
    events.extend(
        warnings
            .iter()
            .map(|warning| warning_event(warning_agent(&warning.kind), warning)),
    );
    // ... while lift-stage warnings always belong to `kb-normalize`.
    events.extend(
        lift_warnings
            .iter()
            .map(|warning| warning_event("kb-normalize", warning)),
    );

    Ok(CanonicalDocument {
        doc_id,
        source_asset_id: asset.asset_id.clone(),
        workspace_path: asset.workspace_path.clone(),
        title,
        lang,
        blocks: lifted_blocks,
        metadata,
        provenance: Provenance { events },
        parser_version: parser_version.clone(),
        schema_version: 1,
        doc_version: 1,
        last_chunker_version: None,
        last_embedding_version: None,
    })
}
/// Name the upstream agent to record in `ProvenanceEvent::agent` for a
/// warning of the given kind.
///
/// Every kind handled here currently maps to `"kb-parse-md"`.
/// `ExtractFailed` in particular is emitted by `kb-parse-md`'s
/// panic-recovery guard around `parse_blocks` (see
/// `crates/kb-parse-md/src/blocks.rs`). Should another stage start
/// emitting one of these kinds through the upstream warning list, the
/// kind alone will no longer identify the source and `Warning` will
/// need extra context (e.g. a `WarningSource` field) to keep the
/// attribution honest.
///
/// The match is deliberately exhaustive (no `_` arm) so that adding a
/// `WarningKind` variant forces a decision here at compile time.
fn warning_agent(kind: &WarningKind) -> &'static str {
    match kind {
        WarningKind::MalformedFrontmatter
        | WarningKind::EncodingFallback
        | WarningKind::MalformedTable
        | WarningKind::ExtractFailed => "kb-parse-md",
    }
}
/// Return the `block_kind` string that represents a payload variant in
/// the §4.2 ID tuple. Each label is lowercase with no spaces.
///
/// The match has no `_` arm, so a new `ParsedPayload` variant must be
/// given a label here before the crate compiles again.
fn payload_kind(payload: &ParsedPayload) -> &'static str {
    match payload {
        // Prose-like blocks.
        ParsedPayload::Paragraph { .. } => "paragraph",
        ParsedPayload::Quote { .. } => "quote",
        ParsedPayload::Heading { .. } => "heading",
        // Structured blocks.
        ParsedPayload::List { .. } => "list",
        ParsedPayload::Table { .. } => "table",
        ParsedPayload::Code { .. } => "code",
        // References to other assets.
        ParsedPayload::ImageRef { .. } => "imageref",
        ParsedPayload::AudioRef { .. } => "audioref",
    }
}
/// Hand out the next 0-based ordinal for the `(heading_path, kind)`
/// group and advance that group's counter by one.
///
/// A group seen for the first time starts at `0`. Counters of other
/// groups are left untouched.
fn next_ordinal(
    counters: &mut HashMap<(Vec<String>, &'static str), u32>,
    heading_path: &[String],
    kind: &'static str,
) -> u32 {
    let slot = counters
        .entry((heading_path.to_vec(), kind))
        .or_default();
    let issued = *slot;
    *slot = issued + 1;
    issued
}
fn lift_block(
doc_id: &DocumentId,
pb: ParsedBlock,
counters: &mut HashMap<(Vec<String>, &'static str), u32>,
warnings: &mut Vec<Warning>,
) -> Option<Block> {
let kind = payload_kind(&pb.payload);
// Task spec line 73: "All input strings normalized to NFC before
// hashing." `pulldown-cmark` does not NFC heading text, and
// `serde_json_canonicalizer` v0.3 does not normalize strings either,
// so we must NFC-normalize `heading_path` here before it feeds both
// the §4.2 ID recipe AND the on-disk `CommonBlock.heading_path` (so
// wire form matches ID input). Without this, NFD `\u{1100}\u{1161}`
// and NFC `\u{AC00}` (both render as 가) would produce different
// `block_id`s for what is logically the same heading.
let heading_path_nfc: Vec<String> =
pb.heading_path.iter().map(|s| s.nfc().collect()).collect();
let ordinal = next_ordinal(counters, &heading_path_nfc, kind);
let block_id: BlockId =
id_for_block(doc_id, kind, &heading_path_nfc, ordinal, &pb.source_span);
let common = CommonBlock {
block_id,
heading_path: heading_path_nfc,
source_span: pb.source_span,
};
let block = match pb.payload {
ParsedPayload::Heading { level, text } => Block::Heading(HeadingBlock {
common,
level,
text,
}),
ParsedPayload::Paragraph { text, inlines } => Block::Paragraph(TextBlock {
common,
text,
inlines,
}),
ParsedPayload::List { ordered, items } => Block::List(ListBlock {
common: common.clone(),
ordered,
items: items
.into_iter()
.map(|item_inlines| TextBlock {
// All list items currently inherit the parent's
// CommonBlock (incl. block_id). Per-item IDs would
// require a §4.2 recipe extension. Spec (§3.4)
// defines `ListBlock.items: Vec<TextBlock>` and
// does not allocate per-item BlockIds. Re-using the
// parent's common keeps the wire form deterministic
// while letting the inline tree carry the item
// content.
common: common.clone(),
text: flatten_inlines(&item_inlines),
inlines: item_inlines,
})
.collect(),
}),
ParsedPayload::Code { lang, code } => Block::Code(CodeBlock { common, lang, code }),
ParsedPayload::Table { headers, rows } => Block::Table(TableBlock {
common,
headers,
rows,
}),
ParsedPayload::Quote { text, inlines } => Block::Quote(TextBlock {
common,
text,
inlines,
}),
ParsedPayload::ImageRef { src, alt } => Block::ImageRef(ImageRefBlock {
common,
asset_id: None,
src,
alt,
ocr: None,
caption: None,
}),
// TODO(P8): audio extractor will resolve workspace assets and
// produce real AssetIds. This skip-and-warn shim is a
// placeholder. `AssetId::from_str` requires a 32-hex string, so
// synthesizing `AssetId(String::new())` would break the
// invariant — instead we drop the block and surface a Warning
// (attributed to `kb-normalize` per §3.6 since this is the
// lift-stage decision).
ParsedPayload::AudioRef { src } => {
warnings.push(Warning {
kind: WarningKind::ExtractFailed,
note: format!(
"audio-ref AssetId resolution deferred to P8 — block dropped (src={src})"
),
});
return None;
}
};
Some(block)
}
/// Concatenate the plain text of a sequence of inline nodes.
///
/// `ParsedPayload::List` carries only an inline tree per item, so this
/// is what produces the `TextBlock.text` of each list item. An empty
/// slice yields an empty string.
fn flatten_inlines(inlines: &[Inline]) -> String {
    inlines.iter().fold(String::new(), |mut text, inline| {
        flatten_inline(inline, &mut text);
        text
    })
}
/// Append the plain text of one inline node to `out`.
///
/// * `Strong` / `Emph` contribute the text of their children, visited
///   recursively in order; the emphasis itself leaves no trace.
/// * `Text`, `Code` and `Link` contribute their `text` / `code` / link
///   `text` field verbatim; every other `Link` field is ignored.
fn flatten_inline(i: &Inline, out: &mut String) {
    match i {
        Inline::Strong { children } | Inline::Emph { children } => {
            children
                .iter()
                .for_each(|child| flatten_inline(child, out));
        }
        Inline::Link { text, .. } => out.push_str(text),
        Inline::Code { code } => out.push_str(code),
        Inline::Text { text } => out.push_str(text),
    }
}
/// p9-fb-07: derive a usable title from the frontmatter, the lifted
/// blocks and the source filename, using a fixed fallback chain.
///
/// Priority (first non-blank candidate wins):
///
/// 1. `frontmatter_title`, trimmed.
/// 2. First `Heading` block at level 1 with non-blank text.
/// 3. First `Heading` block at level 2 with non-blank text.
/// 4. First `Paragraph` block (NOT `Quote`, `List`, `Code`, `Table`,
///    `ImageRef`, `AudioRef`) with non-blank text — its first 80 chars.
/// 5. `file_stem` (filename minus extension), trimmed, with no case
///    transformation.
/// 6. The literal `"untitled"` — only reachable when every candidate
///    above is blank, including `file_stem` (pathological).
///
/// Whatever candidate is chosen is NFC-normalized, so the returned
/// title is canonically equivalent to the source text regardless of
/// whether it was stored as NFD or NFC. This includes `file_stem`:
/// stems derived from a `WorkspacePath` are expected to be NFC already
/// (in which case the pass changes nothing), but this function is
/// `pub` and must not rely on every caller having normalized its
/// argument.
///
/// Never returns an empty string.
pub fn derive_title(frontmatter_title: &str, blocks: &[Block], file_stem: &str) -> String {
    // Trim, reject blank candidates, and NFC-normalize the survivors.
    let non_blank_nfc = |candidate: &str| -> Option<String> {
        let candidate = candidate.trim();
        (!candidate.is_empty()).then(|| candidate.nfc().collect())
    };

    // `or_else` is lazy: a later step is only evaluated when every
    // earlier one produced nothing.
    non_blank_nfc(frontmatter_title)
        .or_else(|| first_heading_text(blocks, 1))
        .or_else(|| first_heading_text(blocks, 2))
        .or_else(|| first_paragraph_excerpt(blocks, 80))
        .or_else(|| non_blank_nfc(file_stem))
        .unwrap_or_else(|| "untitled".to_string())
}
/// Find the first `Heading` block at exactly `level` whose text is not
/// blank, and return that text trimmed and NFC-normalized.
///
/// Headings at other levels, headings with blank text and every
/// non-heading block are skipped. Returns `None` when nothing
/// qualifies.
fn first_heading_text(blocks: &[Block], level: u8) -> Option<String> {
    blocks
        .iter()
        .filter_map(|block| match block {
            Block::Heading(heading) if heading.level == level => Some(heading.text.trim()),
            _ => None,
        })
        .find(|text| !text.is_empty())
        .map(|text| text.nfc().collect())
}
/// Return an excerpt of the first `Paragraph` block whose text is not
/// blank: the trimmed text, NFC-normalized, cut to at most `max_chars`
/// `char`s.
///
/// Only `Block::Paragraph` qualifies; every other block kind and any
/// paragraph with blank text is skipped. Returns `None` when no
/// paragraph qualifies. The cut is made on `char` boundaries of the
/// normalized text; with `max_chars == 0` a qualifying paragraph yields
/// `Some("")`.
fn first_paragraph_excerpt(blocks: &[Block], max_chars: usize) -> Option<String> {
    blocks.iter().find_map(|block| {
        let Block::Paragraph(paragraph) = block else {
            return None;
        };
        let body = paragraph.text.trim();
        if body.is_empty() {
            return None;
        }
        // `nfc()` is a lazy char iterator, so truncating before
        // collecting yields the same excerpt without normalizing (and
        // allocating) the rest of a potentially long paragraph.
        Some(body.nfc().take(max_chars).collect())
    })
}
/// Extract the filename stem (filename without its last extension)
/// from a workspace path string.
///
/// This follows `Path::file_stem` semantics:
///
/// * only the last extension is stripped (`foo.tar.gz` → `foo.tar`);
/// * a dot-file with no further extension keeps its full name
///   (`.hidden` → `.hidden`);
/// * a trailing slash is ignored, so `notes/` yields `notes`;
/// * the empty string is returned when there is no final component to
///   take a stem from (empty path, `/`, or a path ending in `..`).
fn workspace_path_stem(workspace_path: &str) -> String {
    match Path::new(workspace_path)
        .file_stem()
        .and_then(|stem| stem.to_str())
    {
        Some(stem) => stem.to_owned(),
        None => String::new(),
    }
}
// Unit tests for the normalize stage: ID determinism and NFC/POSIX
// equivalence, §4.3 ordinal scoping, Provenance ordering and
// attribution, title/lang lifting, and the p9-fb-07 `derive_title`
// fallback chain.
#[cfg(test)]
mod tests {
use super::*;
use kebab_core::{
AssetId, AssetStorage, Checksum, MediaType, SourceSpan, SourceType, SourceUri,
TrustLevel, WorkspacePath, normalize::to_posix,
};
use serde_json::Value;
use std::path::{Path, PathBuf};
use time::OffsetDateTime;
/// Markdown `RawAsset` fixture at workspace path `notes/example.md`
/// (file stem `example`) with a fixed `asset_id`, checksum and
/// `discovered_at`.
fn fixture_asset() -> RawAsset {
let workspace_path = WorkspacePath::new("notes/example.md".into()).unwrap();
RawAsset {
asset_id: AssetId("a".repeat(32)),
source_uri: SourceUri::File(PathBuf::from("/tmp/example.md")),
workspace_path,
media_type: MediaType::Markdown,
byte_len: 0,
checksum: Checksum("0".repeat(64)),
// Pin a fixed timestamp so determinism tests can compare
// outputs across runs without timestamp jitter outside the
// fields we explicitly strip.
discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
stored: AssetStorage::Reference {
path: PathBuf::from("/tmp/example.md"),
sha: Checksum("0".repeat(64)),
},
}
}
/// `Metadata` fixture whose user map holds `title = "Example"`,
/// `lang = "en"` and an unrelated `custom = true` key; the code-ingest
/// fields (`repo`, `git_branch`, `git_commit`, `code_lang`) are `None`.
fn fixture_metadata() -> Metadata {
let mut user = serde_json::Map::new();
user.insert("title".into(), Value::String("Example".into()));
user.insert("lang".into(), Value::String("en".into()));
user.insert("custom".into(), Value::Bool(true));
Metadata {
aliases: vec![],
tags: vec![],
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
source_type: SourceType::Markdown,
trust_level: TrustLevel::Primary,
user_id_alias: None,
user,
repo: None,
git_branch: None,
git_commit: None,
code_lang: None,
}
}
/// Fixed parser version shared by the tests that need one.
fn parser_version() -> ParserVersion {
ParserVersion("kb-normalize-test-0".into())
}
/// Fixed 5-block input used by both the ordinal-scoping pinning test
/// and the determinism stress test (so the latter exercises the
/// `lift_block` path, not just the empty-blocks path).
///
/// Layout: three paragraphs and one code block under heading path
/// `["A"]`, then one paragraph under heading path `["B"]`.
fn fixture_blocks_five() -> Vec<ParsedBlock> {
let h1_a = vec!["A".to_string()];
let h1_b = vec!["B".to_string()];
vec![
ParsedBlock {
kind: kebab_parse_types::ParsedBlockKind::Paragraph,
heading_path: h1_a.clone(),
source_span: SourceSpan::Line { start: 1, end: 1 },
payload: ParsedPayload::Paragraph {
text: "p1".into(),
inlines: vec![],
},
},
ParsedBlock {
kind: kebab_parse_types::ParsedBlockKind::Paragraph,
heading_path: h1_a.clone(),
source_span: SourceSpan::Line { start: 2, end: 2 },
payload: ParsedPayload::Paragraph {
text: "p2".into(),
inlines: vec![],
},
},
ParsedBlock {
kind: kebab_parse_types::ParsedBlockKind::Paragraph,
heading_path: h1_a.clone(),
source_span: SourceSpan::Line { start: 3, end: 3 },
payload: ParsedPayload::Paragraph {
text: "p3".into(),
inlines: vec![],
},
},
ParsedBlock {
kind: kebab_parse_types::ParsedBlockKind::Code,
heading_path: h1_a,
source_span: SourceSpan::Line { start: 4, end: 5 },
payload: ParsedPayload::Code {
lang: None,
code: "x".into(),
},
},
ParsedBlock {
kind: kebab_parse_types::ParsedBlockKind::Paragraph,
heading_path: h1_b,
source_span: SourceSpan::Line { start: 6, end: 6 },
payload: ParsedPayload::Paragraph {
text: "q1".into(),
inlines: vec![],
},
},
]
}
/// `id_for_doc` is deterministic across 1000 invocations on the same
/// input — a regression in canonical JSON or BLAKE3 would surface
/// here immediately.
#[test]
fn id_for_doc_deterministic_1000() {
let path = WorkspacePath::new("a/b.md".into()).unwrap();
let asset = AssetId("0123456789abcdef0123456789abcdef".into());
let pv = ParserVersion("v1".into());
let first = id_for_doc(&path, &asset, &pv);
for _ in 0..1000 {
assert_eq!(id_for_doc(&path, &asset, &pv), first);
}
}
/// NFC vs NFD inputs for the same Korean glyph must produce the
/// same `doc_id` because `to_posix` runs NFC normalization.
#[test]
fn nfc_nfd_korean_path_same_id() {
let nfd = to_posix(Path::new("\u{1100}\u{1161}.md")).unwrap();
let nfc = to_posix(Path::new("\u{AC00}.md")).unwrap();
let asset = AssetId("0123456789abcdef0123456789abcdef".into());
let pv = parser_version();
assert_eq!(id_for_doc(&nfd, &asset, &pv), id_for_doc(&nfc, &asset, &pv));
}
/// `./a/b.md` and `a/b.md` must collapse to the same POSIX form
/// before `id_for_doc`.
#[test]
fn posix_curdir_collapses_to_same_id() {
let a = to_posix(Path::new("./a/b.md")).unwrap();
let b = to_posix(Path::new("a/b.md")).unwrap();
let asset = AssetId("0123456789abcdef0123456789abcdef".into());
let pv = parser_version();
assert_eq!(id_for_doc(&a, &asset, &pv), id_for_doc(&b, &asset, &pv));
}
/// Ordinals are scoped to (heading_path, block_kind) per §4.3:
/// three paragraphs under H1 → 0/1/2; a code block under the same
/// H1 starts a fresh counter at 0; a paragraph under a different
/// H1 also starts a fresh counter at 0.
#[test]
fn block_ordinals_scoped_per_heading_and_kind() {
let h1_a = vec!["A".to_string()];
let h1_b = vec!["B".to_string()];
let blocks = fixture_blocks_five();
let asset = fixture_asset();
let metadata = fixture_metadata();
let pv = parser_version();
let doc =
build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap();
// Compute the expected IDs out-of-band so the test pins both
// the (heading_path, kind) ordinal grouping AND the value of
// each block_id under the recipe.
let p1 = id_for_block(
&doc.doc_id,
"paragraph",
&h1_a,
0,
&SourceSpan::Line { start: 1, end: 1 },
);
let p2 = id_for_block(
&doc.doc_id,
"paragraph",
&h1_a,
1,
&SourceSpan::Line { start: 2, end: 2 },
);
let p3 = id_for_block(
&doc.doc_id,
"paragraph",
&h1_a,
2,
&SourceSpan::Line { start: 3, end: 3 },
);
let c0 = id_for_block(
&doc.doc_id,
"code",
&h1_a,
0,
&SourceSpan::Line { start: 4, end: 5 },
);
let q0 = id_for_block(
&doc.doc_id,
"paragraph",
&h1_b,
0,
&SourceSpan::Line { start: 6, end: 6 },
);
// Collect the actual block IDs in document order, whatever the
// block variant.
let ids: Vec<&BlockId> = doc
.blocks
.iter()
.map(|b| match b {
Block::Paragraph(t) | Block::Quote(t) => &t.common.block_id,
Block::Heading(h) => &h.common.block_id,
Block::List(l) => &l.common.block_id,
Block::Code(c) => &c.common.block_id,
Block::Table(t) => &t.common.block_id,
Block::ImageRef(i) => &i.common.block_id,
Block::AudioRef(a) => &a.common.block_id,
})
.collect();
assert_eq!(ids, vec![&p1, &p2, &p3, &c0, &q0]);
}
/// Provenance events appear in the documented order: `Discovered`
/// (from the asset), `Parsed`, then `Normalized`. Warnings (none in
/// this test) would follow.
#[test]
fn provenance_contains_stage_events_in_order() {
let asset = fixture_asset();
let metadata = fixture_metadata();
let pv = parser_version();
let doc =
build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap();
let kinds: Vec<_> = doc.provenance.events.iter().map(|e| e.kind).collect();
assert_eq!(
kinds,
vec![
ProvenanceKind::Discovered,
ProvenanceKind::Parsed,
ProvenanceKind::Normalized,
]
);
let events = &doc.provenance.events;
assert_eq!(events[0].at, asset.discovered_at);
assert_eq!(events[0].agent, "kb-source-fs");
assert_eq!(events[1].agent, "kb-parse-md");
assert_eq!(events[2].agent, "kb-normalize");
// Pin the implementation invariant that Parsed and Normalized
// share the single `now_utc()` reading inside one call.
assert_eq!(events[1].at, events[2].at, "Parsed and Normalized share now_utc");
}
/// Warnings carried into `build_canonical_document` are emitted as
/// `ProvenanceKind::Warning` events with the upstream agent.
#[test]
fn provenance_includes_warnings() {
let asset = fixture_asset();
let metadata = fixture_metadata();
let pv = parser_version();
let warnings = vec![Warning {
kind: WarningKind::MalformedFrontmatter,
note: "missing closing fence".into(),
}];
let doc =
build_canonical_document(&asset, metadata, vec![], &pv, warnings).unwrap();
// Three stage events plus the single warning.
assert_eq!(doc.provenance.events.len(), 4);
let last = doc.provenance.events.last().unwrap();
assert_eq!(last.kind, ProvenanceKind::Warning);
assert_eq!(last.agent, "kb-parse-md");
assert!(last.note.as_deref().unwrap().contains("missing closing fence"));
}
/// `metadata.user["title"]` and `metadata.user["lang"]` are lifted
/// to the dedicated `CanonicalDocument` fields and stripped from
/// the user map (so the wire form does not duplicate the data).
/// Other user keys survive intact.
#[test]
fn lifts_title_and_lang_from_user_map() {
let asset = fixture_asset();
let metadata = fixture_metadata();
let pv = parser_version();
let doc =
build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap();
assert_eq!(doc.title, "Example");
assert_eq!(doc.lang, Lang("en".into()));
assert!(!doc.metadata.user.contains_key("title"));
assert!(!doc.metadata.user.contains_key("lang"));
assert!(doc.metadata.user.contains_key("custom"));
}
/// Determinism property: 1000 iterations of `build_canonical_document`
/// over identical inputs produce byte-identical JSON, modulo the
/// non-deterministic `now_utc()` timestamps carried by every event
/// after `Discovered`. We strip those timestamps before comparing.
/// Must finish within 1 second.
#[test]
fn determinism_1000_iterations_under_1s() {
let asset = fixture_asset();
let metadata = fixture_metadata();
let pv = parser_version();
// Helper: serialize and replace every now_utc-derived timestamp
// (Parsed, Normalized and any Warning events) with a sentinel so
// the comparison only checks the deterministic fields.
fn strip_dynamic_at(doc: &CanonicalDocument) -> Value {
let mut v = serde_json::to_value(doc).unwrap();
if let Some(events) = v
.get_mut("provenance")
.and_then(|p| p.get_mut("events"))
.and_then(|e| e.as_array_mut())
{
for (i, ev) in events.iter_mut().enumerate() {
// index 0 is Discovered (deterministic — pinned in
// the fixture). Strip everything after.
if i > 0
&& let Some(obj) = ev.as_object_mut()
{
obj.insert("at".into(), Value::String("<stripped>".into()));
}
}
}
v
}
// Use the same 5-block fixture as the ordinal-scoping test so
// determinism is exercised on a non-empty `lift_block` path
// (block_id hashing, NFC normalization, ordinal counters), not
// just an empty Vec.
let baseline = build_canonical_document(
&asset,
metadata.clone(),
fixture_blocks_five(),
&pv,
vec![],
)
.unwrap();
let baseline_json = serde_json::to_string(&strip_dynamic_at(&baseline)).unwrap();
let start = std::time::Instant::now();
for _ in 0..1000 {
let next = build_canonical_document(
&asset,
metadata.clone(),
fixture_blocks_five(),
&pv,
vec![],
)
.unwrap();
let next_json = serde_json::to_string(&strip_dynamic_at(&next)).unwrap();
assert_eq!(baseline_json, next_json);
}
assert!(
start.elapsed() < std::time::Duration::from_secs(1),
"1000 iterations took {:?}",
start.elapsed()
);
}
/// I1 regression — `WarningKind::ExtractFailed` is emitted by
/// `kb-parse-md` (panic-recovery in `blocks.rs`), so the resulting
/// `ProvenanceEvent::agent` must read `"kb-parse-md"`. A regression
/// to `"kb-normalize"` would mis-attribute parse panics and break
/// stage-filtered debugging.
#[test]
fn provenance_with_extract_failed_warning_attributes_to_kb_parse_md() {
let asset = fixture_asset();
let metadata = fixture_metadata();
let pv = parser_version();
let warnings = vec![Warning {
kind: WarningKind::ExtractFailed,
note: "pulldown-cmark panicked; body discarded".into(),
}];
let doc =
build_canonical_document(&asset, metadata, vec![], &pv, warnings).unwrap();
let warning_event = doc
.provenance
.events
.iter()
.find(|e| e.kind == ProvenanceKind::Warning)
.expect("warning event present");
assert_eq!(warning_event.agent, "kb-parse-md");
assert!(
warning_event
.note
.as_deref()
.unwrap()
.contains("ExtractFailed")
);
}
/// I2 regression — `ParsedPayload::AudioRef` is dropped (not lifted
/// into a `Block::AudioRef` with a synthesized empty `AssetId`,
/// which would violate `AssetId::from_str`'s 32-hex invariant). A
/// `Warning` is surfaced in Provenance, attributed to
/// `"kb-normalize"` because the decision is made at the lift stage.
#[test]
fn audio_ref_block_skipped_with_warning() {
let span = SourceSpan::Line { start: 1, end: 1 };
let blocks = vec![ParsedBlock {
kind: kebab_parse_types::ParsedBlockKind::AudioRef,
heading_path: vec![],
source_span: span,
payload: ParsedPayload::AudioRef {
src: "voice.m4a".into(),
},
}];
let asset = fixture_asset();
let metadata = fixture_metadata();
let pv = parser_version();
let doc =
build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap();
// No AudioRef block in the canonical output.
assert!(
!doc.blocks
.iter()
.any(|b| matches!(b, Block::AudioRef(_))),
"AudioRef block should be skipped pre-P8"
);
// Exactly one Warning event mentioning the AudioRef src.
let warning_events: Vec<_> = doc
.provenance
.events
.iter()
.filter(|e| e.kind == ProvenanceKind::Warning)
.collect();
assert_eq!(warning_events.len(), 1);
let w = warning_events[0];
assert_eq!(w.agent, "kb-normalize");
assert!(w.note.as_deref().unwrap().contains("voice.m4a"));
}
/// I3 regression — heading-path strings are NFC-normalized before
/// feeding into `id_for_block`, so canonically-equivalent NFD and
/// NFC inputs produce the same `block_id`. Mirrors
/// `nfc_nfd_korean_path_same_id` for `doc_id`.
#[test]
fn nfc_nfd_korean_heading_path_same_block_id() {
let span = SourceSpan::Line { start: 1, end: 1 };
let nfd_heading = "\u{1100}\u{1161}".to_string(); // 가 (NFD)
let nfc_heading = "\u{AC00}".to_string(); // 가 (NFC)
let mk_block = |heading: String| ParsedBlock {
kind: kebab_parse_types::ParsedBlockKind::Paragraph,
heading_path: vec![heading],
source_span: span.clone(),
payload: ParsedPayload::Paragraph {
text: "p".into(),
inlines: vec![],
},
};
let asset = fixture_asset();
let pv = parser_version();
let doc_nfd = build_canonical_document(
&asset,
fixture_metadata(),
vec![mk_block(nfd_heading)],
&pv,
vec![],
)
.unwrap();
let doc_nfc = build_canonical_document(
&asset,
fixture_metadata(),
vec![mk_block(nfc_heading)],
&pv,
vec![],
)
.unwrap();
let id_nfd = match &doc_nfd.blocks[0] {
Block::Paragraph(t) => &t.common.block_id,
_ => panic!("expected Paragraph"),
};
let id_nfc = match &doc_nfc.blocks[0] {
Block::Paragraph(t) => &t.common.block_id,
_ => panic!("expected Paragraph"),
};
assert_eq!(id_nfd, id_nfc, "NFD and NFC heading paths must hash equal");
}
/// M7 (revised by p9-fb-07) — `metadata.user["title"] = ""` lifts
/// as an empty string but the new derive_title fallback chain
/// promotes the file stem so the resulting title is non-empty.
/// spec p9-fb-07: "returning an empty string is forbidden".
#[test]
fn title_empty_string_in_user_map_falls_back_to_file_stem() {
let asset = fixture_asset();
let mut metadata = fixture_metadata();
metadata
.user
.insert("title".into(), Value::String(String::new()));
let pv = parser_version();
let doc =
build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap();
// workspace_path = "notes/example.md" → stem "example".
assert_eq!(doc.title, "example");
}
/// M7 (revised by p9-fb-07) — `metadata.user["title"] = 42` is
/// non-stringy and silently drops at the lift stage; derive_title
/// then falls back through the chain to the file stem.
/// spec p9-fb-07: "returning an empty string is forbidden".
#[test]
fn title_non_string_in_user_map_falls_back_to_file_stem() {
let asset = fixture_asset();
let mut metadata = fixture_metadata();
metadata
.user
.insert("title".into(), Value::Number(42.into()));
let pv = parser_version();
let doc =
build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap();
assert_eq!(doc.title, "example");
}
/// M7 — non-stringy `lang` (e.g. an array) silently drops. This is
/// defensive: P1-2 frontmatter validates the shape upstream, but we
/// don't trust it.
#[test]
fn lang_invalid_shape_silently_drops() {
let asset = fixture_asset();
let mut metadata = fixture_metadata();
metadata.user.insert("lang".into(), Value::Array(vec![]));
let pv = parser_version();
let doc =
build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap();
assert_eq!(doc.lang, Lang(String::new()));
}
// ── p9-fb-07: derive_title fallback chain ───────────────────────────
/// Single-line source span shared by the `derive_title` helpers.
fn span() -> SourceSpan {
SourceSpan::Line { start: 1, end: 1 }
}
/// Placeholder `CommonBlock` (all-zero block ID, empty heading path)
/// for blocks built directly in `derive_title` tests, which only look
/// at block kind, level and text.
fn common_for_test() -> CommonBlock {
CommonBlock {
block_id: BlockId("0".repeat(32)),
heading_path: vec![],
source_span: span(),
}
}
/// Build a `Block::Heading` with the given level and text.
fn heading(level: u8, text: &str) -> Block {
Block::Heading(HeadingBlock {
common: common_for_test(),
level,
text: text.to_string(),
})
}
/// Build a `Block::Paragraph` with the given text and no inlines.
fn paragraph(text: &str) -> Block {
Block::Paragraph(TextBlock {
common: common_for_test(),
text: text.to_string(),
inlines: vec![],
})
}
/// Step 1 — frontmatter title wins, NFC-normalized.
#[test]
fn derive_title_uses_frontmatter_first() {
let blocks = vec![heading(1, "H1 Title"), paragraph("body")];
assert_eq!(
derive_title("Frontmatter Title", &blocks, "fallback-stem"),
"Frontmatter Title"
);
}
/// Whitespace-only frontmatter title falls through to the next step.
#[test]
fn derive_title_blank_frontmatter_falls_through_to_h1() {
let blocks = vec![heading(1, "First H1")];
assert_eq!(derive_title(" ", &blocks, "stem"), "First H1");
}
/// Step 2 — first H1 wins when frontmatter empty.
#[test]
fn derive_title_uses_h1_when_no_frontmatter() {
let blocks = vec![paragraph("intro"), heading(1, "Real Title"), heading(2, "Sub")];
assert_eq!(derive_title("", &blocks, "stem"), "Real Title");
}
/// Step 3 — first H2 wins when no H1.
#[test]
fn derive_title_uses_h2_when_no_h1() {
// The trailing H1 has blank text, so it does not count as an H1.
let blocks = vec![heading(2, "First H2"), heading(2, "Second H2"), heading(1, "")];
assert_eq!(derive_title("", &blocks, "stem"), "First H2");
}
/// Step 4 — first non-blank Paragraph wins; truncated to 80 chars.
/// Quotes / Lists / Code / Tables / ImageRefs do not qualify.
#[test]
fn derive_title_uses_first_paragraph_excerpt() {
let blocks = vec![
Block::Quote(TextBlock {
common: common_for_test(),
text: "blockquote should be skipped".into(),
inlines: vec![],
}),
Block::Code(CodeBlock {
common: common_for_test(),
lang: None,
code: "code should be skipped".into(),
}),
paragraph("This paragraph wins. Long text that would exceed eighty characters once concatenated end-to-end here."),
];
let title = derive_title("", &blocks, "stem");
assert_eq!(title.chars().count(), 80);
assert!(title.starts_with("This paragraph wins."));
}
/// Step 5 — file stem is the final fallback when there are no
/// usable blocks (e.g. table-only doc with no paragraphs).
#[test]
fn derive_title_falls_back_to_file_stem() {
let blocks = vec![Block::Table(TableBlock {
common: common_for_test(),
headers: vec!["a".into()],
rows: vec![vec!["1".into()]],
})];
assert_eq!(derive_title("", &blocks, "table-only-doc"), "table-only-doc");
}
/// Step 5 sentinel — empty file_stem AND no usable blocks falls back
/// to the literal `"untitled"`. Pathological case (workspace_path
/// with no filename component).
#[test]
fn derive_title_returns_untitled_when_everything_blank() {
assert_eq!(derive_title("", &[], ""), "untitled");
assert_eq!(derive_title(" ", &[], " "), "untitled");
}
/// Korean H1 in NFD form is normalized to NFC before being chosen
/// as the title. Mirrors the heading_path NFC pin elsewhere.
#[test]
fn derive_title_nfc_normalizes_korean_h1() {
let nfd = "\u{1100}\u{1161}".to_string(); // 가 (NFD)
let nfc = "\u{AC00}".to_string(); // 가 (NFC)
let blocks = vec![heading(1, &nfd)];
assert_eq!(derive_title("", &blocks, "stem"), nfc);
}
/// `build_canonical_document` integrates the derive_title chain —
/// when frontmatter title is empty, the first H1 is used.
#[test]
fn build_canonical_document_falls_back_to_first_h1() {
let asset = fixture_asset();
let mut metadata = fixture_metadata();
metadata.user.remove("title");
let blocks = vec![ParsedBlock {
kind: kebab_parse_types::ParsedBlockKind::Heading,
heading_path: vec![],
source_span: span(),
payload: ParsedPayload::Heading {
level: 1,
text: "Lifted From H1".into(),
},
}];
let pv = parser_version();
let doc =
build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap();
assert_eq!(doc.title, "Lifted From H1");
}
/// `build_canonical_document` integrates the file_stem fallback —
/// no frontmatter title, no headings, no paragraphs → filename
/// (stripped of extension).
#[test]
fn build_canonical_document_falls_back_to_file_stem() {
let asset = fixture_asset();
// workspace_path = "notes/example.md" → stem "example"
let mut metadata = fixture_metadata();
metadata.user.remove("title");
let pv = parser_version();
let doc = build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap();
assert_eq!(doc.title, "example");
}
}