p1-4: build_canonical_document core + ID assignment

Implement the §4.3 ordinal rule and §3.4 block lift. Each `ParsedBlock`
maps to a `kb_core::Block` variant carrying a `CommonBlock` whose
`block_id = id_for_block(doc_id, payload_kind, heading_path, ordinal,
source_span)`. Ordinals are scoped to `(heading_path, payload_kind)`,
0-based, in document order — three paragraphs under one H1 get 0/1/2,
a code block under the same H1 starts fresh at 0, a paragraph under a
different H1 also starts at 0.

`payload_kind` is the lowercase-no-spaces convention from §4.2:
"heading", "paragraph", "list", "code", "table", "quote", "imageref",
"audioref".

`ListBlock.items` re-uses the parent list's `CommonBlock` per §3.4 (no
per-item BlockId is allocated). `AudioRefBlock` placeholder fields
(`asset_id`, `duration_ms`) are filled in by P8 — for now we synthesize
the minimal record so the document is well-typed.

Tests pin the four §4.4 ID properties (1000-iteration determinism, NFC
≡ NFD Korean path, `./a/b.md` ≡ `a/b.md`, ordinal grouping). Provenance
and title/lang lift land in the next commit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-30 15:18:19 +00:00
parent c0096ce44b
commit fc05f3a2be

View File

@@ -17,19 +17,23 @@
//! (`kb-parse-md`, `kb-parse-pdf`, …). All parser output flows in via
//! the shared `kb-parse-types` crate.
use std::collections::HashMap;
use anyhow::Result;
use kb_core::{
CanonicalDocument, Lang, Metadata, ParserVersion, Provenance, RawAsset,
AudioRefBlock, Block, BlockId, CanonicalDocument, CodeBlock, CommonBlock, DocumentId,
HeadingBlock, ImageRefBlock, Inline, Lang, ListBlock, Metadata, ParserVersion, Provenance,
RawAsset, TableBlock, TextBlock,
};
use kb_parse_types::{ParsedBlock, Warning};
use kb_parse_types::{ParsedBlock, ParsedPayload, Warning};
pub use kb_core::{id_for_block, id_for_doc};
/// Build a [`CanonicalDocument`] from the raw asset, frontmatter
/// metadata, parser blocks, parser version, and any warnings. Full
/// behavior (block ID assignment, provenance, title/lang lift) is
/// filled in by subsequent commits in this series; this stub establishes
/// the public signature and the doc_id derivation only.
/// metadata, parser blocks, parser version, and any warnings.
///
/// This commit fills in the §4.3 ordinal rule and the §3.4 block lift.
/// `Provenance` and the title/lang lift are added in the next commit.
pub fn build_canonical_document(
asset: &RawAsset,
metadata: Metadata,
@@ -38,24 +42,350 @@ pub fn build_canonical_document(
_warnings: Vec<Warning>,
) -> Result<CanonicalDocument> {
let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, parser_version);
// §4.3 ordinal rule — per (heading_path, block_kind), 0-based,
// document order. A separate counter is kept for each grouping key.
let mut counters: HashMap<(Vec<String>, &'static str), u32> = HashMap::new();
let lifted_blocks: Vec<Block> = blocks
.into_iter()
.map(|pb| lift_block(&doc_id, pb, &mut counters))
.collect();
Ok(CanonicalDocument {
doc_id,
source_asset_id: asset.asset_id.clone(),
workspace_path: asset.workspace_path.clone(),
title: String::new(),
lang: Lang(String::new()),
blocks: Vec::new(),
blocks: lifted_blocks,
metadata,
provenance: Provenance { events: Vec::new() },
parser_version: parser_version.clone(),
schema_version: 1,
doc_version: 1,
})
.map(|d| {
// `blocks` is consumed but not yet lifted — flag it as live to
// satisfy the unused-binding lint until the next commit fills
// in the real lifting logic.
let _ = blocks;
d
})
}
/// Map a `ParsedPayload` variant to the lowercase, no-spaces string used
/// as `block_kind` in the §4.2 ID tuple.
fn payload_kind(payload: &ParsedPayload) -> &'static str {
match payload {
ParsedPayload::Heading { .. } => "heading",
ParsedPayload::Paragraph { .. } => "paragraph",
ParsedPayload::List { .. } => "list",
ParsedPayload::Code { .. } => "code",
ParsedPayload::Table { .. } => "table",
ParsedPayload::Quote { .. } => "quote",
ParsedPayload::ImageRef { .. } => "imageref",
ParsedPayload::AudioRef { .. } => "audioref",
}
}
fn next_ordinal(
counters: &mut HashMap<(Vec<String>, &'static str), u32>,
heading_path: &[String],
kind: &'static str,
) -> u32 {
let key = (heading_path.to_vec(), kind);
let entry = counters.entry(key).or_insert(0);
let ordinal = *entry;
*entry += 1;
ordinal
}
fn lift_block(
doc_id: &DocumentId,
pb: ParsedBlock,
counters: &mut HashMap<(Vec<String>, &'static str), u32>,
) -> Block {
let kind = payload_kind(&pb.payload);
let ordinal = next_ordinal(counters, &pb.heading_path, kind);
let block_id: BlockId = id_for_block(doc_id, kind, &pb.heading_path, ordinal, &pb.source_span);
let common = CommonBlock {
block_id,
heading_path: pb.heading_path,
source_span: pb.source_span,
};
match pb.payload {
ParsedPayload::Heading { level, text } => Block::Heading(HeadingBlock {
common,
level,
text,
}),
ParsedPayload::Paragraph { text, inlines } => Block::Paragraph(TextBlock {
common,
text,
inlines,
}),
ParsedPayload::List { ordered, items } => Block::List(ListBlock {
common: common.clone(),
ordered,
items: items
.into_iter()
.map(|item_inlines| TextBlock {
// List items inherit the parent list's CommonBlock; spec
// (§3.4) defines `ListBlock.items: Vec<TextBlock>` and
// does not allocate per-item BlockIds. Re-using the
// parent's common keeps the wire form deterministic
// while letting the inline tree carry the item content.
common: common.clone(),
text: flatten_inlines(&item_inlines),
inlines: item_inlines,
})
.collect(),
}),
ParsedPayload::Code { lang, code } => Block::Code(CodeBlock { common, lang, code }),
ParsedPayload::Table { headers, rows } => Block::Table(TableBlock {
common,
headers,
rows,
}),
ParsedPayload::Quote { text, inlines } => Block::Quote(TextBlock {
common,
text,
inlines,
}),
ParsedPayload::ImageRef { src, alt } => Block::ImageRef(ImageRefBlock {
common,
asset_id: None,
src,
alt,
ocr: None,
caption: None,
}),
// P1-4 does not extract audio metadata from disk — `asset_id`
// and `duration_ms` placeholders are filled in by the audio
// extractor (P8). For now we synthesize a minimal record so
// the document is well-typed.
ParsedPayload::AudioRef { src: _ } => Block::AudioRef(AudioRefBlock {
common,
asset_id: kb_core::AssetId(String::new()),
duration_ms: 0,
transcript: None,
}),
}
}
/// Flatten a `Vec<Inline>` into a plain text string. Used by list-item
/// `TextBlock.text` since `ParsedPayload::List` only carries inline trees
/// per item.
fn flatten_inlines(inlines: &[Inline]) -> String {
let mut out = String::new();
for i in inlines {
flatten_inline(i, &mut out);
}
out
}
fn flatten_inline(i: &Inline, out: &mut String) {
match i {
Inline::Text { text } => out.push_str(text),
Inline::Code { code } => out.push_str(code),
Inline::Link { text, .. } => out.push_str(text),
Inline::Strong { children } | Inline::Emph { children } => {
for c in children {
flatten_inline(c, out);
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use kb_core::{
AssetId, AssetStorage, Checksum, MediaType, SourceSpan, SourceType, SourceUri,
TrustLevel, WorkspacePath, normalize::to_posix,
};
use serde_json::Value;
use std::path::{Path, PathBuf};
use time::OffsetDateTime;
fn fixture_asset() -> RawAsset {
let workspace_path = WorkspacePath::new("notes/example.md".into()).unwrap();
RawAsset {
asset_id: AssetId("a".repeat(32)),
source_uri: SourceUri::File(PathBuf::from("/tmp/example.md")),
workspace_path,
media_type: MediaType::Markdown,
byte_len: 0,
checksum: Checksum("0".repeat(64)),
// Pin a fixed timestamp so determinism tests can compare
// outputs across runs without timestamp jitter outside the
// fields we explicitly strip.
discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
stored: AssetStorage::Reference {
path: PathBuf::from("/tmp/example.md"),
sha: Checksum("0".repeat(64)),
},
}
}
fn fixture_metadata() -> Metadata {
let mut user = serde_json::Map::new();
user.insert("title".into(), Value::String("Example".into()));
user.insert("lang".into(), Value::String("en".into()));
user.insert("custom".into(), Value::Bool(true));
Metadata {
aliases: vec![],
tags: vec![],
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
source_type: SourceType::Markdown,
trust_level: TrustLevel::Primary,
user_id_alias: None,
user,
}
}
fn parser_version() -> ParserVersion {
ParserVersion("kb-normalize-test-0".into())
}
/// `id_for_doc` is deterministic across 1000 invocations on the same
/// input — a regression in canonical JSON or BLAKE3 would surface
/// here immediately.
#[test]
fn id_for_doc_deterministic_1000() {
let path = WorkspacePath::new("a/b.md".into()).unwrap();
let asset = AssetId("0123456789abcdef0123456789abcdef".into());
let pv = ParserVersion("v1".into());
let first = id_for_doc(&path, &asset, &pv);
for _ in 0..1000 {
assert_eq!(id_for_doc(&path, &asset, &pv), first);
}
}
/// NFC vs NFD inputs for the same Korean glyph must produce the
/// same `doc_id` because `to_posix` runs NFC normalization.
#[test]
fn nfc_nfd_korean_path_same_id() {
let nfd = to_posix(Path::new("\u{1100}\u{1161}.md")).unwrap();
let nfc = to_posix(Path::new("\u{AC00}.md")).unwrap();
let asset = AssetId("0123456789abcdef0123456789abcdef".into());
let pv = parser_version();
assert_eq!(id_for_doc(&nfd, &asset, &pv), id_for_doc(&nfc, &asset, &pv));
}
/// `./a/b.md` and `a/b.md` must collapse to the same POSIX form
/// before `id_for_doc`.
#[test]
fn posix_curdir_collapses_to_same_id() {
let a = to_posix(Path::new("./a/b.md")).unwrap();
let b = to_posix(Path::new("a/b.md")).unwrap();
let asset = AssetId("0123456789abcdef0123456789abcdef".into());
let pv = parser_version();
assert_eq!(id_for_doc(&a, &asset, &pv), id_for_doc(&b, &asset, &pv));
}
/// Ordinals are scoped to (heading_path, block_kind) per §4.3:
/// three paragraphs under H1 → 0/1/2; a code block under the same
/// H1 starts a fresh counter at 0; a paragraph under a different
/// H1 also starts a fresh counter at 0.
#[test]
fn block_ordinals_scoped_per_heading_and_kind() {
let span = SourceSpan::Line { start: 1, end: 1 };
let h1_a = vec!["A".to_string()];
let h1_b = vec!["B".to_string()];
let blocks = vec![
ParsedBlock {
kind: kb_parse_types::ParsedBlockKind::Paragraph,
heading_path: h1_a.clone(),
source_span: span.clone(),
payload: ParsedPayload::Paragraph {
text: "p1".into(),
inlines: vec![],
},
},
ParsedBlock {
kind: kb_parse_types::ParsedBlockKind::Paragraph,
heading_path: h1_a.clone(),
source_span: SourceSpan::Line { start: 2, end: 2 },
payload: ParsedPayload::Paragraph {
text: "p2".into(),
inlines: vec![],
},
},
ParsedBlock {
kind: kb_parse_types::ParsedBlockKind::Paragraph,
heading_path: h1_a.clone(),
source_span: SourceSpan::Line { start: 3, end: 3 },
payload: ParsedPayload::Paragraph {
text: "p3".into(),
inlines: vec![],
},
},
ParsedBlock {
kind: kb_parse_types::ParsedBlockKind::Code,
heading_path: h1_a.clone(),
source_span: SourceSpan::Line { start: 4, end: 5 },
payload: ParsedPayload::Code {
lang: None,
code: "x".into(),
},
},
ParsedBlock {
kind: kb_parse_types::ParsedBlockKind::Paragraph,
heading_path: h1_b.clone(),
source_span: SourceSpan::Line { start: 6, end: 6 },
payload: ParsedPayload::Paragraph {
text: "q1".into(),
inlines: vec![],
},
},
];
let asset = fixture_asset();
let metadata = fixture_metadata();
let pv = parser_version();
let doc =
build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap();
// Compute the expected IDs out-of-band so the test pins both
// the (heading_path, kind) ordinal grouping AND the value of
// each block_id under the recipe.
let p1 = id_for_block(&doc.doc_id, "paragraph", &h1_a, 0, &span);
let p2 = id_for_block(
&doc.doc_id,
"paragraph",
&h1_a,
1,
&SourceSpan::Line { start: 2, end: 2 },
);
let p3 = id_for_block(
&doc.doc_id,
"paragraph",
&h1_a,
2,
&SourceSpan::Line { start: 3, end: 3 },
);
let c0 = id_for_block(
&doc.doc_id,
"code",
&h1_a,
0,
&SourceSpan::Line { start: 4, end: 5 },
);
let q0 = id_for_block(
&doc.doc_id,
"paragraph",
&h1_b,
0,
&SourceSpan::Line { start: 6, end: 6 },
);
let ids: Vec<&BlockId> = doc
.blocks
.iter()
.map(|b| match b {
Block::Paragraph(t) | Block::Quote(t) => &t.common.block_id,
Block::Heading(h) => &h.common.block_id,
Block::List(l) => &l.common.block_id,
Block::Code(c) => &c.common.block_id,
Block::Table(t) => &t.common.block_id,
Block::ImageRef(i) => &i.common.block_id,
Block::AudioRef(a) => &a.common.block_id,
})
.collect();
assert_eq!(ids, vec![&p1, &p2, &p3, &c0, &q0]);
}
}