Merge pull request 'feat(p1-4): kb-normalize + kb-core Inline schema hotfix' (#9) from feat/p1-4-normalize into main

Reviewed-on: altair823-org/kb#9
This commit was merged in pull request #9.
This commit is contained in:
2026-04-30 16:23:16 +00:00
11 changed files with 1249 additions and 166 deletions

15
Cargo.lock generated
View File

@@ -577,6 +577,21 @@ dependencies = [
"unicode-normalization",
]
[[package]]
name = "kb-normalize"
version = "0.1.0"
dependencies = [
"anyhow",
"kb-core",
"kb-parse-md",
"kb-parse-types",
"serde",
"serde_json",
"time",
"tracing",
"unicode-normalization",
]
[[package]]
name = "kb-parse-md"
version = "0.1.0"

View File

@@ -6,6 +6,7 @@ members = [
"crates/kb-config",
"crates/kb-source-fs",
"crates/kb-parse-md",
"crates/kb-normalize",
"crates/kb-app",
"crates/kb-cli",
]

View File

@@ -100,11 +100,11 @@ pub struct AudioRefBlock {
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase", tag = "kind")]
pub enum Inline {
Text(String),
Code(String),
Text { text: String },
Code { code: String },
Link { text: String, href: String },
Strong(Vec<Inline>),
Emph(Vec<Inline>),
Strong { children: Vec<Inline> },
Emph { children: Vec<Inline> },
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
@@ -175,3 +175,37 @@ pub struct TranscriptSegment {
pub speaker: Option<String>,
pub confidence: Option<f32>,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Round-trips one sample of every `Inline` variant through JSON and
    /// checks the decoded value equals the original.
    ///
    /// Background: the earlier newtype variants (`Text(String)`,
    /// `Code(String)`, `Strong(Vec<…>)`, `Emph(Vec<…>)`) failed at serde
    /// runtime, because the internally-tagged representation
    /// (`tag = "kind"`) cannot describe a newtype wrapping a non-struct
    /// value. The struct-variant shape exercised here is the §9 schema
    /// migration.
    #[test]
    fn inline_serde_round_trip() {
        let text = |s: &str| Inline::Text { text: s.into() };
        let samples = [
            text("hi"),
            Inline::Code { code: "x".into() },
            Inline::Link {
                text: "t".into(),
                href: "h".into(),
            },
            Inline::Strong {
                children: vec![text("bold")],
            },
            Inline::Emph {
                children: vec![text("em")],
            },
        ];
        for original in samples {
            let json = serde_json::to_string(&original).expect("serialize");
            let decoded: Inline = serde_json::from_str(&json).expect("deserialize");
            assert_eq!(original, decoded);
        }
    }
}

View File

@@ -0,0 +1,27 @@
[package]
name = "kb-normalize"
version = { workspace = true }
edition = { workspace = true }
rust-version = { workspace = true }
license = { workspace = true }
repository = { workspace = true }
description = "Lift parser output (kb-parse-types) into kb-core::CanonicalDocument with deterministic IDs (§3.4, §4.2, §4.3)"
[dependencies]
kb-core = { path = "../kb-core" }
kb-parse-types = { path = "../kb-parse-types" }
serde = { workspace = true }
serde_json = { workspace = true }
unicode-normalization = "0.1"
time = { workspace = true }
anyhow = { workspace = true }
tracing = { workspace = true }
[dev-dependencies]
# kb-parse-md is permitted as a *dev*-dependency only — used by the
# integration snapshot test to drive a fixture through the real parser.
# Forbidden as a regular dep per design §8 (kb-normalize must not depend
# on any specific parser); `cargo tree -p kb-normalize --depth 1 -e normal`
# confirms this. The `-e normal` filter is required: plain `cargo tree`
# shows normal, build AND dev edges, so it would list kb-parse-md.
kb-parse-md = { path = "../kb-parse-md" }
serde_json = { workspace = true }

View File

@@ -0,0 +1,843 @@
//! `kb-normalize` — lift parser output (`kb-parse-types`) into a
//! [`kb_core::CanonicalDocument`] with deterministic IDs.
//!
//! Per design §3.4 (CanonicalDocument / Block), §4.2 (ID recipe), §4.3
//! (ordinal rule), §3.6 (Provenance), §8 (module boundaries).
//!
//! Public surface:
//!
//! * [`build_canonical_document`] — assemble a `CanonicalDocument` from
//! `(RawAsset, Metadata, Vec<ParsedBlock>, ParserVersion, Vec<Warning>)`.
//! * [`id_for_doc`], [`id_for_block`] — re-exports of the canonical
//! ID-recipe functions in `kb-core::ids` (§4.2). `kb-core` is the only
//! implementation; `kb-normalize` is the canonical *entry point* per
//! design §8.
//!
//! This crate must NOT depend on any parser implementation crate
//! (`kb-parse-md`, `kb-parse-pdf`, …). All parser output flows in via
//! the shared `kb-parse-types` crate.
use std::collections::HashMap;
use anyhow::Result;
use kb_core::{
Block, BlockId, CanonicalDocument, CodeBlock, CommonBlock, DocumentId, HeadingBlock,
ImageRefBlock, Inline, Lang, ListBlock, Metadata, ParserVersion, Provenance, ProvenanceEvent,
ProvenanceKind, RawAsset, TableBlock, TextBlock,
};
use kb_parse_types::{ParsedBlock, ParsedPayload, Warning, WarningKind};
use time::OffsetDateTime;
use unicode_normalization::UnicodeNormalization;
pub use kb_core::{id_for_block, id_for_doc};
/// Assemble a [`CanonicalDocument`] from a discovered asset, its
/// frontmatter metadata, the parser's blocks, the parser version, and any
/// upstream warnings.
///
/// Contract (design §3.4 / §4.2 / §4.3 / §3.6):
///
/// * `doc_id` is `id_for_doc(workspace_path, asset_id, parser_version)`;
///   `asset.workspace_path` is passed through unchanged.
/// * Every block is lifted through `lift_block`, which assigns a
///   `block_id` using a 0-based ordinal scoped to
///   `(heading_path, block_kind)` in document order (§4.3). Blocks for
///   which `lift_block` returns `None` are omitted from the output.
/// * `metadata.user["title"]` and `metadata.user["lang"]` are *removed*
///   from the user map. When the removed value is a JSON string it
///   becomes `title` / `lang`; any other shape (or a missing key) yields
///   an empty title / empty `Lang`, and the non-string value is
///   discarded.
/// * Provenance events are emitted in this order: `Discovered`
///   (timestamped `asset.discovered_at`), `Parsed`, `Normalized`, one
///   `Warning` per entry of `warnings` (agent resolved via
///   `warning_agent`), then one `Warning` per lift-stage warning (agent
///   `kb-normalize`). Everything except `Discovered` shares a single
///   `now_utc()` reading.
/// * `schema_version` and `doc_version` are both set to `1`.
///
/// The current body has no failing path; it always returns `Ok`.
pub fn build_canonical_document(
    asset: &RawAsset,
    metadata: Metadata,
    blocks: Vec<ParsedBlock>,
    parser_version: &ParserVersion,
    warnings: Vec<Warning>,
) -> Result<CanonicalDocument> {
    let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, parser_version);

    // Pull title / lang out of the free-form user map so the serialized
    // document does not carry the same data twice. The keys are removed
    // even when the value turns out not to be a string.
    let mut metadata = metadata;
    let raw_title = metadata.user.remove("title");
    let title = raw_title
        .as_ref()
        .and_then(|value| value.as_str())
        .map(str::to_owned)
        .unwrap_or_default();
    let raw_lang = metadata.user.remove("lang");
    let lang = match raw_lang.as_ref().and_then(|value| value.as_str()) {
        Some(code) => Lang(code.to_string()),
        None => Lang(String::new()),
    };

    // One ordinal counter per (heading_path, block_kind) pair (§4.3).
    let mut ordinal_counters: HashMap<(Vec<String>, &'static str), u32> = HashMap::new();
    // Warnings raised while lifting (as opposed to upstream ones) are
    // collected separately so they can be attributed to `kb-normalize`.
    let mut lift_stage_warnings: Vec<Warning> = Vec::new();

    let mut lifted_blocks: Vec<Block> = Vec::new();
    for parsed in blocks {
        let lifted = lift_block(
            &doc_id,
            parsed,
            &mut ordinal_counters,
            &mut lift_stage_warnings,
        );
        if let Some(block) = lifted {
            lifted_blocks.push(block);
        }
    }

    tracing::debug!(
        target: "kb-normalize",
        "built canonical document doc_id={} blocks={}",
        doc_id.0,
        lifted_blocks.len()
    );

    // A single clock reading is shared by every non-Discovered event;
    // relative ordering is carried by position in the Vec.
    let now = OffsetDateTime::now_utc();
    // `{:?}` on the warning kind prints the variant name, which is what
    // we want in a human-readable provenance trace.
    let warning_event = |agent: String, w: Warning| ProvenanceEvent {
        at: now,
        agent,
        kind: ProvenanceKind::Warning,
        note: Some(format!("{:?}: {}", w.kind, w.note)),
    };

    let mut events: Vec<ProvenanceEvent> =
        Vec::with_capacity(3 + warnings.len() + lift_stage_warnings.len());
    events.extend([
        ProvenanceEvent {
            at: asset.discovered_at,
            agent: "kb-source-fs".to_string(),
            kind: ProvenanceKind::Discovered,
            note: None,
        },
        // NOTE(review): the parse-stage agent is hard-coded to
        // `kb-parse-md`; a document produced by another parser would be
        // attributed to the Markdown parser here.
        ProvenanceEvent {
            at: now,
            agent: "kb-parse-md".to_string(),
            kind: ProvenanceKind::Parsed,
            note: Some(format!("parser_version={}", parser_version.0)),
        },
        ProvenanceEvent {
            at: now,
            agent: "kb-normalize".to_string(),
            kind: ProvenanceKind::Normalized,
            note: None,
        },
    ]);
    // Upstream warnings first, attributed per kind...
    events.extend(warnings.into_iter().map(|w| {
        let agent = warning_agent(&w.kind).to_string();
        warning_event(agent, w)
    }));
    // ...then lift-stage warnings, always attributed to this crate.
    events.extend(
        lift_stage_warnings
            .into_iter()
            .map(|w| warning_event("kb-normalize".to_string(), w)),
    );

    Ok(CanonicalDocument {
        doc_id,
        source_asset_id: asset.asset_id.clone(),
        workspace_path: asset.workspace_path.clone(),
        title,
        lang,
        blocks: lifted_blocks,
        metadata,
        provenance: Provenance { events },
        parser_version: parser_version.clone(),
        schema_version: 1,
        doc_version: 1,
    })
}
/// Name the upstream agent responsible for a given `WarningKind`; the
/// result fills `ProvenanceEvent::agent` for that warning's event.
///
/// Every kind currently maps to `"kb-parse-md"`. That includes
/// `ExtractFailed`, which today is emitted by `kb-parse-md`'s
/// panic-recovery guard around `parse_blocks` (see
/// `crates/kb-parse-md/src/blocks.rs`). Should another stage start
/// emitting `ExtractFailed`, the kind alone will no longer identify the
/// source and `Warning` will need extra context (e.g. a `WarningSource`
/// field) to keep attribution honest.
fn warning_agent(kind: &WarningKind) -> &'static str {
    // No `_` arm: a new `WarningKind` variant stops this from compiling
    // until its agent is decided explicitly.
    match kind {
        WarningKind::MalformedFrontmatter
        | WarningKind::EncodingFallback
        | WarningKind::MalformedTable
        | WarningKind::ExtractFailed => "kb-parse-md",
    }
}
/// Return the `block_kind` string for a payload variant: the lowercase,
/// space-free tag that feeds the §4.2 ID tuple and keys the ordinal
/// counters.
fn payload_kind(payload: &ParsedPayload) -> &'static str {
    let tag: &'static str = match payload {
        // Text-bearing blocks.
        ParsedPayload::Heading { .. } => "heading",
        ParsedPayload::Paragraph { .. } => "paragraph",
        ParsedPayload::Quote { .. } => "quote",
        // Structured blocks.
        ParsedPayload::List { .. } => "list",
        ParsedPayload::Table { .. } => "table",
        ParsedPayload::Code { .. } => "code",
        // Media references.
        ParsedPayload::ImageRef { .. } => "imageref",
        ParsedPayload::AudioRef { .. } => "audioref",
    };
    tag
}
/// Hand out the next 0-based ordinal for the `(heading_path, kind)` group
/// and advance that group's counter by one.
///
/// A group seen for the first time starts at 0. The heading path is
/// copied into the map key on every call.
fn next_ordinal(
    counters: &mut HashMap<(Vec<String>, &'static str), u32>,
    heading_path: &[String],
    kind: &'static str,
) -> u32 {
    let slot = counters
        .entry((heading_path.to_vec(), kind))
        .or_default();
    let current = *slot;
    *slot = current + 1;
    current
}
fn lift_block(
doc_id: &DocumentId,
pb: ParsedBlock,
counters: &mut HashMap<(Vec<String>, &'static str), u32>,
warnings: &mut Vec<Warning>,
) -> Option<Block> {
let kind = payload_kind(&pb.payload);
// Task spec line 73: "All input strings normalized to NFC before
// hashing." `pulldown-cmark` does not NFC heading text, and
// `serde_json_canonicalizer` v0.3 does not normalize strings either,
// so we must NFC-normalize `heading_path` here before it feeds both
// the §4.2 ID recipe AND the on-disk `CommonBlock.heading_path` (so
// wire form matches ID input). Without this, NFD `\u{1100}\u{1161}`
// and NFC `\u{AC00}` (both render as 가) would produce different
// `block_id`s for what is logically the same heading.
let heading_path_nfc: Vec<String> =
pb.heading_path.iter().map(|s| s.nfc().collect()).collect();
let ordinal = next_ordinal(counters, &heading_path_nfc, kind);
let block_id: BlockId =
id_for_block(doc_id, kind, &heading_path_nfc, ordinal, &pb.source_span);
let common = CommonBlock {
block_id,
heading_path: heading_path_nfc,
source_span: pb.source_span,
};
let block = match pb.payload {
ParsedPayload::Heading { level, text } => Block::Heading(HeadingBlock {
common,
level,
text,
}),
ParsedPayload::Paragraph { text, inlines } => Block::Paragraph(TextBlock {
common,
text,
inlines,
}),
ParsedPayload::List { ordered, items } => Block::List(ListBlock {
common: common.clone(),
ordered,
items: items
.into_iter()
.map(|item_inlines| TextBlock {
// All list items currently inherit the parent's
// CommonBlock (incl. block_id). Per-item IDs would
// require a §4.2 recipe extension. Spec (§3.4)
// defines `ListBlock.items: Vec<TextBlock>` and
// does not allocate per-item BlockIds. Re-using the
// parent's common keeps the wire form deterministic
// while letting the inline tree carry the item
// content.
common: common.clone(),
text: flatten_inlines(&item_inlines),
inlines: item_inlines,
})
.collect(),
}),
ParsedPayload::Code { lang, code } => Block::Code(CodeBlock { common, lang, code }),
ParsedPayload::Table { headers, rows } => Block::Table(TableBlock {
common,
headers,
rows,
}),
ParsedPayload::Quote { text, inlines } => Block::Quote(TextBlock {
common,
text,
inlines,
}),
ParsedPayload::ImageRef { src, alt } => Block::ImageRef(ImageRefBlock {
common,
asset_id: None,
src,
alt,
ocr: None,
caption: None,
}),
// TODO(P8): audio extractor will resolve workspace assets and
// produce real AssetIds. This skip-and-warn shim is a
// placeholder. `AssetId::from_str` requires a 32-hex string, so
// synthesizing `AssetId(String::new())` would break the
// invariant — instead we drop the block and surface a Warning
// (attributed to `kb-normalize` per §3.6 since this is the
// lift-stage decision).
ParsedPayload::AudioRef { src } => {
warnings.push(Warning {
kind: WarningKind::ExtractFailed,
note: format!(
"audio-ref AssetId resolution deferred to P8 — block dropped (src={src})"
),
});
return None;
}
};
Some(block)
}
/// Concatenate the plain text of a sequence of inline nodes, in order.
///
/// Needed for list items: `ParsedPayload::List` carries only an inline
/// tree per item, while `TextBlock.text` wants a flat string.
fn flatten_inlines(inlines: &[Inline]) -> String {
    inlines.iter().fold(String::new(), |mut acc, node| {
        flatten_inline(node, &mut acc);
        acc
    })
}
/// Append the plain text of a single inline node to `out`.
///
/// `Text` and `Code` contribute their content, `Link` contributes its
/// display text (the `href` is ignored), and `Strong` / `Emph` contribute
/// the text of their children, visited recursively in order.
fn flatten_inline(i: &Inline, out: &mut String) {
    let leaf: &str = match i {
        // Containers have no text of their own; recurse and stop.
        Inline::Strong { children } | Inline::Emph { children } => {
            for child in children {
                flatten_inline(child, out);
            }
            return;
        }
        Inline::Text { text } => text.as_str(),
        Inline::Code { code } => code.as_str(),
        Inline::Link { text, .. } => text.as_str(),
    };
    out.push_str(leaf);
}
#[cfg(test)]
mod tests {
use super::*;
use kb_core::{
AssetId, AssetStorage, Checksum, MediaType, SourceSpan, SourceType, SourceUri,
TrustLevel, WorkspacePath, normalize::to_posix,
};
use serde_json::Value;
use std::path::{Path, PathBuf};
use time::OffsetDateTime;
/// A Markdown `RawAsset` with every field pinned: fixed IDs, all-zero
/// checksums, and a fixed `discovered_at`, so tests can compare outputs
/// across runs.
fn fixture_asset() -> RawAsset {
    let on_disk = PathBuf::from("/tmp/example.md");
    let zero_checksum = || Checksum("0".repeat(64));
    RawAsset {
        asset_id: AssetId("a".repeat(32)),
        source_uri: SourceUri::File(on_disk.clone()),
        workspace_path: WorkspacePath::new("notes/example.md".into()).unwrap(),
        media_type: MediaType::Markdown,
        byte_len: 0,
        checksum: zero_checksum(),
        // Fixed instant: the Discovered provenance event is the one
        // timestamp the determinism tests do not strip.
        discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
        stored: AssetStorage::Reference {
            path: on_disk,
            sha: zero_checksum(),
        },
    }
}
fn fixture_metadata() -> Metadata {
let mut user = serde_json::Map::new();
user.insert("title".into(), Value::String("Example".into()));
user.insert("lang".into(), Value::String("en".into()));
user.insert("custom".into(), Value::Bool(true));
Metadata {
aliases: vec![],
tags: vec![],
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
source_type: SourceType::Markdown,
trust_level: TrustLevel::Primary,
user_id_alias: None,
user,
}
}
/// The fixed parser version string shared by the tests in this module.
fn parser_version() -> ParserVersion {
    ParserVersion(String::from("kb-normalize-test-0"))
}
/// Five parser blocks shared by the ordinal-scoping test and the
/// determinism stress test (so the latter runs the `lift_block` path
/// rather than an empty input): three paragraphs and a code block under
/// heading `A`, then one paragraph under heading `B`.
fn fixture_blocks_five() -> Vec<ParsedBlock> {
    let paragraph = |section: &str, line, body: &str| ParsedBlock {
        kind: kb_parse_types::ParsedBlockKind::Paragraph,
        heading_path: vec![section.to_string()],
        source_span: SourceSpan::Line {
            start: line,
            end: line,
        },
        payload: ParsedPayload::Paragraph {
            text: body.into(),
            inlines: vec![],
        },
    };
    let code_under_a = ParsedBlock {
        kind: kb_parse_types::ParsedBlockKind::Code,
        heading_path: vec!["A".to_string()],
        source_span: SourceSpan::Line { start: 4, end: 5 },
        payload: ParsedPayload::Code {
            lang: None,
            code: "x".into(),
        },
    };
    vec![
        paragraph("A", 1, "p1"),
        paragraph("A", 2, "p2"),
        paragraph("A", 3, "p3"),
        code_under_a,
        paragraph("B", 6, "q1"),
    ]
}
/// Calling `id_for_doc` 1000 times with the same input must always give
/// the first result; a regression in canonical JSON or BLAKE3 would show
/// up here immediately.
#[test]
fn id_for_doc_deterministic_1000() {
    let workspace_path = WorkspacePath::new("a/b.md".into()).unwrap();
    let asset_id = AssetId("0123456789abcdef0123456789abcdef".into());
    let version = ParserVersion("v1".into());
    let expected = id_for_doc(&workspace_path, &asset_id, &version);
    (0..1000).for_each(|_| {
        assert_eq!(id_for_doc(&workspace_path, &asset_id, &version), expected);
    });
}
/// The NFD and NFC spellings of the same Korean glyph must give the same
/// `doc_id`, because `to_posix` NFC-normalizes the path first.
#[test]
fn nfc_nfd_korean_path_same_id() {
    let asset = AssetId("0123456789abcdef0123456789abcdef".into());
    let pv = parser_version();
    let doc_id_of = |raw: &str| id_for_doc(&to_posix(Path::new(raw)).unwrap(), &asset, &pv);

    let from_nfd = doc_id_of("\u{1100}\u{1161}.md");
    let from_nfc = doc_id_of("\u{AC00}.md");
    assert_eq!(from_nfd, from_nfc);
}
/// A leading `./` must not influence the ID: `./a/b.md` and `a/b.md`
/// reduce to the same POSIX form before `id_for_doc` sees them.
#[test]
fn posix_curdir_collapses_to_same_id() {
    let asset = AssetId("0123456789abcdef0123456789abcdef".into());
    let pv = parser_version();
    let doc_id_of = |raw: &str| id_for_doc(&to_posix(Path::new(raw)).unwrap(), &asset, &pv);

    let with_curdir = doc_id_of("./a/b.md");
    let without_curdir = doc_id_of("a/b.md");
    assert_eq!(with_curdir, without_curdir);
}
/// §4.3: ordinals are scoped to `(heading_path, block_kind)`. Three
/// paragraphs under heading `A` get 0/1/2, a code block under the same
/// heading starts its own counter at 0, and a paragraph under heading
/// `B` also starts at 0.
#[test]
fn block_ordinals_scoped_per_heading_and_kind() {
    let h1_a = vec!["A".to_string()];
    let h1_b = vec!["B".to_string()];
    let asset = fixture_asset();
    let pv = parser_version();
    let doc = build_canonical_document(
        &asset,
        fixture_metadata(),
        fixture_blocks_five(),
        &pv,
        vec![],
    )
    .unwrap();

    // Expected IDs are recomputed independently from
    // (kind, heading path, ordinal, span), so the test pins both the
    // ordinal grouping and the resulting `block_id` values.
    let expected: Vec<BlockId> = [
        ("paragraph", &h1_a, 0, 1, 1),
        ("paragraph", &h1_a, 1, 2, 2),
        ("paragraph", &h1_a, 2, 3, 3),
        ("code", &h1_a, 0, 4, 5),
        ("paragraph", &h1_b, 0, 6, 6),
    ]
    .into_iter()
    .map(|(kind, path, ordinal, start, end)| {
        id_for_block(
            &doc.doc_id,
            kind,
            path,
            ordinal,
            &SourceSpan::Line { start, end },
        )
    })
    .collect();

    let actual: Vec<&BlockId> = doc
        .blocks
        .iter()
        .map(|block| match block {
            Block::Heading(h) => &h.common.block_id,
            Block::Paragraph(t) | Block::Quote(t) => &t.common.block_id,
            Block::List(l) => &l.common.block_id,
            Block::Code(c) => &c.common.block_id,
            Block::Table(t) => &t.common.block_id,
            Block::ImageRef(i) => &i.common.block_id,
            Block::AudioRef(a) => &a.common.block_id,
        })
        .collect();

    assert_eq!(actual, expected.iter().collect::<Vec<&BlockId>>());
}
/// With no warnings, provenance holds exactly `Discovered`, `Parsed`,
/// `Normalized`, in that order, with the expected agents and timestamps.
#[test]
fn provenance_contains_stage_events_in_order() {
    let asset = fixture_asset();
    let pv = parser_version();
    let doc =
        build_canonical_document(&asset, fixture_metadata(), vec![], &pv, vec![]).unwrap();
    let events = &doc.provenance.events;

    let kinds: Vec<_> = events.iter().map(|e| e.kind).collect();
    assert_eq!(
        kinds,
        vec![
            ProvenanceKind::Discovered,
            ProvenanceKind::Parsed,
            ProvenanceKind::Normalized,
        ]
    );

    // Discovered carries the asset's own timestamp.
    assert_eq!(events[0].at, asset.discovered_at);

    let agents: Vec<&str> = events.iter().map(|e| e.agent.as_str()).collect();
    assert_eq!(agents, ["kb-source-fs", "kb-parse-md", "kb-normalize"]);

    // Parsed and Normalized come from the same `now_utc()` reading
    // within one call.
    assert_eq!(events[1].at, events[2].at, "Parsed and Normalized share now_utc");
}
/// An upstream warning passed to `build_canonical_document` becomes a
/// trailing `ProvenanceKind::Warning` event attributed to the upstream
/// agent, with the warning's note preserved.
#[test]
fn provenance_includes_warnings() {
    let asset = fixture_asset();
    let pv = parser_version();
    let upstream = Warning {
        kind: WarningKind::MalformedFrontmatter,
        note: "missing closing fence".into(),
    };
    let doc = build_canonical_document(&asset, fixture_metadata(), vec![], &pv, vec![upstream])
        .unwrap();

    let events = &doc.provenance.events;
    // Three stage events plus the single warning.
    assert_eq!(events.len(), 4);

    let warning_event = events.last().unwrap();
    assert_eq!(warning_event.kind, ProvenanceKind::Warning);
    assert_eq!(warning_event.agent, "kb-parse-md");
    let note = warning_event.note.as_deref().unwrap();
    assert!(note.contains("missing closing fence"));
}
/// String `title` / `lang` entries in `metadata.user` move into the
/// dedicated `CanonicalDocument` fields and disappear from the user map
/// (no duplication on the wire); unrelated user keys are left alone.
#[test]
fn lifts_title_and_lang_from_user_map() {
    let asset = fixture_asset();
    let pv = parser_version();
    let doc =
        build_canonical_document(&asset, fixture_metadata(), vec![], &pv, vec![]).unwrap();

    assert_eq!(doc.title, "Example");
    assert_eq!(doc.lang, Lang("en".into()));

    let user = &doc.metadata.user;
    for lifted_key in ["title", "lang"] {
        assert!(!user.contains_key(lifted_key));
    }
    assert!(user.contains_key("custom"));
}
/// Determinism property: 1000 runs of `build_canonical_document` on
/// identical inputs serialize to identical JSON once the
/// `now_utc()`-derived timestamps are masked. The 1000 runs must also
/// finish within one second.
#[test]
fn determinism_1000_iterations_under_1s() {
    /// Serialize `doc` and overwrite `at` on every provenance event
    /// except index 0. Index 0 is Discovered, whose timestamp is pinned
    /// by the fixture; every later event carries a clock reading.
    fn strip_dynamic_at(doc: &CanonicalDocument) -> Value {
        let mut json = serde_json::to_value(doc).unwrap();
        let events = json
            .pointer_mut("/provenance/events")
            .and_then(Value::as_array_mut);
        for event in events.into_iter().flatten().skip(1) {
            if let Some(fields) = event.as_object_mut() {
                fields.insert("at".into(), Value::String("<stripped>".into()));
            }
        }
        json
    }

    let asset = fixture_asset();
    let metadata = fixture_metadata();
    let pv = parser_version();

    // The 5-block fixture keeps the `lift_block` path (block_id hashing,
    // NFC normalization, ordinal counters) inside the measured work
    // instead of an empty Vec.
    let render = || {
        let doc = build_canonical_document(
            &asset,
            metadata.clone(),
            fixture_blocks_five(),
            &pv,
            vec![],
        )
        .unwrap();
        serde_json::to_string(&strip_dynamic_at(&doc)).unwrap()
    };

    let baseline_json = render();
    let start = std::time::Instant::now();
    for _ in 0..1000 {
        let next_json = render();
        assert_eq!(baseline_json, next_json);
    }
    assert!(
        start.elapsed() < std::time::Duration::from_secs(1),
        "1000 iterations took {:?}",
        start.elapsed()
    );
}
/// I1 regression: an upstream `WarningKind::ExtractFailed` originates in
/// `kb-parse-md` (panic recovery in `blocks.rs`), so its provenance event
/// must name `"kb-parse-md"` as agent. Attributing it to
/// `"kb-normalize"` would mislabel parse panics and break stage-filtered
/// debugging.
#[test]
fn provenance_with_extract_failed_warning_attributes_to_kb_parse_md() {
    let asset = fixture_asset();
    let pv = parser_version();
    let upstream = Warning {
        kind: WarningKind::ExtractFailed,
        note: "pulldown-cmark panicked; body discarded".into(),
    };
    let doc = build_canonical_document(&asset, fixture_metadata(), vec![], &pv, vec![upstream])
        .unwrap();

    let mut warning_events = doc
        .provenance
        .events
        .iter()
        .filter(|e| e.kind == ProvenanceKind::Warning);
    let warning_event = warning_events.next().expect("warning event present");

    assert_eq!(warning_event.agent, "kb-parse-md");
    let note = warning_event.note.as_deref().unwrap();
    assert!(note.contains("ExtractFailed"));
}
/// I2 regression: a `ParsedPayload::AudioRef` is dropped rather than
/// lifted into a `Block::AudioRef` with a fabricated empty `AssetId`
/// (which would violate `AssetId::from_str`'s 32-hex invariant). The
/// drop surfaces as one provenance `Warning` attributed to
/// `"kb-normalize"`, since the decision is made at the lift stage.
#[test]
fn audio_ref_block_skipped_with_warning() {
    let audio = ParsedBlock {
        kind: kb_parse_types::ParsedBlockKind::AudioRef,
        heading_path: vec![],
        source_span: SourceSpan::Line { start: 1, end: 1 },
        payload: ParsedPayload::AudioRef {
            src: "voice.m4a".into(),
        },
    };
    let asset = fixture_asset();
    let pv = parser_version();
    let doc = build_canonical_document(&asset, fixture_metadata(), vec![audio], &pv, vec![])
        .unwrap();

    // The canonical output contains no AudioRef block.
    let has_audio = doc.blocks.iter().any(|b| matches!(b, Block::AudioRef(_)));
    assert!(!has_audio, "AudioRef block should be skipped pre-P8");

    // Exactly one Warning event, naming the dropped source.
    let warning_events: Vec<_> = doc
        .provenance
        .events
        .iter()
        .filter(|e| e.kind == ProvenanceKind::Warning)
        .collect();
    assert_eq!(warning_events.len(), 1);
    let only = warning_events[0];
    assert_eq!(only.agent, "kb-normalize");
    assert!(only.note.as_deref().unwrap().contains("voice.m4a"));
}
/// I3 regression: heading-path strings are NFC-normalized before they
/// reach `id_for_block`, so canonically equivalent NFD and NFC headings
/// give the same `block_id`. Counterpart of
/// `nfc_nfd_korean_path_same_id`, which covers `doc_id`.
#[test]
fn nfc_nfd_korean_heading_path_same_block_id() {
    /// The `block_id` of the first block, which must be a paragraph.
    fn first_paragraph_id(doc: &CanonicalDocument) -> &BlockId {
        match &doc.blocks[0] {
            Block::Paragraph(t) => &t.common.block_id,
            _ => panic!("expected Paragraph"),
        }
    }

    let asset = fixture_asset();
    let pv = parser_version();
    // Build a one-paragraph document whose only heading is `heading`.
    let doc_with_heading = |heading: &str| {
        let block = ParsedBlock {
            kind: kb_parse_types::ParsedBlockKind::Paragraph,
            heading_path: vec![heading.to_string()],
            source_span: SourceSpan::Line { start: 1, end: 1 },
            payload: ParsedPayload::Paragraph {
                text: "p".into(),
                inlines: vec![],
            },
        };
        build_canonical_document(&asset, fixture_metadata(), vec![block], &pv, vec![]).unwrap()
    };

    let doc_nfd = doc_with_heading("\u{1100}\u{1161}"); // 가 (NFD)
    let doc_nfc = doc_with_heading("\u{AC00}"); // 가 (NFC)

    assert_eq!(
        first_paragraph_id(&doc_nfd),
        first_paragraph_id(&doc_nfc),
        "NFD and NFC heading paths must hash equal"
    );
}
/// M7: `metadata.user["title"] = ""` is a string, so it is lifted as-is
/// into an empty `CanonicalDocument.title`. The pinned policy is that an
/// explicit empty string is *not* dropped.
///
/// Despite the test name, no fallback is involved here; the result only
/// happens to equal the default empty title.
#[test]
fn title_empty_string_in_user_map_falls_back_to_default() {
    let asset = fixture_asset();
    let pv = parser_version();
    let mut metadata = fixture_metadata();
    metadata.user.insert("title".into(), Value::from(""));

    let doc = build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap();
    assert_eq!(doc.title, "");
}
/// M7: a non-string `title` (here the number 42) is dropped without
/// error and the document gets the default empty title.
#[test]
fn title_non_string_in_user_map_silently_drops() {
    let asset = fixture_asset();
    let pv = parser_version();
    let mut metadata = fixture_metadata();
    metadata.user.insert("title".into(), Value::from(42));

    let doc = build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap();
    assert_eq!(doc.title, "");
}
/// M7: a non-string `lang` (here an empty array) is dropped without
/// error, leaving an empty `Lang`. This is defensive: P1-2 frontmatter
/// validates the shape upstream, but this crate does not rely on it.
#[test]
fn lang_invalid_shape_silently_drops() {
    let asset = fixture_asset();
    let pv = parser_version();
    let mut metadata = fixture_metadata();
    metadata.user.insert("lang".into(), Value::Array(Vec::new()));

    let doc = build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap();
    assert_eq!(doc.lang, Lang(String::new()));
}
}

View File

@@ -0,0 +1,160 @@
//! Snapshot test pinning the full `CanonicalDocument` JSON for the
//! `code-and-table.md` fixture.
//!
//! This is an integration test (it lives under `tests/`) and depends on
//! `kb-parse-md` only as a dev-dep so the production crate's regular
//! deps still satisfy the §8 boundary (`cargo tree -p kb-normalize
//! --depth 1 -e normal` does not list any parser implementation; plain
//! `cargo tree` includes dev-deps and would list `kb-parse-md`).
//!
//! Non-deterministic fields are stripped before comparison:
//!
//! * `provenance.events[*].at` — each invocation calls `now_utc()` for
//! the Parsed/Normalized/Warning events. The Discovered event uses
//! the asset's pinned `discovered_at`, so we keep that one and replace
//! only indices ≥ 1.
use std::path::PathBuf;
use kb_core::{
AssetId, AssetStorage, Checksum, MediaType, ParserVersion, RawAsset, SourceUri,
WorkspacePath,
};
use kb_normalize::build_canonical_document;
use kb_parse_md::{BodyHints, parse_blocks, parse_frontmatter};
use serde_json::Value;
use time::OffsetDateTime;
/// Path of the shared Markdown fixtures directory: two levels above this
/// crate's manifest directory, then `fixtures/markdown`.
fn fixtures_dir() -> PathBuf {
    let mut dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    dir.extend(["..", "..", "fixtures", "markdown"]);
    dir
}
/// A Markdown `RawAsset` for `workspace_path` with every other field
/// pinned (fixed asset ID, all-zero checksums, fixed `discovered_at`) so
/// the snapshot is reproducible.
fn fixed_asset(workspace_path: &str) -> RawAsset {
    let on_disk = PathBuf::from("/tmp/code-and-table.md");
    let zero_checksum = || Checksum("0".repeat(64));
    RawAsset {
        asset_id: AssetId("a".repeat(32)),
        source_uri: SourceUri::File(on_disk.clone()),
        workspace_path: WorkspacePath::new(workspace_path.into()).unwrap(),
        media_type: MediaType::Markdown,
        byte_len: 0,
        checksum: zero_checksum(),
        // Fixed instant, so the Discovered provenance event is identical
        // on every run.
        discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
        stored: AssetStorage::Reference {
            path: on_disk,
            sha: zero_checksum(),
        },
    }
}
/// Replaces the `at` field of every `provenance.events` entry except the
/// first with the placeholder `"<stripped>"`, then returns the value.
///
/// If `provenance.events` is absent or is not an array, the value is
/// returned unchanged. Event entries that are not JSON objects are left
/// untouched.
fn strip_dynamic(mut v: Value) -> Value {
    let events = v
        .get_mut("provenance")
        .and_then(|provenance| provenance.get_mut("events"))
        .and_then(Value::as_array_mut);

    // `Option` iterates zero or one times; flattening yields the individual
    // events. The first event (index 0) is skipped and keeps its timestamp.
    for event in events.into_iter().flatten().skip(1) {
        if let Some(fields) = event.as_object_mut() {
            fields.insert("at".into(), Value::String("<stripped>".into()));
        }
    }
    v
}
/// Runs `code-and-table.md` through frontmatter parsing, block parsing and
/// `build_canonical_document`, then compares the serialized document (with
/// dynamic timestamps stripped) against the checked-in JSON baseline.
///
/// When the `UPDATE_SNAPSHOTS` environment variable is set, a missing or
/// drifted baseline is (re)written instead of failing the test.
#[test]
fn code_and_table_canonical_snapshot() {
    let fixture_root = fixtures_dir();
    let source =
        std::fs::read(fixture_root.join("code-and-table.md")).expect("fixture readable");

    // The fixture carries no frontmatter, so the metadata is lifted from
    // `BodyHints`. Timestamps are pinned to the asset's `discovered_at` to
    // keep the lifted metadata reproducible.
    //
    // `first_h1` is pinned so the BodyHints → user.title →
    // CanonicalDocument.title lift chain is exercised end-to-end; without
    // it the fixture's missing frontmatter title would leave `title == ""`
    // and the chain would be uncovered by the snapshot.
    let asset = fixed_asset("notes/code-and-table.md");
    let hints = BodyHints {
        first_h1: Some("Code And Table".into()),
        fs_ctime: asset.discovered_at,
        fs_mtime: asset.discovered_at,
        fallback_lang: Some("en".into()),
    };
    let (mut metadata, fm_span, _fm_warns) =
        parse_frontmatter(&source, &hints).expect("frontmatter parses");

    // Without frontmatter the body starts at line 1. If a frontmatter span
    // were present, the body would start one line past the newlines counted
    // up to the end of that span. This fixture takes the no-frontmatter
    // path, so the counting branch is kept only for completeness.
    let body_offset_lines: u32 = fm_span.map_or(1, |span| {
        source[..span.end]
            .iter()
            .filter(|&&byte| byte == b'\n')
            .count() as u32
            + 1
    });
    let (blocks, parse_warns) =
        parse_blocks(&source, body_offset_lines).expect("blocks parse");

    // `created_at` / `updated_at` come from the pinned hints above, so they
    // are already deterministic; aliases and tags are sorted before building.
    metadata.aliases.sort();
    metadata.tags.sort();
    let parser_version = ParserVersion("kb-normalize-snapshot-test-0".into());
    let doc = build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns)
        .expect("build_canonical_document");

    // The title is pinned in the snapshot too, but an explicit assertion
    // makes a future drift in the lift chain fail with a clearer message.
    assert_eq!(doc.title, "Code And Table");

    let actual = strip_dynamic(serde_json::to_value(&doc).unwrap());
    let baseline_path = fixture_root.join("code-and-table.canonical.snapshot.json");

    // Shared by the "baseline unreadable" and "baseline drifted" paths.
    let update_requested = || std::env::var("UPDATE_SNAPSHOTS").is_ok();
    let write_baseline = || {
        let pretty = serde_json::to_string_pretty(&actual).unwrap();
        std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
    };

    let baseline_text = match std::fs::read_to_string(&baseline_path) {
        Ok(text) => text,
        Err(e) => {
            if update_requested() {
                write_baseline();
                return;
            }
            panic!(
                "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
                baseline_path.display()
            )
        }
    };
    let expected: Value =
        serde_json::from_str(&baseline_text).expect("baseline parses as json");

    if actual == expected {
        return;
    }
    if update_requested() {
        write_baseline();
        eprintln!("updated baseline {}", baseline_path.display());
        return;
    }
    let pretty = serde_json::to_string_pretty(&actual).unwrap();
    panic!(
        "canonical snapshot drift\n--- expected ({}) ---\n{baseline_text}\n--- actual ---\n{pretty}\nIf intentional, re-run with UPDATE_SNAPSHOTS=1.",
        baseline_path.display()
    );
}

View File

@@ -300,12 +300,12 @@ impl InlineBuf {
fn push_text(&mut self, s: &str) {
self.text.push_str(s);
self.push_inline(Inline::Text(s.to_string()));
self.push_inline(Inline::Text { text: s.to_string() });
}
fn push_code(&mut self, s: &str) {
self.text.push_str(s);
self.push_inline(Inline::Code(s.to_string()));
self.push_inline(Inline::Code { code: s.to_string() });
}
fn open_strong(&mut self) {
@@ -313,7 +313,7 @@ impl InlineBuf {
}
fn close_strong(&mut self) {
if let Some(InlineFrame::Strong(kids)) = self.stack.pop() {
self.push_inline(Inline::Strong(kids));
self.push_inline(Inline::Strong { children: kids });
}
}
@@ -322,7 +322,7 @@ impl InlineBuf {
}
fn close_emph(&mut self) {
if let Some(InlineFrame::Emph(kids)) = self.stack.pop() {
self.push_inline(Inline::Emph(kids));
self.push_inline(Inline::Emph { children: kids });
}
}
@@ -361,8 +361,8 @@ impl InlineBuf {
// If formatting tags were unbalanced we close them defensively.
while self.stack.len() > 1 {
match self.stack.pop().unwrap() {
InlineFrame::Strong(kids) => self.push_inline(Inline::Strong(kids)),
InlineFrame::Emph(kids) => self.push_inline(Inline::Emph(kids)),
InlineFrame::Strong(kids) => self.push_inline(Inline::Strong { children: kids }),
InlineFrame::Emph(kids) => self.push_inline(Inline::Emph { children: kids }),
InlineFrame::Link { href, text, kids } => {
let flat = if !text.is_empty() {
text
@@ -475,10 +475,11 @@ fn flatten_inlines_to_text(inlines: &[Inline]) -> String {
fn flatten_one(i: &Inline, out: &mut String) {
match i {
Inline::Text(s) | Inline::Code(s) => out.push_str(s),
Inline::Text { text } => out.push_str(text),
Inline::Code { code } => out.push_str(code),
Inline::Link { text, .. } => out.push_str(text),
Inline::Strong(v) | Inline::Emph(v) => {
for c in v {
Inline::Strong { children } | Inline::Emph { children } => {
for c in children {
flatten_one(c, out);
}
}
@@ -823,7 +824,7 @@ impl<'a> WalkState<'a> {
text.push('\n');
}
text.push_str(t);
inlines.push(Inline::Text(t.clone()));
inlines.push(Inline::Text { text: t.clone() });
}
_ => {}
}
@@ -921,7 +922,7 @@ impl<'a> WalkState<'a> {
source_span: self.span_for(&range),
payload: ParsedPayload::Paragraph {
text: raw.clone(),
inlines: vec![Inline::Text(raw)],
inlines: vec![Inline::Text { text: raw }],
},
}
} else {
@@ -1477,7 +1478,7 @@ mod tests {
assert!(
matches!(
inl,
Inline::Text(_) | Inline::Code(_) | Inline::Link { .. } | Inline::Strong(_) | Inline::Emph(_)
Inline::Text { .. } | Inline::Code { .. } | Inline::Link { .. } | Inline::Strong { .. } | Inline::Emph { .. }
),
"unexpected inline kind: {:?}",
inl
@@ -1736,11 +1737,11 @@ mod tests {
match &blocks[0].payload {
ParsedPayload::Paragraph { inlines, .. } => {
let kinds: Vec<&'static str> = inlines.iter().map(|i| match i {
Inline::Text(_) => "Text",
Inline::Code(_) => "Code",
Inline::Text { .. } => "Text",
Inline::Code { .. } => "Code",
Inline::Link { .. } => "Link",
Inline::Strong(_) => "Strong",
Inline::Emph(_) => "Emph",
Inline::Strong { .. } => "Strong",
Inline::Emph { .. } => "Emph",
}).collect();
assert!(kinds.contains(&"Strong"));
assert!(kinds.contains(&"Emph"));

View File

@@ -379,8 +379,12 @@ fn derive_metadata(
// ---- title ----
// Frontmatter → BodyHints.first_h1 → None.
// Filename fallback is the caller's responsibility (P1-4 normalize), per
// task brief — `BodyHints` does not carry a filename.
// Filename fallback for title is deferred to a later phase (P1-7 or
// kb-app integration); the parse_frontmatter -> build_canonical_document
// pipeline does not currently know the workspace_path filename component
// for fallback. CanonicalDocument.title may be empty for files without
// frontmatter title and without an H1; downstream display layer should
// fall back to filename via WorkspacePath inspection.
let title = raw.title.or_else(|| hints.first_h1.clone());
if let Some(t) = title {
user.insert("title".to_string(), Value::String(t));

View File

@@ -4,19 +4,19 @@
//! below. `body_offset_lines = 1` is used for both fixtures (no
//! frontmatter, body starts at file line 1).
//!
//! Note on snapshot shape: `kb_core::Inline` carries a `serde(tag = "kind")`
//! enum representation that cannot serialize newtype variants holding a
//! primitive (`Inline::Text(String)` etc.) — that's a serde limitation, not
//! ours, and is fixed up in a later kb-core task. To keep the snapshot
//! human-readable (and stable across that future fix), we project each
//! `ParsedBlock` into a `BlockView` that flattens inline content to plain
//! strings before serialization. This still pins the *contract* that
//! matters for P1-3: heading paths, source spans, payload kinds, payload
//! text content, table headers/rows, and code lang/body.
//! Note: kb-parse-md's snapshot tests use the `#[ignore]` regenerator
//! pattern (run `cargo test ... -- --ignored` to refresh baselines),
//! whereas `kb-normalize`'s integration test uses an `UPDATE_SNAPSHOTS=1`
//! env-var pattern. Migrating kb-parse-md to the env-var style is out of
//! scope; both styles are intentional for now.
//!
//! Following the kb_core::Inline schema migration (struct-variant shape),
//! `ParsedBlock` now serializes directly through serde — no projection
//! shim is required. Inlines surface as structured objects, e.g.
//! `[{"kind":"text","text":"…"},{"kind":"code","code":"…"}]`.
use kb_core::{Inline, SourceSpan};
use kb_parse_md::parse_blocks;
use kb_parse_types::{ParsedBlock, ParsedPayload, Warning};
use kb_parse_types::{ParsedBlock, Warning};
use serde::Serialize;
use serde_json::Value;
use std::fs;
@@ -24,130 +24,10 @@ use std::path::PathBuf;
#[derive(Serialize)]
struct Snapshot {
blocks: Vec<BlockView>,
blocks: Vec<ParsedBlock>,
warnings: Vec<Warning>,
}
#[derive(Serialize)]
struct BlockView {
kind: String,
heading_path: Vec<String>,
source_span: SourceSpan,
payload: PayloadView,
}
#[derive(Serialize)]
#[serde(tag = "kind", rename_all = "lowercase")]
enum PayloadView {
Heading {
level: u8,
text: String,
},
Paragraph {
text: String,
inlines_flat: String,
},
List {
ordered: bool,
items_flat: Vec<String>,
},
Code {
lang: Option<String>,
code: String,
},
Table {
headers: Vec<String>,
rows: Vec<Vec<String>>,
},
Quote {
text: String,
inlines_flat: String,
},
ImageRef {
src: String,
alt: String,
},
AudioRef {
src: String,
},
}
fn flatten_inline(i: &Inline, out: &mut String) {
match i {
Inline::Text(s) | Inline::Code(s) => out.push_str(s),
Inline::Link { text, href } => {
out.push('[');
out.push_str(text);
out.push_str("](");
out.push_str(href);
out.push(')');
}
Inline::Strong(v) => {
out.push_str("**");
for c in v {
flatten_inline(c, out);
}
out.push_str("**");
}
Inline::Emph(v) => {
out.push('*');
for c in v {
flatten_inline(c, out);
}
out.push('*');
}
}
}
fn flatten(inlines: &[Inline]) -> String {
let mut out = String::new();
for i in inlines {
flatten_inline(i, &mut out);
}
out
}
fn block_to_view(b: &ParsedBlock) -> BlockView {
let kind = format!("{:?}", b.kind).to_lowercase();
let payload = match &b.payload {
ParsedPayload::Heading { level, text } => PayloadView::Heading {
level: *level,
text: text.clone(),
},
ParsedPayload::Paragraph { text, inlines } => PayloadView::Paragraph {
text: text.clone(),
inlines_flat: flatten(inlines),
},
ParsedPayload::List { ordered, items } => PayloadView::List {
ordered: *ordered,
items_flat: items.iter().map(|it| flatten(it)).collect(),
},
ParsedPayload::Code { lang, code } => PayloadView::Code {
lang: lang.clone(),
code: code.clone(),
},
ParsedPayload::Table { headers, rows } => PayloadView::Table {
headers: headers.clone(),
rows: rows.clone(),
},
ParsedPayload::Quote { text, inlines } => PayloadView::Quote {
text: text.clone(),
inlines_flat: flatten(inlines),
},
ParsedPayload::ImageRef { src, alt } => PayloadView::ImageRef {
src: src.clone(),
alt: alt.clone(),
},
ParsedPayload::AudioRef { src } => PayloadView::AudioRef { src: src.clone() },
};
BlockView {
kind,
heading_path: b.heading_path.clone(),
source_span: b.source_span.clone(),
payload,
}
}
fn fixtures_dir() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("..")
@@ -162,7 +42,7 @@ fn assert_snapshot(fixture: &str, baseline: &str) {
let (blocks, warns) = parse_blocks(&bytes, 1).unwrap();
let snap = Snapshot {
blocks: blocks.iter().map(block_to_view).collect(),
blocks,
warnings: warns,
};
let actual: Value = serde_json::to_value(&snap).unwrap();
@@ -211,7 +91,7 @@ fn emit_blocks_snapshots() {
let bytes = fs::read(dir.join(fixture)).unwrap();
let (blocks, warns) = parse_blocks(&bytes, 1).unwrap();
let snap = Snapshot {
blocks: blocks.iter().map(block_to_view).collect(),
blocks,
warnings: warns,
};
let json = serde_json::to_string_pretty(&snap).unwrap();
@@ -227,14 +107,10 @@ fn snapshot_is_deterministic_across_runs() {
let bytes = fs::read(dir.join("nested-headings.md")).unwrap();
let (a_blocks, a_warns) = parse_blocks(&bytes, 1).unwrap();
let (b_blocks, b_warns) = parse_blocks(&bytes, 1).unwrap();
// Compare via the view (which is fully serializable) and via the
// structural equality on `ParsedBlock` itself (no serde involved).
assert_eq!(a_blocks, b_blocks);
assert_eq!(a_warns, b_warns);
let av: Vec<_> = a_blocks.iter().map(block_to_view).collect();
let bv: Vec<_> = b_blocks.iter().map(block_to_view).collect();
assert_eq!(
serde_json::to_value(&av).unwrap(),
serde_json::to_value(&bv).unwrap()
serde_json::to_value(&a_blocks).unwrap(),
serde_json::to_value(&b_blocks).unwrap()
);
}

View File

@@ -0,0 +1,102 @@
{
"blocks": [
{
"common": {
"block_id": "dd1528c6e84d8a66087cbf6faafd67c6",
"heading_path": [],
"source_span": {
"end": 1,
"kind": "line",
"start": 1
}
},
"kind": "heading",
"level": 1,
"text": "Code And Table"
},
{
"code": "fn main() {\n println!(\"hi\");\n}",
"common": {
"block_id": "68ea34aca04b83413dd8556126ae4584",
"heading_path": [
"Code And Table"
],
"source_span": {
"end": 7,
"kind": "line",
"start": 3
}
},
"kind": "code",
"lang": "rust"
},
{
"common": {
"block_id": "b50a8e941b11f1834ae17adba9e08118",
"heading_path": [
"Code And Table"
],
"source_span": {
"end": 12,
"kind": "line",
"start": 9
}
},
"headers": [
"col a",
"col b"
],
"kind": "table",
"rows": [
[
"1",
"2"
],
[
"3",
"4"
]
]
}
],
"doc_id": "6a9ef317c9c097ff3f6aeb317559bd83",
"doc_version": 1,
"lang": "en",
"metadata": {
"aliases": [],
"created_at": "2023-11-14T22:13:20Z",
"source_type": "markdown",
"tags": [],
"trust_level": "primary",
"updated_at": "2023-11-14T22:13:20Z",
"user": {},
"user_id_alias": null
},
"parser_version": "kb-normalize-snapshot-test-0",
"provenance": {
"events": [
{
"agent": "kb-source-fs",
"at": "2023-11-14T22:13:20Z",
"kind": "discovered",
"note": null
},
{
"agent": "kb-parse-md",
"at": "<stripped>",
"kind": "parsed",
"note": "parser_version=kb-normalize-snapshot-test-0"
},
{
"agent": "kb-normalize",
"at": "<stripped>",
"kind": "normalized",
"note": null
}
]
},
"schema_version": 1,
"source_asset_id": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
"title": "Code And Table",
"workspace_path": "notes/code-and-table.md"
}

View File

@@ -27,7 +27,12 @@
"payload": {
"kind": "paragraph",
"text": "intro",
"inlines_flat": "intro"
"inlines": [
{
"kind": "text",
"text": "intro"
}
]
}
},
{
@@ -60,7 +65,12 @@
"payload": {
"kind": "paragraph",
"text": "body of A",
"inlines_flat": "body of A"
"inlines": [
{
"kind": "text",
"text": "body of A"
}
]
}
},
{
@@ -95,7 +105,12 @@
"payload": {
"kind": "paragraph",
"text": "deeper",
"inlines_flat": "deeper"
"inlines": [
{
"kind": "text",
"text": "deeper"
}
]
}
},
{
@@ -128,7 +143,12 @@
"payload": {
"kind": "paragraph",
"text": "body of B",
"inlines_flat": "body of B"
"inlines": [
{
"kind": "text",
"text": "body of B"
}
]
}
}
],