Merge pull request 'feat(p1-4): kb-normalize + kb-core Inline schema hotfix' (#9) from feat/p1-4-normalize into main
Reviewed-on: altair823-org/kb#9
This commit was merged in pull request #9.
This commit is contained in:
15
Cargo.lock
generated
15
Cargo.lock
generated
@@ -577,6 +577,21 @@ dependencies = [
|
||||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-normalize"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"kb-core",
|
||||
"kb-parse-md",
|
||||
"kb-parse-types",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"time",
|
||||
"tracing",
|
||||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-parse-md"
|
||||
version = "0.1.0"
|
||||
|
||||
@@ -6,6 +6,7 @@ members = [
|
||||
"crates/kb-config",
|
||||
"crates/kb-source-fs",
|
||||
"crates/kb-parse-md",
|
||||
"crates/kb-normalize",
|
||||
"crates/kb-app",
|
||||
"crates/kb-cli",
|
||||
]
|
||||
|
||||
@@ -100,11 +100,11 @@ pub struct AudioRefBlock {
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase", tag = "kind")]
|
||||
pub enum Inline {
|
||||
Text(String),
|
||||
Code(String),
|
||||
Text { text: String },
|
||||
Code { code: String },
|
||||
Link { text: String, href: String },
|
||||
Strong(Vec<Inline>),
|
||||
Emph(Vec<Inline>),
|
||||
Strong { children: Vec<Inline> },
|
||||
Emph { children: Vec<Inline> },
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
@@ -175,3 +175,37 @@ pub struct TranscriptSegment {
|
||||
pub speaker: Option<String>,
|
||||
pub confidence: Option<f32>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Round-trips one sample of every `Inline` variant through JSON and
    /// checks that the decoded value equals the original.
    ///
    /// This guards the struct-variant shape introduced by the §9 schema
    /// migration: the former newtype variants (`Text(String)`,
    /// `Code(String)`, `Strong(Vec<…>)`, `Emph(Vec<…>)`) failed at serde
    /// runtime because `tag = "kind"` cannot describe a newtype carrying a
    /// non-struct value.
    #[test]
    fn inline_serde_round_trip() {
        let samples = [
            Inline::Text { text: "hi".into() },
            Inline::Code { code: "x".into() },
            Inline::Link {
                text: "t".into(),
                href: "h".into(),
            },
            Inline::Strong {
                children: vec![Inline::Text { text: "bold".into() }],
            },
            Inline::Emph {
                children: vec![Inline::Text { text: "em".into() }],
            },
        ];

        for original in samples {
            let json = serde_json::to_string(&original).expect("serialize");
            let decoded: Inline = serde_json::from_str(&json).expect("deserialize");
            assert_eq!(original, decoded);
        }
    }
}
|
||||
|
||||
27
crates/kb-normalize/Cargo.toml
Normal file
27
crates/kb-normalize/Cargo.toml
Normal file
@@ -0,0 +1,27 @@
|
||||
[package]
|
||||
name = "kb-normalize"
|
||||
version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "Lift parser output (kb-parse-types) into kb-core::CanonicalDocument with deterministic IDs (§3.4, §4.2, §4.3)"
|
||||
|
||||
[dependencies]
|
||||
kb-core = { path = "../kb-core" }
|
||||
kb-parse-types = { path = "../kb-parse-types" }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
unicode-normalization = "0.1"
|
||||
time = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
# kb-parse-md is permitted as a *dev*-dependency only — used by the
|
||||
# integration snapshot test to drive a fixture through the real parser.
|
||||
# Forbidden as a regular dep per design §8 (kb-normalize must not depend
|
||||
# on any specific parser); `cargo tree -p kb-normalize --depth 1 -e no-dev`
|
||||
# (dev-deps excluded explicitly; the default scope includes them) confirms this.
|
||||
kb-parse-md = { path = "../kb-parse-md" }
|
||||
serde_json = { workspace = true }
|
||||
843
crates/kb-normalize/src/lib.rs
Normal file
843
crates/kb-normalize/src/lib.rs
Normal file
@@ -0,0 +1,843 @@
|
||||
//! `kb-normalize` — lift parser output (`kb-parse-types`) into a
|
||||
//! [`kb_core::CanonicalDocument`] with deterministic IDs.
|
||||
//!
|
||||
//! Per design §3.4 (CanonicalDocument / Block), §4.2 (ID recipe), §4.3
|
||||
//! (ordinal rule), §3.6 (Provenance), §8 (module boundaries).
|
||||
//!
|
||||
//! Public surface:
|
||||
//!
|
||||
//! * [`build_canonical_document`] — assemble a `CanonicalDocument` from
|
||||
//! `(RawAsset, Metadata, Vec<ParsedBlock>, ParserVersion, Vec<Warning>)`.
|
||||
//! * [`id_for_doc`], [`id_for_block`] — re-exports of the canonical
|
||||
//! ID-recipe functions in `kb-core::ids` (§4.2). `kb-core` is the only
|
||||
//! implementation; `kb-normalize` is the canonical *entry point* per
|
||||
//! design §8.
|
||||
//!
|
||||
//! This crate must NOT depend on any parser implementation crate
|
||||
//! (`kb-parse-md`, `kb-parse-pdf`, …). All parser output flows in via
|
||||
//! the shared `kb-parse-types` crate.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use anyhow::Result;
|
||||
use kb_core::{
|
||||
Block, BlockId, CanonicalDocument, CodeBlock, CommonBlock, DocumentId, HeadingBlock,
|
||||
ImageRefBlock, Inline, Lang, ListBlock, Metadata, ParserVersion, Provenance, ProvenanceEvent,
|
||||
ProvenanceKind, RawAsset, TableBlock, TextBlock,
|
||||
};
|
||||
use kb_parse_types::{ParsedBlock, ParsedPayload, Warning, WarningKind};
|
||||
use time::OffsetDateTime;
|
||||
use unicode_normalization::UnicodeNormalization;
|
||||
|
||||
pub use kb_core::{id_for_block, id_for_doc};
|
||||
|
||||
/// Build a [`CanonicalDocument`] from the raw asset, frontmatter
|
||||
/// metadata, parser blocks, parser version, and any warnings.
|
||||
///
|
||||
/// Behavior contract (per design §3.4 / §4.2 / §4.3 / §3.6):
|
||||
///
|
||||
/// * `doc_id = id_for_doc(workspace_path, asset_id, parser_version)` —
|
||||
/// `workspace_path` is consumed verbatim from `asset` (already NFC +
|
||||
/// POSIX per `kb_core::normalize::to_posix`).
|
||||
/// * `block_id = id_for_block(doc_id, kind, heading_path, ordinal,
|
||||
/// source_span)` — `ordinal` is **0-based, scoped to (heading_path,
|
||||
/// block_kind), in document order** per §4.3.
|
||||
/// * `title` and `lang` are lifted from `metadata.user["title"]` /
|
||||
/// `metadata.user["lang"]` (where P1-2 stashes them) into the dedicated
|
||||
/// `CanonicalDocument` fields, and removed from the user map to avoid
|
||||
/// duplication. Both keys are lifted only if present and stringy;
|
||||
/// non-stringy values (e.g. `Number`, `Array`) and missing keys
|
||||
/// silently default to empty title / empty `Lang`. P1-2's frontmatter
|
||||
/// parser only writes these keys when the source value parses as a
|
||||
/// string, so the non-stringy branches are defense-in-depth.
|
||||
/// * `provenance` is seeded with `Discovered` (from `asset.discovered_at`),
|
||||
/// `Parsed`, `Normalized` events, and one `Warning` event per upstream
|
||||
/// warning. The two normalize-side events share one `now_utc()` reading
|
||||
/// so the timestamp jitter inside a single call is bounded — event
|
||||
/// ordering is preserved by `Vec` position.
|
||||
/// * `schema_version` and `doc_version` are pinned to `1` (initial).
|
||||
pub fn build_canonical_document(
|
||||
asset: &RawAsset,
|
||||
metadata: Metadata,
|
||||
blocks: Vec<ParsedBlock>,
|
||||
parser_version: &ParserVersion,
|
||||
warnings: Vec<Warning>,
|
||||
) -> Result<CanonicalDocument> {
|
||||
let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, parser_version);
|
||||
|
||||
// Lift title / lang from `metadata.user` (P1-2 stashed them there
|
||||
// because `Metadata` does not carry them directly). Strip after
|
||||
// lifting so the wire form does not duplicate the data.
|
||||
let mut metadata = metadata;
|
||||
let title = metadata
|
||||
.user
|
||||
.remove("title")
|
||||
.and_then(|v| v.as_str().map(String::from))
|
||||
.unwrap_or_default();
|
||||
let lang = metadata
|
||||
.user
|
||||
.remove("lang")
|
||||
.and_then(|v| v.as_str().map(|s| Lang(s.to_string())))
|
||||
.unwrap_or_else(|| Lang(String::new()));
|
||||
|
||||
// §4.3 ordinal rule — per (heading_path, block_kind), 0-based,
|
||||
// document order. A separate counter is kept for each grouping key.
|
||||
let mut counters: HashMap<(Vec<String>, &'static str), u32> = HashMap::new();
|
||||
// Some lift paths (e.g. AudioRef pre-P8) drop the block entirely and
|
||||
// synthesize a Warning so the wire form never carries an invalid
|
||||
// `AssetId`. These warnings originate at the lift stage and are
|
||||
// attributed to `kb-normalize` (not to whatever upstream emitter the
|
||||
// bare `WarningKind` would resolve to via `warning_agent`). They are
|
||||
// tracked separately so the agent string is correct in Provenance.
|
||||
let mut lift_warnings: Vec<Warning> = Vec::new();
|
||||
let lifted_blocks: Vec<Block> = blocks
|
||||
.into_iter()
|
||||
.filter_map(|pb| lift_block(&doc_id, pb, &mut counters, &mut lift_warnings))
|
||||
.collect();
|
||||
|
||||
tracing::debug!(
|
||||
target: "kb-normalize",
|
||||
"built canonical document doc_id={} blocks={}",
|
||||
doc_id.0,
|
||||
lifted_blocks.len()
|
||||
);
|
||||
|
||||
// Provenance — share `now` between the parse + normalize stages so
|
||||
// the per-call timestamp jitter is bounded.
|
||||
let now = OffsetDateTime::now_utc();
|
||||
let mut events: Vec<ProvenanceEvent> =
|
||||
Vec::with_capacity(3 + warnings.len() + lift_warnings.len());
|
||||
events.push(ProvenanceEvent {
|
||||
at: asset.discovered_at,
|
||||
agent: "kb-source-fs".to_string(),
|
||||
kind: ProvenanceKind::Discovered,
|
||||
note: None,
|
||||
});
|
||||
events.push(ProvenanceEvent {
|
||||
at: now,
|
||||
agent: "kb-parse-md".to_string(),
|
||||
kind: ProvenanceKind::Parsed,
|
||||
note: Some(format!("parser_version={}", parser_version.0)),
|
||||
});
|
||||
events.push(ProvenanceEvent {
|
||||
at: now,
|
||||
agent: "kb-normalize".to_string(),
|
||||
kind: ProvenanceKind::Normalized,
|
||||
note: None,
|
||||
});
|
||||
// {:?} on WarningKind renders camel-case variant name; intentional
|
||||
// for human-readable Provenance trace.
|
||||
for w in warnings {
|
||||
events.push(ProvenanceEvent {
|
||||
at: now,
|
||||
agent: warning_agent(&w.kind).to_string(),
|
||||
kind: ProvenanceKind::Warning,
|
||||
note: Some(format!("{:?}: {}", w.kind, w.note)),
|
||||
});
|
||||
}
|
||||
// Lift-stage warnings (currently only AudioRef-deferred drops) are
|
||||
// unconditionally attributed to `kb-normalize`.
|
||||
for w in lift_warnings {
|
||||
events.push(ProvenanceEvent {
|
||||
at: now,
|
||||
agent: "kb-normalize".to_string(),
|
||||
kind: ProvenanceKind::Warning,
|
||||
note: Some(format!("{:?}: {}", w.kind, w.note)),
|
||||
});
|
||||
}
|
||||
let provenance = Provenance { events };
|
||||
|
||||
Ok(CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: asset.asset_id.clone(),
|
||||
workspace_path: asset.workspace_path.clone(),
|
||||
title,
|
||||
lang,
|
||||
blocks: lifted_blocks,
|
||||
metadata,
|
||||
provenance,
|
||||
parser_version: parser_version.clone(),
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
})
|
||||
}
|
||||
|
||||
/// Resolve a `WarningKind` to the upstream agent that emitted it. Used
|
||||
/// to fill `ProvenanceEvent::agent` for the warning's event entry.
|
||||
///
|
||||
/// `ExtractFailed` is emitted today by `kb-parse-md`'s panic-recovery
|
||||
/// guard around `parse_blocks` — see `crates/kb-parse-md/src/blocks.rs`.
|
||||
/// If a future stage (e.g. `kb-normalize` itself, an extractor, …) starts
|
||||
/// emitting `ExtractFailed`, this mapping needs to grow context (perhaps
|
||||
/// a separate `WarningSource` field on `Warning`) so attribution stays
|
||||
/// honest. For now, all `ExtractFailed` warnings observed by
|
||||
/// `build_canonical_document` originated in the parser.
|
||||
fn warning_agent(kind: &WarningKind) -> &'static str {
|
||||
match kind {
|
||||
WarningKind::MalformedFrontmatter | WarningKind::EncodingFallback => "kb-parse-md",
|
||||
WarningKind::MalformedTable => "kb-parse-md",
|
||||
WarningKind::ExtractFailed => "kb-parse-md",
|
||||
}
|
||||
}
|
||||
|
||||
/// Map a `ParsedPayload` variant to the lowercase, no-spaces string used
|
||||
/// as `block_kind` in the §4.2 ID tuple.
|
||||
fn payload_kind(payload: &ParsedPayload) -> &'static str {
|
||||
match payload {
|
||||
ParsedPayload::Heading { .. } => "heading",
|
||||
ParsedPayload::Paragraph { .. } => "paragraph",
|
||||
ParsedPayload::List { .. } => "list",
|
||||
ParsedPayload::Code { .. } => "code",
|
||||
ParsedPayload::Table { .. } => "table",
|
||||
ParsedPayload::Quote { .. } => "quote",
|
||||
ParsedPayload::ImageRef { .. } => "imageref",
|
||||
ParsedPayload::AudioRef { .. } => "audioref",
|
||||
}
|
||||
}
|
||||
|
||||
/// Hand out the next 0-based ordinal for the `(heading_path, kind)` group
/// and advance that group's counter.
///
/// A group seen for the first time starts at `0`; each later call with the
/// same heading path and kind returns the previous value plus one. Groups
/// are independent of one another.
fn next_ordinal(
    counters: &mut HashMap<(Vec<String>, &'static str), u32>,
    heading_path: &[String],
    kind: &'static str,
) -> u32 {
    let slot = counters.entry((heading_path.to_vec(), kind)).or_insert(0);
    // Store the incremented value and return the one that was there before.
    std::mem::replace(slot, *slot + 1)
}
|
||||
|
||||
fn lift_block(
|
||||
doc_id: &DocumentId,
|
||||
pb: ParsedBlock,
|
||||
counters: &mut HashMap<(Vec<String>, &'static str), u32>,
|
||||
warnings: &mut Vec<Warning>,
|
||||
) -> Option<Block> {
|
||||
let kind = payload_kind(&pb.payload);
|
||||
// Task spec line 73: "All input strings normalized to NFC before
|
||||
// hashing." `pulldown-cmark` does not NFC heading text, and
|
||||
// `serde_json_canonicalizer` v0.3 does not normalize strings either,
|
||||
// so we must NFC-normalize `heading_path` here before it feeds both
|
||||
// the §4.2 ID recipe AND the on-disk `CommonBlock.heading_path` (so
|
||||
// wire form matches ID input). Without this, NFD `\u{1100}\u{1161}`
|
||||
// and NFC `\u{AC00}` (both render as 가) would produce different
|
||||
// `block_id`s for what is logically the same heading.
|
||||
let heading_path_nfc: Vec<String> =
|
||||
pb.heading_path.iter().map(|s| s.nfc().collect()).collect();
|
||||
let ordinal = next_ordinal(counters, &heading_path_nfc, kind);
|
||||
let block_id: BlockId =
|
||||
id_for_block(doc_id, kind, &heading_path_nfc, ordinal, &pb.source_span);
|
||||
let common = CommonBlock {
|
||||
block_id,
|
||||
heading_path: heading_path_nfc,
|
||||
source_span: pb.source_span,
|
||||
};
|
||||
let block = match pb.payload {
|
||||
ParsedPayload::Heading { level, text } => Block::Heading(HeadingBlock {
|
||||
common,
|
||||
level,
|
||||
text,
|
||||
}),
|
||||
ParsedPayload::Paragraph { text, inlines } => Block::Paragraph(TextBlock {
|
||||
common,
|
||||
text,
|
||||
inlines,
|
||||
}),
|
||||
ParsedPayload::List { ordered, items } => Block::List(ListBlock {
|
||||
common: common.clone(),
|
||||
ordered,
|
||||
items: items
|
||||
.into_iter()
|
||||
.map(|item_inlines| TextBlock {
|
||||
// All list items currently inherit the parent's
|
||||
// CommonBlock (incl. block_id). Per-item IDs would
|
||||
// require a §4.2 recipe extension. Spec (§3.4)
|
||||
// defines `ListBlock.items: Vec<TextBlock>` and
|
||||
// does not allocate per-item BlockIds. Re-using the
|
||||
// parent's common keeps the wire form deterministic
|
||||
// while letting the inline tree carry the item
|
||||
// content.
|
||||
common: common.clone(),
|
||||
text: flatten_inlines(&item_inlines),
|
||||
inlines: item_inlines,
|
||||
})
|
||||
.collect(),
|
||||
}),
|
||||
ParsedPayload::Code { lang, code } => Block::Code(CodeBlock { common, lang, code }),
|
||||
ParsedPayload::Table { headers, rows } => Block::Table(TableBlock {
|
||||
common,
|
||||
headers,
|
||||
rows,
|
||||
}),
|
||||
ParsedPayload::Quote { text, inlines } => Block::Quote(TextBlock {
|
||||
common,
|
||||
text,
|
||||
inlines,
|
||||
}),
|
||||
ParsedPayload::ImageRef { src, alt } => Block::ImageRef(ImageRefBlock {
|
||||
common,
|
||||
asset_id: None,
|
||||
src,
|
||||
alt,
|
||||
ocr: None,
|
||||
caption: None,
|
||||
}),
|
||||
// TODO(P8): audio extractor will resolve workspace assets and
|
||||
// produce real AssetIds. This skip-and-warn shim is a
|
||||
// placeholder. `AssetId::from_str` requires a 32-hex string, so
|
||||
// synthesizing `AssetId(String::new())` would break the
|
||||
// invariant — instead we drop the block and surface a Warning
|
||||
// (attributed to `kb-normalize` per §3.6 since this is the
|
||||
// lift-stage decision).
|
||||
ParsedPayload::AudioRef { src } => {
|
||||
warnings.push(Warning {
|
||||
kind: WarningKind::ExtractFailed,
|
||||
note: format!(
|
||||
"audio-ref AssetId resolution deferred to P8 — block dropped (src={src})"
|
||||
),
|
||||
});
|
||||
return None;
|
||||
}
|
||||
};
|
||||
Some(block)
|
||||
}
|
||||
|
||||
/// Flatten a `Vec<Inline>` into a plain text string. Used by list-item
|
||||
/// `TextBlock.text` since `ParsedPayload::List` only carries inline trees
|
||||
/// per item.
|
||||
fn flatten_inlines(inlines: &[Inline]) -> String {
|
||||
let mut out = String::new();
|
||||
for i in inlines {
|
||||
flatten_inline(i, &mut out);
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
fn flatten_inline(i: &Inline, out: &mut String) {
|
||||
match i {
|
||||
Inline::Text { text } => out.push_str(text),
|
||||
Inline::Code { code } => out.push_str(code),
|
||||
Inline::Link { text, .. } => out.push_str(text),
|
||||
Inline::Strong { children } | Inline::Emph { children } => {
|
||||
for c in children {
|
||||
flatten_inline(c, out);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kb_core::{
|
||||
AssetId, AssetStorage, Checksum, MediaType, SourceSpan, SourceType, SourceUri,
|
||||
TrustLevel, WorkspacePath, normalize::to_posix,
|
||||
};
|
||||
use serde_json::Value;
|
||||
use std::path::{Path, PathBuf};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixture_asset() -> RawAsset {
|
||||
let workspace_path = WorkspacePath::new("notes/example.md".into()).unwrap();
|
||||
RawAsset {
|
||||
asset_id: AssetId("a".repeat(32)),
|
||||
source_uri: SourceUri::File(PathBuf::from("/tmp/example.md")),
|
||||
workspace_path,
|
||||
media_type: MediaType::Markdown,
|
||||
byte_len: 0,
|
||||
checksum: Checksum("0".repeat(64)),
|
||||
// Pin a fixed timestamp so determinism tests can compare
|
||||
// outputs across runs without timestamp jitter outside the
|
||||
// fields we explicitly strip.
|
||||
discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
stored: AssetStorage::Reference {
|
||||
path: PathBuf::from("/tmp/example.md"),
|
||||
sha: Checksum("0".repeat(64)),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn fixture_metadata() -> Metadata {
|
||||
let mut user = serde_json::Map::new();
|
||||
user.insert("title".into(), Value::String("Example".into()));
|
||||
user.insert("lang".into(), Value::String("en".into()));
|
||||
user.insert("custom".into(), Value::Bool(true));
|
||||
Metadata {
|
||||
aliases: vec![],
|
||||
tags: vec![],
|
||||
created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
source_type: SourceType::Markdown,
|
||||
trust_level: TrustLevel::Primary,
|
||||
user_id_alias: None,
|
||||
user,
|
||||
}
|
||||
}
|
||||
|
||||
fn parser_version() -> ParserVersion {
|
||||
ParserVersion("kb-normalize-test-0".into())
|
||||
}
|
||||
|
||||
/// Fixed 5-block input used by both the ordinal-scoping pinning test
|
||||
/// and the determinism stress test (so the latter exercises the
|
||||
/// `lift_block` path, not just the empty-blocks path).
|
||||
fn fixture_blocks_five() -> Vec<ParsedBlock> {
|
||||
let h1_a = vec!["A".to_string()];
|
||||
let h1_b = vec!["B".to_string()];
|
||||
vec![
|
||||
ParsedBlock {
|
||||
kind: kb_parse_types::ParsedBlockKind::Paragraph,
|
||||
heading_path: h1_a.clone(),
|
||||
source_span: SourceSpan::Line { start: 1, end: 1 },
|
||||
payload: ParsedPayload::Paragraph {
|
||||
text: "p1".into(),
|
||||
inlines: vec![],
|
||||
},
|
||||
},
|
||||
ParsedBlock {
|
||||
kind: kb_parse_types::ParsedBlockKind::Paragraph,
|
||||
heading_path: h1_a.clone(),
|
||||
source_span: SourceSpan::Line { start: 2, end: 2 },
|
||||
payload: ParsedPayload::Paragraph {
|
||||
text: "p2".into(),
|
||||
inlines: vec![],
|
||||
},
|
||||
},
|
||||
ParsedBlock {
|
||||
kind: kb_parse_types::ParsedBlockKind::Paragraph,
|
||||
heading_path: h1_a.clone(),
|
||||
source_span: SourceSpan::Line { start: 3, end: 3 },
|
||||
payload: ParsedPayload::Paragraph {
|
||||
text: "p3".into(),
|
||||
inlines: vec![],
|
||||
},
|
||||
},
|
||||
ParsedBlock {
|
||||
kind: kb_parse_types::ParsedBlockKind::Code,
|
||||
heading_path: h1_a,
|
||||
source_span: SourceSpan::Line { start: 4, end: 5 },
|
||||
payload: ParsedPayload::Code {
|
||||
lang: None,
|
||||
code: "x".into(),
|
||||
},
|
||||
},
|
||||
ParsedBlock {
|
||||
kind: kb_parse_types::ParsedBlockKind::Paragraph,
|
||||
heading_path: h1_b,
|
||||
source_span: SourceSpan::Line { start: 6, end: 6 },
|
||||
payload: ParsedPayload::Paragraph {
|
||||
text: "q1".into(),
|
||||
inlines: vec![],
|
||||
},
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
/// `id_for_doc` is deterministic across 1000 invocations on the same
|
||||
/// input — a regression in canonical JSON or BLAKE3 would surface
|
||||
/// here immediately.
|
||||
#[test]
|
||||
fn id_for_doc_deterministic_1000() {
|
||||
let path = WorkspacePath::new("a/b.md".into()).unwrap();
|
||||
let asset = AssetId("0123456789abcdef0123456789abcdef".into());
|
||||
let pv = ParserVersion("v1".into());
|
||||
let first = id_for_doc(&path, &asset, &pv);
|
||||
for _ in 0..1000 {
|
||||
assert_eq!(id_for_doc(&path, &asset, &pv), first);
|
||||
}
|
||||
}
|
||||
|
||||
/// NFC vs NFD inputs for the same Korean glyph must produce the
|
||||
/// same `doc_id` because `to_posix` runs NFC normalization.
|
||||
#[test]
|
||||
fn nfc_nfd_korean_path_same_id() {
|
||||
let nfd = to_posix(Path::new("\u{1100}\u{1161}.md")).unwrap();
|
||||
let nfc = to_posix(Path::new("\u{AC00}.md")).unwrap();
|
||||
let asset = AssetId("0123456789abcdef0123456789abcdef".into());
|
||||
let pv = parser_version();
|
||||
assert_eq!(id_for_doc(&nfd, &asset, &pv), id_for_doc(&nfc, &asset, &pv));
|
||||
}
|
||||
|
||||
/// `./a/b.md` and `a/b.md` must collapse to the same POSIX form
|
||||
/// before `id_for_doc`.
|
||||
#[test]
|
||||
fn posix_curdir_collapses_to_same_id() {
|
||||
let a = to_posix(Path::new("./a/b.md")).unwrap();
|
||||
let b = to_posix(Path::new("a/b.md")).unwrap();
|
||||
let asset = AssetId("0123456789abcdef0123456789abcdef".into());
|
||||
let pv = parser_version();
|
||||
assert_eq!(id_for_doc(&a, &asset, &pv), id_for_doc(&b, &asset, &pv));
|
||||
}
|
||||
|
||||
/// Ordinals are scoped to (heading_path, block_kind) per §4.3:
|
||||
/// three paragraphs under H1 → 0/1/2; a code block under the same
|
||||
/// H1 starts a fresh counter at 0; a paragraph under a different
|
||||
/// H1 also starts a fresh counter at 0.
|
||||
#[test]
|
||||
fn block_ordinals_scoped_per_heading_and_kind() {
|
||||
let h1_a = vec!["A".to_string()];
|
||||
let h1_b = vec!["B".to_string()];
|
||||
let blocks = fixture_blocks_five();
|
||||
let asset = fixture_asset();
|
||||
let metadata = fixture_metadata();
|
||||
let pv = parser_version();
|
||||
let doc =
|
||||
build_canonical_document(&asset, metadata, blocks, &pv, vec![]).unwrap();
|
||||
|
||||
// Compute the expected IDs out-of-band so the test pins both
|
||||
// the (heading_path, kind) ordinal grouping AND the value of
|
||||
// each block_id under the recipe.
|
||||
let p1 = id_for_block(
|
||||
&doc.doc_id,
|
||||
"paragraph",
|
||||
&h1_a,
|
||||
0,
|
||||
&SourceSpan::Line { start: 1, end: 1 },
|
||||
);
|
||||
let p2 = id_for_block(
|
||||
&doc.doc_id,
|
||||
"paragraph",
|
||||
&h1_a,
|
||||
1,
|
||||
&SourceSpan::Line { start: 2, end: 2 },
|
||||
);
|
||||
let p3 = id_for_block(
|
||||
&doc.doc_id,
|
||||
"paragraph",
|
||||
&h1_a,
|
||||
2,
|
||||
&SourceSpan::Line { start: 3, end: 3 },
|
||||
);
|
||||
let c0 = id_for_block(
|
||||
&doc.doc_id,
|
||||
"code",
|
||||
&h1_a,
|
||||
0,
|
||||
&SourceSpan::Line { start: 4, end: 5 },
|
||||
);
|
||||
let q0 = id_for_block(
|
||||
&doc.doc_id,
|
||||
"paragraph",
|
||||
&h1_b,
|
||||
0,
|
||||
&SourceSpan::Line { start: 6, end: 6 },
|
||||
);
|
||||
|
||||
let ids: Vec<&BlockId> = doc
|
||||
.blocks
|
||||
.iter()
|
||||
.map(|b| match b {
|
||||
Block::Paragraph(t) | Block::Quote(t) => &t.common.block_id,
|
||||
Block::Heading(h) => &h.common.block_id,
|
||||
Block::List(l) => &l.common.block_id,
|
||||
Block::Code(c) => &c.common.block_id,
|
||||
Block::Table(t) => &t.common.block_id,
|
||||
Block::ImageRef(i) => &i.common.block_id,
|
||||
Block::AudioRef(a) => &a.common.block_id,
|
||||
})
|
||||
.collect();
|
||||
assert_eq!(ids, vec![&p1, &p2, &p3, &c0, &q0]);
|
||||
}
|
||||
|
||||
/// Provenance events appear in the documented order: `Discovered`
|
||||
/// (from the asset), `Parsed`, then `Normalized`. Warnings (none in
|
||||
/// this test) would follow.
|
||||
#[test]
|
||||
fn provenance_contains_stage_events_in_order() {
|
||||
let asset = fixture_asset();
|
||||
let metadata = fixture_metadata();
|
||||
let pv = parser_version();
|
||||
let doc =
|
||||
build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap();
|
||||
let kinds: Vec<_> = doc.provenance.events.iter().map(|e| e.kind).collect();
|
||||
assert_eq!(
|
||||
kinds,
|
||||
vec![
|
||||
ProvenanceKind::Discovered,
|
||||
ProvenanceKind::Parsed,
|
||||
ProvenanceKind::Normalized,
|
||||
]
|
||||
);
|
||||
let events = &doc.provenance.events;
|
||||
assert_eq!(events[0].at, asset.discovered_at);
|
||||
assert_eq!(events[0].agent, "kb-source-fs");
|
||||
assert_eq!(events[1].agent, "kb-parse-md");
|
||||
assert_eq!(events[2].agent, "kb-normalize");
|
||||
// Pin the implementation invariant that Parsed and Normalized
|
||||
// share the single `now_utc()` reading inside one call.
|
||||
assert_eq!(events[1].at, events[2].at, "Parsed and Normalized share now_utc");
|
||||
}
|
||||
|
||||
/// Warnings carried into `build_canonical_document` are emitted as
|
||||
/// `ProvenanceKind::Warning` events with the upstream agent.
|
||||
#[test]
|
||||
fn provenance_includes_warnings() {
|
||||
let asset = fixture_asset();
|
||||
let metadata = fixture_metadata();
|
||||
let pv = parser_version();
|
||||
let warnings = vec![Warning {
|
||||
kind: WarningKind::MalformedFrontmatter,
|
||||
note: "missing closing fence".into(),
|
||||
}];
|
||||
let doc =
|
||||
build_canonical_document(&asset, metadata, vec![], &pv, warnings).unwrap();
|
||||
assert_eq!(doc.provenance.events.len(), 4);
|
||||
let last = doc.provenance.events.last().unwrap();
|
||||
assert_eq!(last.kind, ProvenanceKind::Warning);
|
||||
assert_eq!(last.agent, "kb-parse-md");
|
||||
assert!(last.note.as_deref().unwrap().contains("missing closing fence"));
|
||||
}
|
||||
|
||||
/// `metadata.user["title"]` and `metadata.user["lang"]` are lifted
|
||||
/// to the dedicated `CanonicalDocument` fields and stripped from
|
||||
/// the user map (so the wire form does not duplicate the data).
|
||||
/// Other user keys survive intact.
|
||||
#[test]
|
||||
fn lifts_title_and_lang_from_user_map() {
|
||||
let asset = fixture_asset();
|
||||
let metadata = fixture_metadata();
|
||||
let pv = parser_version();
|
||||
let doc =
|
||||
build_canonical_document(&asset, metadata, vec![], &pv, vec![]).unwrap();
|
||||
assert_eq!(doc.title, "Example");
|
||||
assert_eq!(doc.lang, Lang("en".into()));
|
||||
assert!(!doc.metadata.user.contains_key("title"));
|
||||
assert!(!doc.metadata.user.contains_key("lang"));
|
||||
assert!(doc.metadata.user.contains_key("custom"));
|
||||
}
|
||||
|
||||
/// Determinism property: 1000 iterations of `build_canonical_document`
|
||||
/// over identical inputs produce byte-identical JSON, modulo the two
|
||||
/// non-deterministic `now_utc()` timestamps for the Parsed/Normalized
|
||||
/// events. We strip those timestamps before comparing. Must finish
|
||||
/// within 1 second.
|
||||
#[test]
|
||||
fn determinism_1000_iterations_under_1s() {
|
||||
let asset = fixture_asset();
|
||||
let metadata = fixture_metadata();
|
||||
let pv = parser_version();
|
||||
|
||||
// Helper: serialize and replace every now_utc-derived timestamp
|
||||
// (Parsed, Normalized, and any Warning events) with a sentinel so
|
||||
// the comparison only checks the deterministic fields.
|
||||
fn strip_dynamic_at(doc: &CanonicalDocument) -> Value {
|
||||
let mut v = serde_json::to_value(doc).unwrap();
|
||||
if let Some(events) = v
|
||||
.get_mut("provenance")
|
||||
.and_then(|p| p.get_mut("events"))
|
||||
.and_then(|e| e.as_array_mut())
|
||||
{
|
||||
for (i, ev) in events.iter_mut().enumerate() {
|
||||
// index 0 is Discovered (deterministic — pinned in
|
||||
// the fixture). Strip everything after.
|
||||
if i > 0
|
||||
&& let Some(obj) = ev.as_object_mut()
|
||||
{
|
||||
obj.insert("at".into(), Value::String("<stripped>".into()));
|
||||
}
|
||||
}
|
||||
}
|
||||
v
|
||||
}
|
||||
|
||||
// Use the same 5-block fixture as the ordinal-scoping test so
|
||||
// determinism is exercised on a non-empty `lift_block` path
|
||||
// (block_id hashing, NFC normalization, ordinal counters), not
|
||||
// just an empty Vec.
|
||||
let baseline = build_canonical_document(
|
||||
&asset,
|
||||
metadata.clone(),
|
||||
fixture_blocks_five(),
|
||||
&pv,
|
||||
vec![],
|
||||
)
|
||||
.unwrap();
|
||||
let baseline_json = serde_json::to_string(&strip_dynamic_at(&baseline)).unwrap();
|
||||
|
||||
let start = std::time::Instant::now();
|
||||
for _ in 0..1000 {
|
||||
let next = build_canonical_document(
|
||||
&asset,
|
||||
metadata.clone(),
|
||||
fixture_blocks_five(),
|
||||
&pv,
|
||||
vec![],
|
||||
)
|
||||
.unwrap();
|
||||
let next_json = serde_json::to_string(&strip_dynamic_at(&next)).unwrap();
|
||||
assert_eq!(baseline_json, next_json);
|
||||
}
|
||||
assert!(
|
||||
start.elapsed() < std::time::Duration::from_secs(1),
|
||||
"1000 iterations took {:?}",
|
||||
start.elapsed()
|
||||
);
|
||||
}
|
||||
|
||||
/// I1 regression — `WarningKind::ExtractFailed` is emitted by
|
||||
/// `kb-parse-md` (panic-recovery in `blocks.rs`), so the resulting
|
||||
/// `ProvenanceEvent::agent` must read `"kb-parse-md"`. A regression
|
||||
/// to `"kb-normalize"` would mis-attribute parse panics and break
|
||||
/// stage-filtered debugging.
|
||||
#[test]
|
||||
fn provenance_with_extract_failed_warning_attributes_to_kb_parse_md() {
|
||||
let asset = fixture_asset();
|
||||
let metadata = fixture_metadata();
|
||||
let pv = parser_version();
|
||||
let warnings = vec![Warning {
|
||||
kind: WarningKind::ExtractFailed,
|
||||
note: "pulldown-cmark panicked; body discarded".into(),
|
||||
}];
|
||||
let doc =
|
||||
build_canonical_document(&asset, metadata, vec![], &pv, warnings).unwrap();
|
||||
let warning_event = doc
|
||||
.provenance
|
||||
.events
|
||||
.iter()
|
||||
.find(|e| e.kind == ProvenanceKind::Warning)
|
||||
.expect("warning event present");
|
||||
assert_eq!(warning_event.agent, "kb-parse-md");
|
||||
assert!(
|
||||
warning_event
|
||||
.note
|
||||
.as_deref()
|
||||
.unwrap()
|
||||
.contains("ExtractFailed")
|
||||
);
|
||||
}
|
||||
|
||||
/// I2 regression: a `ParsedPayload::AudioRef` block is dropped instead of
/// being lifted into a `Block::AudioRef` carrying a synthesized empty
/// `AssetId` (which would violate `AssetId::from_str`'s 32-hex invariant).
/// The drop is reported as a `Warning` provenance event attributed to
/// `"kb-normalize"`, because the decision is taken at the lift stage.
#[test]
fn audio_ref_block_skipped_with_warning() {
    let audio_block = ParsedBlock {
        kind: kb_parse_types::ParsedBlockKind::AudioRef,
        heading_path: vec![],
        source_span: SourceSpan::Line { start: 1, end: 1 },
        payload: ParsedPayload::AudioRef {
            src: "voice.m4a".into(),
        },
    };
    let asset = fixture_asset();
    let pv = parser_version();
    let doc = build_canonical_document(
        &asset,
        fixture_metadata(),
        vec![audio_block],
        &pv,
        vec![],
    )
    .unwrap();

    // The canonical output must not contain any AudioRef block.
    let has_audio_ref = doc
        .blocks
        .iter()
        .any(|block| matches!(block, Block::AudioRef(_)));
    assert!(!has_audio_ref, "AudioRef block should be skipped pre-P8");

    // Exactly one Warning event, and it names the skipped source file.
    let warnings: Vec<_> = doc
        .provenance
        .events
        .iter()
        .filter(|event| event.kind == ProvenanceKind::Warning)
        .collect();
    assert_eq!(warnings.len(), 1);

    let only_warning = warnings[0];
    assert_eq!(only_warning.agent, "kb-normalize");
    let note = only_warning.note.as_deref().unwrap();
    assert!(note.contains("voice.m4a"));
}
|
||||
|
||||
/// I3 regression: heading-path strings are NFC-normalized before they are
/// fed into `id_for_block`, so an NFD heading and its canonically
/// equivalent NFC form must yield the same `block_id`. This is the
/// block-level counterpart of `nfc_nfd_korean_path_same_id` for `doc_id`.
#[test]
fn nfc_nfd_korean_heading_path_same_block_id() {
    let asset = fixture_asset();
    let pv = parser_version();

    // Builds a document holding one paragraph filed under `heading`; the
    // heading text is the only thing that differs between the two runs.
    let build_doc = |heading: &str| {
        let paragraph = ParsedBlock {
            kind: kb_parse_types::ParsedBlockKind::Paragraph,
            heading_path: vec![heading.to_string()],
            source_span: SourceSpan::Line { start: 1, end: 1 },
            payload: ParsedPayload::Paragraph {
                text: "p".into(),
                inlines: vec![],
            },
        };
        build_canonical_document(&asset, fixture_metadata(), vec![paragraph], &pv, vec![])
            .unwrap()
    };

    let doc_nfd = build_doc("\u{1100}\u{1161}"); // 가 (NFD)
    let doc_nfc = build_doc("\u{AC00}"); // 가 (NFC)

    let Block::Paragraph(para_nfd) = &doc_nfd.blocks[0] else {
        panic!("expected Paragraph")
    };
    let Block::Paragraph(para_nfc) = &doc_nfc.blocks[0] else {
        panic!("expected Paragraph")
    };
    assert_eq!(
        &para_nfd.common.block_id,
        &para_nfc.common.block_id,
        "NFD and NFC heading paths must hash equal"
    );
}
|
||||
|
||||
/// M7: `metadata.user["title"] = ""` is a string value, so the stated
/// policy is that it is lifted verbatim into `CanonicalDocument.title`
/// rather than being dropped.
///
/// NOTE(review): the test name says "falls back to default" while the
/// policy above says "lifted as-is". The sibling test
/// `title_non_string_in_user_map_silently_drops` documents the fallback
/// title as empty too, so the assertion below presumably cannot tell the
/// two behaviours apart. Confirm which one is intended before relying on
/// this test to pin the policy.
#[test]
fn title_empty_string_in_user_map_falls_back_to_default() {
    let mut metadata = fixture_metadata();
    let empty_title = Value::String(String::new());
    metadata.user.insert(String::from("title"), empty_title);

    let asset = fixture_asset();
    let pv = parser_version();
    let built = build_canonical_document(&asset, metadata, vec![], &pv, vec![]);
    let doc = built.unwrap();

    assert_eq!(doc.title, "");
}
|
||||
|
||||
/// M7: a non-string `metadata.user["title"]` (here the number `42`) is
/// silently ignored, so the document ends up with the fallback default,
/// which is an empty title.
#[test]
fn title_non_string_in_user_map_silently_drops() {
    let mut metadata = fixture_metadata();
    let numeric_title = Value::Number(42.into());
    metadata.user.insert(String::from("title"), numeric_title);

    let asset = fixture_asset();
    let pv = parser_version();
    let built = build_canonical_document(&asset, metadata, vec![], &pv, vec![]);
    let doc = built.unwrap();

    assert_eq!(doc.title, "");
}
|
||||
|
||||
/// M7: a `lang` entry that is not a string (an empty array here) is
/// silently ignored and the document keeps an empty `Lang`. This is a
/// defensive check: P1-2 frontmatter validation is expected to enforce the
/// shape upstream, but normalization does not rely on it.
#[test]
fn lang_invalid_shape_silently_drops() {
    let mut metadata = fixture_metadata();
    let malformed_lang = Value::Array(Vec::new());
    metadata.user.insert(String::from("lang"), malformed_lang);

    let asset = fixture_asset();
    let pv = parser_version();
    let built = build_canonical_document(&asset, metadata, vec![], &pv, vec![]);
    let doc = built.unwrap();

    assert_eq!(doc.lang, Lang(String::new()));
}
|
||||
}
|
||||
160
crates/kb-normalize/tests/normalize_snapshot.rs
Normal file
160
crates/kb-normalize/tests/normalize_snapshot.rs
Normal file
@@ -0,0 +1,160 @@
|
||||
//! Snapshot test pinning the full `CanonicalDocument` JSON for the
|
||||
//! `code-and-table.md` fixture.
|
||||
//!
|
||||
//! This is an integration test (it lives under `tests/`) and depends on
|
||||
//! `kb-parse-md` only as a dev-dep so the production crate's regular
|
||||
//! deps still satisfy the §8 boundary (`cargo tree -p kb-normalize
|
||||
//! --depth 1` without `-e dev` does not list any parser implementation).
|
||||
//!
|
||||
//! Non-deterministic fields are stripped before comparison:
|
||||
//!
|
||||
//! * `provenance.events[*].at` — each invocation calls `now_utc()` for
|
||||
//! the Parsed/Normalized/Warning events. The Discovered event uses
|
||||
//! the asset's pinned `discovered_at`, so we keep that one and replace
|
||||
//! only indices ≥ 1.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kb_core::{
|
||||
AssetId, AssetStorage, Checksum, MediaType, ParserVersion, RawAsset, SourceUri,
|
||||
WorkspacePath,
|
||||
};
|
||||
use kb_normalize::build_canonical_document;
|
||||
use kb_parse_md::{BodyHints, parse_blocks, parse_frontmatter};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("..")
|
||||
.join("..")
|
||||
.join("fixtures")
|
||||
.join("markdown")
|
||||
}
|
||||
|
||||
fn fixed_asset(workspace_path: &str) -> RawAsset {
|
||||
let wp = WorkspacePath::new(workspace_path.into()).unwrap();
|
||||
RawAsset {
|
||||
asset_id: AssetId("a".repeat(32)),
|
||||
source_uri: SourceUri::File(PathBuf::from("/tmp/code-and-table.md")),
|
||||
workspace_path: wp,
|
||||
media_type: MediaType::Markdown,
|
||||
byte_len: 0,
|
||||
checksum: Checksum("0".repeat(64)),
|
||||
// Pin discovered_at so the Discovered provenance event is
|
||||
// deterministic across runs.
|
||||
discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
|
||||
stored: AssetStorage::Reference {
|
||||
path: PathBuf::from("/tmp/code-and-table.md"),
|
||||
sha: Checksum("0".repeat(64)),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn strip_dynamic(mut v: Value) -> Value {
|
||||
if let Some(events) = v
|
||||
.get_mut("provenance")
|
||||
.and_then(|p| p.get_mut("events"))
|
||||
.and_then(|e| e.as_array_mut())
|
||||
{
|
||||
for (i, ev) in events.iter_mut().enumerate() {
|
||||
if i > 0
|
||||
&& let Some(obj) = ev.as_object_mut()
|
||||
{
|
||||
obj.insert("at".into(), Value::String("<stripped>".into()));
|
||||
}
|
||||
}
|
||||
}
|
||||
v
|
||||
}
|
||||
|
||||
/// Parses the `code-and-table.md` fixture, normalizes it with
/// `build_canonical_document`, and compares the resulting JSON (after
/// `strip_dynamic`) with `code-and-table.canonical.snapshot.json`.
///
/// When the `UPDATE_SNAPSHOTS` environment variable is set, a missing or
/// differing baseline is (re)written and the test returns without failing.
#[test]
fn code_and_table_canonical_snapshot() {
    let dir = fixtures_dir();
    let bytes = std::fs::read(dir.join("code-and-table.md")).expect("fixture readable");

    // The fixture has no frontmatter, so the hints carry everything that is
    // lifted into Metadata; the timestamps reuse the asset's pinned
    // `discovered_at` to keep the result reproducible.
    //
    // `first_h1` is set so that the BodyHints -> user.title ->
    // CanonicalDocument.title chain is exercised end-to-end (asserted on
    // `doc.title` further down). Without it the title would stay empty and
    // the chain would not be covered by the snapshot.
    let asset = fixed_asset("notes/code-and-table.md");
    let hints = BodyHints {
        first_h1: Some("Code And Table".into()),
        fs_ctime: asset.discovered_at,
        fs_mtime: asset.discovered_at,
        fallback_lang: Some("en".into()),
    };
    let (mut metadata, fm_span, _fm_warns) =
        parse_frontmatter(&bytes, &hints).expect("frontmatter parses");

    // Without frontmatter the body starts at line 1. The `Some` arm counts
    // the newlines of the prelude; this fixture takes the `None` arm, so
    // that path is kept only for completeness.
    let body_offset_lines: u32 = fm_span.map_or(1, |span| {
        bytes[..span.end].iter().filter(|b| **b == b'\n').count() as u32 + 1
    });
    let (blocks, parse_warns) = parse_blocks(&bytes, body_offset_lines).expect("blocks parse");

    let parser_version = ParserVersion("kb-normalize-snapshot-test-0".into());
    // `created_at` / `updated_at` come from the pinned hints above and are
    // already deterministic; aliases and tags are sorted before the build.
    metadata.aliases.sort();
    metadata.tags.sort();

    let doc = build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns)
        .expect("build_canonical_document");

    // The snapshot pins the title as well, but an explicit assertion gives a
    // clearer failure if the first_h1 -> title lift ever drifts.
    assert_eq!(doc.title, "Code And Table");

    let actual = strip_dynamic(serde_json::to_value(&doc).unwrap());
    let baseline_path = dir.join("code-and-table.canonical.snapshot.json");
    let update_requested = std::env::var("UPDATE_SNAPSHOTS").is_ok();

    // Shared by both "baseline missing" and "baseline differs" update paths.
    let write_baseline = || {
        let pretty = serde_json::to_string_pretty(&actual).unwrap();
        std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap();
    };

    let baseline_text = match std::fs::read_to_string(&baseline_path) {
        Ok(text) => text,
        Err(_) if update_requested => {
            write_baseline();
            return;
        }
        Err(e) => panic!(
            "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
            baseline_path.display()
        ),
    };
    let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json");

    if actual == expected {
        return;
    }
    if update_requested {
        write_baseline();
        eprintln!("updated baseline {}", baseline_path.display());
        return;
    }
    let pretty = serde_json::to_string_pretty(&actual).unwrap();
    panic!(
        "canonical snapshot drift\n--- expected ({}) ---\n{baseline_text}\n--- actual ---\n{pretty}\nIf intentional, re-run with UPDATE_SNAPSHOTS=1.",
        baseline_path.display()
    );
}
|
||||
@@ -300,12 +300,12 @@ impl InlineBuf {
|
||||
|
||||
fn push_text(&mut self, s: &str) {
|
||||
self.text.push_str(s);
|
||||
self.push_inline(Inline::Text(s.to_string()));
|
||||
self.push_inline(Inline::Text { text: s.to_string() });
|
||||
}
|
||||
|
||||
fn push_code(&mut self, s: &str) {
|
||||
self.text.push_str(s);
|
||||
self.push_inline(Inline::Code(s.to_string()));
|
||||
self.push_inline(Inline::Code { code: s.to_string() });
|
||||
}
|
||||
|
||||
fn open_strong(&mut self) {
|
||||
@@ -313,7 +313,7 @@ impl InlineBuf {
|
||||
}
|
||||
fn close_strong(&mut self) {
|
||||
if let Some(InlineFrame::Strong(kids)) = self.stack.pop() {
|
||||
self.push_inline(Inline::Strong(kids));
|
||||
self.push_inline(Inline::Strong { children: kids });
|
||||
}
|
||||
}
|
||||
|
||||
@@ -322,7 +322,7 @@ impl InlineBuf {
|
||||
}
|
||||
fn close_emph(&mut self) {
|
||||
if let Some(InlineFrame::Emph(kids)) = self.stack.pop() {
|
||||
self.push_inline(Inline::Emph(kids));
|
||||
self.push_inline(Inline::Emph { children: kids });
|
||||
}
|
||||
}
|
||||
|
||||
@@ -361,8 +361,8 @@ impl InlineBuf {
|
||||
// If formatting tags were unbalanced we close them defensively.
|
||||
while self.stack.len() > 1 {
|
||||
match self.stack.pop().unwrap() {
|
||||
InlineFrame::Strong(kids) => self.push_inline(Inline::Strong(kids)),
|
||||
InlineFrame::Emph(kids) => self.push_inline(Inline::Emph(kids)),
|
||||
InlineFrame::Strong(kids) => self.push_inline(Inline::Strong { children: kids }),
|
||||
InlineFrame::Emph(kids) => self.push_inline(Inline::Emph { children: kids }),
|
||||
InlineFrame::Link { href, text, kids } => {
|
||||
let flat = if !text.is_empty() {
|
||||
text
|
||||
@@ -475,10 +475,11 @@ fn flatten_inlines_to_text(inlines: &[Inline]) -> String {
|
||||
|
||||
fn flatten_one(i: &Inline, out: &mut String) {
|
||||
match i {
|
||||
Inline::Text(s) | Inline::Code(s) => out.push_str(s),
|
||||
Inline::Text { text } => out.push_str(text),
|
||||
Inline::Code { code } => out.push_str(code),
|
||||
Inline::Link { text, .. } => out.push_str(text),
|
||||
Inline::Strong(v) | Inline::Emph(v) => {
|
||||
for c in v {
|
||||
Inline::Strong { children } | Inline::Emph { children } => {
|
||||
for c in children {
|
||||
flatten_one(c, out);
|
||||
}
|
||||
}
|
||||
@@ -823,7 +824,7 @@ impl<'a> WalkState<'a> {
|
||||
text.push('\n');
|
||||
}
|
||||
text.push_str(t);
|
||||
inlines.push(Inline::Text(t.clone()));
|
||||
inlines.push(Inline::Text { text: t.clone() });
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
@@ -921,7 +922,7 @@ impl<'a> WalkState<'a> {
|
||||
source_span: self.span_for(&range),
|
||||
payload: ParsedPayload::Paragraph {
|
||||
text: raw.clone(),
|
||||
inlines: vec![Inline::Text(raw)],
|
||||
inlines: vec![Inline::Text { text: raw }],
|
||||
},
|
||||
}
|
||||
} else {
|
||||
@@ -1477,7 +1478,7 @@ mod tests {
|
||||
assert!(
|
||||
matches!(
|
||||
inl,
|
||||
Inline::Text(_) | Inline::Code(_) | Inline::Link { .. } | Inline::Strong(_) | Inline::Emph(_)
|
||||
Inline::Text { .. } | Inline::Code { .. } | Inline::Link { .. } | Inline::Strong { .. } | Inline::Emph { .. }
|
||||
),
|
||||
"unexpected inline kind: {:?}",
|
||||
inl
|
||||
@@ -1736,11 +1737,11 @@ mod tests {
|
||||
match &blocks[0].payload {
|
||||
ParsedPayload::Paragraph { inlines, .. } => {
|
||||
let kinds: Vec<&'static str> = inlines.iter().map(|i| match i {
|
||||
Inline::Text(_) => "Text",
|
||||
Inline::Code(_) => "Code",
|
||||
Inline::Text { .. } => "Text",
|
||||
Inline::Code { .. } => "Code",
|
||||
Inline::Link { .. } => "Link",
|
||||
Inline::Strong(_) => "Strong",
|
||||
Inline::Emph(_) => "Emph",
|
||||
Inline::Strong { .. } => "Strong",
|
||||
Inline::Emph { .. } => "Emph",
|
||||
}).collect();
|
||||
assert!(kinds.contains(&"Strong"));
|
||||
assert!(kinds.contains(&"Emph"));
|
||||
|
||||
@@ -379,8 +379,12 @@ fn derive_metadata(
|
||||
|
||||
// ---- title ----
|
||||
// Frontmatter → BodyHints.first_h1 → None.
|
||||
// Filename fallback is the caller's responsibility (P1-4 normalize), per
|
||||
// task brief — `BodyHints` does not carry a filename.
|
||||
// Filename fallback for title is deferred to a later phase (P1-7 or
|
||||
// kb-app integration); the parse_frontmatter -> build_canonical_document
|
||||
// pipeline does not currently know the workspace_path filename component
|
||||
// for fallback. CanonicalDocument.title may be empty for files without
|
||||
// frontmatter title and without an H1; downstream display layer should
|
||||
// fall back to filename via WorkspacePath inspection.
|
||||
let title = raw.title.or_else(|| hints.first_h1.clone());
|
||||
if let Some(t) = title {
|
||||
user.insert("title".to_string(), Value::String(t));
|
||||
|
||||
@@ -4,19 +4,19 @@
|
||||
//! below. `body_offset_lines = 1` is used for both fixtures (no
|
||||
//! frontmatter, body starts at file line 1).
|
||||
//!
|
||||
//! Note on snapshot shape: `kb_core::Inline` carries a `serde(tag = "kind")`
|
||||
//! enum representation that cannot serialize newtype variants holding a
|
||||
//! primitive (`Inline::Text(String)` etc.) — that's a serde limitation, not
|
||||
//! ours, and is fixed up in a later kb-core task. To keep the snapshot
|
||||
//! human-readable (and stable across that future fix), we project each
|
||||
//! `ParsedBlock` into a `BlockView` that flattens inline content to plain
|
||||
//! strings before serialization. This still pins the *contract* that
|
||||
//! matters for P1-3: heading paths, source spans, payload kinds, payload
|
||||
//! text content, table headers/rows, and code lang/body.
|
||||
//! Note: kb-parse-md's snapshot tests use the `#[ignore]` regenerator
|
||||
//! pattern (run `cargo test ... -- --ignored` to refresh baselines),
|
||||
//! whereas `kb-normalize`'s integration test uses an `UPDATE_SNAPSHOTS=1`
|
||||
//! env-var pattern. Migrating kb-parse-md to the env-var style is out of
|
||||
//! scope; both styles are intentional for now.
|
||||
//!
|
||||
//! Following the kb_core::Inline schema migration (struct-variant shape),
|
||||
//! `ParsedBlock` now serializes directly through serde — no projection
|
||||
//! shim is required. Inlines surface as structured objects, e.g.
|
||||
//! `[{"kind":"text","text":"…"},{"kind":"code","code":"…"}]`.
|
||||
|
||||
use kb_core::{Inline, SourceSpan};
|
||||
use kb_parse_md::parse_blocks;
|
||||
use kb_parse_types::{ParsedBlock, ParsedPayload, Warning};
|
||||
use kb_parse_types::{ParsedBlock, Warning};
|
||||
use serde::Serialize;
|
||||
use serde_json::Value;
|
||||
use std::fs;
|
||||
@@ -24,130 +24,10 @@ use std::path::PathBuf;
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct Snapshot {
|
||||
blocks: Vec<BlockView>,
|
||||
blocks: Vec<ParsedBlock>,
|
||||
warnings: Vec<Warning>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct BlockView {
|
||||
kind: String,
|
||||
heading_path: Vec<String>,
|
||||
source_span: SourceSpan,
|
||||
payload: PayloadView,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
#[serde(tag = "kind", rename_all = "lowercase")]
|
||||
enum PayloadView {
|
||||
Heading {
|
||||
level: u8,
|
||||
text: String,
|
||||
},
|
||||
Paragraph {
|
||||
text: String,
|
||||
inlines_flat: String,
|
||||
},
|
||||
List {
|
||||
ordered: bool,
|
||||
items_flat: Vec<String>,
|
||||
},
|
||||
Code {
|
||||
lang: Option<String>,
|
||||
code: String,
|
||||
},
|
||||
Table {
|
||||
headers: Vec<String>,
|
||||
rows: Vec<Vec<String>>,
|
||||
},
|
||||
Quote {
|
||||
text: String,
|
||||
inlines_flat: String,
|
||||
},
|
||||
ImageRef {
|
||||
src: String,
|
||||
alt: String,
|
||||
},
|
||||
AudioRef {
|
||||
src: String,
|
||||
},
|
||||
}
|
||||
|
||||
fn flatten_inline(i: &Inline, out: &mut String) {
|
||||
match i {
|
||||
Inline::Text(s) | Inline::Code(s) => out.push_str(s),
|
||||
Inline::Link { text, href } => {
|
||||
out.push('[');
|
||||
out.push_str(text);
|
||||
out.push_str("](");
|
||||
out.push_str(href);
|
||||
out.push(')');
|
||||
}
|
||||
Inline::Strong(v) => {
|
||||
out.push_str("**");
|
||||
for c in v {
|
||||
flatten_inline(c, out);
|
||||
}
|
||||
out.push_str("**");
|
||||
}
|
||||
Inline::Emph(v) => {
|
||||
out.push('*');
|
||||
for c in v {
|
||||
flatten_inline(c, out);
|
||||
}
|
||||
out.push('*');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn flatten(inlines: &[Inline]) -> String {
|
||||
let mut out = String::new();
|
||||
for i in inlines {
|
||||
flatten_inline(i, &mut out);
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
fn block_to_view(b: &ParsedBlock) -> BlockView {
|
||||
let kind = format!("{:?}", b.kind).to_lowercase();
|
||||
let payload = match &b.payload {
|
||||
ParsedPayload::Heading { level, text } => PayloadView::Heading {
|
||||
level: *level,
|
||||
text: text.clone(),
|
||||
},
|
||||
ParsedPayload::Paragraph { text, inlines } => PayloadView::Paragraph {
|
||||
text: text.clone(),
|
||||
inlines_flat: flatten(inlines),
|
||||
},
|
||||
ParsedPayload::List { ordered, items } => PayloadView::List {
|
||||
ordered: *ordered,
|
||||
items_flat: items.iter().map(|it| flatten(it)).collect(),
|
||||
},
|
||||
ParsedPayload::Code { lang, code } => PayloadView::Code {
|
||||
lang: lang.clone(),
|
||||
code: code.clone(),
|
||||
},
|
||||
ParsedPayload::Table { headers, rows } => PayloadView::Table {
|
||||
headers: headers.clone(),
|
||||
rows: rows.clone(),
|
||||
},
|
||||
ParsedPayload::Quote { text, inlines } => PayloadView::Quote {
|
||||
text: text.clone(),
|
||||
inlines_flat: flatten(inlines),
|
||||
},
|
||||
ParsedPayload::ImageRef { src, alt } => PayloadView::ImageRef {
|
||||
src: src.clone(),
|
||||
alt: alt.clone(),
|
||||
},
|
||||
ParsedPayload::AudioRef { src } => PayloadView::AudioRef { src: src.clone() },
|
||||
};
|
||||
BlockView {
|
||||
kind,
|
||||
heading_path: b.heading_path.clone(),
|
||||
source_span: b.source_span.clone(),
|
||||
payload,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("..")
|
||||
@@ -162,7 +42,7 @@ fn assert_snapshot(fixture: &str, baseline: &str) {
|
||||
|
||||
let (blocks, warns) = parse_blocks(&bytes, 1).unwrap();
|
||||
let snap = Snapshot {
|
||||
blocks: blocks.iter().map(block_to_view).collect(),
|
||||
blocks,
|
||||
warnings: warns,
|
||||
};
|
||||
let actual: Value = serde_json::to_value(&snap).unwrap();
|
||||
@@ -211,7 +91,7 @@ fn emit_blocks_snapshots() {
|
||||
let bytes = fs::read(dir.join(fixture)).unwrap();
|
||||
let (blocks, warns) = parse_blocks(&bytes, 1).unwrap();
|
||||
let snap = Snapshot {
|
||||
blocks: blocks.iter().map(block_to_view).collect(),
|
||||
blocks,
|
||||
warnings: warns,
|
||||
};
|
||||
let json = serde_json::to_string_pretty(&snap).unwrap();
|
||||
@@ -227,14 +107,10 @@ fn snapshot_is_deterministic_across_runs() {
|
||||
let bytes = fs::read(dir.join("nested-headings.md")).unwrap();
|
||||
let (a_blocks, a_warns) = parse_blocks(&bytes, 1).unwrap();
|
||||
let (b_blocks, b_warns) = parse_blocks(&bytes, 1).unwrap();
|
||||
// Compare via the view (which is fully serializable) and via the
|
||||
// structural equality on `ParsedBlock` itself (no serde involved).
|
||||
assert_eq!(a_blocks, b_blocks);
|
||||
assert_eq!(a_warns, b_warns);
|
||||
let av: Vec<_> = a_blocks.iter().map(block_to_view).collect();
|
||||
let bv: Vec<_> = b_blocks.iter().map(block_to_view).collect();
|
||||
assert_eq!(
|
||||
serde_json::to_value(&av).unwrap(),
|
||||
serde_json::to_value(&bv).unwrap()
|
||||
serde_json::to_value(&a_blocks).unwrap(),
|
||||
serde_json::to_value(&b_blocks).unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
102
fixtures/markdown/code-and-table.canonical.snapshot.json
Normal file
102
fixtures/markdown/code-and-table.canonical.snapshot.json
Normal file
@@ -0,0 +1,102 @@
|
||||
{
|
||||
"blocks": [
|
||||
{
|
||||
"common": {
|
||||
"block_id": "dd1528c6e84d8a66087cbf6faafd67c6",
|
||||
"heading_path": [],
|
||||
"source_span": {
|
||||
"end": 1,
|
||||
"kind": "line",
|
||||
"start": 1
|
||||
}
|
||||
},
|
||||
"kind": "heading",
|
||||
"level": 1,
|
||||
"text": "Code And Table"
|
||||
},
|
||||
{
|
||||
"code": "fn main() {\n println!(\"hi\");\n}",
|
||||
"common": {
|
||||
"block_id": "68ea34aca04b83413dd8556126ae4584",
|
||||
"heading_path": [
|
||||
"Code And Table"
|
||||
],
|
||||
"source_span": {
|
||||
"end": 7,
|
||||
"kind": "line",
|
||||
"start": 3
|
||||
}
|
||||
},
|
||||
"kind": "code",
|
||||
"lang": "rust"
|
||||
},
|
||||
{
|
||||
"common": {
|
||||
"block_id": "b50a8e941b11f1834ae17adba9e08118",
|
||||
"heading_path": [
|
||||
"Code And Table"
|
||||
],
|
||||
"source_span": {
|
||||
"end": 12,
|
||||
"kind": "line",
|
||||
"start": 9
|
||||
}
|
||||
},
|
||||
"headers": [
|
||||
"col a",
|
||||
"col b"
|
||||
],
|
||||
"kind": "table",
|
||||
"rows": [
|
||||
[
|
||||
"1",
|
||||
"2"
|
||||
],
|
||||
[
|
||||
"3",
|
||||
"4"
|
||||
]
|
||||
]
|
||||
}
|
||||
],
|
||||
"doc_id": "6a9ef317c9c097ff3f6aeb317559bd83",
|
||||
"doc_version": 1,
|
||||
"lang": "en",
|
||||
"metadata": {
|
||||
"aliases": [],
|
||||
"created_at": "2023-11-14T22:13:20Z",
|
||||
"source_type": "markdown",
|
||||
"tags": [],
|
||||
"trust_level": "primary",
|
||||
"updated_at": "2023-11-14T22:13:20Z",
|
||||
"user": {},
|
||||
"user_id_alias": null
|
||||
},
|
||||
"parser_version": "kb-normalize-snapshot-test-0",
|
||||
"provenance": {
|
||||
"events": [
|
||||
{
|
||||
"agent": "kb-source-fs",
|
||||
"at": "2023-11-14T22:13:20Z",
|
||||
"kind": "discovered",
|
||||
"note": null
|
||||
},
|
||||
{
|
||||
"agent": "kb-parse-md",
|
||||
"at": "<stripped>",
|
||||
"kind": "parsed",
|
||||
"note": "parser_version=kb-normalize-snapshot-test-0"
|
||||
},
|
||||
{
|
||||
"agent": "kb-normalize",
|
||||
"at": "<stripped>",
|
||||
"kind": "normalized",
|
||||
"note": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"schema_version": 1,
|
||||
"source_asset_id": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
|
||||
"title": "Code And Table",
|
||||
"workspace_path": "notes/code-and-table.md"
|
||||
}
|
||||
@@ -27,7 +27,12 @@
|
||||
"payload": {
|
||||
"kind": "paragraph",
|
||||
"text": "intro",
|
||||
"inlines_flat": "intro"
|
||||
"inlines": [
|
||||
{
|
||||
"kind": "text",
|
||||
"text": "intro"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
@@ -60,7 +65,12 @@
|
||||
"payload": {
|
||||
"kind": "paragraph",
|
||||
"text": "body of A",
|
||||
"inlines_flat": "body of A"
|
||||
"inlines": [
|
||||
{
|
||||
"kind": "text",
|
||||
"text": "body of A"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
@@ -95,7 +105,12 @@
|
||||
"payload": {
|
||||
"kind": "paragraph",
|
||||
"text": "deeper",
|
||||
"inlines_flat": "deeper"
|
||||
"inlines": [
|
||||
{
|
||||
"kind": "text",
|
||||
"text": "deeper"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
@@ -128,7 +143,12 @@
|
||||
"payload": {
|
||||
"kind": "paragraph",
|
||||
"text": "body of B",
|
||||
"inlines_flat": "body of B"
|
||||
"inlines": [
|
||||
{
|
||||
"kind": "text",
|
||||
"text": "body of B"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
],
|
||||
|
||||
Reference in New Issue
Block a user