diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs
index 1ddfaa6..86913de 100644
--- a/crates/kb-parse-md/src/blocks.rs
+++ b/crates/kb-parse-md/src/blocks.rs
@@ -528,7 +528,7 @@ impl<'a> WalkState<'a> {
         let _ = level_u8;
 
         // Update heading stack: clear deeper levels, set this level.
-        if level_to_use >= 1 && level_to_use <= 6 {
+        if (1..=6).contains(&level_to_use) {
             let idx = (level_to_use - 1) as usize;
             for slot in &mut self.heading_stack[idx + 1..] {
                 *slot = None;
diff --git a/crates/kb-parse-md/tests/blocks_snapshots.rs b/crates/kb-parse-md/tests/blocks_snapshots.rs
new file mode 100644
index 0000000..483286e
--- /dev/null
+++ b/crates/kb-parse-md/tests/blocks_snapshots.rs
@@ -0,0 +1,254 @@
+//! Snapshot tests pinning the `parse_blocks` output for two fixtures.
+//!
+//! Baselines are hand-authored / regenerated via the `--ignored` emitter
+//! below. `body_offset_lines = 1` is used for both fixtures (no
+//! frontmatter, body starts at file line 1).
+//!
+//! Note on snapshot shape: `kb_core::Inline` carries a `serde(tag = "kind")`
+//! enum representation that cannot serialize newtype variants holding a
+//! primitive (`Inline::Text(String)` etc.) — that's a serde limitation, not
+//! ours, and is fixed up in a later kb-core task. To keep the snapshot
+//! human-readable (and stable across that future fix), we project each
+//! `ParsedBlock` into a `BlockView` that flattens inline content to plain
+//! strings before serialization. This still pins the *contract* that
+//! matters for P1-3: heading paths, source spans, payload kinds, payload
+//! text content, table headers/rows, and code lang/body.
+
+use kb_core::{Inline, SourceSpan};
+use kb_parse_md::parse_blocks;
+use kb_parse_types::{ParsedBlock, ParsedPayload, Warning};
+use serde::Serialize;
+use serde_json::Value;
+use std::fs;
+use std::path::PathBuf;
+
+/// Top-level JSON document compared against each `*.blocks.snapshot.json`.
+#[derive(Serialize)]
+struct Snapshot {
+    blocks: Vec<BlockView>,
+    warnings: Vec<Warning>,
+}
+
+/// Serializable projection of one `ParsedBlock` with inline content flattened.
+#[derive(Serialize)]
+struct BlockView {
+    kind: String,
+    heading_path: Vec<String>,
+    source_span: SourceSpan,
+    payload: PayloadView,
+}
+
+/// Serializable mirror of `ParsedPayload`; inline vectors become plain strings.
+#[derive(Serialize)]
+#[serde(tag = "kind", rename_all = "lowercase")]
+enum PayloadView {
+    Heading {
+        level: u8,
+        text: String,
+    },
+    Paragraph {
+        text: String,
+        inlines_flat: String,
+    },
+    List {
+        ordered: bool,
+        items_flat: Vec<String>,
+    },
+    Code {
+        lang: Option<String>,
+        code: String,
+    },
+    Table {
+        headers: Vec<String>,
+        rows: Vec<Vec<String>>,
+    },
+    Quote {
+        text: String,
+        inlines_flat: String,
+    },
+    ImageRef {
+        src: String,
+        alt: String,
+    },
+    AudioRef {
+        src: String,
+    },
+}
+
+/// Appends a Markdown-like rendering of `i` to `out`: text and code verbatim,
+/// links as `[text](href)`, strong wrapped in `**`, emphasis wrapped in `*`.
+fn flatten_inline(i: &Inline, out: &mut String) {
+    match i {
+        Inline::Text(s) | Inline::Code(s) => out.push_str(s),
+        Inline::Link { text, href } => {
+            out.push('[');
+            out.push_str(text);
+            out.push_str("](");
+            out.push_str(href);
+            out.push(')');
+        }
+        Inline::Strong(v) => {
+            out.push_str("**");
+            for c in v {
+                flatten_inline(c, out);
+            }
+            out.push_str("**");
+        }
+        Inline::Emph(v) => {
+            out.push('*');
+            for c in v {
+                flatten_inline(c, out);
+            }
+            out.push('*');
+        }
+    }
+}
+
+/// Concatenates the flattened form of every inline in `inlines`.
+fn flatten(inlines: &[Inline]) -> String {
+    let mut out = String::new();
+    for i in inlines {
+        flatten_inline(i, &mut out);
+    }
+    out
+}
+
+/// Projects a parsed block into its serializable `BlockView`.
+///
+/// `kind` is the lowercased `Debug` rendering of `b.kind`.
+fn block_to_view(b: &ParsedBlock) -> BlockView {
+    let kind = format!("{:?}", b.kind).to_lowercase();
+    let payload = match &b.payload {
+        ParsedPayload::Heading { level, text } => PayloadView::Heading {
+            level: *level,
+            text: text.clone(),
+        },
+        ParsedPayload::Paragraph { text, inlines } => PayloadView::Paragraph {
+            text: text.clone(),
+            inlines_flat: flatten(inlines),
+        },
+        ParsedPayload::List { ordered, items } => PayloadView::List {
+            ordered: *ordered,
+            items_flat: items.iter().map(|it| flatten(it)).collect(),
+        },
+        ParsedPayload::Code { lang, code } => PayloadView::Code {
+            lang: lang.clone(),
+            code: code.clone(),
+        },
+        ParsedPayload::Table { headers, rows } => PayloadView::Table {
+            headers: headers.clone(),
+            rows: rows.clone(),
+        },
+        ParsedPayload::Quote { text, inlines } => PayloadView::Quote {
+            text: text.clone(),
+            inlines_flat: flatten(inlines),
+        },
+        ParsedPayload::ImageRef { src, alt } => PayloadView::ImageRef {
+            src: src.clone(),
+            alt: alt.clone(),
+        },
+        ParsedPayload::AudioRef { src } => PayloadView::AudioRef { src: src.clone() },
+    };
+    BlockView {
+        kind,
+        heading_path: b.heading_path.clone(),
+        source_span: b.source_span.clone(),
+        payload,
+    }
+}
+
+/// Directory holding the markdown fixtures and their snapshot baselines:
+/// `fixtures/markdown`, two levels above this crate's manifest directory.
+fn fixtures_dir() -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("..")
+        .join("..")
+        .join("fixtures")
+        .join("markdown")
+}
+
+/// Parses `fixture` (a file name inside `fixtures_dir()`) with
+/// `body_offset_lines = 1` and projects the result into a `Snapshot`.
+fn build_snapshot(fixture: &str) -> Snapshot {
+    let bytes = fs::read(fixtures_dir().join(fixture)).expect("fixture readable");
+    let (blocks, warns) = parse_blocks(&bytes, 1).expect("fixture parses");
+    Snapshot {
+        blocks: blocks.iter().map(block_to_view).collect(),
+        warnings: warns,
+    }
+}
+
+/// Asserts that the parser output for `fixture` matches the JSON in `baseline`.
+///
+/// Both sides are compared as `serde_json::Value`, so baseline formatting is
+/// irrelevant; on mismatch the panic message prints both documents.
+fn assert_snapshot(fixture: &str, baseline: &str) {
+    let snap = build_snapshot(fixture);
+    let actual: Value = serde_json::to_value(&snap).unwrap();
+
+    let expected_text = fs::read_to_string(fixtures_dir().join(baseline))
+        .expect("snapshot baseline readable");
+    let expected: Value = serde_json::from_str(&expected_text).expect("baseline parses as json");
+
+    if actual != expected {
+        let actual_pretty = serde_json::to_string_pretty(&actual).unwrap();
+        panic!(
+            "snapshot drift for {fixture}\n\
+             --- expected ({baseline}) ---\n{expected_text}\n\
+             --- actual ---\n{actual_pretty}\n\
+             If the change is intentional, update {baseline}."
+        );
+    }
+}
+
+#[test]
+fn nested_headings_blocks_snapshot() {
+    assert_snapshot(
+        "nested-headings.md",
+        "nested-headings.blocks.snapshot.json",
+    );
+}
+
+#[test]
+fn code_and_table_blocks_snapshot() {
+    assert_snapshot(
+        "code-and-table.md",
+        "code-and-table.blocks.snapshot.json",
+    );
+}
+
+/// Run with `cargo test -p kb-parse-md --test blocks_snapshots emit_blocks_snapshots -- --ignored --nocapture`
+/// to regenerate the baseline JSON files from the current parser output.
+#[test]
+#[ignore]
+fn emit_blocks_snapshots() {
+    let dir = fixtures_dir();
+    for (fixture, baseline) in [
+        ("nested-headings.md", "nested-headings.blocks.snapshot.json"),
+        ("code-and-table.md", "code-and-table.blocks.snapshot.json"),
+    ] {
+        let json = serde_json::to_string_pretty(&build_snapshot(fixture)).unwrap();
+        let target = dir.join(baseline);
+        fs::write(&target, format!("{json}\n")).unwrap();
+        eprintln!("wrote {}", target.display());
+    }
+}
+
+/// Determinism: parsing the same fixture twice in a row must give equal output.
+#[test]
+fn snapshot_is_deterministic_across_runs() {
+    let dir = fixtures_dir();
+    let bytes = fs::read(dir.join("nested-headings.md")).unwrap();
+    let (a_blocks, a_warns) = parse_blocks(&bytes, 1).unwrap();
+    let (b_blocks, b_warns) = parse_blocks(&bytes, 1).unwrap();
+    // Compare via the view (which is fully serializable) and via the
+    // structural equality on `ParsedBlock` itself (no serde involved).
+    assert_eq!(a_blocks, b_blocks);
+    assert_eq!(a_warns, b_warns);
+    let av: Vec<_> = a_blocks.iter().map(block_to_view).collect();
+    let bv: Vec<_> = b_blocks.iter().map(block_to_view).collect();
+    assert_eq!(
+        serde_json::to_value(&av).unwrap(),
+        serde_json::to_value(&bv).unwrap()
+    );
+}
diff --git a/fixtures/markdown/code-and-table.blocks.snapshot.json b/fixtures/markdown/code-and-table.blocks.snapshot.json
new file mode 100644
index 0000000..8335e43
--- /dev/null
+++ b/fixtures/markdown/code-and-table.blocks.snapshot.json
@@ -0,0 +1,63 @@
+{
+  "blocks": [
+    {
+      "kind": "heading",
+      "heading_path": [],
+      "source_span": {
+        "kind": "line",
+        "start": 1,
+        "end": 1
+      },
+      "payload": {
+        "kind": "heading",
+        "level": 1,
+        "text": "Code And Table"
+      }
+    },
+    {
+      "kind": "code",
+      "heading_path": [
+        "Code And Table"
+      ],
+      "source_span": {
+        "kind": "line",
+        "start": 3,
+        "end": 7
+      },
+      "payload": {
+        "kind": "code",
+        "lang": "rust",
+        "code": "fn main() {\n println!(\"hi\");\n}"
+      }
+    },
+    {
+      "kind": "table",
+      "heading_path": [
+        "Code And Table"
+      ],
+      "source_span": {
+        "kind": "line",
+        "start": 9,
+        "end": 12
+      },
+      "payload": {
+        "kind": "table",
+        "headers": [
+          "col a",
+          "col b"
+        ],
+        "rows": [
+          [
+            "1",
+            "2"
+          ],
+          [
+            "3",
+            "4"
+          ]
+        ]
+      }
+    }
+  ],
+  "warnings": []
+}
diff --git a/fixtures/markdown/nested-headings.blocks.snapshot.json b/fixtures/markdown/nested-headings.blocks.snapshot.json
new file mode 100644
index 0000000..032c3e6
--- /dev/null
+++ b/fixtures/markdown/nested-headings.blocks.snapshot.json
@@ -0,0 +1,136 @@
+{
+  "blocks": [
+    {
+      "kind": "heading",
+      "heading_path": [],
+      "source_span": {
+        "kind": "line",
+        "start": 1,
+        "end": 1
+      },
+      "payload": {
+        "kind": "heading",
+        "level": 1,
+        "text": "Top"
+      }
+    },
+    {
+      "kind": "paragraph",
+      "heading_path": [
+        "Top"
+      ],
+      "source_span": {
+        "kind": "line",
+        "start": 3,
+        "end": 3
+      },
+      "payload": {
+        "kind": "paragraph",
+        "text": "intro",
+        "inlines_flat": "intro"
+      }
+    },
+    {
+      "kind": "heading",
+      "heading_path": [
+        "Top"
+      ],
+      "source_span": {
+        "kind": "line",
+        "start": 5,
+        "end": 5
+      },
+      "payload": {
+        "kind": "heading",
+        "level": 2,
+        "text": "Section A"
+      }
+    },
+    {
+      "kind": "paragraph",
+      "heading_path": [
+        "Top",
+        "Section A"
+      ],
+      "source_span": {
+        "kind": "line",
+        "start": 7,
+        "end": 7
+      },
+      "payload": {
+        "kind": "paragraph",
+        "text": "body of A",
+        "inlines_flat": "body of A"
+      }
+    },
+    {
+      "kind": "heading",
+      "heading_path": [
+        "Top",
+        "Section A"
+      ],
+      "source_span": {
+        "kind": "line",
+        "start": 9,
+        "end": 9
+      },
+      "payload": {
+        "kind": "heading",
+        "level": 3,
+        "text": "Sub A.1"
+      }
+    },
+    {
+      "kind": "paragraph",
+      "heading_path": [
+        "Top",
+        "Section A",
+        "Sub A.1"
+      ],
+      "source_span": {
+        "kind": "line",
+        "start": 11,
+        "end": 11
+      },
+      "payload": {
+        "kind": "paragraph",
+        "text": "deeper",
+        "inlines_flat": "deeper"
+      }
+    },
+    {
+      "kind": "heading",
+      "heading_path": [
+        "Top"
+      ],
+      "source_span": {
+        "kind": "line",
+        "start": 13,
+        "end": 13
+      },
+      "payload": {
+        "kind": "heading",
+        "level": 2,
+        "text": "Section B"
+      }
+    },
+    {
+      "kind": "paragraph",
+      "heading_path": [
+        "Top",
+        "Section B"
+      ],
+      "source_span": {
+        "kind": "line",
+        "start": 15,
+        "end": 15
+      },
+      "payload": {
+        "kind": "paragraph",
+        "text": "body of B",
+        "inlines_flat": "body of B"
+      }
+    }
+  ],
+  "warnings": []
+}