feat(p1-3): kb-parse-md blocks (Markdown body → ParsedBlock tree) #8
19
Cargo.lock
generated
19
Cargo.lock
generated
@@ -585,11 +585,13 @@ dependencies = [
|
||||
"kb-core",
|
||||
"kb-parse-types",
|
||||
"lingua",
|
||||
"pulldown-cmark",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_yaml_ng",
|
||||
"time",
|
||||
"toml",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -834,6 +836,17 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pulldown-cmark"
|
||||
version = "0.13.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c3a14896dfa883796f1cb410461aef38810ea05f2b2c33c5aded3649095fdad"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"memchr",
|
||||
"unicase",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.45"
|
||||
@@ -1367,6 +1380,12 @@ dependencies = [
|
||||
"tracing-serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicase"
|
||||
version = "2.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.24"
|
||||
|
||||
@@ -5,7 +5,7 @@ edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "Markdown frontmatter (and, in p1-3, block) parsing into kb-core::Metadata / kb-parse-types intermediates"
|
||||
description = "Markdown frontmatter and block parsing into kb-core::Metadata / kb-parse-types intermediates"
|
||||
|
||||
[dependencies]
|
||||
kb-core = { path = "../kb-core" }
|
||||
@@ -14,6 +14,12 @@ anyhow = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
# pulldown-cmark is the CommonMark parser used by the `blocks` submodule.
|
||||
# GFM tables are gated by the runtime `Options::ENABLE_TABLES` flag, not a
|
||||
# cargo feature; we strip the default `getopts` + `html` features since we
|
||||
# only use the pull-parser API.
|
||||
pulldown-cmark = { version = "0.13", default-features = false }
|
||||
# serde_yaml (dtolnay) was archived as unmaintained in 2024.
|
||||
# We use the maintained fork serde_yaml_ng. Keeping the same `serde_yaml`-style
|
||||
# API surface lets us swap if a different fork wins long term.
|
||||
|
||||
1787
crates/kb-parse-md/src/blocks.rs
Normal file
1787
crates/kb-parse-md/src/blocks.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,19 +1,21 @@
|
||||
//! `kb-parse-md` — Markdown parsing for the KB pipeline (§3.7b).
|
||||
//!
|
||||
//! P1-2 implements the **frontmatter** submodule only. P1-3 will add a
|
||||
//! sibling `blocks` submodule for block parsing using `pulldown-cmark`.
|
||||
//!
|
||||
//! Public surface for P1-2 is intentionally narrow:
|
||||
//! Public surface:
|
||||
//!
|
||||
//! * [`parse_frontmatter`] — pure function from Markdown bytes to
|
||||
//! `(Metadata, Option<FrontmatterSpan>, Vec<Warning>)`.
|
||||
//! `(Metadata, Option<FrontmatterSpan>, Vec<Warning>)` (P1-2).
|
||||
//! * [`BodyHints`] — caller-supplied fallbacks that feed the §0 Q9 derive
|
||||
//! table when frontmatter is missing or partial.
|
||||
//! table when frontmatter is missing or partial (P1-2).
|
||||
//! * [`FrontmatterSpan`] — byte offsets of the frontmatter region in the
|
||||
//! input slice (returned by [`parse_frontmatter`]).
|
||||
//! input slice (returned by [`parse_frontmatter`]) (P1-2).
|
||||
//! * [`parse_blocks`] — pure function from Markdown body bytes to
|
||||
//! `(Vec<ParsedBlock>, Vec<Warning>)` with heading paths and 1-indexed
|
||||
//! `SourceSpan::Line` ranges relative to the original file (P1-3).
|
||||
//!
|
||||
//! Anything else in this crate is `pub(crate)` and may change without notice.
|
||||
|
||||
pub mod blocks;
|
||||
pub mod frontmatter;
|
||||
|
||||
pub use blocks::parse_blocks;
|
||||
pub use frontmatter::{BodyHints, FrontmatterSpan, parse_frontmatter};
|
||||
|
||||
240
crates/kb-parse-md/tests/blocks_snapshots.rs
Normal file
240
crates/kb-parse-md/tests/blocks_snapshots.rs
Normal file
@@ -0,0 +1,240 @@
|
||||
//! Snapshot tests pinning the `parse_blocks` output for two fixtures.
|
||||
//!
|
||||
//! Baselines are hand-authored / regenerated via the `--ignored` emitter
|
||||
//! below. `body_offset_lines = 1` is used for both fixtures (no
|
||||
//! frontmatter, body starts at file line 1).
|
||||
//!
|
||||
//! Note on snapshot shape: `kb_core::Inline` carries a `serde(tag = "kind")`
|
||||
//! enum representation that cannot serialize newtype variants holding a
|
||||
//! primitive (`Inline::Text(String)` etc.) — that's a serde limitation, not
|
||||
//! ours, and is fixed up in a later kb-core task. To keep the snapshot
|
||||
//! human-readable (and stable across that future fix), we project each
|
||||
//! `ParsedBlock` into a `BlockView` that flattens inline content to plain
|
||||
//! strings before serialization. This still pins the *contract* that
|
||||
//! matters for P1-3: heading paths, source spans, payload kinds, payload
|
||||
//! text content, table headers/rows, and code lang/body.
|
||||
|
||||
use kb_core::{Inline, SourceSpan};
|
||||
use kb_parse_md::parse_blocks;
|
||||
use kb_parse_types::{ParsedBlock, ParsedPayload, Warning};
|
||||
use serde::Serialize;
|
||||
use serde_json::Value;
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct Snapshot {
|
||||
blocks: Vec<BlockView>,
|
||||
warnings: Vec<Warning>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct BlockView {
|
||||
kind: String,
|
||||
heading_path: Vec<String>,
|
||||
source_span: SourceSpan,
|
||||
payload: PayloadView,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
#[serde(tag = "kind", rename_all = "lowercase")]
|
||||
enum PayloadView {
|
||||
Heading {
|
||||
level: u8,
|
||||
text: String,
|
||||
},
|
||||
Paragraph {
|
||||
text: String,
|
||||
inlines_flat: String,
|
||||
},
|
||||
List {
|
||||
ordered: bool,
|
||||
items_flat: Vec<String>,
|
||||
},
|
||||
Code {
|
||||
lang: Option<String>,
|
||||
code: String,
|
||||
},
|
||||
Table {
|
||||
headers: Vec<String>,
|
||||
rows: Vec<Vec<String>>,
|
||||
},
|
||||
Quote {
|
||||
text: String,
|
||||
inlines_flat: String,
|
||||
},
|
||||
ImageRef {
|
||||
src: String,
|
||||
alt: String,
|
||||
},
|
||||
AudioRef {
|
||||
src: String,
|
||||
},
|
||||
}
|
||||
|
||||
fn flatten_inline(i: &Inline, out: &mut String) {
|
||||
match i {
|
||||
Inline::Text(s) | Inline::Code(s) => out.push_str(s),
|
||||
Inline::Link { text, href } => {
|
||||
out.push('[');
|
||||
out.push_str(text);
|
||||
out.push_str("](");
|
||||
out.push_str(href);
|
||||
out.push(')');
|
||||
}
|
||||
Inline::Strong(v) => {
|
||||
out.push_str("**");
|
||||
for c in v {
|
||||
flatten_inline(c, out);
|
||||
}
|
||||
out.push_str("**");
|
||||
}
|
||||
Inline::Emph(v) => {
|
||||
out.push('*');
|
||||
for c in v {
|
||||
flatten_inline(c, out);
|
||||
}
|
||||
out.push('*');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn flatten(inlines: &[Inline]) -> String {
|
||||
let mut out = String::new();
|
||||
for i in inlines {
|
||||
flatten_inline(i, &mut out);
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
fn block_to_view(b: &ParsedBlock) -> BlockView {
|
||||
let kind = format!("{:?}", b.kind).to_lowercase();
|
||||
let payload = match &b.payload {
|
||||
ParsedPayload::Heading { level, text } => PayloadView::Heading {
|
||||
level: *level,
|
||||
text: text.clone(),
|
||||
},
|
||||
ParsedPayload::Paragraph { text, inlines } => PayloadView::Paragraph {
|
||||
text: text.clone(),
|
||||
inlines_flat: flatten(inlines),
|
||||
},
|
||||
ParsedPayload::List { ordered, items } => PayloadView::List {
|
||||
ordered: *ordered,
|
||||
items_flat: items.iter().map(|it| flatten(it)).collect(),
|
||||
},
|
||||
ParsedPayload::Code { lang, code } => PayloadView::Code {
|
||||
lang: lang.clone(),
|
||||
code: code.clone(),
|
||||
},
|
||||
ParsedPayload::Table { headers, rows } => PayloadView::Table {
|
||||
headers: headers.clone(),
|
||||
rows: rows.clone(),
|
||||
},
|
||||
ParsedPayload::Quote { text, inlines } => PayloadView::Quote {
|
||||
text: text.clone(),
|
||||
inlines_flat: flatten(inlines),
|
||||
},
|
||||
ParsedPayload::ImageRef { src, alt } => PayloadView::ImageRef {
|
||||
src: src.clone(),
|
||||
alt: alt.clone(),
|
||||
},
|
||||
ParsedPayload::AudioRef { src } => PayloadView::AudioRef { src: src.clone() },
|
||||
};
|
||||
BlockView {
|
||||
kind,
|
||||
heading_path: b.heading_path.clone(),
|
||||
source_span: b.source_span.clone(),
|
||||
payload,
|
||||
}
|
||||
}
|
||||
|
||||
fn fixtures_dir() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("..")
|
||||
.join("..")
|
||||
.join("fixtures")
|
||||
.join("markdown")
|
||||
}
|
||||
|
||||
fn assert_snapshot(fixture: &str, baseline: &str) {
|
||||
let dir = fixtures_dir();
|
||||
let bytes = fs::read(dir.join(fixture)).expect("fixture readable");
|
||||
|
||||
let (blocks, warns) = parse_blocks(&bytes, 1).unwrap();
|
||||
let snap = Snapshot {
|
||||
blocks: blocks.iter().map(block_to_view).collect(),
|
||||
warnings: warns,
|
||||
};
|
||||
let actual: Value = serde_json::to_value(&snap).unwrap();
|
||||
|
||||
let expected_text =
|
||||
fs::read_to_string(dir.join(baseline)).expect("snapshot baseline readable");
|
||||
let expected: Value = serde_json::from_str(&expected_text).expect("baseline parses as json");
|
||||
|
||||
if actual != expected {
|
||||
let actual_pretty = serde_json::to_string_pretty(&actual).unwrap();
|
||||
panic!(
|
||||
"snapshot drift for {fixture}\n\
|
||||
--- expected ({baseline}) ---\n{expected_text}\n\
|
||||
--- actual ---\n{actual_pretty}\n\
|
||||
If the change is intentional, update {baseline}."
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Pins the `parse_blocks` output for the nested-headings fixture.
#[test]
fn nested_headings_blocks_snapshot() {
    let fixture = "nested-headings.md";
    let baseline = "nested-headings.blocks.snapshot.json";
    assert_snapshot(fixture, baseline);
}
|
||||
|
||||
/// Pins the `parse_blocks` output for the code-and-table fixture.
#[test]
fn code_and_table_blocks_snapshot() {
    let fixture = "code-and-table.md";
    let baseline = "code-and-table.blocks.snapshot.json";
    assert_snapshot(fixture, baseline);
}
|
||||
|
||||
/// Baseline emitter: overwrites each snapshot JSON file with the current
/// `parse_blocks` output (pretty-printed, with a trailing newline).
///
/// Run with `cargo test -p kb-parse-md --test blocks_snapshots emit_blocks_snapshots -- --ignored --nocapture`
/// to regenerate the baseline JSON files from the current parser output.
#[test]
#[ignore]
fn emit_blocks_snapshots() {
    let pairs = [
        ("nested-headings.md", "nested-headings.blocks.snapshot.json"),
        ("code-and-table.md", "code-and-table.blocks.snapshot.json"),
    ];
    let dir = fixtures_dir();

    for (fixture, baseline) in pairs {
        let source = fs::read(dir.join(fixture)).unwrap();
        let (blocks, warnings) = parse_blocks(&source, 1).unwrap();
        let snap = Snapshot {
            blocks: blocks.iter().map(block_to_view).collect(),
            warnings,
        };

        let mut json = serde_json::to_string_pretty(&snap).unwrap();
        json.push('\n');

        let target = dir.join(baseline);
        fs::write(&target, json).unwrap();
        eprintln!("wrote {}", target.display());
    }
}
|
||||
|
||||
/// Determinism: parsing the same fixture twice in a row must give equal output.
#[test]
fn snapshot_is_deterministic_across_runs() {
    let bytes = fs::read(fixtures_dir().join("nested-headings.md")).unwrap();
    let parse_once = || parse_blocks(&bytes, 1).unwrap();

    let (first_blocks, first_warns) = parse_once();
    let (second_blocks, second_warns) = parse_once();

    // Structural equality on the parser output itself (no serde involved) ...
    assert_eq!(first_blocks, second_blocks);
    assert_eq!(first_warns, second_warns);

    // ... and equality of the serialized view, which is what the snapshot
    // baselines actually store.
    let as_json = |blocks: &[ParsedBlock]| {
        let views: Vec<_> = blocks.iter().map(block_to_view).collect();
        serde_json::to_value(&views).unwrap()
    };
    assert_eq!(as_json(&first_blocks), as_json(&second_blocks));
}
|
||||
63
fixtures/markdown/code-and-table.blocks.snapshot.json
Normal file
63
fixtures/markdown/code-and-table.blocks.snapshot.json
Normal file
@@ -0,0 +1,63 @@
|
||||
{
|
||||
"blocks": [
|
||||
{
|
||||
"kind": "heading",
|
||||
"heading_path": [],
|
||||
"source_span": {
|
||||
"kind": "line",
|
||||
"start": 1,
|
||||
"end": 1
|
||||
},
|
||||
"payload": {
|
||||
"kind": "heading",
|
||||
"level": 1,
|
||||
"text": "Code And Table"
|
||||
}
|
||||
},
|
||||
{
|
||||
"kind": "code",
|
||||
"heading_path": [
|
||||
"Code And Table"
|
||||
],
|
||||
"source_span": {
|
||||
"kind": "line",
|
||||
"start": 3,
|
||||
"end": 7
|
||||
},
|
||||
"payload": {
|
||||
"kind": "code",
|
||||
"lang": "rust",
|
||||
"code": "fn main() {\n println!(\"hi\");\n}"
|
||||
}
|
||||
},
|
||||
{
|
||||
"kind": "table",
|
||||
"heading_path": [
|
||||
"Code And Table"
|
||||
],
|
||||
"source_span": {
|
||||
"kind": "line",
|
||||
"start": 9,
|
||||
"end": 12
|
||||
},
|
||||
"payload": {
|
||||
"kind": "table",
|
||||
"headers": [
|
||||
"col a",
|
||||
"col b"
|
||||
],
|
||||
"rows": [
|
||||
[
|
||||
"1",
|
||||
"2"
|
||||
],
|
||||
[
|
||||
"3",
|
||||
"4"
|
||||
]
|
||||
]
|
||||
}
|
||||
}
|
||||
],
|
||||
"warnings": []
|
||||
}
|
||||
136
fixtures/markdown/nested-headings.blocks.snapshot.json
Normal file
136
fixtures/markdown/nested-headings.blocks.snapshot.json
Normal file
@@ -0,0 +1,136 @@
|
||||
{
|
||||
"blocks": [
|
||||
{
|
||||
"kind": "heading",
|
||||
"heading_path": [],
|
||||
"source_span": {
|
||||
"kind": "line",
|
||||
"start": 1,
|
||||
"end": 1
|
||||
},
|
||||
"payload": {
|
||||
"kind": "heading",
|
||||
"level": 1,
|
||||
"text": "Top"
|
||||
}
|
||||
},
|
||||
{
|
||||
"kind": "paragraph",
|
||||
"heading_path": [
|
||||
"Top"
|
||||
],
|
||||
"source_span": {
|
||||
"kind": "line",
|
||||
"start": 3,
|
||||
"end": 3
|
||||
},
|
||||
"payload": {
|
||||
"kind": "paragraph",
|
||||
"text": "intro",
|
||||
"inlines_flat": "intro"
|
||||
}
|
||||
},
|
||||
{
|
||||
"kind": "heading",
|
||||
"heading_path": [
|
||||
"Top"
|
||||
],
|
||||
"source_span": {
|
||||
"kind": "line",
|
||||
"start": 5,
|
||||
"end": 5
|
||||
},
|
||||
"payload": {
|
||||
"kind": "heading",
|
||||
"level": 2,
|
||||
"text": "Section A"
|
||||
}
|
||||
},
|
||||
{
|
||||
"kind": "paragraph",
|
||||
"heading_path": [
|
||||
"Top",
|
||||
"Section A"
|
||||
],
|
||||
"source_span": {
|
||||
"kind": "line",
|
||||
"start": 7,
|
||||
"end": 7
|
||||
},
|
||||
"payload": {
|
||||
"kind": "paragraph",
|
||||
"text": "body of A",
|
||||
"inlines_flat": "body of A"
|
||||
}
|
||||
},
|
||||
{
|
||||
"kind": "heading",
|
||||
"heading_path": [
|
||||
"Top",
|
||||
"Section A"
|
||||
],
|
||||
"source_span": {
|
||||
"kind": "line",
|
||||
"start": 9,
|
||||
"end": 9
|
||||
},
|
||||
"payload": {
|
||||
"kind": "heading",
|
||||
"level": 3,
|
||||
"text": "Sub A.1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"kind": "paragraph",
|
||||
"heading_path": [
|
||||
"Top",
|
||||
"Section A",
|
||||
"Sub A.1"
|
||||
],
|
||||
"source_span": {
|
||||
"kind": "line",
|
||||
"start": 11,
|
||||
"end": 11
|
||||
},
|
||||
"payload": {
|
||||
"kind": "paragraph",
|
||||
"text": "deeper",
|
||||
"inlines_flat": "deeper"
|
||||
}
|
||||
},
|
||||
{
|
||||
"kind": "heading",
|
||||
"heading_path": [
|
||||
"Top"
|
||||
],
|
||||
"source_span": {
|
||||
"kind": "line",
|
||||
"start": 13,
|
||||
"end": 13
|
||||
},
|
||||
"payload": {
|
||||
"kind": "heading",
|
||||
"level": 2,
|
||||
"text": "Section B"
|
||||
}
|
||||
},
|
||||
{
|
||||
"kind": "paragraph",
|
||||
"heading_path": [
|
||||
"Top",
|
||||
"Section B"
|
||||
],
|
||||
"source_span": {
|
||||
"kind": "line",
|
||||
"start": 15,
|
||||
"end": 15
|
||||
},
|
||||
"payload": {
|
||||
"kind": "paragraph",
|
||||
"text": "body of B",
|
||||
"inlines_flat": "body of B"
|
||||
}
|
||||
}
|
||||
],
|
||||
"warnings": []
|
||||
}
|
||||
Reference in New Issue
Block a user