feat(p1-3): kb-parse-md blocks (Markdown body → ParsedBlock tree) #8

Merged
altair823 merged 10 commits from feat/p1-3-parse-md-blocks into main 2026-04-30 15:03:26 +00:00
7 changed files with 2261 additions and 8 deletions

19
Cargo.lock generated
View File

@@ -585,11 +585,13 @@ dependencies = [
"kb-core",
"kb-parse-types",
"lingua",
"pulldown-cmark",
"serde",
"serde_json",
"serde_yaml_ng",
"time",
"toml",
"tracing",
]
[[package]]
@@ -834,6 +836,17 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "pulldown-cmark"
version = "0.13.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c3a14896dfa883796f1cb410461aef38810ea05f2b2c33c5aded3649095fdad"
dependencies = [
"bitflags",
"memchr",
"unicase",
]
[[package]]
name = "quote"
version = "1.0.45"
@@ -1367,6 +1380,12 @@ dependencies = [
"tracing-serde",
]
[[package]]
name = "unicase"
version = "2.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142"
[[package]]
name = "unicode-ident"
version = "1.0.24"

View File

@@ -5,7 +5,7 @@ edition = { workspace = true }
rust-version = { workspace = true }
license = { workspace = true }
repository = { workspace = true }
description = "Markdown frontmatter (and, in p1-3, block) parsing into kb-core::Metadata / kb-parse-types intermediates"
description = "Markdown frontmatter and block parsing into kb-core::Metadata / kb-parse-types intermediates"
[dependencies]
kb-core = { path = "../kb-core" }
@@ -14,6 +14,12 @@ anyhow = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
time = { workspace = true }
tracing = { workspace = true }
# pulldown-cmark is the CommonMark parser used by the `blocks` submodule.
# GFM tables are gated by the runtime `Options::ENABLE_TABLES` flag, not a
# cargo feature; we strip the default `getopts` + `html` features since we
# only use the pull-parser API.
pulldown-cmark = { version = "0.13", default-features = false }
# serde_yaml (dtolnay) was archived as unmaintained in 2024.
# We use the maintained fork serde_yaml_ng. Keeping the same `serde_yaml`-style
# API surface lets us swap if a different fork wins long term.

File diff suppressed because it is too large Load Diff

View File

@@ -1,19 +1,21 @@
//! `kb-parse-md` — Markdown parsing for the KB pipeline (§3.7b).
//!
//! P1-2 implements the **frontmatter** submodule only. P1-3 will add a
//! sibling `blocks` submodule for block parsing using `pulldown-cmark`.
//!
//! Public surface for P1-2 is intentionally narrow:
//! Public surface:
//!
//! * [`parse_frontmatter`] — pure function from Markdown bytes to
//! `(Metadata, Option<FrontmatterSpan>, Vec<Warning>)`.
//! `(Metadata, Option<FrontmatterSpan>, Vec<Warning>)` (P1-2).
//! * [`BodyHints`] — caller-supplied fallbacks that feed the §0 Q9 derive
//! table when frontmatter is missing or partial.
//! table when frontmatter is missing or partial (P1-2).
//! * [`FrontmatterSpan`] — byte offsets of the frontmatter region in the
//! input slice (returned by [`parse_frontmatter`]).
//! input slice (returned by [`parse_frontmatter`]) (P1-2).
//! * [`parse_blocks`] — pure function from Markdown body bytes to
//! `(Vec<ParsedBlock>, Vec<Warning>)` with heading paths and 1-indexed
//! `SourceSpan::Line` ranges relative to the original file (P1-3).
//!
//! Anything else in this crate is `pub(crate)` and may change without notice.
pub mod blocks;
pub mod frontmatter;
pub use blocks::parse_blocks;
pub use frontmatter::{BodyHints, FrontmatterSpan, parse_frontmatter};

View File

@@ -0,0 +1,240 @@
//! Snapshot tests pinning the `parse_blocks` output for two fixtures.
//!
//! Baselines are either hand-authored or regenerated by running the
//! `#[ignore]`d `emit_blocks_snapshots` test below with `--ignored`. Both
//! fixtures are parsed with `body_offset_lines = 1` (no frontmatter, so the
//! body starts at file line 1).
//!
//! Note on snapshot shape: `kb_core::Inline` carries a `serde(tag = "kind")`
//! enum representation that cannot serialize newtype variants holding a
//! primitive (`Inline::Text(String)` etc.) — that's a serde limitation, not
//! ours, and is fixed up in a later kb-core task. To keep the snapshot
//! human-readable (and stable across that future fix), we project each
//! `ParsedBlock` into a `BlockView` that flattens inline content to plain
//! strings before serialization. This still pins the *contract* that
//! matters for P1-3: heading paths, source spans, payload kinds, payload
//! text content, table headers/rows, and code lang/body.
use kb_core::{Inline, SourceSpan};
use kb_parse_md::parse_blocks;
use kb_parse_types::{ParsedBlock, ParsedPayload, Warning};
use serde::Serialize;
use serde_json::Value;
use std::fs;
use std::path::PathBuf;
/// Top-level value that is serialized and compared against (or written to)
/// a `*.blocks.snapshot.json` baseline.
#[derive(Serialize)]
struct Snapshot {
/// Projected blocks, in the order `parse_blocks` returned them.
blocks: Vec<BlockView>,
/// Warnings returned alongside the blocks, serialized unchanged.
warnings: Vec<Warning>,
}
/// Serializable projection of a single `ParsedBlock`, produced by
/// `block_to_view`.
#[derive(Serialize)]
struct BlockView {
/// Lowercased `Debug` rendering of the block's `kind`.
kind: String,
/// Copy of the block's `heading_path`.
heading_path: Vec<String>,
/// Copy of the block's `source_span`.
source_span: SourceSpan,
/// The block's payload with inline content flattened to plain strings.
payload: PayloadView,
}
/// Flattened mirror of `ParsedPayload`: one variant per payload variant
/// handled in `block_to_view`.
///
/// Serialized as an internally tagged enum: the lowercased variant name
/// (e.g. `ImageRef` becomes `"imageref"`) is stored under the `"kind"` key
/// next to the variant's own fields.
#[derive(Serialize)]
#[serde(tag = "kind", rename_all = "lowercase")]
enum PayloadView {
/// Heading level and text, copied from the parsed payload.
Heading {
level: u8,
text: String,
},
/// Paragraph text plus the `flatten`ed rendering of its inlines.
Paragraph {
text: String,
inlines_flat: String,
},
/// List ordering flag plus one `flatten`ed string per list item.
List {
ordered: bool,
items_flat: Vec<String>,
},
/// Code block language (if any) and body, copied from the parsed payload.
Code {
lang: Option<String>,
code: String,
},
/// Table headers and rows, copied from the parsed payload.
Table {
headers: Vec<String>,
rows: Vec<Vec<String>>,
},
/// Quote text plus the `flatten`ed rendering of its inlines.
Quote {
text: String,
inlines_flat: String,
},
/// Image reference source and alt text, copied from the parsed payload.
ImageRef {
src: String,
alt: String,
},
/// Audio reference source, copied from the parsed payload.
AudioRef {
src: String,
},
}
/// Appends a Markdown-like plain-text rendering of one inline node to `out`.
///
/// * `Text` / `Code` contribute their string as-is.
/// * `Link` is rendered as `[text](href)`.
/// * `Strong` / `Emph` render their children recursively, surrounded by
///   `**` and `*` respectively.
fn flatten_inline(i: &Inline, out: &mut String) {
    /// Writes `marker`, then every child in order, then `marker` again.
    fn wrapped(marker: &str, children: &[Inline], out: &mut String) {
        out.push_str(marker);
        for child in children {
            flatten_inline(child, out);
        }
        out.push_str(marker);
    }

    match i {
        Inline::Strong(children) => wrapped("**", children, out),
        Inline::Emph(children) => wrapped("*", children, out),
        Inline::Link { text, href } => {
            out.push('[');
            out.push_str(text);
            out.push_str("](");
            out.push_str(href);
            out.push(')');
        }
        Inline::Text(literal) | Inline::Code(literal) => out.push_str(literal),
    }
}
/// Renders a sequence of inline nodes into a single string by applying
/// `flatten_inline` to each node in order.
fn flatten(inlines: &[Inline]) -> String {
    inlines.iter().fold(String::new(), |mut rendered, node| {
        flatten_inline(node, &mut rendered);
        rendered
    })
}
/// Projects a `ParsedBlock` into its serializable `BlockView`.
///
/// The block kind becomes the lowercased `Debug` name, the heading path and
/// source span are cloned, and every payload variant is mapped onto the
/// matching `PayloadView` variant with inline content passed through
/// `flatten`.
fn block_to_view(b: &ParsedBlock) -> BlockView {
    BlockView {
        kind: format!("{:?}", b.kind).to_lowercase(),
        heading_path: b.heading_path.clone(),
        source_span: b.source_span.clone(),
        payload: match &b.payload {
            ParsedPayload::Heading { level, text } => PayloadView::Heading {
                level: *level,
                text: text.clone(),
            },
            ParsedPayload::Paragraph { text, inlines } => {
                let inlines_flat = flatten(inlines);
                PayloadView::Paragraph {
                    text: text.clone(),
                    inlines_flat,
                }
            }
            ParsedPayload::List { ordered, items } => {
                // One flattened string per list item, in item order.
                let items_flat = items.iter().map(|item| flatten(item)).collect();
                PayloadView::List {
                    ordered: *ordered,
                    items_flat,
                }
            }
            ParsedPayload::Code { lang, code } => PayloadView::Code {
                lang: lang.clone(),
                code: code.clone(),
            },
            ParsedPayload::Table { headers, rows } => PayloadView::Table {
                headers: headers.clone(),
                rows: rows.clone(),
            },
            ParsedPayload::Quote { text, inlines } => {
                let inlines_flat = flatten(inlines);
                PayloadView::Quote {
                    text: text.clone(),
                    inlines_flat,
                }
            }
            ParsedPayload::ImageRef { src, alt } => PayloadView::ImageRef {
                src: src.clone(),
                alt: alt.clone(),
            },
            ParsedPayload::AudioRef { src } => PayloadView::AudioRef { src: src.clone() },
        },
    }
}
/// Returns `<crate manifest dir>/../../fixtures/markdown`, the directory the
/// tests in this file read fixtures and baselines from.
fn fixtures_dir() -> PathBuf {
    let mut dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    // Each component is pushed in turn, exactly like a chain of `join`s.
    dir.extend(["..", "..", "fixtures", "markdown"]);
    dir
}
/// Parses `fixture` and compares the projected output with the JSON stored
/// in `baseline` (both file names are relative to `fixtures_dir()`).
///
/// The comparison is done on `serde_json::Value`s rather than on the raw
/// baseline text. Panics with both the expected and the actual JSON when
/// they differ, or when either file cannot be read or parsed.
fn assert_snapshot(fixture: &str, baseline: &str) {
    let root = fixtures_dir();
    let fixture_path = root.join(fixture);
    let baseline_path = root.join(baseline);

    // Current parser output, projected into the serializable view.
    let source = fs::read(fixture_path).expect("fixture readable");
    let (parsed, warnings) = parse_blocks(&source, 1).unwrap();
    let actual: Value = serde_json::to_value(&Snapshot {
        blocks: parsed.iter().map(block_to_view).collect(),
        warnings,
    })
    .unwrap();

    // Stored baseline; the raw text is kept so it can be echoed on drift.
    let expected_text =
        fs::read_to_string(baseline_path).expect("snapshot baseline readable");
    let expected: Value = serde_json::from_str(&expected_text).expect("baseline parses as json");

    if actual == expected {
        return;
    }

    let actual_pretty = serde_json::to_string_pretty(&actual).unwrap();
    panic!(
        "snapshot drift for {fixture}\n\
         --- expected ({baseline}) ---\n{expected_text}\n\
         --- actual ---\n{actual_pretty}\n\
         If the change is intentional, update {baseline}."
    );
}
/// Pins the parser output for the nested-headings fixture to its JSON baseline.
#[test]
fn nested_headings_blocks_snapshot() {
    let fixture = "nested-headings.md";
    let baseline = "nested-headings.blocks.snapshot.json";
    assert_snapshot(fixture, baseline);
}
/// Pins the parser output for the code-and-table fixture to its JSON baseline.
#[test]
fn code_and_table_blocks_snapshot() {
    let fixture = "code-and-table.md";
    let baseline = "code-and-table.blocks.snapshot.json";
    assert_snapshot(fixture, baseline);
}
/// Regenerates the baseline JSON files from the current parser output.
///
/// Ignored by default; run it explicitly with
/// `cargo test -p kb-parse-md --test blocks_snapshots emit_blocks_snapshots -- --ignored --nocapture`.
#[test]
#[ignore]
fn emit_blocks_snapshots() {
    // (fixture, baseline) file names, both relative to `fixtures_dir()`.
    const PAIRS: [(&str, &str); 2] = [
        ("nested-headings.md", "nested-headings.blocks.snapshot.json"),
        ("code-and-table.md", "code-and-table.blocks.snapshot.json"),
    ];

    let root = fixtures_dir();
    for (fixture, baseline) in PAIRS {
        let source = fs::read(root.join(fixture)).unwrap();
        let (parsed, warnings) = parse_blocks(&source, 1).unwrap();
        let snapshot = Snapshot {
            blocks: parsed.iter().map(block_to_view).collect(),
            warnings,
        };
        let json = serde_json::to_string_pretty(&snapshot).unwrap();

        // Pretty JSON followed by a single trailing newline.
        let target = root.join(baseline);
        fs::write(&target, format!("{json}\n")).unwrap();
        eprintln!("wrote {}", target.display());
    }
}
/// Determinism: two consecutive parses of the same fixture must agree.
#[test]
fn snapshot_is_deterministic_across_runs() {
    /// Projects the blocks into views and serializes them to a JSON value.
    fn views_as_json(blocks: &[ParsedBlock]) -> Value {
        let views: Vec<BlockView> = blocks.iter().map(block_to_view).collect();
        serde_json::to_value(&views).unwrap()
    }

    let source = fs::read(fixtures_dir().join("nested-headings.md")).unwrap();
    let (first_blocks, first_warns) = parse_blocks(&source, 1).unwrap();
    let (second_blocks, second_warns) = parse_blocks(&source, 1).unwrap();

    // Structural equality on the parser's own types (no serde involved).
    assert_eq!(first_blocks, second_blocks);
    assert_eq!(first_warns, second_warns);

    // Equality of the fully serializable view as well.
    assert_eq!(views_as_json(&first_blocks), views_as_json(&second_blocks));
}

View File

@@ -0,0 +1,63 @@
{
"blocks": [
{
"kind": "heading",
"heading_path": [],
"source_span": {
"kind": "line",
"start": 1,
"end": 1
},
"payload": {
"kind": "heading",
"level": 1,
"text": "Code And Table"
}
},
{
"kind": "code",
"heading_path": [
"Code And Table"
],
"source_span": {
"kind": "line",
"start": 3,
"end": 7
},
"payload": {
"kind": "code",
"lang": "rust",
"code": "fn main() {\n println!(\"hi\");\n}"
}
},
{
"kind": "table",
"heading_path": [
"Code And Table"
],
"source_span": {
"kind": "line",
"start": 9,
"end": 12
},
"payload": {
"kind": "table",
"headers": [
"col a",
"col b"
],
"rows": [
[
"1",
"2"
],
[
"3",
"4"
]
]
}
}
],
"warnings": []
}

View File

@@ -0,0 +1,136 @@
{
"blocks": [
{
"kind": "heading",
"heading_path": [],
"source_span": {
"kind": "line",
"start": 1,
"end": 1
},
"payload": {
"kind": "heading",
"level": 1,
"text": "Top"
}
},
{
"kind": "paragraph",
"heading_path": [
"Top"
],
"source_span": {
"kind": "line",
"start": 3,
"end": 3
},
"payload": {
"kind": "paragraph",
"text": "intro",
"inlines_flat": "intro"
}
},
{
"kind": "heading",
"heading_path": [
"Top"
],
"source_span": {
"kind": "line",
"start": 5,
"end": 5
},
"payload": {
"kind": "heading",
"level": 2,
"text": "Section A"
}
},
{
"kind": "paragraph",
"heading_path": [
"Top",
"Section A"
],
"source_span": {
"kind": "line",
"start": 7,
"end": 7
},
"payload": {
"kind": "paragraph",
"text": "body of A",
"inlines_flat": "body of A"
}
},
{
"kind": "heading",
"heading_path": [
"Top",
"Section A"
],
"source_span": {
"kind": "line",
"start": 9,
"end": 9
},
"payload": {
"kind": "heading",
"level": 3,
"text": "Sub A.1"
}
},
{
"kind": "paragraph",
"heading_path": [
"Top",
"Section A",
"Sub A.1"
],
"source_span": {
"kind": "line",
"start": 11,
"end": 11
},
"payload": {
"kind": "paragraph",
"text": "deeper",
"inlines_flat": "deeper"
}
},
{
"kind": "heading",
"heading_path": [
"Top"
],
"source_span": {
"kind": "line",
"start": 13,
"end": 13
},
"payload": {
"kind": "heading",
"level": 2,
"text": "Section B"
}
},
{
"kind": "paragraph",
"heading_path": [
"Top",
"Section B"
],
"source_span": {
"kind": "line",
"start": 15,
"end": 15
},
"payload": {
"kind": "paragraph",
"text": "body of B",
"inlines_flat": "body of B"
}
}
],
"warnings": []
}