p1-3: add parse_blocks (pulldown-cmark walker) submodule
Implements `kb_parse_md::parse_blocks(body, body_offset_lines)`, which returns a flat `Vec<ParsedBlock>` plus warnings. It walks pulldown-cmark events through a small frame-based state machine that tracks heading paths, accumulates inline buffers (Text/Code/Link/Strong/Emph only — design §3.4), and reports SourceSpan::Line spans in 1-indexed file-line coordinates. Covers headings, paragraphs, code blocks (lang taken from the info string), GFM tables (malformed tables fall back to a paragraph plus a MalformedTable warning), lists (nested sub-lists are flattened into the parent item), and block-level image references. Inline images are dropped silently per the inline filter. Panics triggered by adversarial inputs are caught with `catch_unwind` and degrade to an empty output plus an ExtractFailed warning. 15 unit tests cover heading-path correctness, code-block lang detection, table parsing, malformed-table fallback (driven via synthetic events, since pulldown-cmark auto-normalizes table widths), LF/CRLF line-range parity, image refs, nested-list flattening, the inline filter, and no-panic guards over 100 iterations of random bytes plus hand-crafted adversarial inputs.
This commit is contained in:
19
Cargo.lock
generated
19
Cargo.lock
generated
@@ -585,11 +585,13 @@ dependencies = [
|
||||
"kb-core",
|
||||
"kb-parse-types",
|
||||
"lingua",
|
||||
"pulldown-cmark",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_yaml_ng",
|
||||
"time",
|
||||
"toml",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -834,6 +836,17 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pulldown-cmark"
|
||||
version = "0.13.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c3a14896dfa883796f1cb410461aef38810ea05f2b2c33c5aded3649095fdad"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"memchr",
|
||||
"unicase",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.45"
|
||||
@@ -1367,6 +1380,12 @@ dependencies = [
|
||||
"tracing-serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicase"
|
||||
version = "2.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.24"
|
||||
|
||||
@@ -5,7 +5,7 @@ edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "Markdown frontmatter (and, in p1-3, block) parsing into kb-core::Metadata / kb-parse-types intermediates"
|
||||
description = "Markdown frontmatter and block parsing into kb-core::Metadata / kb-parse-types intermediates"
|
||||
|
||||
[dependencies]
|
||||
kb-core = { path = "../kb-core" }
|
||||
@@ -14,6 +14,12 @@ anyhow = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
time = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
# pulldown-cmark is the CommonMark parser used by the `blocks` submodule.
|
||||
# GFM tables are gated by the runtime `Options::ENABLE_TABLES` flag, not a
|
||||
# cargo feature; we strip the default `getopts` + `html` features since we
|
||||
# only use the pull-parser API.
|
||||
pulldown-cmark = { version = "0.13", default-features = false }
|
||||
# serde_yaml (dtolnay) was archived as unmaintained in 2024.
|
||||
# We use the maintained fork serde_yaml_ng. Keeping the same `serde_yaml`-style
|
||||
# API surface lets us swap if a different fork wins long term.
|
||||
|
||||
1221
crates/kb-parse-md/src/blocks.rs
Normal file
1221
crates/kb-parse-md/src/blocks.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,19 +1,21 @@
|
||||
//! `kb-parse-md` — Markdown parsing for the KB pipeline (§3.7b).
|
||||
//!
|
||||
//! P1-2 implements the **frontmatter** submodule only. P1-3 will add a
|
||||
//! sibling `blocks` submodule for block parsing using `pulldown-cmark`.
|
||||
//!
|
||||
//! Public surface for P1-2 is intentionally narrow:
|
||||
//! Public surface:
|
||||
//!
|
||||
//! * [`parse_frontmatter`] — pure function from Markdown bytes to
|
||||
//! `(Metadata, Option<FrontmatterSpan>, Vec<Warning>)`.
|
||||
//! `(Metadata, Option<FrontmatterSpan>, Vec<Warning>)` (P1-2).
|
||||
//! * [`BodyHints`] — caller-supplied fallbacks that feed the §0 Q9 derive
|
||||
//! table when frontmatter is missing or partial.
|
||||
//! table when frontmatter is missing or partial (P1-2).
|
||||
//! * [`FrontmatterSpan`] — byte offsets of the frontmatter region in the
|
||||
//! input slice (returned by [`parse_frontmatter`]).
|
||||
//! input slice (returned by [`parse_frontmatter`]) (P1-2).
|
||||
//! * [`parse_blocks`] — pure function from Markdown body bytes to
|
||||
//! `(Vec<ParsedBlock>, Vec<Warning>)` with heading paths and 1-indexed
|
||||
//! `SourceSpan::Line` ranges relative to the original file (P1-3).
|
||||
//!
|
||||
//! Anything else in this crate is `pub(crate)` and may change without notice.
|
||||
|
||||
pub mod blocks;
|
||||
pub mod frontmatter;
|
||||
|
||||
pub use blocks::parse_blocks;
|
||||
pub use frontmatter::{BodyHints, FrontmatterSpan, parse_frontmatter};
|
||||
|
||||
Reference in New Issue
Block a user