Implements `kb_parse_md::parse_blocks(body, body_offset_lines)` returning a flat `Vec<ParsedBlock>` plus warnings. Walks pulldown-cmark events through a small frame-based state machine that tracks heading paths, accumulates inline buffers (Text/Code/Link/Strong/Emph only — design §3.4), and reports SourceSpan::Line spans in 1-indexed file-line coordinates. Covers headings, paragraphs, code blocks (lang from info string), GFM tables (with malformed fallback to paragraph + MalformedTable warning), lists (nested sub-lists flattened into parent item), and block-level image references. Inline images are dropped silently per the inline filter. Adversarial inputs are caught with `catch_unwind` and degrade to an empty output + ExtractFailed warning. 15 unit tests cover heading-path correctness, code lang, table parsing, malformed-table fallback (driven via synthetic events since pulldown-cmark auto-normalizes table widths), LF/CRLF line-range parity, image refs, nested-list flattening, inline filter, and 100-iteration random-bytes plus hand-crafted adversarial-input no-panic guards.
41 lines
1.6 KiB
TOML
41 lines
1.6 KiB
TOML
# Manifest for the `kb-parse-md` crate. Every field written as
# `{ workspace = true }` is inherited from the workspace root manifest
# instead of being set in this file.
[package]
name = "kb-parse-md"
version = { workspace = true }
edition = { workspace = true }
rust-version = { workspace = true }
license = { workspace = true }
repository = { workspace = true }
description = "Markdown frontmatter and block parsing into kb-core::Metadata / kb-parse-types intermediates"

[dependencies]
# Local crates in sibling directories, referenced by relative path.
kb-core = { path = "../kb-core" }
kb-parse-types = { path = "../kb-parse-types" }
# Dependency specs inherited from the workspace root manifest.
anyhow = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
time = { workspace = true }
tracing = { workspace = true }
# pulldown-cmark is the CommonMark parser used by the `blocks` submodule.
# GFM tables are gated by the runtime `Options::ENABLE_TABLES` flag, not a
# cargo feature; we strip the default `getopts` + `html` features since we
# only use the pull-parser API.
pulldown-cmark = { version = "0.13", default-features = false }
# serde_yaml (dtolnay) was archived as unmaintained in 2024.
# We use the maintained fork serde_yaml_ng. Keeping the same `serde_yaml`-style
# API surface lets us swap if a different fork wins long term.
serde_yaml_ng = "0.10"
toml = "0.8"
# `lingua` ships every supported language as a feature flag; the `default`
# feature pulls all 75+ language models (huge build time + binary size).
# For p1-2 we only need a small subset for autodetect + tests. Add more
# languages here as future tasks call for them.
lingua = { version = "1.8", default-features = false, features = [
    "korean",
    "english",
    "japanese",
    "chinese",
] }

[dev-dependencies]
# NOTE(review): `serde_json` is also declared under `[dependencies]`, which
# already makes it available to tests — confirm whether this entry is needed.
serde_json = { workspace = true }