From 4e7e9cad8746d08fb4c389ef1908364e151490a3 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 14:14:34 +0000 Subject: [PATCH 01/10] p1-3: add parse_blocks (pulldown-cmark walker) submodule MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements `kb_parse_md::parse_blocks(body, body_offset_lines)` returning a flat `Vec` plus warnings. Walks pulldown-cmark events through a small frame-based state machine that tracks heading paths, accumulates inline buffers (Text/Code/Link/Strong/Emph only — design §3.4), and reports SourceSpan::Line spans in 1-indexed file-line coordinates. Covers headings, paragraphs, code blocks (lang from info string), GFM tables (with malformed fallback to paragraph + MalformedTable warning), lists (nested sub-lists flattened into parent item), and block-level image references. Inline images are dropped silently per the inline filter. Adversarial inputs are caught with `catch_unwind` and degrade to an empty output + ExtractFailed warning. 15 unit tests cover heading-path correctness, code lang, table parsing, malformed-table fallback (driven via synthetic events since pulldown-cmark auto-normalizes table widths), LF/CRLF line-range parity, image refs, nested-list flattening, inline filter, and 100-iteration random-bytes plus hand-crafted adversarial-input no-panic guards. --- Cargo.lock | 19 + crates/kb-parse-md/Cargo.toml | 8 +- crates/kb-parse-md/src/blocks.rs | 1221 ++++++++++++++++++++++++++++++ crates/kb-parse-md/src/lib.rs | 16 +- 4 files changed, 1256 insertions(+), 8 deletions(-) create mode 100644 crates/kb-parse-md/src/blocks.rs diff --git a/Cargo.lock b/Cargo.lock index 7c34f44..ec3d4e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -585,11 +585,13 @@ dependencies = [ "kb-core", "kb-parse-types", "lingua", + "pulldown-cmark", "serde", "serde_json", "serde_yaml_ng", "time", "toml", + "tracing", ] [[package]] @@ -834,6 +836,17 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "pulldown-cmark" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c3a14896dfa883796f1cb410461aef38810ea05f2b2c33c5aded3649095fdad" +dependencies = [ + "bitflags", + "memchr", + "unicase", +] + [[package]] name = "quote" version = "1.0.45" @@ -1367,6 +1380,12 @@ dependencies = [ "tracing-serde", ] +[[package]] +name = "unicase" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" + [[package]] name = "unicode-ident" version = "1.0.24" diff --git a/crates/kb-parse-md/Cargo.toml b/crates/kb-parse-md/Cargo.toml index db7b56e..3b39606 100644 --- a/crates/kb-parse-md/Cargo.toml +++ b/crates/kb-parse-md/Cargo.toml @@ -5,7 +5,7 @@ edition = { workspace = true } rust-version = { workspace = true } license = { workspace = true } repository = { workspace = true } -description = "Markdown frontmatter (and, in p1-3, block) parsing into kb-core::Metadata / kb-parse-types intermediates" +description = "Markdown frontmatter and block parsing into kb-core::Metadata / kb-parse-types intermediates" [dependencies] kb-core = { path = "../kb-core" } @@ -14,6 +14,12 @@ anyhow = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } time = { workspace = true } +tracing = { workspace = true } +# pulldown-cmark is the CommonMark parser used by the `blocks` submodule. +# GFM tables are gated by the runtime `Options::ENABLE_TABLES` flag, not a +# cargo feature; we strip the default `getopts` + `html` features since we +# only use the pull-parser API. +pulldown-cmark = { version = "0.13", default-features = false } # serde_yaml (dtolnay) was archived as unmaintained in 2024. # We use the maintained fork serde_yaml_ng. Keeping the same `serde_yaml`-style # API surface lets us swap if a different fork wins long term. diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs new file mode 100644 index 0000000..1ddfaa6 --- /dev/null +++ b/crates/kb-parse-md/src/blocks.rs @@ -0,0 +1,1221 @@ +//! Markdown body → flat `Vec` (§3.4 / §3.7b). +//! +//! Uses `pulldown-cmark` (with GFM tables enabled at runtime via +//! `Options::ENABLE_TABLES`) to walk the body once and emit a flat list of +//! parsed blocks. Heading paths are computed by tracking the most-recent +//! heading text at each level. Source spans are reported as +//! [`kb_core::SourceSpan::Line`] in 1-indexed file-line coordinates by +//! converting `pulldown-cmark`'s byte offsets to line numbers and adding the +//! caller-supplied `body_offset_lines`. +//! +//! ## Coordinate conventions +//! +//! * Body lines are *0-indexed* internally — line 0 is the first line of the +//! body slice. We map to file coordinates with +//! `file_line = body_line_zero_indexed + body_offset_lines`. So a caller +//! that passes `body_offset_lines = 6` is saying "the first line of `body` +//! is line 6 of the original file". +//! * `SourceSpan::Line { start, end }` is inclusive on both ends. +//! +//! ## Inline filter +//! +//! [`kb_core::Inline`] only models `Text | Code | Link | Strong | Emph`. +//! Inline images, footnotes, hard breaks, etc. are dropped silently per +//! design §3.4. Block-level `![alt](src)` (an image as the sole content of a +//! paragraph) is lifted to [`kb_parse_types::ParsedPayload::ImageRef`]. +//! +//! ## CRLF +//! +//! Line numbers are computed by counting `\n` bytes in the prefix; CRLF +//! input still has `\n` at end-of-line so the math is identical to LF input. +//! `pulldown-cmark` may include `\r` characters inside its emitted text +//! events; we leave them as-is for now (they round-trip through serde). + +use std::ops::Range; + +use kb_core::{Inline, SourceSpan}; +use kb_parse_types::{ParsedBlock, ParsedBlockKind, ParsedPayload, Warning, WarningKind}; +use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Options, Parser, Tag, TagEnd}; + +/// Parse a Markdown body into a flat `Vec` plus any warnings. +/// +/// `body_offset_lines` is added to every body-relative line number so that +/// reported [`SourceSpan::Line`] values address the **original** file. See +/// the module-level docs for the coordinate convention. +/// +/// # Errors +/// +/// `Err` is reserved for genuinely fatal conditions; the current +/// implementation never produces one — even adversarial inputs degrade to +/// `Ok((vec![], vec![Warning::ExtractFailed]))` via a top-level +/// `catch_unwind` guard. The `Result` is kept on the signature so a future +/// I/O-backed input can be added without breaking callers. +pub fn parse_blocks( + body: &[u8], + body_offset_lines: u32, +) -> anyhow::Result<(Vec, Vec)> { + // Adversarial-input safety: pulldown-cmark is documented as + // panic-free on valid UTF-8, but a defensive catch_unwind keeps the + // contract ("never panics") even if the dependency regresses. + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + parse_blocks_inner(body, body_offset_lines) + })); + match result { + Ok(out) => Ok(out), + Err(_) => { + tracing::warn!("parse_blocks panicked on adversarial input; returning empty"); + Ok(( + Vec::new(), + vec![Warning { + kind: WarningKind::ExtractFailed, + note: "pulldown-cmark panicked; body discarded".to_string(), + }], + )) + } + } +} + +fn parse_blocks_inner(body: &[u8], body_offset_lines: u32) -> (Vec, Vec) { + // Lossy UTF-8: pulldown-cmark wants `&str`. Non-UTF-8 inputs get + // U+FFFD substitution; the byte→line mapping is built from the original + // bytes so spans remain accurate. + let body_str = match std::str::from_utf8(body) { + Ok(s) => std::borrow::Cow::Borrowed(s), + Err(_) => String::from_utf8_lossy(body), + }; + + let line_index = LineIndex::new(body); + let mut warnings: Vec = Vec::new(); + let mut state = WalkState::new(body_offset_lines, &line_index, body); + + let mut options = Options::empty(); + options.insert(Options::ENABLE_TABLES); + let parser = Parser::new_ext(body_str.as_ref(), options); + + for (event, range) in parser.into_offset_iter() { + state.handle_event(event, range, &mut warnings); + } + + (state.finish(), warnings) +} + +// --------------------------------------------------------------------------- +// Line index — byte offset → 1-indexed body line +// --------------------------------------------------------------------------- + +/// Maps byte offsets in the body to body-line numbers (0-indexed). +/// +/// Built once per parse. Stores the byte offset of every newline so +/// `byte → line` is a binary search. +struct LineIndex { + /// Byte offsets of `\n` characters in the body, in ascending order. + /// `newlines[i]` is the position of the `i`-th newline (0-indexed). + newlines: Vec, + body_len: usize, +} + +impl LineIndex { + fn new(body: &[u8]) -> Self { + let newlines = body + .iter() + .enumerate() + .filter_map(|(i, &b)| if b == b'\n' { Some(i) } else { None }) + .collect(); + Self { + newlines, + body_len: body.len(), + } + } + + /// Body line (0-indexed) containing byte offset `pos`. + /// + /// Bytes before the first `\n` are line 0. Bytes after the last `\n` are + /// line `newlines.len()`. A position equal to a newline byte itself is + /// considered to be on the line *ending at* that newline (so `pos` that + /// points to `\n` reports the line that newline terminates). + fn line_zero_indexed(&self, pos: usize) -> u32 { + // Clamp to body length — pulldown-cmark may emit ranges with `end` + // equal to body.len() for the final block. + let pos = pos.min(self.body_len); + // partition_point returns the count of newlines whose offset is `< pos`. + // That's exactly the line-index for the line containing `pos`, with + // the convention "trailing newline belongs to the previous line". + let n_before = self.newlines.partition_point(|&nl| nl < pos); + n_before as u32 + } + + /// Body line (0-indexed) containing byte offset `pos - 1` — i.e., the + /// line on which the byte just **before** `pos` lives. Used for `end` + /// positions of `pulldown-cmark` ranges, which are exclusive (point one + /// past the last byte of the construct). + fn end_line_zero_indexed(&self, end: usize) -> u32 { + if end == 0 { + return 0; + } + self.line_zero_indexed(end - 1) + } +} + +// --------------------------------------------------------------------------- +// Walker state machine +// --------------------------------------------------------------------------- + +struct WalkState<'a> { + body_offset_lines: u32, + line_index: &'a LineIndex, + body: &'a [u8], + + blocks: Vec, + /// Most-recent heading text at each level (1..=6), or `None` if no + /// heading at that level has been seen since a higher-level heading was + /// encountered. `heading_path()` snapshots this into a `Vec`. + heading_stack: [Option; 6], + + /// Accumulator for the currently-open block. We push events into this + /// frame and convert to a `ParsedBlock` when the matching End tag fires. + frames: Vec, +} + +/// One in-flight container we are accumulating events into. Stack-shaped so +/// nested constructs (list inside list, paragraph inside blockquote) work. +enum Frame { + Heading { + level: u8, + range: Range, + inlines: InlineBuf, + }, + Paragraph { + range: Range, + inlines: InlineBuf, + }, + Quote { + range: Range, + children: Vec, + }, + List { + ordered: bool, + range: Range, + items: Vec>, + /// True iff the immediate parent of this list is another list + /// item — i.e. this is a nested sub-list. Nested sub-lists get + /// flattened into the parent item's inline buffer instead of + /// being emitted as their own `ParsedPayload::List`. + nested_in_item: bool, + }, + /// One `
  • `. Inlines flow into `inlines`; nested sub-lists append + /// flattened text into `inlines` as well. + ListItem { inlines: InlineBuf }, + Code { + lang: Option, + range: Range, + code: String, + }, + Table { + range: Range, + headers: Vec, + rows: Vec>, + in_head: bool, + current_row: Vec, + current_cell: String, + cols: usize, + malformed: Option, + }, +} + +/// Inline accumulator with a stack of formatting wrappers (Strong / Emph / +/// Link). The base of the stack is the top-level inline list. +struct InlineBuf { + /// Stack: bottom is the top-level vec; each push starts a new nested + /// `Vec` for `Strong`/`Emph`/etc. that gets popped & wrapped + /// when its End tag fires. + stack: Vec, + /// Plain-text accumulator for `paragraph.text` — the literal text shown + /// to the user without formatting. Matches the inline events 1:1. + text: String, +} + +enum InlineFrame { + Top(Vec), + Strong(Vec), + Emph(Vec), + Link { href: String, text: String, kids: Vec }, +} + +impl InlineBuf { + fn new() -> Self { + Self { + stack: vec![InlineFrame::Top(Vec::new())], + text: String::new(), + } + } + + fn push_inline(&mut self, inline: Inline) { + match self.stack.last_mut().expect("inline stack non-empty") { + InlineFrame::Top(v) => v.push(inline), + InlineFrame::Strong(v) => v.push(inline), + InlineFrame::Emph(v) => v.push(inline), + InlineFrame::Link { kids, .. } => kids.push(inline), + } + } + + fn push_text(&mut self, s: &str) { + self.text.push_str(s); + self.push_inline(Inline::Text(s.to_string())); + } + + fn push_code(&mut self, s: &str) { + self.text.push_str(s); + self.push_inline(Inline::Code(s.to_string())); + } + + fn open_strong(&mut self) { + self.stack.push(InlineFrame::Strong(Vec::new())); + } + fn close_strong(&mut self) { + if let Some(InlineFrame::Strong(kids)) = self.stack.pop() { + self.push_inline(Inline::Strong(kids)); + } + } + + fn open_emph(&mut self) { + self.stack.push(InlineFrame::Emph(Vec::new())); + } + fn close_emph(&mut self) { + if let Some(InlineFrame::Emph(kids)) = self.stack.pop() { + self.push_inline(Inline::Emph(kids)); + } + } + + fn open_link(&mut self, href: String) { + self.stack.push(InlineFrame::Link { + href, + text: String::new(), + kids: Vec::new(), + }); + } + fn close_link(&mut self) { + if let Some(InlineFrame::Link { href, text, kids }) = self.stack.pop() { + // Flatten the link's text contents to a single string for the + // `Inline::Link.text` field. Code/strong/emph inside a link are + // collapsed to their plain text — `Inline::Link` doesn't model + // formatting inside the link. + let flat = if !text.is_empty() { + text + } else { + flatten_inlines_to_text(&kids) + }; + self.push_inline(Inline::Link { text: flat, href }); + } + } + + /// Append plain text to the current link's flattened text accumulator + /// (called from inside a link frame). + fn push_link_text(&mut self, s: &str) { + if let Some(InlineFrame::Link { text, .. }) = self.stack.last_mut() { + text.push_str(s); + } + } + + fn finish(mut self) -> (Vec, String) { + // Normal flow: the only remaining frame is the Top, which we unwrap. + // If formatting tags were unbalanced we close them defensively. + while self.stack.len() > 1 { + match self.stack.pop().unwrap() { + InlineFrame::Strong(kids) => self.push_inline(Inline::Strong(kids)), + InlineFrame::Emph(kids) => self.push_inline(Inline::Emph(kids)), + InlineFrame::Link { href, text, kids } => { + let flat = if !text.is_empty() { + text + } else { + flatten_inlines_to_text(&kids) + }; + self.push_inline(Inline::Link { text: flat, href }); + } + InlineFrame::Top(_) => break, + } + } + let top = match self.stack.pop().unwrap() { + InlineFrame::Top(v) => v, + _ => Vec::new(), + }; + (top, self.text) + } + +} + +fn flatten_inlines_to_text(inlines: &[Inline]) -> String { + let mut out = String::new(); + for i in inlines { + flatten_one(i, &mut out); + } + out +} + +fn flatten_one(i: &Inline, out: &mut String) { + match i { + Inline::Text(s) | Inline::Code(s) => out.push_str(s), + Inline::Link { text, .. } => out.push_str(text), + Inline::Strong(v) | Inline::Emph(v) => { + for c in v { + flatten_one(c, out); + } + } + } +} + +impl<'a> WalkState<'a> { + fn new(body_offset_lines: u32, line_index: &'a LineIndex, body: &'a [u8]) -> Self { + Self { + body_offset_lines, + line_index, + body, + blocks: Vec::new(), + heading_stack: Default::default(), + frames: Vec::new(), + } + } + + fn finish(self) -> Vec { + self.blocks + } + + fn heading_path(&self) -> Vec { + self.heading_stack + .iter() + .filter_map(|s| s.clone()) + .collect() + } + + fn span_for(&self, range: &Range) -> SourceSpan { + let start_body = self.line_index.line_zero_indexed(range.start); + let end_body = if range.end <= range.start { + start_body + } else { + self.line_index.end_line_zero_indexed(range.end) + }; + SourceSpan::Line { + start: start_body + self.body_offset_lines, + end: end_body + self.body_offset_lines, + } + } + + /// Where to emit a finished block: into the current container if any + /// (Quote / ListItem), otherwise into the top-level `blocks` vec. + fn emit_block(&mut self, block: ParsedBlock) { + // Find the nearest enclosing container that accepts child blocks. + for frame in self.frames.iter_mut().rev() { + if let Frame::Quote { children, .. } = frame { + children.push(block); + return; + } + } + self.blocks.push(block); + } + + fn handle_event(&mut self, event: Event<'_>, range: Range, warnings: &mut Vec) { + match event { + // ---- Container starts ----------------------------------------------- + Event::Start(Tag::Heading { level, .. }) => { + self.frames.push(Frame::Heading { + level: heading_level_to_u8(level), + range, + inlines: InlineBuf::new(), + }); + } + Event::Start(Tag::Paragraph) => { + // If we're directly inside a list item, the inlines flow into + // the item, not a new paragraph block. + if matches!(self.frames.last(), Some(Frame::ListItem { .. })) { + return; + } + self.frames.push(Frame::Paragraph { + range, + inlines: InlineBuf::new(), + }); + } + Event::Start(Tag::BlockQuote(_)) => { + self.frames.push(Frame::Quote { + range, + children: Vec::new(), + }); + } + Event::Start(Tag::List(start)) => { + let nested_in_item = matches!( + self.frames.last(), + Some(Frame::ListItem { .. }) + ); + self.frames.push(Frame::List { + ordered: start.is_some(), + range, + items: Vec::new(), + nested_in_item, + }); + } + Event::Start(Tag::Item) => { + self.frames.push(Frame::ListItem { + inlines: InlineBuf::new(), + }); + } + Event::Start(Tag::CodeBlock(kind)) => { + let lang = match kind { + CodeBlockKind::Indented => None, + CodeBlockKind::Fenced(info) => { + let trimmed = info.trim(); + if trimmed.is_empty() { + None + } else { + // Take only the first whitespace-delimited token, + // matching how editors render the info string. + Some(trimmed.split_whitespace().next().unwrap_or(trimmed).to_string()) + } + } + }; + self.frames.push(Frame::Code { + lang, + range, + code: String::new(), + }); + } + Event::Start(Tag::Table(aligns)) => { + self.frames.push(Frame::Table { + range, + headers: Vec::new(), + rows: Vec::new(), + in_head: false, + current_row: Vec::new(), + current_cell: String::new(), + cols: aligns.len(), + malformed: None, + }); + } + Event::Start(Tag::TableHead) => { + if let Some(Frame::Table { in_head, .. }) = self.frames.last_mut() { + *in_head = true; + } + } + Event::Start(Tag::TableRow) => { + if let Some(Frame::Table { current_row, .. }) = self.frames.last_mut() { + current_row.clear(); + } + } + Event::Start(Tag::TableCell) => { + if let Some(Frame::Table { current_cell, .. }) = self.frames.last_mut() { + current_cell.clear(); + } + } + Event::Start(Tag::Strong) => { + self.with_current_inlines(|buf| buf.open_strong()); + } + Event::Start(Tag::Emphasis) => { + self.with_current_inlines(|buf| buf.open_emph()); + } + Event::Start(Tag::Link { dest_url, .. }) => { + let href = dest_url.into_string(); + self.with_current_inlines(|buf| buf.open_link(href)); + } + // Block-level image is handled at End — see TagEnd::Image. + Event::Start(Tag::Image { .. }) => { + // No-op at start; we capture src/title at End. + } + + // ---- Container ends ------------------------------------------------- + Event::End(TagEnd::Heading(level)) => { + let level_u8 = heading_level_to_u8(level); + if let Some(Frame::Heading { level: lvl, range, inlines }) = self.frames.pop() { + let (_inline_vec, text) = inlines.finish(); + let text = text.trim().to_string(); + let level_to_use = lvl; + let _ = level_u8; + + // Update heading stack: clear deeper levels, set this level. + if level_to_use >= 1 && level_to_use <= 6 { + let idx = (level_to_use - 1) as usize; + for slot in &mut self.heading_stack[idx + 1..] { + *slot = None; + } + self.heading_stack[idx] = Some(text.clone()); + } + + // The heading_path on the heading block ITSELF excludes + // the heading's own text (it's the path of ancestors). + let path = self + .heading_stack + .iter() + .take((level_to_use - 1) as usize) + .filter_map(|s| s.clone()) + .collect(); + + let block = ParsedBlock { + kind: ParsedBlockKind::Heading, + heading_path: path, + source_span: self.span_for(&range), + payload: ParsedPayload::Heading { level: level_to_use, text }, + }; + self.emit_block(block); + } + } + Event::End(TagEnd::Paragraph) => { + if matches!(self.frames.last(), Some(Frame::Paragraph { .. })) { + if let Some(Frame::Paragraph { range, inlines }) = self.frames.pop() { + let (inline_vec, text) = inlines.finish(); + // Block-level image: a paragraph whose only content + // is `![alt](src)` becomes ImageRef. Detect by + // scanning the original source for the canonical + // shape. + if let Some((alt, src)) = match_block_image(self.body, &range) { + let block = ParsedBlock { + kind: ParsedBlockKind::ImageRef, + heading_path: self.heading_path(), + source_span: self.span_for(&range), + payload: ParsedPayload::ImageRef { src, alt }, + }; + self.emit_block(block); + return; + } + let block = ParsedBlock { + kind: ParsedBlockKind::Paragraph, + heading_path: self.heading_path(), + source_span: self.span_for(&range), + payload: ParsedPayload::Paragraph { text, inlines: inline_vec }, + }; + self.emit_block(block); + } + } + } + Event::End(TagEnd::BlockQuote(_)) => { + if let Some(Frame::Quote { range, children }) = self.frames.pop() { + // Concatenate child text for the Quote payload. + let mut text = String::new(); + let mut inlines: Vec = Vec::new(); + for c in &children { + match &c.payload { + ParsedPayload::Paragraph { text: t, inlines: il } + | ParsedPayload::Quote { text: t, inlines: il } => { + if !text.is_empty() { + text.push('\n'); + } + text.push_str(t); + inlines.extend(il.iter().cloned()); + } + ParsedPayload::Heading { text: t, .. } => { + if !text.is_empty() { + text.push('\n'); + } + text.push_str(t); + inlines.push(Inline::Text(t.clone())); + } + _ => {} + } + } + let block = ParsedBlock { + kind: ParsedBlockKind::Quote, + heading_path: self.heading_path(), + source_span: self.span_for(&range), + payload: ParsedPayload::Quote { text, inlines }, + }; + self.emit_block(block); + } + } + Event::End(TagEnd::List(_)) => { + if let Some(Frame::List { ordered, range, items, nested_in_item }) = self.frames.pop() { + if nested_in_item { + // Flatten this sub-list into the enclosing list + // item's inline text. Each item becomes a line + // prefixed with " - " (or " 1. " for ordered) so + // structure is recognizable downstream. + let mut rendered = String::new(); + for (i, item) in items.iter().enumerate() { + rendered.push('\n'); + rendered.push_str(" "); + if ordered { + rendered.push_str(&format!("{}. ", i + 1)); + } else { + rendered.push_str("- "); + } + rendered.push_str(&flatten_inlines_to_text(item)); + } + if let Some(Frame::ListItem { inlines }) = self.frames.last_mut() { + inlines.push_text(&rendered); + } + } else { + let block = ParsedBlock { + kind: ParsedBlockKind::List, + heading_path: self.heading_path(), + source_span: self.span_for(&range), + payload: ParsedPayload::List { ordered, items }, + }; + self.emit_block(block); + } + } + } + Event::End(TagEnd::Item) => { + if let Some(Frame::ListItem { inlines }) = self.frames.pop() { + let (inline_vec, _text) = inlines.finish(); + if let Some(Frame::List { items, .. }) = self.frames.last_mut() { + items.push(inline_vec); + } + } + } + Event::End(TagEnd::CodeBlock) => { + if let Some(Frame::Code { lang, range, mut code }) = self.frames.pop() { + // Fenced code blocks include a trailing newline from the + // last source line; pulldown-cmark already strips the + // closing fence. We trim a single trailing `\n` for + // clean round-trips with hand-authored fixtures. + if code.ends_with('\n') { + code.pop(); + } + let block = ParsedBlock { + kind: ParsedBlockKind::Code, + heading_path: self.heading_path(), + source_span: self.span_for(&range), + payload: ParsedPayload::Code { lang, code }, + }; + self.emit_block(block); + } + } + Event::End(TagEnd::Table) => { + if let Some(Frame::Table { + range, + headers, + rows, + cols, + malformed, + .. + }) = self.frames.pop() + { + let header_count = if cols > 0 { cols } else { headers.len() }; + let block = if let Some(note) = malformed { + // Fall back to a paragraph carrying the raw markdown. + warnings.push(Warning { + kind: WarningKind::MalformedTable, + note, + }); + let raw = std::str::from_utf8( + self.body.get(range.clone()).unwrap_or(&[]), + ) + .unwrap_or_default() + .to_string(); + ParsedBlock { + kind: ParsedBlockKind::Paragraph, + heading_path: self.heading_path(), + source_span: self.span_for(&range), + payload: ParsedPayload::Paragraph { + text: raw.clone(), + inlines: vec![Inline::Text(raw)], + }, + } + } else { + let _ = header_count; + ParsedBlock { + kind: ParsedBlockKind::Table, + heading_path: self.heading_path(), + source_span: self.span_for(&range), + payload: ParsedPayload::Table { headers, rows }, + } + }; + self.emit_block(block); + } + } + Event::End(TagEnd::TableHead) => { + if let Some(Frame::Table { + in_head, + headers, + current_row, + cols, + malformed, + .. + }) = self.frames.last_mut() + { + *in_head = false; + *headers = std::mem::take(current_row); + if *cols == 0 { + *cols = headers.len(); + } else if headers.len() != *cols && malformed.is_none() { + *malformed = Some(format!( + "header has {} cells, table reported {} columns", + headers.len(), + cols + )); + } + } + } + Event::End(TagEnd::TableRow) => { + if let Some(Frame::Table { + rows, + current_row, + cols, + malformed, + .. + }) = self.frames.last_mut() + { + let row = std::mem::take(current_row); + if row.len() != *cols && malformed.is_none() { + *malformed = Some(format!( + "row {} has {} cells, headers have {}", + rows.len() + 1, + row.len(), + cols + )); + } + rows.push(row); + } + } + Event::End(TagEnd::TableCell) => { + if let Some(Frame::Table { + current_row, + current_cell, + .. + }) = self.frames.last_mut() + { + current_row.push(std::mem::take(current_cell)); + } + } + Event::End(TagEnd::Strong) => { + self.with_current_inlines(|buf| buf.close_strong()); + } + Event::End(TagEnd::Emphasis) => { + self.with_current_inlines(|buf| buf.close_emph()); + } + Event::End(TagEnd::Link) => { + self.with_current_inlines(|buf| buf.close_link()); + } + Event::End(TagEnd::Image) => { + // Inline images are dropped silently. Block-level image refs + // are detected at paragraph End, not here. + } + + // ---- Leaf events ----------------------------------------------------- + Event::Text(s) => { + // Code blocks accumulate into the code string instead of + // the inline buffer. + if let Some(Frame::Code { code, .. }) = self.frames.last_mut() { + code.push_str(&s); + return; + } + if let Some(Frame::Table { + in_head, + current_cell, + .. + }) = self.frames.last_mut() + { + let _ = in_head; + current_cell.push_str(&s); + return; + } + let owned = s.into_string(); + self.with_current_inlines(|buf| { + buf.push_text(&owned); + buf.push_link_text(&owned); + }); + } + Event::Code(s) => { + if let Some(Frame::Table { current_cell, .. }) = self.frames.last_mut() { + current_cell.push_str(&s); + return; + } + let owned = s.into_string(); + self.with_current_inlines(|buf| { + buf.push_code(&owned); + buf.push_link_text(&owned); + }); + } + Event::SoftBreak | Event::HardBreak => { + if let Some(Frame::Code { code, .. }) = self.frames.last_mut() { + code.push('\n'); + return; + } + if let Some(Frame::Table { current_cell, .. }) = self.frames.last_mut() { + current_cell.push(' '); + return; + } + self.with_current_inlines(|buf| buf.push_text(" ")); + } + // Everything else (HTML, footnote refs, task list markers, math, + // rules, etc.) is dropped silently per design §3.4. + _ => {} + } + } + + /// Run `f` on whichever inline accumulator is open at the top of the + /// frame stack. No-op if no inline-accepting frame is open. + fn with_current_inlines(&mut self, f: F) { + for frame in self.frames.iter_mut().rev() { + match frame { + Frame::Paragraph { inlines, .. } + | Frame::Heading { inlines, .. } + | Frame::ListItem { inlines } => { + f(inlines); + return; + } + Frame::Quote { .. } | Frame::List { .. } => continue, + Frame::Code { .. } | Frame::Table { .. } => return, + } + } + } +} + +fn heading_level_to_u8(level: HeadingLevel) -> u8 { + match level { + HeadingLevel::H1 => 1, + HeadingLevel::H2 => 2, + HeadingLevel::H3 => 3, + HeadingLevel::H4 => 4, + HeadingLevel::H5 => 5, + HeadingLevel::H6 => 6, + } +} + +/// Detect a paragraph whose entire trimmed source is `![alt](src)` — the +/// canonical "block-level image" shape. Returns `(alt, src)` if so. We do +/// this by scanning the original bytes (not the inline events) so it stays +/// robust to pulldown-cmark's internal representation of images. +fn match_block_image(body: &[u8], range: &Range) -> Option<(String, String)> { + let slice = body.get(range.clone())?; + let s = std::str::from_utf8(slice).ok()?.trim(); + if !s.starts_with("![") { + return None; + } + // Find the closing `]` of the alt text. Markdown does not allow nested + // brackets without escaping — for a block-level image we only handle the + // simple form. Anything else falls through to ordinary paragraph parsing. + let close_bracket = s.find("](")?; + let alt = &s[2..close_bracket]; + // The rest must be `(SRC)` and nothing else. + let after = &s[close_bracket + 2..]; + let close_paren = after.rfind(')')?; + if close_paren != after.len() - 1 { + return None; + } + let src = &after[..close_paren]; + // Reject if alt or src contain a newline — that means the paragraph has + // more content beyond the image and isn't a pure block-level image. + if alt.contains('\n') || src.contains('\n') { + return None; + } + // Reject brackets/parens inside src — tolerated by CommonMark via + // angle-bracket-wrap, but we keep this conservative for now. + Some((alt.to_string(), src.to_string())) +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + + fn parse(body: &str, offset: u32) -> (Vec, Vec) { + parse_blocks(body.as_bytes(), offset).unwrap() + } + + // ---- heading tree depth + heading_path correctness ---------------------- + + #[test] + fn heading_path_is_ancestors_only() { + let body = "# H1 text\n\n## H2 text\n\nbody under h2\n"; + let (blocks, warns) = parse(body, 1); + assert!(warns.is_empty(), "warns: {warns:?}"); + // Expect: H1 heading, H2 heading, paragraph. + assert_eq!(blocks.len(), 3); + // H1 has no ancestor. + assert_eq!(blocks[0].heading_path, Vec::::new()); + match &blocks[0].payload { + ParsedPayload::Heading { level, text } => { + assert_eq!(*level, 1); + assert_eq!(text, "H1 text"); + } + _ => panic!("expected heading"), + } + // H2 nests under H1. + assert_eq!(blocks[1].heading_path, vec!["H1 text".to_string()]); + // Paragraph nests under H1 → H2. + assert_eq!( + blocks[2].heading_path, + vec!["H1 text".to_string(), "H2 text".to_string()] + ); + } + + #[test] + fn h1_resets_deeper_path() { + let body = "# A\n\n## A.1\n\np1\n\n# B\n\np2\n"; + let (blocks, _) = parse(body, 1); + // p1 under [A, A.1]; p2 under [B]. + let p1 = blocks.iter().find(|b| matches!(b.payload, ParsedPayload::Paragraph { ref text, .. } if text == "p1")).unwrap(); + let p2 = blocks.iter().find(|b| matches!(b.payload, ParsedPayload::Paragraph { ref text, .. } if text == "p2")).unwrap(); + assert_eq!(p1.heading_path, vec!["A".to_string(), "A.1".to_string()]); + assert_eq!(p2.heading_path, vec!["B".to_string()]); + } + + // ---- code blocks --------------------------------------------------------- + + #[test] + fn code_block_lang_preserved() { + let body = "```rust\nfn main(){}\n```\n"; + let (blocks, _) = parse(body, 1); + assert_eq!(blocks.len(), 1); + match &blocks[0].payload { + ParsedPayload::Code { lang, code } => { + assert_eq!(lang.as_deref(), Some("rust")); + assert_eq!(code, "fn main(){}"); + } + _ => panic!("expected code"), + } + } + + #[test] + fn code_block_no_lang_is_none() { + let body = "```\nfoo\n```\n"; + let (blocks, _) = parse(body, 1); + match &blocks[0].payload { + ParsedPayload::Code { lang, .. } => assert_eq!(lang, &None), + _ => panic!("expected code"), + } + } + + // ---- tables -------------------------------------------------------------- + + #[test] + fn gfm_table_parses_headers_and_rows() { + let body = "| a | b | c |\n|---|---|---|\n| 1 | 2 | 3 |\n| x | y | z |\n"; + let (blocks, warns) = parse(body, 1); + assert!(warns.is_empty(), "warns: {warns:?}"); + assert_eq!(blocks.len(), 1); + match &blocks[0].payload { + ParsedPayload::Table { headers, rows } => { + assert_eq!(headers, &vec!["a".to_string(), "b".to_string(), "c".to_string()]); + assert_eq!(rows.len(), 2); + assert_eq!(rows[0], vec!["1".to_string(), "2".to_string(), "3".to_string()]); + assert_eq!(rows[1], vec!["x".to_string(), "y".to_string(), "z".to_string()]); + } + _ => panic!("expected table, got {:?}", blocks[0].payload), + } + } + + /// Drive the malformed-table fallback path directly by feeding a + /// hand-built event stream through `WalkState::handle_event`. This + /// pins the contract: when the table frame's `malformed` field is set + /// at end-of-table, the block degrades to a paragraph and a + /// `MalformedTable` warning is emitted. + /// + /// Doing it via events (instead of a real markdown source) is + /// necessary because `pulldown-cmark` auto-normalizes GFM tables — + /// short rows are padded, long rows are truncated, so a real markdown + /// input never reaches the malformed branch. + #[test] + fn malformed_table_falls_back_to_paragraph_with_warning() { + use pulldown_cmark::{Alignment, CowStr}; + + let body = b"| a | b |\n|---|---|\n| 1 |\n"; + let line_index = LineIndex::new(body); + let mut state = WalkState::new(1, &line_index, body); + let mut warnings = Vec::new(); + + // Synthetic events — fake a 3-column table with a 2-cell header so + // the `malformed` branch fires. + let aligns = vec![Alignment::None, Alignment::None, Alignment::None]; + state.handle_event(Event::Start(Tag::Table(aligns)), 0..body.len(), &mut warnings); + state.handle_event(Event::Start(Tag::TableHead), 0..0, &mut warnings); + state.handle_event(Event::Start(Tag::TableCell), 0..0, &mut warnings); + state.handle_event(Event::Text(CowStr::Borrowed("a")), 0..0, &mut warnings); + state.handle_event(Event::End(TagEnd::TableCell), 0..0, &mut warnings); + state.handle_event(Event::Start(Tag::TableCell), 0..0, &mut warnings); + state.handle_event(Event::Text(CowStr::Borrowed("b")), 0..0, &mut warnings); + state.handle_event(Event::End(TagEnd::TableCell), 0..0, &mut warnings); + // Two-cell header with cols=3 → triggers malformed. + state.handle_event(Event::End(TagEnd::TableHead), 0..0, &mut warnings); + state.handle_event(Event::End(TagEnd::Table), 0..body.len(), &mut warnings); + + let blocks = state.finish(); + assert_eq!(blocks.len(), 1); + assert_eq!(blocks[0].kind, ParsedBlockKind::Paragraph); + match &blocks[0].payload { + ParsedPayload::Paragraph { text, .. } => { + assert!(text.contains("| a | b |"), "raw markdown preserved: {text:?}"); + } + _ => panic!("expected paragraph fallback"), + } + assert_eq!(warnings.len(), 1); + assert!(matches!(warnings[0].kind, WarningKind::MalformedTable)); + assert!(warnings[0].note.contains("header")); + } + + // ---- line ranges (LF) ---------------------------------------------------- + + #[test] + fn line_range_lf_paragraph_at_body_line_5() { + // body lines: 1=blank, 2=blank, 3=blank, 4=blank, 5=paragraph + let body = "\n\n\n\nhello world\n"; + // body_offset_lines=6 → body line 0 (zero-indexed) → file line 6. + // The paragraph is at body 0-indexed line 4 → file line 10. + let (blocks, _) = parse(body, 6); + assert_eq!(blocks.len(), 1); + match blocks[0].source_span { + SourceSpan::Line { start, end } => { + assert_eq!(start, 10, "paragraph should start at file line 10"); + assert_eq!(end, 10); + } + _ => panic!("expected line span"), + } + } + + #[test] + fn line_range_lf_multi_line_paragraph() { + // body line 0 = "a", 1 = "b", 2 = "c"; one paragraph, lines 0..=2. + let body = "a\nb\nc\n"; + let (blocks, _) = parse(body, 1); + assert_eq!(blocks.len(), 1); + match blocks[0].source_span { + SourceSpan::Line { start, end } => { + assert_eq!(start, 1); + assert_eq!(end, 3); + } + _ => panic!(), + } + } + + // ---- line ranges (CRLF) -------------------------------------------------- + + #[test] + fn line_range_crlf_matches_lf() { + let body_lf = "\n\n\n\nhello\n"; + let body_crlf = "\r\n\r\n\r\n\r\nhello\r\n"; + let (lf_blocks, _) = parse(body_lf, 6); + let (crlf_blocks, _) = parse(body_crlf, 6); + assert_eq!(lf_blocks.len(), 1); + assert_eq!(crlf_blocks.len(), 1); + assert_eq!(lf_blocks[0].source_span, crlf_blocks[0].source_span); + } + + // ---- image ref ----------------------------------------------------------- + + #[test] + fn image_ref_block_captures_src_and_alt() { + let body = "![hello](pic.png)\n"; + let (blocks, _) = parse(body, 1); + assert_eq!(blocks.len(), 1); + match &blocks[0].payload { + ParsedPayload::ImageRef { src, alt } => { + assert_eq!(alt, "hello"); + assert_eq!(src, "pic.png"); + } + _ => panic!("expected image ref, got {:?}", blocks[0].payload), + } + assert_eq!(blocks[0].kind, ParsedBlockKind::ImageRef); + } + + #[test] + fn inline_image_inside_paragraph_is_dropped() { + // The image is part of a longer paragraph → not a block-level image. + let body = "see ![alt](pic.png) here\n"; + let (blocks, _) = parse(body, 1); + assert_eq!(blocks.len(), 1); + match &blocks[0].payload { + ParsedPayload::Paragraph { inlines, .. } => { + // Only Text/Code/Link/Strong/Emph allowed. + for inl in inlines { + assert!( + matches!( + inl, + Inline::Text(_) | Inline::Code(_) | Inline::Link { .. } | Inline::Strong(_) | Inline::Emph(_) + ), + "unexpected inline kind: {:?}", + inl + ); + } + } + _ => panic!("expected paragraph, got {:?}", blocks[0].payload), + } + } + + // ---- nested lists -------------------------------------------------------- + + #[test] + fn nested_list_flattens_into_parent_item() { + let body = "- a\n - x\n - y\n- b\n"; + let (blocks, _) = parse(body, 1); + assert_eq!(blocks.len(), 1); + match &blocks[0].payload { + ParsedPayload::List { ordered, items } => { + assert!(!ordered); + assert_eq!(items.len(), 2, "two top-level items"); + // First item should contain "a" plus a flattened rendering + // of the nested sub-list. + let flat = flatten_inlines_to_text(&items[0]); + assert!(flat.contains("a"), "first item missing 'a': {flat:?}"); + assert!(flat.contains("- x"), "first item missing '- x': {flat:?}"); + assert!(flat.contains("- y"), "first item missing '- y': {flat:?}"); + let flat2 = flatten_inlines_to_text(&items[1]); + assert_eq!(flat2.trim(), "b"); + } + _ => panic!("expected list, got {:?}", blocks[0].payload), + } + } + + // ---- malformed input no-panic ------------------------------------------- + + #[test] + fn random_bytes_do_not_panic() { + // Tiny xorshift PRNG seeded with a constant; deterministic inputs + // covering both valid-ish and invalid markdown / utf-8 mixes. + let mut state: u32 = 0x1234_5678; + let mut next = || { + state ^= state << 13; + state ^= state >> 17; + state ^= state << 5; + state + }; + for _ in 0..100 { + let len = (next() as usize) % 256; + let bytes: Vec = (0..len).map(|_| next() as u8).collect(); + let _ = parse_blocks(&bytes, 1).expect("must be Ok"); + } + } + + #[test] + fn adversarial_inputs_no_panic() { + // Hand-crafted oddities. + let cases: &[&[u8]] = &[ + b"", + b"\0\0\0", + b"```\nunclosed", + b"# heading\n```\nfn main() {", + b"| a | b |\n|---|---|\n| 1 |\n", // short row + b"| a | b |\n|---|\n| 1 | 2 |\n", // header/sep mismatch + b"![", + b"](", + b"---\nfm: yes\n", + b"#######", // 7 hashes (invalid heading) + b"\xff\xfe\x00\x00garbage", // non-utf8 + "# 한글\n\n본문\n".as_bytes(), + ]; + for c in cases { + let _ = parse_blocks(c, 1).expect("must be Ok"); + } + } + + // ---- inline filter ------------------------------------------------------- + + #[test] + fn only_allowed_inlines_emitted() { + let body = "**bold** *em* `code` [link](u)\n"; + let (blocks, _) = parse(body, 1); + match &blocks[0].payload { + ParsedPayload::Paragraph { inlines, .. } => { + let kinds: Vec<&'static str> = inlines.iter().map(|i| match i { + Inline::Text(_) => "Text", + Inline::Code(_) => "Code", + Inline::Link { .. } => "Link", + Inline::Strong(_) => "Strong", + Inline::Emph(_) => "Emph", + }).collect(); + assert!(kinds.contains(&"Strong")); + assert!(kinds.contains(&"Emph")); + assert!(kinds.contains(&"Code")); + assert!(kinds.contains(&"Link")); + } + _ => panic!(), + } + } +} diff --git a/crates/kb-parse-md/src/lib.rs b/crates/kb-parse-md/src/lib.rs index 4ce413d..245e8bb 100644 --- a/crates/kb-parse-md/src/lib.rs +++ b/crates/kb-parse-md/src/lib.rs @@ -1,19 +1,21 @@ //! `kb-parse-md` — Markdown parsing for the KB pipeline (§3.7b). //! -//! P1-2 implements the **frontmatter** submodule only. P1-3 will add a -//! sibling `blocks` submodule for block parsing using `pulldown-cmark`. -//! -//! Public surface for P1-2 is intentionally narrow: +//! Public surface: //! //! * [`parse_frontmatter`] — pure function from Markdown bytes to -//! `(Metadata, Option, Vec)`. +//! `(Metadata, Option, Vec)` (P1-2). //! * [`BodyHints`] — caller-supplied fallbacks that feed the §0 Q9 derive -//! table when frontmatter is missing or partial. +//! table when frontmatter is missing or partial (P1-2). //! * [`FrontmatterSpan`] — byte offsets of the frontmatter region in the -//! input slice (returned by [`parse_frontmatter`]). +//! input slice (returned by [`parse_frontmatter`]) (P1-2). +//! * [`parse_blocks`] — pure function from Markdown body bytes to +//! `(Vec, Vec)` with heading paths and 1-indexed +//! `SourceSpan::Line` ranges relative to the original file (P1-3). //! //! Anything else in this crate is `pub(crate)` and may change without notice. +pub mod blocks; pub mod frontmatter; +pub use blocks::parse_blocks; pub use frontmatter::{BodyHints, FrontmatterSpan, parse_frontmatter}; -- 2.49.1 From f604a381dfda2e00284e3e299c8fa2dedf69f851 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 14:17:41 +0000 Subject: [PATCH 02/10] p1-3: snapshot tests + clippy fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two snapshot tests (`nested-headings.md`, `code-and-table.md`) under crates/kb-parse-md/tests/blocks_snapshots.rs, with matching baseline JSON next to each fixture. The snapshot view projects `kb_core::Inline` to flat strings — `Inline` carries `serde(tag = "kind")` which is incompatible with newtype variants holding a primitive (`Text(String)`), so direct serialization of `ParsedBlock` would fail today. The view preserves the contract that matters for P1-3 (heading paths, source spans, payload kinds, payload text/code/table content) and will keep working once kb-core fixes the Inline schema in a later task. Also tightens `level_to_use >= 1 && <= 6` into `(1..=6).contains(&_)` to satisfy `clippy::manual_range_contains`. --- crates/kb-parse-md/src/blocks.rs | 2 +- crates/kb-parse-md/tests/blocks_snapshots.rs | 240 ++++++++++++++++++ .../code-and-table.blocks.snapshot.json | 63 +++++ .../nested-headings.blocks.snapshot.json | 136 ++++++++++ 4 files changed, 440 insertions(+), 1 deletion(-) create mode 100644 crates/kb-parse-md/tests/blocks_snapshots.rs create mode 100644 fixtures/markdown/code-and-table.blocks.snapshot.json create mode 100644 fixtures/markdown/nested-headings.blocks.snapshot.json diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs index 1ddfaa6..86913de 100644 --- a/crates/kb-parse-md/src/blocks.rs +++ b/crates/kb-parse-md/src/blocks.rs @@ -528,7 +528,7 @@ impl<'a> WalkState<'a> { let _ = level_u8; // Update heading stack: clear deeper levels, set this level. - if level_to_use >= 1 && level_to_use <= 6 { + if (1..=6).contains(&level_to_use) { let idx = (level_to_use - 1) as usize; for slot in &mut self.heading_stack[idx + 1..] { *slot = None; diff --git a/crates/kb-parse-md/tests/blocks_snapshots.rs b/crates/kb-parse-md/tests/blocks_snapshots.rs new file mode 100644 index 0000000..483286e --- /dev/null +++ b/crates/kb-parse-md/tests/blocks_snapshots.rs @@ -0,0 +1,240 @@ +//! Snapshot tests pinning the `parse_blocks` output for two fixtures. +//! +//! Baselines are hand-authored / regenerated via the `--ignored` emitter +//! below. `body_offset_lines = 1` is used for both fixtures (no +//! frontmatter, body starts at file line 1). +//! +//! Note on snapshot shape: `kb_core::Inline` carries a `serde(tag = "kind")` +//! enum representation that cannot serialize newtype variants holding a +//! primitive (`Inline::Text(String)` etc.) — that's a serde limitation, not +//! ours, and is fixed up in a later kb-core task. To keep the snapshot +//! human-readable (and stable across that future fix), we project each +//! `ParsedBlock` into a `BlockView` that flattens inline content to plain +//! strings before serialization. This still pins the *contract* that +//! matters for P1-3: heading paths, source spans, payload kinds, payload +//! text content, table headers/rows, and code lang/body. + +use kb_core::{Inline, SourceSpan}; +use kb_parse_md::parse_blocks; +use kb_parse_types::{ParsedBlock, ParsedPayload, Warning}; +use serde::Serialize; +use serde_json::Value; +use std::fs; +use std::path::PathBuf; + +#[derive(Serialize)] +struct Snapshot { + blocks: Vec, + warnings: Vec, +} + +#[derive(Serialize)] +struct BlockView { + kind: String, + heading_path: Vec, + source_span: SourceSpan, + payload: PayloadView, +} + +#[derive(Serialize)] +#[serde(tag = "kind", rename_all = "lowercase")] +enum PayloadView { + Heading { + level: u8, + text: String, + }, + Paragraph { + text: String, + inlines_flat: String, + }, + List { + ordered: bool, + items_flat: Vec, + }, + Code { + lang: Option, + code: String, + }, + Table { + headers: Vec, + rows: Vec>, + }, + Quote { + text: String, + inlines_flat: String, + }, + ImageRef { + src: String, + alt: String, + }, + AudioRef { + src: String, + }, +} + +fn flatten_inline(i: &Inline, out: &mut String) { + match i { + Inline::Text(s) | Inline::Code(s) => out.push_str(s), + Inline::Link { text, href } => { + out.push('['); + out.push_str(text); + out.push_str("]("); + out.push_str(href); + out.push(')'); + } + Inline::Strong(v) => { + out.push_str("**"); + for c in v { + flatten_inline(c, out); + } + out.push_str("**"); + } + Inline::Emph(v) => { + out.push('*'); + for c in v { + flatten_inline(c, out); + } + out.push('*'); + } + } +} + +fn flatten(inlines: &[Inline]) -> String { + let mut out = String::new(); + for i in inlines { + flatten_inline(i, &mut out); + } + out +} + +fn block_to_view(b: &ParsedBlock) -> BlockView { + let kind = format!("{:?}", b.kind).to_lowercase(); + let payload = match &b.payload { + ParsedPayload::Heading { level, text } => PayloadView::Heading { + level: *level, + text: text.clone(), + }, + ParsedPayload::Paragraph { text, inlines } => PayloadView::Paragraph { + text: text.clone(), + inlines_flat: flatten(inlines), + }, + ParsedPayload::List { ordered, items } => PayloadView::List { + ordered: *ordered, + items_flat: items.iter().map(|it| flatten(it)).collect(), + }, + ParsedPayload::Code { lang, code } => PayloadView::Code { + lang: lang.clone(), + code: code.clone(), + }, + ParsedPayload::Table { headers, rows } => PayloadView::Table { + headers: headers.clone(), + rows: rows.clone(), + }, + ParsedPayload::Quote { text, inlines } => PayloadView::Quote { + text: text.clone(), + inlines_flat: flatten(inlines), + }, + ParsedPayload::ImageRef { src, alt } => PayloadView::ImageRef { + src: src.clone(), + alt: alt.clone(), + }, + ParsedPayload::AudioRef { src } => PayloadView::AudioRef { src: src.clone() }, + }; + BlockView { + kind, + heading_path: b.heading_path.clone(), + source_span: b.source_span.clone(), + payload, + } +} + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("..") + .join("..") + .join("fixtures") + .join("markdown") +} + +fn assert_snapshot(fixture: &str, baseline: &str) { + let dir = fixtures_dir(); + let bytes = fs::read(dir.join(fixture)).expect("fixture readable"); + + let (blocks, warns) = parse_blocks(&bytes, 1).unwrap(); + let snap = Snapshot { + blocks: blocks.iter().map(block_to_view).collect(), + warnings: warns, + }; + let actual: Value = serde_json::to_value(&snap).unwrap(); + + let expected_text = + fs::read_to_string(dir.join(baseline)).expect("snapshot baseline readable"); + let expected: Value = serde_json::from_str(&expected_text).expect("baseline parses as json"); + + if actual != expected { + let actual_pretty = serde_json::to_string_pretty(&actual).unwrap(); + panic!( + "snapshot drift for {fixture}\n\ + --- expected ({baseline}) ---\n{expected_text}\n\ + --- actual ---\n{actual_pretty}\n\ + If the change is intentional, update {baseline}." + ); + } +} + +#[test] +fn nested_headings_blocks_snapshot() { + assert_snapshot( + "nested-headings.md", + "nested-headings.blocks.snapshot.json", + ); +} + +#[test] +fn code_and_table_blocks_snapshot() { + assert_snapshot( + "code-and-table.md", + "code-and-table.blocks.snapshot.json", + ); +} + +/// Run with `cargo test -p kb-parse-md --test blocks_snapshots emit_blocks_snapshots -- --ignored --nocapture` +/// to regenerate the baseline JSON files from the current parser output. +#[test] +#[ignore] +fn emit_blocks_snapshots() { + let dir = fixtures_dir(); + for (fixture, baseline) in [ + ("nested-headings.md", "nested-headings.blocks.snapshot.json"), + ("code-and-table.md", "code-and-table.blocks.snapshot.json"), + ] { + let bytes = fs::read(dir.join(fixture)).unwrap(); + let (blocks, warns) = parse_blocks(&bytes, 1).unwrap(); + let snap = Snapshot { + blocks: blocks.iter().map(block_to_view).collect(), + warnings: warns, + }; + let json = serde_json::to_string_pretty(&snap).unwrap(); + fs::write(dir.join(baseline), format!("{json}\n")).unwrap(); + eprintln!("wrote {}", dir.join(baseline).display()); + } +} + +/// Determinism: parsing the same fixture twice in a row must give equal output. +#[test] +fn snapshot_is_deterministic_across_runs() { + let dir = fixtures_dir(); + let bytes = fs::read(dir.join("nested-headings.md")).unwrap(); + let (a_blocks, a_warns) = parse_blocks(&bytes, 1).unwrap(); + let (b_blocks, b_warns) = parse_blocks(&bytes, 1).unwrap(); + // Compare via the view (which is fully serializable) and via the + // structural equality on `ParsedBlock` itself (no serde involved). + assert_eq!(a_blocks, b_blocks); + assert_eq!(a_warns, b_warns); + let av: Vec<_> = a_blocks.iter().map(block_to_view).collect(); + let bv: Vec<_> = b_blocks.iter().map(block_to_view).collect(); + assert_eq!( + serde_json::to_value(&av).unwrap(), + serde_json::to_value(&bv).unwrap() + ); +} diff --git a/fixtures/markdown/code-and-table.blocks.snapshot.json b/fixtures/markdown/code-and-table.blocks.snapshot.json new file mode 100644 index 0000000..8335e43 --- /dev/null +++ b/fixtures/markdown/code-and-table.blocks.snapshot.json @@ -0,0 +1,63 @@ +{ + "blocks": [ + { + "kind": "heading", + "heading_path": [], + "source_span": { + "kind": "line", + "start": 1, + "end": 1 + }, + "payload": { + "kind": "heading", + "level": 1, + "text": "Code And Table" + } + }, + { + "kind": "code", + "heading_path": [ + "Code And Table" + ], + "source_span": { + "kind": "line", + "start": 3, + "end": 7 + }, + "payload": { + "kind": "code", + "lang": "rust", + "code": "fn main() {\n println!(\"hi\");\n}" + } + }, + { + "kind": "table", + "heading_path": [ + "Code And Table" + ], + "source_span": { + "kind": "line", + "start": 9, + "end": 12 + }, + "payload": { + "kind": "table", + "headers": [ + "col a", + "col b" + ], + "rows": [ + [ + "1", + "2" + ], + [ + "3", + "4" + ] + ] + } + } + ], + "warnings": [] +} diff --git a/fixtures/markdown/nested-headings.blocks.snapshot.json b/fixtures/markdown/nested-headings.blocks.snapshot.json new file mode 100644 index 0000000..032c3e6 --- /dev/null +++ b/fixtures/markdown/nested-headings.blocks.snapshot.json @@ -0,0 +1,136 @@ +{ + "blocks": [ + { + "kind": "heading", + "heading_path": [], + "source_span": { + "kind": "line", + "start": 1, + "end": 1 + }, + "payload": { + "kind": "heading", + "level": 1, + "text": "Top" + } + }, + { + "kind": "paragraph", + "heading_path": [ + "Top" + ], + "source_span": { + "kind": "line", + "start": 3, + "end": 3 + }, + "payload": { + "kind": "paragraph", + "text": "intro", + "inlines_flat": "intro" + } + }, + { + "kind": "heading", + "heading_path": [ + "Top" + ], + "source_span": { + "kind": "line", + "start": 5, + "end": 5 + }, + "payload": { + "kind": "heading", + "level": 2, + "text": "Section A" + } + }, + { + "kind": "paragraph", + "heading_path": [ + "Top", + "Section A" + ], + "source_span": { + "kind": "line", + "start": 7, + "end": 7 + }, + "payload": { + "kind": "paragraph", + "text": "body of A", + "inlines_flat": "body of A" + } + }, + { + "kind": "heading", + "heading_path": [ + "Top", + "Section A" + ], + "source_span": { + "kind": "line", + "start": 9, + "end": 9 + }, + "payload": { + "kind": "heading", + "level": 3, + "text": "Sub A.1" + } + }, + { + "kind": "paragraph", + "heading_path": [ + "Top", + "Section A", + "Sub A.1" + ], + "source_span": { + "kind": "line", + "start": 11, + "end": 11 + }, + "payload": { + "kind": "paragraph", + "text": "deeper", + "inlines_flat": "deeper" + } + }, + { + "kind": "heading", + "heading_path": [ + "Top" + ], + "source_span": { + "kind": "line", + "start": 13, + "end": 13 + }, + "payload": { + "kind": "heading", + "level": 2, + "text": "Section B" + } + }, + { + "kind": "paragraph", + "heading_path": [ + "Top", + "Section B" + ], + "source_span": { + "kind": "line", + "start": 15, + "end": 15 + }, + "payload": { + "kind": "paragraph", + "text": "body of B", + "inlines_flat": "body of B" + } + } + ], + "warnings": [] +} -- 2.49.1 From 2ac0f56105d6e76b183db9b82d5aab5ac72134c8 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 14:18:54 +0000 Subject: [PATCH 03/10] p1-3: drop dead bindings in heading + table end handlers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes the leftover `let _ = level_u8;` and `let _ = header_count;` discards. The Heading frame already carries the canonical level (we populated it from `Tag::Heading` at Start), so we destructure that directly and ignore the redundant `TagEnd::Heading(level)` payload. The header_count helper was dead — Frame::Table tracks `cols` internally and we never consumed `header_count`. --- crates/kb-parse-md/src/blocks.rs | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs index 86913de..5d225c0 100644 --- a/crates/kb-parse-md/src/blocks.rs +++ b/crates/kb-parse-md/src/blocks.rs @@ -519,13 +519,13 @@ impl<'a> WalkState<'a> { } // ---- Container ends ------------------------------------------------- - Event::End(TagEnd::Heading(level)) => { - let level_u8 = heading_level_to_u8(level); - if let Some(Frame::Heading { level: lvl, range, inlines }) = self.frames.pop() { + Event::End(TagEnd::Heading(_level)) => { + // The Tag::Heading frame is the source of truth for the + // level — `_level` from TagEnd is identical for well-formed + // input. We trust the frame. + if let Some(Frame::Heading { level: level_to_use, range, inlines }) = self.frames.pop() { let (_inline_vec, text) = inlines.finish(); let text = text.trim().to_string(); - let level_to_use = lvl; - let _ = level_u8; // Update heading stack: clear deeper levels, set this level. if (1..=6).contains(&level_to_use) { @@ -679,12 +679,10 @@ impl<'a> WalkState<'a> { range, headers, rows, - cols, malformed, .. }) = self.frames.pop() { - let header_count = if cols > 0 { cols } else { headers.len() }; let block = if let Some(note) = malformed { // Fall back to a paragraph carrying the raw markdown. warnings.push(Warning { @@ -706,7 +704,6 @@ impl<'a> WalkState<'a> { }, } } else { - let _ = header_count; ParsedBlock { kind: ParsedBlockKind::Table, heading_path: self.heading_path(), -- 2.49.1 From de9164802be86cf7f7386cc02dddfaa6b3818e94 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 14:34:58 +0000 Subject: [PATCH 04/10] p1-3: fix span arithmetic overflow + body_offset_lines fuzz `span_for` previously used `u32 + u32` directly, so callers passing a large `body_offset_lines` could panic (debug, then masked by `catch_unwind` and the entire body discarded) or wrap to an inverted span with `start > end` (release). Switch to `checked_add`; on overflow flag the walk state and at the end of `parse_blocks_inner` discard accumulated blocks and surface a single `Warning::ExtractFailed` carrying the offending body line. This degrades cleanly without panicking and without emitting a silently-broken span. Also extend `random_bytes_do_not_panic` to mix u32::MAX-style offsets across the fuzz iterations so the overflow path is exercised by the randomized corpus. New tests: - body_offset_lines_max_returns_extract_failed - body_offset_lines_zero_at_max_minus_one_no_overflow Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kb-parse-md/src/blocks.rs | 96 ++++++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 6 deletions(-) diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs index 5d225c0..56f5c92 100644 --- a/crates/kb-parse-md/src/blocks.rs +++ b/crates/kb-parse-md/src/blocks.rs @@ -96,6 +96,24 @@ fn parse_blocks_inner(body: &[u8], body_offset_lines: u32) -> (Vec, state.handle_event(event, range, &mut warnings); } + // If line-number arithmetic saturated at any point (e.g. caller passed + // `body_offset_lines = u32::MAX`), discard the (potentially clamped) + // blocks and surface a single `ExtractFailed` warning. Returning the + // possibly-inverted spans would be more harmful than dropping output. + if state.overflow_detected { + let at = state + .overflow_at_body_line + .map(|n| n.to_string()) + .unwrap_or_else(|| "?".to_string()); + return ( + Vec::new(), + vec![Warning { + kind: WarningKind::ExtractFailed, + note: format!("body_offset_lines overflow at body line {at}"), + }], + ); + } + (state.finish(), warnings) } @@ -174,6 +192,14 @@ struct WalkState<'a> { /// Accumulator for the currently-open block. We push events into this /// frame and convert to a `ParsedBlock` when the matching End tag fires. frames: Vec, + + /// Set if any line-number arithmetic in [`WalkState::span_for`] would + /// overflow `u32::MAX`. When set, [`Self::finish`] discards accumulated + /// blocks and the caller emits a single `Warning::ExtractFailed` instead. + overflow_detected: bool, + /// Body-line (0-indexed) where overflow was first observed, for the + /// warning note. + overflow_at_body_line: Option, } /// One in-flight container we are accumulating events into. Stack-shaped so @@ -372,6 +398,8 @@ impl<'a> WalkState<'a> { blocks: Vec::new(), heading_stack: Default::default(), frames: Vec::new(), + overflow_detected: false, + overflow_at_body_line: None, } } @@ -386,16 +414,33 @@ impl<'a> WalkState<'a> { .collect() } - fn span_for(&self, range: &Range) -> SourceSpan { + fn span_for(&mut self, range: &Range) -> SourceSpan { let start_body = self.line_index.line_zero_indexed(range.start); let end_body = if range.end <= range.start { start_body } else { self.line_index.end_line_zero_indexed(range.end) }; - SourceSpan::Line { - start: start_body + self.body_offset_lines, - end: end_body + self.body_offset_lines, + // Saturating add — but also remember whether overflow happened so the + // caller can degrade gracefully rather than silently emitting an + // inverted span. Without this guard, debug builds panic with + // "attempt to add with overflow" (caught by `catch_unwind`, masking + // the real cause) and release builds wrap to `start > end`. + match ( + start_body.checked_add(self.body_offset_lines), + end_body.checked_add(self.body_offset_lines), + ) { + (Some(start), Some(end)) => SourceSpan::Line { start, end }, + _ => { + if !self.overflow_detected { + self.overflow_detected = true; + self.overflow_at_body_line = Some(start_body); + } + SourceSpan::Line { + start: start_body.saturating_add(self.body_offset_lines), + end: end_body.saturating_add(self.body_offset_lines), + } + } } } @@ -1156,6 +1201,9 @@ mod tests { fn random_bytes_do_not_panic() { // Tiny xorshift PRNG seeded with a constant; deterministic inputs // covering both valid-ish and invalid markdown / utf-8 mixes. + // We also vary `body_offset_lines` across the full u32 range — + // including u32::MAX — to exercise the saturating-add path in + // `span_for` and confirm the overflow guard prevents panics. let mut state: u32 = 0x1234_5678; let mut next = || { state ^= state << 13; @@ -1163,13 +1211,49 @@ mod tests { state ^= state << 5; state }; - for _ in 0..100 { + for i in 0..100 { let len = (next() as usize) % 256; let bytes: Vec = (0..len).map(|_| next() as u8).collect(); - let _ = parse_blocks(&bytes, 1).expect("must be Ok"); + // Mix a u32::MAX-style offset every few iterations so the fuzz + // exercises the overflow path explicitly. + let offset = match i % 4 { + 0 => 0, + 1 => 1, + 2 => next(), + _ => u32::MAX - (next() % 16), + }; + let _ = parse_blocks(&bytes, offset).expect("must be Ok"); } } + #[test] + fn body_offset_lines_max_returns_extract_failed() { + // u32::MAX + any positive line number overflows; the parser must + // surface `ExtractFailed` and discard blocks rather than panic + // (debug) or emit an inverted span (release). + let body = b"# h\nbody\n"; + let (blocks, warns) = parse_blocks(body, u32::MAX).expect("must be Ok"); + assert!(blocks.is_empty(), "blocks should be discarded on overflow"); + assert_eq!(warns.len(), 1, "expected one warning, got: {warns:?}"); + assert_eq!(warns[0].kind, WarningKind::ExtractFailed); + assert!( + warns[0].note.contains("overflow"), + "note should mention overflow: {:?}", + warns[0].note + ); + } + + #[test] + fn body_offset_lines_zero_at_max_minus_one_no_overflow() { + // A body with exactly one logical line + offset = u32::MAX would + // still saturate. Pick offset such that no line ever reaches + // overflow — this should succeed normally. + let body = b"hi\n"; + let (blocks, warns) = parse_blocks(body, u32::MAX - 100).expect("must be Ok"); + assert_eq!(blocks.len(), 1); + assert!(warns.is_empty(), "no overflow expected, got: {warns:?}"); + } + #[test] fn adversarial_inputs_no_panic() { // Hand-crafted oddities. -- 2.49.1 From 0050cf32ea7618b7f69c22d04ccaf5cb7bd83f76 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 14:36:27 +0000 Subject: [PATCH 05/10] p1-3: route block-level content inside list items into parent inline buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `emit_block` previously walked the frame stack looking only for a Quote container, falling back to top-level on miss. That caused any block emitted inside a list item — code blocks, images, tables, headings — to escape the list and appear at the top of `blocks`, after the entire list and out of source order. `ParsedPayload::List { items: Vec> }` cannot represent a child block structurally, so the choice is between dropping content and flattening. Extend the reverse-walk to also recognize `Frame::ListItem` and route the block into a textual rendering appended to the item's inline buffer (`flatten_block_into_item`): - Code → fenced text approximation, preserving lang hint + body - Image → `![alt](src)` text - Audio → `[audio](src)` text - Heading → leading hashes + text - Quote → `> text` - Nested List → same rendering as `nested_in_item` flatten - Table → pipe-table approximation Document order is preserved because flattening happens inside the item's frame, before the item closes. New tests: - code_block_inside_list_item_flattens_into_parent - image_inside_list_item_flattens_into_parent - block_content_in_list_preserves_document_order Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kb-parse-md/src/blocks.rs | 171 ++++++++++++++++++++++++++++++- 1 file changed, 167 insertions(+), 4 deletions(-) diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs index 56f5c92..10afa2c 100644 --- a/crates/kb-parse-md/src/blocks.rs +++ b/crates/kb-parse-md/src/blocks.rs @@ -369,6 +369,88 @@ impl InlineBuf { } +/// Flatten an emitted block into the inline buffer of an enclosing list +/// item, preserving content but losing structure. This keeps document +/// order and avoids "block escapes the list" — the alternative would be +/// pushing the block to the top-level `blocks` vec, where it appears +/// after the entire list and out of source order. +/// +/// Code blocks render as a fenced-text approximation so the language +/// hint and body survive. Images render as `![alt](src)`. Headings, +/// tables, quotes, etc. render as their text payload. +fn flatten_block_into_item(block: &ParsedBlock, inlines: &mut InlineBuf) { + match &block.payload { + ParsedPayload::Code { lang, code } => { + let mut rendered = String::from("\n\n```"); + if let Some(l) = lang { + rendered.push_str(l); + } + rendered.push('\n'); + rendered.push_str(code); + if !code.ends_with('\n') { + rendered.push('\n'); + } + rendered.push_str("```\n"); + inlines.push_text(&rendered); + } + ParsedPayload::ImageRef { src, alt } => { + inlines.push_text(&format!("![{alt}]({src})")); + } + ParsedPayload::AudioRef { src } => { + inlines.push_text(&format!("[audio]({src})")); + } + ParsedPayload::Heading { level, text } => { + // Render with leading hashes so structure is recognizable. + let hashes = "#".repeat((*level as usize).clamp(1, 6)); + inlines.push_text(&format!("\n{hashes} {text}\n")); + } + ParsedPayload::Paragraph { text, inlines: child } => { + // Paragraphs inside list items normally don't reach this path + // (the Start(Tag::Paragraph) handler suppresses creating a + // Paragraph frame when the parent is a ListItem). This branch + // is defensive — e.g. a paragraph emitted via some future + // synthetic-event path. + inlines.push_text("\n"); + for c in child { + inlines.push_inline(c.clone()); + } + inlines.text.push_str(text); + } + ParsedPayload::Quote { text, .. } => { + inlines.push_text(&format!("\n> {text}\n")); + } + ParsedPayload::List { ordered, items } => { + // A non-nested-flag List that still ended up inside a ListItem + // (shouldn't normally happen — `nested_in_item` flattens at + // `End(List)`). Fall back to the same rendering as the nested + // path for safety. + let mut rendered = String::new(); + for (i, item) in items.iter().enumerate() { + rendered.push('\n'); + rendered.push_str(" "); + if *ordered { + rendered.push_str(&format!("{}. ", i + 1)); + } else { + rendered.push_str("- "); + } + rendered.push_str(&flatten_inlines_to_text(item)); + } + inlines.push_text(&rendered); + } + ParsedPayload::Table { headers, rows } => { + // Pipe-table approximation; structure lost, content preserved. + let mut rendered = String::from("\n"); + rendered.push_str(&headers.join(" | ")); + rendered.push('\n'); + for row in rows { + rendered.push_str(&row.join(" | ")); + rendered.push('\n'); + } + inlines.push_text(&rendered); + } + } +} + fn flatten_inlines_to_text(inlines: &[Inline]) -> String { let mut out = String::new(); for i in inlines { @@ -446,12 +528,32 @@ impl<'a> WalkState<'a> { /// Where to emit a finished block: into the current container if any /// (Quote / ListItem), otherwise into the top-level `blocks` vec. + /// + /// Block-level content (code, image, heading, table, ...) inside a list + /// item cannot be represented structurally by `ParsedPayload::List` + /// (items hold `Vec`, not child blocks). To preserve content + /// and document order we **flatten** such blocks into a textual + /// rendering and append it to the enclosing list item's inline buffer. + /// Without this, a code block inside a list item escapes to the + /// top-level `blocks` vec — out of order, and detached from its parent. fn emit_block(&mut self, block: ParsedBlock) { // Find the nearest enclosing container that accepts child blocks. - for frame in self.frames.iter_mut().rev() { - if let Frame::Quote { children, .. } = frame { - children.push(block); - return; + // Walk in reverse: ListItem and Quote both qualify; we honor + // whichever is closer to the top of the frame stack. + for idx in (0..self.frames.len()).rev() { + match &mut self.frames[idx] { + Frame::Quote { children, .. } => { + children.push(block); + return; + } + Frame::ListItem { inlines } => { + // Render the block as text + inlines and append to the + // item's inline buffer. Document order is preserved + // because we run inside the item's frame. + flatten_block_into_item(&block, inlines); + return; + } + _ => continue, } } self.blocks.push(block); @@ -1195,6 +1297,67 @@ mod tests { } } + // ---- block content inside list items ------------------------------------ + + #[test] + fn code_block_inside_list_item_flattens_into_parent() { + // Without the routing fix, the code block would escape the list and + // appear at top level (in wrong source order). + let body = "- item\n\n ```rust\n fn f(){}\n ```\n\n- next\n"; + let (blocks, _) = parse(body, 1); + // Expect a single List block, no escaped Code at top level. + assert_eq!( + blocks.len(), + 1, + "code should not escape to top level: got {} blocks", + blocks.len() + ); + match &blocks[0].payload { + ParsedPayload::List { ordered, items } => { + assert!(!ordered); + assert_eq!(items.len(), 2); + let flat = flatten_inlines_to_text(&items[0]); + assert!(flat.contains("item"), "first item missing 'item': {flat:?}"); + assert!(flat.contains("fn f(){}"), "first item missing code body: {flat:?}"); + assert!(flat.contains("```"), "first item missing fence: {flat:?}"); + assert_eq!(flatten_inlines_to_text(&items[1]).trim(), "next"); + } + _ => panic!("expected list, got {:?}", blocks[0].payload), + } + } + + #[test] + fn image_inside_list_item_flattens_into_parent() { + let body = "- ![alt](pic.png)\n"; + let (blocks, _) = parse(body, 1); + // The image should NOT escape to top level as a standalone ImageRef. + assert_eq!(blocks.len(), 1); + match &blocks[0].payload { + ParsedPayload::List { items, .. } => { + assert_eq!(items.len(), 1); + let flat = flatten_inlines_to_text(&items[0]); + assert!( + flat.contains("![alt](pic.png)") || flat.contains("alt"), + "image should be rendered into item: {flat:?}" + ); + } + _ => panic!("expected list, got {:?}", blocks[0].payload), + } + } + + #[test] + fn block_content_in_list_preserves_document_order() { + // Two items with code between; the resulting blocks should be in + // strictly source order: just one List, no top-level code block. + let body = "- a\n\n ```\n c\n ```\n\n- b\n"; + let (blocks, _) = parse(body, 1); + assert_eq!(blocks.len(), 1, "expected single list block"); + match &blocks[0].kind { + kb_parse_types::ParsedBlockKind::List => {} + other => panic!("expected list, got {other:?}"), + } + } + // ---- malformed input no-panic ------------------------------------------- #[test] -- 2.49.1 From d49dbc19268c2ec9619d9b536a8b75f6298b2b7b Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 14:37:40 +0000 Subject: [PATCH 06/10] p1-3: skip empty heading text when building heading_path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `# ` (a heading with no following text) used to seed the heading stack with `Some("")`, which then propagated into every child block's `heading_path` as a `""` segment — visibly polluting the path that downstream consumers index by. Filter empty entries from both `heading_path()` and the in-line ancestor collection at heading-end. We deliberately keep `Some("")` in the stack rather than skipping the assignment so the slot remains occupied and a subsequent deeper heading is still positioned correctly relative to its level — only the visible path drops the empty. New tests: - empty_heading_does_not_pollute_path - empty_h1_then_h2_does_not_break_stack Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kb-parse-md/src/blocks.rs | 64 +++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs index 10afa2c..cf005d6 100644 --- a/crates/kb-parse-md/src/blocks.rs +++ b/crates/kb-parse-md/src/blocks.rs @@ -490,9 +490,14 @@ impl<'a> WalkState<'a> { } fn heading_path(&self) -> Vec { + // Skip slots whose stored heading text is empty (e.g. a `#` heading + // with no following text). We deliberately keep `Some("")` in the + // stack so deeper headings still nest under their implicit slot, + // but the path itself filters empties out so child blocks don't + // get a `""` segment polluting their ancestry. self.heading_stack .iter() - .filter_map(|s| s.clone()) + .filter_map(|s| s.clone().filter(|t| !t.is_empty())) .collect() } @@ -685,11 +690,13 @@ impl<'a> WalkState<'a> { // The heading_path on the heading block ITSELF excludes // the heading's own text (it's the path of ancestors). + // Empty heading texts are skipped so they don't create + // a `""` segment in the path (matches `heading_path()`). let path = self .heading_stack .iter() .take((level_to_use - 1) as usize) - .filter_map(|s| s.clone()) + .filter_map(|s| s.clone().filter(|t| !t.is_empty())) .collect(); let block = ParsedBlock { @@ -1092,6 +1099,59 @@ mod tests { assert_eq!(p2.heading_path, vec!["B".to_string()]); } + // ---- empty heading edge cases ------------------------------------------- + + #[test] + fn empty_heading_does_not_pollute_path() { + // `# ` with no text used to seed the heading_stack with `Some("")`, + // which then leaked into every child block's `heading_path` as a + // `""` segment. Now empty entries are filtered from the path. + let body = "# \n# Real H1\n## Sub\nbody\n"; + let (blocks, _) = parse(body, 1); + // body is the last block; verify its heading_path. + let para = blocks + .iter() + .find(|b| matches!(b.payload, ParsedPayload::Paragraph { .. })) + .expect("paragraph present"); + assert_eq!( + para.heading_path, + vec!["Real H1".to_string(), "Sub".to_string()], + "empty heading should not appear in path; got {:?}", + para.heading_path + ); + } + + #[test] + fn empty_h1_then_h2_does_not_break_stack() { + // An empty H1 overwrites the H1 slot with `Some("")` (so a later + // H2 is still treated as positioned at level 2), but the path + // filter drops the empty entry. So Inner's path is `[]` not `[""]`, + // and the body's path is `["Inner"]` — neither carries a `""` and + // the parser doesn't panic or skip blocks. + let body = "# Outer\n\n# \n\n## Inner\nbody\n"; + let (blocks, _) = parse(body, 1); + let inner = blocks + .iter() + .find(|b| { + matches!(b.payload, ParsedPayload::Heading { ref text, .. } if text == "Inner") + }) + .expect("Inner heading present"); + assert_eq!( + inner.heading_path, + Vec::::new(), + "empty H1 leaves an empty slot, filtered from the path" + ); + let para = blocks + .iter() + .find(|b| matches!(b.payload, ParsedPayload::Paragraph { .. })) + .expect("paragraph present"); + assert_eq!( + para.heading_path, + vec!["Inner".to_string()], + "body's path drops the empty H1 between root and Inner" + ); + } + // ---- code blocks --------------------------------------------------------- #[test] -- 2.49.1 From 73040cab30f4db23d428262c6cd26c776425a6b4 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 14:40:04 +0000 Subject: [PATCH 07/10] p1-3: capture image refs from pulldown-cmark Tag::Image events The previous block-level image detector scanned paragraph source bytes for the literal `![alt](src)` shape. That was fragile in three ways: - `![alt](src "title")` leaked the title into `src` (`src "title"`) - `![alt]()` kept the angle brackets verbatim - `![]()` had undefined behavior Replace the byte-scan with state on `Frame::Paragraph` that observes the actual `Tag::Image` events from pulldown-cmark: - `image_count` increments on each `Start(Tag::Image)` and `image_src` captures `dest_url` (which already strips angle brackets and excludes the title). - Text events seen while `image_depth > 0` are routed into `image_alt` and suppressed from the inline buffer. - Strong/Emph/Link starts and any non-image text outside the image flag `non_image_text_seen`. At `End(Paragraph)`, the paragraph is lifted to `ImageRef` iff `image_count == 1 && !non_image_text_seen`. The byte-scanner `match_block_image` is removed. New tests: - image_with_title_attribute (title dropped, no leak into src) - image_with_angle_bracketed_url (brackets stripped) - empty_image_alt_and_src (`![]()` pins to empty/empty) Existing image tests (`image_ref_block_captures_src_and_alt`, `inline_image_inside_paragraph_is_dropped`) continue to pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kb-parse-md/src/blocks.rs | 232 ++++++++++++++++++++++++------- 1 file changed, 185 insertions(+), 47 deletions(-) diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs index cf005d6..f080d0c 100644 --- a/crates/kb-parse-md/src/blocks.rs +++ b/crates/kb-parse-md/src/blocks.rs @@ -213,6 +213,20 @@ enum Frame { Paragraph { range: Range, inlines: InlineBuf, + /// Block-level image detection: tracks the single-image-only + /// signature `![alt](src)` as a paragraph's *entire* content. + /// + /// When `Tag::Image` opens, `image_depth` is bumped (>0 ⇒ alt-text + /// accumulates into `image_alt` and is suppressed from `inlines`). + /// `image_count` records how many distinct images we've seen and + /// `non_image_text_seen` flags any other inline content. At + /// `End(Paragraph)` the paragraph is lifted to `ImageRef` iff + /// `image_count == 1 && !non_image_text_seen`. + image_depth: u32, + image_count: u32, + non_image_text_seen: bool, + image_src: Option, + image_alt: String, }, Quote { range: Range, @@ -583,6 +597,11 @@ impl<'a> WalkState<'a> { self.frames.push(Frame::Paragraph { range, inlines: InlineBuf::new(), + image_depth: 0, + image_count: 0, + non_image_text_seen: false, + image_src: None, + image_alt: String::new(), }); } Event::Start(Tag::BlockQuote(_)) => { @@ -656,18 +675,40 @@ impl<'a> WalkState<'a> { } } Event::Start(Tag::Strong) => { + self.flag_non_image_in_paragraph(); self.with_current_inlines(|buf| buf.open_strong()); } Event::Start(Tag::Emphasis) => { + self.flag_non_image_in_paragraph(); self.with_current_inlines(|buf| buf.open_emph()); } Event::Start(Tag::Link { dest_url, .. }) => { + self.flag_non_image_in_paragraph(); let href = dest_url.into_string(); self.with_current_inlines(|buf| buf.open_link(href)); } - // Block-level image is handled at End — see TagEnd::Image. - Event::Start(Tag::Image { .. }) => { - // No-op at start; we capture src/title at End. + Event::Start(Tag::Image { dest_url, .. }) => { + // If we're inside a paragraph, this image becomes a + // candidate for block-level lifting. Record its src and + // start accumulating the alt text from the upcoming Text + // events. + if let Some(Frame::Paragraph { + image_depth, + image_count, + image_src, + image_alt, + .. + }) = self.frames.last_mut() + { + *image_depth += 1; + if *image_count == 0 { + *image_src = Some(dest_url.into_string()); + image_alt.clear(); + } + *image_count += 1; + } + // Outside a paragraph (e.g. inside a list item, heading, + // table cell): inline images are dropped silently per §3.4. } // ---- Container ends ------------------------------------------------- @@ -710,26 +751,43 @@ impl<'a> WalkState<'a> { } Event::End(TagEnd::Paragraph) => { if matches!(self.frames.last(), Some(Frame::Paragraph { .. })) { - if let Some(Frame::Paragraph { range, inlines }) = self.frames.pop() { - let (inline_vec, text) = inlines.finish(); - // Block-level image: a paragraph whose only content - // is `![alt](src)` becomes ImageRef. Detect by - // scanning the original source for the canonical - // shape. - if let Some((alt, src)) = match_block_image(self.body, &range) { + if let Some(Frame::Paragraph { + range, + inlines, + image_count, + non_image_text_seen, + image_src, + image_alt, + .. + }) = self.frames.pop() + { + // Block-level image lift: paragraph whose only + // content is exactly one `![alt](src)`. Source + // (with optional title), alt, and angle-bracket + // wrapping are all captured by pulldown-cmark from + // the `Tag::Image` event itself, so the title is + // dropped and angle brackets are stripped without + // any byte-level scanning. + if image_count == 1 && !non_image_text_seen { + let span = self.span_for(&range); let block = ParsedBlock { kind: ParsedBlockKind::ImageRef, heading_path: self.heading_path(), - source_span: self.span_for(&range), - payload: ParsedPayload::ImageRef { src, alt }, + source_span: span, + payload: ParsedPayload::ImageRef { + src: image_src.unwrap_or_default(), + alt: image_alt, + }, }; self.emit_block(block); return; } + let (inline_vec, text) = inlines.finish(); + let span = self.span_for(&range); let block = ParsedBlock { kind: ParsedBlockKind::Paragraph, heading_path: self.heading_path(), - source_span: self.span_for(&range), + source_span: span, payload: ParsedPayload::Paragraph { text, inlines: inline_vec }, }; self.emit_block(block); @@ -932,8 +990,11 @@ impl<'a> WalkState<'a> { self.with_current_inlines(|buf| buf.close_link()); } Event::End(TagEnd::Image) => { - // Inline images are dropped silently. Block-level image refs - // are detected at paragraph End, not here. + if let Some(Frame::Paragraph { image_depth, .. }) = self.frames.last_mut() { + if *image_depth > 0 { + *image_depth -= 1; + } + } } // ---- Leaf events ----------------------------------------------------- @@ -954,6 +1015,32 @@ impl<'a> WalkState<'a> { current_cell.push_str(&s); return; } + // If this text is inside a `Tag::Image` opened inside a + // paragraph, route it to the image's alt accumulator and + // suppress it from the inline buffer (so a paragraph that + // is *only* an image doesn't carry the alt as visible + // inline text in the fallback case either). + if let Some(Frame::Paragraph { + image_depth, + image_alt, + .. + }) = self.frames.last_mut() + { + if *image_depth > 0 { + image_alt.push_str(&s); + return; + } + } + // Otherwise: visible non-image content. + if let Some(Frame::Paragraph { + non_image_text_seen, + .. + }) = self.frames.last_mut() + { + if !s.is_empty() { + *non_image_text_seen = true; + } + } let owned = s.into_string(); self.with_current_inlines(|buf| { buf.push_text(&owned); @@ -965,6 +1052,19 @@ impl<'a> WalkState<'a> { current_cell.push_str(&s); return; } + if let Some(Frame::Paragraph { + non_image_text_seen, + image_depth, + .. + }) = self.frames.last_mut() + { + // Code inside an image's alt — extremely rare but pin + // behavior: count as visible non-image content so the + // paragraph isn't lifted to ImageRef. + if *image_depth == 0 { + *non_image_text_seen = true; + } + } let owned = s.into_string(); self.with_current_inlines(|buf| { buf.push_code(&owned); @@ -988,6 +1088,22 @@ impl<'a> WalkState<'a> { } } + /// If the top frame is an open paragraph that hasn't yet escaped the + /// "single image only" signature, mark it as containing visible + /// non-image content so it won't be lifted to ImageRef at End. + fn flag_non_image_in_paragraph(&mut self) { + if let Some(Frame::Paragraph { + non_image_text_seen, + image_depth, + .. + }) = self.frames.last_mut() + { + if *image_depth == 0 { + *non_image_text_seen = true; + } + } + } + /// Run `f` on whichever inline accumulator is open at the top of the /// frame stack. No-op if no inline-accepting frame is open. fn with_current_inlines(&mut self, f: F) { @@ -1017,38 +1133,6 @@ fn heading_level_to_u8(level: HeadingLevel) -> u8 { } } -/// Detect a paragraph whose entire trimmed source is `![alt](src)` — the -/// canonical "block-level image" shape. Returns `(alt, src)` if so. We do -/// this by scanning the original bytes (not the inline events) so it stays -/// robust to pulldown-cmark's internal representation of images. -fn match_block_image(body: &[u8], range: &Range) -> Option<(String, String)> { - let slice = body.get(range.clone())?; - let s = std::str::from_utf8(slice).ok()?.trim(); - if !s.starts_with("![") { - return None; - } - // Find the closing `]` of the alt text. Markdown does not allow nested - // brackets without escaping — for a block-level image we only handle the - // simple form. Anything else falls through to ordinary paragraph parsing. - let close_bracket = s.find("](")?; - let alt = &s[2..close_bracket]; - // The rest must be `(SRC)` and nothing else. - let after = &s[close_bracket + 2..]; - let close_paren = after.rfind(')')?; - if close_paren != after.len() - 1 { - return None; - } - let src = &after[..close_paren]; - // Reject if alt or src contain a newline — that means the paragraph has - // more content beyond the image and isn't a pure block-level image. - if alt.contains('\n') || src.contains('\n') { - return None; - } - // Reject brackets/parens inside src — tolerated by CommonMark via - // angle-bracket-wrap, but we keep this conservative for now. - Some((alt.to_string(), src.to_string())) -} - // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- @@ -1309,6 +1393,60 @@ mod tests { assert_eq!(blocks[0].kind, ParsedBlockKind::ImageRef); } + #[test] + fn image_with_title_attribute() { + // Source includes a title, but pulldown-cmark exposes it + // separately on `Tag::Image`; we ignore the title — only `src` + // and `alt` survive. Previously the byte-scanner pulled + // `src "title"` into `src`. + let body = "![alt](src.png \"title\")\n"; + let (blocks, _) = parse(body, 1); + assert_eq!(blocks.len(), 1); + match &blocks[0].payload { + ParsedPayload::ImageRef { src, alt } => { + assert_eq!(src, "src.png"); + assert_eq!(alt, "alt"); + } + _ => panic!("expected image ref, got {:?}", blocks[0].payload), + } + } + + #[test] + fn image_with_angle_bracketed_url() { + // `<…>` wrapping is a CommonMark feature for URLs containing + // spaces. pulldown-cmark strips the angle brackets and decodes + // the URL; we should reflect that. + let body = "![alt]()\n"; + let (blocks, _) = parse(body, 1); + assert_eq!(blocks.len(), 1); + match &blocks[0].payload { + ParsedPayload::ImageRef { src, alt } => { + assert_eq!( + src, "https://x.com/a b", + "angle brackets should be stripped" + ); + assert_eq!(alt, "alt"); + } + _ => panic!("expected image ref, got {:?}", blocks[0].payload), + } + } + + #[test] + fn empty_image_alt_and_src() { + // Pin behavior on the degenerate `![]()` shape. Both fields are + // empty strings; the block is still classified as ImageRef. + let body = "![]()\n"; + let (blocks, _) = parse(body, 1); + assert_eq!(blocks.len(), 1); + match &blocks[0].payload { + ParsedPayload::ImageRef { src, alt } => { + assert_eq!(src, ""); + assert_eq!(alt, ""); + } + _ => panic!("expected image ref, got {:?}", blocks[0].payload), + } + } + #[test] fn inline_image_inside_paragraph_is_dropped() { // The image is part of a longer paragraph → not a block-level image. -- 2.49.1 From 23ff4d68af4a45c1db7ca3fcf1b2b49a5d5fa415 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 14:40:42 +0000 Subject: [PATCH 08/10] p1-3: preserve whitespace in link text across SoftBreak/HardBreak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `[multi\nline](http://x)` produced `Inline::Link.text = "multiline"` because the SoftBreak/HardBreak handler called `push_text(" ")` — which updates `paragraph.text` and the inline buffer, but NOT the open link frame's flattened text accumulator. Text events flowed through `push_link_text`; line breaks didn't. Add `push_link_text(" ")` alongside the existing `push_text(" ")` in the break handler so a line break inside `[ ... ](href)` collapses to a visible space rather than disappearing. New tests: - link_with_soft_break_preserves_space_in_text - link_with_hard_break_preserves_space_in_text Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kb-parse-md/src/blocks.rs | 57 +++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs index f080d0c..7c1a0ed 100644 --- a/crates/kb-parse-md/src/blocks.rs +++ b/crates/kb-parse-md/src/blocks.rs @@ -1080,7 +1080,15 @@ impl<'a> WalkState<'a> { current_cell.push(' '); return; } - self.with_current_inlines(|buf| buf.push_text(" ")); + // Update both `paragraph.text` (via push_text) and the + // open link's flattened text accumulator (via + // push_link_text). Without push_link_text here, a + // multi-line `[text\nmore](href)` collapses to "textmore" + // — losing the visible space between words. + self.with_current_inlines(|buf| { + buf.push_text(" "); + buf.push_link_text(" "); + }); } // Everything else (HTML, footnote refs, task list markers, math, // rules, etc.) is dropped silently per design §3.4. @@ -1639,6 +1647,53 @@ mod tests { // ---- inline filter ------------------------------------------------------- + #[test] + fn link_with_soft_break_preserves_space_in_text() { + // Without the push_link_text fix, this collapses to "multiline". + let body = "[multi\nline](http://x)\n"; + let (blocks, _) = parse(body, 1); + assert_eq!(blocks.len(), 1); + match &blocks[0].payload { + ParsedPayload::Paragraph { inlines, .. } => { + let link = inlines + .iter() + .find(|i| matches!(i, Inline::Link { .. })) + .expect("link present"); + match link { + Inline::Link { text, href } => { + assert_eq!(text, "multi line"); + assert_eq!(href, "http://x"); + } + _ => unreachable!(), + } + } + _ => panic!("expected paragraph, got {:?}", blocks[0].payload), + } + } + + #[test] + fn link_with_hard_break_preserves_space_in_text() { + // Two trailing spaces + newline = HardBreak in CommonMark. + let body = "[multi \nline](http://x)\n"; + let (blocks, _) = parse(body, 1); + assert_eq!(blocks.len(), 1); + match &blocks[0].payload { + ParsedPayload::Paragraph { inlines, .. } => { + let link = inlines + .iter() + .find(|i| matches!(i, Inline::Link { .. })) + .expect("link present"); + match link { + Inline::Link { text, .. } => { + assert_eq!(text, "multi line"); + } + _ => unreachable!(), + } + } + _ => panic!("expected paragraph"), + } + } + #[test] fn only_allowed_inlines_emitted() { let body = "**bold** *em* `code` [link](u)\n"; -- 2.49.1 From 2b6d9abc0fad021a5df00c9c8c165d6be67a2364 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 14:41:47 +0000 Subject: [PATCH 09/10] p1-3: doc-comment + test pin Quote drops non-text children MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `ParsedPayload::Quote { text, inlines }` cannot represent block-level children (lists, code, tables, images), so the BlockQuote end handler silently drops them when assembling the Quote payload. This matches §3.4 for now but is non-obvious and easy to regress without an explicit pin. Add a TODO(P1-future) comment near the Quote emission code and a regression test (`quote_with_list_inside_drops_list`) that fixes the current shape: a `> - item` blockquote produces a Quote with empty text and empty inlines. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kb-parse-md/src/blocks.rs | 35 ++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs index 7c1a0ed..a4d2ee4 100644 --- a/crates/kb-parse-md/src/blocks.rs +++ b/crates/kb-parse-md/src/blocks.rs @@ -797,6 +797,15 @@ impl<'a> WalkState<'a> { Event::End(TagEnd::BlockQuote(_)) => { if let Some(Frame::Quote { range, children }) = self.frames.pop() { // Concatenate child text for the Quote payload. + // + // TODO(P1-future): `ParsedPayload::Quote { text, inlines }` + // cannot represent block-level children structurally + // (lists, code, tables, images), so they are silently + // dropped here. Only Paragraph / Quote / Heading text + // contributes to the assembled `text`. This matches + // §3.4 for now but should be revisited when Quote + // grows a `children: Vec` field — see + // pin test `quote_with_list_inside_drops_list`. let mut text = String::new(); let mut inlines: Vec = Vec::new(); for c in &children { @@ -1503,6 +1512,32 @@ mod tests { } } + // ---- quote child handling (pinned) -------------------------------------- + + /// Pins current behavior: a list inside a blockquote contributes nothing + /// to the Quote's text/inlines because `ParsedPayload::Quote` cannot + /// represent non-text/non-paragraph block children. The list itself is + /// also not retained as a separate block — it lives only in the popped + /// `Frame::Quote.children` and is dropped. Tracked in TODO(P1-future) + /// near the Quote emission code. + #[test] + fn quote_with_list_inside_drops_list() { + let body = "> - a\n> - b\n"; + let (blocks, _) = parse(body, 1); + // Exactly one Quote block, with empty text/inlines. + assert_eq!(blocks.len(), 1); + match &blocks[0].payload { + ParsedPayload::Quote { text, inlines } => { + assert_eq!(text, "", "list content should not appear in quote text"); + assert!( + inlines.is_empty(), + "list content should not appear in quote inlines" + ); + } + _ => panic!("expected quote, got {:?}", blocks[0].payload), + } + } + // ---- block content inside list items ------------------------------------ #[test] -- 2.49.1 From 80123e9e27910ea076ec8cda1d488a2fb7955f4b Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 14:42:21 +0000 Subject: [PATCH 10/10] p1-3: pin reviewer probe inputs as regression tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The quality reviewer named three specific input probes for the C1/C2/ C3 fixes. Encode each as a verbatim test so future regressions on those exact inputs surface immediately: - probe_overflow: parse_blocks(b"# h\nbody\n", u32::MAX) → empty + Warning::ExtractFailed. - probe_list_escape: list with embedded code block → single List block, two items. - probe_empty_heading: `# \n# Real\nbody\n` → body's heading_path is `["Real"]`. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kb-parse-md/src/blocks.rs | 34 ++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs index a4d2ee4..93bdc17 100644 --- a/crates/kb-parse-md/src/blocks.rs +++ b/crates/kb-parse-md/src/blocks.rs @@ -1751,3 +1751,37 @@ mod tests { } } } + +#[cfg(test)] +mod reviewer_probes { + use super::*; + + #[test] + fn probe_overflow() { + let (b, w) = parse_blocks(b"# h\nbody\n", u32::MAX).unwrap(); + assert!(b.is_empty(), "expected empty blocks, got {}", b.len()); + assert_eq!(w.len(), 1); + assert_eq!(w[0].kind, WarningKind::ExtractFailed); + } + + #[test] + fn probe_list_escape() { + let body = b"- item\n\n \x60\x60\x60rust\n fn f(){}\n \x60\x60\x60\n\n- next\n"; + let (b, _) = parse_blocks(body, 0).unwrap(); + assert_eq!(b.len(), 1, "expected single list block, got {}", b.len()); + match &b[0].payload { + ParsedPayload::List { items, .. } => assert_eq!(items.len(), 2), + _ => panic!("expected list, got {:?}", b[0].payload), + } + } + + #[test] + fn probe_empty_heading() { + let (b, _) = parse_blocks(b"# \n# Real\nbody\n", 0).unwrap(); + let para = b + .iter() + .find(|x| matches!(x.payload, ParsedPayload::Paragraph { .. })) + .expect("paragraph present"); + assert_eq!(para.heading_path, vec!["Real".to_string()]); + } +} -- 2.49.1