From 4e7e9cad8746d08fb4c389ef1908364e151490a3 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Thu, 30 Apr 2026 14:14:34 +0000
Subject: [PATCH 01/10] p1-3: add parse_blocks (pulldown-cmark walker)
 submodule
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements `kb_parse_md::parse_blocks(body, body_offset_lines)` returning a
flat `Vec<ParsedBlock>` plus warnings. Walks pulldown-cmark events through a
small frame-based state machine that tracks heading paths, accumulates
inline buffers (Text/Code/Link/Strong/Emph only — design §3.4), and
reports SourceSpan::Line spans in 1-indexed file-line coordinates.

Covers headings, paragraphs, code blocks (lang from info string), GFM
tables (with malformed fallback to paragraph + MalformedTable warning),
lists (nested sub-lists flattened into parent item), and block-level image
references. Inline images are dropped silently per the inline filter.
Adversarial inputs are caught with `catch_unwind` and degrade to an empty
output + ExtractFailed warning.

15 unit tests cover heading-path correctness, code lang, table parsing,
malformed-table fallback (driven via synthetic events since pulldown-cmark
auto-normalizes table widths), LF/CRLF line-range parity, image refs,
nested-list flattening, inline filter, and 100-iteration random-bytes plus
hand-crafted adversarial-input no-panic guards.
---
 Cargo.lock                       |   19 +
 crates/kb-parse-md/Cargo.toml    |    8 +-
 crates/kb-parse-md/src/blocks.rs | 1221 ++++++++++++++++++++++++++++++
 crates/kb-parse-md/src/lib.rs    |   16 +-
 4 files changed, 1256 insertions(+), 8 deletions(-)
 create mode 100644 crates/kb-parse-md/src/blocks.rs
diff --git a/Cargo.lock b/Cargo.lock
index 7c34f44..ec3d4e7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -585,11 +585,13 @@ dependencies = [
  "kb-core",
  "kb-parse-types",
  "lingua",
+ "pulldown-cmark",
  "serde",
  "serde_json",
  "serde_yaml_ng",
  "time",
  "toml",
+ "tracing",
 ]
 
 [[package]]
@@ -834,6 +836,17 @@ dependencies = [
  "unicode-ident",
 ]
 
+[[package]]
+name = "pulldown-cmark"
+version = "0.13.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c3a14896dfa883796f1cb410461aef38810ea05f2b2c33c5aded3649095fdad"
+dependencies = [
+ "bitflags",
+ "memchr",
+ "unicase",
+]
+
 [[package]]
 name = "quote"
 version = "1.0.45"
@@ -1367,6 +1380,12 @@ dependencies = [
  "tracing-serde",
 ]
 
+[[package]]
+name = "unicase"
+version = "2.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142"
+
 [[package]]
 name = "unicode-ident"
 version = "1.0.24"
diff --git a/crates/kb-parse-md/Cargo.toml b/crates/kb-parse-md/Cargo.toml
index db7b56e..3b39606 100644
--- a/crates/kb-parse-md/Cargo.toml
+++ b/crates/kb-parse-md/Cargo.toml
@@ -5,7 +5,7 @@ edition       = { workspace = true }
 rust-version  = { workspace = true }
 license       = { workspace = true }
 repository    = { workspace = true }
-description   = "Markdown frontmatter (and, in p1-3, block) parsing into kb-core::Metadata / kb-parse-types intermediates"
+description   = "Markdown frontmatter and block parsing into kb-core::Metadata / kb-parse-types intermediates"
 
 [dependencies]
 kb-core         = { path = "../kb-core" }
@@ -14,6 +14,12 @@ anyhow          = { workspace = true }
 serde           = { workspace = true }
 serde_json      = { workspace = true }
 time            = { workspace = true }
+tracing         = { workspace = true }
+# pulldown-cmark is the CommonMark parser used by the `blocks` submodule.
+# GFM tables are gated by the runtime `Options::ENABLE_TABLES` flag, not a
+# cargo feature; we strip the default `getopts` + `html` features since we
+# only use the pull-parser API.
+pulldown-cmark  = { version = "0.13", default-features = false }
 # serde_yaml (dtolnay) was archived as unmaintained in 2024.
 # We use the maintained fork serde_yaml_ng. Keeping the same `serde_yaml`-style
 # API surface lets us swap if a different fork wins long term.
diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs
new file mode 100644
index 0000000..1ddfaa6
--- /dev/null
+++ b/crates/kb-parse-md/src/blocks.rs
@@ -0,0 +1,1221 @@
+//! Markdown body → flat `Vec<kb_parse_types::ParsedBlock>` (§3.4 / §3.7b).
+//!
+//! Uses `pulldown-cmark` (with GFM tables enabled at runtime via
+//! `Options::ENABLE_TABLES`) to walk the body once and emit a flat list of
+//! parsed blocks. Heading paths are computed by tracking the most-recent
+//! heading text at each level. Source spans are reported as
+//! [`kb_core::SourceSpan::Line`] in 1-indexed file-line coordinates by
+//! converting `pulldown-cmark`'s byte offsets to line numbers and adding the
+//! caller-supplied `body_offset_lines`.
+//!
+//! ## Coordinate conventions
+//!
+//! * Body lines are *0-indexed* internally — line 0 is the first line of the
+//!   body slice. We map to file coordinates with
+//!   `file_line = body_line_zero_indexed + body_offset_lines`. So a caller
+//!   that passes `body_offset_lines = 6` is saying "the first line of `body`
+//!   is line 6 of the original file".
+//! * `SourceSpan::Line { start, end }` is inclusive on both ends.
+//!
+//! ## Inline filter
+//!
+//! [`kb_core::Inline`] only models `Text | Code | Link | Strong | Emph`.
+//! Inline images, footnotes, hard breaks, etc. are dropped silently per
+//! design §3.4. Block-level `![alt](src)` (an image as the sole content of a
+//! paragraph) is lifted to [`kb_parse_types::ParsedPayload::ImageRef`].
+//!
+//! ## CRLF
+//!
+//! Line numbers are computed by counting `\n` bytes in the prefix; CRLF
+//! input still has `\n` at end-of-line so the math is identical to LF input.
+//! `pulldown-cmark` may include `\r` characters inside its emitted text
+//! events; we leave them as-is for now (they round-trip through serde).
+
+use std::ops::Range;
+
+use kb_core::{Inline, SourceSpan};
+use kb_parse_types::{ParsedBlock, ParsedBlockKind, ParsedPayload, Warning, WarningKind};
+use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Options, Parser, Tag, TagEnd};
+
+/// Parse a Markdown body into a flat `Vec<ParsedBlock>` plus any warnings.
+///
+/// `body_offset_lines` is added to every body-relative line number so that
+/// reported [`SourceSpan::Line`] values address the **original** file. See
+/// the module-level docs for the coordinate convention.
+///
+/// # Errors
+///
+/// `Err` is reserved for genuinely fatal conditions; the current
+/// implementation never produces one — even adversarial inputs degrade to
+/// `Ok((vec![], vec![Warning::ExtractFailed]))` via a top-level
+/// `catch_unwind` guard. The `Result` is kept on the signature so a future
+/// I/O-backed input can be added without breaking callers.
+pub fn parse_blocks(
+    body: &[u8],
+    body_offset_lines: u32,
+) -> anyhow::Result<(Vec<ParsedBlock>, Vec<Warning>)> {
+    // Adversarial-input safety: pulldown-cmark is documented as
+    // panic-free on valid UTF-8, but a defensive catch_unwind keeps the
+    // contract ("never panics") even if the dependency regresses.
+    let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+        parse_blocks_inner(body, body_offset_lines)
+    }));
+    match result {
+        Ok(out) => Ok(out),
+        Err(_) => {
+            tracing::warn!("parse_blocks panicked on adversarial input; returning empty");
+            Ok((
+                Vec::new(),
+                vec![Warning {
+                    kind: WarningKind::ExtractFailed,
+                    note: "pulldown-cmark panicked; body discarded".to_string(),
+                }],
+            ))
+        }
+    }
+}
+
+fn parse_blocks_inner(body: &[u8], body_offset_lines: u32) -> (Vec<ParsedBlock>, Vec<Warning>) {
+    // Lossy UTF-8: pulldown-cmark wants `&str`. Non-UTF-8 inputs get
+    // U+FFFD substitution; the byte→line mapping is built from the original
+    // bytes so spans remain accurate.
+    let body_str = match std::str::from_utf8(body) {
+        Ok(s) => std::borrow::Cow::Borrowed(s),
+        Err(_) => String::from_utf8_lossy(body),
+    };
+
+    let line_index = LineIndex::new(body);
+    let mut warnings: Vec<Warning> = Vec::new();
+    let mut state = WalkState::new(body_offset_lines, &line_index, body);
+
+    let mut options = Options::empty();
+    options.insert(Options::ENABLE_TABLES);
+    let parser = Parser::new_ext(body_str.as_ref(), options);
+
+    for (event, range) in parser.into_offset_iter() {
+        state.handle_event(event, range, &mut warnings);
+    }
+
+    (state.finish(), warnings)
+}
+
+// ---------------------------------------------------------------------------
+// Line index — byte offset → 1-indexed body line
+// ---------------------------------------------------------------------------
+
+/// Maps byte offsets in the body to body-line numbers (0-indexed).
+///
+/// Built once per parse. Stores the byte offset of every newline so
+/// `byte → line` is a binary search.
+struct LineIndex {
+    /// Byte offsets of `\n` characters in the body, in ascending order.
+    /// `newlines[i]` is the position of the `i`-th newline (0-indexed).
+    newlines: Vec<usize>,
+    body_len: usize,
+}
+
+impl LineIndex {
+    fn new(body: &[u8]) -> Self {
+        let newlines = body
+            .iter()
+            .enumerate()
+            .filter_map(|(i, &b)| if b == b'\n' { Some(i) } else { None })
+            .collect();
+        Self {
+            newlines,
+            body_len: body.len(),
+        }
+    }
+
+    /// Body line (0-indexed) containing byte offset `pos`.
+    ///
+    /// Bytes before the first `\n` are line 0. Bytes after the last `\n` are
+    /// line `newlines.len()`. A position equal to a newline byte itself is
+    /// considered to be on the line *ending at* that newline (so `pos` that
+    /// points to `\n` reports the line that newline terminates).
+    fn line_zero_indexed(&self, pos: usize) -> u32 {
+        // Clamp to body length — pulldown-cmark may emit ranges with `end`
+        // equal to body.len() for the final block.
+        let pos = pos.min(self.body_len);
+        // partition_point returns the count of newlines whose offset is `< pos`.
+        // That's exactly the line-index for the line containing `pos`, with
+        // the convention "trailing newline belongs to the previous line".
+        let n_before = self.newlines.partition_point(|&nl| nl < pos);
+        n_before as u32
+    }
+
+    /// Body line (0-indexed) containing byte offset `pos - 1` — i.e., the
+    /// line on which the byte just **before** `pos` lives. Used for `end`
+    /// positions of `pulldown-cmark` ranges, which are exclusive (point one
+    /// past the last byte of the construct).
+    fn end_line_zero_indexed(&self, end: usize) -> u32 {
+        if end == 0 {
+            return 0;
+        }
+        self.line_zero_indexed(end - 1)
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Walker state machine
+// ---------------------------------------------------------------------------
+
+struct WalkState<'a> {
+    body_offset_lines: u32,
+    line_index: &'a LineIndex,
+    body: &'a [u8],
+
+    blocks: Vec<ParsedBlock>,
+    /// Most-recent heading text at each level (1..=6), or `None` if no
+    /// heading at that level has been seen since a higher-level heading was
+    /// encountered. `heading_path()` snapshots this into a `Vec<String>`.
+    heading_stack: [Option<String>; 6],
+
+    /// Accumulator for the currently-open block. We push events into this
+    /// frame and convert to a `ParsedBlock` when the matching End tag fires.
+    frames: Vec<Frame>,
+}
+
+/// One in-flight container we are accumulating events into. Stack-shaped so
+/// nested constructs (list inside list, paragraph inside blockquote) work.
+enum Frame {
+    Heading {
+        level: u8,
+        range: Range<usize>,
+        inlines: InlineBuf,
+    },
+    Paragraph {
+        range: Range<usize>,
+        inlines: InlineBuf,
+    },
+    Quote {
+        range: Range<usize>,
+        children: Vec<ParsedBlock>,
+    },
+    List {
+        ordered: bool,
+        range: Range<usize>,
+        items: Vec<Vec<Inline>>,
+        /// True iff the immediate parent of this list is another list
+        /// item — i.e. this is a nested sub-list. Nested sub-lists get
+        /// flattened into the parent item's inline buffer instead of
+        /// being emitted as their own `ParsedPayload::List`.
+        nested_in_item: bool,
+    },
+    /// One `<li>`. Inlines flow into `inlines`; nested sub-lists append
+    /// flattened text into `inlines` as well.
+    ListItem { inlines: InlineBuf },
+    Code {
+        lang: Option<String>,
+        range: Range<usize>,
+        code: String,
+    },
+    Table {
+        range: Range<usize>,
+        headers: Vec<String>,
+        rows: Vec<Vec<String>>,
+        in_head: bool,
+        current_row: Vec<String>,
+        current_cell: String,
+        cols: usize,
+        malformed: Option<String>,
+    },
+}
+
+/// Inline accumulator with a stack of formatting wrappers (Strong / Emph /
+/// Link). The base of the stack is the top-level inline list.
+struct InlineBuf {
+    /// Stack: bottom is the top-level vec; each push starts a new nested
+    /// `Vec<Inline>` for `Strong`/`Emph`/etc. that gets popped & wrapped
+    /// when its End tag fires.
+    stack: Vec<InlineFrame>,
+    /// Plain-text accumulator for `paragraph.text` — the literal text shown
+    /// to the user without formatting. Matches the inline events 1:1.
+    text: String,
+}
+
+enum InlineFrame {
+    Top(Vec<Inline>),
+    Strong(Vec<Inline>),
+    Emph(Vec<Inline>),
+    Link { href: String, text: String, kids: Vec<Inline> },
+}
+
+impl InlineBuf {
+    fn new() -> Self {
+        Self {
+            stack: vec![InlineFrame::Top(Vec::new())],
+            text: String::new(),
+        }
+    }
+
+    fn push_inline(&mut self, inline: Inline) {
+        match self.stack.last_mut().expect("inline stack non-empty") {
+            InlineFrame::Top(v) => v.push(inline),
+            InlineFrame::Strong(v) => v.push(inline),
+            InlineFrame::Emph(v) => v.push(inline),
+            InlineFrame::Link { kids, .. } => kids.push(inline),
+        }
+    }
+
+    fn push_text(&mut self, s: &str) {
+        self.text.push_str(s);
+        self.push_inline(Inline::Text(s.to_string()));
+    }
+
+    fn push_code(&mut self, s: &str) {
+        self.text.push_str(s);
+        self.push_inline(Inline::Code(s.to_string()));
+    }
+
+    fn open_strong(&mut self) {
+        self.stack.push(InlineFrame::Strong(Vec::new()));
+    }
+    fn close_strong(&mut self) {
+        if let Some(InlineFrame::Strong(kids)) = self.stack.pop() {
+            self.push_inline(Inline::Strong(kids));
+        }
+    }
+
+    fn open_emph(&mut self) {
+        self.stack.push(InlineFrame::Emph(Vec::new()));
+    }
+    fn close_emph(&mut self) {
+        if let Some(InlineFrame::Emph(kids)) = self.stack.pop() {
+            self.push_inline(Inline::Emph(kids));
+        }
+    }
+
+    fn open_link(&mut self, href: String) {
+        self.stack.push(InlineFrame::Link {
+            href,
+            text: String::new(),
+            kids: Vec::new(),
+        });
+    }
+    fn close_link(&mut self) {
+        if let Some(InlineFrame::Link { href, text, kids }) = self.stack.pop() {
+            // Flatten the link's text contents to a single string for the
+            // `Inline::Link.text` field. Code/strong/emph inside a link are
+            // collapsed to their plain text — `Inline::Link` doesn't model
+            // formatting inside the link.
+            let flat = if !text.is_empty() {
+                text
+            } else {
+                flatten_inlines_to_text(&kids)
+            };
+            self.push_inline(Inline::Link { text: flat, href });
+        }
+    }
+
+    /// Append plain text to the current link's flattened text accumulator
+    /// (called from inside a link frame).
+    fn push_link_text(&mut self, s: &str) {
+        if let Some(InlineFrame::Link { text, .. }) = self.stack.last_mut() {
+            text.push_str(s);
+        }
+    }
+
+    fn finish(mut self) -> (Vec<Inline>, String) {
+        // Normal flow: the only remaining frame is the Top, which we unwrap.
+        // If formatting tags were unbalanced we close them defensively.
+        while self.stack.len() > 1 {
+            match self.stack.pop().unwrap() {
+                InlineFrame::Strong(kids) => self.push_inline(Inline::Strong(kids)),
+                InlineFrame::Emph(kids) => self.push_inline(Inline::Emph(kids)),
+                InlineFrame::Link { href, text, kids } => {
+                    let flat = if !text.is_empty() {
+                        text
+                    } else {
+                        flatten_inlines_to_text(&kids)
+                    };
+                    self.push_inline(Inline::Link { text: flat, href });
+                }
+                InlineFrame::Top(_) => break,
+            }
+        }
+        let top = match self.stack.pop().unwrap() {
+            InlineFrame::Top(v) => v,
+            _ => Vec::new(),
+        };
+        (top, self.text)
+    }
+
+}
+
+fn flatten_inlines_to_text(inlines: &[Inline]) -> String {
+    let mut out = String::new();
+    for i in inlines {
+        flatten_one(i, &mut out);
+    }
+    out
+}
+
+fn flatten_one(i: &Inline, out: &mut String) {
+    match i {
+        Inline::Text(s) | Inline::Code(s) => out.push_str(s),
+        Inline::Link { text, .. } => out.push_str(text),
+        Inline::Strong(v) | Inline::Emph(v) => {
+            for c in v {
+                flatten_one(c, out);
+            }
+        }
+    }
+}
+
+impl<'a> WalkState<'a> {
+    fn new(body_offset_lines: u32, line_index: &'a LineIndex, body: &'a [u8]) -> Self {
+        Self {
+            body_offset_lines,
+            line_index,
+            body,
+            blocks: Vec::new(),
+            heading_stack: Default::default(),
+            frames: Vec::new(),
+        }
+    }
+
+    fn finish(self) -> Vec<ParsedBlock> {
+        self.blocks
+    }
+
+    fn heading_path(&self) -> Vec<String> {
+        self.heading_stack
+            .iter()
+            .filter_map(|s| s.clone())
+            .collect()
+    }
+
+    fn span_for(&self, range: &Range<usize>) -> SourceSpan {
+        let start_body = self.line_index.line_zero_indexed(range.start);
+        let end_body = if range.end <= range.start {
+            start_body
+        } else {
+            self.line_index.end_line_zero_indexed(range.end)
+        };
+        SourceSpan::Line {
+            start: start_body + self.body_offset_lines,
+            end: end_body + self.body_offset_lines,
+        }
+    }
+
+    /// Where to emit a finished block: into the current container if any
+    /// (Quote / ListItem), otherwise into the top-level `blocks` vec.
+    fn emit_block(&mut self, block: ParsedBlock) {
+        // Find the nearest enclosing container that accepts child blocks.
+        for frame in self.frames.iter_mut().rev() {
+            if let Frame::Quote { children, .. } = frame {
+                children.push(block);
+                return;
+            }
+        }
+        self.blocks.push(block);
+    }
+
+    fn handle_event(&mut self, event: Event<'_>, range: Range<usize>, warnings: &mut Vec<Warning>) {
+        match event {
+            // ---- Container starts -----------------------------------------------
+            Event::Start(Tag::Heading { level, .. }) => {
+                self.frames.push(Frame::Heading {
+                    level: heading_level_to_u8(level),
+                    range,
+                    inlines: InlineBuf::new(),
+                });
+            }
+            Event::Start(Tag::Paragraph) => {
+                // If we're directly inside a list item, the inlines flow into
+                // the item, not a new paragraph block.
+                if matches!(self.frames.last(), Some(Frame::ListItem { .. })) {
+                    return;
+                }
+                self.frames.push(Frame::Paragraph {
+                    range,
+                    inlines: InlineBuf::new(),
+                });
+            }
+            Event::Start(Tag::BlockQuote(_)) => {
+                self.frames.push(Frame::Quote {
+                    range,
+                    children: Vec::new(),
+                });
+            }
+            Event::Start(Tag::List(start)) => {
+                let nested_in_item = matches!(
+                    self.frames.last(),
+                    Some(Frame::ListItem { .. })
+                );
+                self.frames.push(Frame::List {
+                    ordered: start.is_some(),
+                    range,
+                    items: Vec::new(),
+                    nested_in_item,
+                });
+            }
+            Event::Start(Tag::Item) => {
+                self.frames.push(Frame::ListItem {
+                    inlines: InlineBuf::new(),
+                });
+            }
+            Event::Start(Tag::CodeBlock(kind)) => {
+                let lang = match kind {
+                    CodeBlockKind::Indented => None,
+                    CodeBlockKind::Fenced(info) => {
+                        let trimmed = info.trim();
+                        if trimmed.is_empty() {
+                            None
+                        } else {
+                            // Take only the first whitespace-delimited token,
+                            // matching how editors render the info string.
+                            Some(trimmed.split_whitespace().next().unwrap_or(trimmed).to_string())
+                        }
+                    }
+                };
+                self.frames.push(Frame::Code {
+                    lang,
+                    range,
+                    code: String::new(),
+                });
+            }
+            Event::Start(Tag::Table(aligns)) => {
+                self.frames.push(Frame::Table {
+                    range,
+                    headers: Vec::new(),
+                    rows: Vec::new(),
+                    in_head: false,
+                    current_row: Vec::new(),
+                    current_cell: String::new(),
+                    cols: aligns.len(),
+                    malformed: None,
+                });
+            }
+            Event::Start(Tag::TableHead) => {
+                if let Some(Frame::Table { in_head, .. }) = self.frames.last_mut() {
+                    *in_head = true;
+                }
+            }
+            Event::Start(Tag::TableRow) => {
+                if let Some(Frame::Table { current_row, .. }) = self.frames.last_mut() {
+                    current_row.clear();
+                }
+            }
+            Event::Start(Tag::TableCell) => {
+                if let Some(Frame::Table { current_cell, .. }) = self.frames.last_mut() {
+                    current_cell.clear();
+                }
+            }
+            Event::Start(Tag::Strong) => {
+                self.with_current_inlines(|buf| buf.open_strong());
+            }
+            Event::Start(Tag::Emphasis) => {
+                self.with_current_inlines(|buf| buf.open_emph());
+            }
+            Event::Start(Tag::Link { dest_url, .. }) => {
+                let href = dest_url.into_string();
+                self.with_current_inlines(|buf| buf.open_link(href));
+            }
+            // Block-level image is handled at End — see TagEnd::Image.
+            Event::Start(Tag::Image { .. }) => {
+                // No-op at start; we capture src/title at End.
+            }
+
+            // ---- Container ends -------------------------------------------------
+            Event::End(TagEnd::Heading(level)) => {
+                let level_u8 = heading_level_to_u8(level);
+                if let Some(Frame::Heading { level: lvl, range, inlines }) = self.frames.pop() {
+                    let (_inline_vec, text) = inlines.finish();
+                    let text = text.trim().to_string();
+                    let level_to_use = lvl;
+                    let _ = level_u8;
+
+                    // Update heading stack: clear deeper levels, set this level.
+                    if level_to_use >= 1 && level_to_use <= 6 {
+                        let idx = (level_to_use - 1) as usize;
+                        for slot in &mut self.heading_stack[idx + 1..] {
+                            *slot = None;
+                        }
+                        self.heading_stack[idx] = Some(text.clone());
+                    }
+
+                    // The heading_path on the heading block ITSELF excludes
+                    // the heading's own text (it's the path of ancestors).
+                    let path = self
+                        .heading_stack
+                        .iter()
+                        .take((level_to_use - 1) as usize)
+                        .filter_map(|s| s.clone())
+                        .collect();
+
+                    let block = ParsedBlock {
+                        kind: ParsedBlockKind::Heading,
+                        heading_path: path,
+                        source_span: self.span_for(&range),
+                        payload: ParsedPayload::Heading { level: level_to_use, text },
+                    };
+                    self.emit_block(block);
+                }
+            }
+            Event::End(TagEnd::Paragraph) => {
+                if matches!(self.frames.last(), Some(Frame::Paragraph { .. })) {
+                    if let Some(Frame::Paragraph { range, inlines }) = self.frames.pop() {
+                        let (inline_vec, text) = inlines.finish();
+                        // Block-level image: a paragraph whose only content
+                        // is `![alt](src)` becomes ImageRef. Detect by
+                        // scanning the original source for the canonical
+                        // shape.
+                        if let Some((alt, src)) = match_block_image(self.body, &range) {
+                            let block = ParsedBlock {
+                                kind: ParsedBlockKind::ImageRef,
+                                heading_path: self.heading_path(),
+                                source_span: self.span_for(&range),
+                                payload: ParsedPayload::ImageRef { src, alt },
+                            };
+                            self.emit_block(block);
+                            return;
+                        }
+                        let block = ParsedBlock {
+                            kind: ParsedBlockKind::Paragraph,
+                            heading_path: self.heading_path(),
+                            source_span: self.span_for(&range),
+                            payload: ParsedPayload::Paragraph { text, inlines: inline_vec },
+                        };
+                        self.emit_block(block);
+                    }
+                }
+            }
+            Event::End(TagEnd::BlockQuote(_)) => {
+                if let Some(Frame::Quote { range, children }) = self.frames.pop() {
+                    // Concatenate child text for the Quote payload.
+                    let mut text = String::new();
+                    let mut inlines: Vec<Inline> = Vec::new();
+                    for c in &children {
+                        match &c.payload {
+                            ParsedPayload::Paragraph { text: t, inlines: il }
+                            | ParsedPayload::Quote { text: t, inlines: il } => {
+                                if !text.is_empty() {
+                                    text.push('\n');
+                                }
+                                text.push_str(t);
+                                inlines.extend(il.iter().cloned());
+                            }
+                            ParsedPayload::Heading { text: t, .. } => {
+                                if !text.is_empty() {
+                                    text.push('\n');
+                                }
+                                text.push_str(t);
+                                inlines.push(Inline::Text(t.clone()));
+                            }
+                            _ => {}
+                        }
+                    }
+                    let block = ParsedBlock {
+                        kind: ParsedBlockKind::Quote,
+                        heading_path: self.heading_path(),
+                        source_span: self.span_for(&range),
+                        payload: ParsedPayload::Quote { text, inlines },
+                    };
+                    self.emit_block(block);
+                }
+            }
+            Event::End(TagEnd::List(_)) => {
+                if let Some(Frame::List { ordered, range, items, nested_in_item }) = self.frames.pop() {
+                    if nested_in_item {
+                        // Flatten this sub-list into the enclosing list
+                        // item's inline text. Each item becomes a line
+                        // prefixed with "  - " (or "  1. " for ordered) so
+                        // structure is recognizable downstream.
+                        let mut rendered = String::new();
+                        for (i, item) in items.iter().enumerate() {
+                            rendered.push('\n');
+                            rendered.push_str("  ");
+                            if ordered {
+                                rendered.push_str(&format!("{}. ", i + 1));
+                            } else {
+                                rendered.push_str("- ");
+                            }
+                            rendered.push_str(&flatten_inlines_to_text(item));
+                        }
+                        if let Some(Frame::ListItem { inlines }) = self.frames.last_mut() {
+                            inlines.push_text(&rendered);
+                        }
+                    } else {
+                        let block = ParsedBlock {
+                            kind: ParsedBlockKind::List,
+                            heading_path: self.heading_path(),
+                            source_span: self.span_for(&range),
+                            payload: ParsedPayload::List { ordered, items },
+                        };
+                        self.emit_block(block);
+                    }
+                }
+            }
+            Event::End(TagEnd::Item) => {
+                if let Some(Frame::ListItem { inlines }) = self.frames.pop() {
+                    let (inline_vec, _text) = inlines.finish();
+                    if let Some(Frame::List { items, .. }) = self.frames.last_mut() {
+                        items.push(inline_vec);
+                    }
+                }
+            }
+            Event::End(TagEnd::CodeBlock) => {
+                if let Some(Frame::Code { lang, range, mut code }) = self.frames.pop() {
+                    // Fenced code blocks include a trailing newline from the
+                    // last source line; pulldown-cmark already strips the
+                    // closing fence. We trim a single trailing `\n` for
+                    // clean round-trips with hand-authored fixtures.
+                    if code.ends_with('\n') {
+                        code.pop();
+                    }
+                    let block = ParsedBlock {
+                        kind: ParsedBlockKind::Code,
+                        heading_path: self.heading_path(),
+                        source_span: self.span_for(&range),
+                        payload: ParsedPayload::Code { lang, code },
+                    };
+                    self.emit_block(block);
+                }
+            }
+            Event::End(TagEnd::Table) => {
+                if let Some(Frame::Table {
+                    range,
+                    headers,
+                    rows,
+                    cols,
+                    malformed,
+                    ..
+                }) = self.frames.pop()
+                {
+                    let header_count = if cols > 0 { cols } else { headers.len() };
+                    let block = if let Some(note) = malformed {
+                        // Fall back to a paragraph carrying the raw markdown.
+                        warnings.push(Warning {
+                            kind: WarningKind::MalformedTable,
+                            note,
+                        });
+                        let raw = std::str::from_utf8(
+                            self.body.get(range.clone()).unwrap_or(&[]),
+                        )
+                        .unwrap_or_default()
+                        .to_string();
+                        ParsedBlock {
+                            kind: ParsedBlockKind::Paragraph,
+                            heading_path: self.heading_path(),
+                            source_span: self.span_for(&range),
+                            payload: ParsedPayload::Paragraph {
+                                text: raw.clone(),
+                                inlines: vec![Inline::Text(raw)],
+                            },
+                        }
+                    } else {
+                        let _ = header_count;
+                        ParsedBlock {
+                            kind: ParsedBlockKind::Table,
+                            heading_path: self.heading_path(),
+                            source_span: self.span_for(&range),
+                            payload: ParsedPayload::Table { headers, rows },
+                        }
+                    };
+                    self.emit_block(block);
+                }
+            }
+            Event::End(TagEnd::TableHead) => {
+                if let Some(Frame::Table {
+                    in_head,
+                    headers,
+                    current_row,
+                    cols,
+                    malformed,
+                    ..
+                }) = self.frames.last_mut()
+                {
+                    *in_head = false;
+                    *headers = std::mem::take(current_row);
+                    if *cols == 0 {
+                        *cols = headers.len();
+                    } else if headers.len() != *cols && malformed.is_none() {
+                        *malformed = Some(format!(
+                            "header has {} cells, table reported {} columns",
+                            headers.len(),
+                            cols
+                        ));
+                    }
+                }
+            }
+            Event::End(TagEnd::TableRow) => {
+                if let Some(Frame::Table {
+                    rows,
+                    current_row,
+                    cols,
+                    malformed,
+                    ..
+                }) = self.frames.last_mut()
+                {
+                    let row = std::mem::take(current_row);
+                    if row.len() != *cols && malformed.is_none() {
+                        *malformed = Some(format!(
+                            "row {} has {} cells, headers have {}",
+                            rows.len() + 1,
+                            row.len(),
+                            cols
+                        ));
+                    }
+                    rows.push(row);
+                }
+            }
+            Event::End(TagEnd::TableCell) => {
+                if let Some(Frame::Table {
+                    current_row,
+                    current_cell,
+                    ..
+                }) = self.frames.last_mut()
+                {
+                    current_row.push(std::mem::take(current_cell));
+                }
+            }
+            Event::End(TagEnd::Strong) => {
+                self.with_current_inlines(|buf| buf.close_strong());
+            }
+            Event::End(TagEnd::Emphasis) => {
+                self.with_current_inlines(|buf| buf.close_emph());
+            }
+            Event::End(TagEnd::Link) => {
+                self.with_current_inlines(|buf| buf.close_link());
+            }
+            Event::End(TagEnd::Image) => {
+                // Inline images are dropped silently. Block-level image refs
+                // are detected at paragraph End, not here.
+            }
+
+            // ---- Leaf events -----------------------------------------------------
+            Event::Text(s) => {
+                // Code blocks accumulate into the code string instead of
+                // the inline buffer.
+                if let Some(Frame::Code { code, .. }) = self.frames.last_mut() {
+                    code.push_str(&s);
+                    return;
+                }
+                if let Some(Frame::Table {
+                    in_head,
+                    current_cell,
+                    ..
+                }) = self.frames.last_mut()
+                {
+                    let _ = in_head;
+                    current_cell.push_str(&s);
+                    return;
+                }
+                let owned = s.into_string();
+                self.with_current_inlines(|buf| {
+                    buf.push_text(&owned);
+                    buf.push_link_text(&owned);
+                });
+            }
+            Event::Code(s) => {
+                if let Some(Frame::Table { current_cell, .. }) = self.frames.last_mut() {
+                    current_cell.push_str(&s);
+                    return;
+                }
+                let owned = s.into_string();
+                self.with_current_inlines(|buf| {
+                    buf.push_code(&owned);
+                    buf.push_link_text(&owned);
+                });
+            }
+            Event::SoftBreak | Event::HardBreak => {
+                if let Some(Frame::Code { code, .. }) = self.frames.last_mut() {
+                    code.push('\n');
+                    return;
+                }
+                if let Some(Frame::Table { current_cell, .. }) = self.frames.last_mut() {
+                    current_cell.push(' ');
+                    return;
+                }
+                self.with_current_inlines(|buf| buf.push_text(" "));
+            }
+            // Everything else (HTML, footnote refs, task list markers, math,
+            // rules, etc.) is dropped silently per design §3.4.
+            _ => {}
+        }
+    }
+
+    /// Run `f` on whichever inline accumulator is open at the top of the
+    /// frame stack. No-op if no inline-accepting frame is open.
+    fn with_current_inlines<F: FnOnce(&mut InlineBuf)>(&mut self, f: F) {
+        for frame in self.frames.iter_mut().rev() {
+            match frame {
+                Frame::Paragraph { inlines, .. }
+                | Frame::Heading { inlines, .. }
+                | Frame::ListItem { inlines } => {
+                    f(inlines);
+                    return;
+                }
+                Frame::Quote { .. } | Frame::List { .. } => continue,
+                Frame::Code { .. } | Frame::Table { .. } => return,
+            }
+        }
+    }
+}
+
+fn heading_level_to_u8(level: HeadingLevel) -> u8 {
+    match level {
+        HeadingLevel::H1 => 1,
+        HeadingLevel::H2 => 2,
+        HeadingLevel::H3 => 3,
+        HeadingLevel::H4 => 4,
+        HeadingLevel::H5 => 5,
+        HeadingLevel::H6 => 6,
+    }
+}
+
+/// Detect a paragraph whose entire trimmed source is `![alt](src)` — the
+/// canonical "block-level image" shape. Returns `(alt, src)` if so. We do
+/// this by scanning the original bytes (not the inline events) so it stays
+/// robust to pulldown-cmark's internal representation of images.
+fn match_block_image(body: &[u8], range: &Range<usize>) -> Option<(String, String)> {
+    let slice = body.get(range.clone())?;
+    let s = std::str::from_utf8(slice).ok()?.trim();
+    if !s.starts_with("![") {
+        return None;
+    }
+    // Find the closing `]` of the alt text. Markdown does not allow nested
+    // brackets without escaping — for a block-level image we only handle the
+    // simple form. Anything else falls through to ordinary paragraph parsing.
+    let close_bracket = s.find("](")?;
+    let alt = &s[2..close_bracket];
+    // The rest must be `(SRC)` and nothing else.
+    let after = &s[close_bracket + 2..];
+    let close_paren = after.rfind(')')?;
+    if close_paren != after.len() - 1 {
+        return None;
+    }
+    let src = &after[..close_paren];
+    // Reject if alt or src contain a newline — that means the paragraph has
+    // more content beyond the image and isn't a pure block-level image.
+    if alt.contains('\n') || src.contains('\n') {
+        return None;
+    }
+    // Reject brackets/parens inside src — tolerated by CommonMark via
+    // angle-bracket-wrap, but we keep this conservative for now.
+    Some((alt.to_string(), src.to_string()))
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn parse(body: &str, offset: u32) -> (Vec<ParsedBlock>, Vec<Warning>) {
+        parse_blocks(body.as_bytes(), offset).unwrap()
+    }
+
+    // ---- heading tree depth + heading_path correctness ----------------------
+
+    #[test]
+    fn heading_path_is_ancestors_only() {
+        let body = "# H1 text\n\n## H2 text\n\nbody under h2\n";
+        let (blocks, warns) = parse(body, 1);
+        assert!(warns.is_empty(), "warns: {warns:?}");
+        // Expect: H1 heading, H2 heading, paragraph.
+        assert_eq!(blocks.len(), 3);
+        // H1 has no ancestor.
+        assert_eq!(blocks[0].heading_path, Vec::<String>::new());
+        match &blocks[0].payload {
+            ParsedPayload::Heading { level, text } => {
+                assert_eq!(*level, 1);
+                assert_eq!(text, "H1 text");
+            }
+            _ => panic!("expected heading"),
+        }
+        // H2 nests under H1.
+        assert_eq!(blocks[1].heading_path, vec!["H1 text".to_string()]);
+        // Paragraph nests under H1 → H2.
+        assert_eq!(
+            blocks[2].heading_path,
+            vec!["H1 text".to_string(), "H2 text".to_string()]
+        );
+    }
+
+    #[test]
+    fn h1_resets_deeper_path() {
+        let body = "# A\n\n## A.1\n\np1\n\n# B\n\np2\n";
+        let (blocks, _) = parse(body, 1);
+        // p1 under [A, A.1]; p2 under [B].
+        let p1 = blocks.iter().find(|b| matches!(b.payload, ParsedPayload::Paragraph { ref text, .. } if text == "p1")).unwrap();
+        let p2 = blocks.iter().find(|b| matches!(b.payload, ParsedPayload::Paragraph { ref text, .. } if text == "p2")).unwrap();
+        assert_eq!(p1.heading_path, vec!["A".to_string(), "A.1".to_string()]);
+        assert_eq!(p2.heading_path, vec!["B".to_string()]);
+    }
+
+    // ---- code blocks ---------------------------------------------------------
+
+    #[test]
+    fn code_block_lang_preserved() {
+        let body = "```rust\nfn main(){}\n```\n";
+        let (blocks, _) = parse(body, 1);
+        assert_eq!(blocks.len(), 1);
+        match &blocks[0].payload {
+            ParsedPayload::Code { lang, code } => {
+                assert_eq!(lang.as_deref(), Some("rust"));
+                assert_eq!(code, "fn main(){}");
+            }
+            _ => panic!("expected code"),
+        }
+    }
+
+    #[test]
+    fn code_block_no_lang_is_none() {
+        let body = "```\nfoo\n```\n";
+        let (blocks, _) = parse(body, 1);
+        match &blocks[0].payload {
+            ParsedPayload::Code { lang, .. } => assert_eq!(lang, &None),
+            _ => panic!("expected code"),
+        }
+    }
+
+    // ---- tables --------------------------------------------------------------
+
+    #[test]
+    fn gfm_table_parses_headers_and_rows() {
+        let body = "| a | b | c |\n|---|---|---|\n| 1 | 2 | 3 |\n| x | y | z |\n";
+        let (blocks, warns) = parse(body, 1);
+        assert!(warns.is_empty(), "warns: {warns:?}");
+        assert_eq!(blocks.len(), 1);
+        match &blocks[0].payload {
+            ParsedPayload::Table { headers, rows } => {
+                assert_eq!(headers, &vec!["a".to_string(), "b".to_string(), "c".to_string()]);
+                assert_eq!(rows.len(), 2);
+                assert_eq!(rows[0], vec!["1".to_string(), "2".to_string(), "3".to_string()]);
+                assert_eq!(rows[1], vec!["x".to_string(), "y".to_string(), "z".to_string()]);
+            }
+            _ => panic!("expected table, got {:?}", blocks[0].payload),
+        }
+    }
+
+    /// Drive the malformed-table fallback path directly by feeding a
+    /// hand-built event stream through `WalkState::handle_event`. This
+    /// pins the contract: when the table frame's `malformed` field is set
+    /// at end-of-table, the block degrades to a paragraph and a
+    /// `MalformedTable` warning is emitted.
+    ///
+    /// Doing it via events (instead of a real markdown source) is
+    /// necessary because `pulldown-cmark` auto-normalizes GFM tables —
+    /// short rows are padded, long rows are truncated, so a real markdown
+    /// input never reaches the malformed branch.
+    #[test]
+    fn malformed_table_falls_back_to_paragraph_with_warning() {
+        use pulldown_cmark::{Alignment, CowStr};
+
+        let body = b"| a | b |\n|---|---|\n| 1 |\n";
+        let line_index = LineIndex::new(body);
+        let mut state = WalkState::new(1, &line_index, body);
+        let mut warnings = Vec::new();
+
+        // Synthetic events — fake a 3-column table with a 2-cell header so
+        // the `malformed` branch fires.
+        let aligns = vec![Alignment::None, Alignment::None, Alignment::None];
+        state.handle_event(Event::Start(Tag::Table(aligns)), 0..body.len(), &mut warnings);
+        state.handle_event(Event::Start(Tag::TableHead), 0..0, &mut warnings);
+        state.handle_event(Event::Start(Tag::TableCell), 0..0, &mut warnings);
+        state.handle_event(Event::Text(CowStr::Borrowed("a")), 0..0, &mut warnings);
+        state.handle_event(Event::End(TagEnd::TableCell), 0..0, &mut warnings);
+        state.handle_event(Event::Start(Tag::TableCell), 0..0, &mut warnings);
+        state.handle_event(Event::Text(CowStr::Borrowed("b")), 0..0, &mut warnings);
+        state.handle_event(Event::End(TagEnd::TableCell), 0..0, &mut warnings);
+        // Two-cell header with cols=3 → triggers malformed.
+        state.handle_event(Event::End(TagEnd::TableHead), 0..0, &mut warnings);
+        state.handle_event(Event::End(TagEnd::Table), 0..body.len(), &mut warnings);
+
+        let blocks = state.finish();
+        assert_eq!(blocks.len(), 1);
+        assert_eq!(blocks[0].kind, ParsedBlockKind::Paragraph);
+        match &blocks[0].payload {
+            ParsedPayload::Paragraph { text, .. } => {
+                assert!(text.contains("| a | b |"), "raw markdown preserved: {text:?}");
+            }
+            _ => panic!("expected paragraph fallback"),
+        }
+        assert_eq!(warnings.len(), 1);
+        assert!(matches!(warnings[0].kind, WarningKind::MalformedTable));
+        assert!(warnings[0].note.contains("header"));
+    }
+
+    // ---- line ranges (LF) ----------------------------------------------------
+
+    #[test]
+    fn line_range_lf_paragraph_at_body_line_5() {
+        // body lines: 1=blank, 2=blank, 3=blank, 4=blank, 5=paragraph
+        let body = "\n\n\n\nhello world\n";
+        // body_offset_lines=6 → body line 0 (zero-indexed) → file line 6.
+        // The paragraph is at body 0-indexed line 4 → file line 10.
+        let (blocks, _) = parse(body, 6);
+        assert_eq!(blocks.len(), 1);
+        match blocks[0].source_span {
+            SourceSpan::Line { start, end } => {
+                assert_eq!(start, 10, "paragraph should start at file line 10");
+                assert_eq!(end, 10);
+            }
+            _ => panic!("expected line span"),
+        }
+    }
+
+    #[test]
+    fn line_range_lf_multi_line_paragraph() {
+        // body line 0 = "a", 1 = "b", 2 = "c"; one paragraph, lines 0..=2.
+        let body = "a\nb\nc\n";
+        let (blocks, _) = parse(body, 1);
+        assert_eq!(blocks.len(), 1);
+        match blocks[0].source_span {
+            SourceSpan::Line { start, end } => {
+                assert_eq!(start, 1);
+                assert_eq!(end, 3);
+            }
+            _ => panic!(),
+        }
+    }
+
+    // ---- line ranges (CRLF) --------------------------------------------------
+
+    #[test]
+    fn line_range_crlf_matches_lf() {
+        let body_lf = "\n\n\n\nhello\n";
+        let body_crlf = "\r\n\r\n\r\n\r\nhello\r\n";
+        let (lf_blocks, _) = parse(body_lf, 6);
+        let (crlf_blocks, _) = parse(body_crlf, 6);
+        assert_eq!(lf_blocks.len(), 1);
+        assert_eq!(crlf_blocks.len(), 1);
+        assert_eq!(lf_blocks[0].source_span, crlf_blocks[0].source_span);
+    }
+
+    // ---- image ref -----------------------------------------------------------
+
+    #[test]
+    fn image_ref_block_captures_src_and_alt() {
+        let body = "![hello](pic.png)\n";
+        let (blocks, _) = parse(body, 1);
+        assert_eq!(blocks.len(), 1);
+        match &blocks[0].payload {
+            ParsedPayload::ImageRef { src, alt } => {
+                assert_eq!(alt, "hello");
+                assert_eq!(src, "pic.png");
+            }
+            _ => panic!("expected image ref, got {:?}", blocks[0].payload),
+        }
+        assert_eq!(blocks[0].kind, ParsedBlockKind::ImageRef);
+    }
+
+    #[test]
+    fn inline_image_inside_paragraph_is_dropped() {
+        // The image is part of a longer paragraph → not a block-level image.
+        let body = "see ![alt](pic.png) here\n";
+        let (blocks, _) = parse(body, 1);
+        assert_eq!(blocks.len(), 1);
+        match &blocks[0].payload {
+            ParsedPayload::Paragraph { inlines, .. } => {
+                // Only Text/Code/Link/Strong/Emph allowed.
+                for inl in inlines {
+                    assert!(
+                        matches!(
+                            inl,
+                            Inline::Text(_) | Inline::Code(_) | Inline::Link { .. } | Inline::Strong(_) | Inline::Emph(_)
+                        ),
+                        "unexpected inline kind: {:?}",
+                        inl
+                    );
+                }
+            }
+            _ => panic!("expected paragraph, got {:?}", blocks[0].payload),
+        }
+    }
+
+    // ---- nested lists --------------------------------------------------------
+
+    #[test]
+    fn nested_list_flattens_into_parent_item() {
+        let body = "- a\n  - x\n  - y\n- b\n";
+        let (blocks, _) = parse(body, 1);
+        assert_eq!(blocks.len(), 1);
+        match &blocks[0].payload {
+            ParsedPayload::List { ordered, items } => {
+                assert!(!ordered);
+                assert_eq!(items.len(), 2, "two top-level items");
+                // First item should contain "a" plus a flattened rendering
+                // of the nested sub-list.
+                let flat = flatten_inlines_to_text(&items[0]);
+                assert!(flat.contains("a"), "first item missing 'a': {flat:?}");
+                assert!(flat.contains("- x"), "first item missing '- x': {flat:?}");
+                assert!(flat.contains("- y"), "first item missing '- y': {flat:?}");
+                let flat2 = flatten_inlines_to_text(&items[1]);
+                assert_eq!(flat2.trim(), "b");
+            }
+            _ => panic!("expected list, got {:?}", blocks[0].payload),
+        }
+    }
+
+    // ---- malformed input no-panic -------------------------------------------
+
+    #[test]
+    fn random_bytes_do_not_panic() {
+        // Tiny xorshift PRNG seeded with a constant; deterministic inputs
+        // covering both valid-ish and invalid markdown / utf-8 mixes.
+        let mut state: u32 = 0x1234_5678;
+        let mut next = || {
+            state ^= state << 13;
+            state ^= state >> 17;
+            state ^= state << 5;
+            state
+        };
+        for _ in 0..100 {
+            let len = (next() as usize) % 256;
+            let bytes: Vec<u8> = (0..len).map(|_| next() as u8).collect();
+            let _ = parse_blocks(&bytes, 1).expect("must be Ok");
+        }
+    }
+
+    #[test]
+    fn adversarial_inputs_no_panic() {
+        // Hand-crafted oddities.
+        let cases: &[&[u8]] = &[
+            b"",
+            b"\0\0\0",
+            b"```\nunclosed",
+            b"# heading\n```\nfn main() {",
+            b"| a | b |\n|---|---|\n| 1 |\n",   // short row
+            b"| a | b |\n|---|\n| 1 | 2 |\n",   // header/sep mismatch
+            b"![",
+            b"](",
+            b"---\nfm: yes\n",
+            b"#######",                                                  // 7 hashes (invalid heading)
+            b"\xff\xfe\x00\x00garbage",                                   // non-utf8
+            "# 한글\n\n본문\n".as_bytes(),
+        ];
+        for c in cases {
+            let _ = parse_blocks(c, 1).expect("must be Ok");
+        }
+    }
+
+    // ---- inline filter -------------------------------------------------------
+
+    #[test]
+    fn only_allowed_inlines_emitted() {
+        let body = "**bold** *em* `code` [link](u)\n";
+        let (blocks, _) = parse(body, 1);
+        match &blocks[0].payload {
+            ParsedPayload::Paragraph { inlines, .. } => {
+                let kinds: Vec<&'static str> = inlines.iter().map(|i| match i {
+                    Inline::Text(_) => "Text",
+                    Inline::Code(_) => "Code",
+                    Inline::Link { .. } => "Link",
+                    Inline::Strong(_) => "Strong",
+                    Inline::Emph(_) => "Emph",
+                }).collect();
+                assert!(kinds.contains(&"Strong"));
+                assert!(kinds.contains(&"Emph"));
+                assert!(kinds.contains(&"Code"));
+                assert!(kinds.contains(&"Link"));
+            }
+            _ => panic!(),
+        }
+    }
+}
diff --git a/crates/kb-parse-md/src/lib.rs b/crates/kb-parse-md/src/lib.rs
index 4ce413d..245e8bb 100644
--- a/crates/kb-parse-md/src/lib.rs
+++ b/crates/kb-parse-md/src/lib.rs
@@ -1,19 +1,21 @@
 //! `kb-parse-md` — Markdown parsing for the KB pipeline (§3.7b).
 //!
-//! P1-2 implements the **frontmatter** submodule only. P1-3 will add a
-//! sibling `blocks` submodule for block parsing using `pulldown-cmark`.
-//!
-//! Public surface for P1-2 is intentionally narrow:
+//! Public surface:
 //!
 //! * [`parse_frontmatter`] — pure function from Markdown bytes to
-//!   `(Metadata, Option<FrontmatterSpan>, Vec<Warning>)`.
+//!   `(Metadata, Option<FrontmatterSpan>, Vec<Warning>)` (P1-2).
 //! * [`BodyHints`] — caller-supplied fallbacks that feed the §0 Q9 derive
-//!   table when frontmatter is missing or partial.
+//!   table when frontmatter is missing or partial (P1-2).
 //! * [`FrontmatterSpan`] — byte offsets of the frontmatter region in the
-//!   input slice (returned by [`parse_frontmatter`]).
+//!   input slice (returned by [`parse_frontmatter`]) (P1-2).
+//! * [`parse_blocks`] — pure function from Markdown body bytes to
+//!   `(Vec<ParsedBlock>, Vec<Warning>)` with heading paths and 1-indexed
+//!   `SourceSpan::Line` ranges relative to the original file (P1-3).
 //!
 //! Anything else in this crate is `pub(crate)` and may change without notice.
 
+pub mod blocks;
 pub mod frontmatter;
 
+pub use blocks::parse_blocks;
 pub use frontmatter::{BodyHints, FrontmatterSpan, parse_frontmatter};
-- 
2.49.1


From f604a381dfda2e00284e3e299c8fa2dedf69f851 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Thu, 30 Apr 2026 14:17:41 +0000
Subject: [PATCH 02/10] p1-3: snapshot tests + clippy fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds two snapshot tests (`nested-headings.md`, `code-and-table.md`) under
crates/kb-parse-md/tests/blocks_snapshots.rs, with matching baseline JSON
next to each fixture. The snapshot view projects `kb_core::Inline` to
flat strings — `Inline` carries `serde(tag = "kind")` which is
incompatible with newtype variants holding a primitive (`Text(String)`),
so direct serialization of `ParsedBlock` would fail today. The view
preserves the contract that matters for P1-3 (heading paths, source
spans, payload kinds, payload text/code/table content) and will keep
working once kb-core fixes the Inline schema in a later task.

Also tightens `level_to_use >= 1 && <= 6` into `(1..=6).contains(&_)` to
satisfy `clippy::manual_range_contains`.
---
 crates/kb-parse-md/src/blocks.rs              |   2 +-
 crates/kb-parse-md/tests/blocks_snapshots.rs  | 240 ++++++++++++++++++
 .../code-and-table.blocks.snapshot.json       |  63 +++++
 .../nested-headings.blocks.snapshot.json      | 136 ++++++++++
 4 files changed, 440 insertions(+), 1 deletion(-)
 create mode 100644 crates/kb-parse-md/tests/blocks_snapshots.rs
 create mode 100644 fixtures/markdown/code-and-table.blocks.snapshot.json
 create mode 100644 fixtures/markdown/nested-headings.blocks.snapshot.json

diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs
index 1ddfaa6..86913de 100644
--- a/crates/kb-parse-md/src/blocks.rs
+++ b/crates/kb-parse-md/src/blocks.rs
@@ -528,7 +528,7 @@ impl<'a> WalkState<'a> {
                     let _ = level_u8;
 
                     // Update heading stack: clear deeper levels, set this level.
-                    if level_to_use >= 1 && level_to_use <= 6 {
+                    if (1..=6).contains(&level_to_use) {
                         let idx = (level_to_use - 1) as usize;
                         for slot in &mut self.heading_stack[idx + 1..] {
                             *slot = None;
diff --git a/crates/kb-parse-md/tests/blocks_snapshots.rs b/crates/kb-parse-md/tests/blocks_snapshots.rs
new file mode 100644
index 0000000..483286e
--- /dev/null
+++ b/crates/kb-parse-md/tests/blocks_snapshots.rs
@@ -0,0 +1,240 @@
+//! Snapshot tests pinning the `parse_blocks` output for two fixtures.
+//!
+//! Baselines are hand-authored / regenerated via the `--ignored` emitter
+//! below. `body_offset_lines = 1` is used for both fixtures (no
+//! frontmatter, body starts at file line 1).
+//!
+//! Note on snapshot shape: `kb_core::Inline` carries a `serde(tag = "kind")`
+//! enum representation that cannot serialize newtype variants holding a
+//! primitive (`Inline::Text(String)` etc.) — that's a serde limitation, not
+//! ours, and is fixed up in a later kb-core task. To keep the snapshot
+//! human-readable (and stable across that future fix), we project each
+//! `ParsedBlock` into a `BlockView` that flattens inline content to plain
+//! strings before serialization. This still pins the *contract* that
+//! matters for P1-3: heading paths, source spans, payload kinds, payload
+//! text content, table headers/rows, and code lang/body.
+
+use kb_core::{Inline, SourceSpan};
+use kb_parse_md::parse_blocks;
+use kb_parse_types::{ParsedBlock, ParsedPayload, Warning};
+use serde::Serialize;
+use serde_json::Value;
+use std::fs;
+use std::path::PathBuf;
+
+#[derive(Serialize)]
+struct Snapshot {
+    blocks: Vec<BlockView>,
+    warnings: Vec<Warning>,
+}
+
+#[derive(Serialize)]
+struct BlockView {
+    kind: String,
+    heading_path: Vec<String>,
+    source_span: SourceSpan,
+    payload: PayloadView,
+}
+
+#[derive(Serialize)]
+#[serde(tag = "kind", rename_all = "lowercase")]
+enum PayloadView {
+    Heading {
+        level: u8,
+        text: String,
+    },
+    Paragraph {
+        text: String,
+        inlines_flat: String,
+    },
+    List {
+        ordered: bool,
+        items_flat: Vec<String>,
+    },
+    Code {
+        lang: Option<String>,
+        code: String,
+    },
+    Table {
+        headers: Vec<String>,
+        rows: Vec<Vec<String>>,
+    },
+    Quote {
+        text: String,
+        inlines_flat: String,
+    },
+    ImageRef {
+        src: String,
+        alt: String,
+    },
+    AudioRef {
+        src: String,
+    },
+}
+
+fn flatten_inline(i: &Inline, out: &mut String) {
+    match i {
+        Inline::Text(s) | Inline::Code(s) => out.push_str(s),
+        Inline::Link { text, href } => {
+            out.push('[');
+            out.push_str(text);
+            out.push_str("](");
+            out.push_str(href);
+            out.push(')');
+        }
+        Inline::Strong(v) => {
+            out.push_str("**");
+            for c in v {
+                flatten_inline(c, out);
+            }
+            out.push_str("**");
+        }
+        Inline::Emph(v) => {
+            out.push('*');
+            for c in v {
+                flatten_inline(c, out);
+            }
+            out.push('*');
+        }
+    }
+}
+
+fn flatten(inlines: &[Inline]) -> String {
+    let mut out = String::new();
+    for i in inlines {
+        flatten_inline(i, &mut out);
+    }
+    out
+}
+
+fn block_to_view(b: &ParsedBlock) -> BlockView {
+    let kind = format!("{:?}", b.kind).to_lowercase();
+    let payload = match &b.payload {
+        ParsedPayload::Heading { level, text } => PayloadView::Heading {
+            level: *level,
+            text: text.clone(),
+        },
+        ParsedPayload::Paragraph { text, inlines } => PayloadView::Paragraph {
+            text: text.clone(),
+            inlines_flat: flatten(inlines),
+        },
+        ParsedPayload::List { ordered, items } => PayloadView::List {
+            ordered: *ordered,
+            items_flat: items.iter().map(|it| flatten(it)).collect(),
+        },
+        ParsedPayload::Code { lang, code } => PayloadView::Code {
+            lang: lang.clone(),
+            code: code.clone(),
+        },
+        ParsedPayload::Table { headers, rows } => PayloadView::Table {
+            headers: headers.clone(),
+            rows: rows.clone(),
+        },
+        ParsedPayload::Quote { text, inlines } => PayloadView::Quote {
+            text: text.clone(),
+            inlines_flat: flatten(inlines),
+        },
+        ParsedPayload::ImageRef { src, alt } => PayloadView::ImageRef {
+            src: src.clone(),
+            alt: alt.clone(),
+        },
+        ParsedPayload::AudioRef { src } => PayloadView::AudioRef { src: src.clone() },
+    };
+    BlockView {
+        kind,
+        heading_path: b.heading_path.clone(),
+        source_span: b.source_span.clone(),
+        payload,
+    }
+}
+
+fn fixtures_dir() -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("..")
+        .join("..")
+        .join("fixtures")
+        .join("markdown")
+}
+
+fn assert_snapshot(fixture: &str, baseline: &str) {
+    let dir = fixtures_dir();
+    let bytes = fs::read(dir.join(fixture)).expect("fixture readable");
+
+    let (blocks, warns) = parse_blocks(&bytes, 1).unwrap();
+    let snap = Snapshot {
+        blocks: blocks.iter().map(block_to_view).collect(),
+        warnings: warns,
+    };
+    let actual: Value = serde_json::to_value(&snap).unwrap();
+
+    let expected_text =
+        fs::read_to_string(dir.join(baseline)).expect("snapshot baseline readable");
+    let expected: Value = serde_json::from_str(&expected_text).expect("baseline parses as json");
+
+    if actual != expected {
+        let actual_pretty = serde_json::to_string_pretty(&actual).unwrap();
+        panic!(
+            "snapshot drift for {fixture}\n\
+             --- expected ({baseline}) ---\n{expected_text}\n\
+             --- actual ---\n{actual_pretty}\n\
+             If the change is intentional, update {baseline}."
+        );
+    }
+}
+
+#[test]
+fn nested_headings_blocks_snapshot() {
+    assert_snapshot(
+        "nested-headings.md",
+        "nested-headings.blocks.snapshot.json",
+    );
+}
+
+#[test]
+fn code_and_table_blocks_snapshot() {
+    assert_snapshot(
+        "code-and-table.md",
+        "code-and-table.blocks.snapshot.json",
+    );
+}
+
+/// Run with `cargo test -p kb-parse-md --test blocks_snapshots emit_blocks_snapshots -- --ignored --nocapture`
+/// to regenerate the baseline JSON files from the current parser output.
+#[test]
+#[ignore]
+fn emit_blocks_snapshots() {
+    let dir = fixtures_dir();
+    for (fixture, baseline) in [
+        ("nested-headings.md", "nested-headings.blocks.snapshot.json"),
+        ("code-and-table.md", "code-and-table.blocks.snapshot.json"),
+    ] {
+        let bytes = fs::read(dir.join(fixture)).unwrap();
+        let (blocks, warns) = parse_blocks(&bytes, 1).unwrap();
+        let snap = Snapshot {
+            blocks: blocks.iter().map(block_to_view).collect(),
+            warnings: warns,
+        };
+        let json = serde_json::to_string_pretty(&snap).unwrap();
+        fs::write(dir.join(baseline), format!("{json}\n")).unwrap();
+        eprintln!("wrote {}", dir.join(baseline).display());
+    }
+}
+
+/// Determinism: parsing the same fixture twice in a row must give equal output.
+#[test]
+fn snapshot_is_deterministic_across_runs() {
+    let dir = fixtures_dir();
+    let bytes = fs::read(dir.join("nested-headings.md")).unwrap();
+    let (a_blocks, a_warns) = parse_blocks(&bytes, 1).unwrap();
+    let (b_blocks, b_warns) = parse_blocks(&bytes, 1).unwrap();
+    // Compare via the view (which is fully serializable) and via the
+    // structural equality on `ParsedBlock` itself (no serde involved).
+    assert_eq!(a_blocks, b_blocks);
+    assert_eq!(a_warns, b_warns);
+    let av: Vec<_> = a_blocks.iter().map(block_to_view).collect();
+    let bv: Vec<_> = b_blocks.iter().map(block_to_view).collect();
+    assert_eq!(
+        serde_json::to_value(&av).unwrap(),
+        serde_json::to_value(&bv).unwrap()
+    );
+}
diff --git a/fixtures/markdown/code-and-table.blocks.snapshot.json b/fixtures/markdown/code-and-table.blocks.snapshot.json
new file mode 100644
index 0000000..8335e43
--- /dev/null
+++ b/fixtures/markdown/code-and-table.blocks.snapshot.json
@@ -0,0 +1,63 @@
+{
+  "blocks": [
+    {
+      "kind": "heading",
+      "heading_path": [],
+      "source_span": {
+        "kind": "line",
+        "start": 1,
+        "end": 1
+      },
+      "payload": {
+        "kind": "heading",
+        "level": 1,
+        "text": "Code And Table"
+      }
+    },
+    {
+      "kind": "code",
+      "heading_path": [
+        "Code And Table"
+      ],
+      "source_span": {
+        "kind": "line",
+        "start": 3,
+        "end": 7
+      },
+      "payload": {
+        "kind": "code",
+        "lang": "rust",
+        "code": "fn main() {\n    println!(\"hi\");\n}"
+      }
+    },
+    {
+      "kind": "table",
+      "heading_path": [
+        "Code And Table"
+      ],
+      "source_span": {
+        "kind": "line",
+        "start": 9,
+        "end": 12
+      },
+      "payload": {
+        "kind": "table",
+        "headers": [
+          "col a",
+          "col b"
+        ],
+        "rows": [
+          [
+            "1",
+            "2"
+          ],
+          [
+            "3",
+            "4"
+          ]
+        ]
+      }
+    }
+  ],
+  "warnings": []
+}
diff --git a/fixtures/markdown/nested-headings.blocks.snapshot.json b/fixtures/markdown/nested-headings.blocks.snapshot.json
new file mode 100644
index 0000000..032c3e6
--- /dev/null
+++ b/fixtures/markdown/nested-headings.blocks.snapshot.json
@@ -0,0 +1,136 @@
+{
+  "blocks": [
+    {
+      "kind": "heading",
+      "heading_path": [],
+      "source_span": {
+        "kind": "line",
+        "start": 1,
+        "end": 1
+      },
+      "payload": {
+        "kind": "heading",
+        "level": 1,
+        "text": "Top"
+      }
+    },
+    {
+      "kind": "paragraph",
+      "heading_path": [
+        "Top"
+      ],
+      "source_span": {
+        "kind": "line",
+        "start": 3,
+        "end": 3
+      },
+      "payload": {
+        "kind": "paragraph",
+        "text": "intro",
+        "inlines_flat": "intro"
+      }
+    },
+    {
+      "kind": "heading",
+      "heading_path": [
+        "Top"
+      ],
+      "source_span": {
+        "kind": "line",
+        "start": 5,
+        "end": 5
+      },
+      "payload": {
+        "kind": "heading",
+        "level": 2,
+        "text": "Section A"
+      }
+    },
+    {
+      "kind": "paragraph",
+      "heading_path": [
+        "Top",
+        "Section A"
+      ],
+      "source_span": {
+        "kind": "line",
+        "start": 7,
+        "end": 7
+      },
+      "payload": {
+        "kind": "paragraph",
+        "text": "body of A",
+        "inlines_flat": "body of A"
+      }
+    },
+    {
+      "kind": "heading",
+      "heading_path": [
+        "Top",
+        "Section A"
+      ],
+      "source_span": {
+        "kind": "line",
+        "start": 9,
+        "end": 9
+      },
+      "payload": {
+        "kind": "heading",
+        "level": 3,
+        "text": "Sub A.1"
+      }
+    },
+    {
+      "kind": "paragraph",
+      "heading_path": [
+        "Top",
+        "Section A",
+        "Sub A.1"
+      ],
+      "source_span": {
+        "kind": "line",
+        "start": 11,
+        "end": 11
+      },
+      "payload": {
+        "kind": "paragraph",
+        "text": "deeper",
+        "inlines_flat": "deeper"
+      }
+    },
+    {
+      "kind": "heading",
+      "heading_path": [
+        "Top"
+      ],
+      "source_span": {
+        "kind": "line",
+        "start": 13,
+        "end": 13
+      },
+      "payload": {
+        "kind": "heading",
+        "level": 2,
+        "text": "Section B"
+      }
+    },
+    {
+      "kind": "paragraph",
+      "heading_path": [
+        "Top",
+        "Section B"
+      ],
+      "source_span": {
+        "kind": "line",
+        "start": 15,
+        "end": 15
+      },
+      "payload": {
+        "kind": "paragraph",
+        "text": "body of B",
+        "inlines_flat": "body of B"
+      }
+    }
+  ],
+  "warnings": []
+}
-- 
2.49.1


From 2ac0f56105d6e76b183db9b82d5aab5ac72134c8 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Thu, 30 Apr 2026 14:18:54 +0000
Subject: [PATCH 03/10] p1-3: drop dead bindings in heading + table end
 handlers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Removes the leftover `let _ = level_u8;` and `let _ = header_count;`
discards. The Heading frame already carries the canonical level (we
populated it from `Tag::Heading` at Start), so we destructure that
directly and ignore the redundant `TagEnd::Heading(level)` payload.
The header_count helper was dead — Frame::Table tracks `cols`
internally and we never consumed `header_count`.
---
 crates/kb-parse-md/src/blocks.rs | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs
index 86913de..5d225c0 100644
--- a/crates/kb-parse-md/src/blocks.rs
+++ b/crates/kb-parse-md/src/blocks.rs
@@ -519,13 +519,13 @@ impl<'a> WalkState<'a> {
             }
 
             // ---- Container ends -------------------------------------------------
-            Event::End(TagEnd::Heading(level)) => {
-                let level_u8 = heading_level_to_u8(level);
-                if let Some(Frame::Heading { level: lvl, range, inlines }) = self.frames.pop() {
+            Event::End(TagEnd::Heading(_level)) => {
+                // The Tag::Heading frame is the source of truth for the
+                // level — `_level` from TagEnd is identical for well-formed
+                // input. We trust the frame.
+                if let Some(Frame::Heading { level: level_to_use, range, inlines }) = self.frames.pop() {
                     let (_inline_vec, text) = inlines.finish();
                     let text = text.trim().to_string();
-                    let level_to_use = lvl;
-                    let _ = level_u8;
 
                     // Update heading stack: clear deeper levels, set this level.
                     if (1..=6).contains(&level_to_use) {
@@ -679,12 +679,10 @@ impl<'a> WalkState<'a> {
                     range,
                     headers,
                     rows,
-                    cols,
                     malformed,
                     ..
                 }) = self.frames.pop()
                 {
-                    let header_count = if cols > 0 { cols } else { headers.len() };
                     let block = if let Some(note) = malformed {
                         // Fall back to a paragraph carrying the raw markdown.
                         warnings.push(Warning {
@@ -706,7 +704,6 @@ impl<'a> WalkState<'a> {
                             },
                         }
                     } else {
-                        let _ = header_count;
                         ParsedBlock {
                             kind: ParsedBlockKind::Table,
                             heading_path: self.heading_path(),
-- 
2.49.1


From de9164802be86cf7f7386cc02dddfaa6b3818e94 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Thu, 30 Apr 2026 14:34:58 +0000
Subject: [PATCH 04/10] p1-3: fix span arithmetic overflow + body_offset_lines
 fuzz

`span_for` previously used `u32 + u32` directly, so callers passing a
large `body_offset_lines` could panic (debug, then masked by
`catch_unwind` and the entire body discarded) or wrap to an inverted
span with `start > end` (release).

Switch to `checked_add`; on overflow flag the walk state and at the
end of `parse_blocks_inner` discard accumulated blocks and surface a
single `Warning::ExtractFailed` carrying the offending body line.
This degrades cleanly without panicking and without emitting a
silently-broken span.

Also extend `random_bytes_do_not_panic` to mix u32::MAX-style offsets
across the fuzz iterations so the overflow path is exercised by the
randomized corpus.

New tests:
- body_offset_lines_max_returns_extract_failed
- body_offset_lines_zero_at_max_minus_one_no_overflow

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/kb-parse-md/src/blocks.rs | 96 ++++++++++++++++++++++++++++++--
 1 file changed, 90 insertions(+), 6 deletions(-)

diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs
index 5d225c0..56f5c92 100644
--- a/crates/kb-parse-md/src/blocks.rs
+++ b/crates/kb-parse-md/src/blocks.rs
@@ -96,6 +96,24 @@ fn parse_blocks_inner(body: &[u8], body_offset_lines: u32) -> (Vec<ParsedBlock>,
         state.handle_event(event, range, &mut warnings);
     }
 
+    // If line-number arithmetic saturated at any point (e.g. caller passed
+    // `body_offset_lines = u32::MAX`), discard the (potentially clamped)
+    // blocks and surface a single `ExtractFailed` warning. Returning the
+    // possibly-inverted spans would be more harmful than dropping output.
+    if state.overflow_detected {
+        let at = state
+            .overflow_at_body_line
+            .map(|n| n.to_string())
+            .unwrap_or_else(|| "?".to_string());
+        return (
+            Vec::new(),
+            vec![Warning {
+                kind: WarningKind::ExtractFailed,
+                note: format!("body_offset_lines overflow at body line {at}"),
+            }],
+        );
+    }
+
     (state.finish(), warnings)
 }
 
@@ -174,6 +192,14 @@ struct WalkState<'a> {
     /// Accumulator for the currently-open block. We push events into this
     /// frame and convert to a `ParsedBlock` when the matching End tag fires.
     frames: Vec<Frame>,
+
+    /// Set if any line-number arithmetic in [`WalkState::span_for`] would
+    /// overflow `u32::MAX`. When set, [`Self::finish`] discards accumulated
+    /// blocks and the caller emits a single `Warning::ExtractFailed` instead.
+    overflow_detected: bool,
+    /// Body-line (0-indexed) where overflow was first observed, for the
+    /// warning note.
+    overflow_at_body_line: Option<u32>,
 }
 
 /// One in-flight container we are accumulating events into. Stack-shaped so
@@ -372,6 +398,8 @@ impl<'a> WalkState<'a> {
             blocks: Vec::new(),
             heading_stack: Default::default(),
             frames: Vec::new(),
+            overflow_detected: false,
+            overflow_at_body_line: None,
         }
     }
 
@@ -386,16 +414,33 @@ impl<'a> WalkState<'a> {
             .collect()
     }
 
-    fn span_for(&self, range: &Range<usize>) -> SourceSpan {
+    fn span_for(&mut self, range: &Range<usize>) -> SourceSpan {
         let start_body = self.line_index.line_zero_indexed(range.start);
         let end_body = if range.end <= range.start {
             start_body
         } else {
             self.line_index.end_line_zero_indexed(range.end)
         };
-        SourceSpan::Line {
-            start: start_body + self.body_offset_lines,
-            end: end_body + self.body_offset_lines,
+        // Saturating add — but also remember whether overflow happened so the
+        // caller can degrade gracefully rather than silently emitting an
+        // inverted span. Without this guard, debug builds panic with
+        // "attempt to add with overflow" (caught by `catch_unwind`, masking
+        // the real cause) and release builds wrap to `start > end`.
+        match (
+            start_body.checked_add(self.body_offset_lines),
+            end_body.checked_add(self.body_offset_lines),
+        ) {
+            (Some(start), Some(end)) => SourceSpan::Line { start, end },
+            _ => {
+                if !self.overflow_detected {
+                    self.overflow_detected = true;
+                    self.overflow_at_body_line = Some(start_body);
+                }
+                SourceSpan::Line {
+                    start: start_body.saturating_add(self.body_offset_lines),
+                    end: end_body.saturating_add(self.body_offset_lines),
+                }
+            }
         }
     }
 
@@ -1156,6 +1201,9 @@ mod tests {
     fn random_bytes_do_not_panic() {
         // Tiny xorshift PRNG seeded with a constant; deterministic inputs
         // covering both valid-ish and invalid markdown / utf-8 mixes.
+        // We also vary `body_offset_lines` across the full u32 range —
+        // including u32::MAX — to exercise the saturating-add path in
+        // `span_for` and confirm the overflow guard prevents panics.
         let mut state: u32 = 0x1234_5678;
         let mut next = || {
             state ^= state << 13;
@@ -1163,13 +1211,49 @@ mod tests {
             state ^= state << 5;
             state
         };
-        for _ in 0..100 {
+        for i in 0..100 {
             let len = (next() as usize) % 256;
             let bytes: Vec<u8> = (0..len).map(|_| next() as u8).collect();
-            let _ = parse_blocks(&bytes, 1).expect("must be Ok");
+            // Mix a u32::MAX-style offset every few iterations so the fuzz
+            // exercises the overflow path explicitly.
+            let offset = match i % 4 {
+                0 => 0,
+                1 => 1,
+                2 => next(),
+                _ => u32::MAX - (next() % 16),
+            };
+            let _ = parse_blocks(&bytes, offset).expect("must be Ok");
         }
     }
 
+    #[test]
+    fn body_offset_lines_max_returns_extract_failed() {
+        // u32::MAX + any positive line number overflows; the parser must
+        // surface `ExtractFailed` and discard blocks rather than panic
+        // (debug) or emit an inverted span (release).
+        let body = b"# h\nbody\n";
+        let (blocks, warns) = parse_blocks(body, u32::MAX).expect("must be Ok");
+        assert!(blocks.is_empty(), "blocks should be discarded on overflow");
+        assert_eq!(warns.len(), 1, "expected one warning, got: {warns:?}");
+        assert_eq!(warns[0].kind, WarningKind::ExtractFailed);
+        assert!(
+            warns[0].note.contains("overflow"),
+            "note should mention overflow: {:?}",
+            warns[0].note
+        );
+    }
+
+    #[test]
+    fn body_offset_lines_zero_at_max_minus_one_no_overflow() {
+        // A body with exactly one logical line + offset = u32::MAX would
+        // still saturate. Pick offset such that no line ever reaches
+        // overflow — this should succeed normally.
+        let body = b"hi\n";
+        let (blocks, warns) = parse_blocks(body, u32::MAX - 100).expect("must be Ok");
+        assert_eq!(blocks.len(), 1);
+        assert!(warns.is_empty(), "no overflow expected, got: {warns:?}");
+    }
+
     #[test]
     fn adversarial_inputs_no_panic() {
         // Hand-crafted oddities.
-- 
2.49.1


From 0050cf32ea7618b7f69c22d04ccaf5cb7bd83f76 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Thu, 30 Apr 2026 14:36:27 +0000
Subject: [PATCH 05/10] p1-3: route block-level content inside list items into
 parent inline buffer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`emit_block` previously walked the frame stack looking only for a Quote
container, falling back to top-level on miss. That caused any block
emitted inside a list item — code blocks, images, tables, headings —
to escape the list and appear at the top of `blocks`, after the
entire list and out of source order. `ParsedPayload::List { items:
Vec<Vec<Inline>> }` cannot represent a child block structurally, so
the choice is between dropping content and flattening.

Extend the reverse-walk to also recognize `Frame::ListItem` and route
the block into a textual rendering appended to the item's inline
buffer (`flatten_block_into_item`):

- Code → fenced text approximation, preserving lang hint + body
- Image → `![alt](src)` text
- Audio → `[audio](src)` text
- Heading → leading hashes + text
- Quote → `> text`
- Nested List → same rendering as `nested_in_item` flatten
- Table → pipe-table approximation

Document order is preserved because flattening happens inside the
item's frame, before the item closes.

New tests:
- code_block_inside_list_item_flattens_into_parent
- image_inside_list_item_flattens_into_parent
- block_content_in_list_preserves_document_order

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/kb-parse-md/src/blocks.rs | 171 ++++++++++++++++++++++++++++++-
 1 file changed, 167 insertions(+), 4 deletions(-)

diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs
index 56f5c92..10afa2c 100644
--- a/crates/kb-parse-md/src/blocks.rs
+++ b/crates/kb-parse-md/src/blocks.rs
@@ -369,6 +369,88 @@ impl InlineBuf {
 
 }
 
+/// Flatten an emitted block into the inline buffer of an enclosing list
+/// item, preserving content but losing structure. This keeps document
+/// order and avoids "block escapes the list" — the alternative would be
+/// pushing the block to the top-level `blocks` vec, where it appears
+/// after the entire list and out of source order.
+///
+/// Code blocks render as a fenced-text approximation so the language
+/// hint and body survive. Images render as `![alt](src)`. Headings,
+/// tables, quotes, etc. render as their text payload.
+fn flatten_block_into_item(block: &ParsedBlock, inlines: &mut InlineBuf) {
+    match &block.payload {
+        ParsedPayload::Code { lang, code } => {
+            let mut rendered = String::from("\n\n```");
+            if let Some(l) = lang {
+                rendered.push_str(l);
+            }
+            rendered.push('\n');
+            rendered.push_str(code);
+            if !code.ends_with('\n') {
+                rendered.push('\n');
+            }
+            rendered.push_str("```\n");
+            inlines.push_text(&rendered);
+        }
+        ParsedPayload::ImageRef { src, alt } => {
+            inlines.push_text(&format!("![{alt}]({src})"));
+        }
+        ParsedPayload::AudioRef { src } => {
+            inlines.push_text(&format!("[audio]({src})"));
+        }
+        ParsedPayload::Heading { level, text } => {
+            // Render with leading hashes so structure is recognizable.
+            let hashes = "#".repeat((*level as usize).clamp(1, 6));
+            inlines.push_text(&format!("\n{hashes} {text}\n"));
+        }
+        ParsedPayload::Paragraph { text, inlines: child } => {
+            // Paragraphs inside list items normally don't reach this path
+            // (the Start(Tag::Paragraph) handler suppresses creating a
+            // Paragraph frame when the parent is a ListItem). This branch
+            // is defensive — e.g. a paragraph emitted via some future
+            // synthetic-event path.
+            inlines.push_text("\n");
+            for c in child {
+                inlines.push_inline(c.clone());
+            }
+            inlines.text.push_str(text);
+        }
+        ParsedPayload::Quote { text, .. } => {
+            inlines.push_text(&format!("\n> {text}\n"));
+        }
+        ParsedPayload::List { ordered, items } => {
+            // A non-nested-flag List that still ended up inside a ListItem
+            // (shouldn't normally happen — `nested_in_item` flattens at
+            // `End(List)`). Fall back to the same rendering as the nested
+            // path for safety.
+            let mut rendered = String::new();
+            for (i, item) in items.iter().enumerate() {
+                rendered.push('\n');
+                rendered.push_str("  ");
+                if *ordered {
+                    rendered.push_str(&format!("{}. ", i + 1));
+                } else {
+                    rendered.push_str("- ");
+                }
+                rendered.push_str(&flatten_inlines_to_text(item));
+            }
+            inlines.push_text(&rendered);
+        }
+        ParsedPayload::Table { headers, rows } => {
+            // Pipe-table approximation; structure lost, content preserved.
+            let mut rendered = String::from("\n");
+            rendered.push_str(&headers.join(" | "));
+            rendered.push('\n');
+            for row in rows {
+                rendered.push_str(&row.join(" | "));
+                rendered.push('\n');
+            }
+            inlines.push_text(&rendered);
+        }
+    }
+}
+
 fn flatten_inlines_to_text(inlines: &[Inline]) -> String {
     let mut out = String::new();
     for i in inlines {
@@ -446,12 +528,32 @@ impl<'a> WalkState<'a> {
 
     /// Where to emit a finished block: into the current container if any
     /// (Quote / ListItem), otherwise into the top-level `blocks` vec.
+    ///
+    /// Block-level content (code, image, heading, table, ...) inside a list
+    /// item cannot be represented structurally by `ParsedPayload::List`
+    /// (items hold `Vec<Inline>`, not child blocks). To preserve content
+    /// and document order we **flatten** such blocks into a textual
+    /// rendering and append it to the enclosing list item's inline buffer.
+    /// Without this, a code block inside a list item escapes to the
+    /// top-level `blocks` vec — out of order, and detached from its parent.
     fn emit_block(&mut self, block: ParsedBlock) {
         // Find the nearest enclosing container that accepts child blocks.
-        for frame in self.frames.iter_mut().rev() {
-            if let Frame::Quote { children, .. } = frame {
-                children.push(block);
-                return;
+        // Walk in reverse: ListItem and Quote both qualify; we honor
+        // whichever is closer to the top of the frame stack.
+        for idx in (0..self.frames.len()).rev() {
+            match &mut self.frames[idx] {
+                Frame::Quote { children, .. } => {
+                    children.push(block);
+                    return;
+                }
+                Frame::ListItem { inlines } => {
+                    // Render the block as text + inlines and append to the
+                    // item's inline buffer. Document order is preserved
+                    // because we run inside the item's frame.
+                    flatten_block_into_item(&block, inlines);
+                    return;
+                }
+                _ => continue,
             }
         }
         self.blocks.push(block);
@@ -1195,6 +1297,67 @@ mod tests {
         }
     }
 
+    // ---- block content inside list items ------------------------------------
+
+    #[test]
+    fn code_block_inside_list_item_flattens_into_parent() {
+        // Without the routing fix, the code block would escape the list and
+        // appear at top level (in wrong source order).
+        let body = "- item\n\n  ```rust\n  fn f(){}\n  ```\n\n- next\n";
+        let (blocks, _) = parse(body, 1);
+        // Expect a single List block, no escaped Code at top level.
+        assert_eq!(
+            blocks.len(),
+            1,
+            "code should not escape to top level: got {} blocks",
+            blocks.len()
+        );
+        match &blocks[0].payload {
+            ParsedPayload::List { ordered, items } => {
+                assert!(!ordered);
+                assert_eq!(items.len(), 2);
+                let flat = flatten_inlines_to_text(&items[0]);
+                assert!(flat.contains("item"), "first item missing 'item': {flat:?}");
+                assert!(flat.contains("fn f(){}"), "first item missing code body: {flat:?}");
+                assert!(flat.contains("```"), "first item missing fence: {flat:?}");
+                assert_eq!(flatten_inlines_to_text(&items[1]).trim(), "next");
+            }
+            _ => panic!("expected list, got {:?}", blocks[0].payload),
+        }
+    }
+
+    #[test]
+    fn image_inside_list_item_flattens_into_parent() {
+        let body = "- ![alt](pic.png)\n";
+        let (blocks, _) = parse(body, 1);
+        // The image should NOT escape to top level as a standalone ImageRef.
+        assert_eq!(blocks.len(), 1);
+        match &blocks[0].payload {
+            ParsedPayload::List { items, .. } => {
+                assert_eq!(items.len(), 1);
+                let flat = flatten_inlines_to_text(&items[0]);
+                assert!(
+                    flat.contains("![alt](pic.png)") || flat.contains("alt"),
+                    "image should be rendered into item: {flat:?}"
+                );
+            }
+            _ => panic!("expected list, got {:?}", blocks[0].payload),
+        }
+    }
+
+    #[test]
+    fn block_content_in_list_preserves_document_order() {
+        // Two items with code between; the resulting blocks should be in
+        // strictly source order: just one List, no top-level code block.
+        let body = "- a\n\n  ```\n  c\n  ```\n\n- b\n";
+        let (blocks, _) = parse(body, 1);
+        assert_eq!(blocks.len(), 1, "expected single list block");
+        match &blocks[0].kind {
+            kb_parse_types::ParsedBlockKind::List => {}
+            other => panic!("expected list, got {other:?}"),
+        }
+    }
+
     // ---- malformed input no-panic -------------------------------------------
 
     #[test]
-- 
2.49.1


From d49dbc19268c2ec9619d9b536a8b75f6298b2b7b Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Thu, 30 Apr 2026 14:37:40 +0000
Subject: [PATCH 06/10] p1-3: skip empty heading text when building
 heading_path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`# ` (a heading with no following text) used to seed the heading
stack with `Some("")`, which then propagated into every child block's
`heading_path` as a `""` segment — visibly polluting the path that
downstream consumers index by.

Filter empty entries from both `heading_path()` and the in-line
ancestor collection at heading-end. We deliberately keep `Some("")`
in the stack rather than skipping the assignment so the slot remains
occupied and a subsequent deeper heading is still positioned
correctly relative to its level — only the visible path drops the
empty.

New tests:
- empty_heading_does_not_pollute_path
- empty_h1_then_h2_does_not_break_stack

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/kb-parse-md/src/blocks.rs | 64 +++++++++++++++++++++++++++++++-
 1 file changed, 62 insertions(+), 2 deletions(-)

diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs
index 10afa2c..cf005d6 100644
--- a/crates/kb-parse-md/src/blocks.rs
+++ b/crates/kb-parse-md/src/blocks.rs
@@ -490,9 +490,14 @@ impl<'a> WalkState<'a> {
     }
 
     fn heading_path(&self) -> Vec<String> {
+        // Skip slots whose stored heading text is empty (e.g. a `#` heading
+        // with no following text). We deliberately keep `Some("")` in the
+        // stack so deeper headings still nest under their implicit slot,
+        // but the path itself filters empties out so child blocks don't
+        // get a `""` segment polluting their ancestry.
         self.heading_stack
             .iter()
-            .filter_map(|s| s.clone())
+            .filter_map(|s| s.clone().filter(|t| !t.is_empty()))
             .collect()
     }
 
@@ -685,11 +690,13 @@ impl<'a> WalkState<'a> {
 
                     // The heading_path on the heading block ITSELF excludes
                     // the heading's own text (it's the path of ancestors).
+                    // Empty heading texts are skipped so they don't create
+                    // a `""` segment in the path (matches `heading_path()`).
                     let path = self
                         .heading_stack
                         .iter()
                         .take((level_to_use - 1) as usize)
-                        .filter_map(|s| s.clone())
+                        .filter_map(|s| s.clone().filter(|t| !t.is_empty()))
                         .collect();
 
                     let block = ParsedBlock {
@@ -1092,6 +1099,59 @@ mod tests {
         assert_eq!(p2.heading_path, vec!["B".to_string()]);
     }
 
+    // ---- empty heading edge cases -------------------------------------------
+
+    #[test]
+    fn empty_heading_does_not_pollute_path() {
+        // `# ` with no text used to seed the heading_stack with `Some("")`,
+        // which then leaked into every child block's `heading_path` as a
+        // `""` segment. Now empty entries are filtered from the path.
+        let body = "# \n# Real H1\n## Sub\nbody\n";
+        let (blocks, _) = parse(body, 1);
+        // body is the last block; verify its heading_path.
+        let para = blocks
+            .iter()
+            .find(|b| matches!(b.payload, ParsedPayload::Paragraph { .. }))
+            .expect("paragraph present");
+        assert_eq!(
+            para.heading_path,
+            vec!["Real H1".to_string(), "Sub".to_string()],
+            "empty heading should not appear in path; got {:?}",
+            para.heading_path
+        );
+    }
+
+    #[test]
+    fn empty_h1_then_h2_does_not_break_stack() {
+        // An empty H1 overwrites the H1 slot with `Some("")` (so a later
+        // H2 is still treated as positioned at level 2), but the path
+        // filter drops the empty entry. So Inner's path is `[]` not `[""]`,
+        // and the body's path is `["Inner"]` — neither carries a `""` and
+        // the parser doesn't panic or skip blocks.
+        let body = "# Outer\n\n# \n\n## Inner\nbody\n";
+        let (blocks, _) = parse(body, 1);
+        let inner = blocks
+            .iter()
+            .find(|b| {
+                matches!(b.payload, ParsedPayload::Heading { ref text, .. } if text == "Inner")
+            })
+            .expect("Inner heading present");
+        assert_eq!(
+            inner.heading_path,
+            Vec::<String>::new(),
+            "empty H1 leaves an empty slot, filtered from the path"
+        );
+        let para = blocks
+            .iter()
+            .find(|b| matches!(b.payload, ParsedPayload::Paragraph { .. }))
+            .expect("paragraph present");
+        assert_eq!(
+            para.heading_path,
+            vec!["Inner".to_string()],
+            "body's path drops the empty H1 between root and Inner"
+        );
+    }
+
     // ---- code blocks ---------------------------------------------------------
 
     #[test]
-- 
2.49.1


From 73040cab30f4db23d428262c6cd26c776425a6b4 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Thu, 30 Apr 2026 14:40:04 +0000
Subject: [PATCH 07/10] p1-3: capture image refs from pulldown-cmark Tag::Image
 events

The previous block-level image detector scanned paragraph source bytes
for the literal `![alt](src)` shape. That was fragile in three ways:

- `![alt](src "title")` leaked the title into `src` (`src "title"`)
- `![alt](<https://x.com/a b>)` kept the angle brackets verbatim
- `![]()` had undefined behavior

Replace the byte-scan with state on `Frame::Paragraph` that observes
the actual `Tag::Image` events from pulldown-cmark:

- `image_count` increments on each `Start(Tag::Image)` and `image_src`
  captures `dest_url` (which already strips angle brackets and excludes
  the title).
- Text events seen while `image_depth > 0` are routed into `image_alt`
  and suppressed from the inline buffer.
- Strong/Emph/Link starts and any non-image text outside the image
  flag `non_image_text_seen`.

At `End(Paragraph)`, the paragraph is lifted to `ImageRef` iff
`image_count == 1 && !non_image_text_seen`. The byte-scanner
`match_block_image` is removed.

New tests:
- image_with_title_attribute (title dropped, no leak into src)
- image_with_angle_bracketed_url (brackets stripped)
- empty_image_alt_and_src (`![]()` pins to empty/empty)

Existing image tests (`image_ref_block_captures_src_and_alt`,
`inline_image_inside_paragraph_is_dropped`) continue to pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/kb-parse-md/src/blocks.rs | 232 ++++++++++++++++++++++++-------
 1 file changed, 185 insertions(+), 47 deletions(-)

diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs
index cf005d6..f080d0c 100644
--- a/crates/kb-parse-md/src/blocks.rs
+++ b/crates/kb-parse-md/src/blocks.rs
@@ -213,6 +213,20 @@ enum Frame {
     Paragraph {
         range: Range<usize>,
         inlines: InlineBuf,
+        /// Block-level image detection: tracks the single-image-only
+        /// signature `![alt](src)` as a paragraph's *entire* content.
+        ///
+        /// When `Tag::Image` opens, `image_depth` is bumped (>0 ⇒ alt-text
+        /// accumulates into `image_alt` and is suppressed from `inlines`).
+        /// `image_count` records how many distinct images we've seen and
+        /// `non_image_text_seen` flags any other inline content. At
+        /// `End(Paragraph)` the paragraph is lifted to `ImageRef` iff
+        /// `image_count == 1 && !non_image_text_seen`.
+        image_depth: u32,
+        image_count: u32,
+        non_image_text_seen: bool,
+        image_src: Option<String>,
+        image_alt: String,
     },
     Quote {
         range: Range<usize>,
@@ -583,6 +597,11 @@ impl<'a> WalkState<'a> {
                 self.frames.push(Frame::Paragraph {
                     range,
                     inlines: InlineBuf::new(),
+                    image_depth: 0,
+                    image_count: 0,
+                    non_image_text_seen: false,
+                    image_src: None,
+                    image_alt: String::new(),
                 });
             }
             Event::Start(Tag::BlockQuote(_)) => {
@@ -656,18 +675,40 @@ impl<'a> WalkState<'a> {
                 }
             }
             Event::Start(Tag::Strong) => {
+                self.flag_non_image_in_paragraph();
                 self.with_current_inlines(|buf| buf.open_strong());
             }
             Event::Start(Tag::Emphasis) => {
+                self.flag_non_image_in_paragraph();
                 self.with_current_inlines(|buf| buf.open_emph());
             }
             Event::Start(Tag::Link { dest_url, .. }) => {
+                self.flag_non_image_in_paragraph();
                 let href = dest_url.into_string();
                 self.with_current_inlines(|buf| buf.open_link(href));
             }
-            // Block-level image is handled at End — see TagEnd::Image.
-            Event::Start(Tag::Image { .. }) => {
-                // No-op at start; we capture src/title at End.
+            Event::Start(Tag::Image { dest_url, .. }) => {
+                // If we're inside a paragraph, this image becomes a
+                // candidate for block-level lifting. Record its src and
+                // start accumulating the alt text from the upcoming Text
+                // events.
+                if let Some(Frame::Paragraph {
+                    image_depth,
+                    image_count,
+                    image_src,
+                    image_alt,
+                    ..
+                }) = self.frames.last_mut()
+                {
+                    *image_depth += 1;
+                    if *image_count == 0 {
+                        *image_src = Some(dest_url.into_string());
+                        image_alt.clear();
+                    }
+                    *image_count += 1;
+                }
+                // Outside a paragraph (e.g. inside a list item, heading,
+                // table cell): inline images are dropped silently per §3.4.
             }
 
             // ---- Container ends -------------------------------------------------
@@ -710,26 +751,43 @@ impl<'a> WalkState<'a> {
             }
             Event::End(TagEnd::Paragraph) => {
                 if matches!(self.frames.last(), Some(Frame::Paragraph { .. })) {
-                    if let Some(Frame::Paragraph { range, inlines }) = self.frames.pop() {
-                        let (inline_vec, text) = inlines.finish();
-                        // Block-level image: a paragraph whose only content
-                        // is `![alt](src)` becomes ImageRef. Detect by
-                        // scanning the original source for the canonical
-                        // shape.
-                        if let Some((alt, src)) = match_block_image(self.body, &range) {
+                    if let Some(Frame::Paragraph {
+                        range,
+                        inlines,
+                        image_count,
+                        non_image_text_seen,
+                        image_src,
+                        image_alt,
+                        ..
+                    }) = self.frames.pop()
+                    {
+                        // Block-level image lift: paragraph whose only
+                        // content is exactly one `![alt](src)`. Source
+                        // (with optional title), alt, and angle-bracket
+                        // wrapping are all captured by pulldown-cmark from
+                        // the `Tag::Image` event itself, so the title is
+                        // dropped and angle brackets are stripped without
+                        // any byte-level scanning.
+                        if image_count == 1 && !non_image_text_seen {
+                            let span = self.span_for(&range);
                             let block = ParsedBlock {
                                 kind: ParsedBlockKind::ImageRef,
                                 heading_path: self.heading_path(),
-                                source_span: self.span_for(&range),
-                                payload: ParsedPayload::ImageRef { src, alt },
+                                source_span: span,
+                                payload: ParsedPayload::ImageRef {
+                                    src: image_src.unwrap_or_default(),
+                                    alt: image_alt,
+                                },
                             };
                             self.emit_block(block);
                             return;
                         }
+                        let (inline_vec, text) = inlines.finish();
+                        let span = self.span_for(&range);
                         let block = ParsedBlock {
                             kind: ParsedBlockKind::Paragraph,
                             heading_path: self.heading_path(),
-                            source_span: self.span_for(&range),
+                            source_span: span,
                             payload: ParsedPayload::Paragraph { text, inlines: inline_vec },
                         };
                         self.emit_block(block);
@@ -932,8 +990,11 @@ impl<'a> WalkState<'a> {
                 self.with_current_inlines(|buf| buf.close_link());
             }
             Event::End(TagEnd::Image) => {
-                // Inline images are dropped silently. Block-level image refs
-                // are detected at paragraph End, not here.
+                if let Some(Frame::Paragraph { image_depth, .. }) = self.frames.last_mut() {
+                    if *image_depth > 0 {
+                        *image_depth -= 1;
+                    }
+                }
             }
 
             // ---- Leaf events -----------------------------------------------------
@@ -954,6 +1015,32 @@ impl<'a> WalkState<'a> {
                     current_cell.push_str(&s);
                     return;
                 }
+                // If this text is inside a `Tag::Image` opened inside a
+                // paragraph, route it to the image's alt accumulator and
+                // suppress it from the inline buffer (so a paragraph that
+                // is *only* an image doesn't carry the alt as visible
+                // inline text in the fallback case either).
+                if let Some(Frame::Paragraph {
+                    image_depth,
+                    image_alt,
+                    ..
+                }) = self.frames.last_mut()
+                {
+                    if *image_depth > 0 {
+                        image_alt.push_str(&s);
+                        return;
+                    }
+                }
+                // Otherwise: visible non-image content.
+                if let Some(Frame::Paragraph {
+                    non_image_text_seen,
+                    ..
+                }) = self.frames.last_mut()
+                {
+                    if !s.is_empty() {
+                        *non_image_text_seen = true;
+                    }
+                }
                 let owned = s.into_string();
                 self.with_current_inlines(|buf| {
                     buf.push_text(&owned);
@@ -965,6 +1052,19 @@ impl<'a> WalkState<'a> {
                     current_cell.push_str(&s);
                     return;
                 }
+                if let Some(Frame::Paragraph {
+                    non_image_text_seen,
+                    image_depth,
+                    ..
+                }) = self.frames.last_mut()
+                {
+                    // Code inside an image's alt — extremely rare but pin
+                    // behavior: count as visible non-image content so the
+                    // paragraph isn't lifted to ImageRef.
+                    if *image_depth == 0 {
+                        *non_image_text_seen = true;
+                    }
+                }
                 let owned = s.into_string();
                 self.with_current_inlines(|buf| {
                     buf.push_code(&owned);
@@ -988,6 +1088,22 @@ impl<'a> WalkState<'a> {
         }
     }
 
+    /// If the top frame is an open paragraph that hasn't yet escaped the
+    /// "single image only" signature, mark it as containing visible
+    /// non-image content so it won't be lifted to ImageRef at End.
+    fn flag_non_image_in_paragraph(&mut self) {
+        if let Some(Frame::Paragraph {
+            non_image_text_seen,
+            image_depth,
+            ..
+        }) = self.frames.last_mut()
+        {
+            if *image_depth == 0 {
+                *non_image_text_seen = true;
+            }
+        }
+    }
+
     /// Run `f` on whichever inline accumulator is open at the top of the
     /// frame stack. No-op if no inline-accepting frame is open.
     fn with_current_inlines<F: FnOnce(&mut InlineBuf)>(&mut self, f: F) {
@@ -1017,38 +1133,6 @@ fn heading_level_to_u8(level: HeadingLevel) -> u8 {
     }
 }
 
-/// Detect a paragraph whose entire trimmed source is `![alt](src)` — the
-/// canonical "block-level image" shape. Returns `(alt, src)` if so. We do
-/// this by scanning the original bytes (not the inline events) so it stays
-/// robust to pulldown-cmark's internal representation of images.
-fn match_block_image(body: &[u8], range: &Range<usize>) -> Option<(String, String)> {
-    let slice = body.get(range.clone())?;
-    let s = std::str::from_utf8(slice).ok()?.trim();
-    if !s.starts_with("![") {
-        return None;
-    }
-    // Find the closing `]` of the alt text. Markdown does not allow nested
-    // brackets without escaping — for a block-level image we only handle the
-    // simple form. Anything else falls through to ordinary paragraph parsing.
-    let close_bracket = s.find("](")?;
-    let alt = &s[2..close_bracket];
-    // The rest must be `(SRC)` and nothing else.
-    let after = &s[close_bracket + 2..];
-    let close_paren = after.rfind(')')?;
-    if close_paren != after.len() - 1 {
-        return None;
-    }
-    let src = &after[..close_paren];
-    // Reject if alt or src contain a newline — that means the paragraph has
-    // more content beyond the image and isn't a pure block-level image.
-    if alt.contains('\n') || src.contains('\n') {
-        return None;
-    }
-    // Reject brackets/parens inside src — tolerated by CommonMark via
-    // angle-bracket-wrap, but we keep this conservative for now.
-    Some((alt.to_string(), src.to_string()))
-}
-
 // ---------------------------------------------------------------------------
 // Tests
 // ---------------------------------------------------------------------------
@@ -1309,6 +1393,60 @@ mod tests {
         assert_eq!(blocks[0].kind, ParsedBlockKind::ImageRef);
     }
 
+    #[test]
+    fn image_with_title_attribute() {
+        // Source includes a title, but pulldown-cmark exposes it
+        // separately on `Tag::Image`; we ignore the title — only `src`
+        // and `alt` survive. Previously the byte-scanner pulled
+        // `src "title"` into `src`.
+        let body = "![alt](src.png \"title\")\n";
+        let (blocks, _) = parse(body, 1);
+        assert_eq!(blocks.len(), 1);
+        match &blocks[0].payload {
+            ParsedPayload::ImageRef { src, alt } => {
+                assert_eq!(src, "src.png");
+                assert_eq!(alt, "alt");
+            }
+            _ => panic!("expected image ref, got {:?}", blocks[0].payload),
+        }
+    }
+
+    #[test]
+    fn image_with_angle_bracketed_url() {
+        // `<…>` wrapping is a CommonMark feature for URLs containing
+        // spaces. pulldown-cmark strips the angle brackets and decodes
+        // the URL; we should reflect that.
+        let body = "![alt](<https://x.com/a b>)\n";
+        let (blocks, _) = parse(body, 1);
+        assert_eq!(blocks.len(), 1);
+        match &blocks[0].payload {
+            ParsedPayload::ImageRef { src, alt } => {
+                assert_eq!(
+                    src, "https://x.com/a b",
+                    "angle brackets should be stripped"
+                );
+                assert_eq!(alt, "alt");
+            }
+            _ => panic!("expected image ref, got {:?}", blocks[0].payload),
+        }
+    }
+
+    #[test]
+    fn empty_image_alt_and_src() {
+        // Pin behavior on the degenerate `![]()` shape. Both fields are
+        // empty strings; the block is still classified as ImageRef.
+        let body = "![]()\n";
+        let (blocks, _) = parse(body, 1);
+        assert_eq!(blocks.len(), 1);
+        match &blocks[0].payload {
+            ParsedPayload::ImageRef { src, alt } => {
+                assert_eq!(src, "");
+                assert_eq!(alt, "");
+            }
+            _ => panic!("expected image ref, got {:?}", blocks[0].payload),
+        }
+    }
+
     #[test]
     fn inline_image_inside_paragraph_is_dropped() {
         // The image is part of a longer paragraph → not a block-level image.
-- 
2.49.1


From 23ff4d68af4a45c1db7ca3fcf1b2b49a5d5fa415 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Thu, 30 Apr 2026 14:40:42 +0000
Subject: [PATCH 08/10] p1-3: preserve whitespace in link text across
 SoftBreak/HardBreak
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`[multi\nline](http://x)` produced `Inline::Link.text = "multiline"`
because the SoftBreak/HardBreak handler called `push_text(" ")` —
which updates `paragraph.text` and the inline buffer, but NOT the
open link frame's flattened text accumulator. Text events flowed
through `push_link_text`; line breaks didn't.

Add `push_link_text(" ")` alongside the existing `push_text(" ")` in
the break handler so a line break inside `[ ... ](href)` collapses
to a visible space rather than disappearing.

New tests:
- link_with_soft_break_preserves_space_in_text
- link_with_hard_break_preserves_space_in_text

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/kb-parse-md/src/blocks.rs | 57 +++++++++++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs
index f080d0c..7c1a0ed 100644
--- a/crates/kb-parse-md/src/blocks.rs
+++ b/crates/kb-parse-md/src/blocks.rs
@@ -1080,7 +1080,15 @@ impl<'a> WalkState<'a> {
                     current_cell.push(' ');
                     return;
                 }
-                self.with_current_inlines(|buf| buf.push_text(" "));
+                // Update both `paragraph.text` (via push_text) and the
+                // open link's flattened text accumulator (via
+                // push_link_text). Without push_link_text here, a
+                // multi-line `[text\nmore](href)` collapses to "textmore"
+                // — losing the visible space between words.
+                self.with_current_inlines(|buf| {
+                    buf.push_text(" ");
+                    buf.push_link_text(" ");
+                });
             }
             // Everything else (HTML, footnote refs, task list markers, math,
             // rules, etc.) is dropped silently per design §3.4.
@@ -1639,6 +1647,53 @@ mod tests {
 
     // ---- inline filter -------------------------------------------------------
 
+    #[test]
+    fn link_with_soft_break_preserves_space_in_text() {
+        // Without the push_link_text fix, this collapses to "multiline".
+        let body = "[multi\nline](http://x)\n";
+        let (blocks, _) = parse(body, 1);
+        assert_eq!(blocks.len(), 1);
+        match &blocks[0].payload {
+            ParsedPayload::Paragraph { inlines, .. } => {
+                let link = inlines
+                    .iter()
+                    .find(|i| matches!(i, Inline::Link { .. }))
+                    .expect("link present");
+                match link {
+                    Inline::Link { text, href } => {
+                        assert_eq!(text, "multi line");
+                        assert_eq!(href, "http://x");
+                    }
+                    _ => unreachable!(),
+                }
+            }
+            _ => panic!("expected paragraph, got {:?}", blocks[0].payload),
+        }
+    }
+
+    #[test]
+    fn link_with_hard_break_preserves_space_in_text() {
+        // Two trailing spaces + newline = HardBreak in CommonMark.
+        let body = "[multi  \nline](http://x)\n";
+        let (blocks, _) = parse(body, 1);
+        assert_eq!(blocks.len(), 1);
+        match &blocks[0].payload {
+            ParsedPayload::Paragraph { inlines, .. } => {
+                let link = inlines
+                    .iter()
+                    .find(|i| matches!(i, Inline::Link { .. }))
+                    .expect("link present");
+                match link {
+                    Inline::Link { text, .. } => {
+                        assert_eq!(text, "multi line");
+                    }
+                    _ => unreachable!(),
+                }
+            }
+            _ => panic!("expected paragraph"),
+        }
+    }
+
     #[test]
     fn only_allowed_inlines_emitted() {
         let body = "**bold** *em* `code` [link](u)\n";
-- 
2.49.1


From 2b6d9abc0fad021a5df00c9c8c165d6be67a2364 Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Thu, 30 Apr 2026 14:41:47 +0000
Subject: [PATCH 09/10] p1-3: doc-comment + test pin Quote drops non-text
 children
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`ParsedPayload::Quote { text, inlines }` cannot represent block-level
children (lists, code, tables, images), so the BlockQuote end handler
silently drops them when assembling the Quote payload. This matches
§3.4 for now but is non-obvious and easy to regress without an
explicit pin.

Add a TODO(P1-future) comment near the Quote emission code and a
regression test (`quote_with_list_inside_drops_list`) that fixes the
current shape: a `> - item` blockquote produces a Quote with empty
text and empty inlines.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/kb-parse-md/src/blocks.rs | 35 ++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs
index 7c1a0ed..a4d2ee4 100644
--- a/crates/kb-parse-md/src/blocks.rs
+++ b/crates/kb-parse-md/src/blocks.rs
@@ -797,6 +797,15 @@ impl<'a> WalkState<'a> {
             Event::End(TagEnd::BlockQuote(_)) => {
                 if let Some(Frame::Quote { range, children }) = self.frames.pop() {
                     // Concatenate child text for the Quote payload.
+                    //
+                    // TODO(P1-future): `ParsedPayload::Quote { text, inlines }`
+                    // cannot represent block-level children structurally
+                    // (lists, code, tables, images), so they are silently
+                    // dropped here. Only Paragraph / Quote / Heading text
+                    // contributes to the assembled `text`. This matches
+                    // §3.4 for now but should be revisited when Quote
+                    // grows a `children: Vec<ParsedBlock>` field — see
+                    // pin test `quote_with_list_inside_drops_list`.
                     let mut text = String::new();
                     let mut inlines: Vec<Inline> = Vec::new();
                     for c in &children {
@@ -1503,6 +1512,32 @@ mod tests {
         }
     }
 
+    // ---- quote child handling (pinned) --------------------------------------
+
+    /// Pins current behavior: a list inside a blockquote contributes nothing
+    /// to the Quote's text/inlines because `ParsedPayload::Quote` cannot
+    /// represent non-text/non-paragraph block children. The list itself is
+    /// also not retained as a separate block — it lives only in the popped
+    /// `Frame::Quote.children` and is dropped. Tracked in TODO(P1-future)
+    /// near the Quote emission code.
+    #[test]
+    fn quote_with_list_inside_drops_list() {
+        let body = "> - a\n> - b\n";
+        let (blocks, _) = parse(body, 1);
+        // Exactly one Quote block, with empty text/inlines.
+        assert_eq!(blocks.len(), 1);
+        match &blocks[0].payload {
+            ParsedPayload::Quote { text, inlines } => {
+                assert_eq!(text, "", "list content should not appear in quote text");
+                assert!(
+                    inlines.is_empty(),
+                    "list content should not appear in quote inlines"
+                );
+            }
+            _ => panic!("expected quote, got {:?}", blocks[0].payload),
+        }
+    }
+
     // ---- block content inside list items ------------------------------------
 
     #[test]
-- 
2.49.1


From 80123e9e27910ea076ec8cda1d488a2fb7955f4b Mon Sep 17 00:00:00 2001
From: altair823 <dlsrks0734@gmail.com>
Date: Thu, 30 Apr 2026 14:42:21 +0000
Subject: [PATCH 10/10] p1-3: pin reviewer probe inputs as regression tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The quality reviewer named three specific input probes for the C1/C2/
C3 fixes. Encode each as a verbatim test so future regressions on
those exact inputs surface immediately:

- probe_overflow: parse_blocks(b"# h\nbody\n", u32::MAX) → empty +
  Warning::ExtractFailed.
- probe_list_escape: list with embedded code block → single List
  block, two items.
- probe_empty_heading: `# \n# Real\nbody\n` → body's heading_path is
  `["Real"]`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/kb-parse-md/src/blocks.rs | 34 ++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/crates/kb-parse-md/src/blocks.rs b/crates/kb-parse-md/src/blocks.rs
index a4d2ee4..93bdc17 100644
--- a/crates/kb-parse-md/src/blocks.rs
+++ b/crates/kb-parse-md/src/blocks.rs
@@ -1751,3 +1751,37 @@ mod tests {
         }
     }
 }
+
+#[cfg(test)]
+mod reviewer_probes {
+    use super::*;
+
+    #[test]
+    fn probe_overflow() {
+        let (b, w) = parse_blocks(b"# h\nbody\n", u32::MAX).unwrap();
+        assert!(b.is_empty(), "expected empty blocks, got {}", b.len());
+        assert_eq!(w.len(), 1);
+        assert_eq!(w[0].kind, WarningKind::ExtractFailed);
+    }
+
+    #[test]
+    fn probe_list_escape() {
+        let body = b"- item\n\n  \x60\x60\x60rust\n  fn f(){}\n  \x60\x60\x60\n\n- next\n";
+        let (b, _) = parse_blocks(body, 0).unwrap();
+        assert_eq!(b.len(), 1, "expected single list block, got {}", b.len());
+        match &b[0].payload {
+            ParsedPayload::List { items, .. } => assert_eq!(items.len(), 2),
+            _ => panic!("expected list, got {:?}", b[0].payload),
+        }
+    }
+
+    #[test]
+    fn probe_empty_heading() {
+        let (b, _) = parse_blocks(b"# \n# Real\nbody\n", 0).unwrap();
+        let para = b
+            .iter()
+            .find(|x| matches!(x.payload, ParsedPayload::Paragraph { .. }))
+            .expect("paragraph present");
+        assert_eq!(para.heading_path, vec!["Real".to_string()]);
+    }
+}
-- 
2.49.1