p0-1: kb-parse-types thin parser-intermediate crate

Adds the kb-parse-types crate per design §3.7b. Depends only on kb-core
+ serde/thiserror — never on parser libraries. Defines:

- ParsedBlock + ParsedBlockKind + ParsedPayload (8 variants matching
  Block variants in kb-core).
- Warning + WarningKind for parser diagnostics.
- Forward-declared ParsedImageRegion / ParsedPdfPage / ParsedAudioSegment
  shells for P6/P7/P8.

`cargo tree -p kb-parse-types` shows only kb-core, serde, and thiserror.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-30 05:16:50 +00:00
parent f86df99fe9
commit 030986b37c
2 changed files with 111 additions and 0 deletions

View File

@@ -0,0 +1,13 @@
[package]
name = "kb-parse-types"
# Shared package metadata is inherited from the workspace manifest.
version.workspace = true
edition.workspace = true
rust-version.workspace = true
license.workspace = true
repository.workspace = true
description = "Parser intermediate representations (no parser libs allowed)"

# Dependency policy (see `description`): no parser libraries may be added here.
# NOTE(review): `thiserror` is declared, but no error type in the visible
# `lib.rs` uses it yet — confirm it is still needed.
[dependencies]
kb-core = { path = "../kb-core" }
serde.workspace = true
thiserror.workspace = true

View File

@@ -0,0 +1,98 @@
//! `kb-parse-types` — parser intermediate representations (§3.7b).
//!
//! Of the workspace crates, depends ONLY on `kb-core`; the only external
//! dependencies are `serde` and `thiserror`. Must NOT depend on any parser
//! library (`pulldown-cmark`, `pdf-extract`, `image`, `whisper-rs`, …) and
//! must NOT depend on any other `kb-*` crate.
use serde::{Deserialize, Serialize};
/// Parser intermediate representation of a single document block.
///
/// `kind` and `payload` are stored as independent fields; nothing in this
/// type enforces that they name the same variant, so producers must keep
/// them in agreement.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ParsedBlock {
    /// Payload-free discriminant for this block.
    pub kind: ParsedBlockKind,
    /// NOTE(review): presumably the titles of the headings enclosing this
    /// block — confirm ordering and contents against the parsers.
    pub heading_path: Vec<String>,
    /// Source location of the block, expressed as a `kb_core::SourceSpan`.
    pub source_span: kb_core::SourceSpan,
    /// Variant-specific content of the block.
    pub payload: ParsedPayload,
}
/// Payload-free discriminant for a parsed block.
///
/// Serialized with `rename_all = "lowercase"`, which lowercases the variant
/// name without inserting separators: `ImageRef` is written as `"imageref"`
/// and `AudioRef` as `"audioref"`.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ParsedBlockKind {
    Heading,
    Paragraph,
    List,
    Code,
    Table,
    Quote,
    ImageRef,
    AudioRef,
}

/// Variant-specific content of a parsed block.
///
/// Serialized as an internally tagged enum: every value is a map whose
/// `"kind"` entry holds the lowercased variant name (for example
/// `"heading"` or `"imageref"`), alongside the variant's own fields.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase", tag = "kind")]
pub enum ParsedPayload {
    /// A heading with its nesting `level` and its `text`.
    Heading {
        level: u8,
        text: String,
    },
    /// A paragraph.
    ///
    /// NOTE(review): `text` is presumably the plain-text form of `inlines` —
    /// confirm against the parsers.
    Paragraph {
        text: String,
        inlines: Vec<kb_core::Inline>,
    },
    /// A list; `ordered` distinguishes ordered from unordered lists and each
    /// entry of `items` holds the inline content of one item.
    List {
        ordered: bool,
        items: Vec<Vec<kb_core::Inline>>,
    },
    /// A code block with an optional language tag.
    Code {
        lang: Option<String>,
        code: String,
    },
    /// A table given as header cells plus rows of cells.
    Table {
        headers: Vec<String>,
        rows: Vec<Vec<String>>,
    },
    /// A quotation; carries the same fields as [`ParsedPayload::Paragraph`].
    Quote {
        text: String,
        inlines: Vec<kb_core::Inline>,
    },
    /// A reference to an image, with its source and alternative text.
    ImageRef {
        src: String,
        alt: String,
    },
    /// A reference to an audio resource.
    ///
    /// This variant has no `duration_ms` field: `duration_ms` is filled in
    /// by the extractor before chunking — see design §3.7b.
    AudioRef {
        src: String,
    },
}

impl ParsedPayload {
    /// Returns the [`ParsedBlockKind`] corresponding to this payload's variant.
    ///
    /// A parsed block carries a `kind` next to its `payload`; computing the
    /// former from the latter gives producers and tests a single source of
    /// truth instead of two hand-maintained values that can drift apart.
    #[must_use]
    pub fn kind(&self) -> ParsedBlockKind {
        // Exhaustive on purpose: adding a payload variant must fail to
        // compile here until its kind is mapped.
        match self {
            Self::Heading { .. } => ParsedBlockKind::Heading,
            Self::Paragraph { .. } => ParsedBlockKind::Paragraph,
            Self::List { .. } => ParsedBlockKind::List,
            Self::Code { .. } => ParsedBlockKind::Code,
            Self::Table { .. } => ParsedBlockKind::Table,
            Self::Quote { .. } => ParsedBlockKind::Quote,
            Self::ImageRef { .. } => ParsedBlockKind::ImageRef,
            Self::AudioRef { .. } => ParsedBlockKind::AudioRef,
        }
    }
}
/// A parser diagnostic: a category plus accompanying text.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Warning {
    /// Category of the diagnostic.
    pub kind: WarningKind,
    /// Free-form text attached to the diagnostic.
    pub note: String,
}
/// Category of a parser diagnostic.
///
/// Serialized with `rename_all = "snake_case"`, so each variant is written
/// as the snake_case form of its name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum WarningKind {
    /// Serialized as `"malformed_frontmatter"`.
    MalformedFrontmatter,
    /// Serialized as `"malformed_table"`.
    MalformedTable,
    /// Serialized as `"encoding_fallback"`.
    EncodingFallback,
    /// Serialized as `"extract_failed"`.
    ExtractFailed,
}
// Forward-declared (P6/P7/P8). Bodies stay minimal for now.

/// Placeholder for a parsed image region; currently a field-less unit struct.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
pub struct ParsedImageRegion;
/// Minimal shell for the text of one PDF page.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ParsedPdfPage {
    /// Page number.
    /// NOTE(review): whether this is 0- or 1-based is not visible here — confirm.
    pub page: u32,
    /// Text associated with the page.
    pub text: String,
}
/// Minimal shell for one segment of audio and its text.
///
/// NOTE(review): `start_ms` / `end_ms` are presumably millisecond offsets
/// from the start of the audio, with `start_ms <= end_ms` — neither is
/// enforced by this type; confirm with the producer.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ParsedAudioSegment {
    /// Start of the segment.
    pub start_ms: u64,
    /// End of the segment.
    pub end_ms: u64,
    /// Text associated with the segment.
    pub text: String,
}