diff --git a/crates/kb-parse-types/Cargo.toml b/crates/kb-parse-types/Cargo.toml
new file mode 100644
index 0000000..b0612a9
--- /dev/null
+++ b/crates/kb-parse-types/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "kb-parse-types"
+version = { workspace = true }
+edition = { workspace = true }
+rust-version = { workspace = true }
+license = { workspace = true }
+repository = { workspace = true }
+description = "Parser intermediate representations (no parser libs allowed)"
+
+[dependencies]
+kb-core = { path = "../kb-core" }
+serde = { workspace = true }
+thiserror = { workspace = true }
diff --git a/crates/kb-parse-types/src/lib.rs b/crates/kb-parse-types/src/lib.rs
new file mode 100644
index 0000000..e09016f
--- /dev/null
+++ b/crates/kb-parse-types/src/lib.rs
@@ -0,0 +1,139 @@
+//! `kb-parse-types` — parser intermediate representations (§3.7b).
+//!
+//! Among workspace crates this depends ONLY on `kb-core`; it must NOT depend
+//! on any other `kb-*` crate. It must also NOT depend on any parser library
+//! (`pulldown-cmark`, `pdf-extract`, `image`, `whisper-rs`, …).
+//!
+//! The manifest also declares `serde` (used by the derives below) and
+//! `thiserror`, which nothing in this file references yet.
+
+use serde::{Deserialize, Serialize};
+
+// NOTE(review): several field types below read as bare `Vec`, `Vec>` or
+// `Option`. Their generic arguments appear to have been lost when this patch
+// was extracted; restore them from the original change before applying.
+
+/// One parsed block: a coarse kind, its heading path, its source span, and
+/// the kind-specific payload.
+///
+/// `kind` and `payload` are independent public fields; nothing in this type
+/// forces them to agree.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct ParsedBlock {
+    /// Coarse category of the block.
+    pub kind: ParsedBlockKind,
+    /// Heading path of the block (presumably the enclosing headings — TODO confirm).
+    pub heading_path: Vec,
+    /// Span in the source, as defined by `kb-core`.
+    pub source_span: kb_core::SourceSpan,
+    /// Kind-specific content.
+    pub payload: ParsedPayload,
+}
+
+/// Payload-free discriminant for a [`ParsedBlock`].
+///
+/// Serialized in lowercase with no separator, so `ImageRef` becomes
+/// `"imageref"` and `AudioRef` becomes `"audioref"`.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase")]
+pub enum ParsedBlockKind {
+    Heading,
+    Paragraph,
+    List,
+    Code,
+    Table,
+    Quote,
+    ImageRef,
+    AudioRef,
+}
+
+/// Kind-specific content of a [`ParsedBlock`].
+///
+/// Internally tagged: the variant name is written in lowercase under a
+/// `kind` key alongside the variant's own fields.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase", tag = "kind")]
+pub enum ParsedPayload {
+    /// Heading with its `level` and `text`.
+    Heading {
+        level: u8,
+        text: String,
+    },
+    /// Paragraph `text` plus its `inlines`.
+    Paragraph {
+        text: String,
+        inlines: Vec,
+    },
+    /// List with an `ordered` flag and its `items`.
+    List {
+        ordered: bool,
+        items: Vec>,
+    },
+    /// Code block with an optional `lang` and the `code` itself.
+    Code {
+        lang: Option,
+        code: String,
+    },
+    /// Table given as `headers` and `rows`.
+    Table {
+        headers: Vec,
+        rows: Vec>,
+    },
+    /// Quoted `text` plus its `inlines`.
+    Quote {
+        text: String,
+        inlines: Vec,
+    },
+    /// Image reference: `src` and `alt` text.
+    ImageRef {
+        src: String,
+        alt: String,
+    },
+    /// Audio reference; carries only `src`.
+    ///
+    /// NOTE(review): the previous comment said `duration_ms` is filled in by
+    /// the extractor before chunking (design §3.7b), but no such field is
+    /// declared on this variant. Confirm whether the field is missing here.
+    AudioRef {
+        src: String,
+    },
+}
+
+/// Warning with a category and an accompanying note.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct Warning {
+    pub kind: WarningKind,
+    pub note: String,
+}
+
+/// Category of a [`Warning`], serialized in `snake_case`
+/// (e.g. `"malformed_frontmatter"`).
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum WarningKind {
+    MalformedFrontmatter,
+    MalformedTable,
+    EncodingFallback,
+    ExtractFailed,
+}
+
+// Forward-declared (P6/P7/P8). Bodies stay minimal for now.
+/// Placeholder for an image region; no fields yet.
+#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
+pub struct ParsedImageRegion;
+
+/// A `page` number together with the `text` recorded for it.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct ParsedPdfPage {
+    pub page: u32,
+    pub text: String,
+}
+
+/// Text for one audio segment with start and end offsets (the `_ms` suffix
+/// suggests milliseconds — TODO confirm).
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct ParsedAudioSegment {
+    pub start_ms: u64,
+    pub end_ms: u64,
+    pub text: String,
+}