From c0096ce44bbabbe21119e5e882cffae82c3e0e5d Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 15:16:53 +0000 Subject: [PATCH] p1-4: scaffold kb-normalize crate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the workspace member, `Cargo.toml` with the §8-allowed dep set (kb-core, kb-parse-types, kb-config, serde, serde_json_canonicalizer, blake3, unicode-normalization, time, anyhow, tracing) and a stubbed `build_canonical_document` that pins the public signature plus `doc_id` derivation. `kb-parse-md` is permitted only as a *dev*-dep so the integration snapshot test (added later in this series) can drive a fixture through the real parser without violating the production boundary — `cargo tree -p kb-normalize --depth 1 --edges normal` confirms no parser implementation appears in the regular dep tree. `id_for_doc` and `id_for_block` are re-exported from kb-core (which holds the canonical recipe per §4.2); kb-normalize is the canonical *entry point* per design §8. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 18 ++++++++++ Cargo.toml | 1 + crates/kb-normalize/Cargo.toml | 30 +++++++++++++++++ crates/kb-normalize/src/lib.rs | 61 ++++++++++++++++++++++++++++++++++ 4 files changed, 110 insertions(+) create mode 100644 crates/kb-normalize/Cargo.toml create mode 100644 crates/kb-normalize/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index ec3d4e7..62a0784 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -577,6 +577,24 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "kb-normalize" +version = "0.1.0" +dependencies = [ + "anyhow", + "blake3", + "kb-config", + "kb-core", + "kb-parse-md", + "kb-parse-types", + "serde", + "serde_json", + "serde_json_canonicalizer", + "time", + "tracing", + "unicode-normalization", +] + [[package]] name = "kb-parse-md" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 23fb992..b5d4b57 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ members = [ "crates/kb-config", "crates/kb-source-fs", "crates/kb-parse-md", + "crates/kb-normalize", "crates/kb-app", "crates/kb-cli", ] diff --git a/crates/kb-normalize/Cargo.toml b/crates/kb-normalize/Cargo.toml new file mode 100644 index 0000000..feaf5c7 --- /dev/null +++ b/crates/kb-normalize/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "kb-normalize" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "Lift parser output (kb-parse-types) into kb-core::CanonicalDocument with deterministic IDs (§3.4, §4.2, §4.3)" + +[dependencies] +kb-core = { path = "../kb-core" } +kb-parse-types = { path = "../kb-parse-types" } +kb-config = { path = "../kb-config" } +serde = { workspace = true } +serde_json = { workspace = true } +serde_json_canonicalizer = "0.3" +blake3 = { workspace = true } +unicode-normalization = "0.1" +time = { workspace = true } +anyhow = { workspace = true } +tracing = { 
workspace = true } + +[dev-dependencies] +# kb-parse-md is permitted as a *dev*-dependency only — used by the +# integration snapshot test to drive a fixture through the real parser. +# Forbidden as a regular dep per design §8 (kb-normalize must not depend +# on any specific parser). Verify with +# `cargo tree -p kb-normalize --depth 1 --edges normal` (excludes dev-deps). +kb-parse-md = { path = "../kb-parse-md" } +serde_json = { workspace = true } diff --git a/crates/kb-normalize/src/lib.rs b/crates/kb-normalize/src/lib.rs new file mode 100644 index 0000000..c71cbe6 --- /dev/null +++ b/crates/kb-normalize/src/lib.rs @@ -0,0 +1,61 @@ +//! `kb-normalize` — lift parser output (`kb-parse-types`) into a +//! [`kb_core::CanonicalDocument`] with deterministic IDs. +//! +//! Per design §3.4 (CanonicalDocument / Block), §4.2 (ID recipe), §4.3 +//! (ordinal rule), §3.6 (Provenance), §8 (module boundaries). +//! +//! Public surface: +//! +//! * [`build_canonical_document`] — assemble a `CanonicalDocument` from +//! `(RawAsset, Metadata, Vec<ParsedBlock>, ParserVersion, Vec<Warning>)`. +//! * [`id_for_doc`], [`id_for_block`] — re-exports of the canonical +//! ID-recipe functions in `kb-core::ids` (§4.2). `kb-core` is the only +//! implementation; `kb-normalize` is the canonical *entry point* per +//! design §8. +//! +//! This crate must NOT depend on any parser implementation crate +//! (`kb-parse-md`, `kb-parse-pdf`, …). All parser output flows in via +//! the shared `kb-parse-types` crate. + +use anyhow::Result; +use kb_core::{ + CanonicalDocument, Lang, Metadata, ParserVersion, Provenance, RawAsset, +}; +use kb_parse_types::{ParsedBlock, Warning}; + +pub use kb_core::{id_for_block, id_for_doc}; + +/// Build a [`CanonicalDocument`] from the raw asset, frontmatter +/// metadata, parser blocks, parser version, and any warnings. 
Full +/// behavior (block ID assignment, provenance, title/lang lift) is +/// filled in by subsequent commits in this series; this stub establishes +/// the public signature and the doc_id derivation only. +pub fn build_canonical_document( + asset: &RawAsset, + metadata: Metadata, + blocks: Vec<ParsedBlock>, + parser_version: &ParserVersion, + _warnings: Vec<Warning>, +) -> Result<CanonicalDocument> { + let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, parser_version); + Ok(CanonicalDocument { + doc_id, + source_asset_id: asset.asset_id.clone(), + workspace_path: asset.workspace_path.clone(), + title: String::new(), + lang: Lang(String::new()), + blocks: Vec::new(), + metadata, + provenance: Provenance { events: Vec::new() }, + parser_version: parser_version.clone(), + schema_version: 1, + doc_version: 1, + }) + .map(|d| { + // `blocks` is consumed but not yet lifted — flag it as live to + // satisfy the unused-binding lint until the next commit fills + // in the real lifting logic. + let _ = blocks; + d + }) +}