p1-4: scaffold kb-normalize crate

Add the workspace member, `Cargo.toml` with the §8-allowed dep set
(kb-core, kb-parse-types, kb-config, serde, serde_json_canonicalizer,
blake3, unicode-normalization, time, anyhow, tracing) and a stubbed
`build_canonical_document` that pins the public signature plus
`doc_id` derivation. `kb-parse-md` is permitted only as a *dev*-dep so
the integration snapshot test (added later in this series) can drive
a fixture through the real parser without violating the production
boundary — `cargo tree -p kb-normalize --depth 1 --edges normal`
confirms no parser implementation appears in the regular dep tree.

`id_for_doc` and `id_for_block` are re-exported from kb-core (which
holds the canonical recipe per §4.2); kb-normalize is the canonical
*entry point* per design §8.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-30 15:16:53 +00:00
parent cfccb3687d
commit c0096ce44b
4 changed files with 110 additions and 0 deletions

18
Cargo.lock generated
View File

@@ -577,6 +577,24 @@ dependencies = [
"unicode-normalization",
]
[[package]]
name = "kb-normalize"
version = "0.1.0"
dependencies = [
"anyhow",
"blake3",
"kb-config",
"kb-core",
"kb-parse-md",
"kb-parse-types",
"serde",
"serde_json",
"serde_json_canonicalizer",
"time",
"tracing",
"unicode-normalization",
]
[[package]]
name = "kb-parse-md"
version = "0.1.0"

View File

@@ -6,6 +6,7 @@ members = [
"crates/kb-config",
"crates/kb-source-fs",
"crates/kb-parse-md",
"crates/kb-normalize",
"crates/kb-app",
"crates/kb-cli",
]

View File

@@ -0,0 +1,30 @@
[package]
name = "kb-normalize"
version = { workspace = true }
edition = { workspace = true }
rust-version = { workspace = true }
license = { workspace = true }
repository = { workspace = true }
description = "Lift parser output (kb-parse-types) into kb-core::CanonicalDocument with deterministic IDs (§3.4, §4.2, §4.3)"
[dependencies]
kb-core = { path = "../kb-core" }
kb-parse-types = { path = "../kb-parse-types" }
kb-config = { path = "../kb-config" }
serde = { workspace = true }
serde_json = { workspace = true }
serde_json_canonicalizer = "0.3"
blake3 = { workspace = true }
unicode-normalization = "0.1"
time = { workspace = true }
anyhow = { workspace = true }
tracing = { workspace = true }
[dev-dependencies]
# kb-parse-md is permitted as a *dev*-dependency only — used by the
# integration snapshot test to drive a fixture through the real parser.
# Forbidden as a regular dep per design §8 (kb-normalize must not depend
# on any specific parser). Verify with:
#   cargo tree -p kb-normalize --depth 1 --edges normal
# `--edges normal` is required: by default `cargo tree` also lists
# dev-dependencies, so kb-parse-md would show up without it.
kb-parse-md = { path = "../kb-parse-md" }
serde_json = { workspace = true }

View File

@@ -0,0 +1,61 @@
//! `kb-normalize` — lift parser output (`kb-parse-types`) into a
//! [`kb_core::CanonicalDocument`] with deterministic IDs.
//!
//! Per design §3.4 (CanonicalDocument / Block), §4.2 (ID recipe), §4.3
//! (ordinal rule), §3.6 (Provenance), §8 (module boundaries).
//!
//! Public surface:
//!
//! * [`build_canonical_document`] — assemble a `CanonicalDocument` from
//! `(RawAsset, Metadata, Vec<ParsedBlock>, ParserVersion, Vec<Warning>)`.
//! * [`id_for_doc`], [`id_for_block`] — re-exports of the canonical
//! ID-recipe functions in `kb-core::ids` (§4.2). `kb-core` is the only
//! implementation; `kb-normalize` is the canonical *entry point* per
//! design §8.
//!
//! This crate must NOT depend on any parser implementation crate
//! (`kb-parse-md`, `kb-parse-pdf`, …). All parser output flows in via
//! the shared `kb-parse-types` crate.
use anyhow::Result;
use kb_core::{
CanonicalDocument, Lang, Metadata, ParserVersion, Provenance, RawAsset,
};
use kb_parse_types::{ParsedBlock, Warning};
pub use kb_core::{id_for_block, id_for_doc};
/// Assemble a [`CanonicalDocument`] for `asset` from parser output.
///
/// This is a scaffold that pins the public signature and the `doc_id`
/// derivation; block ID assignment, provenance, and the title/lang lift
/// are filled in by subsequent commits in this series. In the current
/// stub:
///
/// * `doc_id` comes from [`id_for_doc`] applied to the asset's
///   `workspace_path`, its `asset_id`, and `parser_version`.
/// * `source_asset_id`, `workspace_path`, and `parser_version` are
///   cloned from the inputs, and `metadata` is moved in unchanged.
/// * `title`, `lang`, `blocks`, and `provenance.events` are empty
///   placeholders.
/// * `schema_version` and `doc_version` are both hard-coded to `1`.
/// * `blocks` and `_warnings` are accepted but not yet used.
///
/// # Errors
///
/// The stub never returns `Err`; the `Result` return type is part of
/// the pinned signature.
pub fn build_canonical_document(
    asset: &RawAsset,
    metadata: Metadata,
    blocks: Vec<ParsedBlock>,
    parser_version: &ParserVersion,
    _warnings: Vec<Warning>,
) -> Result<CanonicalDocument> {
    // `blocks` is accepted but not yet lifted — mention it once so the
    // unused-binding lint stays quiet until the next commit fills in the
    // real lifting logic. The public parameter name is left untouched.
    let _ = blocks;

    let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, parser_version);

    Ok(CanonicalDocument {
        doc_id,
        source_asset_id: asset.asset_id.clone(),
        workspace_path: asset.workspace_path.clone(),
        // Placeholders: title/lang lift is not implemented yet.
        title: String::new(),
        lang: Lang(String::new()),
        // Placeholder: parsed blocks are not lifted yet.
        blocks: Vec::new(),
        metadata,
        provenance: Provenance { events: Vec::new() },
        parser_version: parser_version.clone(),
        schema_version: 1,
        doc_version: 1,
    })
}