p1-4: scaffold kb-normalize crate
Add the workspace member, `Cargo.toml` with the §8-allowed dep set (kb-core, kb-parse-types, kb-config, serde, serde_json_canonicalizer, blake3, unicode-normalization, time, anyhow, tracing) and a stubbed `build_canonical_document` that pins the public signature plus `doc_id` derivation. `kb-parse-md` is permitted only as a *dev*-dep so the integration snapshot test (added later in this series) can drive a fixture through the real parser without violating the production boundary — `cargo tree -p kb-normalize --depth 1 --edges normal` confirms no parser implementation appears in the regular dep tree. `id_for_doc` and `id_for_block` are re-exported from kb-core (which holds the canonical recipe per §4.2); kb-normalize is the canonical *entry point* per design §8. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
18
Cargo.lock
generated
18
Cargo.lock
generated
@@ -577,6 +577,24 @@ dependencies = [
|
||||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-normalize"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
"kb-config",
|
||||
"kb-core",
|
||||
"kb-parse-md",
|
||||
"kb-parse-types",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_json_canonicalizer",
|
||||
"time",
|
||||
"tracing",
|
||||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-parse-md"
|
||||
version = "0.1.0"
|
||||
|
||||
@@ -6,6 +6,7 @@ members = [
|
||||
"crates/kb-config",
|
||||
"crates/kb-source-fs",
|
||||
"crates/kb-parse-md",
|
||||
"crates/kb-normalize",
|
||||
"crates/kb-app",
|
||||
"crates/kb-cli",
|
||||
]
|
||||
|
||||
30
crates/kb-normalize/Cargo.toml
Normal file
30
crates/kb-normalize/Cargo.toml
Normal file
@@ -0,0 +1,30 @@
|
||||
[package]
|
||||
name = "kb-normalize"
|
||||
version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
rust-version = { workspace = true }
|
||||
license = { workspace = true }
|
||||
repository = { workspace = true }
|
||||
description = "Lift parser output (kb-parse-types) into kb-core::CanonicalDocument with deterministic IDs (§3.4, §4.2, §4.3)"
|
||||
|
||||
[dependencies]
|
||||
kb-core = { path = "../kb-core" }
|
||||
kb-parse-types = { path = "../kb-parse-types" }
|
||||
kb-config = { path = "../kb-config" }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
serde_json_canonicalizer = "0.3"
|
||||
blake3 = { workspace = true }
|
||||
unicode-normalization = "0.1"
|
||||
time = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
# kb-parse-md is permitted as a *dev*-dependency only — used by the
|
||||
# integration snapshot test to drive a fixture through the real parser.
|
||||
# Forbidden as a regular dep per design §8 (kb-normalize must not depend
|
||||
# on any specific parser); `cargo tree -p kb-normalize --depth 1 --edges normal`
|
||||
# (which, unlike the default scope, excludes dev-deps) confirms this.
|
||||
kb-parse-md = { path = "../kb-parse-md" }
|
||||
serde_json = { workspace = true }
|
||||
61
crates/kb-normalize/src/lib.rs
Normal file
61
crates/kb-normalize/src/lib.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
//! `kb-normalize` — lift parser output (`kb-parse-types`) into a
|
||||
//! [`kb_core::CanonicalDocument`] with deterministic IDs.
|
||||
//!
|
||||
//! Per design §3.4 (CanonicalDocument / Block), §4.2 (ID recipe), §4.3
|
||||
//! (ordinal rule), §3.6 (Provenance), §8 (module boundaries).
|
||||
//!
|
||||
//! Public surface:
|
||||
//!
|
||||
//! * [`build_canonical_document`] — assemble a `CanonicalDocument` from
|
||||
//! `(RawAsset, Metadata, Vec<ParsedBlock>, ParserVersion, Vec<Warning>)`.
|
||||
//! * [`id_for_doc`], [`id_for_block`] — re-exports of the canonical
|
||||
//! ID-recipe functions in `kb-core::ids` (§4.2). `kb-core` is the only
|
||||
//! implementation; `kb-normalize` is the canonical *entry point* per
|
||||
//! design §8.
|
||||
//!
|
||||
//! This crate must NOT take a regular (non-dev) dependency on any parser
|
||||
//! implementation crate (`kb-parse-md`, `kb-parse-pdf`, …). All parser
|
||||
//! output flows in via the shared `kb-parse-types` crate.
|
||||
|
||||
use anyhow::Result;
|
||||
use kb_core::{
|
||||
CanonicalDocument, Lang, Metadata, ParserVersion, Provenance, RawAsset,
|
||||
};
|
||||
use kb_parse_types::{ParsedBlock, Warning};
|
||||
|
||||
pub use kb_core::{id_for_block, id_for_doc};
|
||||
|
||||
/// Build a [`CanonicalDocument`] from the raw asset, frontmatter
|
||||
/// metadata, parser blocks, parser version, and any warnings. Full
|
||||
/// behavior (block ID assignment, provenance, title/lang lift) is
|
||||
/// filled in by subsequent commits in this series; this stub establishes
|
||||
/// the public signature and the doc_id derivation only.
|
||||
pub fn build_canonical_document(
|
||||
asset: &RawAsset,
|
||||
metadata: Metadata,
|
||||
blocks: Vec<ParsedBlock>,
|
||||
parser_version: &ParserVersion,
|
||||
_warnings: Vec<Warning>,
|
||||
) -> Result<CanonicalDocument> {
|
||||
let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, parser_version);
|
||||
Ok(CanonicalDocument {
|
||||
doc_id,
|
||||
source_asset_id: asset.asset_id.clone(),
|
||||
workspace_path: asset.workspace_path.clone(),
|
||||
title: String::new(),
|
||||
lang: Lang(String::new()),
|
||||
blocks: Vec::new(),
|
||||
metadata,
|
||||
provenance: Provenance { events: Vec::new() },
|
||||
parser_version: parser_version.clone(),
|
||||
schema_version: 1,
|
||||
doc_version: 1,
|
||||
})
|
||||
.map(|d| {
|
||||
// `blocks` is consumed but not yet lifted — flag it as live to
|
||||
// satisfy the unused-binding lint until the next commit fills
|
||||
// in the real lifting logic.
|
||||
let _ = blocks;
|
||||
d
|
||||
})
|
||||
}
|
||||
Reference in New Issue
Block a user