diff --git a/crates/kb-normalize/src/lib.rs b/crates/kb-normalize/src/lib.rs
index b73e48c..3bf1e11 100644
--- a/crates/kb-normalize/src/lib.rs
+++ b/crates/kb-normalize/src/lib.rs
@@ -525,4 +525,55 @@ mod tests {
         assert!(!doc.metadata.user.contains_key("lang"));
         assert!(doc.metadata.user.contains_key("custom"));
     }
+
+    /// Determinism property: 1000 iterations of `build_canonical_document`
+    /// over identical inputs produce byte-identical JSON, modulo the
+    /// `now_utc()`-derived `at` timestamps of every provenance event after
+    /// the first (Parsed, Normalized and any Warning events). Those are
+    /// blanked before comparing. The timed loop must finish within 1 second.
+    #[test]
+    fn determinism_1000_iterations_under_1s() {
+        let asset = fixture_asset();
+        let metadata = fixture_metadata();
+        let pv = parser_version();
+
+        // Helper: serialize `doc` and blank the `at` field of every
+        // provenance event except index 0 (Discovered — deterministic,
+        // pinned in the fixture) so the comparison only checks the
+        // deterministic fields. Panics if `provenance.events` is absent:
+        // a silent no-op would leave wall-clock timestamps in the JSON.
+        fn strip_dynamic_at(doc: &CanonicalDocument) -> Value {
+            let mut v = serde_json::to_value(doc).unwrap();
+            let events = v
+                .get_mut("provenance")
+                .and_then(|p| p.get_mut("events"))
+                .and_then(|e| e.as_array_mut())
+                .expect("serialized document has a `provenance.events` array");
+            for ev in events.iter_mut().skip(1) {
+                if let Some(obj) = ev.as_object_mut() {
+                    obj.insert("at".into(), Value::String("".into()));
+                }
+            }
+            v
+        }
+
+        // Every call gets the same inputs; `metadata` is cloned because
+        // it is passed by value.
+        let build = || {
+            build_canonical_document(&asset, metadata.clone(), vec![], &pv, vec![]).unwrap()
+        };
+        let baseline_json = serde_json::to_string(&strip_dynamic_at(&build())).unwrap();
+
+        let start = std::time::Instant::now();
+        for _ in 0..1000 {
+            let next_json = serde_json::to_string(&strip_dynamic_at(&build())).unwrap();
+            assert_eq!(baseline_json, next_json);
+        }
+        // Read the clock once so the reported duration is the asserted one.
+        let elapsed = start.elapsed();
+        assert!(
+            elapsed < std::time::Duration::from_secs(1),
+            "1000 iterations took {elapsed:?}"
+        );
+    }
 }
diff --git a/crates/kb-normalize/tests/normalize_snapshot.rs b/crates/kb-normalize/tests/normalize_snapshot.rs
new file mode 100644
index 0000000..890ddd8
--- /dev/null
+++ b/crates/kb-normalize/tests/normalize_snapshot.rs
@@ -0,0 +1,149 @@
+//! Snapshot test pinning the full `CanonicalDocument` JSON for the
+//! `code-and-table.md` fixture.
+//!
+//! This is an integration test (it lives under `tests/`) and depends on
+//! `kb-parse-md` only as a dev-dep so the production crate's regular
+//! deps still satisfy the §8 boundary (`cargo tree -p kb-normalize
+//! --depth 1` without `-e dev` does not list any parser implementation).
+//!
+//! Non-deterministic fields are stripped before comparison:
+//!
+//! * `provenance.events[*].at` — each invocation calls `now_utc()` for
+//!   the Parsed/Normalized/Warning events. The Discovered event uses
+//!   the asset's pinned `discovered_at`, so we keep that one and replace
+//!   only indices ≥ 1.
+
+use std::path::{Path, PathBuf};
+
+use kb_core::{
+    AssetId, AssetStorage, Checksum, MediaType, ParserVersion, RawAsset, SourceUri,
+    WorkspacePath,
+};
+use kb_normalize::build_canonical_document;
+use kb_parse_md::{BodyHints, parse_blocks, parse_frontmatter};
+use serde_json::Value;
+use time::OffsetDateTime;
+
+/// Resolves `<manifest dir>/../../fixtures/markdown`.
+fn fixtures_dir() -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("..")
+        .join("..")
+        .join("fixtures")
+        .join("markdown")
+}
+
+/// Builds a `RawAsset` from fixed values only, for the given workspace path.
+fn fixed_asset(workspace_path: &str) -> RawAsset {
+    let wp = WorkspacePath::new(workspace_path.into()).unwrap();
+    RawAsset {
+        asset_id: AssetId("a".repeat(32)),
+        source_uri: SourceUri::File(PathBuf::from("/tmp/code-and-table.md")),
+        workspace_path: wp,
+        media_type: MediaType::Markdown,
+        byte_len: 0,
+        checksum: Checksum("0".repeat(64)),
+        // Pin discovered_at so the Discovered provenance event is
+        // deterministic across runs.
+        discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(),
+        stored: AssetStorage::Reference {
+            path: PathBuf::from("/tmp/code-and-table.md"),
+            sha: Checksum("0".repeat(64)),
+        },
+    }
+}
+
+/// Blanks the `at` timestamp of every provenance event after the first.
+/// Panics when `provenance.events` is absent rather than silently leaving
+/// wall-clock timestamps in the compared JSON.
+fn strip_dynamic(mut v: Value) -> Value {
+    let events = v
+        .get_mut("provenance")
+        .and_then(|p| p.get_mut("events"))
+        .and_then(|e| e.as_array_mut())
+        .expect("serialized document has a `provenance.events` array");
+    // Index 0 is the Discovered event, pinned via `discovered_at`.
+    for ev in events.iter_mut().skip(1) {
+        if let Some(obj) = ev.as_object_mut() {
+            obj.insert("at".into(), Value::String("".into()));
+        }
+    }
+    v
+}
+
+/// Writes `value` to `path` as pretty-printed JSON plus a trailing newline.
+fn write_baseline(path: &Path, value: &Value) {
+    let pretty = serde_json::to_string_pretty(value).unwrap();
+    std::fs::write(path, format!("{pretty}\n")).unwrap();
+    eprintln!("updated baseline {}", path.display());
+}
+
+#[test]
+fn code_and_table_canonical_snapshot() {
+    let dir = fixtures_dir();
+    let bytes = std::fs::read(dir.join("code-and-table.md")).expect("fixture readable");
+
+    // Frontmatter parse — code-and-table.md has none, so we provide
+    // BodyHints with deterministic timestamps so the lifted Metadata
+    // is reproducible.
+    let asset = fixed_asset("notes/code-and-table.md");
+    let hints = BodyHints {
+        first_h1: None,
+        fs_ctime: asset.discovered_at,
+        fs_mtime: asset.discovered_at,
+        fallback_lang: Some("en".into()),
+    };
+    // NOTE(review): frontmatter warnings are discarded — confirm intended.
+    let (mut metadata, fm_span, _fm_warns) =
+        parse_frontmatter(&bytes, &hints).expect("frontmatter parses");
+
+    // No frontmatter → body starts at line 1. Otherwise it starts one line
+    // past the prelude's newlines (not exercised by this fixture).
+    let body_offset_lines: u32 = match fm_span {
+        Some(span) => {
+            let newlines = bytes[..span.end].iter().filter(|b| **b == b'\n').count();
+            // Checked conversion: `as u32` would truncate silently.
+            u32::try_from(newlines).expect("prelude line count fits in u32") + 1
+        }
+        None => 1,
+    };
+    let (blocks, parse_warns) =
+        parse_blocks(&bytes, body_offset_lines).expect("blocks parse");
+
+    let parser_version = ParserVersion("kb-normalize-snapshot-test-0".into());
+    // The `created_at` / `updated_at` lifted from BodyHints are pinned
+    // to `discovered_at` above, so they are already deterministic.
+    metadata.aliases.sort();
+    metadata.tags.sort();
+
+    let doc =
+        build_canonical_document(&asset, metadata, blocks, &parser_version, parse_warns)
+            .expect("build_canonical_document");
+
+    let actual = strip_dynamic(serde_json::to_value(&doc).unwrap());
+
+    let baseline_path = dir.join("code-and-table.canonical.snapshot.json");
+    let update = std::env::var("UPDATE_SNAPSHOTS").is_ok();
+    let baseline_text = match std::fs::read_to_string(&baseline_path) {
+        Ok(s) => s,
+        Err(_) if update => return write_baseline(&baseline_path, &actual),
+        Err(e) => panic!(
+            "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}",
+            baseline_path.display()
+        ),
+    };
+    let expected: Value =
+        serde_json::from_str(&baseline_text).expect("baseline parses as json");
+
+    if actual == expected {
+        return;
+    }
+    if update {
+        return write_baseline(&baseline_path, &actual);
+    }
+    let pretty = serde_json::to_string_pretty(&actual).unwrap();
+    panic!(
+        "canonical snapshot drift\n--- expected ({}) ---\n{baseline_text}\n--- actual ---\n{pretty}\nIf intentional, re-run with UPDATE_SNAPSHOTS=1.",
+        baseline_path.display()
+    );
+}
diff --git a/fixtures/markdown/code-and-table.canonical.snapshot.json b/fixtures/markdown/code-and-table.canonical.snapshot.json
new file mode 100644
index 0000000..9fb909c
--- /dev/null
+++ b/fixtures/markdown/code-and-table.canonical.snapshot.json
@@ -0,0 +1,102 @@
+{
+  "blocks": [
+    {
+      "common": {
+        "block_id": "dd1528c6e84d8a66087cbf6faafd67c6",
+        "heading_path": [],
+        "source_span": {
+          "end": 1,
+          "kind": "line",
+          "start": 1
+        }
+      },
+      "kind": "heading",
+      "level": 1,
+      "text": "Code And Table"
+    },
+    {
+      "code": "fn main() {\n println!(\"hi\");\n}",
+      "common": {
+        "block_id":
"68ea34aca04b83413dd8556126ae4584", + "heading_path": [ + "Code And Table" + ], + "source_span": { + "end": 7, + "kind": "line", + "start": 3 + } + }, + "kind": "code", + "lang": "rust" + }, + { + "common": { + "block_id": "b50a8e941b11f1834ae17adba9e08118", + "heading_path": [ + "Code And Table" + ], + "source_span": { + "end": 12, + "kind": "line", + "start": 9 + } + }, + "headers": [ + "col a", + "col b" + ], + "kind": "table", + "rows": [ + [ + "1", + "2" + ], + [ + "3", + "4" + ] + ] + } + ], + "doc_id": "6a9ef317c9c097ff3f6aeb317559bd83", + "doc_version": 1, + "lang": "en", + "metadata": { + "aliases": [], + "created_at": "2023-11-14T22:13:20Z", + "source_type": "markdown", + "tags": [], + "trust_level": "primary", + "updated_at": "2023-11-14T22:13:20Z", + "user": {}, + "user_id_alias": null + }, + "parser_version": "kb-normalize-snapshot-test-0", + "provenance": { + "events": [ + { + "agent": "kb-source-fs", + "at": "2023-11-14T22:13:20Z", + "kind": "discovered", + "note": null + }, + { + "agent": "kb-parse-md", + "at": "", + "kind": "parsed", + "note": "parser_version=kb-normalize-snapshot-test-0" + }, + { + "agent": "kb-normalize", + "at": "", + "kind": "normalized", + "note": null + } + ] + }, + "schema_version": 1, + "source_asset_id": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "title": "", + "workspace_path": "notes/code-and-table.md" +}