From 42a7d53e5dc55248397988e8dde8bb2d82160579 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 12:56:19 +0000 Subject: [PATCH] p1-2: fixtures + snapshot tests for frontmatter parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two markdown fixtures with hand-authored JSON baselines that pin the §0 Q9 derive output across runs: - frontmatter-only.md exercises the YAML happy path with most fields, unknown keys, an `id:` field, and a non-UTC created_at (so the baseline shows original_timestamps preservation). - mixed-lang.md is body-only with no `lang:` field; baseline pins the lingua autodetect result for our enabled language set. A separate `emit_snapshots` test (marked `#[ignore]`) regenerates the baselines from the current parser output. A determinism test parses the fixture twice and asserts equality so any non-determinism (e.g. key ordering, lingua nondeterminism) fails fast. --- .../tests/frontmatter_snapshots.rs | 111 ++++++++++++++++++ fixtures/markdown/frontmatter-only.md | 22 ++++ .../markdown/frontmatter-only.snapshot.json | 31 +++++ fixtures/markdown/mixed-lang.md | 9 ++ fixtures/markdown/mixed-lang.snapshot.json | 16 +++ 5 files changed, 189 insertions(+) create mode 100644 crates/kb-parse-md/tests/frontmatter_snapshots.rs create mode 100644 fixtures/markdown/frontmatter-only.md create mode 100644 fixtures/markdown/frontmatter-only.snapshot.json create mode 100644 fixtures/markdown/mixed-lang.md create mode 100644 fixtures/markdown/mixed-lang.snapshot.json diff --git a/crates/kb-parse-md/tests/frontmatter_snapshots.rs b/crates/kb-parse-md/tests/frontmatter_snapshots.rs new file mode 100644 index 0000000..84c6bcc --- /dev/null +++ b/crates/kb-parse-md/tests/frontmatter_snapshots.rs @@ -0,0 +1,111 @@ +//! Snapshot tests pinning the §0 Q9 derive output for two fixtures. +//! +//! The baseline JSON next to each fixture is hand-authored / regenerated +//! from a deterministic run. 
`BodyHints` timestamps are caller-provided
+//! and therefore stable; lingua autodetect over our fixtures is also
+//! stable for the language set we configured.
+
+use kb_parse_md::{BodyHints, parse_frontmatter};
+use serde::Serialize;
+use serde_json::Value;
+use std::fs;
+use std::path::PathBuf;
+use time::macros::datetime;
+
+/// Stable view of the parser output suitable for JSON snapshotting.
+///
+/// `FrontmatterSpan` byte offsets are deliberately reduced to a presence
+/// flag: they're fully determined by the input file and are exercised by
+/// unit tests; the snapshot focuses on the §0 Q9 derive contract.
+///
+/// `W` is the element type of the warning list returned by
+/// `parse_frontmatter`. Keeping it generic lets the serde derive add the
+/// `W: Serialize` bound without this file having to name that type.
+#[derive(Serialize)]
+struct Snapshot<W> {
+    metadata: kb_core::Metadata,
+    span_present: bool,
+    warnings: Vec<W>,
+}
+
+/// Directory that holds the markdown fixtures and their JSON baselines:
+/// `<crate manifest dir>/../../fixtures/markdown`.
+fn fixtures_dir() -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("..")
+        .join("..")
+        .join("fixtures")
+        .join("markdown")
+}
+
+/// Fixed `BodyHints` so every run hands the parser the same caller-provided
+/// timestamps, with no H1 hint and no fallback language.
+fn pinned_hints() -> BodyHints {
+    BodyHints {
+        first_h1: None,
+        fs_ctime: datetime!(2024-01-01 00:00:00 UTC),
+        fs_mtime: datetime!(2024-01-02 00:00:00 UTC),
+        fallback_lang: None,
+    }
+}
+
+/// Parses `fixture` (a file name inside `fixtures_dir()`) with the pinned
+/// hints and wraps the result in a `Snapshot`.
+///
+/// Shared by `assert_snapshot` and `emit_snapshots` so that the checked view
+/// and the regenerated view of the parser output cannot drift apart.
+///
+/// # Panics
+///
+/// Panics if the fixture cannot be read or `parse_frontmatter` returns an error.
+fn snapshot_of(fixture: &str) -> Snapshot<impl Serialize> {
+    let bytes = fs::read(fixtures_dir().join(fixture)).expect("fixture readable");
+    let (meta, span, warns) =
+        parse_frontmatter(&bytes, &pinned_hints()).expect("fixture parses without error");
+    Snapshot {
+        metadata: meta,
+        span_present: span.is_some(),
+        warnings: warns,
+    }
+}
+
+/// Compares the current parser output for `fixture` with the JSON stored in
+/// `baseline` (both are file names inside `fixtures_dir()`).
+///
+/// Both sides are compared as parsed `serde_json::Value`s, so whitespace in
+/// the baseline file does not matter.
+///
+/// # Panics
+///
+/// Panics when either file is unreadable, the baseline is not valid JSON, or
+/// the two documents differ; on a mismatch both documents are printed.
+fn assert_snapshot(fixture: &str, baseline: &str) {
+    let actual: Value = serde_json::to_value(&snapshot_of(fixture)).unwrap();
+
+    let expected_text = fs::read_to_string(fixtures_dir().join(baseline))
+        .expect("snapshot baseline readable");
+    let expected: Value = serde_json::from_str(&expected_text).expect("baseline parses as json");
+
+    if actual != expected {
+        let actual_pretty = serde_json::to_string_pretty(&actual).unwrap();
+        panic!(
+            "snapshot drift for {fixture}\n\
+             --- expected ({baseline}) ---\n{expected_text}\n\
+             --- actual ---\n{actual_pretty}\n\
+             If the change is intentional, update {baseline}."
+        );
+    }
+}
+
+#[test]
+fn frontmatter_only_snapshot() {
+    assert_snapshot("frontmatter-only.md", "frontmatter-only.snapshot.json");
+}
+
+/// Run with `cargo test -p kb-parse-md --test frontmatter_snapshots emit_snapshots -- --ignored --nocapture`
+/// to regenerate the baseline JSON files from the current parser output.
+#[test]
+#[ignore = "overwrites the baseline files; run explicitly to regenerate them"]
+fn emit_snapshots() {
+    let dir = fixtures_dir();
+    for (fixture, baseline) in [
+        ("frontmatter-only.md", "frontmatter-only.snapshot.json"),
+        ("mixed-lang.md", "mixed-lang.snapshot.json"),
+    ] {
+        // Serialize the struct itself rather than an intermediate `Value`, so
+        // the written baseline lists the fields in declaration order.
+        let json = serde_json::to_string_pretty(&snapshot_of(fixture)).unwrap();
+        let target = dir.join(baseline);
+        fs::write(&target, format!("{json}\n")).expect("snapshot baseline writable");
+        eprintln!("wrote {}", target.display());
+    }
+}
+
+#[test]
+fn mixed_lang_snapshot() {
+    assert_snapshot("mixed-lang.md", "mixed-lang.snapshot.json");
+}
+
+/// Determinism: parsing the same fixture twice in a row must give equal output.
+#[test]
+fn snapshot_is_deterministic_across_runs() {
+    let dir = fixtures_dir();
+    // `frontmatter-only.md` carries an explicit `lang:`, while `mixed-lang.md`
+    // has none and gets its language from autodetect. Both are parsed so the
+    // autodetect path is covered as well as the YAML path.
+    for fixture in ["frontmatter-only.md", "mixed-lang.md"] {
+        let bytes = fs::read(dir.join(fixture)).expect("fixture readable");
+        // One complete parse, reduced to values that can be compared.
+        // Serialized text is used instead of `serde_json::Value` because
+        // `Value` object equality ignores key order, which would hide an
+        // ordering difference between two runs. Span presence and warnings
+        // are included so drift outside the metadata is caught too.
+        let parse_once = || {
+            let (meta, span, warns) = parse_frontmatter(&bytes, &pinned_hints())
+                .expect("fixture parses without error");
+            (
+                serde_json::to_string(&meta).unwrap(),
+                span.is_some(),
+                serde_json::to_string(&warns).unwrap(),
+            )
+        };
+        assert_eq!(
+            parse_once(),
+            parse_once(),
+            "parse output differs between two runs for {fixture}"
+        );
+    }
+}
diff --git a/fixtures/markdown/frontmatter-only.md b/fixtures/markdown/frontmatter-only.md new file mode 100644 index 0000000..766b641 --- /dev/null +++ b/fixtures/markdown/frontmatter-only.md @@ -0,0 +1,22 @@ +--- +title: Frontmatter Only +aliases: + - fm-only + - first-fixture +tags: + - parse + - test +lang: en +created_at: 2024-01-15T10:00:00+09:00 +updated_at: 2024-02-20T08:30:00Z +source_type: note +trust_level: secondary +id: my-stable-handle +custom_field: hello +nested_obj: + key: value +--- + +# Body Heading + +Body paragraph. diff --git a/fixtures/markdown/frontmatter-only.snapshot.json b/fixtures/markdown/frontmatter-only.snapshot.json new file mode 100644 index 0000000..ae187df --- /dev/null +++ b/fixtures/markdown/frontmatter-only.snapshot.json @@ -0,0 +1,31 @@ +{ + "metadata": { + "aliases": [ + "fm-only", + "first-fixture" + ], + "tags": [ + "parse", + "test" + ], + "created_at": "2024-01-15T01:00:00Z", + "updated_at": "2024-02-20T08:30:00Z", + "source_type": "note", + "trust_level": "secondary", + "user_id_alias": "my-stable-handle", + "user": { + "custom_field": "hello", + "lang": "en", + "nested_obj": { + "key": "value" + }, + "original_timestamps": { + "created_at": "2024-01-15T10:00:00+09:00" + }, + "title": "Frontmatter Only", + "user_id_alias": "my-stable-handle" + } + }, + "span_present": true, + "warnings": [] +} diff --git a/fixtures/markdown/mixed-lang.md b/fixtures/markdown/mixed-lang.md new file mode 100644 index 0000000..71ccf6e --- /dev/null +++ b/fixtures/markdown/mixed-lang.md @@ -0,0 +1,9 @@ +# Mixed Language Note + +이 문서는 한국어와 영어가 섞여 있습니다. 
The body has both Korean +sentences and English sentences. lingua는 통계적 언어 감지기를 제공합니다. +This is to test that auto-detect picks one of `ko` or `en` deterministically +when no `lang:` field is present in the frontmatter. + +본문은 첫 4 KB만 분석되지만, 짧은 문서에서도 잘 동작해야 합니다. +The detector should pick the dominant language across the sample window. diff --git a/fixtures/markdown/mixed-lang.snapshot.json b/fixtures/markdown/mixed-lang.snapshot.json new file mode 100644 index 0000000..c6cda5c --- /dev/null +++ b/fixtures/markdown/mixed-lang.snapshot.json @@ -0,0 +1,16 @@ +{ + "metadata": { + "aliases": [], + "tags": [], + "created_at": "2024-01-01T00:00:00Z", + "updated_at": "2024-01-02T00:00:00Z", + "source_type": "markdown", + "trust_level": "primary", + "user_id_alias": null, + "user": { + "lang": "en" + } + }, + "span_present": false, + "warnings": [] +}