From f8d00bdaf67bda60a91ed0442f789e6c17d640f7 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 12:42:12 +0000 Subject: [PATCH] p1-1: address review (AssetStorage doc-comment, real directory-cycle test) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Document AssetStorage::Copied / Reference path semantics so P1-6 (asset writer) knows that at scan time `Copied.path` is the SOURCE path and the writer is responsible for both copying bytes AND overwriting `path` with the destination. - Rename the dangling-symlink test to make its scope explicit (`dangling_symlink_pseudo_cycle_does_not_crash`); the prior name implied a real two-step directory cycle but the targets were broken links. - Add `two_step_directory_cycle_visited_set_breaks_loop`: builds `a/loop -> ../b` and `b/loop -> ../a` over real directories with real files, asserting scan terminates with a finite, deterministic asset list — exercises the canonical-path visited-set in walker::walk_files. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kb-core/src/asset.rs | 12 ++++ crates/kb-source-fs/tests/symlink_cycle.rs | 84 +++++++++++++++++++++- 2 files changed, 93 insertions(+), 3 deletions(-) diff --git a/crates/kb-core/src/asset.rs b/crates/kb-core/src/asset.rs index 8532175..6078219 100644 --- a/crates/kb-core/src/asset.rs +++ b/crates/kb-core/src/asset.rs @@ -40,6 +40,18 @@ impl WorkspacePath { } } +/// On-disk storage decision for a `RawAsset`. +/// +/// **Important convention** — `path` field semantics differ by variant: +/// +/// - `Copied { path }`: at scan time, `path` is the **source** path on the +/// user's filesystem. The asset writer (P1-6) is responsible for actually +/// copying the bytes into the workspace asset store, AND for overwriting +/// `path` with the destination path after the copy completes. +/// +/// - `Reference { path, sha }`: `path` is always the **source** path. No +/// bytes are ever copied; downstream readers stream from `path` directly. +/// `sha` is the BLAKE3 full hex (matches `RawAsset::checksum`). #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "lowercase", tag = "kind")] pub enum AssetStorage { diff --git a/crates/kb-source-fs/tests/symlink_cycle.rs b/crates/kb-source-fs/tests/symlink_cycle.rs index 2366549..bcd00c4 100644 --- a/crates/kb-source-fs/tests/symlink_cycle.rs +++ b/crates/kb-source-fs/tests/symlink_cycle.rs @@ -61,11 +61,17 @@ fn symlink_cycle_does_not_loop_or_crash() { } #[test] -fn two_step_symlink_cycle_does_not_loop() { +fn dangling_symlink_pseudo_cycle_does_not_crash() { // root/ // ├── alpha.md - // ├── a → b - // └── b → a + // ├── a → b (b does not exist as a real file/dir) + // └── b → a (a does not exist as a real file/dir) + // + // Both symlinks are dangling — neither resolves to anything. This is + // NOT a real two-step directory cycle (see + // `two_step_directory_cycle_visited_set_breaks_loop` for that case); + // it merely verifies the scan tolerates broken-link pseudo-cycles + // without crashing or looping. let dir = tempfile::tempdir().unwrap(); let root = dir.path(); std::fs::write(root.join("alpha.md"), b"alpha").unwrap(); @@ -80,3 +86,75 @@ fn two_step_symlink_cycle_does_not_loop() { let v = conn.scan(&SourceScope::default()).expect("scan must return"); assert!(v.iter().any(|a| a.workspace_path.0 == "alpha.md")); } + +#[test] +fn two_step_directory_cycle_visited_set_breaks_loop() { + // Real two-step directory cycle through symlinks: + // root/ + // ├── a/ + // │ ├── inside_a.md + // │ └── loop → ../b (symlink, target IS a real directory) + // └── b/ + // ├── inside_b.md + // └── loop → ../a (symlink, target IS a real directory) + // + // Without the visited-set, walkdir would descend + // a → a/loop (=b) → a/loop/loop (=a) → … forever. + // The canonical-path visited-set in `walker::walk_files` must break + // the loop and yield a finite, deterministic result. + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::create_dir(root.join("a")).unwrap(); + std::fs::create_dir(root.join("b")).unwrap(); + std::fs::write(root.join("a/inside_a.md"), b"a-content").unwrap(); + std::fs::write(root.join("b/inside_b.md"), b"b-content").unwrap(); + // Use relative targets so the symlink truly points at the sibling + // directory regardless of where the tempdir lives. + symlink("../b", root.join("a/loop")).unwrap(); + symlink("../a", root.join("b/loop")).unwrap(); + + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .expect("connector init"); + + // Run scan twice — both must terminate AND produce identical + // workspace_path lists (visited-set is deterministic per scan). + let v1 = conn.scan(&SourceScope::default()).expect("scan must return"); + let v2 = conn.scan(&SourceScope::default()).expect("scan must return"); + + let names1: Vec = v1.iter().map(|a| a.workspace_path.0.clone()).collect(); + let names2: Vec = v2.iter().map(|a| a.workspace_path.0.clone()).collect(); + assert_eq!(names1, names2, "scan must be deterministic across runs"); + + // No duplicate workspace paths (visited-set should suppress + // re-emission of the same canonical file via the cycle). + let mut seen = std::collections::HashSet::new(); + for asset in &v1 { + assert!( + seen.insert(asset.workspace_path.0.clone()), + "duplicate workspace_path: {}", + asset.workspace_path.0 + ); + } + + // Both real files must appear at least once. Their exact relative + // paths depend on which side of the cycle the walker descended into + // first; assert by basename to keep the check robust. + assert!( + v1.iter().any(|a| a.workspace_path.0.ends_with("inside_a.md")), + "expected inside_a.md in scan output, got: {names1:?}" + ); + assert!( + v1.iter().any(|a| a.workspace_path.0.ends_with("inside_b.md")), + "expected inside_b.md in scan output, got: {names1:?}" + ); + + // Sanity bound: with two real files and a working cycle guard the + // output should be tiny. If we ever produce >50 entries the visited + // set has regressed. + assert!( + v1.len() < 50, + "scan emitted {} assets — cycle guard likely regressed: {:?}", + v1.len(), + names1 + ); +}