From 7c75e10b2c787981c46ecd324e25ed343849c0db Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 12:27:34 +0000 Subject: [PATCH 1/5] p1-1: scaffold kb-source-fs crate (FsSourceConnector) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Walk config.workspace.root, apply gitignore-style filters (config.workspace.exclude ∪ .kbignore ∪ baked-in defaults for .DS_Store / ._*), stream BLAKE3 over each file, and emit a deterministic Vec sorted by workspace_path. Modules: - hash: streaming blake3::Hasher + 64 KiB read buffer (no whole-file loads); pinned digests for empty input and "hello world". - media: extension → MediaType (markdown/pdf/image/audio/other). - walker: ignore::OverrideBuilder for filter union; walkdir with manual visited-set cycle protection on top of follow_links. - connector: public FsSourceConnector::new(&Config) + SourceConnector::scan(&SourceScope) impl. Uses kb_core::to_posix for WorkspacePath construction (carries P0-1 # rejection through unchanged) and kb_core::id_for_asset for AssetId derivation. Storage variant signals intent only; actual byte copy is P1-6's responsibility. Per design §3.3, §6.2, §6.6, §7.1, §7.2, §8. Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 377 +++++++++++++++++++++++- Cargo.toml | 1 + crates/kb-source-fs/Cargo.toml | 23 ++ crates/kb-source-fs/src/connector.rs | 411 +++++++++++++++++++++++++++ crates/kb-source-fs/src/hash.rs | 92 ++++++ crates/kb-source-fs/src/lib.rs | 16 ++ crates/kb-source-fs/src/media.rs | 85 ++++++ crates/kb-source-fs/src/walker.rs | 247 ++++++++++++++++ 8 files changed, 1250 insertions(+), 2 deletions(-) create mode 100644 crates/kb-source-fs/Cargo.toml create mode 100644 crates/kb-source-fs/src/connector.rs create mode 100644 crates/kb-source-fs/src/hash.rs create mode 100644 crates/kb-source-fs/src/lib.rs create mode 100644 crates/kb-source-fs/src/media.rs create mode 100644 crates/kb-source-fs/src/walker.rs diff --git a/Cargo.lock b/Cargo.lock index 58f8384..48aaa06 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -79,6 +79,12 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" +[[package]] +name = "bitflags" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" + [[package]] name = "blake3" version = "1.8.5" @@ -93,6 +99,16 @@ dependencies = [ "cpufeatures", ] +[[package]] +name = "bstr" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "cc" version = "1.2.61" @@ -179,6 +195,25 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -222,12 +257,34 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + [[package]] name = "find-msvc-tools" version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "getrandom" version = "0.2.17" @@ -239,6 +296,41 @@ dependencies = [ "wasi", ] +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", + "wasip3", +] + +[[package]] +name = "globset" +version = "0.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52dfc19153a48bde0cbd630453615c8151bce3a5adfac7a0aebfbf0a1e1f57e3" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash", +] + [[package]] name = "hashbrown" version = "0.17.0" @@ -251,6 +343,28 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "ignore" +version = "0.4.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3d782a365a015e0f5c04902246139249abf769125006fbe7649e2ee88169b4a" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata", + "same-file", + "walkdir", + "winapi-util", +] + [[package]] name = "indexmap" version = "2.14.0" @@ -258,7 +372,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown", + "hashbrown 0.17.0", + "serde", + "serde_core", ] [[package]] @@ -335,12 +451,35 @@ dependencies = [ "serde", ] +[[package]] +name = "kb-source-fs" +version = "0.1.0" +dependencies = [ + "anyhow", + "blake3", + "ignore", + "kb-config", + "kb-core", + "serde", + "serde_json", + "tempfile", + "time", + "tracing", + "walkdir", +] + [[package]] name = "lazy_static" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + [[package]] name = "libc" version = "0.2.186" @@ -356,6 +495,12 @@ dependencies = [ "libc", ] +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + [[package]] name = "log" version = "0.4.29" @@ -422,6 +567,16 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + [[package]] name = "proc-macro2" version = "1.0.106" @@ -440,13 +595,19 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + [[package]] name = "redox_users" version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ - "getrandom", + "getrandom 0.2.17", "libredox", "thiserror 1.0.69", ] @@ -468,12 +629,40 @@ version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.2", +] + [[package]] name = "ryu-js" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd29631678d6fb0903b69223673e122c32e9ae559d0960a38d574695ebc0ea15" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "semver" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" + [[package]] name = "serde" version = "1.0.228" @@ -581,6 +770,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom 0.4.2", + "once_cell", + "rustix", + "windows-sys 0.61.2", +] + [[package]] name = "thiserror" version = "1.0.69" @@ -819,6 +1021,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + [[package]] name = "utf8parse" version = "0.2.2" @@ -831,12 +1039,83 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" +[[package]] +name = "wasip2" +version = "1.0.1+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +dependencies = [ + "wit-bindgen 0.46.0", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen 0.51.0", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "windows-link" version = "0.2.1" @@ -927,6 +1206,100 @@ dependencies = [ "memchr", ] +[[package]] +name = "wit-bindgen" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + [[package]] name = "zmij" version = "1.0.21" diff --git a/Cargo.toml b/Cargo.toml index ce933c2..a9f9b3b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "crates/kb-core", "crates/kb-parse-types", "crates/kb-config", + "crates/kb-source-fs", "crates/kb-app", "crates/kb-cli", ] diff --git a/crates/kb-source-fs/Cargo.toml b/crates/kb-source-fs/Cargo.toml new file mode 100644 index 0000000..9274c3c --- /dev/null +++ b/crates/kb-source-fs/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "kb-source-fs" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "Local filesystem SourceConnector — walks workspace.root + applies gitignore filters" + +[dependencies] +kb-core = { path = "../kb-core" } +kb-config = { path = "../kb-config" } +anyhow = { workspace = true } +serde = { workspace = true } +time = { workspace = true } +blake3 = { workspace = true } +tracing = { workspace = true } +walkdir = "2" +ignore = "0.4" + +[dev-dependencies] +serde_json = { workspace = true } +tempfile = "3" diff --git a/crates/kb-source-fs/src/connector.rs b/crates/kb-source-fs/src/connector.rs new file mode 100644 index 0000000..fe98c1e --- /dev/null +++ b/crates/kb-source-fs/src/connector.rs @@ -0,0 +1,411 @@ +//! `FsSourceConnector` — public surface for the crate. +//! +//! ```ignore +//! pub struct FsSourceConnector { /* internal */ } +//! impl FsSourceConnector { +//! pub fn new(config: &kb_config::Config) -> anyhow::Result; +//! } +//! impl kb_core::SourceConnector for FsSourceConnector { +//! fn scan(&self, scope: &kb_core::SourceScope) -> anyhow::Result>; +//! } +//! ``` + +use std::path::PathBuf; + +use anyhow::{Context, Result}; +use time::OffsetDateTime; + +use kb_config::Config; +use kb_core::{ + AssetStorage, Checksum, RawAsset, SourceConnector, SourceScope, SourceUri, + id_for_asset, to_posix, +}; + +use crate::hash::hash_file; +use crate::media::media_type_for; +use crate::walker::{build_overrides, read_kbignore, walk_files}; + +/// Local-filesystem `SourceConnector`. Constructed once from `Config`, +/// reused across `scan` calls. +/// +/// State carried between `new` and `scan`: +/// - `default_root`: `config.workspace.root` resolved to a `PathBuf`. Used +/// only when `SourceScope::root` is empty (i.e. the caller did not +/// override the root). +/// - `default_exclude`: snapshot of `config.workspace.exclude` at +/// construction time. +/// - `copy_threshold_bytes`: `config.storage.copy_threshold_mb * 1 MiB` +/// pre-multiplied so we don't recompute per file. +pub struct FsSourceConnector { + default_root: PathBuf, + default_exclude: Vec, + copy_threshold_bytes: u64, +} + +impl FsSourceConnector { + pub fn new(config: &Config) -> Result { + // `config.workspace.root` is a String that may contain `~` or env + // expansions. P0-* did not yet provide a path-expansion helper in + // kb-config; for P1-1 we expand `~` ourselves and leave `${VAR}` + // for a follow-up. The vast majority of users hit the `~` case. + let root = expand_tilde(&config.workspace.root); + + let copy_threshold_bytes = config + .storage + .copy_threshold_mb + .saturating_mul(1024 * 1024); + + Ok(Self { + default_root: root, + default_exclude: config.workspace.exclude.clone(), + copy_threshold_bytes, + }) + } +} + +impl SourceConnector for FsSourceConnector { + fn scan(&self, scope: &SourceScope) -> Result> { + // `SourceScope::root` overrides config root when non-empty. This + // matches the design's "scope is the per-call lens; config is the + // default" split (§7.1). + let root = if scope.root.as_os_str().is_empty() { + self.default_root.clone() + } else { + scope.root.clone() + }; + + // Union: config.workspace.exclude ∪ scope.exclude ∪ .kbignore. + // Per §6.2 the union of `.kbignore` and `config.workspace.exclude` + // is the filter set. `scope.exclude` is added on top so a caller + // can layer a per-call narrowing. + let mut excludes = self.default_exclude.clone(); + excludes.extend(scope.exclude.iter().cloned()); + let kbignore = read_kbignore(&root)?; + + let overrides = build_overrides(&root, &excludes, &kbignore)?; + + // `scope.include` is intentionally ignored at this stage of the + // pipeline: per §6.2 the workspace-level include lives in + // `WorkspaceCfg` and is enforced by the asset writer / extractors. + // Surfacing it here would double-filter Markdown vs PDF before the + // extractor router gets to see them. + if !scope.include.is_empty() { + tracing::debug!( + count = scope.include.len(), + "FsSourceConnector ignores scope.include — handled by extractor router" + ); + } + + let files = walk_files(&root, &overrides)?; + + let mut assets = Vec::with_capacity(files.len()); + for abs in &files { + // `to_posix` does NFC + leading `./` strip + `#` rejection. + // Compute the workspace-relative path before handing to it so + // emitted `WorkspacePath` is always relative. + let rel = abs.strip_prefix(&root).unwrap_or(abs); + let workspace_path = match to_posix(rel) { + Ok(p) => p, + Err(e) => { + // A path containing `#` is the only documented reason + // `to_posix` fails today. Drop the file with a warning + // rather than aborting the entire scan — a single bad + // filename should not nuke a 10 000-file ingest. + tracing::warn!( + path = %abs.display(), + error = %e, + "skipping file: path is not a valid WorkspacePath", + ); + continue; + } + }; + + let media_type = media_type_for(abs); + let (byte_len, full_hex) = hash_file(abs) + .with_context(|| format!("hashing {}", abs.display()))?; + let checksum = Checksum(full_hex.clone()); + let asset_id = id_for_asset(&full_hex); + + // Storage variant signals *intent*, not an actual copy. + // P1-6 (asset writer) is responsible for the on-disk copy. + let stored = if byte_len > self.copy_threshold_bytes { + AssetStorage::Reference { + path: abs.clone(), + sha: checksum.clone(), + } + } else { + AssetStorage::Copied { path: abs.clone() } + }; + + assets.push(RawAsset { + asset_id, + source_uri: SourceUri::File(abs.clone()), + workspace_path, + media_type, + byte_len, + checksum, + discovered_at: OffsetDateTime::now_utc(), + stored, + }); + } + + // Determinism: sort by workspace_path. WorkspacePath is a String + // newtype with stable lexicographic ordering. Two scans of the + // same tree must produce identical Vec modulo the + // wall-clock `discovered_at` field. + assets.sort_by(|a, b| a.workspace_path.0.cmp(&b.workspace_path.0)); + + Ok(assets) + } +} + +/// Expand a leading `~` to the current user's home directory. No-op for +/// any other shape (absolute, relative, `${VAR}`-style). +fn expand_tilde(s: &str) -> PathBuf { + if let Some(rest) = s.strip_prefix("~/") { + if let Some(home) = dirs_home() { + return home.join(rest); + } + } else if s == "~" { + if let Some(home) = dirs_home() { + return home; + } + } + PathBuf::from(s) +} + +/// Tiny `dirs::home_dir`-compat shim that does NOT add the `dirs` crate to +/// our dep set (we explicitly enumerate allowed deps in the task spec). +/// Reads `$HOME` directly. +fn dirs_home() -> Option { + std::env::var_os("HOME").map(PathBuf::from) +} + +#[cfg(test)] +mod tests { + use super::*; + use kb_config::Config; + + fn cfg_with_root(root: &str) -> Config { + let mut c = Config::defaults(); + c.workspace.root = root.to_string(); + c.workspace.exclude.clear(); + c + } + + #[test] + fn scan_empty_dir_yields_empty_vec() { + let dir = tempfile::tempdir().unwrap(); + let conn = FsSourceConnector::new(&cfg_with_root( + dir.path().to_str().unwrap(), + )) + .unwrap(); + let scope = SourceScope::default(); + let v = conn.scan(&scope).unwrap(); + assert!(v.is_empty()); + } + + #[test] + fn scan_emits_sorted_workspace_paths() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::create_dir_all(root.join("notes")).unwrap(); + std::fs::write(root.join("README.md"), b"hi").unwrap(); + std::fs::write(root.join("notes/beta.md"), b"b").unwrap(); + std::fs::write(root.join("notes/alpha.md"), b"a").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let v = conn.scan(&SourceScope::default()).unwrap(); + let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect(); + assert_eq!( + names, + vec![ + "README.md".to_string(), + "notes/alpha.md".to_string(), + "notes/beta.md".to_string(), + ] + ); + } + + #[test] + fn scan_filters_by_kbignore() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::write(root.join(".kbignore"), "*.tmp\n").unwrap(); + std::fs::write(root.join("a.md"), b"x").unwrap(); + std::fs::write(root.join("b.tmp"), b"x").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let v = conn.scan(&SourceScope::default()).unwrap(); + let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect(); + // .kbignore itself starts with `.` and is not in DEFAULT_EXCLUDES, + // so it is *not* automatically hidden — but the task spec only + // requires `*.tmp` and `.DS_Store` / `._*` filtering, and the + // `.kbignore` file is a legitimate "Other(\"\")" asset. Either + // present-or-absent is acceptable; the assertion below pins + // current behaviour: .kbignore appears, b.tmp does not. + assert!(names.contains(&".kbignore".to_string())); + assert!(names.contains(&"a.md".to_string())); + assert!(!names.contains(&"b.tmp".to_string())); + } + + #[test] + fn scan_filters_default_excludes() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::write(root.join("a.md"), b"x").unwrap(); + std::fs::write(root.join(".DS_Store"), b"\0\0").unwrap(); + std::fs::write(root.join("._sidecar"), b"\0\0").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let v = conn.scan(&SourceScope::default()).unwrap(); + let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect(); + assert_eq!(names, vec!["a.md".to_string()]); + } + + #[test] + fn scan_unions_config_exclude_and_kbignore() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::write(root.join(".kbignore"), "*.tmp\n").unwrap(); + std::fs::write(root.join("a.md"), b"x").unwrap(); + std::fs::write(root.join("b.tmp"), b"x").unwrap(); + std::fs::write(root.join("c.log"), b"x").unwrap(); + + let mut cfg = cfg_with_root(root.to_str().unwrap()); + cfg.workspace.exclude.push("*.log".to_string()); + + let conn = FsSourceConnector::new(&cfg).unwrap(); + let v = conn.scan(&SourceScope::default()).unwrap(); + let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect(); + assert!(names.contains(&"a.md".to_string())); + assert!(!names.contains(&"b.tmp".to_string()), "kbignore should drop *.tmp"); + assert!(!names.contains(&"c.log".to_string()), "config.exclude should drop *.log"); + } + + #[test] + fn scan_blake3_pinned_for_known_file() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::write(root.join("hello.md"), b"hello world").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let v = conn.scan(&SourceScope::default()).unwrap(); + assert_eq!(v.len(), 1); + let asset = &v[0]; + assert_eq!( + asset.checksum.0, + "d74981efa70a0c880b8d8c1985d075dbcbf679b99a5f9914e5aaf96b831a9e24" + ); + assert_eq!(asset.byte_len, 11); + // asset_id is derived from the full hex via id_for_asset. + assert_eq!(asset.asset_id, id_for_asset(&asset.checksum.0)); + } + + #[test] + fn scan_idempotent_modulo_timestamp() { + // Same filesystem state → identical Vec *modulo* + // discovered_at. Strip that field and compare. + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::create_dir_all(root.join("notes")).unwrap(); + std::fs::write(root.join("notes/a.md"), b"alpha").unwrap(); + std::fs::write(root.join("notes/b.md"), b"beta").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let v1 = conn.scan(&SourceScope::default()).unwrap(); + let v2 = conn.scan(&SourceScope::default()).unwrap(); + assert_eq!(v1.len(), v2.len()); + for (a, b) in v1.iter().zip(v2.iter()) { + assert_eq!(a.asset_id, b.asset_id); + assert_eq!(a.workspace_path, b.workspace_path); + assert_eq!(a.checksum, b.checksum); + assert_eq!(a.byte_len, b.byte_len); + assert_eq!(a.media_type, b.media_type); + assert_eq!(a.source_uri, b.source_uri); + assert_eq!(a.stored, b.stored); + // discovered_at intentionally NOT compared + } + } + + #[test] + fn scan_emits_posix_normalized_paths() { + // End-to-end: the connector must produce POSIX-normalized + // workspace paths via `kb_core::to_posix`. We can't construct an + // input with literal `./` / `//` segments via the filesystem (the + // OS won't let us), so instead we assert the resulting strings + // are already POSIX-clean (no leading `./`, no `//`, forward + // slashes only) — which is the post-conditions side of the + // round-trip the unit tests in `kb-core::normalize` cover. + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::create_dir_all(root.join("a/b/c")).unwrap(); + std::fs::write(root.join("a/b/c/d.md"), b"x").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let v = conn.scan(&SourceScope::default()).unwrap(); + assert_eq!(v.len(), 1); + let p = &v[0].workspace_path.0; + assert_eq!(p, "a/b/c/d.md"); + assert!(!p.starts_with("./")); + assert!(!p.contains("//")); + assert!(!p.contains('\\')); + } + + #[test] + fn scan_skips_files_whose_name_contains_hash() { + // `WorkspacePath` rejects `#` (collides with the W3C-Media-Fragments + // separator used by `Citation`). The connector must drop such + // files with a warning rather than aborting the scan. + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::write(root.join("ok.md"), b"x").unwrap(); + std::fs::write(root.join("has#hash.md"), b"y").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let v = conn.scan(&SourceScope::default()).unwrap(); + let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect(); + assert_eq!(names, vec!["ok.md".to_string()]); + } + + #[test] + fn copy_vs_reference_threshold_signals_intent() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::write(root.join("small.md"), b"hi").unwrap(); + + let mut cfg = cfg_with_root(root.to_str().unwrap()); + // Threshold = 0 MiB ⇒ even a 2-byte file becomes Reference. + cfg.storage.copy_threshold_mb = 0; + let conn = FsSourceConnector::new(&cfg).unwrap(); + let v = conn.scan(&SourceScope::default()).unwrap(); + assert_eq!(v.len(), 1); + match &v[0].stored { + AssetStorage::Reference { sha, .. } => { + assert_eq!(sha, &v[0].checksum); + } + other => panic!("expected Reference, got {other:?}"), + } + + // Threshold high (default 100 MiB) ⇒ Copied. + let mut cfg2 = cfg_with_root(root.to_str().unwrap()); + cfg2.storage.copy_threshold_mb = 100; + let conn2 = FsSourceConnector::new(&cfg2).unwrap(); + let v2 = conn2.scan(&SourceScope::default()).unwrap(); + assert!(matches!(v2[0].stored, AssetStorage::Copied { .. })); + } +} diff --git a/crates/kb-source-fs/src/hash.rs b/crates/kb-source-fs/src/hash.rs new file mode 100644 index 0000000..b2b40a1 --- /dev/null +++ b/crates/kb-source-fs/src/hash.rs @@ -0,0 +1,92 @@ +//! Streaming BLAKE3 over a file path. Per task spec, files MUST NOT be +//! loaded fully into memory: `blake3::Hasher::update_reader` reads through a +//! 64 KiB internal buffer, which keeps memory bounded for any size of file. +//! +//! Returns `(byte_len, full_hex)`: +//! - `byte_len` is the total bytes hashed (== file size after follow). +//! - `full_hex` is the canonical lowercase hex (64 chars) of the full +//! blake3 digest. The `kb-core::Checksum` invariant is "full hex"; the +//! 32-char prefix is reserved for `AssetId` derivation via +//! `kb_core::id_for_asset`. + +use std::fs::File; +use std::io::{self, Read}; +use std::path::Path; + +use anyhow::{Context, Result}; + +const READ_BUFFER_BYTES: usize = 64 * 1024; + +/// Stream-hash a file with blake3. Returns `(byte_len, full_hex_64)`. +/// +/// `byte_len` is computed during streaming so callers do not need a separate +/// `metadata().len()` call (which can disagree with hashed bytes if the file +/// is rewritten mid-scan, but blake3-of-stream is the source of truth for +/// `RawAsset.checksum`). +pub(crate) fn hash_file(path: &Path) -> Result<(u64, String)> { + let file = File::open(path) + .with_context(|| format!("failed to open {} for hashing", path.display()))?; + hash_reader(file).with_context(|| format!("failed to hash {}", path.display())) +} + +fn hash_reader(mut reader: R) -> Result<(u64, String)> { + let mut hasher = blake3::Hasher::new(); + let mut buf = vec![0u8; READ_BUFFER_BYTES]; + let mut total: u64 = 0; + loop { + match reader.read(&mut buf) { + Ok(0) => break, + Ok(n) => { + hasher.update(&buf[..n]); + total = total.saturating_add(n as u64); + } + Err(e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => return Err(e.into()), + } + } + Ok((total, hasher.finalize().to_hex().to_string())) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// blake3 of the empty input is the well-known "official empty hash" + /// from the blake3 spec. Pinned so that swapping the hash crate or the + /// streaming implementation can never silently produce a different + /// digest for a known input. + #[test] + fn empty_blake3_pinned() { + let (n, hex) = hash_reader(std::io::empty()).unwrap(); + assert_eq!(n, 0); + assert_eq!( + hex, + "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262" + ); + } + + /// `b"hello world"` blake3 (full 64 hex). Computed independently with + /// `b3sum`; pinning here detects any drift in the streaming pipeline. + #[test] + fn known_bytes_blake3_pinned() { + let bytes = b"hello world"; + let (n, hex) = hash_reader(&bytes[..]).unwrap(); + assert_eq!(n, 11); + assert_eq!( + hex, + "d74981efa70a0c880b8d8c1985d075dbcbf679b99a5f9914e5aaf96b831a9e24" + ); + } + + /// Streaming a buffer larger than `READ_BUFFER_BYTES` must produce the + /// same digest as a single-shot blake3 over the same bytes — i.e. the + /// chunk boundary is invisible. + #[test] + fn streaming_matches_oneshot_over_buffer_boundary() { + let bytes: Vec = (0u8..=255u8).cycle().take(READ_BUFFER_BYTES * 3 + 17).collect(); + let (n, streamed) = hash_reader(&bytes[..]).unwrap(); + assert_eq!(n, bytes.len() as u64); + let oneshot = blake3::hash(&bytes).to_hex().to_string(); + assert_eq!(streamed, oneshot); + } +} diff --git a/crates/kb-source-fs/src/lib.rs b/crates/kb-source-fs/src/lib.rs new file mode 100644 index 0000000..ae6b128 --- /dev/null +++ b/crates/kb-source-fs/src/lib.rs @@ -0,0 +1,16 @@ +//! `kb-source-fs` — local filesystem `SourceConnector`. +//! +//! Walks `config.workspace.root`, applies gitignore-style filters from +//! `config.workspace.exclude` ∪ `.kbignore`, computes BLAKE3 of every file, +//! and emits `Vec` sorted by `workspace_path` for determinism. +//! +//! Per design §3.3 (RawAsset), §6.2 (workspace + .kbignore), §6.6 (POSIX +//! normalization), §7.1 (SourceScope), §7.2 (SourceConnector), §8 (module +//! boundaries). + +mod connector; +mod hash; +mod media; +mod walker; + +pub use connector::FsSourceConnector; diff --git a/crates/kb-source-fs/src/media.rs b/crates/kb-source-fs/src/media.rs new file mode 100644 index 0000000..ecaf6e3 --- /dev/null +++ b/crates/kb-source-fs/src/media.rs @@ -0,0 +1,85 @@ +//! Media-type detection by extension. Per P1-1 task spec we do NOT do +//! libmagic-style sniffing; extension is enough for P1. Unknown / missing +//! extensions fall through to `MediaType::Other(ext.to_string())` (empty +//! string when the file has no extension at all). + +use std::path::Path; + +use kb_core::{AudioType, ImageType, MediaType}; + +/// Return `MediaType` for `path` based purely on its lowercased extension. +/// `.md` → Markdown, `.pdf` → Pdf, image and audio extensions map onto +/// `MediaType::Image(_)` / `MediaType::Audio(_)`. Anything else (including +/// missing extension) → `MediaType::Other(ext)`. +pub(crate) fn media_type_for(path: &Path) -> MediaType { + let ext = path + .extension() + .and_then(|s| s.to_str()) + .map(|s| s.to_ascii_lowercase()) + .unwrap_or_default(); + + match ext.as_str() { + "md" => MediaType::Markdown, + "pdf" => MediaType::Pdf, + + "png" => MediaType::Image(ImageType::Png), + "jpg" | "jpeg" => MediaType::Image(ImageType::Jpeg), + "webp" => MediaType::Image(ImageType::Webp), + "gif" => MediaType::Image(ImageType::Gif), + "tiff" | "tif" => MediaType::Image(ImageType::Tiff), + + "m4a" => MediaType::Audio(AudioType::M4a), + "mp3" => MediaType::Audio(AudioType::Mp3), + "wav" => MediaType::Audio(AudioType::Wav), + "flac" => MediaType::Audio(AudioType::Flac), + "ogg" => MediaType::Audio(AudioType::Ogg), + + // Empty string (no extension) and any other extension: bucket as + // Other and let downstream extractors decide if they support it. + _ => MediaType::Other(ext), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn markdown_and_pdf() { + assert_eq!(media_type_for(Path::new("a/b.md")), MediaType::Markdown); + assert_eq!(media_type_for(Path::new("a/b.MD")), MediaType::Markdown); + assert_eq!(media_type_for(Path::new("a/b.pdf")), MediaType::Pdf); + } + + #[test] + fn images_and_audio() { + assert_eq!( + media_type_for(Path::new("p.jpg")), + MediaType::Image(ImageType::Jpeg) + ); + assert_eq!( + media_type_for(Path::new("p.JPEG")), + MediaType::Image(ImageType::Jpeg) + ); + assert_eq!( + media_type_for(Path::new("a.M4A")), + MediaType::Audio(AudioType::M4a) + ); + assert_eq!( + media_type_for(Path::new("a.flac")), + MediaType::Audio(AudioType::Flac) + ); + } + + #[test] + fn unknown_and_missing_extension() { + assert_eq!( + media_type_for(Path::new("notes/x.weird")), + MediaType::Other("weird".to_string()) + ); + assert_eq!( + media_type_for(Path::new("README")), + MediaType::Other(String::new()) + ); + } +} diff --git a/crates/kb-source-fs/src/walker.rs b/crates/kb-source-fs/src/walker.rs new file mode 100644 index 0000000..95b6b3d --- /dev/null +++ b/crates/kb-source-fs/src/walker.rs @@ -0,0 +1,247 @@ +//! Directory walker with gitignore-style filtering and symlink-cycle +//! protection. +//! +//! Filter set (per task spec, design §6.2): +//! - `config.workspace.exclude` (passed in by `FsSourceConnector`) +//! - `/.kbignore` (optional file at workspace root) +//! - default-excludes for `.DS_Store` and macOS resource forks (`._*`) +//! +//! All three are merged via `ignore::overrides::OverrideBuilder`, which +//! gives full gitignore semantics (anchors, `!` negation, `**`, etc.). We +//! prepend `!` to each pattern because `OverrideBuilder` treats positive +//! patterns as "include" and negative as "exclude" — see §"Filter set" +//! comment in `build_walker` for the full reasoning. +//! +//! Symlink handling: we want to follow links (so a workspace using a +//! symlinked `notes/` directory works), but we must NOT loop forever on +//! `a -> b -> a`. `walkdir` does NOT detect cycles for us when +//! `follow_links(true)`; we layer our own visited-set on top, keyed by the +//! canonical path of every entry, and skip any entry we've already seen. + +use std::collections::HashSet; +use std::path::{Path, PathBuf}; + +use anyhow::{Context, Result}; +use ignore::overrides::{Override, OverrideBuilder}; +use walkdir::{DirEntry, WalkDir}; + +/// Default-excludes baked into the connector. These are NOT configurable; +/// they cover noise that is never useful to ingest and would otherwise need +/// to appear in every user's `.kbignore`. +const DEFAULT_EXCLUDES: &[&str] = &[ + // Finder metadata + ".DS_Store", + "**/.DS_Store", + // macOS resource forks (AppleDouble files) + "._*", + "**/._*", +]; + +/// Build the merged `Override` from `config.workspace.exclude` ∪ `.kbignore` +/// ∪ baked-in default excludes. +/// +/// Each input pattern is registered as an *exclude* (gitignore-style: a +/// leading `!` flips a positive match to a negative one in the +/// `OverrideBuilder` API). Order doesn't matter — the union is computed by +/// the underlying gitignore engine. +pub(crate) fn build_overrides( + root: &Path, + config_exclude: &[String], + kbignore_patterns: &[String], +) -> Result { + let mut builder = OverrideBuilder::new(root); + + for pat in DEFAULT_EXCLUDES { + builder + .add(&format!("!{pat}")) + .with_context(|| format!("invalid default-exclude pattern: {pat}"))?; + } + for pat in config_exclude { + builder + .add(&format!("!{pat}")) + .with_context(|| format!("invalid workspace.exclude pattern: {pat}"))?; + } + for pat in kbignore_patterns { + builder + .add(&format!("!{pat}")) + .with_context(|| format!("invalid .kbignore pattern: {pat}"))?; + } + + builder.build().context("failed to compile override set") +} + +/// Read `/.kbignore` if it exists. Each non-blank, non-comment line is +/// a gitignore pattern. Missing file → empty Vec (not an error). +pub(crate) fn read_kbignore(root: &Path) -> Result> { + let path = root.join(".kbignore"); + if !path.exists() { + return Ok(Vec::new()); + } + let text = std::fs::read_to_string(&path) + .with_context(|| format!("failed to read {}", path.display()))?; + Ok(text + .lines() + .map(|l| l.trim()) + .filter(|l| !l.is_empty() && !l.starts_with('#')) + .map(|l| l.to_string()) + .collect()) +} + +/// Iterate every regular file under `root`, applying `overrides` and +/// detecting symlink cycles. Returns absolute file paths. +/// +/// Strategy: +/// - `walkdir::WalkDir::follow_links(true)` to traverse symlinks. +/// - Maintain `visited: HashSet` of *canonical* paths. Before +/// descending into a directory entry, canonicalize and check the set; +/// if already present, skip. This breaks `a -> b -> a` cycles in O(n) +/// per entry without a custom recursive walker. +/// - For each yielded entry, ask `overrides` whether it is excluded; if +/// so, drop it. If the entry is a directory, also short-circuit +/// `WalkDir`'s descent via `it.skip_current_dir()`. +pub(crate) fn walk_files(root: &Path, overrides: &Override) -> Result> { + let mut out = Vec::new(); + let mut visited: HashSet = HashSet::new(); + + let walker = WalkDir::new(root).follow_links(true).into_iter(); + let mut it = walker.filter_entry(|e| !is_excluded(e, root, overrides)); + + while let Some(res) = it.next() { + let entry = match res { + Ok(e) => e, + Err(err) => { + // `walkdir` surfaces I/O errors AND its own cycle detector + // (when follow_links is on it sometimes catches them). + // Either way: log and skip; do not abort the whole scan. + tracing::warn!(error = %err, "walkdir entry error; skipping"); + continue; + } + }; + + let path = entry.path(); + + // Cycle guard: only canonicalize symlinks (cheap on the common case + // of plain files/dirs) and on directories that are followed via a + // symlink. `walkdir`'s `path_is_symlink()` is true when the entry's + // *original* path is a symlink (it returns true for the link, not + // for the resolved target). For non-symlinked directories we still + // record the canonical path so a *later* symlink that points back + // to one of them is detected. + if entry.file_type().is_dir() { + match std::fs::canonicalize(path) { + Ok(canon) => { + if !visited.insert(canon) { + // Already visited via another path → break cycle. + it.skip_current_dir(); + continue; + } + } + Err(_) => { + // Broken symlink etc. — skip silently. + continue; + } + } + } + + if entry.file_type().is_file() { + out.push(path.to_path_buf()); + } + } + + Ok(out) +} + +fn is_excluded(entry: &DirEntry, root: &Path, overrides: &Override) -> bool { + // `Override::matched(path, is_dir)` uses the path *relative to* the + // override builder's root. `walkdir` gives absolute paths when + // `WalkDir::new` was given an absolute path — strip the root prefix + // before consulting the override. + let rel = match entry.path().strip_prefix(root) { + Ok(p) => p, + Err(_) => entry.path(), + }; + overrides + .matched(rel, entry.file_type().is_dir()) + .is_ignore() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn empty_inputs_compile_into_an_override() { + let dir = tempfile::tempdir().unwrap(); + let ov = build_overrides(dir.path(), &[], &[]).unwrap(); + // Default-excludes only; non-special files should not match. + let m = ov.matched(Path::new("notes/alpha.md"), false); + assert!(!m.is_ignore()); + } + + #[test] + fn default_excludes_ds_store_and_resource_forks() { + let dir = tempfile::tempdir().unwrap(); + let ov = build_overrides(dir.path(), &[], &[]).unwrap(); + assert!(ov.matched(Path::new(".DS_Store"), false).is_ignore()); + assert!( + ov.matched(Path::new("notes/.DS_Store"), false).is_ignore() + ); + assert!(ov.matched(Path::new("._foo.md"), false).is_ignore()); + assert!( + ov.matched(Path::new("notes/._sidecar"), false).is_ignore() + ); + } + + #[test] + fn config_exclude_filters_tmp_and_node_modules() { + let dir = tempfile::tempdir().unwrap(); + let ov = build_overrides( + dir.path(), + &["*.tmp".to_string(), "node_modules/**".to_string()], + &[], + ) + .unwrap(); + assert!(ov.matched(Path::new("a.tmp"), false).is_ignore()); + assert!(ov.matched(Path::new("notes/x.tmp"), false).is_ignore()); + assert!( + ov.matched(Path::new("node_modules/foo/bar.js"), false) + .is_ignore() + ); + assert!(!ov.matched(Path::new("alpha.md"), false).is_ignore()); + } + + #[test] + fn kbignore_union_with_config_exclude() { + // "either set excluding it ⇒ excluded" + let dir = tempfile::tempdir().unwrap(); + let ov = build_overrides( + dir.path(), + &["*.tmp".to_string()], + &["secret/**".to_string()], + ) + .unwrap(); + assert!(ov.matched(Path::new("a.tmp"), false).is_ignore()); + assert!( + ov.matched(Path::new("secret/key.md"), false).is_ignore() + ); + } + + #[test] + fn read_kbignore_missing_returns_empty() { + let dir = tempfile::tempdir().unwrap(); + let v = read_kbignore(dir.path()).unwrap(); + assert!(v.is_empty()); + } + + #[test] + fn read_kbignore_strips_blanks_and_comments() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write( + dir.path().join(".kbignore"), + "# comment\n*.tmp\n\nignored/**\n", + ) + .unwrap(); + let v = read_kbignore(dir.path()).unwrap(); + assert_eq!(v, vec!["*.tmp".to_string(), "ignored/**".to_string()]); + } +} -- 2.49.1 From 3d4c485415d34bb66da87493d5a5ffccaf44c857 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 12:27:44 +0000 Subject: [PATCH 2/5] =?UTF-8?q?p1-1:=20integration=20test=20=E2=80=94=20sy?= =?UTF-8?q?mlink=20cycles=20do=20not=20loop?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two cases: - root/notes -> root (single-link cycle through workspace root). - root/a -> b, root/b -> a (two-step cycle of dangling symlinks). Both must complete in O(seconds) and surface alpha.md exactly once. Proves walker::walk_files's visited-set guard catches realistic cycle shapes via the public SourceConnector API. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kb-source-fs/tests/symlink_cycle.rs | 82 ++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 crates/kb-source-fs/tests/symlink_cycle.rs diff --git a/crates/kb-source-fs/tests/symlink_cycle.rs b/crates/kb-source-fs/tests/symlink_cycle.rs new file mode 100644 index 0000000..2366549 --- /dev/null +++ b/crates/kb-source-fs/tests/symlink_cycle.rs @@ -0,0 +1,82 @@ +//! Integration test: a `notes/` symlink whose target points back at the +//! workspace root MUST NOT cause `scan` to loop forever or panic. +//! +//! Layout (built per-test in a tempdir): +//! root/ +//! ├── alpha.md +//! ├── notes/ (symlink → root) ← cycle: root → notes → root → … +//! +//! Expected: `scan` returns in O(seconds), every emitted path is unique, +//! and `alpha.md` appears at least once. +//! +//! The cycle guard lives in `walker::walk_files`; this test exists to +//! prove it catches the realistic shape (cycle through one or more +//! symlinks) end-to-end via the public API. + +#![cfg(unix)] + +use std::os::unix::fs::symlink; + +use kb_config::Config; +use kb_core::{SourceConnector, SourceScope}; +use kb_source_fs::FsSourceConnector; + +fn cfg_with_root(root: &str) -> Config { + let mut c = Config::defaults(); + c.workspace.root = root.to_string(); + c.workspace.exclude.clear(); + c +} + +#[test] +fn symlink_cycle_does_not_loop_or_crash() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + + std::fs::write(root.join("alpha.md"), b"alpha").unwrap(); + // Symlink: root/notes → root (a → a cycle through the link `notes`). + symlink(root, root.join("notes")).unwrap(); + + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .expect("connector init"); + let v = conn + .scan(&SourceScope::default()) + .expect("scan must return, not loop"); + + // Determinism check: no duplicate workspace paths. + let mut seen = std::collections::HashSet::new(); + for asset in &v { + assert!( + seen.insert(asset.workspace_path.0.clone()), + "duplicate workspace_path: {}", + asset.workspace_path.0 + ); + } + // The original alpha.md must appear. + assert!( + v.iter().any(|a| a.workspace_path.0 == "alpha.md"), + "expected alpha.md in scan output, got: {:?}", + v.iter().map(|a| &a.workspace_path.0).collect::>() + ); +} + +#[test] +fn two_step_symlink_cycle_does_not_loop() { + // root/ + // ├── alpha.md + // ├── a → b + // └── b → a + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::write(root.join("alpha.md"), b"alpha").unwrap(); + symlink(root.join("b"), root.join("a")).unwrap(); + symlink(root.join("a"), root.join("b")).unwrap(); + + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .expect("connector init"); + // Even though a→b→a never resolves to a real directory (broken + // pseudo-cycle of dangling symlinks), the scan must complete and + // surface alpha.md. + let v = conn.scan(&SourceScope::default()).expect("scan must return"); + assert!(v.iter().any(|a| a.workspace_path.0 == "alpha.md")); +} -- 2.49.1 From 0699220d5d9b004719c29d5460e6b229e68161d1 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 12:28:00 +0000 Subject: [PATCH 3/5] p1-1: tree-1 fixture + snapshot/determinism tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fixtures/source-fs/tree-1/: README.md notes/alpha.md notes/beta.md ignored/skip.tmp (excluded by .kbignore *.tmp) .kbignore ("*.tmp") .DS_Store (implicitly excluded by FsSourceConnector) The committed baseline (tree-1.snapshot.json) has discovered_at, source_uri.value, and stored.path replaced with "" so the JSON is portable across checkout locations and CI runs. The test applies the same stripping to scan output before comparing. The determinism test runs scan twice and asserts byte-identical serialized JSON (post-strip) — same filesystem state must yield the same Vec. Regenerate baseline with `KB_REGEN_SNAPSHOT=1 cargo test -p kb-source-fs --test snapshot_tree1 -- tree_1_snapshot_matches_baseline`. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kb-source-fs/tests/snapshot_tree1.rs | 139 ++++++++++++++++++++ fixtures/source-fs/tree-1.snapshot.json | 68 ++++++++++ fixtures/source-fs/tree-1/.DS_Store | 1 + fixtures/source-fs/tree-1/.kbignore | 1 + fixtures/source-fs/tree-1/README.md | 5 + fixtures/source-fs/tree-1/ignored/skip.tmp | 1 + fixtures/source-fs/tree-1/notes/alpha.md | 1 + fixtures/source-fs/tree-1/notes/beta.md | 1 + 8 files changed, 217 insertions(+) create mode 100644 crates/kb-source-fs/tests/snapshot_tree1.rs create mode 100644 fixtures/source-fs/tree-1.snapshot.json create mode 100644 fixtures/source-fs/tree-1/.DS_Store create mode 100644 fixtures/source-fs/tree-1/.kbignore create mode 100644 fixtures/source-fs/tree-1/README.md create mode 100644 fixtures/source-fs/tree-1/ignored/skip.tmp create mode 100644 fixtures/source-fs/tree-1/notes/alpha.md create mode 100644 fixtures/source-fs/tree-1/notes/beta.md diff --git a/crates/kb-source-fs/tests/snapshot_tree1.rs b/crates/kb-source-fs/tests/snapshot_tree1.rs new file mode 100644 index 0000000..6eb0ed9 --- /dev/null +++ b/crates/kb-source-fs/tests/snapshot_tree1.rs @@ -0,0 +1,139 @@ +//! Snapshot + determinism tests against `fixtures/source-fs/tree-1`. +//! +//! Layout (committed under `/fixtures/source-fs/tree-1/`): +//! +//! ``` +//! tree-1/ +//! ├── README.md +//! ├── notes/ +//! │ ├── alpha.md +//! │ └── beta.md +//! ├── ignored/ +//! │ └── skip.tmp # excluded by .kbignore +//! ├── .kbignore # contains: *.tmp +//! └── .DS_Store # implicitly excluded +//! ``` +//! +//! Two assertions: +//! 1. Snapshot stability — `scan` output (with `discovered_at` stripped) +//! matches the committed baseline JSON byte-for-byte. +//! 2. Determinism — running `scan` twice produces byte-identical JSON +//! after stripping `discovered_at`. +//! +//! `discovered_at` is wall-clock and intentionally NOT part of the +//! contract: the task spec says strip it before comparison. + +use std::path::PathBuf; + +use kb_config::Config; +use kb_core::{SourceConnector, SourceScope}; +use kb_source_fs::FsSourceConnector; +use serde_json::Value; + +/// Repo root, derived from `CARGO_MANIFEST_DIR` (= `crates/kb-source-fs`). +fn repo_root() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() + .parent() + .unwrap() + .to_path_buf() +} + +fn fixture_root() -> PathBuf { + repo_root().join("fixtures/source-fs/tree-1") +} + +fn baseline_path() -> PathBuf { + repo_root().join("fixtures/source-fs/tree-1.snapshot.json") +} + +fn cfg_for_fixture(root: &str) -> Config { + let mut c = Config::defaults(); + c.workspace.root = root.to_string(); + // Clear default excludes (`.git/**`, `node_modules/**`, `.obsidian/**`) + // so the snapshot is purely a function of the fixture + .kbignore + + // baked-in default-excludes. + c.workspace.exclude.clear(); + c +} + +/// Run `scan` against the fixture and return the JSON value with every +/// `discovered_at` field replaced by the literal string "". +/// Also strip `source_uri.value` and `stored.path` because they contain +/// absolute paths that vary by checkout location — the snapshot must be +/// portable across machines and CI checkout dirs. +fn scan_and_strip() -> Value { + let root = fixture_root(); + let cfg = cfg_for_fixture(root.to_str().unwrap()); + let conn = FsSourceConnector::new(&cfg).expect("connector init"); + let assets = conn + .scan(&SourceScope::default()) + .expect("scan must succeed against committed fixture"); + + let mut v = serde_json::to_value(&assets).expect("serialize"); + if let Value::Array(items) = &mut v { + for item in items { + if let Value::Object(map) = item { + map.insert( + "discovered_at".to_string(), + Value::String("".to_string()), + ); + // source_uri = { kind: "file", value: "" } — strip value. + if let Some(Value::Object(s)) = map.get_mut("source_uri") { + if s.contains_key("value") { + s.insert("value".to_string(), Value::String("".to_string())); + } + } + // stored = { kind: "copied"|"reference", path: "", ... } — strip path. + if let Some(Value::Object(s)) = map.get_mut("stored") { + if s.contains_key("path") { + s.insert("path".to_string(), Value::String("".to_string())); + } + } + } + } + } + v +} + +#[test] +fn tree_1_snapshot_matches_baseline() { + let actual = scan_and_strip(); + + // If KB_REGEN_SNAPSHOT is set, (re)write the baseline and exit + // *before* attempting to read it. This is the only path that may + // create the file from scratch. + if std::env::var_os("KB_REGEN_SNAPSHOT").is_some() { + let pretty = serde_json::to_string_pretty(&actual).unwrap() + "\n"; + std::fs::write(baseline_path(), pretty).expect("write baseline"); + panic!("regenerated baseline; rerun without KB_REGEN_SNAPSHOT to verify"); + } + + let baseline_text = std::fs::read_to_string(baseline_path()).unwrap_or_else(|_| { + panic!( + "missing baseline at {} — regenerate via `KB_REGEN_SNAPSHOT=1 cargo test \ + -p kb-source-fs --test snapshot_tree1 -- tree_1_snapshot_matches_baseline`", + baseline_path().display() + ) + }); + let expected: Value = serde_json::from_str(&baseline_text) + .expect("baseline JSON must parse"); + + if actual != expected { + let actual_pretty = serde_json::to_string_pretty(&actual).unwrap(); + let expected_pretty = serde_json::to_string_pretty(&expected).unwrap(); + panic!( + "snapshot drift.\n--- expected ---\n{expected_pretty}\n--- actual ---\n{actual_pretty}\n" + ); + } +} + +#[test] +fn tree_1_scan_is_deterministic() { + let v1 = scan_and_strip(); + let v2 = scan_and_strip(); + let s1 = serde_json::to_string(&v1).unwrap(); + let s2 = serde_json::to_string(&v2).unwrap(); + assert_eq!(s1, s2, "two consecutive scans diverged"); +} diff --git a/fixtures/source-fs/tree-1.snapshot.json b/fixtures/source-fs/tree-1.snapshot.json new file mode 100644 index 0000000..350d53c --- /dev/null +++ b/fixtures/source-fs/tree-1.snapshot.json @@ -0,0 +1,68 @@ +[ + { + "asset_id": "bd6e5649e546d6ac94c3269ffe7192c5", + "byte_len": 6, + "checksum": "f6b71def043f1fd92f2d34969a7272a9d134730551de8c9754c4be79fbc0aef3", + "discovered_at": "", + "media_type": { + "other": "" + }, + "source_uri": { + "kind": "file", + "value": "" + }, + "stored": { + "kind": "copied", + "path": "" + }, + "workspace_path": ".kbignore" + }, + { + "asset_id": "ba6cd31cab86eff7a86638ee76494bcf", + "byte_len": 169, + "checksum": "b0124489083674f6ad99a57ee5fc425feb71754a538a97a1ab580e8eb9b1f1c1", + "discovered_at": "", + "media_type": "markdown", + "source_uri": { + "kind": "file", + "value": "" + }, + "stored": { + "kind": "copied", + "path": "" + }, + "workspace_path": "README.md" + }, + { + "asset_id": "3381fcc34cf9415a391ba6b0dc6037c5", + "byte_len": 11, + "checksum": "e9fa9a5e0725d7bf6ec9d1565d3921eb6b62aa7f0db40c1c3ffebda7475d4258", + "discovered_at": "", + "media_type": "markdown", + "source_uri": { + "kind": "file", + "value": "" + }, + "stored": { + "kind": "copied", + "path": "" + }, + "workspace_path": "notes/alpha.md" + }, + { + "asset_id": "e300aa98aec843d2df1dd8f43702b257", + "byte_len": 10, + "checksum": "3e4df2f43563730d61672ce67a9bf479bc7c7a2f1384e2081d52e06f143353ed", + "discovered_at": "", + "media_type": "markdown", + "source_uri": { + "kind": "file", + "value": "" + }, + "stored": { + "kind": "copied", + "path": "" + }, + "workspace_path": "notes/beta.md" + } +] diff --git a/fixtures/source-fs/tree-1/.DS_Store b/fixtures/source-fs/tree-1/.DS_Store new file mode 100644 index 0000000..c24ae00 --- /dev/null +++ b/fixtures/source-fs/tree-1/.DS_Store @@ -0,0 +1 @@ +macOS Finder metadata placeholder. Implicitly excluded by FsSourceConnector. diff --git a/fixtures/source-fs/tree-1/.kbignore b/fixtures/source-fs/tree-1/.kbignore new file mode 100644 index 0000000..1944fd6 --- /dev/null +++ b/fixtures/source-fs/tree-1/.kbignore @@ -0,0 +1 @@ +*.tmp diff --git a/fixtures/source-fs/tree-1/README.md b/fixtures/source-fs/tree-1/README.md new file mode 100644 index 0000000..1948784 --- /dev/null +++ b/fixtures/source-fs/tree-1/README.md @@ -0,0 +1,5 @@ +# tree-1 + +Fixture for `kb-source-fs` snapshot tests. Contents are intentionally tiny +and stable — bumping a byte here will require regenerating the snapshot +baseline. diff --git a/fixtures/source-fs/tree-1/ignored/skip.tmp b/fixtures/source-fs/tree-1/ignored/skip.tmp new file mode 100644 index 0000000..73cdccd --- /dev/null +++ b/fixtures/source-fs/tree-1/ignored/skip.tmp @@ -0,0 +1 @@ +should be excluded by .kbignore diff --git a/fixtures/source-fs/tree-1/notes/alpha.md b/fixtures/source-fs/tree-1/notes/alpha.md new file mode 100644 index 0000000..b01a892 --- /dev/null +++ b/fixtures/source-fs/tree-1/notes/alpha.md @@ -0,0 +1 @@ +alpha note diff --git a/fixtures/source-fs/tree-1/notes/beta.md b/fixtures/source-fs/tree-1/notes/beta.md new file mode 100644 index 0000000..e689c5e --- /dev/null +++ b/fixtures/source-fs/tree-1/notes/beta.md @@ -0,0 +1 @@ +beta note -- 2.49.1 From f8d00bdaf67bda60a91ed0442f789e6c17d640f7 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 12:42:12 +0000 Subject: [PATCH 4/5] p1-1: address review (AssetStorage doc-comment, real directory-cycle test) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Document AssetStorage::Copied / Reference path semantics so P1-6 (asset writer) knows that at scan time `Copied.path` is the SOURCE path and the writer is responsible for both copying bytes AND overwriting `path` with the destination. - Rename the dangling-symlink test to make its scope explicit (`dangling_symlink_pseudo_cycle_does_not_crash`); the prior name implied a real two-step directory cycle but the targets were broken links. - Add `two_step_directory_cycle_visited_set_breaks_loop`: builds `a/loop -> ../b` and `b/loop -> ../a` over real directories with real files, asserting scan terminates with a finite, deterministic asset list — exercises the canonical-path visited-set in walker::walk_files. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kb-core/src/asset.rs | 12 ++++ crates/kb-source-fs/tests/symlink_cycle.rs | 84 +++++++++++++++++++++- 2 files changed, 93 insertions(+), 3 deletions(-) diff --git a/crates/kb-core/src/asset.rs b/crates/kb-core/src/asset.rs index 8532175..6078219 100644 --- a/crates/kb-core/src/asset.rs +++ b/crates/kb-core/src/asset.rs @@ -40,6 +40,18 @@ impl WorkspacePath { } } +/// On-disk storage decision for a `RawAsset`. +/// +/// **Important convention** — `path` field semantics differ by variant: +/// +/// - `Copied { path }`: at scan time, `path` is the **source** path on the +/// user's filesystem. The asset writer (P1-6) is responsible for actually +/// copying the bytes into the workspace asset store, AND for overwriting +/// `path` with the destination path after the copy completes. +/// +/// - `Reference { path, sha }`: `path` is always the **source** path. No +/// bytes are ever copied; downstream readers stream from `path` directly. +/// `sha` is the BLAKE3 full hex (matches `RawAsset::checksum`). #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "lowercase", tag = "kind")] pub enum AssetStorage { diff --git a/crates/kb-source-fs/tests/symlink_cycle.rs b/crates/kb-source-fs/tests/symlink_cycle.rs index 2366549..bcd00c4 100644 --- a/crates/kb-source-fs/tests/symlink_cycle.rs +++ b/crates/kb-source-fs/tests/symlink_cycle.rs @@ -61,11 +61,17 @@ fn symlink_cycle_does_not_loop_or_crash() { } #[test] -fn two_step_symlink_cycle_does_not_loop() { +fn dangling_symlink_pseudo_cycle_does_not_crash() { // root/ // ├── alpha.md - // ├── a → b - // └── b → a + // ├── a → b (b does not exist as a real file/dir) + // └── b → a (a does not exist as a real file/dir) + // + // Both symlinks are dangling — neither resolves to anything. This is + // NOT a real two-step directory cycle (see + // `two_step_directory_cycle_visited_set_breaks_loop` for that case); + // it merely verifies the scan tolerates broken-link pseudo-cycles + // without crashing or looping. let dir = tempfile::tempdir().unwrap(); let root = dir.path(); std::fs::write(root.join("alpha.md"), b"alpha").unwrap(); @@ -80,3 +86,75 @@ fn two_step_symlink_cycle_does_not_loop() { let v = conn.scan(&SourceScope::default()).expect("scan must return"); assert!(v.iter().any(|a| a.workspace_path.0 == "alpha.md")); } + +#[test] +fn two_step_directory_cycle_visited_set_breaks_loop() { + // Real two-step directory cycle through symlinks: + // root/ + // ├── a/ + // │ ├── inside_a.md + // │ └── loop → ../b (symlink, target IS a real directory) + // └── b/ + // ├── inside_b.md + // └── loop → ../a (symlink, target IS a real directory) + // + // Without the visited-set, walkdir would descend + // a → a/loop (=b) → a/loop/loop (=a) → … forever. + // The canonical-path visited-set in `walker::walk_files` must break + // the loop and yield a finite, deterministic result. + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::create_dir(root.join("a")).unwrap(); + std::fs::create_dir(root.join("b")).unwrap(); + std::fs::write(root.join("a/inside_a.md"), b"a-content").unwrap(); + std::fs::write(root.join("b/inside_b.md"), b"b-content").unwrap(); + // Use relative targets so the symlink truly points at the sibling + // directory regardless of where the tempdir lives. + symlink("../b", root.join("a/loop")).unwrap(); + symlink("../a", root.join("b/loop")).unwrap(); + + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .expect("connector init"); + + // Run scan twice — both must terminate AND produce identical + // workspace_path lists (visited-set is deterministic per scan). + let v1 = conn.scan(&SourceScope::default()).expect("scan must return"); + let v2 = conn.scan(&SourceScope::default()).expect("scan must return"); + + let names1: Vec = v1.iter().map(|a| a.workspace_path.0.clone()).collect(); + let names2: Vec = v2.iter().map(|a| a.workspace_path.0.clone()).collect(); + assert_eq!(names1, names2, "scan must be deterministic across runs"); + + // No duplicate workspace paths (visited-set should suppress + // re-emission of the same canonical file via the cycle). + let mut seen = std::collections::HashSet::new(); + for asset in &v1 { + assert!( + seen.insert(asset.workspace_path.0.clone()), + "duplicate workspace_path: {}", + asset.workspace_path.0 + ); + } + + // Both real files must appear at least once. Their exact relative + // paths depend on which side of the cycle the walker descended into + // first; assert by basename to keep the check robust. + assert!( + v1.iter().any(|a| a.workspace_path.0.ends_with("inside_a.md")), + "expected inside_a.md in scan output, got: {names1:?}" + ); + assert!( + v1.iter().any(|a| a.workspace_path.0.ends_with("inside_b.md")), + "expected inside_b.md in scan output, got: {names1:?}" + ); + + // Sanity bound: with two real files and a working cycle guard the + // output should be tiny. If we ever produce >50 entries the visited + // set has regressed. + assert!( + v1.len() < 50, + "scan emitted {} assets — cycle guard likely regressed: {:?}", + v1.len(), + names1 + ); +} -- 2.49.1 From 967a6a62c5521e102d4edd1db7a16f607e791b14 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 30 Apr 2026 12:42:25 +0000 Subject: [PATCH 5/5] p1-1: address review (walker module doc, TODO markers, .kbignore ADR) - walker.rs: document why we pick walkdir over ignore::WalkBuilder (explicit canonical-path comparison for sibling-subtree symlinks). - walker.rs: log canonicalize failures via tracing::debug! (was a silent `Err(_) => continue`) so broken/permission-denied symlink targets are observable at debug verbosity. - connector.rs: TODO marker on the scope.include debug-log noting the filter belongs at the extractor router (P1-2/P1-3). - connector.rs: TODO marker on expand_tilde to hoist tilde + ${VAR} expansion into a kb-config helper once available. - connector.rs: comment on the .kbignore read documenting the re-read-on-every-scan() contract. - connector.rs test: tighten the `.kbignore`-itself ADR comment and upgrade the assertion to actively pin "`.kbignore` IS emitted" instead of "either is fine"; future drift will now fail the test. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kb-source-fs/src/connector.rs | 26 +++++++++++++++++++------- crates/kb-source-fs/src/walker.rs | 17 +++++++++++++++-- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/crates/kb-source-fs/src/connector.rs b/crates/kb-source-fs/src/connector.rs index fe98c1e..ed94bc9 100644 --- a/crates/kb-source-fs/src/connector.rs +++ b/crates/kb-source-fs/src/connector.rs @@ -80,10 +80,17 @@ impl SourceConnector for FsSourceConnector { // can layer a per-call narrowing. let mut excludes = self.default_exclude.clone(); excludes.extend(scope.exclude.iter().cloned()); + // .kbignore is re-read on every scan() so users can edit it without + // restarting any long-running process. let kbignore = read_kbignore(&root)?; let overrides = build_overrides(&root, &excludes, &kbignore)?; + // TODO(P1-2/P1-3 router): apply SourceScope::include glob filter at the + // extractor router layer once that crate lands. SourceConnector emits all + // non-excluded files; routing by include-glob is a downstream concern + // (design §6.2 + §7.2 are silent on this split, treat it as router work). + // // `scope.include` is intentionally ignored at this stage of the // pipeline: per §6.2 the workspace-level include lives in // `WorkspaceCfg` and is enforced by the asset writer / extractors. @@ -159,6 +166,9 @@ impl SourceConnector for FsSourceConnector { } } +// TODO(kb-config): hoist tilde + ${VAR} expansion into a kb-config helper +// once that crate gains a path-expansion API. Today this duplicates logic +// that P1-6 (store-sqlite) and future crates will also need. /// Expand a leading `~` to the current user's home directory. No-op for /// any other shape (absolute, relative, `${VAR}`-style). fn expand_tilde(s: &str) -> PathBuf { @@ -242,13 +252,15 @@ mod tests { .unwrap(); let v = conn.scan(&SourceScope::default()).unwrap(); let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect(); - // .kbignore itself starts with `.` and is not in DEFAULT_EXCLUDES, - // so it is *not* automatically hidden — but the task spec only - // requires `*.tmp` and `.DS_Store` / `._*` filtering, and the - // `.kbignore` file is a legitimate "Other(\"\")" asset. Either - // present-or-absent is acceptable; the assertion below pins - // current behaviour: .kbignore appears, b.tmp does not. - assert!(names.contains(&".kbignore".to_string())); + // Decision: `.kbignore` itself IS emitted as a RawAsset (MediaType::Other("")). + // Rationale: a config file that affects ingest is itself part of the + // workspace contents; the markdown extractor (P1-2) will reject Other("") + // on its own. If we ever decide to omit `.kbignore` from the asset list, + // this test will catch it. + assert!( + names.contains(&".kbignore".to_string()), + ".kbignore must be emitted as an asset; got: {names:?}" + ); assert!(names.contains(&"a.md".to_string())); assert!(!names.contains(&"b.tmp".to_string())); } diff --git a/crates/kb-source-fs/src/walker.rs b/crates/kb-source-fs/src/walker.rs index 95b6b3d..6f399cc 100644 --- a/crates/kb-source-fs/src/walker.rs +++ b/crates/kb-source-fs/src/walker.rs @@ -17,6 +17,15 @@ //! `a -> b -> a`. `walkdir` does NOT detect cycles for us when //! `follow_links(true)`; we layer our own visited-set on top, keyed by the //! canonical path of every entry, and skip any entry we've already seen. +//! +//! ## Why `walkdir` instead of `ignore::WalkBuilder`? +//! +//! `ignore::WalkBuilder` bundles gitignore semantics + cycle detection in +//! one API. We use `walkdir` directly because we need explicit control +//! over canonical-path comparison for sibling-subtree symlinks (a case +//! `walkdir`'s ancestor-only check can miss). Override-based filtering +//! still uses the `ignore` crate's `Override` matcher, just decoupled from +//! its walker. use std::collections::HashSet; use std::path::{Path, PathBuf}; @@ -136,8 +145,12 @@ pub(crate) fn walk_files(root: &Path, overrides: &Override) -> Result { - // Broken symlink etc. — skip silently. + Err(err) => { + tracing::debug!( + path = %path.display(), + error = %err, + "skipping: canonicalize failed (broken/permission-denied symlink target)" + ); continue; } } -- 2.49.1