diff --git a/Cargo.lock b/Cargo.lock index 58f8384..48aaa06 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -79,6 +79,12 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" +[[package]] +name = "bitflags" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" + [[package]] name = "blake3" version = "1.8.5" @@ -93,6 +99,16 @@ dependencies = [ "cpufeatures", ] +[[package]] +name = "bstr" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "cc" version = "1.2.61" @@ -179,6 +195,25 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -222,12 +257,34 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + [[package]] name = "find-msvc-tools" version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "getrandom" version = "0.2.17" @@ -239,6 +296,41 @@ dependencies = [ "wasi", ] +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", + "wasip3", +] + +[[package]] +name = "globset" +version = "0.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52dfc19153a48bde0cbd630453615c8151bce3a5adfac7a0aebfbf0a1e1f57e3" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash", +] + [[package]] name = "hashbrown" version = "0.17.0" @@ -251,6 +343,28 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "ignore" +version = "0.4.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3d782a365a015e0f5c04902246139249abf769125006fbe7649e2ee88169b4a" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata", + "same-file", + "walkdir", + "winapi-util", +] + [[package]] name = "indexmap" version = "2.14.0" @@ -258,7 +372,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown", + "hashbrown 0.17.0", + "serde", + "serde_core", ] [[package]] @@ -335,12 +451,35 @@ dependencies = [ "serde", ] +[[package]] +name = "kb-source-fs" +version = "0.1.0" +dependencies = [ + "anyhow", + "blake3", + "ignore", + "kb-config", + "kb-core", + "serde", + "serde_json", + "tempfile", + "time", + "tracing", + "walkdir", +] + [[package]] name = "lazy_static" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + [[package]] name = "libc" version = "0.2.186" @@ -356,6 +495,12 @@ dependencies = [ "libc", ] +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + [[package]] name = "log" version = "0.4.29" @@ -422,6 +567,16 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + [[package]] name = "proc-macro2" version = "1.0.106" @@ -440,13 +595,19 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + [[package]] name = "redox_users" version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ - "getrandom", + "getrandom 0.2.17", "libredox", "thiserror 1.0.69", ] @@ -468,12 +629,40 @@ version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.2", +] + [[package]] name = "ryu-js" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd29631678d6fb0903b69223673e122c32e9ae559d0960a38d574695ebc0ea15" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "semver" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" + [[package]] name = "serde" version = "1.0.228" @@ -581,6 +770,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom 0.4.2", + "once_cell", + "rustix", + "windows-sys 0.61.2", +] + [[package]] name = "thiserror" version = "1.0.69" @@ -819,6 +1021,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + [[package]] name = "utf8parse" version = "0.2.2" @@ -831,12 +1039,83 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" +[[package]] +name = "wasip2" +version = "1.0.1+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +dependencies = [ + "wit-bindgen 0.46.0", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen 0.51.0", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "windows-link" version = "0.2.1" @@ -927,6 +1206,100 @@ dependencies = [ "memchr", ] +[[package]] +name = "wit-bindgen" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + [[package]] name = "zmij" version = "1.0.21" diff --git a/Cargo.toml b/Cargo.toml index ce933c2..a9f9b3b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "crates/kb-core", "crates/kb-parse-types", "crates/kb-config", + "crates/kb-source-fs", "crates/kb-app", "crates/kb-cli", ] diff --git a/crates/kb-core/src/asset.rs b/crates/kb-core/src/asset.rs index 8532175..6078219 100644 --- a/crates/kb-core/src/asset.rs +++ b/crates/kb-core/src/asset.rs @@ -40,6 +40,18 @@ impl WorkspacePath { } } +/// On-disk storage decision for a `RawAsset`. +/// +/// **Important convention** — `path` field semantics differ by variant: +/// +/// - `Copied { path }`: at scan time, `path` is the **source** path on the +/// user's filesystem. The asset writer (P1-6) is responsible for actually +/// copying the bytes into the workspace asset store, AND for overwriting +/// `path` with the destination path after the copy completes. +/// +/// - `Reference { path, sha }`: `path` is always the **source** path. No +/// bytes are ever copied; downstream readers stream from `path` directly. +/// `sha` is the BLAKE3 full hex (matches `RawAsset::checksum`). #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "lowercase", tag = "kind")] pub enum AssetStorage { diff --git a/crates/kb-source-fs/Cargo.toml b/crates/kb-source-fs/Cargo.toml new file mode 100644 index 0000000..9274c3c --- /dev/null +++ b/crates/kb-source-fs/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "kb-source-fs" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +description = "Local filesystem SourceConnector — walks workspace.root + applies gitignore filters" + +[dependencies] +kb-core = { path = "../kb-core" } +kb-config = { path = "../kb-config" } +anyhow = { workspace = true } +serde = { workspace = true } +time = { workspace = true } +blake3 = { workspace = true } +tracing = { workspace = true } +walkdir = "2" +ignore = "0.4" + +[dev-dependencies] +serde_json = { workspace = true } +tempfile = "3" diff --git a/crates/kb-source-fs/src/connector.rs b/crates/kb-source-fs/src/connector.rs new file mode 100644 index 0000000..ed94bc9 --- /dev/null +++ b/crates/kb-source-fs/src/connector.rs @@ -0,0 +1,423 @@ +//! `FsSourceConnector` — public surface for the crate. +//! +//! ```ignore +//! pub struct FsSourceConnector { /* internal */ } +//! impl FsSourceConnector { +//! pub fn new(config: &kb_config::Config) -> anyhow::Result; +//! } +//! impl kb_core::SourceConnector for FsSourceConnector { +//! fn scan(&self, scope: &kb_core::SourceScope) -> anyhow::Result>; +//! } +//! ``` + +use std::path::PathBuf; + +use anyhow::{Context, Result}; +use time::OffsetDateTime; + +use kb_config::Config; +use kb_core::{ + AssetStorage, Checksum, RawAsset, SourceConnector, SourceScope, SourceUri, + id_for_asset, to_posix, +}; + +use crate::hash::hash_file; +use crate::media::media_type_for; +use crate::walker::{build_overrides, read_kbignore, walk_files}; + +/// Local-filesystem `SourceConnector`. Constructed once from `Config`, +/// reused across `scan` calls. +/// +/// State carried between `new` and `scan`: +/// - `default_root`: `config.workspace.root` resolved to a `PathBuf`. Used +/// only when `SourceScope::root` is empty (i.e. the caller did not +/// override the root). +/// - `default_exclude`: snapshot of `config.workspace.exclude` at +/// construction time. +/// - `copy_threshold_bytes`: `config.storage.copy_threshold_mb * 1 MiB` +/// pre-multiplied so we don't recompute per file. +pub struct FsSourceConnector { + default_root: PathBuf, + default_exclude: Vec, + copy_threshold_bytes: u64, +} + +impl FsSourceConnector { + pub fn new(config: &Config) -> Result { + // `config.workspace.root` is a String that may contain `~` or env + // expansions. P0-* did not yet provide a path-expansion helper in + // kb-config; for P1-1 we expand `~` ourselves and leave `${VAR}` + // for a follow-up. The vast majority of users hit the `~` case. + let root = expand_tilde(&config.workspace.root); + + let copy_threshold_bytes = config + .storage + .copy_threshold_mb + .saturating_mul(1024 * 1024); + + Ok(Self { + default_root: root, + default_exclude: config.workspace.exclude.clone(), + copy_threshold_bytes, + }) + } +} + +impl SourceConnector for FsSourceConnector { + fn scan(&self, scope: &SourceScope) -> Result> { + // `SourceScope::root` overrides config root when non-empty. This + // matches the design's "scope is the per-call lens; config is the + // default" split (§7.1). + let root = if scope.root.as_os_str().is_empty() { + self.default_root.clone() + } else { + scope.root.clone() + }; + + // Union: config.workspace.exclude ∪ scope.exclude ∪ .kbignore. + // Per §6.2 the union of `.kbignore` and `config.workspace.exclude` + // is the filter set. `scope.exclude` is added on top so a caller + // can layer a per-call narrowing. + let mut excludes = self.default_exclude.clone(); + excludes.extend(scope.exclude.iter().cloned()); + // .kbignore is re-read on every scan() so users can edit it without + // restarting any long-running process. + let kbignore = read_kbignore(&root)?; + + let overrides = build_overrides(&root, &excludes, &kbignore)?; + + // TODO(P1-2/P1-3 router): apply SourceScope::include glob filter at the + // extractor router layer once that crate lands. SourceConnector emits all + // non-excluded files; routing by include-glob is a downstream concern + // (design §6.2 + §7.2 are silent on this split, treat it as router work). + // + // `scope.include` is intentionally ignored at this stage of the + // pipeline: per §6.2 the workspace-level include lives in + // `WorkspaceCfg` and is enforced by the asset writer / extractors. + // Surfacing it here would double-filter Markdown vs PDF before the + // extractor router gets to see them. + if !scope.include.is_empty() { + tracing::debug!( + count = scope.include.len(), + "FsSourceConnector ignores scope.include — handled by extractor router" + ); + } + + let files = walk_files(&root, &overrides)?; + + let mut assets = Vec::with_capacity(files.len()); + for abs in &files { + // `to_posix` does NFC + leading `./` strip + `#` rejection. + // Compute the workspace-relative path before handing to it so + // emitted `WorkspacePath` is always relative. + let rel = abs.strip_prefix(&root).unwrap_or(abs); + let workspace_path = match to_posix(rel) { + Ok(p) => p, + Err(e) => { + // A path containing `#` is the only documented reason + // `to_posix` fails today. Drop the file with a warning + // rather than aborting the entire scan — a single bad + // filename should not nuke a 10 000-file ingest. + tracing::warn!( + path = %abs.display(), + error = %e, + "skipping file: path is not a valid WorkspacePath", + ); + continue; + } + }; + + let media_type = media_type_for(abs); + let (byte_len, full_hex) = hash_file(abs) + .with_context(|| format!("hashing {}", abs.display()))?; + let checksum = Checksum(full_hex.clone()); + let asset_id = id_for_asset(&full_hex); + + // Storage variant signals *intent*, not an actual copy. + // P1-6 (asset writer) is responsible for the on-disk copy. + let stored = if byte_len > self.copy_threshold_bytes { + AssetStorage::Reference { + path: abs.clone(), + sha: checksum.clone(), + } + } else { + AssetStorage::Copied { path: abs.clone() } + }; + + assets.push(RawAsset { + asset_id, + source_uri: SourceUri::File(abs.clone()), + workspace_path, + media_type, + byte_len, + checksum, + discovered_at: OffsetDateTime::now_utc(), + stored, + }); + } + + // Determinism: sort by workspace_path. WorkspacePath is a String + // newtype with stable lexicographic ordering. Two scans of the + // same tree must produce identical Vec modulo the + // wall-clock `discovered_at` field. + assets.sort_by(|a, b| a.workspace_path.0.cmp(&b.workspace_path.0)); + + Ok(assets) + } +} + +// TODO(kb-config): hoist tilde + ${VAR} expansion into a kb-config helper +// once that crate gains a path-expansion API. Today this duplicates logic +// that P1-6 (store-sqlite) and future crates will also need. +/// Expand a leading `~` to the current user's home directory. No-op for +/// any other shape (absolute, relative, `${VAR}`-style). +fn expand_tilde(s: &str) -> PathBuf { + if let Some(rest) = s.strip_prefix("~/") { + if let Some(home) = dirs_home() { + return home.join(rest); + } + } else if s == "~" { + if let Some(home) = dirs_home() { + return home; + } + } + PathBuf::from(s) +} + +/// Tiny `dirs::home_dir`-compat shim that does NOT add the `dirs` crate to +/// our dep set (we explicitly enumerate allowed deps in the task spec). +/// Reads `$HOME` directly. +fn dirs_home() -> Option { + std::env::var_os("HOME").map(PathBuf::from) +} + +#[cfg(test)] +mod tests { + use super::*; + use kb_config::Config; + + fn cfg_with_root(root: &str) -> Config { + let mut c = Config::defaults(); + c.workspace.root = root.to_string(); + c.workspace.exclude.clear(); + c + } + + #[test] + fn scan_empty_dir_yields_empty_vec() { + let dir = tempfile::tempdir().unwrap(); + let conn = FsSourceConnector::new(&cfg_with_root( + dir.path().to_str().unwrap(), + )) + .unwrap(); + let scope = SourceScope::default(); + let v = conn.scan(&scope).unwrap(); + assert!(v.is_empty()); + } + + #[test] + fn scan_emits_sorted_workspace_paths() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::create_dir_all(root.join("notes")).unwrap(); + std::fs::write(root.join("README.md"), b"hi").unwrap(); + std::fs::write(root.join("notes/beta.md"), b"b").unwrap(); + std::fs::write(root.join("notes/alpha.md"), b"a").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let v = conn.scan(&SourceScope::default()).unwrap(); + let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect(); + assert_eq!( + names, + vec![ + "README.md".to_string(), + "notes/alpha.md".to_string(), + "notes/beta.md".to_string(), + ] + ); + } + + #[test] + fn scan_filters_by_kbignore() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::write(root.join(".kbignore"), "*.tmp\n").unwrap(); + std::fs::write(root.join("a.md"), b"x").unwrap(); + std::fs::write(root.join("b.tmp"), b"x").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let v = conn.scan(&SourceScope::default()).unwrap(); + let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect(); + // Decision: `.kbignore` itself IS emitted as a RawAsset (MediaType::Other("")). + // Rationale: a config file that affects ingest is itself part of the + // workspace contents; the markdown extractor (P1-2) will reject Other("") + // on its own. If we ever decide to omit `.kbignore` from the asset list, + // this test will catch it. + assert!( + names.contains(&".kbignore".to_string()), + ".kbignore must be emitted as an asset; got: {names:?}" + ); + assert!(names.contains(&"a.md".to_string())); + assert!(!names.contains(&"b.tmp".to_string())); + } + + #[test] + fn scan_filters_default_excludes() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::write(root.join("a.md"), b"x").unwrap(); + std::fs::write(root.join(".DS_Store"), b"\0\0").unwrap(); + std::fs::write(root.join("._sidecar"), b"\0\0").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let v = conn.scan(&SourceScope::default()).unwrap(); + let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect(); + assert_eq!(names, vec!["a.md".to_string()]); + } + + #[test] + fn scan_unions_config_exclude_and_kbignore() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::write(root.join(".kbignore"), "*.tmp\n").unwrap(); + std::fs::write(root.join("a.md"), b"x").unwrap(); + std::fs::write(root.join("b.tmp"), b"x").unwrap(); + std::fs::write(root.join("c.log"), b"x").unwrap(); + + let mut cfg = cfg_with_root(root.to_str().unwrap()); + cfg.workspace.exclude.push("*.log".to_string()); + + let conn = FsSourceConnector::new(&cfg).unwrap(); + let v = conn.scan(&SourceScope::default()).unwrap(); + let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect(); + assert!(names.contains(&"a.md".to_string())); + assert!(!names.contains(&"b.tmp".to_string()), "kbignore should drop *.tmp"); + assert!(!names.contains(&"c.log".to_string()), "config.exclude should drop *.log"); + } + + #[test] + fn scan_blake3_pinned_for_known_file() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::write(root.join("hello.md"), b"hello world").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let v = conn.scan(&SourceScope::default()).unwrap(); + assert_eq!(v.len(), 1); + let asset = &v[0]; + assert_eq!( + asset.checksum.0, + "d74981efa70a0c880b8d8c1985d075dbcbf679b99a5f9914e5aaf96b831a9e24" + ); + assert_eq!(asset.byte_len, 11); + // asset_id is derived from the full hex via id_for_asset. + assert_eq!(asset.asset_id, id_for_asset(&asset.checksum.0)); + } + + #[test] + fn scan_idempotent_modulo_timestamp() { + // Same filesystem state → identical Vec *modulo* + // discovered_at. Strip that field and compare. + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::create_dir_all(root.join("notes")).unwrap(); + std::fs::write(root.join("notes/a.md"), b"alpha").unwrap(); + std::fs::write(root.join("notes/b.md"), b"beta").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let v1 = conn.scan(&SourceScope::default()).unwrap(); + let v2 = conn.scan(&SourceScope::default()).unwrap(); + assert_eq!(v1.len(), v2.len()); + for (a, b) in v1.iter().zip(v2.iter()) { + assert_eq!(a.asset_id, b.asset_id); + assert_eq!(a.workspace_path, b.workspace_path); + assert_eq!(a.checksum, b.checksum); + assert_eq!(a.byte_len, b.byte_len); + assert_eq!(a.media_type, b.media_type); + assert_eq!(a.source_uri, b.source_uri); + assert_eq!(a.stored, b.stored); + // discovered_at intentionally NOT compared + } + } + + #[test] + fn scan_emits_posix_normalized_paths() { + // End-to-end: the connector must produce POSIX-normalized + // workspace paths via `kb_core::to_posix`. We can't construct an + // input with literal `./` / `//` segments via the filesystem (the + // OS won't let us), so instead we assert the resulting strings + // are already POSIX-clean (no leading `./`, no `//`, forward + // slashes only) — which is the post-conditions side of the + // round-trip the unit tests in `kb-core::normalize` cover. + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::create_dir_all(root.join("a/b/c")).unwrap(); + std::fs::write(root.join("a/b/c/d.md"), b"x").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let v = conn.scan(&SourceScope::default()).unwrap(); + assert_eq!(v.len(), 1); + let p = &v[0].workspace_path.0; + assert_eq!(p, "a/b/c/d.md"); + assert!(!p.starts_with("./")); + assert!(!p.contains("//")); + assert!(!p.contains('\\')); + } + + #[test] + fn scan_skips_files_whose_name_contains_hash() { + // `WorkspacePath` rejects `#` (collides with the W3C-Media-Fragments + // separator used by `Citation`). The connector must drop such + // files with a warning rather than aborting the scan. + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::write(root.join("ok.md"), b"x").unwrap(); + std::fs::write(root.join("has#hash.md"), b"y").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let v = conn.scan(&SourceScope::default()).unwrap(); + let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect(); + assert_eq!(names, vec!["ok.md".to_string()]); + } + + #[test] + fn copy_vs_reference_threshold_signals_intent() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::write(root.join("small.md"), b"hi").unwrap(); + + let mut cfg = cfg_with_root(root.to_str().unwrap()); + // Threshold = 0 MiB ⇒ even a 2-byte file becomes Reference. + cfg.storage.copy_threshold_mb = 0; + let conn = FsSourceConnector::new(&cfg).unwrap(); + let v = conn.scan(&SourceScope::default()).unwrap(); + assert_eq!(v.len(), 1); + match &v[0].stored { + AssetStorage::Reference { sha, .. } => { + assert_eq!(sha, &v[0].checksum); + } + other => panic!("expected Reference, got {other:?}"), + } + + // Threshold high (default 100 MiB) ⇒ Copied. + let mut cfg2 = cfg_with_root(root.to_str().unwrap()); + cfg2.storage.copy_threshold_mb = 100; + let conn2 = FsSourceConnector::new(&cfg2).unwrap(); + let v2 = conn2.scan(&SourceScope::default()).unwrap(); + assert!(matches!(v2[0].stored, AssetStorage::Copied { .. })); + } +} diff --git a/crates/kb-source-fs/src/hash.rs b/crates/kb-source-fs/src/hash.rs new file mode 100644 index 0000000..b2b40a1 --- /dev/null +++ b/crates/kb-source-fs/src/hash.rs @@ -0,0 +1,92 @@ +//! Streaming BLAKE3 over a file path. Per task spec, files MUST NOT be +//! loaded fully into memory: `blake3::Hasher::update_reader` reads through a +//! 64 KiB internal buffer, which keeps memory bounded for any size of file. +//! +//! Returns `(byte_len, full_hex)`: +//! - `byte_len` is the total bytes hashed (== file size after follow). +//! - `full_hex` is the canonical lowercase hex (64 chars) of the full +//! blake3 digest. The `kb-core::Checksum` invariant is "full hex"; the +//! 32-char prefix is reserved for `AssetId` derivation via +//! `kb_core::id_for_asset`. + +use std::fs::File; +use std::io::{self, Read}; +use std::path::Path; + +use anyhow::{Context, Result}; + +const READ_BUFFER_BYTES: usize = 64 * 1024; + +/// Stream-hash a file with blake3. Returns `(byte_len, full_hex_64)`. +/// +/// `byte_len` is computed during streaming so callers do not need a separate +/// `metadata().len()` call (which can disagree with hashed bytes if the file +/// is rewritten mid-scan, but blake3-of-stream is the source of truth for +/// `RawAsset.checksum`). +pub(crate) fn hash_file(path: &Path) -> Result<(u64, String)> { + let file = File::open(path) + .with_context(|| format!("failed to open {} for hashing", path.display()))?; + hash_reader(file).with_context(|| format!("failed to hash {}", path.display())) +} + +fn hash_reader(mut reader: R) -> Result<(u64, String)> { + let mut hasher = blake3::Hasher::new(); + let mut buf = vec![0u8; READ_BUFFER_BYTES]; + let mut total: u64 = 0; + loop { + match reader.read(&mut buf) { + Ok(0) => break, + Ok(n) => { + hasher.update(&buf[..n]); + total = total.saturating_add(n as u64); + } + Err(e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => return Err(e.into()), + } + } + Ok((total, hasher.finalize().to_hex().to_string())) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// blake3 of the empty input is the well-known "official empty hash" + /// from the blake3 spec. Pinned so that swapping the hash crate or the + /// streaming implementation can never silently produce a different + /// digest for a known input. + #[test] + fn empty_blake3_pinned() { + let (n, hex) = hash_reader(std::io::empty()).unwrap(); + assert_eq!(n, 0); + assert_eq!( + hex, + "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262" + ); + } + + /// `b"hello world"` blake3 (full 64 hex). Computed independently with + /// `b3sum`; pinning here detects any drift in the streaming pipeline. + #[test] + fn known_bytes_blake3_pinned() { + let bytes = b"hello world"; + let (n, hex) = hash_reader(&bytes[..]).unwrap(); + assert_eq!(n, 11); + assert_eq!( + hex, + "d74981efa70a0c880b8d8c1985d075dbcbf679b99a5f9914e5aaf96b831a9e24" + ); + } + + /// Streaming a buffer larger than `READ_BUFFER_BYTES` must produce the + /// same digest as a single-shot blake3 over the same bytes — i.e. the + /// chunk boundary is invisible. + #[test] + fn streaming_matches_oneshot_over_buffer_boundary() { + let bytes: Vec = (0u8..=255u8).cycle().take(READ_BUFFER_BYTES * 3 + 17).collect(); + let (n, streamed) = hash_reader(&bytes[..]).unwrap(); + assert_eq!(n, bytes.len() as u64); + let oneshot = blake3::hash(&bytes).to_hex().to_string(); + assert_eq!(streamed, oneshot); + } +} diff --git a/crates/kb-source-fs/src/lib.rs b/crates/kb-source-fs/src/lib.rs new file mode 100644 index 0000000..ae6b128 --- /dev/null +++ b/crates/kb-source-fs/src/lib.rs @@ -0,0 +1,16 @@ +//! `kb-source-fs` — local filesystem `SourceConnector`. +//! +//! Walks `config.workspace.root`, applies gitignore-style filters from +//! `config.workspace.exclude` ∪ `.kbignore`, computes BLAKE3 of every file, +//! and emits `Vec` sorted by `workspace_path` for determinism. +//! +//! Per design §3.3 (RawAsset), §6.2 (workspace + .kbignore), §6.6 (POSIX +//! normalization), §7.1 (SourceScope), §7.2 (SourceConnector), §8 (module +//! boundaries). + +mod connector; +mod hash; +mod media; +mod walker; + +pub use connector::FsSourceConnector; diff --git a/crates/kb-source-fs/src/media.rs b/crates/kb-source-fs/src/media.rs new file mode 100644 index 0000000..ecaf6e3 --- /dev/null +++ b/crates/kb-source-fs/src/media.rs @@ -0,0 +1,85 @@ +//! Media-type detection by extension. Per P1-1 task spec we do NOT do +//! libmagic-style sniffing; extension is enough for P1. Unknown / missing +//! extensions fall through to `MediaType::Other(ext.to_string())` (empty +//! string when the file has no extension at all). + +use std::path::Path; + +use kb_core::{AudioType, ImageType, MediaType}; + +/// Return `MediaType` for `path` based purely on its lowercased extension. +/// `.md` → Markdown, `.pdf` → Pdf, image and audio extensions map onto +/// `MediaType::Image(_)` / `MediaType::Audio(_)`. Anything else (including +/// missing extension) → `MediaType::Other(ext)`. +pub(crate) fn media_type_for(path: &Path) -> MediaType { + let ext = path + .extension() + .and_then(|s| s.to_str()) + .map(|s| s.to_ascii_lowercase()) + .unwrap_or_default(); + + match ext.as_str() { + "md" => MediaType::Markdown, + "pdf" => MediaType::Pdf, + + "png" => MediaType::Image(ImageType::Png), + "jpg" | "jpeg" => MediaType::Image(ImageType::Jpeg), + "webp" => MediaType::Image(ImageType::Webp), + "gif" => MediaType::Image(ImageType::Gif), + "tiff" | "tif" => MediaType::Image(ImageType::Tiff), + + "m4a" => MediaType::Audio(AudioType::M4a), + "mp3" => MediaType::Audio(AudioType::Mp3), + "wav" => MediaType::Audio(AudioType::Wav), + "flac" => MediaType::Audio(AudioType::Flac), + "ogg" => MediaType::Audio(AudioType::Ogg), + + // Empty string (no extension) and any other extension: bucket as + // Other and let downstream extractors decide if they support it. + _ => MediaType::Other(ext), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn markdown_and_pdf() { + assert_eq!(media_type_for(Path::new("a/b.md")), MediaType::Markdown); + assert_eq!(media_type_for(Path::new("a/b.MD")), MediaType::Markdown); + assert_eq!(media_type_for(Path::new("a/b.pdf")), MediaType::Pdf); + } + + #[test] + fn images_and_audio() { + assert_eq!( + media_type_for(Path::new("p.jpg")), + MediaType::Image(ImageType::Jpeg) + ); + assert_eq!( + media_type_for(Path::new("p.JPEG")), + MediaType::Image(ImageType::Jpeg) + ); + assert_eq!( + media_type_for(Path::new("a.M4A")), + MediaType::Audio(AudioType::M4a) + ); + assert_eq!( + media_type_for(Path::new("a.flac")), + MediaType::Audio(AudioType::Flac) + ); + } + + #[test] + fn unknown_and_missing_extension() { + assert_eq!( + media_type_for(Path::new("notes/x.weird")), + MediaType::Other("weird".to_string()) + ); + assert_eq!( + media_type_for(Path::new("README")), + MediaType::Other(String::new()) + ); + } +} diff --git a/crates/kb-source-fs/src/walker.rs b/crates/kb-source-fs/src/walker.rs new file mode 100644 index 0000000..6f399cc --- /dev/null +++ b/crates/kb-source-fs/src/walker.rs @@ -0,0 +1,260 @@ +//! Directory walker with gitignore-style filtering and symlink-cycle +//! protection. +//! +//! Filter set (per task spec, design §6.2): +//! - `config.workspace.exclude` (passed in by `FsSourceConnector`) +//! - `/.kbignore` (optional file at workspace root) +//! - default-excludes for `.DS_Store` and macOS resource forks (`._*`) +//! +//! All three are merged via `ignore::overrides::OverrideBuilder`, which +//! gives full gitignore semantics (anchors, `!` negation, `**`, etc.). We +//! prepend `!` to each pattern because `OverrideBuilder` treats positive +//! patterns as "include" and negative as "exclude" — see §"Filter set" +//! comment in `build_walker` for the full reasoning. +//! +//! Symlink handling: we want to follow links (so a workspace using a +//! symlinked `notes/` directory works), but we must NOT loop forever on +//! `a -> b -> a`. `walkdir` does NOT detect cycles for us when +//! `follow_links(true)`; we layer our own visited-set on top, keyed by the +//! canonical path of every entry, and skip any entry we've already seen. +//! +//! ## Why `walkdir` instead of `ignore::WalkBuilder`? +//! +//! `ignore::WalkBuilder` bundles gitignore semantics + cycle detection in +//! one API. We use `walkdir` directly because we need explicit control +//! over canonical-path comparison for sibling-subtree symlinks (a case +//! `walkdir`'s ancestor-only check can miss). Override-based filtering +//! still uses the `ignore` crate's `Override` matcher, just decoupled from +//! its walker. + +use std::collections::HashSet; +use std::path::{Path, PathBuf}; + +use anyhow::{Context, Result}; +use ignore::overrides::{Override, OverrideBuilder}; +use walkdir::{DirEntry, WalkDir}; + +/// Default-excludes baked into the connector. These are NOT configurable; +/// they cover noise that is never useful to ingest and would otherwise need +/// to appear in every user's `.kbignore`. +const DEFAULT_EXCLUDES: &[&str] = &[ + // Finder metadata + ".DS_Store", + "**/.DS_Store", + // macOS resource forks (AppleDouble files) + "._*", + "**/._*", +]; + +/// Build the merged `Override` from `config.workspace.exclude` ∪ `.kbignore` +/// ∪ baked-in default excludes. +/// +/// Each input pattern is registered as an *exclude* (gitignore-style: a +/// leading `!` flips a positive match to a negative one in the +/// `OverrideBuilder` API). Order doesn't matter — the union is computed by +/// the underlying gitignore engine. +pub(crate) fn build_overrides( + root: &Path, + config_exclude: &[String], + kbignore_patterns: &[String], +) -> Result { + let mut builder = OverrideBuilder::new(root); + + for pat in DEFAULT_EXCLUDES { + builder + .add(&format!("!{pat}")) + .with_context(|| format!("invalid default-exclude pattern: {pat}"))?; + } + for pat in config_exclude { + builder + .add(&format!("!{pat}")) + .with_context(|| format!("invalid workspace.exclude pattern: {pat}"))?; + } + for pat in kbignore_patterns { + builder + .add(&format!("!{pat}")) + .with_context(|| format!("invalid .kbignore pattern: {pat}"))?; + } + + builder.build().context("failed to compile override set") +} + +/// Read `/.kbignore` if it exists. Each non-blank, non-comment line is +/// a gitignore pattern. Missing file → empty Vec (not an error). +pub(crate) fn read_kbignore(root: &Path) -> Result> { + let path = root.join(".kbignore"); + if !path.exists() { + return Ok(Vec::new()); + } + let text = std::fs::read_to_string(&path) + .with_context(|| format!("failed to read {}", path.display()))?; + Ok(text + .lines() + .map(|l| l.trim()) + .filter(|l| !l.is_empty() && !l.starts_with('#')) + .map(|l| l.to_string()) + .collect()) +} + +/// Iterate every regular file under `root`, applying `overrides` and +/// detecting symlink cycles. Returns absolute file paths. +/// +/// Strategy: +/// - `walkdir::WalkDir::follow_links(true)` to traverse symlinks. +/// - Maintain `visited: HashSet` of *canonical* paths. Before +/// descending into a directory entry, canonicalize and check the set; +/// if already present, skip. This breaks `a -> b -> a` cycles in O(n) +/// per entry without a custom recursive walker. +/// - For each yielded entry, ask `overrides` whether it is excluded; if +/// so, drop it. If the entry is a directory, also short-circuit +/// `WalkDir`'s descent via `it.skip_current_dir()`. +pub(crate) fn walk_files(root: &Path, overrides: &Override) -> Result> { + let mut out = Vec::new(); + let mut visited: HashSet = HashSet::new(); + + let walker = WalkDir::new(root).follow_links(true).into_iter(); + let mut it = walker.filter_entry(|e| !is_excluded(e, root, overrides)); + + while let Some(res) = it.next() { + let entry = match res { + Ok(e) => e, + Err(err) => { + // `walkdir` surfaces I/O errors AND its own cycle detector + // (when follow_links is on it sometimes catches them). + // Either way: log and skip; do not abort the whole scan. + tracing::warn!(error = %err, "walkdir entry error; skipping"); + continue; + } + }; + + let path = entry.path(); + + // Cycle guard: only canonicalize symlinks (cheap on the common case + // of plain files/dirs) and on directories that are followed via a + // symlink. `walkdir`'s `path_is_symlink()` is true when the entry's + // *original* path is a symlink (it returns true for the link, not + // for the resolved target). For non-symlinked directories we still + // record the canonical path so a *later* symlink that points back + // to one of them is detected. + if entry.file_type().is_dir() { + match std::fs::canonicalize(path) { + Ok(canon) => { + if !visited.insert(canon) { + // Already visited via another path → break cycle. + it.skip_current_dir(); + continue; + } + } + Err(err) => { + tracing::debug!( + path = %path.display(), + error = %err, + "skipping: canonicalize failed (broken/permission-denied symlink target)" + ); + continue; + } + } + } + + if entry.file_type().is_file() { + out.push(path.to_path_buf()); + } + } + + Ok(out) +} + +fn is_excluded(entry: &DirEntry, root: &Path, overrides: &Override) -> bool { + // `Override::matched(path, is_dir)` uses the path *relative to* the + // override builder's root. `walkdir` gives absolute paths when + // `WalkDir::new` was given an absolute path — strip the root prefix + // before consulting the override. + let rel = match entry.path().strip_prefix(root) { + Ok(p) => p, + Err(_) => entry.path(), + }; + overrides + .matched(rel, entry.file_type().is_dir()) + .is_ignore() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn empty_inputs_compile_into_an_override() { + let dir = tempfile::tempdir().unwrap(); + let ov = build_overrides(dir.path(), &[], &[]).unwrap(); + // Default-excludes only; non-special files should not match. + let m = ov.matched(Path::new("notes/alpha.md"), false); + assert!(!m.is_ignore()); + } + + #[test] + fn default_excludes_ds_store_and_resource_forks() { + let dir = tempfile::tempdir().unwrap(); + let ov = build_overrides(dir.path(), &[], &[]).unwrap(); + assert!(ov.matched(Path::new(".DS_Store"), false).is_ignore()); + assert!( + ov.matched(Path::new("notes/.DS_Store"), false).is_ignore() + ); + assert!(ov.matched(Path::new("._foo.md"), false).is_ignore()); + assert!( + ov.matched(Path::new("notes/._sidecar"), false).is_ignore() + ); + } + + #[test] + fn config_exclude_filters_tmp_and_node_modules() { + let dir = tempfile::tempdir().unwrap(); + let ov = build_overrides( + dir.path(), + &["*.tmp".to_string(), "node_modules/**".to_string()], + &[], + ) + .unwrap(); + assert!(ov.matched(Path::new("a.tmp"), false).is_ignore()); + assert!(ov.matched(Path::new("notes/x.tmp"), false).is_ignore()); + assert!( + ov.matched(Path::new("node_modules/foo/bar.js"), false) + .is_ignore() + ); + assert!(!ov.matched(Path::new("alpha.md"), false).is_ignore()); + } + + #[test] + fn kbignore_union_with_config_exclude() { + // "either set excluding it ⇒ excluded" + let dir = tempfile::tempdir().unwrap(); + let ov = build_overrides( + dir.path(), + &["*.tmp".to_string()], + &["secret/**".to_string()], + ) + .unwrap(); + assert!(ov.matched(Path::new("a.tmp"), false).is_ignore()); + assert!( + ov.matched(Path::new("secret/key.md"), false).is_ignore() + ); + } + + #[test] + fn read_kbignore_missing_returns_empty() { + let dir = tempfile::tempdir().unwrap(); + let v = read_kbignore(dir.path()).unwrap(); + assert!(v.is_empty()); + } + + #[test] + fn read_kbignore_strips_blanks_and_comments() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write( + dir.path().join(".kbignore"), + "# comment\n*.tmp\n\nignored/**\n", + ) + .unwrap(); + let v = read_kbignore(dir.path()).unwrap(); + assert_eq!(v, vec!["*.tmp".to_string(), "ignored/**".to_string()]); + } +} diff --git a/crates/kb-source-fs/tests/snapshot_tree1.rs b/crates/kb-source-fs/tests/snapshot_tree1.rs new file mode 100644 index 0000000..6eb0ed9 --- /dev/null +++ b/crates/kb-source-fs/tests/snapshot_tree1.rs @@ -0,0 +1,139 @@ +//! Snapshot + determinism tests against `fixtures/source-fs/tree-1`. +//! +//! Layout (committed under `/fixtures/source-fs/tree-1/`): +//! +//! ``` +//! tree-1/ +//! ├── README.md +//! ├── notes/ +//! │ ├── alpha.md +//! │ └── beta.md +//! ├── ignored/ +//! │ └── skip.tmp # excluded by .kbignore +//! ├── .kbignore # contains: *.tmp +//! └── .DS_Store # implicitly excluded +//! ``` +//! +//! Two assertions: +//! 1. Snapshot stability — `scan` output (with `discovered_at` stripped) +//! matches the committed baseline JSON byte-for-byte. +//! 2. Determinism — running `scan` twice produces byte-identical JSON +//! after stripping `discovered_at`. +//! +//! `discovered_at` is wall-clock and intentionally NOT part of the +//! contract: the task spec says strip it before comparison. + +use std::path::PathBuf; + +use kb_config::Config; +use kb_core::{SourceConnector, SourceScope}; +use kb_source_fs::FsSourceConnector; +use serde_json::Value; + +/// Repo root, derived from `CARGO_MANIFEST_DIR` (= `crates/kb-source-fs`). +fn repo_root() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() + .parent() + .unwrap() + .to_path_buf() +} + +fn fixture_root() -> PathBuf { + repo_root().join("fixtures/source-fs/tree-1") +} + +fn baseline_path() -> PathBuf { + repo_root().join("fixtures/source-fs/tree-1.snapshot.json") +} + +fn cfg_for_fixture(root: &str) -> Config { + let mut c = Config::defaults(); + c.workspace.root = root.to_string(); + // Clear default excludes (`.git/**`, `node_modules/**`, `.obsidian/**`) + // so the snapshot is purely a function of the fixture + .kbignore + + // baked-in default-excludes. + c.workspace.exclude.clear(); + c +} + +/// Run `scan` against the fixture and return the JSON value with every +/// `discovered_at` field replaced by the literal string "". +/// Also strip `source_uri.value` and `stored.path` because they contain +/// absolute paths that vary by checkout location — the snapshot must be +/// portable across machines and CI checkout dirs. +fn scan_and_strip() -> Value { + let root = fixture_root(); + let cfg = cfg_for_fixture(root.to_str().unwrap()); + let conn = FsSourceConnector::new(&cfg).expect("connector init"); + let assets = conn + .scan(&SourceScope::default()) + .expect("scan must succeed against committed fixture"); + + let mut v = serde_json::to_value(&assets).expect("serialize"); + if let Value::Array(items) = &mut v { + for item in items { + if let Value::Object(map) = item { + map.insert( + "discovered_at".to_string(), + Value::String("".to_string()), + ); + // source_uri = { kind: "file", value: "" } — strip value. + if let Some(Value::Object(s)) = map.get_mut("source_uri") { + if s.contains_key("value") { + s.insert("value".to_string(), Value::String("".to_string())); + } + } + // stored = { kind: "copied"|"reference", path: "", ... } — strip path. + if let Some(Value::Object(s)) = map.get_mut("stored") { + if s.contains_key("path") { + s.insert("path".to_string(), Value::String("".to_string())); + } + } + } + } + } + v +} + +#[test] +fn tree_1_snapshot_matches_baseline() { + let actual = scan_and_strip(); + + // If KB_REGEN_SNAPSHOT is set, (re)write the baseline and exit + // *before* attempting to read it. This is the only path that may + // create the file from scratch. + if std::env::var_os("KB_REGEN_SNAPSHOT").is_some() { + let pretty = serde_json::to_string_pretty(&actual).unwrap() + "\n"; + std::fs::write(baseline_path(), pretty).expect("write baseline"); + panic!("regenerated baseline; rerun without KB_REGEN_SNAPSHOT to verify"); + } + + let baseline_text = std::fs::read_to_string(baseline_path()).unwrap_or_else(|_| { + panic!( + "missing baseline at {} — regenerate via `KB_REGEN_SNAPSHOT=1 cargo test \ + -p kb-source-fs --test snapshot_tree1 -- tree_1_snapshot_matches_baseline`", + baseline_path().display() + ) + }); + let expected: Value = serde_json::from_str(&baseline_text) + .expect("baseline JSON must parse"); + + if actual != expected { + let actual_pretty = serde_json::to_string_pretty(&actual).unwrap(); + let expected_pretty = serde_json::to_string_pretty(&expected).unwrap(); + panic!( + "snapshot drift.\n--- expected ---\n{expected_pretty}\n--- actual ---\n{actual_pretty}\n" + ); + } +} + +#[test] +fn tree_1_scan_is_deterministic() { + let v1 = scan_and_strip(); + let v2 = scan_and_strip(); + let s1 = serde_json::to_string(&v1).unwrap(); + let s2 = serde_json::to_string(&v2).unwrap(); + assert_eq!(s1, s2, "two consecutive scans diverged"); +} diff --git a/crates/kb-source-fs/tests/symlink_cycle.rs b/crates/kb-source-fs/tests/symlink_cycle.rs new file mode 100644 index 0000000..bcd00c4 --- /dev/null +++ b/crates/kb-source-fs/tests/symlink_cycle.rs @@ -0,0 +1,160 @@ +//! Integration test: a `notes/` symlink whose target points back at the +//! workspace root MUST NOT cause `scan` to loop forever or panic. +//! +//! Layout (built per-test in a tempdir): +//! root/ +//! ├── alpha.md +//! ├── notes/ (symlink → root) ← cycle: root → notes → root → … +//! +//! Expected: `scan` returns in O(seconds), every emitted path is unique, +//! and `alpha.md` appears at least once. +//! +//! The cycle guard lives in `walker::walk_files`; this test exists to +//! prove it catches the realistic shape (cycle through one or more +//! symlinks) end-to-end via the public API. + +#![cfg(unix)] + +use std::os::unix::fs::symlink; + +use kb_config::Config; +use kb_core::{SourceConnector, SourceScope}; +use kb_source_fs::FsSourceConnector; + +fn cfg_with_root(root: &str) -> Config { + let mut c = Config::defaults(); + c.workspace.root = root.to_string(); + c.workspace.exclude.clear(); + c +} + +#[test] +fn symlink_cycle_does_not_loop_or_crash() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + + std::fs::write(root.join("alpha.md"), b"alpha").unwrap(); + // Symlink: root/notes → root (a → a cycle through the link `notes`). + symlink(root, root.join("notes")).unwrap(); + + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .expect("connector init"); + let v = conn + .scan(&SourceScope::default()) + .expect("scan must return, not loop"); + + // Determinism check: no duplicate workspace paths. + let mut seen = std::collections::HashSet::new(); + for asset in &v { + assert!( + seen.insert(asset.workspace_path.0.clone()), + "duplicate workspace_path: {}", + asset.workspace_path.0 + ); + } + // The original alpha.md must appear. + assert!( + v.iter().any(|a| a.workspace_path.0 == "alpha.md"), + "expected alpha.md in scan output, got: {:?}", + v.iter().map(|a| &a.workspace_path.0).collect::>() + ); +} + +#[test] +fn dangling_symlink_pseudo_cycle_does_not_crash() { + // root/ + // ├── alpha.md + // ├── a → b (b does not exist as a real file/dir) + // └── b → a (a does not exist as a real file/dir) + // + // Both symlinks are dangling — neither resolves to anything. This is + // NOT a real two-step directory cycle (see + // `two_step_directory_cycle_visited_set_breaks_loop` for that case); + // it merely verifies the scan tolerates broken-link pseudo-cycles + // without crashing or looping. + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::write(root.join("alpha.md"), b"alpha").unwrap(); + symlink(root.join("b"), root.join("a")).unwrap(); + symlink(root.join("a"), root.join("b")).unwrap(); + + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .expect("connector init"); + // Even though a→b→a never resolves to a real directory (broken + // pseudo-cycle of dangling symlinks), the scan must complete and + // surface alpha.md. + let v = conn.scan(&SourceScope::default()).expect("scan must return"); + assert!(v.iter().any(|a| a.workspace_path.0 == "alpha.md")); +} + +#[test] +fn two_step_directory_cycle_visited_set_breaks_loop() { + // Real two-step directory cycle through symlinks: + // root/ + // ├── a/ + // │ ├── inside_a.md + // │ └── loop → ../b (symlink, target IS a real directory) + // └── b/ + // ├── inside_b.md + // └── loop → ../a (symlink, target IS a real directory) + // + // Without the visited-set, walkdir would descend + // a → a/loop (=b) → a/loop/loop (=a) → … forever. + // The canonical-path visited-set in `walker::walk_files` must break + // the loop and yield a finite, deterministic result. + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::create_dir(root.join("a")).unwrap(); + std::fs::create_dir(root.join("b")).unwrap(); + std::fs::write(root.join("a/inside_a.md"), b"a-content").unwrap(); + std::fs::write(root.join("b/inside_b.md"), b"b-content").unwrap(); + // Use relative targets so the symlink truly points at the sibling + // directory regardless of where the tempdir lives. + symlink("../b", root.join("a/loop")).unwrap(); + symlink("../a", root.join("b/loop")).unwrap(); + + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .expect("connector init"); + + // Run scan twice — both must terminate AND produce identical + // workspace_path lists (visited-set is deterministic per scan). + let v1 = conn.scan(&SourceScope::default()).expect("scan must return"); + let v2 = conn.scan(&SourceScope::default()).expect("scan must return"); + + let names1: Vec = v1.iter().map(|a| a.workspace_path.0.clone()).collect(); + let names2: Vec = v2.iter().map(|a| a.workspace_path.0.clone()).collect(); + assert_eq!(names1, names2, "scan must be deterministic across runs"); + + // No duplicate workspace paths (visited-set should suppress + // re-emission of the same canonical file via the cycle). + let mut seen = std::collections::HashSet::new(); + for asset in &v1 { + assert!( + seen.insert(asset.workspace_path.0.clone()), + "duplicate workspace_path: {}", + asset.workspace_path.0 + ); + } + + // Both real files must appear at least once. Their exact relative + // paths depend on which side of the cycle the walker descended into + // first; assert by basename to keep the check robust. + assert!( + v1.iter().any(|a| a.workspace_path.0.ends_with("inside_a.md")), + "expected inside_a.md in scan output, got: {names1:?}" + ); + assert!( + v1.iter().any(|a| a.workspace_path.0.ends_with("inside_b.md")), + "expected inside_b.md in scan output, got: {names1:?}" + ); + + // Sanity bound: with two real files and a working cycle guard the + // output should be tiny. If we ever produce >50 entries the visited + // set has regressed. + assert!( + v1.len() < 50, + "scan emitted {} assets — cycle guard likely regressed: {:?}", + v1.len(), + names1 + ); +} diff --git a/fixtures/source-fs/tree-1.snapshot.json b/fixtures/source-fs/tree-1.snapshot.json new file mode 100644 index 0000000..350d53c --- /dev/null +++ b/fixtures/source-fs/tree-1.snapshot.json @@ -0,0 +1,68 @@ +[ + { + "asset_id": "bd6e5649e546d6ac94c3269ffe7192c5", + "byte_len": 6, + "checksum": "f6b71def043f1fd92f2d34969a7272a9d134730551de8c9754c4be79fbc0aef3", + "discovered_at": "", + "media_type": { + "other": "" + }, + "source_uri": { + "kind": "file", + "value": "" + }, + "stored": { + "kind": "copied", + "path": "" + }, + "workspace_path": ".kbignore" + }, + { + "asset_id": "ba6cd31cab86eff7a86638ee76494bcf", + "byte_len": 169, + "checksum": "b0124489083674f6ad99a57ee5fc425feb71754a538a97a1ab580e8eb9b1f1c1", + "discovered_at": "", + "media_type": "markdown", + "source_uri": { + "kind": "file", + "value": "" + }, + "stored": { + "kind": "copied", + "path": "" + }, + "workspace_path": "README.md" + }, + { + "asset_id": "3381fcc34cf9415a391ba6b0dc6037c5", + "byte_len": 11, + "checksum": "e9fa9a5e0725d7bf6ec9d1565d3921eb6b62aa7f0db40c1c3ffebda7475d4258", + "discovered_at": "", + "media_type": "markdown", + "source_uri": { + "kind": "file", + "value": "" + }, + "stored": { + "kind": "copied", + "path": "" + }, + "workspace_path": "notes/alpha.md" + }, + { + "asset_id": "e300aa98aec843d2df1dd8f43702b257", + "byte_len": 10, + "checksum": "3e4df2f43563730d61672ce67a9bf479bc7c7a2f1384e2081d52e06f143353ed", + "discovered_at": "", + "media_type": "markdown", + "source_uri": { + "kind": "file", + "value": "" + }, + "stored": { + "kind": "copied", + "path": "" + }, + "workspace_path": "notes/beta.md" + } +] diff --git a/fixtures/source-fs/tree-1/.DS_Store b/fixtures/source-fs/tree-1/.DS_Store new file mode 100644 index 0000000..c24ae00 --- /dev/null +++ b/fixtures/source-fs/tree-1/.DS_Store @@ -0,0 +1 @@ +macOS Finder metadata placeholder. Implicitly excluded by FsSourceConnector. diff --git a/fixtures/source-fs/tree-1/.kbignore b/fixtures/source-fs/tree-1/.kbignore new file mode 100644 index 0000000..1944fd6 --- /dev/null +++ b/fixtures/source-fs/tree-1/.kbignore @@ -0,0 +1 @@ +*.tmp diff --git a/fixtures/source-fs/tree-1/README.md b/fixtures/source-fs/tree-1/README.md new file mode 100644 index 0000000..1948784 --- /dev/null +++ b/fixtures/source-fs/tree-1/README.md @@ -0,0 +1,5 @@ +# tree-1 + +Fixture for `kb-source-fs` snapshot tests. Contents are intentionally tiny +and stable — bumping a byte here will require regenerating the snapshot +baseline. diff --git a/fixtures/source-fs/tree-1/ignored/skip.tmp b/fixtures/source-fs/tree-1/ignored/skip.tmp new file mode 100644 index 0000000..73cdccd --- /dev/null +++ b/fixtures/source-fs/tree-1/ignored/skip.tmp @@ -0,0 +1 @@ +should be excluded by .kbignore diff --git a/fixtures/source-fs/tree-1/notes/alpha.md b/fixtures/source-fs/tree-1/notes/alpha.md new file mode 100644 index 0000000..b01a892 --- /dev/null +++ b/fixtures/source-fs/tree-1/notes/alpha.md @@ -0,0 +1 @@ +alpha note diff --git a/fixtures/source-fs/tree-1/notes/beta.md b/fixtures/source-fs/tree-1/notes/beta.md new file mode 100644 index 0000000..e689c5e --- /dev/null +++ b/fixtures/source-fs/tree-1/notes/beta.md @@ -0,0 +1 @@ +beta note