feat(p1-1): kb-source-fs filesystem source connector #6

Merged
altair823 merged 5 commits from feat/p1-1-source-fs into main 2026-04-30 12:45:47 +00:00
18 changed files with 1664 additions and 2 deletions

377
Cargo.lock generated
View File

@@ -79,6 +79,12 @@ version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
[[package]]
name = "bitflags"
version = "2.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
[[package]]
name = "blake3"
version = "1.8.5"
@@ -93,6 +99,16 @@ dependencies = [
"cpufeatures",
]
[[package]]
name = "bstr"
version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab"
dependencies = [
"memchr",
"serde",
]
[[package]]
name = "cc"
version = "1.2.61"
@@ -179,6 +195,25 @@ dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
@@ -222,12 +257,34 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
[[package]]
name = "errno"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
dependencies = [
"libc",
"windows-sys 0.61.2",
]
[[package]]
name = "fastrand"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
[[package]]
name = "find-msvc-tools"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
[[package]]
name = "foldhash"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "getrandom"
version = "0.2.17"
@@ -239,6 +296,41 @@ dependencies = [
"wasi",
]
[[package]]
name = "getrandom"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
dependencies = [
"cfg-if",
"libc",
"r-efi",
"wasip2",
"wasip3",
]
[[package]]
name = "globset"
version = "0.4.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52dfc19153a48bde0cbd630453615c8151bce3a5adfac7a0aebfbf0a1e1f57e3"
dependencies = [
"aho-corasick",
"bstr",
"log",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "hashbrown"
version = "0.15.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
dependencies = [
"foldhash",
]
[[package]]
name = "hashbrown"
version = "0.17.0"
@@ -251,6 +343,28 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "id-arena"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
[[package]]
name = "ignore"
version = "0.4.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3d782a365a015e0f5c04902246139249abf769125006fbe7649e2ee88169b4a"
dependencies = [
"crossbeam-deque",
"globset",
"log",
"memchr",
"regex-automata",
"same-file",
"walkdir",
"winapi-util",
]
[[package]]
name = "indexmap"
version = "2.14.0"
@@ -258,7 +372,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
dependencies = [
"equivalent",
"hashbrown",
"hashbrown 0.17.0",
"serde",
"serde_core",
]
[[package]]
@@ -335,12 +451,35 @@ dependencies = [
"serde",
]
[[package]]
name = "kb-source-fs"
version = "0.1.0"
dependencies = [
"anyhow",
"blake3",
"ignore",
"kb-config",
"kb-core",
"serde",
"serde_json",
"tempfile",
"time",
"tracing",
"walkdir",
]
[[package]]
name = "lazy_static"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "leb128fmt"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
[[package]]
name = "libc"
version = "0.2.186"
@@ -356,6 +495,12 @@ dependencies = [
"libc",
]
[[package]]
name = "linux-raw-sys"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
[[package]]
name = "log"
version = "0.4.29"
@@ -422,6 +567,16 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
[[package]]
name = "prettyplease"
version = "0.2.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
dependencies = [
"proc-macro2",
"syn",
]
[[package]]
name = "proc-macro2"
version = "1.0.106"
@@ -440,13 +595,19 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "r-efi"
version = "6.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
[[package]]
name = "redox_users"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43"
dependencies = [
"getrandom",
"getrandom 0.2.17",
"libredox",
"thiserror 1.0.69",
]
@@ -468,12 +629,40 @@ version = "0.8.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
[[package]]
name = "rustix"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
dependencies = [
"bitflags",
"errno",
"libc",
"linux-raw-sys",
"windows-sys 0.61.2",
]
[[package]]
name = "ryu-js"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd29631678d6fb0903b69223673e122c32e9ae559d0960a38d574695ebc0ea15"
[[package]]
name = "same-file"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
dependencies = [
"winapi-util",
]
[[package]]
name = "semver"
version = "1.0.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
[[package]]
name = "serde"
version = "1.0.228"
@@ -581,6 +770,19 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "tempfile"
version = "3.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
dependencies = [
"fastrand",
"getrandom 0.4.2",
"once_cell",
"rustix",
"windows-sys 0.61.2",
]
[[package]]
name = "thiserror"
version = "1.0.69"
@@ -819,6 +1021,12 @@ dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-xid"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
[[package]]
name = "utf8parse"
version = "0.2.2"
@@ -831,12 +1039,83 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
[[package]]
name = "walkdir"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
dependencies = [
"same-file",
"winapi-util",
]
[[package]]
name = "wasi"
version = "0.11.1+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
[[package]]
name = "wasip2"
version = "1.0.1+wasi-0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7"
dependencies = [
"wit-bindgen 0.46.0",
]
[[package]]
name = "wasip3"
version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
dependencies = [
"wit-bindgen 0.51.0",
]
[[package]]
name = "wasm-encoder"
version = "0.244.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319"
dependencies = [
"leb128fmt",
"wasmparser",
]
[[package]]
name = "wasm-metadata"
version = "0.244.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
dependencies = [
"anyhow",
"indexmap",
"wasm-encoder",
"wasmparser",
]
[[package]]
name = "wasmparser"
version = "0.244.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
dependencies = [
"bitflags",
"hashbrown 0.15.5",
"indexmap",
"semver",
]
[[package]]
name = "winapi-util"
version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "windows-link"
version = "0.2.1"
@@ -927,6 +1206,100 @@ dependencies = [
"memchr",
]
[[package]]
name = "wit-bindgen"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"
[[package]]
name = "wit-bindgen"
version = "0.51.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
dependencies = [
"wit-bindgen-rust-macro",
]
[[package]]
name = "wit-bindgen-core"
version = "0.51.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc"
dependencies = [
"anyhow",
"heck",
"wit-parser",
]
[[package]]
name = "wit-bindgen-rust"
version = "0.51.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
dependencies = [
"anyhow",
"heck",
"indexmap",
"prettyplease",
"syn",
"wasm-metadata",
"wit-bindgen-core",
"wit-component",
]
[[package]]
name = "wit-bindgen-rust-macro"
version = "0.51.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a"
dependencies = [
"anyhow",
"prettyplease",
"proc-macro2",
"quote",
"syn",
"wit-bindgen-core",
"wit-bindgen-rust",
]
[[package]]
name = "wit-component"
version = "0.244.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
dependencies = [
"anyhow",
"bitflags",
"indexmap",
"log",
"serde",
"serde_derive",
"serde_json",
"wasm-encoder",
"wasm-metadata",
"wasmparser",
"wit-parser",
]
[[package]]
name = "wit-parser"
version = "0.244.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
dependencies = [
"anyhow",
"id-arena",
"indexmap",
"log",
"semver",
"serde",
"serde_derive",
"serde_json",
"unicode-xid",
"wasmparser",
]
[[package]]
name = "zmij"
version = "1.0.21"

View File

@@ -4,6 +4,7 @@ members = [
"crates/kb-core",
"crates/kb-parse-types",
"crates/kb-config",
"crates/kb-source-fs",
"crates/kb-app",
"crates/kb-cli",
]

View File

@@ -40,6 +40,18 @@ impl WorkspacePath {
}
}
/// On-disk storage decision for a `RawAsset`.
///
/// **Important convention** — `path` field semantics differ by variant:
///
/// - `Copied { path }`: at scan time, `path` is the **source** path on the
/// user's filesystem. The asset writer (P1-6) is responsible for actually
/// copying the bytes into the workspace asset store, AND for overwriting
/// `path` with the destination path after the copy completes.
///
/// - `Reference { path, sha }`: `path` is always the **source** path. No
/// bytes are ever copied; downstream readers stream from `path` directly.
/// `sha` is the BLAKE3 full hex (matches `RawAsset::checksum`).
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase", tag = "kind")]
pub enum AssetStorage {

View File

@@ -0,0 +1,23 @@
[package]
name = "kb-source-fs"
version = { workspace = true }
edition = { workspace = true }
rust-version = { workspace = true }
license = { workspace = true }
repository = { workspace = true }
description = "Local filesystem SourceConnector — walks workspace.root + applies gitignore filters"
[dependencies]
kb-core = { path = "../kb-core" }
kb-config = { path = "../kb-config" }
anyhow = { workspace = true }
serde = { workspace = true }
time = { workspace = true }
blake3 = { workspace = true }
tracing = { workspace = true }
walkdir = "2"
ignore = "0.4"
[dev-dependencies]
serde_json = { workspace = true }
tempfile = "3"

View File

@@ -0,0 +1,423 @@
//! `FsSourceConnector` — public surface for the crate.
//!
//! ```ignore
//! pub struct FsSourceConnector { /* internal */ }
//! impl FsSourceConnector {
//! pub fn new(config: &kb_config::Config) -> anyhow::Result<Self>;
//! }
//! impl kb_core::SourceConnector for FsSourceConnector {
//! fn scan(&self, scope: &kb_core::SourceScope) -> anyhow::Result<Vec<kb_core::RawAsset>>;
//! }
//! ```
use std::path::PathBuf;
use anyhow::{Context, Result};
use time::OffsetDateTime;
use kb_config::Config;
use kb_core::{
AssetStorage, Checksum, RawAsset, SourceConnector, SourceScope, SourceUri,
id_for_asset, to_posix,
};
use crate::hash::hash_file;
use crate::media::media_type_for;
use crate::walker::{build_overrides, read_kbignore, walk_files};
/// Local-filesystem `SourceConnector`. Constructed once from `Config`,
/// reused across `scan` calls.
///
/// State carried between `new` and `scan`:
/// - `default_root`: `config.workspace.root` resolved to a `PathBuf`. Used
/// only when `SourceScope::root` is empty (i.e. the caller did not
/// override the root).
/// - `default_exclude`: snapshot of `config.workspace.exclude` at
/// construction time.
/// - `copy_threshold_bytes`: `config.storage.copy_threshold_mb * 1 MiB`
/// pre-multiplied so we don't recompute per file.
pub struct FsSourceConnector {
default_root: PathBuf,
default_exclude: Vec<String>,
copy_threshold_bytes: u64,
}
impl FsSourceConnector {
pub fn new(config: &Config) -> Result<Self> {
// `config.workspace.root` is a String that may contain `~` or env
// expansions. P0-* did not yet provide a path-expansion helper in
// kb-config; for P1-1 we expand `~` ourselves and leave `${VAR}`
// for a follow-up. The vast majority of users hit the `~` case.
let root = expand_tilde(&config.workspace.root);
let copy_threshold_bytes = config
.storage
.copy_threshold_mb
.saturating_mul(1024 * 1024);
Ok(Self {
default_root: root,
default_exclude: config.workspace.exclude.clone(),
copy_threshold_bytes,
})
}
}
impl SourceConnector for FsSourceConnector {
fn scan(&self, scope: &SourceScope) -> Result<Vec<RawAsset>> {
// `SourceScope::root` overrides config root when non-empty. This
// matches the design's "scope is the per-call lens; config is the
// default" split (§7.1).
let root = if scope.root.as_os_str().is_empty() {
self.default_root.clone()
} else {
scope.root.clone()
};
// Union: config.workspace.exclude scope.exclude .kbignore.
// Per §6.2 the union of `.kbignore` and `config.workspace.exclude`
// is the filter set. `scope.exclude` is added on top so a caller
// can layer a per-call narrowing.
let mut excludes = self.default_exclude.clone();
excludes.extend(scope.exclude.iter().cloned());
// .kbignore is re-read on every scan() so users can edit it without
// restarting any long-running process.
let kbignore = read_kbignore(&root)?;
let overrides = build_overrides(&root, &excludes, &kbignore)?;
// TODO(P1-2/P1-3 router): apply SourceScope::include glob filter at the
// extractor router layer once that crate lands. SourceConnector emits all
// non-excluded files; routing by include-glob is a downstream concern
// (design §6.2 + §7.2 are silent on this split, treat it as router work).
//
// `scope.include` is intentionally ignored at this stage of the
// pipeline: per §6.2 the workspace-level include lives in
// `WorkspaceCfg` and is enforced by the asset writer / extractors.
// Surfacing it here would double-filter Markdown vs PDF before the
// extractor router gets to see them.
if !scope.include.is_empty() {
tracing::debug!(
count = scope.include.len(),
"FsSourceConnector ignores scope.include — handled by extractor router"
);
}
let files = walk_files(&root, &overrides)?;
let mut assets = Vec::with_capacity(files.len());
for abs in &files {
// `to_posix` does NFC + leading `./` strip + `#` rejection.
// Compute the workspace-relative path before handing to it so
// emitted `WorkspacePath` is always relative.
let rel = abs.strip_prefix(&root).unwrap_or(abs);
let workspace_path = match to_posix(rel) {
Ok(p) => p,
Err(e) => {
// A path containing `#` is the only documented reason
// `to_posix` fails today. Drop the file with a warning
// rather than aborting the entire scan — a single bad
// filename should not nuke a 10 000-file ingest.
tracing::warn!(
path = %abs.display(),
error = %e,
"skipping file: path is not a valid WorkspacePath",
);
continue;
}
};
let media_type = media_type_for(abs);
let (byte_len, full_hex) = hash_file(abs)
.with_context(|| format!("hashing {}", abs.display()))?;
let checksum = Checksum(full_hex.clone());
let asset_id = id_for_asset(&full_hex);
// Storage variant signals *intent*, not an actual copy.
// P1-6 (asset writer) is responsible for the on-disk copy.
let stored = if byte_len > self.copy_threshold_bytes {
AssetStorage::Reference {
path: abs.clone(),
sha: checksum.clone(),
}
} else {
AssetStorage::Copied { path: abs.clone() }
};
assets.push(RawAsset {
asset_id,
source_uri: SourceUri::File(abs.clone()),
workspace_path,
media_type,
byte_len,
checksum,
discovered_at: OffsetDateTime::now_utc(),
stored,
});
}
// Determinism: sort by workspace_path. WorkspacePath is a String
// newtype with stable lexicographic ordering. Two scans of the
// same tree must produce identical Vec<RawAsset> modulo the
// wall-clock `discovered_at` field.
assets.sort_by(|a, b| a.workspace_path.0.cmp(&b.workspace_path.0));
Ok(assets)
}
}
// TODO(kb-config): hoist tilde + ${VAR} expansion into a kb-config helper
// once that crate gains a path-expansion API. Today this duplicates logic
// that P1-6 (store-sqlite) and future crates will also need.
/// Expand a leading `~` to the current user's home directory. No-op for
/// any other shape (absolute, relative, `${VAR}`-style).
fn expand_tilde(s: &str) -> PathBuf {
if let Some(rest) = s.strip_prefix("~/") {
if let Some(home) = dirs_home() {
return home.join(rest);
}
} else if s == "~" {
if let Some(home) = dirs_home() {
return home;
}
}
PathBuf::from(s)
}
/// Tiny `dirs::home_dir`-compat shim that does NOT add the `dirs` crate to
/// our dep set (we explicitly enumerate allowed deps in the task spec).
/// Reads `$HOME` directly.
fn dirs_home() -> Option<PathBuf> {
std::env::var_os("HOME").map(PathBuf::from)
}
#[cfg(test)]
mod tests {
use super::*;
use kb_config::Config;
fn cfg_with_root(root: &str) -> Config {
let mut c = Config::defaults();
c.workspace.root = root.to_string();
c.workspace.exclude.clear();
c
}
#[test]
fn scan_empty_dir_yields_empty_vec() {
let dir = tempfile::tempdir().unwrap();
let conn = FsSourceConnector::new(&cfg_with_root(
dir.path().to_str().unwrap(),
))
.unwrap();
let scope = SourceScope::default();
let v = conn.scan(&scope).unwrap();
assert!(v.is_empty());
}
#[test]
fn scan_emits_sorted_workspace_paths() {
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
std::fs::create_dir_all(root.join("notes")).unwrap();
std::fs::write(root.join("README.md"), b"hi").unwrap();
std::fs::write(root.join("notes/beta.md"), b"b").unwrap();
std::fs::write(root.join("notes/alpha.md"), b"a").unwrap();
let conn =
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
.unwrap();
let v = conn.scan(&SourceScope::default()).unwrap();
let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect();
assert_eq!(
names,
vec![
"README.md".to_string(),
"notes/alpha.md".to_string(),
"notes/beta.md".to_string(),
]
);
}
#[test]
fn scan_filters_by_kbignore() {
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
std::fs::write(root.join(".kbignore"), "*.tmp\n").unwrap();
std::fs::write(root.join("a.md"), b"x").unwrap();
std::fs::write(root.join("b.tmp"), b"x").unwrap();
let conn =
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
.unwrap();
let v = conn.scan(&SourceScope::default()).unwrap();
let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect();
// Decision: `.kbignore` itself IS emitted as a RawAsset (MediaType::Other("")).
// Rationale: a config file that affects ingest is itself part of the
// workspace contents; the markdown extractor (P1-2) will reject Other("")
// on its own. If we ever decide to omit `.kbignore` from the asset list,
// this test will catch it.
assert!(
names.contains(&".kbignore".to_string()),
".kbignore must be emitted as an asset; got: {names:?}"
);
assert!(names.contains(&"a.md".to_string()));
assert!(!names.contains(&"b.tmp".to_string()));
}
#[test]
fn scan_filters_default_excludes() {
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
std::fs::write(root.join("a.md"), b"x").unwrap();
std::fs::write(root.join(".DS_Store"), b"\0\0").unwrap();
std::fs::write(root.join("._sidecar"), b"\0\0").unwrap();
let conn =
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
.unwrap();
let v = conn.scan(&SourceScope::default()).unwrap();
let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect();
assert_eq!(names, vec!["a.md".to_string()]);
}
#[test]
fn scan_unions_config_exclude_and_kbignore() {
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
std::fs::write(root.join(".kbignore"), "*.tmp\n").unwrap();
std::fs::write(root.join("a.md"), b"x").unwrap();
std::fs::write(root.join("b.tmp"), b"x").unwrap();
std::fs::write(root.join("c.log"), b"x").unwrap();
let mut cfg = cfg_with_root(root.to_str().unwrap());
cfg.workspace.exclude.push("*.log".to_string());
let conn = FsSourceConnector::new(&cfg).unwrap();
let v = conn.scan(&SourceScope::default()).unwrap();
let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect();
assert!(names.contains(&"a.md".to_string()));
assert!(!names.contains(&"b.tmp".to_string()), "kbignore should drop *.tmp");
assert!(!names.contains(&"c.log".to_string()), "config.exclude should drop *.log");
}
#[test]
fn scan_blake3_pinned_for_known_file() {
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
std::fs::write(root.join("hello.md"), b"hello world").unwrap();
let conn =
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
.unwrap();
let v = conn.scan(&SourceScope::default()).unwrap();
assert_eq!(v.len(), 1);
let asset = &v[0];
assert_eq!(
asset.checksum.0,
"d74981efa70a0c880b8d8c1985d075dbcbf679b99a5f9914e5aaf96b831a9e24"
);
assert_eq!(asset.byte_len, 11);
// asset_id is derived from the full hex via id_for_asset.
assert_eq!(asset.asset_id, id_for_asset(&asset.checksum.0));
}
#[test]
fn scan_idempotent_modulo_timestamp() {
// Same filesystem state → identical Vec<RawAsset> *modulo*
// discovered_at. Strip that field and compare.
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
std::fs::create_dir_all(root.join("notes")).unwrap();
std::fs::write(root.join("notes/a.md"), b"alpha").unwrap();
std::fs::write(root.join("notes/b.md"), b"beta").unwrap();
let conn =
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
.unwrap();
let v1 = conn.scan(&SourceScope::default()).unwrap();
let v2 = conn.scan(&SourceScope::default()).unwrap();
assert_eq!(v1.len(), v2.len());
for (a, b) in v1.iter().zip(v2.iter()) {
assert_eq!(a.asset_id, b.asset_id);
assert_eq!(a.workspace_path, b.workspace_path);
assert_eq!(a.checksum, b.checksum);
assert_eq!(a.byte_len, b.byte_len);
assert_eq!(a.media_type, b.media_type);
assert_eq!(a.source_uri, b.source_uri);
assert_eq!(a.stored, b.stored);
// discovered_at intentionally NOT compared
}
}
#[test]
fn scan_emits_posix_normalized_paths() {
// End-to-end: the connector must produce POSIX-normalized
// workspace paths via `kb_core::to_posix`. We can't construct an
// input with literal `./` / `//` segments via the filesystem (the
// OS won't let us), so instead we assert the resulting strings
// are already POSIX-clean (no leading `./`, no `//`, forward
// slashes only) — which is the post-conditions side of the
// round-trip the unit tests in `kb-core::normalize` cover.
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
std::fs::create_dir_all(root.join("a/b/c")).unwrap();
std::fs::write(root.join("a/b/c/d.md"), b"x").unwrap();
let conn =
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
.unwrap();
let v = conn.scan(&SourceScope::default()).unwrap();
assert_eq!(v.len(), 1);
let p = &v[0].workspace_path.0;
assert_eq!(p, "a/b/c/d.md");
assert!(!p.starts_with("./"));
assert!(!p.contains("//"));
assert!(!p.contains('\\'));
}
#[test]
fn scan_skips_files_whose_name_contains_hash() {
// `WorkspacePath` rejects `#` (collides with the W3C-Media-Fragments
// separator used by `Citation`). The connector must drop such
// files with a warning rather than aborting the scan.
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
std::fs::write(root.join("ok.md"), b"x").unwrap();
std::fs::write(root.join("has#hash.md"), b"y").unwrap();
let conn =
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
.unwrap();
let v = conn.scan(&SourceScope::default()).unwrap();
let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect();
assert_eq!(names, vec!["ok.md".to_string()]);
}
#[test]
fn copy_vs_reference_threshold_signals_intent() {
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
std::fs::write(root.join("small.md"), b"hi").unwrap();
let mut cfg = cfg_with_root(root.to_str().unwrap());
// Threshold = 0 MiB ⇒ even a 2-byte file becomes Reference.
cfg.storage.copy_threshold_mb = 0;
let conn = FsSourceConnector::new(&cfg).unwrap();
let v = conn.scan(&SourceScope::default()).unwrap();
assert_eq!(v.len(), 1);
match &v[0].stored {
AssetStorage::Reference { sha, .. } => {
assert_eq!(sha, &v[0].checksum);
}
other => panic!("expected Reference, got {other:?}"),
}
// Threshold high (default 100 MiB) ⇒ Copied.
let mut cfg2 = cfg_with_root(root.to_str().unwrap());
cfg2.storage.copy_threshold_mb = 100;
let conn2 = FsSourceConnector::new(&cfg2).unwrap();
let v2 = conn2.scan(&SourceScope::default()).unwrap();
assert!(matches!(v2[0].stored, AssetStorage::Copied { .. }));
}
}

View File

@@ -0,0 +1,92 @@
//! Streaming BLAKE3 over a file path. Per task spec, files MUST NOT be
//! loaded fully into memory: `blake3::Hasher::update_reader` reads through a
//! 64 KiB internal buffer, which keeps memory bounded for any size of file.
//!
//! Returns `(byte_len, full_hex)`:
//! - `byte_len` is the total bytes hashed (== file size after follow).
//! - `full_hex` is the canonical lowercase hex (64 chars) of the full
//! blake3 digest. The `kb-core::Checksum` invariant is "full hex"; the
//! 32-char prefix is reserved for `AssetId` derivation via
//! `kb_core::id_for_asset`.
use std::fs::File;
use std::io::{self, Read};
use std::path::Path;
use anyhow::{Context, Result};
const READ_BUFFER_BYTES: usize = 64 * 1024;
/// Stream-hash a file with blake3. Returns `(byte_len, full_hex_64)`.
///
/// `byte_len` is computed during streaming so callers do not need a separate
/// `metadata().len()` call (which can disagree with hashed bytes if the file
/// is rewritten mid-scan, but blake3-of-stream is the source of truth for
/// `RawAsset.checksum`).
pub(crate) fn hash_file(path: &Path) -> Result<(u64, String)> {
let file = File::open(path)
.with_context(|| format!("failed to open {} for hashing", path.display()))?;
hash_reader(file).with_context(|| format!("failed to hash {}", path.display()))
}
fn hash_reader<R: Read>(mut reader: R) -> Result<(u64, String)> {
let mut hasher = blake3::Hasher::new();
let mut buf = vec![0u8; READ_BUFFER_BYTES];
let mut total: u64 = 0;
loop {
match reader.read(&mut buf) {
Ok(0) => break,
Ok(n) => {
hasher.update(&buf[..n]);
total = total.saturating_add(n as u64);
}
Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => return Err(e.into()),
}
}
Ok((total, hasher.finalize().to_hex().to_string()))
}
#[cfg(test)]
mod tests {
use super::*;
/// blake3 of the empty input is the well-known "official empty hash"
/// from the blake3 spec. Pinned so that swapping the hash crate or the
/// streaming implementation can never silently produce a different
/// digest for a known input.
#[test]
fn empty_blake3_pinned() {
let (n, hex) = hash_reader(std::io::empty()).unwrap();
assert_eq!(n, 0);
assert_eq!(
hex,
"af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262"
);
}
/// `b"hello world"` blake3 (full 64 hex). Computed independently with
/// `b3sum`; pinning here detects any drift in the streaming pipeline.
#[test]
fn known_bytes_blake3_pinned() {
let bytes = b"hello world";
let (n, hex) = hash_reader(&bytes[..]).unwrap();
assert_eq!(n, 11);
assert_eq!(
hex,
"d74981efa70a0c880b8d8c1985d075dbcbf679b99a5f9914e5aaf96b831a9e24"
);
}
/// Streaming a buffer larger than `READ_BUFFER_BYTES` must produce the
/// same digest as a single-shot blake3 over the same bytes — i.e. the
/// chunk boundary is invisible.
#[test]
fn streaming_matches_oneshot_over_buffer_boundary() {
let bytes: Vec<u8> = (0u8..=255u8).cycle().take(READ_BUFFER_BYTES * 3 + 17).collect();
let (n, streamed) = hash_reader(&bytes[..]).unwrap();
assert_eq!(n, bytes.len() as u64);
let oneshot = blake3::hash(&bytes).to_hex().to_string();
assert_eq!(streamed, oneshot);
}
}

View File

@@ -0,0 +1,16 @@
//! `kb-source-fs` — local filesystem `SourceConnector`.
//!
//! Walks `config.workspace.root`, applies gitignore-style filters from
//! `config.workspace.exclude` `.kbignore`, computes BLAKE3 of every file,
//! and emits `Vec<RawAsset>` sorted by `workspace_path` for determinism.
//!
//! Per design §3.3 (RawAsset), §6.2 (workspace + .kbignore), §6.6 (POSIX
//! normalization), §7.1 (SourceScope), §7.2 (SourceConnector), §8 (module
//! boundaries).
mod connector;
mod hash;
mod media;
mod walker;
pub use connector::FsSourceConnector;

View File

@@ -0,0 +1,85 @@
//! Media-type detection by extension. Per P1-1 task spec we do NOT do
//! libmagic-style sniffing; extension is enough for P1. Unknown / missing
//! extensions fall through to `MediaType::Other(ext.to_string())` (empty
//! string when the file has no extension at all).
use std::path::Path;
use kb_core::{AudioType, ImageType, MediaType};
/// Return `MediaType` for `path` based purely on its lowercased extension.
/// `.md` → Markdown, `.pdf` → Pdf, image and audio extensions map onto
/// `MediaType::Image(_)` / `MediaType::Audio(_)`. Anything else (including
/// missing extension) → `MediaType::Other(ext)`.
pub(crate) fn media_type_for(path: &Path) -> MediaType {
let ext = path
.extension()
.and_then(|s| s.to_str())
.map(|s| s.to_ascii_lowercase())
.unwrap_or_default();
match ext.as_str() {
"md" => MediaType::Markdown,
"pdf" => MediaType::Pdf,
"png" => MediaType::Image(ImageType::Png),
"jpg" | "jpeg" => MediaType::Image(ImageType::Jpeg),
"webp" => MediaType::Image(ImageType::Webp),
"gif" => MediaType::Image(ImageType::Gif),
"tiff" | "tif" => MediaType::Image(ImageType::Tiff),
"m4a" => MediaType::Audio(AudioType::M4a),
"mp3" => MediaType::Audio(AudioType::Mp3),
"wav" => MediaType::Audio(AudioType::Wav),
"flac" => MediaType::Audio(AudioType::Flac),
"ogg" => MediaType::Audio(AudioType::Ogg),
// Empty string (no extension) and any other extension: bucket as
// Other and let downstream extractors decide if they support it.
_ => MediaType::Other(ext),
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn markdown_and_pdf() {
assert_eq!(media_type_for(Path::new("a/b.md")), MediaType::Markdown);
assert_eq!(media_type_for(Path::new("a/b.MD")), MediaType::Markdown);
assert_eq!(media_type_for(Path::new("a/b.pdf")), MediaType::Pdf);
}
#[test]
fn images_and_audio() {
assert_eq!(
media_type_for(Path::new("p.jpg")),
MediaType::Image(ImageType::Jpeg)
);
assert_eq!(
media_type_for(Path::new("p.JPEG")),
MediaType::Image(ImageType::Jpeg)
);
assert_eq!(
media_type_for(Path::new("a.M4A")),
MediaType::Audio(AudioType::M4a)
);
assert_eq!(
media_type_for(Path::new("a.flac")),
MediaType::Audio(AudioType::Flac)
);
}
#[test]
fn unknown_and_missing_extension() {
assert_eq!(
media_type_for(Path::new("notes/x.weird")),
MediaType::Other("weird".to_string())
);
assert_eq!(
media_type_for(Path::new("README")),
MediaType::Other(String::new())
);
}
}

View File

@@ -0,0 +1,260 @@
//! Directory walker with gitignore-style filtering and symlink-cycle
//! protection.
//!
//! Filter set (per task spec, design §6.2):
//! - `config.workspace.exclude` (passed in by `FsSourceConnector`)
//! - `<root>/.kbignore` (optional file at workspace root)
//! - default-excludes for `.DS_Store` and macOS resource forks (`._*`)
//!
//! All three are merged via `ignore::overrides::OverrideBuilder`, which
//! gives full gitignore semantics (anchors, `!` negation, `**`, etc.). We
//! prepend `!` to each pattern because `OverrideBuilder` treats positive
//! patterns as "include" and negative as "exclude" — see §"Filter set"
//! comment in `build_walker` for the full reasoning.
//!
//! Symlink handling: we want to follow links (so a workspace using a
//! symlinked `notes/` directory works), but we must NOT loop forever on
//! `a -> b -> a`. `walkdir` does NOT detect cycles for us when
//! `follow_links(true)`; we layer our own visited-set on top, keyed by the
//! canonical path of every entry, and skip any entry we've already seen.
//!
//! ## Why `walkdir` instead of `ignore::WalkBuilder`?
//!
//! `ignore::WalkBuilder` bundles gitignore semantics + cycle detection in
//! one API. We use `walkdir` directly because we need explicit control
//! over canonical-path comparison for sibling-subtree symlinks (a case
//! `walkdir`'s ancestor-only check can miss). Override-based filtering
//! still uses the `ignore` crate's `Override` matcher, just decoupled from
//! its walker.
use std::collections::HashSet;
use std::path::{Path, PathBuf};
use anyhow::{Context, Result};
use ignore::overrides::{Override, OverrideBuilder};
use walkdir::{DirEntry, WalkDir};
/// Default-excludes baked into the connector. These are NOT configurable;
/// they cover noise that is never useful to ingest and would otherwise need
/// to appear in every user's `.kbignore`.
const DEFAULT_EXCLUDES: &[&str] = &[
// Finder metadata
".DS_Store",
"**/.DS_Store",
// macOS resource forks (AppleDouble files)
"._*",
"**/._*",
];
/// Build the merged `Override` from `config.workspace.exclude` `.kbignore`
/// baked-in default excludes.
///
/// Each input pattern is registered as an *exclude* (gitignore-style: a
/// leading `!` flips a positive match to a negative one in the
/// `OverrideBuilder` API). Order doesn't matter — the union is computed by
/// the underlying gitignore engine.
pub(crate) fn build_overrides(
root: &Path,
config_exclude: &[String],
kbignore_patterns: &[String],
) -> Result<Override> {
let mut builder = OverrideBuilder::new(root);
for pat in DEFAULT_EXCLUDES {
builder
.add(&format!("!{pat}"))
.with_context(|| format!("invalid default-exclude pattern: {pat}"))?;
}
for pat in config_exclude {
builder
.add(&format!("!{pat}"))
.with_context(|| format!("invalid workspace.exclude pattern: {pat}"))?;
}
for pat in kbignore_patterns {
builder
.add(&format!("!{pat}"))
.with_context(|| format!("invalid .kbignore pattern: {pat}"))?;
}
builder.build().context("failed to compile override set")
}
/// Read `<root>/.kbignore` if it exists. Each non-blank, non-comment line is
/// a gitignore pattern. Missing file → empty Vec (not an error).
pub(crate) fn read_kbignore(root: &Path) -> Result<Vec<String>> {
let path = root.join(".kbignore");
if !path.exists() {
return Ok(Vec::new());
}
let text = std::fs::read_to_string(&path)
.with_context(|| format!("failed to read {}", path.display()))?;
Ok(text
.lines()
.map(|l| l.trim())
.filter(|l| !l.is_empty() && !l.starts_with('#'))
.map(|l| l.to_string())
.collect())
}
/// Iterate every regular file under `root`, applying `overrides` and
/// detecting symlink cycles. Returns absolute file paths.
///
/// Strategy:
/// - `walkdir::WalkDir::follow_links(true)` to traverse symlinks.
/// - Maintain `visited: HashSet<PathBuf>` of *canonical* paths. Before
/// descending into a directory entry, canonicalize and check the set;
/// if already present, skip. This breaks `a -> b -> a` cycles in O(n)
/// per entry without a custom recursive walker.
/// - For each yielded entry, ask `overrides` whether it is excluded; if
/// so, drop it. If the entry is a directory, also short-circuit
/// `WalkDir`'s descent via `it.skip_current_dir()`.
pub(crate) fn walk_files(root: &Path, overrides: &Override) -> Result<Vec<PathBuf>> {
let mut out = Vec::new();
let mut visited: HashSet<PathBuf> = HashSet::new();
let walker = WalkDir::new(root).follow_links(true).into_iter();
let mut it = walker.filter_entry(|e| !is_excluded(e, root, overrides));
while let Some(res) = it.next() {
let entry = match res {
Ok(e) => e,
Err(err) => {
// `walkdir` surfaces I/O errors AND its own cycle detector
// (when follow_links is on it sometimes catches them).
// Either way: log and skip; do not abort the whole scan.
tracing::warn!(error = %err, "walkdir entry error; skipping");
continue;
}
};
let path = entry.path();
// Cycle guard: only canonicalize symlinks (cheap on the common case
// of plain files/dirs) and on directories that are followed via a
// symlink. `walkdir`'s `path_is_symlink()` is true when the entry's
// *original* path is a symlink (it returns true for the link, not
// for the resolved target). For non-symlinked directories we still
// record the canonical path so a *later* symlink that points back
// to one of them is detected.
if entry.file_type().is_dir() {
match std::fs::canonicalize(path) {
Ok(canon) => {
if !visited.insert(canon) {
// Already visited via another path → break cycle.
it.skip_current_dir();
continue;
}
}
Err(err) => {
tracing::debug!(
path = %path.display(),
error = %err,
"skipping: canonicalize failed (broken/permission-denied symlink target)"
);
continue;
}
}
}
if entry.file_type().is_file() {
out.push(path.to_path_buf());
}
}
Ok(out)
}
fn is_excluded(entry: &DirEntry, root: &Path, overrides: &Override) -> bool {
// `Override::matched(path, is_dir)` uses the path *relative to* the
// override builder's root. `walkdir` gives absolute paths when
// `WalkDir::new` was given an absolute path — strip the root prefix
// before consulting the override.
let rel = match entry.path().strip_prefix(root) {
Ok(p) => p,
Err(_) => entry.path(),
};
overrides
.matched(rel, entry.file_type().is_dir())
.is_ignore()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn empty_inputs_compile_into_an_override() {
let dir = tempfile::tempdir().unwrap();
let ov = build_overrides(dir.path(), &[], &[]).unwrap();
// Default-excludes only; non-special files should not match.
let m = ov.matched(Path::new("notes/alpha.md"), false);
assert!(!m.is_ignore());
}
#[test]
fn default_excludes_ds_store_and_resource_forks() {
let dir = tempfile::tempdir().unwrap();
let ov = build_overrides(dir.path(), &[], &[]).unwrap();
assert!(ov.matched(Path::new(".DS_Store"), false).is_ignore());
assert!(
ov.matched(Path::new("notes/.DS_Store"), false).is_ignore()
);
assert!(ov.matched(Path::new("._foo.md"), false).is_ignore());
assert!(
ov.matched(Path::new("notes/._sidecar"), false).is_ignore()
);
}
#[test]
fn config_exclude_filters_tmp_and_node_modules() {
let dir = tempfile::tempdir().unwrap();
let ov = build_overrides(
dir.path(),
&["*.tmp".to_string(), "node_modules/**".to_string()],
&[],
)
.unwrap();
assert!(ov.matched(Path::new("a.tmp"), false).is_ignore());
assert!(ov.matched(Path::new("notes/x.tmp"), false).is_ignore());
assert!(
ov.matched(Path::new("node_modules/foo/bar.js"), false)
.is_ignore()
);
assert!(!ov.matched(Path::new("alpha.md"), false).is_ignore());
}
#[test]
fn kbignore_union_with_config_exclude() {
// "either set excluding it ⇒ excluded"
let dir = tempfile::tempdir().unwrap();
let ov = build_overrides(
dir.path(),
&["*.tmp".to_string()],
&["secret/**".to_string()],
)
.unwrap();
assert!(ov.matched(Path::new("a.tmp"), false).is_ignore());
assert!(
ov.matched(Path::new("secret/key.md"), false).is_ignore()
);
}
#[test]
fn read_kbignore_missing_returns_empty() {
let dir = tempfile::tempdir().unwrap();
let v = read_kbignore(dir.path()).unwrap();
assert!(v.is_empty());
}
#[test]
fn read_kbignore_strips_blanks_and_comments() {
let dir = tempfile::tempdir().unwrap();
std::fs::write(
dir.path().join(".kbignore"),
"# comment\n*.tmp\n\nignored/**\n",
)
.unwrap();
let v = read_kbignore(dir.path()).unwrap();
assert_eq!(v, vec!["*.tmp".to_string(), "ignored/**".to_string()]);
}
}

View File

@@ -0,0 +1,139 @@
//! Snapshot + determinism tests against `fixtures/source-fs/tree-1`.
//!
//! Layout (committed under `<repo>/fixtures/source-fs/tree-1/`):
//!
//! ```
//! tree-1/
//! ├── README.md
//! ├── notes/
//! │ ├── alpha.md
//! │ └── beta.md
//! ├── ignored/
//! │ └── skip.tmp # excluded by .kbignore
//! ├── .kbignore # contains: *.tmp
//! └── .DS_Store # implicitly excluded
//! ```
//!
//! Two assertions:
//! 1. Snapshot stability — `scan` output (with `discovered_at` stripped)
//! matches the committed baseline JSON byte-for-byte.
//! 2. Determinism — running `scan` twice produces byte-identical JSON
//! after stripping `discovered_at`.
//!
//! `discovered_at` is wall-clock and intentionally NOT part of the
//! contract: the task spec says strip it before comparison.
use std::path::PathBuf;
use kb_config::Config;
use kb_core::{SourceConnector, SourceScope};
use kb_source_fs::FsSourceConnector;
use serde_json::Value;
/// Repo root, derived from `CARGO_MANIFEST_DIR` (= `crates/kb-source-fs`).
fn repo_root() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.parent()
.unwrap()
.parent()
.unwrap()
.to_path_buf()
}
fn fixture_root() -> PathBuf {
repo_root().join("fixtures/source-fs/tree-1")
}
fn baseline_path() -> PathBuf {
repo_root().join("fixtures/source-fs/tree-1.snapshot.json")
}
fn cfg_for_fixture(root: &str) -> Config {
let mut c = Config::defaults();
c.workspace.root = root.to_string();
// Clear default excludes (`.git/**`, `node_modules/**`, `.obsidian/**`)
// so the snapshot is purely a function of the fixture + .kbignore +
// baked-in default-excludes.
c.workspace.exclude.clear();
c
}
/// Run `scan` against the fixture and return the JSON value with every
/// `discovered_at` field replaced by the literal string "<stripped>".
/// Also strip `source_uri.value` and `stored.path` because they contain
/// absolute paths that vary by checkout location — the snapshot must be
/// portable across machines and CI checkout dirs.
fn scan_and_strip() -> Value {
let root = fixture_root();
let cfg = cfg_for_fixture(root.to_str().unwrap());
let conn = FsSourceConnector::new(&cfg).expect("connector init");
let assets = conn
.scan(&SourceScope::default())
.expect("scan must succeed against committed fixture");
let mut v = serde_json::to_value(&assets).expect("serialize");
if let Value::Array(items) = &mut v {
for item in items {
if let Value::Object(map) = item {
map.insert(
"discovered_at".to_string(),
Value::String("<stripped>".to_string()),
);
// source_uri = { kind: "file", value: "<abs>" } — strip value.
if let Some(Value::Object(s)) = map.get_mut("source_uri") {
if s.contains_key("value") {
s.insert("value".to_string(), Value::String("<stripped>".to_string()));
}
}
// stored = { kind: "copied"|"reference", path: "<abs>", ... } — strip path.
if let Some(Value::Object(s)) = map.get_mut("stored") {
if s.contains_key("path") {
s.insert("path".to_string(), Value::String("<stripped>".to_string()));
}
}
}
}
}
v
}
#[test]
fn tree_1_snapshot_matches_baseline() {
let actual = scan_and_strip();
// If KB_REGEN_SNAPSHOT is set, (re)write the baseline and exit
// *before* attempting to read it. This is the only path that may
// create the file from scratch.
if std::env::var_os("KB_REGEN_SNAPSHOT").is_some() {
let pretty = serde_json::to_string_pretty(&actual).unwrap() + "\n";
std::fs::write(baseline_path(), pretty).expect("write baseline");
panic!("regenerated baseline; rerun without KB_REGEN_SNAPSHOT to verify");
}
let baseline_text = std::fs::read_to_string(baseline_path()).unwrap_or_else(|_| {
panic!(
"missing baseline at {} — regenerate via `KB_REGEN_SNAPSHOT=1 cargo test \
-p kb-source-fs --test snapshot_tree1 -- tree_1_snapshot_matches_baseline`",
baseline_path().display()
)
});
let expected: Value = serde_json::from_str(&baseline_text)
.expect("baseline JSON must parse");
if actual != expected {
let actual_pretty = serde_json::to_string_pretty(&actual).unwrap();
let expected_pretty = serde_json::to_string_pretty(&expected).unwrap();
panic!(
"snapshot drift.\n--- expected ---\n{expected_pretty}\n--- actual ---\n{actual_pretty}\n"
);
}
}
#[test]
fn tree_1_scan_is_deterministic() {
let v1 = scan_and_strip();
let v2 = scan_and_strip();
let s1 = serde_json::to_string(&v1).unwrap();
let s2 = serde_json::to_string(&v2).unwrap();
assert_eq!(s1, s2, "two consecutive scans diverged");
}

View File

@@ -0,0 +1,160 @@
//! Integration test: a `notes/` symlink whose target points back at the
//! workspace root MUST NOT cause `scan` to loop forever or panic.
//!
//! Layout (built per-test in a tempdir):
//! root/
//! ├── alpha.md
//! ├── notes/ (symlink → root) ← cycle: root → notes → root → …
//!
//! Expected: `scan` returns in O(seconds), every emitted path is unique,
//! and `alpha.md` appears at least once.
//!
//! The cycle guard lives in `walker::walk_files`; this test exists to
//! prove it catches the realistic shape (cycle through one or more
//! symlinks) end-to-end via the public API.
#![cfg(unix)]
use std::os::unix::fs::symlink;
use kb_config::Config;
use kb_core::{SourceConnector, SourceScope};
use kb_source_fs::FsSourceConnector;
fn cfg_with_root(root: &str) -> Config {
let mut c = Config::defaults();
c.workspace.root = root.to_string();
c.workspace.exclude.clear();
c
}
#[test]
fn symlink_cycle_does_not_loop_or_crash() {
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
std::fs::write(root.join("alpha.md"), b"alpha").unwrap();
// Symlink: root/notes → root (a → a cycle through the link `notes`).
symlink(root, root.join("notes")).unwrap();
let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
.expect("connector init");
let v = conn
.scan(&SourceScope::default())
.expect("scan must return, not loop");
// Determinism check: no duplicate workspace paths.
let mut seen = std::collections::HashSet::new();
for asset in &v {
assert!(
seen.insert(asset.workspace_path.0.clone()),
"duplicate workspace_path: {}",
asset.workspace_path.0
);
}
// The original alpha.md must appear.
assert!(
v.iter().any(|a| a.workspace_path.0 == "alpha.md"),
"expected alpha.md in scan output, got: {:?}",
v.iter().map(|a| &a.workspace_path.0).collect::<Vec<_>>()
);
}
#[test]
fn dangling_symlink_pseudo_cycle_does_not_crash() {
// root/
// ├── alpha.md
// ├── a → b (b does not exist as a real file/dir)
// └── b → a (a does not exist as a real file/dir)
//
// Both symlinks are dangling — neither resolves to anything. This is
// NOT a real two-step directory cycle (see
// `two_step_directory_cycle_visited_set_breaks_loop` for that case);
// it merely verifies the scan tolerates broken-link pseudo-cycles
// without crashing or looping.
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
std::fs::write(root.join("alpha.md"), b"alpha").unwrap();
symlink(root.join("b"), root.join("a")).unwrap();
symlink(root.join("a"), root.join("b")).unwrap();
let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
.expect("connector init");
// Even though a→b→a never resolves to a real directory (broken
// pseudo-cycle of dangling symlinks), the scan must complete and
// surface alpha.md.
let v = conn.scan(&SourceScope::default()).expect("scan must return");
assert!(v.iter().any(|a| a.workspace_path.0 == "alpha.md"));
}
#[test]
fn two_step_directory_cycle_visited_set_breaks_loop() {
// Real two-step directory cycle through symlinks:
// root/
// ├── a/
// │ ├── inside_a.md
// │ └── loop → ../b (symlink, target IS a real directory)
// └── b/
// ├── inside_b.md
// └── loop → ../a (symlink, target IS a real directory)
//
// Without the visited-set, walkdir would descend
// a → a/loop (=b) → a/loop/loop (=a) → … forever.
// The canonical-path visited-set in `walker::walk_files` must break
// the loop and yield a finite, deterministic result.
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
std::fs::create_dir(root.join("a")).unwrap();
std::fs::create_dir(root.join("b")).unwrap();
std::fs::write(root.join("a/inside_a.md"), b"a-content").unwrap();
std::fs::write(root.join("b/inside_b.md"), b"b-content").unwrap();
// Use relative targets so the symlink truly points at the sibling
// directory regardless of where the tempdir lives.
symlink("../b", root.join("a/loop")).unwrap();
symlink("../a", root.join("b/loop")).unwrap();
let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
.expect("connector init");
// Run scan twice — both must terminate AND produce identical
// workspace_path lists (visited-set is deterministic per scan).
let v1 = conn.scan(&SourceScope::default()).expect("scan must return");
let v2 = conn.scan(&SourceScope::default()).expect("scan must return");
let names1: Vec<String> = v1.iter().map(|a| a.workspace_path.0.clone()).collect();
let names2: Vec<String> = v2.iter().map(|a| a.workspace_path.0.clone()).collect();
assert_eq!(names1, names2, "scan must be deterministic across runs");
// No duplicate workspace paths (visited-set should suppress
// re-emission of the same canonical file via the cycle).
let mut seen = std::collections::HashSet::new();
for asset in &v1 {
assert!(
seen.insert(asset.workspace_path.0.clone()),
"duplicate workspace_path: {}",
asset.workspace_path.0
);
}
// Both real files must appear at least once. Their exact relative
// paths depend on which side of the cycle the walker descended into
// first; assert by basename to keep the check robust.
assert!(
v1.iter().any(|a| a.workspace_path.0.ends_with("inside_a.md")),
"expected inside_a.md in scan output, got: {names1:?}"
);
assert!(
v1.iter().any(|a| a.workspace_path.0.ends_with("inside_b.md")),
"expected inside_b.md in scan output, got: {names1:?}"
);
// Sanity bound: with two real files and a working cycle guard the
// output should be tiny. If we ever produce >50 entries the visited
// set has regressed.
assert!(
v1.len() < 50,
"scan emitted {} assets — cycle guard likely regressed: {:?}",
v1.len(),
names1
);
}

View File

@@ -0,0 +1,68 @@
[
{
"asset_id": "bd6e5649e546d6ac94c3269ffe7192c5",
"byte_len": 6,
"checksum": "f6b71def043f1fd92f2d34969a7272a9d134730551de8c9754c4be79fbc0aef3",
"discovered_at": "<stripped>",
"media_type": {
"other": ""
},
"source_uri": {
"kind": "file",
"value": "<stripped>"
},
"stored": {
"kind": "copied",
"path": "<stripped>"
},
"workspace_path": ".kbignore"
},
{
"asset_id": "ba6cd31cab86eff7a86638ee76494bcf",
"byte_len": 169,
"checksum": "b0124489083674f6ad99a57ee5fc425feb71754a538a97a1ab580e8eb9b1f1c1",
"discovered_at": "<stripped>",
"media_type": "markdown",
"source_uri": {
"kind": "file",
"value": "<stripped>"
},
"stored": {
"kind": "copied",
"path": "<stripped>"
},
"workspace_path": "README.md"
},
{
"asset_id": "3381fcc34cf9415a391ba6b0dc6037c5",
"byte_len": 11,
"checksum": "e9fa9a5e0725d7bf6ec9d1565d3921eb6b62aa7f0db40c1c3ffebda7475d4258",
"discovered_at": "<stripped>",
"media_type": "markdown",
"source_uri": {
"kind": "file",
"value": "<stripped>"
},
"stored": {
"kind": "copied",
"path": "<stripped>"
},
"workspace_path": "notes/alpha.md"
},
{
"asset_id": "e300aa98aec843d2df1dd8f43702b257",
"byte_len": 10,
"checksum": "3e4df2f43563730d61672ce67a9bf479bc7c7a2f1384e2081d52e06f143353ed",
"discovered_at": "<stripped>",
"media_type": "markdown",
"source_uri": {
"kind": "file",
"value": "<stripped>"
},
"stored": {
"kind": "copied",
"path": "<stripped>"
},
"workspace_path": "notes/beta.md"
}
]

1
fixtures/source-fs/tree-1/.DS_Store vendored Normal file
View File

@@ -0,0 +1 @@
macOS Finder metadata placeholder. Implicitly excluded by FsSourceConnector.

View File

@@ -0,0 +1 @@
*.tmp

View File

@@ -0,0 +1,5 @@
# tree-1
Fixture for `kb-source-fs` snapshot tests. Contents are intentionally tiny
and stable — bumping a byte here will require regenerating the snapshot
baseline.

View File

@@ -0,0 +1 @@
should be excluded by .kbignore

View File

@@ -0,0 +1 @@
alpha note

View File

@@ -0,0 +1 @@
beta note