feat(p1-1): kb-source-fs filesystem source connector #6
377
Cargo.lock
generated
377
Cargo.lock
generated
@@ -79,6 +79,12 @@ version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
|
||||
|
||||
[[package]]
|
||||
name = "blake3"
|
||||
version = "1.8.5"
|
||||
@@ -93,6 +99,16 @@ dependencies = [
|
||||
"cpufeatures",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bstr"
|
||||
version = "1.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.2.61"
|
||||
@@ -179,6 +195,25 @@ dependencies = [
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-deque"
|
||||
version = "0.8.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
|
||||
dependencies = [
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-epoch"
|
||||
version = "0.9.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.21"
|
||||
@@ -222,12 +257,34 @@ version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
|
||||
|
||||
[[package]]
|
||||
name = "errno"
|
||||
version = "0.3.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "2.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
|
||||
|
||||
[[package]]
|
||||
name = "find-msvc-tools"
|
||||
version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
|
||||
|
||||
[[package]]
|
||||
name = "foldhash"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.2.17"
|
||||
@@ -239,6 +296,41 @@ dependencies = [
|
||||
"wasi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"r-efi",
|
||||
"wasip2",
|
||||
"wasip3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "globset"
|
||||
version = "0.4.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "52dfc19153a48bde0cbd630453615c8151bce3a5adfac7a0aebfbf0a1e1f57e3"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"bstr",
|
||||
"log",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.15.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
|
||||
dependencies = [
|
||||
"foldhash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.17.0"
|
||||
@@ -251,6 +343,28 @@ version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
||||
|
||||
[[package]]
|
||||
name = "id-arena"
|
||||
version = "2.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
|
||||
|
||||
[[package]]
|
||||
name = "ignore"
|
||||
version = "0.4.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3d782a365a015e0f5c04902246139249abf769125006fbe7649e2ee88169b4a"
|
||||
dependencies = [
|
||||
"crossbeam-deque",
|
||||
"globset",
|
||||
"log",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"same-file",
|
||||
"walkdir",
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "2.14.0"
|
||||
@@ -258,7 +372,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"hashbrown",
|
||||
"hashbrown 0.17.0",
|
||||
"serde",
|
||||
"serde_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -335,12 +451,35 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kb-source-fs"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"blake3",
|
||||
"ignore",
|
||||
"kb-config",
|
||||
"kb-core",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"time",
|
||||
"tracing",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
||||
|
||||
[[package]]
|
||||
name = "leb128fmt"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.186"
|
||||
@@ -356,6 +495,12 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
version = "0.4.29"
|
||||
@@ -422,6 +567,16 @@ version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
|
||||
|
||||
[[package]]
|
||||
name = "prettyplease"
|
||||
version = "0.2.37"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.106"
|
||||
@@ -440,13 +595,19 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "r-efi"
|
||||
version = "6.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
|
||||
|
||||
[[package]]
|
||||
name = "redox_users"
|
||||
version = "0.4.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43"
|
||||
dependencies = [
|
||||
"getrandom",
|
||||
"getrandom 0.2.17",
|
||||
"libredox",
|
||||
"thiserror 1.0.69",
|
||||
]
|
||||
@@ -468,12 +629,40 @@ version = "0.8.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"errno",
|
||||
"libc",
|
||||
"linux-raw-sys",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ryu-js"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dd29631678d6fb0903b69223673e122c32e9ae559d0960a38d574695ebc0ea15"
|
||||
|
||||
[[package]]
|
||||
name = "same-file"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
|
||||
dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "semver"
|
||||
version = "1.0.28"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.228"
|
||||
@@ -581,6 +770,19 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.27.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
|
||||
dependencies = [
|
||||
"fastrand",
|
||||
"getrandom 0.4.2",
|
||||
"once_cell",
|
||||
"rustix",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.69"
|
||||
@@ -819,6 +1021,12 @@ dependencies = [
|
||||
"tinyvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-xid"
|
||||
version = "0.2.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
|
||||
|
||||
[[package]]
|
||||
name = "utf8parse"
|
||||
version = "0.2.2"
|
||||
@@ -831,12 +1039,83 @@ version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
|
||||
|
||||
[[package]]
|
||||
name = "walkdir"
|
||||
version = "2.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
|
||||
dependencies = [
|
||||
"same-file",
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.11.1+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
|
||||
|
||||
[[package]]
|
||||
name = "wasip2"
|
||||
version = "1.0.1+wasi-0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7"
|
||||
dependencies = [
|
||||
"wit-bindgen 0.46.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasip3"
|
||||
version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
|
||||
dependencies = [
|
||||
"wit-bindgen 0.51.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-encoder"
|
||||
version = "0.244.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319"
|
||||
dependencies = [
|
||||
"leb128fmt",
|
||||
"wasmparser",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-metadata"
|
||||
version = "0.244.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"indexmap",
|
||||
"wasm-encoder",
|
||||
"wasmparser",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasmparser"
|
||||
version = "0.244.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"hashbrown 0.15.5",
|
||||
"indexmap",
|
||||
"semver",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-util"
|
||||
version = "0.1.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
|
||||
dependencies = [
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-link"
|
||||
version = "0.2.1"
|
||||
@@ -927,6 +1206,100 @@ dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wit-bindgen"
|
||||
version = "0.46.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"
|
||||
|
||||
[[package]]
|
||||
name = "wit-bindgen"
|
||||
version = "0.51.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
|
||||
dependencies = [
|
||||
"wit-bindgen-rust-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wit-bindgen-core"
|
||||
version = "0.51.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"heck",
|
||||
"wit-parser",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wit-bindgen-rust"
|
||||
version = "0.51.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"heck",
|
||||
"indexmap",
|
||||
"prettyplease",
|
||||
"syn",
|
||||
"wasm-metadata",
|
||||
"wit-bindgen-core",
|
||||
"wit-component",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wit-bindgen-rust-macro"
|
||||
version = "0.51.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"prettyplease",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"wit-bindgen-core",
|
||||
"wit-bindgen-rust",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wit-component"
|
||||
version = "0.244.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bitflags",
|
||||
"indexmap",
|
||||
"log",
|
||||
"serde",
|
||||
"serde_derive",
|
||||
"serde_json",
|
||||
"wasm-encoder",
|
||||
"wasm-metadata",
|
||||
"wasmparser",
|
||||
"wit-parser",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wit-parser"
|
||||
version = "0.244.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"id-arena",
|
||||
"indexmap",
|
||||
"log",
|
||||
"semver",
|
||||
"serde",
|
||||
"serde_derive",
|
||||
"serde_json",
|
||||
"unicode-xid",
|
||||
"wasmparser",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zmij"
|
||||
version = "1.0.21"
|
||||
|
||||
@@ -4,6 +4,7 @@ members = [
|
||||
"crates/kb-core",
|
||||
"crates/kb-parse-types",
|
||||
"crates/kb-config",
|
||||
"crates/kb-source-fs",
|
||||
"crates/kb-app",
|
||||
"crates/kb-cli",
|
||||
]
|
||||
|
||||
@@ -40,6 +40,18 @@ impl WorkspacePath {
|
||||
}
|
||||
}
|
||||
|
||||
/// On-disk storage decision for a `RawAsset`.
|
||||
///
|
||||
/// **Important convention** — `path` field semantics differ by variant:
|
||||
///
|
||||
/// - `Copied { path }`: at scan time, `path` is the **source** path on the
|
||||
/// user's filesystem. The asset writer (P1-6) is responsible for actually
|
||||
/// copying the bytes into the workspace asset store, AND for overwriting
|
||||
/// `path` with the destination path after the copy completes.
|
||||
///
|
||||
/// - `Reference { path, sha }`: `path` is always the **source** path. No
|
||||
/// bytes are ever copied; downstream readers stream from `path` directly.
|
||||
/// `sha` is the BLAKE3 full hex (matches `RawAsset::checksum`).
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase", tag = "kind")]
|
||||
pub enum AssetStorage {
|
||||
|
||||
23
crates/kb-source-fs/Cargo.toml
Normal file
23
crates/kb-source-fs/Cargo.toml
Normal file
@@ -0,0 +1,23 @@
|
||||
[package]
name = "kb-source-fs"
version.workspace = true
edition.workspace = true
license.workspace = true
repository.workspace = true
rust-version.workspace = true
description = "Local filesystem SourceConnector — walks workspace.root + applies gitignore filters"

# Sibling crates are path dependencies; `ignore` and `walkdir` carry their own
# version requirements here, everything else is inherited from the workspace.
[dependencies]
anyhow = { workspace = true }
blake3 = { workspace = true }
ignore = "0.4"
kb-config = { path = "../kb-config" }
kb-core = { path = "../kb-core" }
serde = { workspace = true }
time = { workspace = true }
tracing = { workspace = true }
walkdir = "2"

[dev-dependencies]
serde_json = { workspace = true }
tempfile = "3"
|
||||
423
crates/kb-source-fs/src/connector.rs
Normal file
423
crates/kb-source-fs/src/connector.rs
Normal file
@@ -0,0 +1,423 @@
|
||||
//! `FsSourceConnector` — public surface for the crate.
|
||||
//!
|
||||
//! ```ignore
|
||||
//! pub struct FsSourceConnector { /* internal */ }
|
||||
//! impl FsSourceConnector {
|
||||
//! pub fn new(config: &kb_config::Config) -> anyhow::Result<Self>;
|
||||
//! }
|
||||
//! impl kb_core::SourceConnector for FsSourceConnector {
|
||||
//! fn scan(&self, scope: &kb_core::SourceScope) -> anyhow::Result<Vec<kb_core::RawAsset>>;
|
||||
//! }
|
||||
//! ```
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use kb_config::Config;
|
||||
use kb_core::{
|
||||
AssetStorage, Checksum, RawAsset, SourceConnector, SourceScope, SourceUri,
|
||||
id_for_asset, to_posix,
|
||||
};
|
||||
|
||||
use crate::hash::hash_file;
|
||||
use crate::media::media_type_for;
|
||||
use crate::walker::{build_overrides, read_kbignore, walk_files};
|
||||
|
||||
/// `SourceConnector` implementation backed by the local filesystem.
///
/// An instance is built once from a `Config` and can then serve any number
/// of `scan` calls; the fields below are the only state kept in between.
pub struct FsSourceConnector {
    /// `config.workspace.root` turned into a `PathBuf`. Consulted only when
    /// the caller leaves `SourceScope::root` empty, i.e. does not override
    /// the root for that call.
    default_root: PathBuf,
    /// Copy of `config.workspace.exclude` taken when the connector was
    /// constructed.
    default_exclude: Vec<String>,
    /// `config.storage.copy_threshold_mb` multiplied by 1 MiB ahead of time
    /// so the conversion is not repeated for every file.
    copy_threshold_bytes: u64,
}
|
||||
|
||||
impl FsSourceConnector {
|
||||
pub fn new(config: &Config) -> Result<Self> {
|
||||
// `config.workspace.root` is a String that may contain `~` or env
|
||||
// expansions. P0-* did not yet provide a path-expansion helper in
|
||||
// kb-config; for P1-1 we expand `~` ourselves and leave `${VAR}`
|
||||
// for a follow-up. The vast majority of users hit the `~` case.
|
||||
let root = expand_tilde(&config.workspace.root);
|
||||
|
||||
let copy_threshold_bytes = config
|
||||
.storage
|
||||
.copy_threshold_mb
|
||||
.saturating_mul(1024 * 1024);
|
||||
|
||||
Ok(Self {
|
||||
default_root: root,
|
||||
default_exclude: config.workspace.exclude.clone(),
|
||||
copy_threshold_bytes,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl SourceConnector for FsSourceConnector {
|
||||
fn scan(&self, scope: &SourceScope) -> Result<Vec<RawAsset>> {
|
||||
// `SourceScope::root` overrides config root when non-empty. This
|
||||
// matches the design's "scope is the per-call lens; config is the
|
||||
// default" split (§7.1).
|
||||
let root = if scope.root.as_os_str().is_empty() {
|
||||
self.default_root.clone()
|
||||
} else {
|
||||
scope.root.clone()
|
||||
};
|
||||
|
||||
// Union: config.workspace.exclude ∪ scope.exclude ∪ .kbignore.
|
||||
// Per §6.2 the union of `.kbignore` and `config.workspace.exclude`
|
||||
// is the filter set. `scope.exclude` is added on top so a caller
|
||||
// can layer a per-call narrowing.
|
||||
let mut excludes = self.default_exclude.clone();
|
||||
excludes.extend(scope.exclude.iter().cloned());
|
||||
// .kbignore is re-read on every scan() so users can edit it without
|
||||
// restarting any long-running process.
|
||||
let kbignore = read_kbignore(&root)?;
|
||||
|
||||
let overrides = build_overrides(&root, &excludes, &kbignore)?;
|
||||
|
||||
// TODO(P1-2/P1-3 router): apply SourceScope::include glob filter at the
|
||||
// extractor router layer once that crate lands. SourceConnector emits all
|
||||
// non-excluded files; routing by include-glob is a downstream concern
|
||||
// (design §6.2 + §7.2 are silent on this split, treat it as router work).
|
||||
//
|
||||
// `scope.include` is intentionally ignored at this stage of the
|
||||
// pipeline: per §6.2 the workspace-level include lives in
|
||||
// `WorkspaceCfg` and is enforced by the asset writer / extractors.
|
||||
// Surfacing it here would double-filter Markdown vs PDF before the
|
||||
// extractor router gets to see them.
|
||||
if !scope.include.is_empty() {
|
||||
tracing::debug!(
|
||||
count = scope.include.len(),
|
||||
"FsSourceConnector ignores scope.include — handled by extractor router"
|
||||
);
|
||||
}
|
||||
|
||||
let files = walk_files(&root, &overrides)?;
|
||||
|
||||
let mut assets = Vec::with_capacity(files.len());
|
||||
for abs in &files {
|
||||
// `to_posix` does NFC + leading `./` strip + `#` rejection.
|
||||
// Compute the workspace-relative path before handing to it so
|
||||
// emitted `WorkspacePath` is always relative.
|
||||
let rel = abs.strip_prefix(&root).unwrap_or(abs);
|
||||
let workspace_path = match to_posix(rel) {
|
||||
Ok(p) => p,
|
||||
Err(e) => {
|
||||
// A path containing `#` is the only documented reason
|
||||
// `to_posix` fails today. Drop the file with a warning
|
||||
// rather than aborting the entire scan — a single bad
|
||||
// filename should not nuke a 10 000-file ingest.
|
||||
tracing::warn!(
|
||||
path = %abs.display(),
|
||||
error = %e,
|
||||
"skipping file: path is not a valid WorkspacePath",
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let media_type = media_type_for(abs);
|
||||
let (byte_len, full_hex) = hash_file(abs)
|
||||
.with_context(|| format!("hashing {}", abs.display()))?;
|
||||
let checksum = Checksum(full_hex.clone());
|
||||
let asset_id = id_for_asset(&full_hex);
|
||||
|
||||
// Storage variant signals *intent*, not an actual copy.
|
||||
// P1-6 (asset writer) is responsible for the on-disk copy.
|
||||
let stored = if byte_len > self.copy_threshold_bytes {
|
||||
AssetStorage::Reference {
|
||||
path: abs.clone(),
|
||||
sha: checksum.clone(),
|
||||
}
|
||||
} else {
|
||||
AssetStorage::Copied { path: abs.clone() }
|
||||
};
|
||||
|
||||
assets.push(RawAsset {
|
||||
asset_id,
|
||||
source_uri: SourceUri::File(abs.clone()),
|
||||
workspace_path,
|
||||
media_type,
|
||||
byte_len,
|
||||
checksum,
|
||||
discovered_at: OffsetDateTime::now_utc(),
|
||||
stored,
|
||||
});
|
||||
}
|
||||
|
||||
// Determinism: sort by workspace_path. WorkspacePath is a String
|
||||
// newtype with stable lexicographic ordering. Two scans of the
|
||||
// same tree must produce identical Vec<RawAsset> modulo the
|
||||
// wall-clock `discovered_at` field.
|
||||
assets.sort_by(|a, b| a.workspace_path.0.cmp(&b.workspace_path.0));
|
||||
|
||||
Ok(assets)
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(kb-config): hoist tilde + ${VAR} expansion into a kb-config helper
|
||||
// once that crate gains a path-expansion API. Today this duplicates logic
|
||||
// that P1-6 (store-sqlite) and future crates will also need.
|
||||
/// Expand a leading `~` to the current user's home directory.
///
/// - `"~"` becomes the home directory itself.
/// - `"~/rest"` becomes `<home>/rest`. Any extra `/` directly after the `~/`
///   prefix is dropped first: `PathBuf::join` replaces the base when it is
///   handed an absolute path, so without this `"~//etc"` would resolve to
///   `/etc` rather than to a path under the home directory.
/// - Every other shape (absolute, relative, `~user`, `${VAR}`-style) is
///   returned unchanged, and so is any input when no home directory can be
///   determined (see `dirs_home`).
fn expand_tilde(s: &str) -> PathBuf {
    // `None` means "the bare home directory"; `Some(rest)` is the part to
    // append below it. Inputs without a leading `~` / `~/` return early so
    // the environment is not consulted for them.
    let rest = if s == "~" {
        None
    } else {
        match s.strip_prefix("~/") {
            Some(rest) => Some(rest.trim_start_matches('/')),
            None => return PathBuf::from(s),
        }
    };

    match (dirs_home(), rest) {
        (Some(home), Some(rest)) => home.join(rest),
        (Some(home), None) => home,
        // No usable home directory: leave the input as written.
        (None, _) => PathBuf::from(s),
    }
}

/// Tiny `dirs::home_dir`-compat shim that does NOT add the `dirs` crate to
/// our dep set (we explicitly enumerate allowed deps in the task spec).
///
/// Reads `$HOME`. An empty value is treated the same as an unset one, because
/// joining onto an empty base would quietly turn `~/x` into the cwd-relative
/// path `x`. On Windows, where `HOME` is normally absent, `%USERPROFILE%` is
/// used as a fallback. Returns `None` when no non-empty candidate exists.
fn dirs_home() -> Option<PathBuf> {
    let non_empty_var = |key: &str| std::env::var_os(key).filter(|value| !value.is_empty());

    let home = non_empty_var("HOME");
    let home = if cfg!(windows) {
        home.or_else(|| non_empty_var("USERPROFILE"))
    } else {
        home
    };
    home.map(PathBuf::from)
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use kb_config::Config;
|
||||
|
||||
fn cfg_with_root(root: &str) -> Config {
|
||||
let mut c = Config::defaults();
|
||||
c.workspace.root = root.to_string();
|
||||
c.workspace.exclude.clear();
|
||||
c
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scan_empty_dir_yields_empty_vec() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let conn = FsSourceConnector::new(&cfg_with_root(
|
||||
dir.path().to_str().unwrap(),
|
||||
))
|
||||
.unwrap();
|
||||
let scope = SourceScope::default();
|
||||
let v = conn.scan(&scope).unwrap();
|
||||
assert!(v.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scan_emits_sorted_workspace_paths() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let root = dir.path();
|
||||
std::fs::create_dir_all(root.join("notes")).unwrap();
|
||||
std::fs::write(root.join("README.md"), b"hi").unwrap();
|
||||
std::fs::write(root.join("notes/beta.md"), b"b").unwrap();
|
||||
std::fs::write(root.join("notes/alpha.md"), b"a").unwrap();
|
||||
|
||||
let conn =
|
||||
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
|
||||
.unwrap();
|
||||
let v = conn.scan(&SourceScope::default()).unwrap();
|
||||
let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect();
|
||||
assert_eq!(
|
||||
names,
|
||||
vec![
|
||||
"README.md".to_string(),
|
||||
"notes/alpha.md".to_string(),
|
||||
"notes/beta.md".to_string(),
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scan_filters_by_kbignore() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let root = dir.path();
|
||||
std::fs::write(root.join(".kbignore"), "*.tmp\n").unwrap();
|
||||
std::fs::write(root.join("a.md"), b"x").unwrap();
|
||||
std::fs::write(root.join("b.tmp"), b"x").unwrap();
|
||||
|
||||
let conn =
|
||||
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
|
||||
.unwrap();
|
||||
let v = conn.scan(&SourceScope::default()).unwrap();
|
||||
let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect();
|
||||
// Decision: `.kbignore` itself IS emitted as a RawAsset (MediaType::Other("")).
|
||||
// Rationale: a config file that affects ingest is itself part of the
|
||||
// workspace contents; the markdown extractor (P1-2) will reject Other("")
|
||||
// on its own. If we ever decide to omit `.kbignore` from the asset list,
|
||||
// this test will catch it.
|
||||
assert!(
|
||||
names.contains(&".kbignore".to_string()),
|
||||
".kbignore must be emitted as an asset; got: {names:?}"
|
||||
);
|
||||
assert!(names.contains(&"a.md".to_string()));
|
||||
assert!(!names.contains(&"b.tmp".to_string()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scan_filters_default_excludes() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let root = dir.path();
|
||||
std::fs::write(root.join("a.md"), b"x").unwrap();
|
||||
std::fs::write(root.join(".DS_Store"), b"\0\0").unwrap();
|
||||
std::fs::write(root.join("._sidecar"), b"\0\0").unwrap();
|
||||
|
||||
let conn =
|
||||
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
|
||||
.unwrap();
|
||||
let v = conn.scan(&SourceScope::default()).unwrap();
|
||||
let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect();
|
||||
assert_eq!(names, vec!["a.md".to_string()]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scan_unions_config_exclude_and_kbignore() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let root = dir.path();
|
||||
std::fs::write(root.join(".kbignore"), "*.tmp\n").unwrap();
|
||||
std::fs::write(root.join("a.md"), b"x").unwrap();
|
||||
std::fs::write(root.join("b.tmp"), b"x").unwrap();
|
||||
std::fs::write(root.join("c.log"), b"x").unwrap();
|
||||
|
||||
let mut cfg = cfg_with_root(root.to_str().unwrap());
|
||||
cfg.workspace.exclude.push("*.log".to_string());
|
||||
|
||||
let conn = FsSourceConnector::new(&cfg).unwrap();
|
||||
let v = conn.scan(&SourceScope::default()).unwrap();
|
||||
let names: Vec<_> = v.iter().map(|a| a.workspace_path.0.clone()).collect();
|
||||
assert!(names.contains(&"a.md".to_string()));
|
||||
assert!(!names.contains(&"b.tmp".to_string()), "kbignore should drop *.tmp");
|
||||
assert!(!names.contains(&"c.log".to_string()), "config.exclude should drop *.log");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scan_blake3_pinned_for_known_file() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let root = dir.path();
|
||||
std::fs::write(root.join("hello.md"), b"hello world").unwrap();
|
||||
|
||||
let conn =
|
||||
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
|
||||
.unwrap();
|
||||
let v = conn.scan(&SourceScope::default()).unwrap();
|
||||
assert_eq!(v.len(), 1);
|
||||
let asset = &v[0];
|
||||
assert_eq!(
|
||||
asset.checksum.0,
|
||||
"d74981efa70a0c880b8d8c1985d075dbcbf679b99a5f9914e5aaf96b831a9e24"
|
||||
);
|
||||
assert_eq!(asset.byte_len, 11);
|
||||
// asset_id is derived from the full hex via id_for_asset.
|
||||
assert_eq!(asset.asset_id, id_for_asset(&asset.checksum.0));
|
||||
}
|
||||
|
||||
#[test]
fn scan_idempotent_modulo_timestamp() {
    // Two scans of an unchanged tree must agree on every field except the
    // wall-clock `discovered_at`, so compare field by field and leave that
    // one out.
    let tmp = tempfile::tempdir().unwrap();
    let workspace = tmp.path();
    std::fs::create_dir_all(workspace.join("notes")).unwrap();
    std::fs::write(workspace.join("notes/a.md"), b"alpha").unwrap();
    std::fs::write(workspace.join("notes/b.md"), b"beta").unwrap();

    let config = cfg_with_root(workspace.to_str().unwrap());
    let connector = FsSourceConnector::new(&config).unwrap();
    let first = connector.scan(&SourceScope::default()).unwrap();
    let second = connector.scan(&SourceScope::default()).unwrap();

    assert_eq!(first.len(), second.len());
    for (lhs, rhs) in first.iter().zip(second.iter()) {
        assert_eq!(lhs.asset_id, rhs.asset_id);
        assert_eq!(lhs.workspace_path, rhs.workspace_path);
        assert_eq!(lhs.checksum, rhs.checksum);
        assert_eq!(lhs.byte_len, rhs.byte_len);
        assert_eq!(lhs.media_type, rhs.media_type);
        assert_eq!(lhs.source_uri, rhs.source_uri);
        assert_eq!(lhs.stored, rhs.stored);
        // `discovered_at` is deliberately left out of the comparison.
    }
}
|
||||
|
||||
#[test]
fn scan_emits_posix_normalized_paths() {
    // End-to-end check of the path post-conditions. A real filesystem
    // cannot hold literal `./` or `//` segments, so rather than feeding
    // dirty input we assert that what comes out is already POSIX-clean:
    // no leading `./`, no `//`, forward slashes only. The normalization
    // rules themselves are covered by the `kb-core::normalize` unit tests.
    let tmp = tempfile::tempdir().unwrap();
    let workspace = tmp.path();
    std::fs::create_dir_all(workspace.join("a/b/c")).unwrap();
    std::fs::write(workspace.join("a/b/c/d.md"), b"x").unwrap();

    let config = cfg_with_root(workspace.to_str().unwrap());
    let connector = FsSourceConnector::new(&config).unwrap();
    let assets = connector.scan(&SourceScope::default()).unwrap();
    assert_eq!(assets.len(), 1);

    let rel = &assets[0].workspace_path.0;
    assert_eq!(rel, "a/b/c/d.md");
    assert!(!rel.starts_with("./"));
    assert!(!rel.contains("//"));
    assert!(!rel.contains('\\'));
}
|
||||
|
||||
#[test]
fn scan_skips_files_whose_name_contains_hash() {
    // `WorkspacePath` rejects `#` (it collides with the W3C Media
    // Fragments separator used by `Citation`). Such a file must be left
    // out of the result while the rest of the scan still succeeds.
    let tmp = tempfile::tempdir().unwrap();
    let workspace = tmp.path();
    std::fs::write(workspace.join("ok.md"), b"x").unwrap();
    std::fs::write(workspace.join("has#hash.md"), b"y").unwrap();

    let config = cfg_with_root(workspace.to_str().unwrap());
    let connector = FsSourceConnector::new(&config).unwrap();
    let assets = connector.scan(&SourceScope::default()).unwrap();

    let mut names = Vec::with_capacity(assets.len());
    for asset in &assets {
        names.push(asset.workspace_path.0.clone());
    }
    assert_eq!(names, vec!["ok.md".to_string()]);
}
|
||||
|
||||
#[test]
fn copy_vs_reference_threshold_signals_intent() {
    // The same tiny file is scanned under two thresholds: 0 MiB must yield
    // `AssetStorage::Reference`, 100 MiB must yield `AssetStorage::Copied`.
    let tmp = tempfile::tempdir().unwrap();
    let workspace = tmp.path();
    std::fs::write(workspace.join("small.md"), b"hi").unwrap();
    let root_str = workspace.to_str().unwrap();

    // Threshold = 0 MiB: even a 2-byte file becomes a Reference.
    let mut by_reference = cfg_with_root(root_str);
    by_reference.storage.copy_threshold_mb = 0;
    let referenced = FsSourceConnector::new(&by_reference)
        .unwrap()
        .scan(&SourceScope::default())
        .unwrap();
    assert_eq!(referenced.len(), 1);
    let only = &referenced[0];
    match &only.stored {
        // The reference must carry the same digest as the asset itself.
        AssetStorage::Reference { sha, .. } => assert_eq!(sha, &only.checksum),
        other => panic!("expected Reference, got {other:?}"),
    }

    // Threshold = 100 MiB: the file is small enough to be Copied.
    let mut by_copy = cfg_with_root(root_str);
    by_copy.storage.copy_threshold_mb = 100;
    let copied = FsSourceConnector::new(&by_copy)
        .unwrap()
        .scan(&SourceScope::default())
        .unwrap();
    assert!(matches!(copied[0].stored, AssetStorage::Copied { .. }));
}
|
||||
}
|
||||
92
crates/kb-source-fs/src/hash.rs
Normal file
92
crates/kb-source-fs/src/hash.rs
Normal file
@@ -0,0 +1,92 @@
|
||||
//! Streaming BLAKE3 over a file path. Per task spec, files MUST NOT be
//! loaded fully into memory: `hash_reader` feeds `blake3::Hasher::update`
//! from a reusable 64 KiB buffer (`READ_BUFFER_BYTES`), which keeps memory
//! bounded for any size of file.
|
||||
//!
|
||||
//! Returns `(byte_len, full_hex)`:
|
||||
//! - `byte_len` is the total bytes hashed (== file size after follow).
|
||||
//! - `full_hex` is the canonical lowercase hex (64 chars) of the full
|
||||
//! blake3 digest. The `kb-core::Checksum` invariant is "full hex"; the
|
||||
//! 32-char prefix is reserved for `AssetId` derivation via
|
||||
//! `kb_core::id_for_asset`.
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::{self, Read};
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
|
||||
/// Size in bytes (64 KiB) of the scratch buffer `hash_reader` reuses for
/// every `read` call; this bounds memory use regardless of file size.
const READ_BUFFER_BYTES: usize = 64 * 1024;
|
||||
|
||||
/// Stream-hash a file with blake3. Returns `(byte_len, full_hex_64)`.
|
||||
///
|
||||
/// `byte_len` is computed during streaming so callers do not need a separate
|
||||
/// `metadata().len()` call (which can disagree with hashed bytes if the file
|
||||
/// is rewritten mid-scan, but blake3-of-stream is the source of truth for
|
||||
/// `RawAsset.checksum`).
|
||||
pub(crate) fn hash_file(path: &Path) -> Result<(u64, String)> {
|
||||
let file = File::open(path)
|
||||
.with_context(|| format!("failed to open {} for hashing", path.display()))?;
|
||||
hash_reader(file).with_context(|| format!("failed to hash {}", path.display()))
|
||||
}
|
||||
|
||||
fn hash_reader<R: Read>(mut reader: R) -> Result<(u64, String)> {
|
||||
let mut hasher = blake3::Hasher::new();
|
||||
let mut buf = vec![0u8; READ_BUFFER_BYTES];
|
||||
let mut total: u64 = 0;
|
||||
loop {
|
||||
match reader.read(&mut buf) {
|
||||
Ok(0) => break,
|
||||
Ok(n) => {
|
||||
hasher.update(&buf[..n]);
|
||||
total = total.saturating_add(n as u64);
|
||||
}
|
||||
Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
|
||||
Err(e) => return Err(e.into()),
|
||||
}
|
||||
}
|
||||
Ok((total, hasher.finalize().to_hex().to_string()))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// The digest of the empty input is pinned so that replacing the hash
    /// crate or the streaming loop cannot silently change the output for a
    /// known input.
    #[test]
    fn empty_blake3_pinned() {
        let (byte_len, digest) = hash_reader(std::io::empty()).unwrap();
        assert_eq!(byte_len, 0);
        assert_eq!(
            digest,
            "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262"
        );
    }

    /// Full 64-character BLAKE3 hex of `b"hello world"`, pinned to detect
    /// drift anywhere in the streaming pipeline.
    #[test]
    fn known_bytes_blake3_pinned() {
        let input = b"hello world";
        let (byte_len, digest) = hash_reader(&input[..]).unwrap();
        assert_eq!(byte_len, 11);
        assert_eq!(
            digest,
            "d74981efa70a0c880b8d8c1985d075dbcbf679b99a5f9914e5aaf96b831a9e24"
        );
    }

    /// Input longer than `READ_BUFFER_BYTES` must hash to the same value as
    /// a one-shot `blake3::hash` of the same bytes: chunk boundaries must
    /// not be observable in the digest.
    #[test]
    fn streaming_matches_oneshot_over_buffer_boundary() {
        let len = READ_BUFFER_BYTES * 3 + 17;
        let input: Vec<u8> = (0u8..=255u8).cycle().take(len).collect();

        let (byte_len, streamed) = hash_reader(&input[..]).unwrap();
        assert_eq!(byte_len, input.len() as u64);

        let oneshot = blake3::hash(&input).to_hex().to_string();
        assert_eq!(streamed, oneshot);
    }
}
|
||||
16
crates/kb-source-fs/src/lib.rs
Normal file
16
crates/kb-source-fs/src/lib.rs
Normal file
@@ -0,0 +1,16 @@
|
||||
//! `kb-source-fs` — local filesystem `SourceConnector`.
|
||||
//!
|
||||
//! Walks `config.workspace.root`, applies gitignore-style filters from
|
||||
//! `config.workspace.exclude` ∪ `.kbignore`, computes BLAKE3 of every file,
|
||||
//! and emits `Vec<RawAsset>` sorted by `workspace_path` for determinism.
|
||||
//!
|
||||
//! Per design §3.3 (RawAsset), §6.2 (workspace + .kbignore), §6.6 (POSIX
|
||||
//! normalization), §7.1 (SourceScope), §7.2 (SourceConnector), §8 (module
|
||||
//! boundaries).
|
||||
|
||||
mod connector;
|
||||
mod hash;
|
||||
mod media;
|
||||
mod walker;
|
||||
|
||||
pub use connector::FsSourceConnector;
|
||||
85
crates/kb-source-fs/src/media.rs
Normal file
85
crates/kb-source-fs/src/media.rs
Normal file
@@ -0,0 +1,85 @@
|
||||
//! Media-type detection by extension. Per P1-1 task spec we do NOT do
|
||||
//! libmagic-style sniffing; extension is enough for P1. Unknown / missing
|
||||
//! extensions fall through to `MediaType::Other(ext.to_string())` (empty
|
||||
//! string when the file has no extension at all).
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use kb_core::{AudioType, ImageType, MediaType};
|
||||
|
||||
/// Return `MediaType` for `path` based purely on its lowercased extension.
|
||||
/// `.md` → Markdown, `.pdf` → Pdf, image and audio extensions map onto
|
||||
/// `MediaType::Image(_)` / `MediaType::Audio(_)`. Anything else (including
|
||||
/// missing extension) → `MediaType::Other(ext)`.
|
||||
pub(crate) fn media_type_for(path: &Path) -> MediaType {
|
||||
let ext = path
|
||||
.extension()
|
||||
.and_then(|s| s.to_str())
|
||||
.map(|s| s.to_ascii_lowercase())
|
||||
.unwrap_or_default();
|
||||
|
||||
match ext.as_str() {
|
||||
"md" => MediaType::Markdown,
|
||||
"pdf" => MediaType::Pdf,
|
||||
|
||||
"png" => MediaType::Image(ImageType::Png),
|
||||
"jpg" | "jpeg" => MediaType::Image(ImageType::Jpeg),
|
||||
"webp" => MediaType::Image(ImageType::Webp),
|
||||
"gif" => MediaType::Image(ImageType::Gif),
|
||||
"tiff" | "tif" => MediaType::Image(ImageType::Tiff),
|
||||
|
||||
"m4a" => MediaType::Audio(AudioType::M4a),
|
||||
"mp3" => MediaType::Audio(AudioType::Mp3),
|
||||
"wav" => MediaType::Audio(AudioType::Wav),
|
||||
"flac" => MediaType::Audio(AudioType::Flac),
|
||||
"ogg" => MediaType::Audio(AudioType::Ogg),
|
||||
|
||||
// Empty string (no extension) and any other extension: bucket as
|
||||
// Other and let downstream extractors decide if they support it.
|
||||
_ => MediaType::Other(ext),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Document extensions, including an upper-case spelling.
    #[test]
    fn markdown_and_pdf() {
        let cases = [
            ("a/b.md", MediaType::Markdown),
            ("a/b.MD", MediaType::Markdown),
            ("a/b.pdf", MediaType::Pdf),
        ];
        for (name, expected) in cases {
            assert_eq!(media_type_for(Path::new(name)), expected);
        }
    }

    /// Image and audio extensions, including alias and upper-case forms.
    #[test]
    fn images_and_audio() {
        let cases = [
            ("p.jpg", MediaType::Image(ImageType::Jpeg)),
            ("p.JPEG", MediaType::Image(ImageType::Jpeg)),
            ("a.M4A", MediaType::Audio(AudioType::M4a)),
            ("a.flac", MediaType::Audio(AudioType::Flac)),
        ];
        for (name, expected) in cases {
            assert_eq!(media_type_for(Path::new(name)), expected);
        }
    }

    /// Unknown extensions are passed through; a missing extension yields
    /// an empty `Other`.
    #[test]
    fn unknown_and_missing_extension() {
        let unknown = media_type_for(Path::new("notes/x.weird"));
        assert_eq!(unknown, MediaType::Other("weird".to_string()));

        let missing = media_type_for(Path::new("README"));
        assert_eq!(missing, MediaType::Other(String::new()));
    }
}
|
||||
260
crates/kb-source-fs/src/walker.rs
Normal file
260
crates/kb-source-fs/src/walker.rs
Normal file
@@ -0,0 +1,260 @@
|
||||
//! Directory walker with gitignore-style filtering and symlink-cycle
|
||||
//! protection.
|
||||
//!
|
||||
//! Filter set (per task spec, design §6.2):
|
||||
//! - `config.workspace.exclude` (passed in by `FsSourceConnector`)
|
||||
//! - `<root>/.kbignore` (optional file at workspace root)
|
||||
//! - default-excludes for `.DS_Store` and macOS resource forks (`._*`)
|
||||
//!
|
||||
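//! All three are merged via `ignore::overrides::OverrideBuilder`, which
//! gives gitignore-style glob semantics (anchors, `**`, etc.). We prepend
//! `!` to each pattern because `OverrideBuilder` treats positive patterns
//! as "include" and negative ones as "exclude" — see the doc comment on
//! `build_overrides` for the reasoning. NOTE(review): because every
//! pattern is prefixed this way, a user-written `!negation` line is passed
//! on as `!!…`; confirm how that is interpreted before relying on
//! re-include rules in `.kbignore` or `workspace.exclude`.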
|
||||
//!
|
||||
//! Symlink handling: we want to follow links (so a workspace using a
//! symlinked `notes/` directory works), but we must NOT loop forever on
//! `a -> b -> a`. With `follow_links(true)`, `walkdir` reports a loop only
//! when a link resolves to an ancestor of the current entry; we layer our
//! own visited-set on top, keyed by the canonical path of every directory
//! entry, and skip any directory we've already seen.
|
||||
//!
|
||||
//! ## Why `walkdir` instead of `ignore::WalkBuilder`?
|
||||
//!
|
||||
//! `ignore::WalkBuilder` bundles gitignore semantics + cycle detection in
|
||||
//! one API. We use `walkdir` directly because we need explicit control
|
||||
//! over canonical-path comparison for sibling-subtree symlinks (a case
|
||||
//! `walkdir`'s ancestor-only check can miss). Override-based filtering
|
||||
//! still uses the `ignore` crate's `Override` matcher, just decoupled from
|
||||
//! its walker.
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use ignore::overrides::{Override, OverrideBuilder};
|
||||
use walkdir::{DirEntry, WalkDir};
|
||||
|
||||
/// Default-excludes baked into the connector. These are NOT configurable;
/// they cover noise that is never useful to ingest and would otherwise need
/// to appear in every user's `.kbignore`.
///
/// `build_overrides` prefixes each entry with `!` before handing it to the
/// `OverrideBuilder`, so entries here are written as plain globs.
// NOTE(review): each name is listed both bare and with a `**/` prefix. If
// the matcher already applies slash-free globs at any depth, the `**/`
// variants are redundant — confirm before removing either form.
const DEFAULT_EXCLUDES: &[&str] = &[
    // Finder metadata
    ".DS_Store",
    "**/.DS_Store",
    // macOS resource forks (AppleDouble files)
    "._*",
    "**/._*",
];
|
||||
|
||||
/// Build the merged `Override` from `config.workspace.exclude` ∪ `.kbignore`
|
||||
/// ∪ baked-in default excludes.
|
||||
///
|
||||
/// Each input pattern is registered as an *exclude* (gitignore-style: a
|
||||
/// leading `!` flips a positive match to a negative one in the
|
||||
/// `OverrideBuilder` API). Order doesn't matter — the union is computed by
|
||||
/// the underlying gitignore engine.
|
||||
pub(crate) fn build_overrides(
|
||||
root: &Path,
|
||||
config_exclude: &[String],
|
||||
kbignore_patterns: &[String],
|
||||
) -> Result<Override> {
|
||||
let mut builder = OverrideBuilder::new(root);
|
||||
|
||||
for pat in DEFAULT_EXCLUDES {
|
||||
builder
|
||||
.add(&format!("!{pat}"))
|
||||
.with_context(|| format!("invalid default-exclude pattern: {pat}"))?;
|
||||
}
|
||||
for pat in config_exclude {
|
||||
builder
|
||||
.add(&format!("!{pat}"))
|
||||
.with_context(|| format!("invalid workspace.exclude pattern: {pat}"))?;
|
||||
}
|
||||
for pat in kbignore_patterns {
|
||||
builder
|
||||
.add(&format!("!{pat}"))
|
||||
.with_context(|| format!("invalid .kbignore pattern: {pat}"))?;
|
||||
}
|
||||
|
||||
builder.build().context("failed to compile override set")
|
||||
}
|
||||
|
||||
/// Read `<root>/.kbignore` if it exists. Each non-blank, non-comment line is
|
||||
/// a gitignore pattern. Missing file → empty Vec (not an error).
|
||||
pub(crate) fn read_kbignore(root: &Path) -> Result<Vec<String>> {
|
||||
let path = root.join(".kbignore");
|
||||
if !path.exists() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
let text = std::fs::read_to_string(&path)
|
||||
.with_context(|| format!("failed to read {}", path.display()))?;
|
||||
Ok(text
|
||||
.lines()
|
||||
.map(|l| l.trim())
|
||||
.filter(|l| !l.is_empty() && !l.starts_with('#'))
|
||||
.map(|l| l.to_string())
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Iterate every regular file under `root`, applying `overrides` and
|
||||
/// detecting symlink cycles. Returns absolute file paths.
|
||||
///
|
||||
/// Strategy:
|
||||
/// - `walkdir::WalkDir::follow_links(true)` to traverse symlinks.
|
||||
/// - Maintain `visited: HashSet<PathBuf>` of *canonical* paths. Before
|
||||
/// descending into a directory entry, canonicalize and check the set;
|
||||
/// if already present, skip. This breaks `a -> b -> a` cycles in O(n)
|
||||
/// per entry without a custom recursive walker.
|
||||
/// - For each yielded entry, ask `overrides` whether it is excluded; if
|
||||
/// so, drop it. If the entry is a directory, also short-circuit
|
||||
/// `WalkDir`'s descent via `it.skip_current_dir()`.
|
||||
pub(crate) fn walk_files(root: &Path, overrides: &Override) -> Result<Vec<PathBuf>> {
|
||||
let mut out = Vec::new();
|
||||
let mut visited: HashSet<PathBuf> = HashSet::new();
|
||||
|
||||
let walker = WalkDir::new(root).follow_links(true).into_iter();
|
||||
let mut it = walker.filter_entry(|e| !is_excluded(e, root, overrides));
|
||||
|
||||
while let Some(res) = it.next() {
|
||||
let entry = match res {
|
||||
Ok(e) => e,
|
||||
Err(err) => {
|
||||
// `walkdir` surfaces I/O errors AND its own cycle detector
|
||||
// (when follow_links is on it sometimes catches them).
|
||||
// Either way: log and skip; do not abort the whole scan.
|
||||
tracing::warn!(error = %err, "walkdir entry error; skipping");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let path = entry.path();
|
||||
|
||||
// Cycle guard: only canonicalize symlinks (cheap on the common case
|
||||
// of plain files/dirs) and on directories that are followed via a
|
||||
// symlink. `walkdir`'s `path_is_symlink()` is true when the entry's
|
||||
// *original* path is a symlink (it returns true for the link, not
|
||||
// for the resolved target). For non-symlinked directories we still
|
||||
// record the canonical path so a *later* symlink that points back
|
||||
// to one of them is detected.
|
||||
if entry.file_type().is_dir() {
|
||||
match std::fs::canonicalize(path) {
|
||||
Ok(canon) => {
|
||||
if !visited.insert(canon) {
|
||||
// Already visited via another path → break cycle.
|
||||
it.skip_current_dir();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
tracing::debug!(
|
||||
path = %path.display(),
|
||||
error = %err,
|
||||
"skipping: canonicalize failed (broken/permission-denied symlink target)"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if entry.file_type().is_file() {
|
||||
out.push(path.to_path_buf());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
/// Report whether `overrides` marks `entry` as ignored.
///
/// The entry's path is made relative to `root` before matching; if it does
/// not start with `root`, the path is matched as-is.
fn is_excluded(entry: &DirEntry, root: &Path, overrides: &Override) -> bool {
    let full = entry.path();
    // The matcher is consulted with a root-relative path; fall back to the
    // full path when the prefix cannot be stripped.
    let rel = full.strip_prefix(root).unwrap_or(full);
    let is_dir = entry.file_type().is_dir();
    overrides.matched(rel, is_dir).is_ignore()
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// True when `ov` ignores the (non-directory) relative path `rel`.
    fn ignores(ov: &Override, rel: &str) -> bool {
        ov.matched(Path::new(rel), false).is_ignore()
    }

    #[test]
    fn empty_inputs_compile_into_an_override() {
        let tmp = tempfile::tempdir().unwrap();
        let ov = build_overrides(tmp.path(), &[], &[]).unwrap();
        // Only the default excludes are active; an ordinary file must pass.
        assert!(!ignores(&ov, "notes/alpha.md"));
    }

    #[test]
    fn default_excludes_ds_store_and_resource_forks() {
        let tmp = tempfile::tempdir().unwrap();
        let ov = build_overrides(tmp.path(), &[], &[]).unwrap();
        assert!(ignores(&ov, ".DS_Store"));
        assert!(ignores(&ov, "notes/.DS_Store"));
        assert!(ignores(&ov, "._foo.md"));
        assert!(ignores(&ov, "notes/._sidecar"));
    }

    #[test]
    fn config_exclude_filters_tmp_and_node_modules() {
        let tmp = tempfile::tempdir().unwrap();
        let config_exclude = ["*.tmp".to_string(), "node_modules/**".to_string()];
        let ov = build_overrides(tmp.path(), &config_exclude, &[]).unwrap();
        assert!(ignores(&ov, "a.tmp"));
        assert!(ignores(&ov, "notes/x.tmp"));
        assert!(ignores(&ov, "node_modules/foo/bar.js"));
        assert!(!ignores(&ov, "alpha.md"));
    }

    #[test]
    fn kbignore_union_with_config_exclude() {
        // A path excluded by either source must be excluded by the union.
        let tmp = tempfile::tempdir().unwrap();
        let config_exclude = ["*.tmp".to_string()];
        let kbignore = ["secret/**".to_string()];
        let ov = build_overrides(tmp.path(), &config_exclude, &kbignore).unwrap();
        assert!(ignores(&ov, "a.tmp"));
        assert!(ignores(&ov, "secret/key.md"));
    }

    #[test]
    fn read_kbignore_missing_returns_empty() {
        let tmp = tempfile::tempdir().unwrap();
        let patterns = read_kbignore(tmp.path()).unwrap();
        assert!(patterns.is_empty());
    }

    #[test]
    fn read_kbignore_strips_blanks_and_comments() {
        let tmp = tempfile::tempdir().unwrap();
        let contents = "# comment\n*.tmp\n\nignored/**\n";
        std::fs::write(tmp.path().join(".kbignore"), contents).unwrap();

        let patterns = read_kbignore(tmp.path()).unwrap();
        assert_eq!(patterns, vec!["*.tmp".to_string(), "ignored/**".to_string()]);
    }
}
|
||||
139
crates/kb-source-fs/tests/snapshot_tree1.rs
Normal file
139
crates/kb-source-fs/tests/snapshot_tree1.rs
Normal file
@@ -0,0 +1,139 @@
|
||||
//! Snapshot + determinism tests against `fixtures/source-fs/tree-1`.
|
||||
//!
|
||||
//! Layout (committed under `<repo>/fixtures/source-fs/tree-1/`):
|
||||
//!
|
||||
//! ```
|
||||
//! tree-1/
|
||||
//! ├── README.md
|
||||
//! ├── notes/
|
||||
//! │ ├── alpha.md
|
||||
//! │ └── beta.md
|
||||
//! ├── ignored/
|
||||
//! │ └── skip.tmp # excluded by .kbignore
|
||||
//! ├── .kbignore # contains: *.tmp
|
||||
//! └── .DS_Store # implicitly excluded
|
||||
//! ```
|
||||
//!
|
||||
//! Two assertions:
|
||||
//! 1. Snapshot stability — `scan` output (with `discovered_at` stripped)
//!    equals the committed baseline JSON when both are compared as parsed
//!    `serde_json::Value`s (formatting of the baseline file is irrelevant).
|
||||
//! 2. Determinism — running `scan` twice produces byte-identical JSON
|
||||
//! after stripping `discovered_at`.
|
||||
//!
|
||||
//! `discovered_at` is wall-clock and intentionally NOT part of the
|
||||
//! contract: the task spec says strip it before comparison.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use kb_config::Config;
|
||||
use kb_core::{SourceConnector, SourceScope};
|
||||
use kb_source_fs::FsSourceConnector;
|
||||
use serde_json::Value;
|
||||
|
||||
/// Repo root, derived from `CARGO_MANIFEST_DIR` (= `crates/kb-source-fs`).
|
||||
fn repo_root() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.parent()
|
||||
.unwrap()
|
||||
.parent()
|
||||
.unwrap()
|
||||
.to_path_buf()
|
||||
}
|
||||
|
||||
fn fixture_root() -> PathBuf {
|
||||
repo_root().join("fixtures/source-fs/tree-1")
|
||||
}
|
||||
|
||||
fn baseline_path() -> PathBuf {
|
||||
repo_root().join("fixtures/source-fs/tree-1.snapshot.json")
|
||||
}
|
||||
|
||||
fn cfg_for_fixture(root: &str) -> Config {
|
||||
let mut c = Config::defaults();
|
||||
c.workspace.root = root.to_string();
|
||||
// Clear default excludes (`.git/**`, `node_modules/**`, `.obsidian/**`)
|
||||
// so the snapshot is purely a function of the fixture + .kbignore +
|
||||
// baked-in default-excludes.
|
||||
c.workspace.exclude.clear();
|
||||
c
|
||||
}
|
||||
|
||||
/// Run `scan` against the fixture and return the JSON value with every
|
||||
/// `discovered_at` field replaced by the literal string "<stripped>".
|
||||
/// Also strip `source_uri.value` and `stored.path` because they contain
|
||||
/// absolute paths that vary by checkout location — the snapshot must be
|
||||
/// portable across machines and CI checkout dirs.
|
||||
fn scan_and_strip() -> Value {
|
||||
let root = fixture_root();
|
||||
let cfg = cfg_for_fixture(root.to_str().unwrap());
|
||||
let conn = FsSourceConnector::new(&cfg).expect("connector init");
|
||||
let assets = conn
|
||||
.scan(&SourceScope::default())
|
||||
.expect("scan must succeed against committed fixture");
|
||||
|
||||
let mut v = serde_json::to_value(&assets).expect("serialize");
|
||||
if let Value::Array(items) = &mut v {
|
||||
for item in items {
|
||||
if let Value::Object(map) = item {
|
||||
map.insert(
|
||||
"discovered_at".to_string(),
|
||||
Value::String("<stripped>".to_string()),
|
||||
);
|
||||
// source_uri = { kind: "file", value: "<abs>" } — strip value.
|
||||
if let Some(Value::Object(s)) = map.get_mut("source_uri") {
|
||||
if s.contains_key("value") {
|
||||
s.insert("value".to_string(), Value::String("<stripped>".to_string()));
|
||||
}
|
||||
}
|
||||
// stored = { kind: "copied"|"reference", path: "<abs>", ... } — strip path.
|
||||
if let Some(Value::Object(s)) = map.get_mut("stored") {
|
||||
if s.contains_key("path") {
|
||||
s.insert("path".to_string(), Value::String("<stripped>".to_string()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
v
|
||||
}
|
||||
|
||||
#[test]
fn tree_1_snapshot_matches_baseline() {
    let actual = scan_and_strip();
    let baseline = baseline_path();

    // Regeneration mode: with KB_REGEN_SNAPSHOT set, (re)write the baseline
    // and stop *before* trying to read it. This is the only path allowed
    // to create the file from scratch; it fails on purpose so a regen run
    // is never mistaken for a passing verification.
    if std::env::var_os("KB_REGEN_SNAPSHOT").is_some() {
        let pretty = serde_json::to_string_pretty(&actual).unwrap() + "\n";
        std::fs::write(&baseline, pretty).expect("write baseline");
        panic!("regenerated baseline; rerun without KB_REGEN_SNAPSHOT to verify");
    }

    let baseline_text = match std::fs::read_to_string(&baseline) {
        Ok(text) => text,
        Err(_) => panic!(
            "missing baseline at {} — regenerate via `KB_REGEN_SNAPSHOT=1 cargo test \
             -p kb-source-fs --test snapshot_tree1 -- tree_1_snapshot_matches_baseline`",
            baseline.display()
        ),
    };
    let expected: Value =
        serde_json::from_str(&baseline_text).expect("baseline JSON must parse");

    if actual == expected {
        return;
    }

    // Mismatch: print both sides pretty-printed so the drift is readable.
    let actual_pretty = serde_json::to_string_pretty(&actual).unwrap();
    let expected_pretty = serde_json::to_string_pretty(&expected).unwrap();
    panic!(
        "snapshot drift.\n--- expected ---\n{expected_pretty}\n--- actual ---\n{actual_pretty}\n"
    );
}
|
||||
|
||||
#[test]
fn tree_1_scan_is_deterministic() {
    // Two independent scans, each serialized to a compact JSON string after
    // stripping, must be identical.
    let first = serde_json::to_string(&scan_and_strip()).unwrap();
    let second = serde_json::to_string(&scan_and_strip()).unwrap();
    assert_eq!(first, second, "two consecutive scans diverged");
}
|
||||
160
crates/kb-source-fs/tests/symlink_cycle.rs
Normal file
160
crates/kb-source-fs/tests/symlink_cycle.rs
Normal file
@@ -0,0 +1,160 @@
|
||||
//! Integration test: a `notes/` symlink whose target points back at the
|
||||
//! workspace root MUST NOT cause `scan` to loop forever or panic.
|
||||
//!
|
||||
//! Layout (built per-test in a tempdir):
|
||||
//! root/
|
||||
//! ├── alpha.md
|
||||
//! ├── notes/ (symlink → root) ← cycle: root → notes → root → …
|
||||
//!
|
||||
//! Expected: `scan` returns in O(seconds), every emitted path is unique,
|
||||
//! and `alpha.md` appears at least once.
|
||||
//!
|
||||
//! The cycle guard lives in `walker::walk_files`; this test exists to
|
||||
//! prove it catches the realistic shape (cycle through one or more
|
||||
//! symlinks) end-to-end via the public API.
|
||||
|
||||
#![cfg(unix)]
|
||||
|
||||
use std::os::unix::fs::symlink;
|
||||
|
||||
use kb_config::Config;
|
||||
use kb_core::{SourceConnector, SourceScope};
|
||||
use kb_source_fs::FsSourceConnector;
|
||||
|
||||
fn cfg_with_root(root: &str) -> Config {
|
||||
let mut c = Config::defaults();
|
||||
c.workspace.root = root.to_string();
|
||||
c.workspace.exclude.clear();
|
||||
c
|
||||
}
|
||||
|
||||
#[test]
fn symlink_cycle_does_not_loop_or_crash() {
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();

    std::fs::write(root.join("alpha.md"), b"alpha").unwrap();
    // `root/notes` is a symlink back to `root`: a self-cycle through the
    // link `notes`.
    symlink(root, root.join("notes")).unwrap();

    let config = cfg_with_root(root.to_str().unwrap());
    let connector = FsSourceConnector::new(&config).expect("connector init");
    let assets = connector
        .scan(&SourceScope::default())
        .expect("scan must return, not loop");

    // No workspace path may be emitted twice.
    let mut seen = std::collections::HashSet::new();
    for asset in &assets {
        let fresh = seen.insert(asset.workspace_path.0.clone());
        assert!(fresh, "duplicate workspace_path: {}", asset.workspace_path.0);
    }

    // The real file must be present under its direct path.
    let has_alpha = assets.iter().any(|a| a.workspace_path.0 == "alpha.md");
    assert!(
        has_alpha,
        "expected alpha.md in scan output, got: {:?}",
        assets.iter().map(|a| &a.workspace_path.0).collect::<Vec<_>>()
    );
}
|
||||
|
||||
#[test]
fn dangling_symlink_pseudo_cycle_does_not_crash() {
    // Layout:
    //   root/
    //   ├── alpha.md
    //   ├── a → b   (b is itself only a symlink)
    //   └── b → a   (a is itself only a symlink)
    //
    // Neither link ever resolves to a real file or directory, so this is
    // not a genuine directory cycle (that case is covered by
    // `two_step_directory_cycle_visited_set_breaks_loop`). The point here
    // is only that links which never resolve do not crash or stall the
    // scan.
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    std::fs::write(root.join("alpha.md"), b"alpha").unwrap();
    symlink(root.join("b"), root.join("a")).unwrap();
    symlink(root.join("a"), root.join("b")).unwrap();

    let config = cfg_with_root(root.to_str().unwrap());
    let connector = FsSourceConnector::new(&config).expect("connector init");

    // The scan must complete and still surface the one real file.
    let assets = connector
        .scan(&SourceScope::default())
        .expect("scan must return");
    let has_alpha = assets.iter().any(|a| a.workspace_path.0 == "alpha.md");
    assert!(has_alpha);
}
|
||||
|
||||
#[test]
fn two_step_directory_cycle_visited_set_breaks_loop() {
    // A genuine two-step directory cycle through symlinks:
    //   root/
    //   ├── a/
    //   │   ├── inside_a.md
    //   │   └── loop → ../b   (symlink to a real directory)
    //   └── b/
    //       ├── inside_b.md
    //       └── loop → ../a   (symlink to a real directory)
    //
    // Following links naively gives a → a/loop (=b) → a/loop/loop (=a) → …
    // The scan must terminate on this layout, and the canonical-path
    // visited-set in `walker::walk_files` must keep each real directory
    // from being entered twice, giving a finite, repeatable result.
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    std::fs::create_dir(root.join("a")).unwrap();
    std::fs::create_dir(root.join("b")).unwrap();
    std::fs::write(root.join("a/inside_a.md"), b"a-content").unwrap();
    std::fs::write(root.join("b/inside_b.md"), b"b-content").unwrap();
    // Relative targets, so each link points at its sibling directory no
    // matter where the tempdir lives.
    symlink("../b", root.join("a/loop")).unwrap();
    symlink("../a", root.join("b/loop")).unwrap();

    let config = cfg_with_root(root.to_str().unwrap());
    let connector = FsSourceConnector::new(&config).expect("connector init");

    // Scan twice: both runs must terminate and list the same paths.
    let first = connector
        .scan(&SourceScope::default())
        .expect("scan must return");
    let second = connector
        .scan(&SourceScope::default())
        .expect("scan must return");

    let names1: Vec<String> = first.iter().map(|a| a.workspace_path.0.clone()).collect();
    let names2: Vec<String> = second.iter().map(|a| a.workspace_path.0.clone()).collect();
    assert_eq!(names1, names2, "scan must be deterministic across runs");

    // No workspace path may be emitted twice.
    let mut seen = std::collections::HashSet::new();
    for asset in &first {
        let fresh = seen.insert(asset.workspace_path.0.clone());
        assert!(fresh, "duplicate workspace_path: {}", asset.workspace_path.0);
    }

    // Both real files must show up. The directory prefix they appear under
    // depends on which side of the cycle was entered first, so match on
    // the file name only.
    let has_a = first
        .iter()
        .any(|a| a.workspace_path.0.ends_with("inside_a.md"));
    assert!(has_a, "expected inside_a.md in scan output, got: {names1:?}");
    let has_b = first
        .iter()
        .any(|a| a.workspace_path.0.ends_with("inside_b.md"));
    assert!(has_b, "expected inside_b.md in scan output, got: {names1:?}");

    // Sanity bound: two real files plus a working cycle guard give a tiny
    // result. 50 or more entries means the guard has regressed.
    assert!(
        first.len() < 50,
        "scan emitted {} assets — cycle guard likely regressed: {:?}",
        first.len(),
        names1
    );
}
|
||||
68
fixtures/source-fs/tree-1.snapshot.json
Normal file
68
fixtures/source-fs/tree-1.snapshot.json
Normal file
@@ -0,0 +1,68 @@
|
||||
[
|
||||
{
|
||||
"asset_id": "bd6e5649e546d6ac94c3269ffe7192c5",
|
||||
"byte_len": 6,
|
||||
"checksum": "f6b71def043f1fd92f2d34969a7272a9d134730551de8c9754c4be79fbc0aef3",
|
||||
"discovered_at": "<stripped>",
|
||||
"media_type": {
|
||||
"other": ""
|
||||
},
|
||||
"source_uri": {
|
||||
"kind": "file",
|
||||
"value": "<stripped>"
|
||||
},
|
||||
"stored": {
|
||||
"kind": "copied",
|
||||
"path": "<stripped>"
|
||||
},
|
||||
"workspace_path": ".kbignore"
|
||||
},
|
||||
{
|
||||
"asset_id": "ba6cd31cab86eff7a86638ee76494bcf",
|
||||
"byte_len": 169,
|
||||
"checksum": "b0124489083674f6ad99a57ee5fc425feb71754a538a97a1ab580e8eb9b1f1c1",
|
||||
"discovered_at": "<stripped>",
|
||||
"media_type": "markdown",
|
||||
"source_uri": {
|
||||
"kind": "file",
|
||||
"value": "<stripped>"
|
||||
},
|
||||
"stored": {
|
||||
"kind": "copied",
|
||||
"path": "<stripped>"
|
||||
},
|
||||
"workspace_path": "README.md"
|
||||
},
|
||||
{
|
||||
"asset_id": "3381fcc34cf9415a391ba6b0dc6037c5",
|
||||
"byte_len": 11,
|
||||
"checksum": "e9fa9a5e0725d7bf6ec9d1565d3921eb6b62aa7f0db40c1c3ffebda7475d4258",
|
||||
"discovered_at": "<stripped>",
|
||||
"media_type": "markdown",
|
||||
"source_uri": {
|
||||
"kind": "file",
|
||||
"value": "<stripped>"
|
||||
},
|
||||
"stored": {
|
||||
"kind": "copied",
|
||||
"path": "<stripped>"
|
||||
},
|
||||
"workspace_path": "notes/alpha.md"
|
||||
},
|
||||
{
|
||||
"asset_id": "e300aa98aec843d2df1dd8f43702b257",
|
||||
"byte_len": 10,
|
||||
"checksum": "3e4df2f43563730d61672ce67a9bf479bc7c7a2f1384e2081d52e06f143353ed",
|
||||
"discovered_at": "<stripped>",
|
||||
"media_type": "markdown",
|
||||
"source_uri": {
|
||||
"kind": "file",
|
||||
"value": "<stripped>"
|
||||
},
|
||||
"stored": {
|
||||
"kind": "copied",
|
||||
"path": "<stripped>"
|
||||
},
|
||||
"workspace_path": "notes/beta.md"
|
||||
}
|
||||
]
|
||||
1
fixtures/source-fs/tree-1/.DS_Store
vendored
Normal file
1
fixtures/source-fs/tree-1/.DS_Store
vendored
Normal file
@@ -0,0 +1 @@
|
||||
macOS Finder metadata placeholder. Implicitly excluded by FsSourceConnector.
|
||||
1
fixtures/source-fs/tree-1/.kbignore
Normal file
1
fixtures/source-fs/tree-1/.kbignore
Normal file
@@ -0,0 +1 @@
|
||||
*.tmp
|
||||
5
fixtures/source-fs/tree-1/README.md
Normal file
5
fixtures/source-fs/tree-1/README.md
Normal file
@@ -0,0 +1,5 @@
|
||||
# tree-1
|
||||
|
||||
Fixture for `kb-source-fs` snapshot tests. Contents are intentionally tiny
|
||||
and stable — bumping a byte here will require regenerating the snapshot
|
||||
baseline.
|
||||
1
fixtures/source-fs/tree-1/ignored/skip.tmp
Normal file
1
fixtures/source-fs/tree-1/ignored/skip.tmp
Normal file
@@ -0,0 +1 @@
|
||||
should be excluded by .kbignore
|
||||
1
fixtures/source-fs/tree-1/notes/alpha.md
Normal file
1
fixtures/source-fs/tree-1/notes/alpha.md
Normal file
@@ -0,0 +1 @@
|
||||
alpha note
|
||||
1
fixtures/source-fs/tree-1/notes/beta.md
Normal file
1
fixtures/source-fs/tree-1/notes/beta.md
Normal file
@@ -0,0 +1 @@
|
||||
beta note
|
||||
Reference in New Issue
Block a user