diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 3c8e03c..99f5597 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -44,7 +44,7 @@ use kebab_core::{ Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker, DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, EmbeddingKind, ExtractContext, Extractor, IngestReport, Lang, LanguageModel, MediaType, - ParserVersion, RawAsset, SearchHit, SearchQuery, SkipExamples, SourceConnector, SourceScope, + ParserVersion, RawAsset, SearchHit, SearchQuery, SourceScope, SourceUri, VectorRecord, VectorStore, }; use kebab_llm_local::OllamaLanguageModel; @@ -305,8 +305,8 @@ pub fn ingest_with_config_opts( ); let connector = FsSourceConnector::new(&app.config) .context("kb-app::ingest: build FsSourceConnector")?; - let assets = connector - .scan(&scope) + let (assets, fs_skips) = connector + .scan_with_skips(&scope) .context("kb-app::ingest: scan workspace")?; crate::ingest_progress::emit( progress, @@ -675,12 +675,12 @@ pub fn ingest_with_config_opts( errors: error_count, duration_ms, skipped_by_extension, - skipped_gitignore: 0, - skipped_kebabignore: 0, - skipped_builtin_blacklist: 0, + skipped_gitignore: fs_skips.skipped_gitignore, + skipped_kebabignore: fs_skips.skipped_kebabignore, + skipped_builtin_blacklist: fs_skips.skipped_builtin_blacklist, skipped_generated: 0, skipped_size_exceeded: 0, - skip_examples: SkipExamples::default(), + skip_examples: fs_skips.skip_examples, items: if summary_only { None } else { Some(items) }, }) } diff --git a/crates/kebab-source-fs/src/connector.rs b/crates/kebab-source-fs/src/connector.rs index b01c14a..d3cdb1f 100644 --- a/crates/kebab-source-fs/src/connector.rs +++ b/crates/kebab-source-fs/src/connector.rs @@ -10,20 +10,20 @@ //! } //! ``` -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use anyhow::{Context, Result}; use time::OffsetDateTime; use kebab_config::Config; use kebab_core::{ - AssetStorage, Checksum, RawAsset, SourceConnector, SourceScope, SourceUri, + AssetStorage, Checksum, RawAsset, SkipExamples, SourceConnector, SourceScope, SourceUri, id_for_asset, to_posix, }; use crate::hash::hash_file; use crate::media::media_type_for; -use crate::walker::{build_overrides, read_kbignore, walk_files}; +use crate::walker::{SkipCategory, WalkOverrides, build_overrides, read_kbignore, walk_files_with_skips}; /// Local-filesystem `SourceConnector`. Constructed once from `Config`, /// reused across `scan` calls. @@ -61,107 +61,179 @@ impl FsSourceConnector { copy_threshold_bytes, }) } -} -impl SourceConnector for FsSourceConnector { - fn scan(&self, scope: &SourceScope) -> Result> { - // `SourceScope::root` overrides config root when non-empty. This - // matches the design's "scope is the per-call lens; config is the - // default" split (§7.1). + /// Resolve the effective root and build the merged + per-source overrides. + fn resolve_scan_params( + &self, + scope: &SourceScope, + ) -> Result<(PathBuf, Vec, WalkOverrides)> { let root = if scope.root.as_os_str().is_empty() { self.default_root.clone() } else { scope.root.clone() }; - // Union: config.workspace.exclude ∪ scope.exclude ∪ .kebabignore. - // Per §6.2 the union of `.kebabignore` and `config.workspace.exclude` - // is the filter set. `scope.exclude` is added on top so a caller - // can layer a per-call narrowing. let mut excludes = self.default_exclude.clone(); excludes.extend(scope.exclude.iter().cloned()); - // .kebabignore is re-read on every scan() so users can edit it without - // restarting any long-running process. let kbignore = read_kbignore(&root)?; let overrides = build_overrides(&root, &excludes, &kbignore)?; + Ok((root, kbignore, overrides)) + } - // TODO(P1-2/P1-3 router): apply SourceScope::include glob filter at the - // extractor router layer once that crate lands. SourceConnector emits all - // non-excluded files; routing by include-glob is a downstream concern - // (design §6.2 + §7.2 are silent on this split, treat it as router work). - // - // `scope.include` is intentionally ignored at this stage of the - // pipeline: per §6.2 the workspace-level include lives in - // `WorkspaceCfg` and is enforced by the asset writer / extractors. - // Surfacing it here would double-filter Markdown vs PDF before the - // extractor router gets to see them. - if !scope.include.is_empty() { - tracing::debug!( - count = scope.include.len(), - "FsSourceConnector ignores scope.include — handled by extractor router" - ); - } + /// Scan the workspace and return the accepted assets together with + /// per-category skip counts and sample paths for `IngestReport`. + /// + /// This is the **preferred entry point** for `kebab-app`: it provides + /// all the information needed to populate `IngestReport.skipped_gitignore`, + /// `skipped_kebabignore`, `skipped_builtin_blacklist`, and `skip_examples` + /// without a second walker pass. + pub fn scan_with_skips( + &self, + scope: &SourceScope, + ) -> Result<(Vec, FsScanSkips)> { + let (root, _kbignore, overrides) = self.resolve_scan_params(scope)?; - let files = walk_files(&root, &overrides)?; + // Suppress unused-variable warning — kbignore patterns are already + // baked into `overrides`; we don't need them again here. - let mut assets = Vec::with_capacity(files.len()); - for abs in &files { - // `to_posix` does NFC + leading `./` strip + `#` rejection. - // Compute the workspace-relative path before handing to it so - // emitted `WorkspacePath` is always relative. - let rel = abs.strip_prefix(&root).unwrap_or(abs); - let workspace_path = match to_posix(rel) { - Ok(p) => p, - Err(e) => { - // A path containing `#` is the only documented reason - // `to_posix` fails today. Drop the file with a warning - // rather than aborting the entire scan — a single bad - // filename should not nuke a 10 000-file ingest. - tracing::warn!( - path = %abs.display(), - error = %e, - "skipping file: path is not a valid WorkspacePath", + log_scope_include_warning(scope); + + let (files, skipped_entries) = walk_files_with_skips(&root, &overrides)?; + + // Accumulate per-category skip counts and sample paths. + let mut fs_skips = FsScanSkips::default(); + for entry in &skipped_entries { + match entry.category { + SkipCategory::BuiltinBlacklist => { + fs_skips.skipped_builtin_blacklist = + fs_skips.skipped_builtin_blacklist.saturating_add(1); + push_sample( + &mut fs_skips.skip_examples.builtin_blacklist, + &entry.path, + &root, ); - continue; } - }; - - let media_type = media_type_for(abs); - let (byte_len, full_hex) = hash_file(abs) - .with_context(|| format!("hashing {}", abs.display()))?; - let checksum = Checksum(full_hex.clone()); - let asset_id = id_for_asset(&full_hex); - - // Storage variant signals *intent*, not an actual copy. - // P1-6 (asset writer) is responsible for the on-disk copy. - let stored = if byte_len > self.copy_threshold_bytes { - AssetStorage::Reference { - path: abs.clone(), - sha: checksum.clone(), + SkipCategory::Gitignore => { + fs_skips.skipped_gitignore = + fs_skips.skipped_gitignore.saturating_add(1); + push_sample( + &mut fs_skips.skip_examples.gitignore, + &entry.path, + &root, + ); } - } else { - AssetStorage::Copied { path: abs.clone() } - }; - - assets.push(RawAsset { - asset_id, - source_uri: SourceUri::File(abs.clone()), - workspace_path, - media_type, - byte_len, - checksum, - discovered_at: OffsetDateTime::now_utc(), - stored, - }); + SkipCategory::Kebabignore => { + fs_skips.skipped_kebabignore = + fs_skips.skipped_kebabignore.saturating_add(1); + // kebabignore intentionally NOT in skip_examples per spec §5.5. + } + SkipCategory::Other => { + // DEFAULT_EXCLUDES or config.workspace.exclude — no dedicated + // IngestReport counter; these are lumped into the existing + // `skipped` field by kebab-app. + } + } } - // Determinism: sort by workspace_path. WorkspacePath is a String - // newtype with stable lexicographic ordering. Two scans of the - // same tree must produce identical Vec modulo the - // wall-clock `discovered_at` field. - assets.sort_by(|a, b| a.workspace_path.0.cmp(&b.workspace_path.0)); + let assets = build_assets(&files, &root, self.copy_threshold_bytes)?; + Ok((assets, fs_skips)) + } +} +/// Per-category skip counts and sample paths returned alongside the asset list +/// by [`FsSourceConnector::scan_with_skips`]. +/// +/// Populated from the walker's per-source matchers without a second pass. +#[derive(Debug, Default)] +pub struct FsScanSkips { + pub skipped_gitignore: u32, + pub skipped_kebabignore: u32, + pub skipped_builtin_blacklist: u32, + /// Sample paths per spec §5.5 (≤ 5 per category). Paths are + /// workspace-relative POSIX strings when available, absolute otherwise. + pub skip_examples: SkipExamples, +} + +/// Push a path into a sample vec (cap = 5) as a workspace-relative POSIX +/// string. Falls back to the lossy absolute path if relativisation fails. +fn push_sample(samples: &mut Vec, abs: &Path, root: &Path) { + if samples.len() >= 5 { + return; + } + let rel = abs.strip_prefix(root).unwrap_or(abs); + // Best-effort POSIX string; any non-UTF8 char → replacement char. + let s = rel.to_string_lossy().replace('\\', "/"); + samples.push(s); +} + +/// Convert a list of absolute file paths to `Vec`, sorted by +/// workspace-relative POSIX path for determinism. +fn build_assets( + files: &[PathBuf], + root: &Path, + copy_threshold_bytes: u64, +) -> Result> { + let mut assets = Vec::with_capacity(files.len()); + for abs in files { + let rel = abs.strip_prefix(root).unwrap_or(abs); + let workspace_path = match to_posix(rel) { + Ok(p) => p, + Err(e) => { + tracing::warn!( + path = %abs.display(), + error = %e, + "skipping file: path is not a valid WorkspacePath", + ); + continue; + } + }; + + let media_type = media_type_for(abs); + let (byte_len, full_hex) = hash_file(abs) + .with_context(|| format!("hashing {}", abs.display()))?; + let checksum = Checksum(full_hex.clone()); + let asset_id = id_for_asset(&full_hex); + + let stored = if byte_len > copy_threshold_bytes { + AssetStorage::Reference { + path: abs.clone(), + sha: checksum.clone(), + } + } else { + AssetStorage::Copied { path: abs.clone() } + }; + + assets.push(RawAsset { + asset_id, + source_uri: SourceUri::File(abs.clone()), + workspace_path, + media_type, + byte_len, + checksum, + discovered_at: OffsetDateTime::now_utc(), + stored, + }); + } + + assets.sort_by(|a, b| a.workspace_path.0.cmp(&b.workspace_path.0)); + Ok(assets) +} + +fn log_scope_include_warning(scope: &SourceScope) { + if !scope.include.is_empty() { + tracing::debug!( + count = scope.include.len(), + "FsSourceConnector ignores scope.include — handled by extractor router" + ); + } +} + +impl SourceConnector for FsSourceConnector { + fn scan(&self, scope: &SourceScope) -> Result> { + // Delegate to scan_with_skips; discard the skip counts. + // Callers that need skip attribution should call scan_with_skips directly. + let (assets, _skips) = self.scan_with_skips(scope)?; Ok(assets) } } @@ -401,4 +473,142 @@ mod tests { let v2 = conn2.scan(&SourceScope::default()).unwrap(); assert!(matches!(v2[0].stored, AssetStorage::Copied { .. })); } + + // ── IngestReport skip counter wiring tests ─────────────────────────────── + + #[test] + fn scan_with_skips_counts_gitignored_files() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::write(root.join(".gitignore"), "*.log\n").unwrap(); + std::fs::write(root.join("ok.md"), b"# ok").unwrap(); + std::fs::write(root.join("skipme.log"), b"x").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap(); + + assert!( + skips.skipped_gitignore >= 1, + "skipped_gitignore should be >= 1; got {}", + skips.skipped_gitignore + ); + assert!( + skips.skip_examples.gitignore.iter().any(|p| p.contains("skipme.log")), + "skip_examples.gitignore should contain 'skipme.log'; got: {:?}", + skips.skip_examples.gitignore + ); + // kebabignore counter must be 0 — file matched gitignore, not kebabignore. + assert_eq!(skips.skipped_kebabignore, 0); + } + + #[test] + fn scan_with_skips_counts_builtin_blacklist_dirs() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::create_dir_all(root.join("node_modules/foo")).unwrap(); + std::fs::write(root.join("node_modules/foo/bar.js"), b"x").unwrap(); + std::fs::write(root.join("ok.md"), b"# ok").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap(); + + assert!( + skips.skipped_builtin_blacklist >= 1, + "skipped_builtin_blacklist should be >= 1; got {}", + skips.skipped_builtin_blacklist + ); + assert!( + skips.skip_examples.builtin_blacklist.iter().any(|p| p.contains("node_modules")), + "skip_examples.builtin_blacklist should contain a node_modules path; got: {:?}", + skips.skip_examples.builtin_blacklist + ); + } + + #[test] + fn scan_with_skips_kebabignore_increments_counter_no_example() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::write(root.join(".kebabignore"), "*.secret\n").unwrap(); + std::fs::write(root.join("ok.md"), b"x").unwrap(); + std::fs::write(root.join("creds.secret"), b"pw").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap(); + + assert!( + skips.skipped_kebabignore >= 1, + "skipped_kebabignore should be >= 1; got {}", + skips.skipped_kebabignore + ); + // Per spec §5.5: kebabignore is intentionally NOT in skip_examples. + assert!( + skips.skip_examples.gitignore.is_empty(), + "gitignore examples should be empty; got: {:?}", + skips.skip_examples.gitignore + ); + assert!( + skips.skip_examples.builtin_blacklist.is_empty(), + "builtin_blacklist examples should be empty; got: {:?}", + skips.skip_examples.builtin_blacklist + ); + } + + #[test] + fn scan_with_skips_builtin_priority_over_gitignore() { + // node_modules/ matches both BUILTIN_BLACKLIST and a .gitignore entry. + // It must be attributed to builtin (spec §5.2 priority order). + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::write(root.join(".gitignore"), "node_modules/\n").unwrap(); + std::fs::create_dir_all(root.join("node_modules/pkg")).unwrap(); + std::fs::write(root.join("node_modules/pkg/index.js"), b"x").unwrap(); + std::fs::write(root.join("ok.md"), b"x").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap(); + + assert!( + skips.skipped_builtin_blacklist >= 1, + "builtin counter should be >= 1; got {}", + skips.skipped_builtin_blacklist + ); + assert_eq!( + skips.skipped_gitignore, 0, + "gitignore counter must be 0 when builtin wins; got {}", + skips.skipped_gitignore + ); + } + + #[test] + fn skip_examples_cap_at_five() { + // Write 7 .log files — skip_examples.gitignore must cap at 5. + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + std::fs::write(root.join(".gitignore"), "*.log\n").unwrap(); + for i in 0..7 { + std::fs::write(root.join(format!("f{i}.log")), b"x").unwrap(); + } + std::fs::write(root.join("ok.md"), b"x").unwrap(); + + let conn = + FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())) + .unwrap(); + let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap(); + + assert_eq!(skips.skipped_gitignore, 7, "should count all 7"); + assert_eq!( + skips.skip_examples.gitignore.len(), + 5, + "skip_examples.gitignore must cap at 5; got: {:?}", + skips.skip_examples.gitignore + ); + } } diff --git a/crates/kebab-source-fs/src/lib.rs b/crates/kebab-source-fs/src/lib.rs index ea7f294..6975271 100644 --- a/crates/kebab-source-fs/src/lib.rs +++ b/crates/kebab-source-fs/src/lib.rs @@ -13,4 +13,4 @@ mod hash; mod media; mod walker; -pub use connector::FsSourceConnector; +pub use connector::{FsScanSkips, FsSourceConnector}; diff --git a/crates/kebab-source-fs/src/walker.rs b/crates/kebab-source-fs/src/walker.rs index 0d18baa..e963c0b 100644 --- a/crates/kebab-source-fs/src/walker.rs +++ b/crates/kebab-source-fs/src/walker.rs @@ -35,7 +35,7 @@ use std::path::{Path, PathBuf}; use anyhow::{Context, Result}; use ignore::overrides::{Override, OverrideBuilder}; -use walkdir::{DirEntry, WalkDir}; +use walkdir::WalkDir; /// Default-excludes baked into the connector. These are NOT configurable; /// they cover noise that is never useful to ingest and would otherwise need @@ -49,7 +49,96 @@ const DEFAULT_EXCLUDES: &[&str] = &[ "**/._*", ]; -/// Build the merged `Override` from all five filter sources, in order: +/// Per-source `Override` matchers for skip-counter attribution (spec §5.5). +/// +/// `combined` is the merged union of all sources — used for the actual +/// "is this entry excluded?" decision in the walker. The three per-source +/// matchers (`gitignore`, `kebabignore`, `builtin`) are used ONLY when +/// classifying an already-excluded path for `IngestReport` counter wiring; +/// they are never consulted for every walked file. +/// +/// `default_and_config` covers DEFAULT_EXCLUDES + `config.workspace.exclude` +/// — these do NOT map to any of the three named `IngestReport` counters. +pub(crate) struct WalkOverrides { + /// Merged matcher — same as today's `Override`; used for the walk decision. + pub combined: Override, + /// Matcher built from `/.gitignore` patterns only. + pub gitignore: Override, + /// Matcher built from `/.kebabignore` patterns only. + pub kebabignore: Override, + /// Matcher built from `kebab_parse_code::BUILTIN_BLACKLIST` only. + pub builtin: Override, +} + +/// Skip attribution category. Used by the connector when counting per-source +/// skips for `IngestReport` (spec §5.5). +/// +/// Priority order per spec §5.2: built-in > gitignore > kebabignore. +/// A path matching multiple sources is attributed to the first match. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum SkipCategory { + BuiltinBlacklist, + Gitignore, + Kebabignore, + /// Matched DEFAULT_EXCLUDES or `config.workspace.exclude`. No dedicated + /// counter in `IngestReport` — lumped into the existing `skipped` field. + Other, +} + +/// Build a single `Override` from a list of gitignore-style patterns, all +/// registered as excludes (prepend `!`). +/// +/// Empty pattern list → an `Override` that matches nothing (i.e. no +/// exclusions). Callers must strip blanks / comments before passing. +fn build_single_matcher(root: &Path, patterns: &[&str]) -> Result { + let mut builder = OverrideBuilder::new(root); + for pat in patterns { + builder + .add(&format!("!{pat}")) + .with_context(|| format!("invalid pattern: {pat}"))?; + } + builder.build().context("failed to compile override") +} + +/// Build the builtin-blacklist `Override`, adding directory-level patterns in +/// addition to the `/**`-suffix ones from `BUILTIN_BLACKLIST`. +/// +/// BUILTIN_BLACKLIST uses `**/X/**` patterns which match files *inside* X but +/// NOT the directory entry `X` itself (because `**/X/**` requires a path +/// component after X). The walker prunes at the directory level (`is_dir=true`), +/// so we need `**/X` (no trailing `/**`) to also match the directory itself +/// for attribution purposes. +fn build_builtin_matcher(root: &Path) -> Result { + let mut builder = OverrideBuilder::new(root); + for pat in kebab_parse_code::BUILTIN_BLACKLIST { + // Register the original pattern (matches files inside the dir). + builder + .add(&format!("!{pat}")) + .with_context(|| format!("builtin pattern: {pat}"))?; + // Also derive a directory-level match by stripping trailing `/**`. + // This makes `is_dir=true` checks on the directory itself work. + if let Some(dir_pat) = pat.strip_suffix("/**") { + builder + .add(&format!("!{dir_pat}")) + .with_context(|| format!("builtin dir pattern: {dir_pat}"))?; + } + } + builder.build().context("failed to compile builtin override") +} + +/// Owned-string variant of `build_single_matcher` for caller-supplied +/// `Vec` sources (config.workspace.exclude, .kebabignore). +fn build_single_matcher_owned(root: &Path, patterns: &[String]) -> Result { + let mut builder = OverrideBuilder::new(root); + for pat in patterns { + builder + .add(&format!("!{pat}")) + .with_context(|| format!("invalid pattern: {pat}"))?; + } + builder.build().context("failed to compile override") +} + +/// Build the merged `WalkOverrides` from all five filter sources, in order: /// DEFAULT_EXCLUDES, `config.workspace.exclude`, `.kebabignore`, /// built-in safety-net blacklist (`kebab_parse_code::BUILTIN_BLACKLIST`), /// and `/.gitignore` (root-only, no nested cascade). @@ -58,48 +147,82 @@ const DEFAULT_EXCLUDES: &[&str] = &[ /// leading `!` flips a positive match to a negative one in the /// `OverrideBuilder` API). Order doesn't matter — the union is computed by /// the underlying gitignore engine. +/// +/// The three per-source matchers (`gitignore`, `kebabignore`, `builtin`) are +/// built in addition to the combined one so the connector can attribute skips +/// to the correct `IngestReport` counter without a second walker pass. pub(crate) fn build_overrides( root: &Path, config_exclude: &[String], kbignore_patterns: &[String], -) -> Result { - let mut builder = OverrideBuilder::new(root); +) -> Result { + let gitignore_patterns = read_gitignore(root)?; + + // Per-source matchers (for attribution only). + let gitignore = + build_single_matcher(root, &gitignore_patterns.iter().map(|s| s.as_str()).collect::>())?; + let kebabignore = build_single_matcher_owned(root, kbignore_patterns)?; + // Use the directory-aware builtin matcher so that `is_dir=true` checks on + // directory entries (e.g., `node_modules/`) are attributed to builtin rather + // than to an overlapping gitignore pattern. + let builtin = build_builtin_matcher(root)?; + + // Combined matcher — union of all five sources. + let mut combined_builder = OverrideBuilder::new(root); for pat in DEFAULT_EXCLUDES { - builder + combined_builder .add(&format!("!{pat}")) .with_context(|| format!("invalid default-exclude pattern: {pat}"))?; } for pat in config_exclude { - builder + combined_builder .add(&format!("!{pat}")) .with_context(|| format!("invalid workspace.exclude pattern: {pat}"))?; } for pat in kbignore_patterns { - builder + combined_builder .add(&format!("!{pat}")) .with_context(|| format!("invalid .kebabignore pattern: {pat}"))?; } - - // p10-1A-1: built-in safety-net blacklist (spec §5.2). 6 patterns - // universal across ecosystems. User can negate via `.kebabignore`. for pat in kebab_parse_code::BUILTIN_BLACKLIST { - builder + combined_builder .add(&format!("!{pat}")) .with_context(|| format!("built-in blacklist pattern: {pat}"))?; } - - // p10-1A-1: honor repo-root `.gitignore` (spec §5.2). Read once, - // merge with same convention as user `.kebabignore`. Nested - // cascade deferred to P+. - let gitignore_patterns = read_gitignore(root)?; for pat in &gitignore_patterns { - builder + combined_builder .add(&format!("!{pat}")) .with_context(|| format!(".gitignore pattern: {pat}"))?; } + let combined = combined_builder + .build() + .context("failed to compile combined override set")?; - builder.build().context("failed to compile override set") + Ok(WalkOverrides { + combined, + gitignore, + kebabignore, + builtin, + }) +} + +/// Classify why a path was excluded, using per-source matchers in spec §5.2 +/// priority order: built-in > gitignore > kebabignore > other. +/// +/// `rel` must be relative to the walker root (same as `Override::matched` +/// expects). `is_dir` should match what the original walker saw. +pub(crate) fn classify_skip(rel: &Path, is_dir: bool, ov: &WalkOverrides) -> SkipCategory { + if ov.builtin.matched(rel, is_dir).is_ignore() { + return SkipCategory::BuiltinBlacklist; + } + if ov.gitignore.matched(rel, is_dir).is_ignore() { + return SkipCategory::Gitignore; + } + if ov.kebabignore.matched(rel, is_dir).is_ignore() { + return SkipCategory::Kebabignore; + } + SkipCategory::Other } /// Read `/.gitignore` (single-file, root-only — nested cascade is P+). @@ -161,51 +284,87 @@ pub(crate) fn read_kbignore(root: &Path) -> Result> { .collect()) } -/// Iterate every regular file under `root`, applying `overrides` and -/// detecting symlink cycles. Returns absolute file paths. +/// Skipped-path record emitted by `walk_files_with_skips`. +/// +/// `path` is the absolute path of the excluded entry (dir or file). +/// For excluded directories, this is the directory itself — individual +/// files inside are not enumerated (the subtree is pruned). +pub(crate) struct SkippedEntry { + pub path: PathBuf, + pub category: SkipCategory, +} + +/// Iterate every regular file under `root`, applying `overrides.combined` and +/// detecting symlink cycles. Returns: +/// - `accepted`: absolute paths of files that passed all filters. +/// - `skipped`: entries that were excluded, with attribution. +/// +/// For excluded *directories*, the directory path itself is returned (not the +/// individual files inside — the subtree is pruned in one step, matching the +/// walker's `skip_current_dir` behavior). /// /// Strategy: /// - `walkdir::WalkDir::follow_links(true)` to traverse symlinks. +/// - Manual per-entry check (instead of `filter_entry`) so we can capture +/// the excluded paths for skip attribution. /// - Maintain `visited: HashSet` of *canonical* paths. Before /// descending into a directory entry, canonicalize and check the set; /// if already present, skip. This breaks `a -> b -> a` cycles in O(n) /// per entry without a custom recursive walker. -/// - For each yielded entry, ask `overrides` whether it is excluded; if -/// so, drop it. If the entry is a directory, also short-circuit -/// `WalkDir`'s descent via `it.skip_current_dir()`. -pub(crate) fn walk_files(root: &Path, overrides: &Override) -> Result> { - let mut out = Vec::new(); +pub(crate) fn walk_files_with_skips( + root: &Path, + overrides: &WalkOverrides, +) -> Result<(Vec, Vec)> { + let mut accepted = Vec::new(); + let mut skipped: Vec = Vec::new(); let mut visited: HashSet = HashSet::new(); + // Use a non-filtering iterator so we see excluded entries too. let walker = WalkDir::new(root).follow_links(true).into_iter(); - let mut it = walker.filter_entry(|e| !is_excluded(e, root, overrides)); + // We still use filter_entry for the *combined* override so that walkdir + // can short-circuit pruned directories. But we wrap it so we can capture + // the exclusion reason before discarding the entry. + // + // Problem: filter_entry discards without letting us see the entry first. + // Solution: use the raw iterator (no filter_entry) and manage skip_current_dir + // manually, which lets us record what was excluded before pruning. + let mut it = walker; while let Some(res) = it.next() { let entry = match res { Ok(e) => e, Err(err) => { - // `walkdir` surfaces I/O errors AND its own cycle detector - // (when follow_links is on it sometimes catches them). - // Either way: log and skip; do not abort the whole scan. tracing::warn!(error = %err, "walkdir entry error; skipping"); continue; } }; let path = entry.path(); + let rel = match path.strip_prefix(root) { + Ok(p) => p, + Err(_) => path, + }; + let is_dir = entry.file_type().is_dir(); + let excluded = overrides.combined.matched(rel, is_dir).is_ignore(); - // Cycle guard: only canonicalize symlinks (cheap on the common case - // of plain files/dirs) and on directories that are followed via a - // symlink. `walkdir`'s `path_is_symlink()` is true when the entry's - // *original* path is a symlink (it returns true for the link, not - // for the resolved target). For non-symlinked directories we still - // record the canonical path so a *later* symlink that points back - // to one of them is detected. - if entry.file_type().is_dir() { + if excluded { + let cat = classify_skip(rel, is_dir, overrides); + skipped.push(SkippedEntry { + path: path.to_path_buf(), + category: cat, + }); + if is_dir { + // Prune the subtree — don't descend into excluded dirs. + it.skip_current_dir(); + } + continue; + } + + // Cycle guard for directories. + if is_dir { match std::fs::canonicalize(path) { Ok(canon) => { if !visited.insert(canon) { - // Already visited via another path → break cycle. it.skip_current_dir(); continue; } @@ -222,26 +381,13 @@ pub(crate) fn walk_files(root: &Path, overrides: &Override) -> Result bool { - // `Override::matched(path, is_dir)` uses the path *relative to* the - // override builder's root. `walkdir` gives absolute paths when - // `WalkDir::new` was given an absolute path — strip the root prefix - // before consulting the override. - let rel = match entry.path().strip_prefix(root) { - Ok(p) => p, - Err(_) => entry.path(), - }; - overrides - .matched(rel, entry.file_type().is_dir()) - .is_ignore() -} #[cfg(test)] mod tests { @@ -252,7 +398,7 @@ mod tests { let dir = tempfile::tempdir().unwrap(); let ov = build_overrides(dir.path(), &[], &[]).unwrap(); // Default-excludes only; non-special files should not match. - let m = ov.matched(Path::new("notes/alpha.md"), false); + let m = ov.combined.matched(Path::new("notes/alpha.md"), false); assert!(!m.is_ignore()); } @@ -260,13 +406,13 @@ mod tests { fn default_excludes_ds_store_and_resource_forks() { let dir = tempfile::tempdir().unwrap(); let ov = build_overrides(dir.path(), &[], &[]).unwrap(); - assert!(ov.matched(Path::new(".DS_Store"), false).is_ignore()); + assert!(ov.combined.matched(Path::new(".DS_Store"), false).is_ignore()); assert!( - ov.matched(Path::new("notes/.DS_Store"), false).is_ignore() + ov.combined.matched(Path::new("notes/.DS_Store"), false).is_ignore() ); - assert!(ov.matched(Path::new("._foo.md"), false).is_ignore()); + assert!(ov.combined.matched(Path::new("._foo.md"), false).is_ignore()); assert!( - ov.matched(Path::new("notes/._sidecar"), false).is_ignore() + ov.combined.matched(Path::new("notes/._sidecar"), false).is_ignore() ); } @@ -279,13 +425,13 @@ mod tests { &[], ) .unwrap(); - assert!(ov.matched(Path::new("a.tmp"), false).is_ignore()); - assert!(ov.matched(Path::new("notes/x.tmp"), false).is_ignore()); + assert!(ov.combined.matched(Path::new("a.tmp"), false).is_ignore()); + assert!(ov.combined.matched(Path::new("notes/x.tmp"), false).is_ignore()); assert!( - ov.matched(Path::new("node_modules/foo/bar.js"), false) + ov.combined.matched(Path::new("node_modules/foo/bar.js"), false) .is_ignore() ); - assert!(!ov.matched(Path::new("alpha.md"), false).is_ignore()); + assert!(!ov.combined.matched(Path::new("alpha.md"), false).is_ignore()); } #[test] @@ -298,9 +444,9 @@ mod tests { &["secret/**".to_string()], ) .unwrap(); - assert!(ov.matched(Path::new("a.tmp"), false).is_ignore()); + assert!(ov.combined.matched(Path::new("a.tmp"), false).is_ignore()); assert!( - ov.matched(Path::new("secret/key.md"), false).is_ignore() + ov.combined.matched(Path::new("secret/key.md"), false).is_ignore() ); } @@ -337,8 +483,8 @@ mod tests { let overrides = build_overrides(root, &[], &[]).unwrap(); // Override::matched expects paths relative to the builder's root. - let m_in = overrides.matched(Path::new("src/main.rs"), false); - let m_out = overrides.matched(Path::new("node_modules/foo/bar.js"), false); + let m_in = overrides.combined.matched(Path::new("src/main.rs"), false); + let m_out = overrides.combined.matched(Path::new("node_modules/foo/bar.js"), false); assert!(!m_in.is_ignore(), "src/main.rs should NOT be ignored"); assert!(m_out.is_ignore(), "node_modules/foo/bar.js SHOULD be ignored"); @@ -367,10 +513,10 @@ mod tests { "venv/x/y.txt", "env/x/y.txt", ] { - let m = overrides.matched(Path::new(blacklisted), false); + let m = overrides.combined.matched(Path::new(blacklisted), false); assert!(m.is_ignore(), "{blacklisted} should be ignored"); } - let m_ok = overrides.matched(Path::new("ok/z.txt"), false); + let m_ok = overrides.combined.matched(Path::new("ok/z.txt"), false); assert!(!m_ok.is_ignore(), "ok/z.txt should not be ignored"); } @@ -389,9 +535,9 @@ mod tests { fs::write(root.join("dist/bundle.js"), "x").unwrap(); let overrides = build_overrides(root, &[], &[]).unwrap(); - assert!(overrides.matched(Path::new("a.log"), false).is_ignore()); - assert!(overrides.matched(Path::new("dist/bundle.js"), false).is_ignore()); - assert!(!overrides.matched(Path::new("src/main.rs"), false).is_ignore()); + assert!(overrides.combined.matched(Path::new("a.log"), false).is_ignore()); + assert!(overrides.combined.matched(Path::new("dist/bundle.js"), false).is_ignore()); + assert!(!overrides.combined.matched(Path::new("src/main.rs"), false).is_ignore()); } #[test] @@ -407,8 +553,8 @@ mod tests { // No .gitignore present — patterns from .gitignore should not affect overrides. let overrides = build_overrides(root, &[], &[]).unwrap(); - assert!(!overrides.matched(Path::new("a.log"), false).is_ignore()); - assert!(!overrides.matched(Path::new("src/main.rs"), false).is_ignore()); + assert!(!overrides.combined.matched(Path::new("a.log"), false).is_ignore()); + assert!(!overrides.combined.matched(Path::new("src/main.rs"), false).is_ignore()); } #[test] @@ -424,4 +570,128 @@ mod tests { let result = build_overrides(root, &[], &[]); assert!(result.is_ok(), "should not error on negation pattern: {:?}", result.err()); } + + // ── Skip attribution tests ──────────────────────────────────────────────── + + #[test] + fn classify_skip_attributes_builtin_over_gitignore() { + use std::fs; + use tempfile::TempDir; + + let tmp = TempDir::new().unwrap(); + let root = tmp.path(); + // node_modules matches both BUILTIN_BLACKLIST and a hypothetical + // .gitignore entry. Builtin must win (priority order §5.2). + fs::write(root.join(".gitignore"), "node_modules/\n").unwrap(); + + let ov = build_overrides(root, &[], &[]).unwrap(); + // node_modules/ dir itself + let cat = classify_skip(Path::new("node_modules"), true, &ov); + assert_eq!(cat, SkipCategory::BuiltinBlacklist, "builtin must have priority"); + } + + #[test] + fn classify_skip_attributes_gitignore_for_log_files() { + use std::fs; + use tempfile::TempDir; + + let tmp = TempDir::new().unwrap(); + let root = tmp.path(); + fs::write(root.join(".gitignore"), "*.log\n").unwrap(); + + let ov = build_overrides(root, &[], &[]).unwrap(); + let cat = classify_skip(Path::new("app.log"), false, &ov); + assert_eq!(cat, SkipCategory::Gitignore); + } + + #[test] + fn classify_skip_attributes_kebabignore() { + use tempfile::TempDir; + + let tmp = TempDir::new().unwrap(); + let root = tmp.path(); + + let ov = build_overrides(root, &[], &["*.secret".to_string()]).unwrap(); + let cat = classify_skip(Path::new("creds.secret"), false, &ov); + assert_eq!(cat, SkipCategory::Kebabignore); + } + + #[test] + fn walk_files_with_skips_counts_gitignored_files() { + use std::fs; + use tempfile::TempDir; + + let tmp = TempDir::new().unwrap(); + let root = tmp.path(); + fs::write(root.join(".gitignore"), "*.log\n").unwrap(); + fs::write(root.join("ok.md"), "# ok").unwrap(); + fs::write(root.join("skipme.log"), "x").unwrap(); + + let ov = build_overrides(root, &[], &[]).unwrap(); + let (accepted, skipped_entries) = walk_files_with_skips(root, &ov).unwrap(); + + let accepted_names: Vec<_> = accepted + .iter() + .map(|p| p.file_name().unwrap().to_string_lossy().into_owned()) + .collect(); + assert!( + accepted_names.iter().any(|n| n == "ok.md"), + "ok.md should be accepted; got: {accepted_names:?}" + ); + assert!( + !accepted_names.iter().any(|n| n == "skipme.log"), + "skipme.log should not be accepted; got: {accepted_names:?}" + ); + + let gitignore_skipped: Vec<_> = skipped_entries + .iter() + .filter(|e| e.category == SkipCategory::Gitignore) + .collect(); + assert!( + gitignore_skipped.iter().any(|e| e.path.file_name() + .map(|n| n == "skipme.log") + .unwrap_or(false)), + "skipme.log should appear in gitignore_skipped; skipped: {:?}", + skipped_entries.iter().map(|e| &e.path).collect::>() + ); + } + + #[test] + fn walk_files_with_skips_counts_builtin_blacklist_dirs() { + use std::fs; + use tempfile::TempDir; + + let tmp = TempDir::new().unwrap(); + let root = tmp.path(); + fs::create_dir_all(root.join("node_modules/foo")).unwrap(); + fs::write(root.join("node_modules/foo/bar.js"), "x").unwrap(); + fs::write(root.join("ok.md"), "# ok").unwrap(); + + let ov = build_overrides(root, &[], &[]).unwrap(); + let (accepted, skipped_entries) = walk_files_with_skips(root, &ov).unwrap(); + + let accepted_names: Vec<_> = accepted + .iter() + .map(|p| p.file_name().unwrap().to_string_lossy().into_owned()) + .collect(); + assert!( + accepted_names.iter().any(|n| n == "ok.md"), + "ok.md must be accepted; got: {accepted_names:?}" + ); + + let builtin_skipped: Vec<_> = skipped_entries + .iter() + .filter(|e| e.category == SkipCategory::BuiltinBlacklist) + .collect(); + assert!( + !builtin_skipped.is_empty(), + "node_modules/ should produce at least one BuiltinBlacklist skip" + ); + assert!( + builtin_skipped.iter().any(|e| e.path.components() + .any(|c| c.as_os_str() == "node_modules")), + "skipped path should contain node_modules; got: {:?}", + builtin_skipped.iter().map(|e| &e.path).collect::>() + ); + } }