feat(p10-1a-1): wire IngestReport skip counters by category (gitignore/builtin/kebabignore)

Refactor walker to expose WalkOverrides (combined + per-source matchers),
add walk_files_with_skips that returns accepted files alongside skip
attribution, wire FsSourceConnector::scan_with_skips into kebab-app so
IngestReport.skipped_gitignore, skipped_kebabignore, skipped_builtin_blacklist,
and skip_examples are populated instead of left at zero. Priority order
per spec §5.2 (builtin > gitignore > kebabignore) enforced in classify_skip,
with a directory-aware builtin matcher so pruned directory entries are
correctly attributed to builtin rather than a coincident gitignore entry.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
th-kim0823
2026-05-15 16:42:28 +09:00
parent 8bbe25dc10
commit 9fce24b106
4 changed files with 644 additions and 164 deletions

View File

@@ -44,7 +44,7 @@ use kebab_core::{
Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput,
EmbeddingKind, ExtractContext, Extractor, IngestReport, Lang, LanguageModel, MediaType,
ParserVersion, RawAsset, SearchHit, SearchQuery, SkipExamples, SourceConnector, SourceScope,
ParserVersion, RawAsset, SearchHit, SearchQuery, SourceScope,
SourceUri, VectorRecord, VectorStore,
};
use kebab_llm_local::OllamaLanguageModel;
@@ -305,8 +305,8 @@ pub fn ingest_with_config_opts(
);
let connector = FsSourceConnector::new(&app.config)
.context("kb-app::ingest: build FsSourceConnector")?;
let assets = connector
.scan(&scope)
let (assets, fs_skips) = connector
.scan_with_skips(&scope)
.context("kb-app::ingest: scan workspace")?;
crate::ingest_progress::emit(
progress,
@@ -675,12 +675,12 @@ pub fn ingest_with_config_opts(
errors: error_count,
duration_ms,
skipped_by_extension,
skipped_gitignore: 0,
skipped_kebabignore: 0,
skipped_builtin_blacklist: 0,
skipped_gitignore: fs_skips.skipped_gitignore,
skipped_kebabignore: fs_skips.skipped_kebabignore,
skipped_builtin_blacklist: fs_skips.skipped_builtin_blacklist,
skipped_generated: 0,
skipped_size_exceeded: 0,
skip_examples: SkipExamples::default(),
skip_examples: fs_skips.skip_examples,
items: if summary_only { None } else { Some(items) },
})
}

View File

@@ -10,20 +10,20 @@
//! }
//! ```
use std::path::PathBuf;
use std::path::{Path, PathBuf};
use anyhow::{Context, Result};
use time::OffsetDateTime;
use kebab_config::Config;
use kebab_core::{
AssetStorage, Checksum, RawAsset, SourceConnector, SourceScope, SourceUri,
AssetStorage, Checksum, RawAsset, SkipExamples, SourceConnector, SourceScope, SourceUri,
id_for_asset, to_posix,
};
use crate::hash::hash_file;
use crate::media::media_type_for;
use crate::walker::{build_overrides, read_kbignore, walk_files};
use crate::walker::{SkipCategory, WalkOverrides, build_overrides, read_kbignore, walk_files_with_skips};
/// Local-filesystem `SourceConnector`. Constructed once from `Config`,
/// reused across `scan` calls.
@@ -61,107 +61,179 @@ impl FsSourceConnector {
copy_threshold_bytes,
})
}
}
impl SourceConnector for FsSourceConnector {
fn scan(&self, scope: &SourceScope) -> Result<Vec<RawAsset>> {
// `SourceScope::root` overrides config root when non-empty. This
// matches the design's "scope is the per-call lens; config is the
// default" split (§7.1).
/// Resolve the effective root and build the merged + per-source overrides.
fn resolve_scan_params(
&self,
scope: &SourceScope,
) -> Result<(PathBuf, Vec<String>, WalkOverrides)> {
let root = if scope.root.as_os_str().is_empty() {
self.default_root.clone()
} else {
scope.root.clone()
};
// Union: config.workspace.exclude scope.exclude .kebabignore.
// Per §6.2 the union of `.kebabignore` and `config.workspace.exclude`
// is the filter set. `scope.exclude` is added on top so a caller
// can layer a per-call narrowing.
let mut excludes = self.default_exclude.clone();
excludes.extend(scope.exclude.iter().cloned());
// .kebabignore is re-read on every scan() so users can edit it without
// restarting any long-running process.
let kbignore = read_kbignore(&root)?;
let overrides = build_overrides(&root, &excludes, &kbignore)?;
Ok((root, kbignore, overrides))
}
// TODO(P1-2/P1-3 router): apply SourceScope::include glob filter at the
// extractor router layer once that crate lands. SourceConnector emits all
// non-excluded files; routing by include-glob is a downstream concern
// (design §6.2 + §7.2 are silent on this split, treat it as router work).
//
// `scope.include` is intentionally ignored at this stage of the
// pipeline: per §6.2 the workspace-level include lives in
// `WorkspaceCfg` and is enforced by the asset writer / extractors.
// Surfacing it here would double-filter Markdown vs PDF before the
// extractor router gets to see them.
if !scope.include.is_empty() {
tracing::debug!(
count = scope.include.len(),
"FsSourceConnector ignores scope.include — handled by extractor router"
);
}
/// Scan the workspace and return the accepted assets together with
/// per-category skip counts and sample paths for `IngestReport`.
///
/// This is the **preferred entry point** for `kebab-app`: it provides
/// all the information needed to populate `IngestReport.skipped_gitignore`,
/// `skipped_kebabignore`, `skipped_builtin_blacklist`, and `skip_examples`
/// without a second walker pass.
pub fn scan_with_skips(
&self,
scope: &SourceScope,
) -> Result<(Vec<RawAsset>, FsScanSkips)> {
let (root, _kbignore, overrides) = self.resolve_scan_params(scope)?;
let files = walk_files(&root, &overrides)?;
// Suppress unused-variable warning — kbignore patterns are already
// baked into `overrides`; we don't need them again here.
let mut assets = Vec::with_capacity(files.len());
for abs in &files {
// `to_posix` does NFC + leading `./` strip + `#` rejection.
// Compute the workspace-relative path before handing to it so
// emitted `WorkspacePath` is always relative.
let rel = abs.strip_prefix(&root).unwrap_or(abs);
let workspace_path = match to_posix(rel) {
Ok(p) => p,
Err(e) => {
// A path containing `#` is the only documented reason
// `to_posix` fails today. Drop the file with a warning
// rather than aborting the entire scan — a single bad
// filename should not nuke a 10 000-file ingest.
tracing::warn!(
path = %abs.display(),
error = %e,
"skipping file: path is not a valid WorkspacePath",
log_scope_include_warning(scope);
let (files, skipped_entries) = walk_files_with_skips(&root, &overrides)?;
// Accumulate per-category skip counts and sample paths.
let mut fs_skips = FsScanSkips::default();
for entry in &skipped_entries {
match entry.category {
SkipCategory::BuiltinBlacklist => {
fs_skips.skipped_builtin_blacklist =
fs_skips.skipped_builtin_blacklist.saturating_add(1);
push_sample(
&mut fs_skips.skip_examples.builtin_blacklist,
&entry.path,
&root,
);
continue;
}
};
let media_type = media_type_for(abs);
let (byte_len, full_hex) = hash_file(abs)
.with_context(|| format!("hashing {}", abs.display()))?;
let checksum = Checksum(full_hex.clone());
let asset_id = id_for_asset(&full_hex);
// Storage variant signals *intent*, not an actual copy.
// P1-6 (asset writer) is responsible for the on-disk copy.
let stored = if byte_len > self.copy_threshold_bytes {
AssetStorage::Reference {
path: abs.clone(),
sha: checksum.clone(),
SkipCategory::Gitignore => {
fs_skips.skipped_gitignore =
fs_skips.skipped_gitignore.saturating_add(1);
push_sample(
&mut fs_skips.skip_examples.gitignore,
&entry.path,
&root,
);
}
} else {
AssetStorage::Copied { path: abs.clone() }
};
assets.push(RawAsset {
asset_id,
source_uri: SourceUri::File(abs.clone()),
workspace_path,
media_type,
byte_len,
checksum,
discovered_at: OffsetDateTime::now_utc(),
stored,
});
SkipCategory::Kebabignore => {
fs_skips.skipped_kebabignore =
fs_skips.skipped_kebabignore.saturating_add(1);
// kebabignore intentionally NOT in skip_examples per spec §5.5.
}
SkipCategory::Other => {
// DEFAULT_EXCLUDES or config.workspace.exclude — no dedicated
// IngestReport counter; these are lumped into the existing
// `skipped` field by kebab-app.
}
}
}
// Determinism: sort by workspace_path. WorkspacePath is a String
// newtype with stable lexicographic ordering. Two scans of the
// same tree must produce identical Vec<RawAsset> modulo the
// wall-clock `discovered_at` field.
assets.sort_by(|a, b| a.workspace_path.0.cmp(&b.workspace_path.0));
let assets = build_assets(&files, &root, self.copy_threshold_bytes)?;
Ok((assets, fs_skips))
}
}
/// Per-category skip counts and sample paths returned alongside the asset list
/// by [`FsSourceConnector::scan_with_skips`].
///
/// Populated from the walker's per-source matchers without a second pass.
#[derive(Debug, Default)]
pub struct FsScanSkips {
pub skipped_gitignore: u32,
pub skipped_kebabignore: u32,
pub skipped_builtin_blacklist: u32,
/// Sample paths per spec §5.5 (≤ 5 per category). Paths are
/// workspace-relative POSIX strings when available, absolute otherwise.
pub skip_examples: SkipExamples,
}
/// Push a path into a sample vec (cap = 5) as a workspace-relative POSIX
/// string. Falls back to the lossy absolute path if relativisation fails.
fn push_sample(samples: &mut Vec<String>, abs: &Path, root: &Path) {
if samples.len() >= 5 {
return;
}
let rel = abs.strip_prefix(root).unwrap_or(abs);
// Best-effort POSIX string; any non-UTF8 char → replacement char.
let s = rel.to_string_lossy().replace('\\', "/");
samples.push(s);
}
/// Convert a list of absolute file paths to `Vec<RawAsset>`, sorted by
/// workspace-relative POSIX path for determinism.
fn build_assets(
files: &[PathBuf],
root: &Path,
copy_threshold_bytes: u64,
) -> Result<Vec<RawAsset>> {
let mut assets = Vec::with_capacity(files.len());
for abs in files {
let rel = abs.strip_prefix(root).unwrap_or(abs);
let workspace_path = match to_posix(rel) {
Ok(p) => p,
Err(e) => {
tracing::warn!(
path = %abs.display(),
error = %e,
"skipping file: path is not a valid WorkspacePath",
);
continue;
}
};
let media_type = media_type_for(abs);
let (byte_len, full_hex) = hash_file(abs)
.with_context(|| format!("hashing {}", abs.display()))?;
let checksum = Checksum(full_hex.clone());
let asset_id = id_for_asset(&full_hex);
let stored = if byte_len > copy_threshold_bytes {
AssetStorage::Reference {
path: abs.clone(),
sha: checksum.clone(),
}
} else {
AssetStorage::Copied { path: abs.clone() }
};
assets.push(RawAsset {
asset_id,
source_uri: SourceUri::File(abs.clone()),
workspace_path,
media_type,
byte_len,
checksum,
discovered_at: OffsetDateTime::now_utc(),
stored,
});
}
assets.sort_by(|a, b| a.workspace_path.0.cmp(&b.workspace_path.0));
Ok(assets)
}
fn log_scope_include_warning(scope: &SourceScope) {
if !scope.include.is_empty() {
tracing::debug!(
count = scope.include.len(),
"FsSourceConnector ignores scope.include — handled by extractor router"
);
}
}
impl SourceConnector for FsSourceConnector {
fn scan(&self, scope: &SourceScope) -> Result<Vec<RawAsset>> {
// Delegate to scan_with_skips; discard the skip counts.
// Callers that need skip attribution should call scan_with_skips directly.
let (assets, _skips) = self.scan_with_skips(scope)?;
Ok(assets)
}
}
@@ -401,4 +473,142 @@ mod tests {
let v2 = conn2.scan(&SourceScope::default()).unwrap();
assert!(matches!(v2[0].stored, AssetStorage::Copied { .. }));
}
// ── IngestReport skip counter wiring tests ───────────────────────────────
#[test]
fn scan_with_skips_counts_gitignored_files() {
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
std::fs::write(root.join(".gitignore"), "*.log\n").unwrap();
std::fs::write(root.join("ok.md"), b"# ok").unwrap();
std::fs::write(root.join("skipme.log"), b"x").unwrap();
let conn =
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
.unwrap();
let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap();
assert!(
skips.skipped_gitignore >= 1,
"skipped_gitignore should be >= 1; got {}",
skips.skipped_gitignore
);
assert!(
skips.skip_examples.gitignore.iter().any(|p| p.contains("skipme.log")),
"skip_examples.gitignore should contain 'skipme.log'; got: {:?}",
skips.skip_examples.gitignore
);
// kebabignore counter must be 0 — file matched gitignore, not kebabignore.
assert_eq!(skips.skipped_kebabignore, 0);
}
#[test]
fn scan_with_skips_counts_builtin_blacklist_dirs() {
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
std::fs::create_dir_all(root.join("node_modules/foo")).unwrap();
std::fs::write(root.join("node_modules/foo/bar.js"), b"x").unwrap();
std::fs::write(root.join("ok.md"), b"# ok").unwrap();
let conn =
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
.unwrap();
let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap();
assert!(
skips.skipped_builtin_blacklist >= 1,
"skipped_builtin_blacklist should be >= 1; got {}",
skips.skipped_builtin_blacklist
);
assert!(
skips.skip_examples.builtin_blacklist.iter().any(|p| p.contains("node_modules")),
"skip_examples.builtin_blacklist should contain a node_modules path; got: {:?}",
skips.skip_examples.builtin_blacklist
);
}
#[test]
fn scan_with_skips_kebabignore_increments_counter_no_example() {
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
std::fs::write(root.join(".kebabignore"), "*.secret\n").unwrap();
std::fs::write(root.join("ok.md"), b"x").unwrap();
std::fs::write(root.join("creds.secret"), b"pw").unwrap();
let conn =
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
.unwrap();
let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap();
assert!(
skips.skipped_kebabignore >= 1,
"skipped_kebabignore should be >= 1; got {}",
skips.skipped_kebabignore
);
// Per spec §5.5: kebabignore is intentionally NOT in skip_examples.
assert!(
skips.skip_examples.gitignore.is_empty(),
"gitignore examples should be empty; got: {:?}",
skips.skip_examples.gitignore
);
assert!(
skips.skip_examples.builtin_blacklist.is_empty(),
"builtin_blacklist examples should be empty; got: {:?}",
skips.skip_examples.builtin_blacklist
);
}
#[test]
fn scan_with_skips_builtin_priority_over_gitignore() {
// node_modules/ matches both BUILTIN_BLACKLIST and a .gitignore entry.
// It must be attributed to builtin (spec §5.2 priority order).
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
std::fs::write(root.join(".gitignore"), "node_modules/\n").unwrap();
std::fs::create_dir_all(root.join("node_modules/pkg")).unwrap();
std::fs::write(root.join("node_modules/pkg/index.js"), b"x").unwrap();
std::fs::write(root.join("ok.md"), b"x").unwrap();
let conn =
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
.unwrap();
let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap();
assert!(
skips.skipped_builtin_blacklist >= 1,
"builtin counter should be >= 1; got {}",
skips.skipped_builtin_blacklist
);
assert_eq!(
skips.skipped_gitignore, 0,
"gitignore counter must be 0 when builtin wins; got {}",
skips.skipped_gitignore
);
}
#[test]
fn skip_examples_cap_at_five() {
// Write 7 .log files — skip_examples.gitignore must cap at 5.
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
std::fs::write(root.join(".gitignore"), "*.log\n").unwrap();
for i in 0..7 {
std::fs::write(root.join(format!("f{i}.log")), b"x").unwrap();
}
std::fs::write(root.join("ok.md"), b"x").unwrap();
let conn =
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
.unwrap();
let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap();
assert_eq!(skips.skipped_gitignore, 7, "should count all 7");
assert_eq!(
skips.skip_examples.gitignore.len(),
5,
"skip_examples.gitignore must cap at 5; got: {:?}",
skips.skip_examples.gitignore
);
}
}

View File

@@ -13,4 +13,4 @@ mod hash;
mod media;
mod walker;
pub use connector::FsSourceConnector;
pub use connector::{FsScanSkips, FsSourceConnector};

View File

@@ -35,7 +35,7 @@ use std::path::{Path, PathBuf};
use anyhow::{Context, Result};
use ignore::overrides::{Override, OverrideBuilder};
use walkdir::{DirEntry, WalkDir};
use walkdir::WalkDir;
/// Default-excludes baked into the connector. These are NOT configurable;
/// they cover noise that is never useful to ingest and would otherwise need
@@ -49,7 +49,96 @@ const DEFAULT_EXCLUDES: &[&str] = &[
"**/._*",
];
/// Build the merged `Override` from all five filter sources, in order:
/// Per-source `Override` matchers for skip-counter attribution (spec §5.5).
///
/// `combined` is the merged union of all sources — used for the actual
/// "is this entry excluded?" decision in the walker. The three per-source
/// matchers (`gitignore`, `kebabignore`, `builtin`) are used ONLY when
/// classifying an already-excluded path for `IngestReport` counter wiring;
/// they are never consulted for every walked file.
///
/// `default_and_config` covers DEFAULT_EXCLUDES + `config.workspace.exclude`
/// — these do NOT map to any of the three named `IngestReport` counters.
pub(crate) struct WalkOverrides {
/// Merged matcher — same as today's `Override`; used for the walk decision.
pub combined: Override,
/// Matcher built from `<root>/.gitignore` patterns only.
pub gitignore: Override,
/// Matcher built from `<root>/.kebabignore` patterns only.
pub kebabignore: Override,
/// Matcher built from `kebab_parse_code::BUILTIN_BLACKLIST` only.
pub builtin: Override,
}
/// Skip attribution category. Used by the connector when counting per-source
/// skips for `IngestReport` (spec §5.5).
///
/// Priority order per spec §5.2: built-in > gitignore > kebabignore.
/// A path matching multiple sources is attributed to the first match.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum SkipCategory {
BuiltinBlacklist,
Gitignore,
Kebabignore,
/// Matched DEFAULT_EXCLUDES or `config.workspace.exclude`. No dedicated
/// counter in `IngestReport` — lumped into the existing `skipped` field.
Other,
}
/// Build a single `Override` from a list of gitignore-style patterns, all
/// registered as excludes (prepend `!`).
///
/// Empty pattern list → an `Override` that matches nothing (i.e. no
/// exclusions). Callers must strip blanks / comments before passing.
fn build_single_matcher(root: &Path, patterns: &[&str]) -> Result<Override> {
let mut builder = OverrideBuilder::new(root);
for pat in patterns {
builder
.add(&format!("!{pat}"))
.with_context(|| format!("invalid pattern: {pat}"))?;
}
builder.build().context("failed to compile override")
}
/// Build the builtin-blacklist `Override`, adding directory-level patterns in
/// addition to the `/**`-suffix ones from `BUILTIN_BLACKLIST`.
///
/// BUILTIN_BLACKLIST uses `**/X/**` patterns which match files *inside* X but
/// NOT the directory entry `X` itself (because `**/X/**` requires a path
/// component after X). The walker prunes at the directory level (`is_dir=true`),
/// so we need `**/X` (no trailing `/**`) to also match the directory itself
/// for attribution purposes.
fn build_builtin_matcher(root: &Path) -> Result<Override> {
let mut builder = OverrideBuilder::new(root);
for pat in kebab_parse_code::BUILTIN_BLACKLIST {
// Register the original pattern (matches files inside the dir).
builder
.add(&format!("!{pat}"))
.with_context(|| format!("builtin pattern: {pat}"))?;
// Also derive a directory-level match by stripping trailing `/**`.
// This makes `is_dir=true` checks on the directory itself work.
if let Some(dir_pat) = pat.strip_suffix("/**") {
builder
.add(&format!("!{dir_pat}"))
.with_context(|| format!("builtin dir pattern: {dir_pat}"))?;
}
}
builder.build().context("failed to compile builtin override")
}
/// Owned-string variant of `build_single_matcher` for caller-supplied
/// `Vec<String>` sources (config.workspace.exclude, .kebabignore).
fn build_single_matcher_owned(root: &Path, patterns: &[String]) -> Result<Override> {
let mut builder = OverrideBuilder::new(root);
for pat in patterns {
builder
.add(&format!("!{pat}"))
.with_context(|| format!("invalid pattern: {pat}"))?;
}
builder.build().context("failed to compile override")
}
/// Build the merged `WalkOverrides` from all five filter sources, in order:
/// DEFAULT_EXCLUDES, `config.workspace.exclude`, `.kebabignore`,
/// built-in safety-net blacklist (`kebab_parse_code::BUILTIN_BLACKLIST`),
/// and `<root>/.gitignore` (root-only, no nested cascade).
@@ -58,48 +147,82 @@ const DEFAULT_EXCLUDES: &[&str] = &[
/// leading `!` flips a positive match to a negative one in the
/// `OverrideBuilder` API). Order doesn't matter — the union is computed by
/// the underlying gitignore engine.
///
/// The three per-source matchers (`gitignore`, `kebabignore`, `builtin`) are
/// built in addition to the combined one so the connector can attribute skips
/// to the correct `IngestReport` counter without a second walker pass.
pub(crate) fn build_overrides(
root: &Path,
config_exclude: &[String],
kbignore_patterns: &[String],
) -> Result<Override> {
let mut builder = OverrideBuilder::new(root);
) -> Result<WalkOverrides> {
let gitignore_patterns = read_gitignore(root)?;
// Per-source matchers (for attribution only).
let gitignore =
build_single_matcher(root, &gitignore_patterns.iter().map(|s| s.as_str()).collect::<Vec<_>>())?;
let kebabignore = build_single_matcher_owned(root, kbignore_patterns)?;
// Use the directory-aware builtin matcher so that `is_dir=true` checks on
// directory entries (e.g., `node_modules/`) are attributed to builtin rather
// than to an overlapping gitignore pattern.
let builtin = build_builtin_matcher(root)?;
// Combined matcher — union of all five sources.
let mut combined_builder = OverrideBuilder::new(root);
for pat in DEFAULT_EXCLUDES {
builder
combined_builder
.add(&format!("!{pat}"))
.with_context(|| format!("invalid default-exclude pattern: {pat}"))?;
}
for pat in config_exclude {
builder
combined_builder
.add(&format!("!{pat}"))
.with_context(|| format!("invalid workspace.exclude pattern: {pat}"))?;
}
for pat in kbignore_patterns {
builder
combined_builder
.add(&format!("!{pat}"))
.with_context(|| format!("invalid .kebabignore pattern: {pat}"))?;
}
// p10-1A-1: built-in safety-net blacklist (spec §5.2). 6 patterns
// universal across ecosystems. User can negate via `.kebabignore`.
for pat in kebab_parse_code::BUILTIN_BLACKLIST {
builder
combined_builder
.add(&format!("!{pat}"))
.with_context(|| format!("built-in blacklist pattern: {pat}"))?;
}
// p10-1A-1: honor repo-root `.gitignore` (spec §5.2). Read once,
// merge with same convention as user `.kebabignore`. Nested
// cascade deferred to P+.
let gitignore_patterns = read_gitignore(root)?;
for pat in &gitignore_patterns {
builder
combined_builder
.add(&format!("!{pat}"))
.with_context(|| format!(".gitignore pattern: {pat}"))?;
}
let combined = combined_builder
.build()
.context("failed to compile combined override set")?;
builder.build().context("failed to compile override set")
Ok(WalkOverrides {
combined,
gitignore,
kebabignore,
builtin,
})
}
/// Classify why a path was excluded, using per-source matchers in spec §5.2
/// priority order: built-in > gitignore > kebabignore > other.
///
/// `rel` must be relative to the walker root (same as `Override::matched`
/// expects). `is_dir` should match what the original walker saw.
pub(crate) fn classify_skip(rel: &Path, is_dir: bool, ov: &WalkOverrides) -> SkipCategory {
if ov.builtin.matched(rel, is_dir).is_ignore() {
return SkipCategory::BuiltinBlacklist;
}
if ov.gitignore.matched(rel, is_dir).is_ignore() {
return SkipCategory::Gitignore;
}
if ov.kebabignore.matched(rel, is_dir).is_ignore() {
return SkipCategory::Kebabignore;
}
SkipCategory::Other
}
/// Read `<root>/.gitignore` (single-file, root-only — nested cascade is P+).
@@ -161,51 +284,87 @@ pub(crate) fn read_kbignore(root: &Path) -> Result<Vec<String>> {
.collect())
}
/// Iterate every regular file under `root`, applying `overrides` and
/// detecting symlink cycles. Returns absolute file paths.
/// Skipped-path record emitted by `walk_files_with_skips`.
///
/// `path` is the absolute path of the excluded entry (dir or file).
/// For excluded directories, this is the directory itself — individual
/// files inside are not enumerated (the subtree is pruned).
pub(crate) struct SkippedEntry {
pub path: PathBuf,
pub category: SkipCategory,
}
/// Iterate every regular file under `root`, applying `overrides.combined` and
/// detecting symlink cycles. Returns:
/// - `accepted`: absolute paths of files that passed all filters.
/// - `skipped`: entries that were excluded, with attribution.
///
/// For excluded *directories*, the directory path itself is returned (not the
/// individual files inside — the subtree is pruned in one step, matching the
/// walker's `skip_current_dir` behavior).
///
/// Strategy:
/// - `walkdir::WalkDir::follow_links(true)` to traverse symlinks.
/// - Manual per-entry check (instead of `filter_entry`) so we can capture
/// the excluded paths for skip attribution.
/// - Maintain `visited: HashSet<PathBuf>` of *canonical* paths. Before
/// descending into a directory entry, canonicalize and check the set;
/// if already present, skip. This breaks `a -> b -> a` cycles in O(n)
/// per entry without a custom recursive walker.
/// - For each yielded entry, ask `overrides` whether it is excluded; if
/// so, drop it. If the entry is a directory, also short-circuit
/// `WalkDir`'s descent via `it.skip_current_dir()`.
pub(crate) fn walk_files(root: &Path, overrides: &Override) -> Result<Vec<PathBuf>> {
let mut out = Vec::new();
pub(crate) fn walk_files_with_skips(
root: &Path,
overrides: &WalkOverrides,
) -> Result<(Vec<PathBuf>, Vec<SkippedEntry>)> {
let mut accepted = Vec::new();
let mut skipped: Vec<SkippedEntry> = Vec::new();
let mut visited: HashSet<PathBuf> = HashSet::new();
// Use a non-filtering iterator so we see excluded entries too.
let walker = WalkDir::new(root).follow_links(true).into_iter();
let mut it = walker.filter_entry(|e| !is_excluded(e, root, overrides));
// We still use filter_entry for the *combined* override so that walkdir
// can short-circuit pruned directories. But we wrap it so we can capture
// the exclusion reason before discarding the entry.
//
// Problem: filter_entry discards without letting us see the entry first.
// Solution: use the raw iterator (no filter_entry) and manage skip_current_dir
// manually, which lets us record what was excluded before pruning.
let mut it = walker;
while let Some(res) = it.next() {
let entry = match res {
Ok(e) => e,
Err(err) => {
// `walkdir` surfaces I/O errors AND its own cycle detector
// (when follow_links is on it sometimes catches them).
// Either way: log and skip; do not abort the whole scan.
tracing::warn!(error = %err, "walkdir entry error; skipping");
continue;
}
};
let path = entry.path();
let rel = match path.strip_prefix(root) {
Ok(p) => p,
Err(_) => path,
};
let is_dir = entry.file_type().is_dir();
let excluded = overrides.combined.matched(rel, is_dir).is_ignore();
// Cycle guard: only canonicalize symlinks (cheap on the common case
// of plain files/dirs) and on directories that are followed via a
// symlink. `walkdir`'s `path_is_symlink()` is true when the entry's
// *original* path is a symlink (it returns true for the link, not
// for the resolved target). For non-symlinked directories we still
// record the canonical path so a *later* symlink that points back
// to one of them is detected.
if entry.file_type().is_dir() {
if excluded {
let cat = classify_skip(rel, is_dir, overrides);
skipped.push(SkippedEntry {
path: path.to_path_buf(),
category: cat,
});
if is_dir {
// Prune the subtree — don't descend into excluded dirs.
it.skip_current_dir();
}
continue;
}
// Cycle guard for directories.
if is_dir {
match std::fs::canonicalize(path) {
Ok(canon) => {
if !visited.insert(canon) {
// Already visited via another path → break cycle.
it.skip_current_dir();
continue;
}
@@ -222,26 +381,13 @@ pub(crate) fn walk_files(root: &Path, overrides: &Override) -> Result<Vec<PathBu
}
if entry.file_type().is_file() {
out.push(path.to_path_buf());
accepted.push(path.to_path_buf());
}
}
Ok(out)
Ok((accepted, skipped))
}
fn is_excluded(entry: &DirEntry, root: &Path, overrides: &Override) -> bool {
// `Override::matched(path, is_dir)` uses the path *relative to* the
// override builder's root. `walkdir` gives absolute paths when
// `WalkDir::new` was given an absolute path — strip the root prefix
// before consulting the override.
let rel = match entry.path().strip_prefix(root) {
Ok(p) => p,
Err(_) => entry.path(),
};
overrides
.matched(rel, entry.file_type().is_dir())
.is_ignore()
}
#[cfg(test)]
mod tests {
@@ -252,7 +398,7 @@ mod tests {
let dir = tempfile::tempdir().unwrap();
let ov = build_overrides(dir.path(), &[], &[]).unwrap();
// Default-excludes only; non-special files should not match.
let m = ov.matched(Path::new("notes/alpha.md"), false);
let m = ov.combined.matched(Path::new("notes/alpha.md"), false);
assert!(!m.is_ignore());
}
@@ -260,13 +406,13 @@ mod tests {
fn default_excludes_ds_store_and_resource_forks() {
let dir = tempfile::tempdir().unwrap();
let ov = build_overrides(dir.path(), &[], &[]).unwrap();
assert!(ov.matched(Path::new(".DS_Store"), false).is_ignore());
assert!(ov.combined.matched(Path::new(".DS_Store"), false).is_ignore());
assert!(
ov.matched(Path::new("notes/.DS_Store"), false).is_ignore()
ov.combined.matched(Path::new("notes/.DS_Store"), false).is_ignore()
);
assert!(ov.matched(Path::new("._foo.md"), false).is_ignore());
assert!(ov.combined.matched(Path::new("._foo.md"), false).is_ignore());
assert!(
ov.matched(Path::new("notes/._sidecar"), false).is_ignore()
ov.combined.matched(Path::new("notes/._sidecar"), false).is_ignore()
);
}
@@ -279,13 +425,13 @@ mod tests {
&[],
)
.unwrap();
assert!(ov.matched(Path::new("a.tmp"), false).is_ignore());
assert!(ov.matched(Path::new("notes/x.tmp"), false).is_ignore());
assert!(ov.combined.matched(Path::new("a.tmp"), false).is_ignore());
assert!(ov.combined.matched(Path::new("notes/x.tmp"), false).is_ignore());
assert!(
ov.matched(Path::new("node_modules/foo/bar.js"), false)
ov.combined.matched(Path::new("node_modules/foo/bar.js"), false)
.is_ignore()
);
assert!(!ov.matched(Path::new("alpha.md"), false).is_ignore());
assert!(!ov.combined.matched(Path::new("alpha.md"), false).is_ignore());
}
#[test]
@@ -298,9 +444,9 @@ mod tests {
&["secret/**".to_string()],
)
.unwrap();
assert!(ov.matched(Path::new("a.tmp"), false).is_ignore());
assert!(ov.combined.matched(Path::new("a.tmp"), false).is_ignore());
assert!(
ov.matched(Path::new("secret/key.md"), false).is_ignore()
ov.combined.matched(Path::new("secret/key.md"), false).is_ignore()
);
}
@@ -337,8 +483,8 @@ mod tests {
let overrides = build_overrides(root, &[], &[]).unwrap();
// Override::matched expects paths relative to the builder's root.
let m_in = overrides.matched(Path::new("src/main.rs"), false);
let m_out = overrides.matched(Path::new("node_modules/foo/bar.js"), false);
let m_in = overrides.combined.matched(Path::new("src/main.rs"), false);
let m_out = overrides.combined.matched(Path::new("node_modules/foo/bar.js"), false);
assert!(!m_in.is_ignore(), "src/main.rs should NOT be ignored");
assert!(m_out.is_ignore(), "node_modules/foo/bar.js SHOULD be ignored");
@@ -367,10 +513,10 @@ mod tests {
"venv/x/y.txt",
"env/x/y.txt",
] {
let m = overrides.matched(Path::new(blacklisted), false);
let m = overrides.combined.matched(Path::new(blacklisted), false);
assert!(m.is_ignore(), "{blacklisted} should be ignored");
}
let m_ok = overrides.matched(Path::new("ok/z.txt"), false);
let m_ok = overrides.combined.matched(Path::new("ok/z.txt"), false);
assert!(!m_ok.is_ignore(), "ok/z.txt should not be ignored");
}
@@ -389,9 +535,9 @@ mod tests {
fs::write(root.join("dist/bundle.js"), "x").unwrap();
let overrides = build_overrides(root, &[], &[]).unwrap();
assert!(overrides.matched(Path::new("a.log"), false).is_ignore());
assert!(overrides.matched(Path::new("dist/bundle.js"), false).is_ignore());
assert!(!overrides.matched(Path::new("src/main.rs"), false).is_ignore());
assert!(overrides.combined.matched(Path::new("a.log"), false).is_ignore());
assert!(overrides.combined.matched(Path::new("dist/bundle.js"), false).is_ignore());
assert!(!overrides.combined.matched(Path::new("src/main.rs"), false).is_ignore());
}
#[test]
@@ -407,8 +553,8 @@ mod tests {
// No .gitignore present — patterns from .gitignore should not affect overrides.
let overrides = build_overrides(root, &[], &[]).unwrap();
assert!(!overrides.matched(Path::new("a.log"), false).is_ignore());
assert!(!overrides.matched(Path::new("src/main.rs"), false).is_ignore());
assert!(!overrides.combined.matched(Path::new("a.log"), false).is_ignore());
assert!(!overrides.combined.matched(Path::new("src/main.rs"), false).is_ignore());
}
#[test]
@@ -424,4 +570,128 @@ mod tests {
let result = build_overrides(root, &[], &[]);
assert!(result.is_ok(), "should not error on negation pattern: {:?}", result.err());
}
// ── Skip attribution tests ────────────────────────────────────────────────
#[test]
fn classify_skip_attributes_builtin_over_gitignore() {
use std::fs;
use tempfile::TempDir;
let tmp = TempDir::new().unwrap();
let root = tmp.path();
// node_modules matches both BUILTIN_BLACKLIST and a hypothetical
// .gitignore entry. Builtin must win (priority order §5.2).
fs::write(root.join(".gitignore"), "node_modules/\n").unwrap();
let ov = build_overrides(root, &[], &[]).unwrap();
// node_modules/ dir itself
let cat = classify_skip(Path::new("node_modules"), true, &ov);
assert_eq!(cat, SkipCategory::BuiltinBlacklist, "builtin must have priority");
}
#[test]
fn classify_skip_attributes_gitignore_for_log_files() {
use std::fs;
use tempfile::TempDir;
let tmp = TempDir::new().unwrap();
let root = tmp.path();
fs::write(root.join(".gitignore"), "*.log\n").unwrap();
let ov = build_overrides(root, &[], &[]).unwrap();
let cat = classify_skip(Path::new("app.log"), false, &ov);
assert_eq!(cat, SkipCategory::Gitignore);
}
#[test]
fn classify_skip_attributes_kebabignore() {
use tempfile::TempDir;
let tmp = TempDir::new().unwrap();
let root = tmp.path();
let ov = build_overrides(root, &[], &["*.secret".to_string()]).unwrap();
let cat = classify_skip(Path::new("creds.secret"), false, &ov);
assert_eq!(cat, SkipCategory::Kebabignore);
}
#[test]
fn walk_files_with_skips_counts_gitignored_files() {
use std::fs;
use tempfile::TempDir;
let tmp = TempDir::new().unwrap();
let root = tmp.path();
fs::write(root.join(".gitignore"), "*.log\n").unwrap();
fs::write(root.join("ok.md"), "# ok").unwrap();
fs::write(root.join("skipme.log"), "x").unwrap();
let ov = build_overrides(root, &[], &[]).unwrap();
let (accepted, skipped_entries) = walk_files_with_skips(root, &ov).unwrap();
let accepted_names: Vec<_> = accepted
.iter()
.map(|p| p.file_name().unwrap().to_string_lossy().into_owned())
.collect();
assert!(
accepted_names.iter().any(|n| n == "ok.md"),
"ok.md should be accepted; got: {accepted_names:?}"
);
assert!(
!accepted_names.iter().any(|n| n == "skipme.log"),
"skipme.log should not be accepted; got: {accepted_names:?}"
);
let gitignore_skipped: Vec<_> = skipped_entries
.iter()
.filter(|e| e.category == SkipCategory::Gitignore)
.collect();
assert!(
gitignore_skipped.iter().any(|e| e.path.file_name()
.map(|n| n == "skipme.log")
.unwrap_or(false)),
"skipme.log should appear in gitignore_skipped; skipped: {:?}",
skipped_entries.iter().map(|e| &e.path).collect::<Vec<_>>()
);
}
#[test]
fn walk_files_with_skips_counts_builtin_blacklist_dirs() {
use std::fs;
use tempfile::TempDir;
let tmp = TempDir::new().unwrap();
let root = tmp.path();
fs::create_dir_all(root.join("node_modules/foo")).unwrap();
fs::write(root.join("node_modules/foo/bar.js"), "x").unwrap();
fs::write(root.join("ok.md"), "# ok").unwrap();
let ov = build_overrides(root, &[], &[]).unwrap();
let (accepted, skipped_entries) = walk_files_with_skips(root, &ov).unwrap();
let accepted_names: Vec<_> = accepted
.iter()
.map(|p| p.file_name().unwrap().to_string_lossy().into_owned())
.collect();
assert!(
accepted_names.iter().any(|n| n == "ok.md"),
"ok.md must be accepted; got: {accepted_names:?}"
);
let builtin_skipped: Vec<_> = skipped_entries
.iter()
.filter(|e| e.category == SkipCategory::BuiltinBlacklist)
.collect();
assert!(
!builtin_skipped.is_empty(),
"node_modules/ should produce at least one BuiltinBlacklist skip"
);
assert!(
builtin_skipped.iter().any(|e| e.path.components()
.any(|c| c.as_os_str() == "node_modules")),
"skipped path should contain node_modules; got: {:?}",
builtin_skipped.iter().map(|e| &e.path).collect::<Vec<_>>()
);
}
}