From 9fce24b1061c8556879ce80c81959a9e0c5c98a1 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Fri, 15 May 2026 16:42:28 +0900
Subject: [PATCH] feat(p10-1a-1): wire IngestReport skip counters by category
(gitignore/builtin/kebabignore)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Refactor walker to expose WalkOverrides (combined + per-source matchers),
add walk_files_with_skips that returns accepted files alongside skip
attribution, wire FsSourceConnector::scan_with_skips into kebab-app so
IngestReport.skipped_gitignore, skipped_kebabignore, skipped_builtin_blacklist,
and skip_examples are populated instead of left at zero. Priority order
per spec §5.2 (builtin > gitignore > kebabignore) enforced in classify_skip,
with a directory-aware builtin matcher so pruned directory entries are
correctly attributed to builtin rather than a coincident gitignore entry.
Co-Authored-By: Claude Sonnet 4.6
---
crates/kebab-app/src/lib.rs | 14 +-
crates/kebab-source-fs/src/connector.rs | 376 ++++++++++++++++-----
crates/kebab-source-fs/src/lib.rs | 2 +-
crates/kebab-source-fs/src/walker.rs | 416 +++++++++++++++++++-----
4 files changed, 644 insertions(+), 164 deletions(-)
diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs
index 3c8e03c..99f5597 100644
--- a/crates/kebab-app/src/lib.rs
+++ b/crates/kebab-app/src/lib.rs
@@ -44,7 +44,7 @@ use kebab_core::{
Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput,
EmbeddingKind, ExtractContext, Extractor, IngestReport, Lang, LanguageModel, MediaType,
- ParserVersion, RawAsset, SearchHit, SearchQuery, SkipExamples, SourceConnector, SourceScope,
+ ParserVersion, RawAsset, SearchHit, SearchQuery, SourceScope,
SourceUri, VectorRecord, VectorStore,
};
use kebab_llm_local::OllamaLanguageModel;
@@ -305,8 +305,8 @@ pub fn ingest_with_config_opts(
);
let connector = FsSourceConnector::new(&app.config)
.context("kb-app::ingest: build FsSourceConnector")?;
- let assets = connector
- .scan(&scope)
+ let (assets, fs_skips) = connector
+ .scan_with_skips(&scope)
.context("kb-app::ingest: scan workspace")?;
crate::ingest_progress::emit(
progress,
@@ -675,12 +675,12 @@ pub fn ingest_with_config_opts(
errors: error_count,
duration_ms,
skipped_by_extension,
- skipped_gitignore: 0,
- skipped_kebabignore: 0,
- skipped_builtin_blacklist: 0,
+ skipped_gitignore: fs_skips.skipped_gitignore,
+ skipped_kebabignore: fs_skips.skipped_kebabignore,
+ skipped_builtin_blacklist: fs_skips.skipped_builtin_blacklist,
skipped_generated: 0,
skipped_size_exceeded: 0,
- skip_examples: SkipExamples::default(),
+ skip_examples: fs_skips.skip_examples,
items: if summary_only { None } else { Some(items) },
})
}
diff --git a/crates/kebab-source-fs/src/connector.rs b/crates/kebab-source-fs/src/connector.rs
index b01c14a..d3cdb1f 100644
--- a/crates/kebab-source-fs/src/connector.rs
+++ b/crates/kebab-source-fs/src/connector.rs
@@ -10,20 +10,20 @@
//! }
//! ```
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
use anyhow::{Context, Result};
use time::OffsetDateTime;
use kebab_config::Config;
use kebab_core::{
- AssetStorage, Checksum, RawAsset, SourceConnector, SourceScope, SourceUri,
+ AssetStorage, Checksum, RawAsset, SkipExamples, SourceConnector, SourceScope, SourceUri,
id_for_asset, to_posix,
};
use crate::hash::hash_file;
use crate::media::media_type_for;
-use crate::walker::{build_overrides, read_kbignore, walk_files};
+use crate::walker::{SkipCategory, WalkOverrides, build_overrides, read_kbignore, walk_files_with_skips};
/// Local-filesystem `SourceConnector`. Constructed once from `Config`,
/// reused across `scan` calls.
@@ -61,107 +61,179 @@ impl FsSourceConnector {
copy_threshold_bytes,
})
}
-}
-impl SourceConnector for FsSourceConnector {
- fn scan(&self, scope: &SourceScope) -> Result> {
- // `SourceScope::root` overrides config root when non-empty. This
- // matches the design's "scope is the per-call lens; config is the
- // default" split (§7.1).
+ /// Resolve the effective root and build the merged + per-source overrides.
+ fn resolve_scan_params(
+ &self,
+ scope: &SourceScope,
+ ) -> Result<(PathBuf, Vec, WalkOverrides)> {
let root = if scope.root.as_os_str().is_empty() {
self.default_root.clone()
} else {
scope.root.clone()
};
- // Union: config.workspace.exclude ∪ scope.exclude ∪ .kebabignore.
- // Per §6.2 the union of `.kebabignore` and `config.workspace.exclude`
- // is the filter set. `scope.exclude` is added on top so a caller
- // can layer a per-call narrowing.
let mut excludes = self.default_exclude.clone();
excludes.extend(scope.exclude.iter().cloned());
- // .kebabignore is re-read on every scan() so users can edit it without
- // restarting any long-running process.
let kbignore = read_kbignore(&root)?;
let overrides = build_overrides(&root, &excludes, &kbignore)?;
+ Ok((root, kbignore, overrides))
+ }
- // TODO(P1-2/P1-3 router): apply SourceScope::include glob filter at the
- // extractor router layer once that crate lands. SourceConnector emits all
- // non-excluded files; routing by include-glob is a downstream concern
- // (design §6.2 + §7.2 are silent on this split, treat it as router work).
- //
- // `scope.include` is intentionally ignored at this stage of the
- // pipeline: per §6.2 the workspace-level include lives in
- // `WorkspaceCfg` and is enforced by the asset writer / extractors.
- // Surfacing it here would double-filter Markdown vs PDF before the
- // extractor router gets to see them.
- if !scope.include.is_empty() {
- tracing::debug!(
- count = scope.include.len(),
- "FsSourceConnector ignores scope.include — handled by extractor router"
- );
- }
+ /// Scan the workspace and return the accepted assets together with
+ /// per-category skip counts and sample paths for `IngestReport`.
+ ///
+ /// This is the **preferred entry point** for `kebab-app`: it provides
+ /// all the information needed to populate `IngestReport.skipped_gitignore`,
+ /// `skipped_kebabignore`, `skipped_builtin_blacklist`, and `skip_examples`
+ /// without a second walker pass.
+ pub fn scan_with_skips(
+ &self,
+ scope: &SourceScope,
+ ) -> Result<(Vec, FsScanSkips)> {
+ let (root, _kbignore, overrides) = self.resolve_scan_params(scope)?;
- let files = walk_files(&root, &overrides)?;
+ // Suppress unused-variable warning — kbignore patterns are already
+ // baked into `overrides`; we don't need them again here.
- let mut assets = Vec::with_capacity(files.len());
- for abs in &files {
- // `to_posix` does NFC + leading `./` strip + `#` rejection.
- // Compute the workspace-relative path before handing to it so
- // emitted `WorkspacePath` is always relative.
- let rel = abs.strip_prefix(&root).unwrap_or(abs);
- let workspace_path = match to_posix(rel) {
- Ok(p) => p,
- Err(e) => {
- // A path containing `#` is the only documented reason
- // `to_posix` fails today. Drop the file with a warning
- // rather than aborting the entire scan — a single bad
- // filename should not nuke a 10 000-file ingest.
- tracing::warn!(
- path = %abs.display(),
- error = %e,
- "skipping file: path is not a valid WorkspacePath",
+ log_scope_include_warning(scope);
+
+ let (files, skipped_entries) = walk_files_with_skips(&root, &overrides)?;
+
+ // Accumulate per-category skip counts and sample paths.
+ let mut fs_skips = FsScanSkips::default();
+ for entry in &skipped_entries {
+ match entry.category {
+ SkipCategory::BuiltinBlacklist => {
+ fs_skips.skipped_builtin_blacklist =
+ fs_skips.skipped_builtin_blacklist.saturating_add(1);
+ push_sample(
+ &mut fs_skips.skip_examples.builtin_blacklist,
+ &entry.path,
+ &root,
);
- continue;
}
- };
-
- let media_type = media_type_for(abs);
- let (byte_len, full_hex) = hash_file(abs)
- .with_context(|| format!("hashing {}", abs.display()))?;
- let checksum = Checksum(full_hex.clone());
- let asset_id = id_for_asset(&full_hex);
-
- // Storage variant signals *intent*, not an actual copy.
- // P1-6 (asset writer) is responsible for the on-disk copy.
- let stored = if byte_len > self.copy_threshold_bytes {
- AssetStorage::Reference {
- path: abs.clone(),
- sha: checksum.clone(),
+ SkipCategory::Gitignore => {
+ fs_skips.skipped_gitignore =
+ fs_skips.skipped_gitignore.saturating_add(1);
+ push_sample(
+ &mut fs_skips.skip_examples.gitignore,
+ &entry.path,
+ &root,
+ );
}
- } else {
- AssetStorage::Copied { path: abs.clone() }
- };
-
- assets.push(RawAsset {
- asset_id,
- source_uri: SourceUri::File(abs.clone()),
- workspace_path,
- media_type,
- byte_len,
- checksum,
- discovered_at: OffsetDateTime::now_utc(),
- stored,
- });
+ SkipCategory::Kebabignore => {
+ fs_skips.skipped_kebabignore =
+ fs_skips.skipped_kebabignore.saturating_add(1);
+ // kebabignore intentionally NOT in skip_examples per spec §5.5.
+ }
+ SkipCategory::Other => {
+ // DEFAULT_EXCLUDES or config.workspace.exclude — no dedicated
+ // IngestReport counter; these are lumped into the existing
+ // `skipped` field by kebab-app.
+ }
+ }
}
- // Determinism: sort by workspace_path. WorkspacePath is a String
- // newtype with stable lexicographic ordering. Two scans of the
- // same tree must produce identical Vec modulo the
- // wall-clock `discovered_at` field.
- assets.sort_by(|a, b| a.workspace_path.0.cmp(&b.workspace_path.0));
+ let assets = build_assets(&files, &root, self.copy_threshold_bytes)?;
+ Ok((assets, fs_skips))
+ }
+}
+/// Per-category skip counts and sample paths returned alongside the asset list
+/// by [`FsSourceConnector::scan_with_skips`].
+///
+/// Populated from the walker's per-source matchers without a second pass.
+#[derive(Debug, Default)]
+pub struct FsScanSkips {
+ pub skipped_gitignore: u32,
+ pub skipped_kebabignore: u32,
+ pub skipped_builtin_blacklist: u32,
+ /// Sample paths per spec §5.5 (≤ 5 per category). Paths are
+ /// workspace-relative POSIX strings when available, absolute otherwise.
+ pub skip_examples: SkipExamples,
+}
+
+/// Push a path into a sample vec (cap = 5) as a workspace-relative POSIX
+/// string. Falls back to the lossy absolute path if relativisation fails.
+fn push_sample(samples: &mut Vec, abs: &Path, root: &Path) {
+ if samples.len() >= 5 {
+ return;
+ }
+ let rel = abs.strip_prefix(root).unwrap_or(abs);
+ // Best-effort POSIX string; any non-UTF8 char → replacement char.
+ let s = rel.to_string_lossy().replace('\\', "/");
+ samples.push(s);
+}
+
+/// Convert a list of absolute file paths to `Vec`, sorted by
+/// workspace-relative POSIX path for determinism.
+fn build_assets(
+ files: &[PathBuf],
+ root: &Path,
+ copy_threshold_bytes: u64,
+) -> Result> {
+ let mut assets = Vec::with_capacity(files.len());
+ for abs in files {
+ let rel = abs.strip_prefix(root).unwrap_or(abs);
+ let workspace_path = match to_posix(rel) {
+ Ok(p) => p,
+ Err(e) => {
+ tracing::warn!(
+ path = %abs.display(),
+ error = %e,
+ "skipping file: path is not a valid WorkspacePath",
+ );
+ continue;
+ }
+ };
+
+ let media_type = media_type_for(abs);
+ let (byte_len, full_hex) = hash_file(abs)
+ .with_context(|| format!("hashing {}", abs.display()))?;
+ let checksum = Checksum(full_hex.clone());
+ let asset_id = id_for_asset(&full_hex);
+
+ let stored = if byte_len > copy_threshold_bytes {
+ AssetStorage::Reference {
+ path: abs.clone(),
+ sha: checksum.clone(),
+ }
+ } else {
+ AssetStorage::Copied { path: abs.clone() }
+ };
+
+ assets.push(RawAsset {
+ asset_id,
+ source_uri: SourceUri::File(abs.clone()),
+ workspace_path,
+ media_type,
+ byte_len,
+ checksum,
+ discovered_at: OffsetDateTime::now_utc(),
+ stored,
+ });
+ }
+
+ assets.sort_by(|a, b| a.workspace_path.0.cmp(&b.workspace_path.0));
+ Ok(assets)
+}
+
+fn log_scope_include_warning(scope: &SourceScope) {
+ if !scope.include.is_empty() {
+ tracing::debug!(
+ count = scope.include.len(),
+ "FsSourceConnector ignores scope.include — handled by extractor router"
+ );
+ }
+}
+
+impl SourceConnector for FsSourceConnector {
+ fn scan(&self, scope: &SourceScope) -> Result> {
+ // Delegate to scan_with_skips; discard the skip counts.
+ // Callers that need skip attribution should call scan_with_skips directly.
+ let (assets, _skips) = self.scan_with_skips(scope)?;
Ok(assets)
}
}
@@ -401,4 +473,142 @@ mod tests {
let v2 = conn2.scan(&SourceScope::default()).unwrap();
assert!(matches!(v2[0].stored, AssetStorage::Copied { .. }));
}
+
+ // ── IngestReport skip counter wiring tests ───────────────────────────────
+
+ #[test]
+ fn scan_with_skips_counts_gitignored_files() {
+ let dir = tempfile::tempdir().unwrap();
+ let root = dir.path();
+ std::fs::write(root.join(".gitignore"), "*.log\n").unwrap();
+ std::fs::write(root.join("ok.md"), b"# ok").unwrap();
+ std::fs::write(root.join("skipme.log"), b"x").unwrap();
+
+ let conn =
+ FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
+ .unwrap();
+ let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap();
+
+ assert!(
+ skips.skipped_gitignore >= 1,
+ "skipped_gitignore should be >= 1; got {}",
+ skips.skipped_gitignore
+ );
+ assert!(
+ skips.skip_examples.gitignore.iter().any(|p| p.contains("skipme.log")),
+ "skip_examples.gitignore should contain 'skipme.log'; got: {:?}",
+ skips.skip_examples.gitignore
+ );
+ // kebabignore counter must be 0 — file matched gitignore, not kebabignore.
+ assert_eq!(skips.skipped_kebabignore, 0);
+ }
+
+ #[test]
+ fn scan_with_skips_counts_builtin_blacklist_dirs() {
+ let dir = tempfile::tempdir().unwrap();
+ let root = dir.path();
+ std::fs::create_dir_all(root.join("node_modules/foo")).unwrap();
+ std::fs::write(root.join("node_modules/foo/bar.js"), b"x").unwrap();
+ std::fs::write(root.join("ok.md"), b"# ok").unwrap();
+
+ let conn =
+ FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
+ .unwrap();
+ let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap();
+
+ assert!(
+ skips.skipped_builtin_blacklist >= 1,
+ "skipped_builtin_blacklist should be >= 1; got {}",
+ skips.skipped_builtin_blacklist
+ );
+ assert!(
+ skips.skip_examples.builtin_blacklist.iter().any(|p| p.contains("node_modules")),
+ "skip_examples.builtin_blacklist should contain a node_modules path; got: {:?}",
+ skips.skip_examples.builtin_blacklist
+ );
+ }
+
+ #[test]
+ fn scan_with_skips_kebabignore_increments_counter_no_example() {
+ let dir = tempfile::tempdir().unwrap();
+ let root = dir.path();
+ std::fs::write(root.join(".kebabignore"), "*.secret\n").unwrap();
+ std::fs::write(root.join("ok.md"), b"x").unwrap();
+ std::fs::write(root.join("creds.secret"), b"pw").unwrap();
+
+ let conn =
+ FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
+ .unwrap();
+ let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap();
+
+ assert!(
+ skips.skipped_kebabignore >= 1,
+ "skipped_kebabignore should be >= 1; got {}",
+ skips.skipped_kebabignore
+ );
+ // Per spec §5.5: kebabignore is intentionally NOT in skip_examples.
+ assert!(
+ skips.skip_examples.gitignore.is_empty(),
+ "gitignore examples should be empty; got: {:?}",
+ skips.skip_examples.gitignore
+ );
+ assert!(
+ skips.skip_examples.builtin_blacklist.is_empty(),
+ "builtin_blacklist examples should be empty; got: {:?}",
+ skips.skip_examples.builtin_blacklist
+ );
+ }
+
+ #[test]
+ fn scan_with_skips_builtin_priority_over_gitignore() {
+ // node_modules/ matches both BUILTIN_BLACKLIST and a .gitignore entry.
+ // It must be attributed to builtin (spec §5.2 priority order).
+ let dir = tempfile::tempdir().unwrap();
+ let root = dir.path();
+ std::fs::write(root.join(".gitignore"), "node_modules/\n").unwrap();
+ std::fs::create_dir_all(root.join("node_modules/pkg")).unwrap();
+ std::fs::write(root.join("node_modules/pkg/index.js"), b"x").unwrap();
+ std::fs::write(root.join("ok.md"), b"x").unwrap();
+
+ let conn =
+ FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
+ .unwrap();
+ let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap();
+
+ assert!(
+ skips.skipped_builtin_blacklist >= 1,
+ "builtin counter should be >= 1; got {}",
+ skips.skipped_builtin_blacklist
+ );
+ assert_eq!(
+ skips.skipped_gitignore, 0,
+ "gitignore counter must be 0 when builtin wins; got {}",
+ skips.skipped_gitignore
+ );
+ }
+
+ #[test]
+ fn skip_examples_cap_at_five() {
+ // Write 7 .log files — skip_examples.gitignore must cap at 5.
+ let dir = tempfile::tempdir().unwrap();
+ let root = dir.path();
+ std::fs::write(root.join(".gitignore"), "*.log\n").unwrap();
+ for i in 0..7 {
+ std::fs::write(root.join(format!("f{i}.log")), b"x").unwrap();
+ }
+ std::fs::write(root.join("ok.md"), b"x").unwrap();
+
+ let conn =
+ FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
+ .unwrap();
+ let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap();
+
+ assert_eq!(skips.skipped_gitignore, 7, "should count all 7");
+ assert_eq!(
+ skips.skip_examples.gitignore.len(),
+ 5,
+ "skip_examples.gitignore must cap at 5; got: {:?}",
+ skips.skip_examples.gitignore
+ );
+ }
}
diff --git a/crates/kebab-source-fs/src/lib.rs b/crates/kebab-source-fs/src/lib.rs
index ea7f294..6975271 100644
--- a/crates/kebab-source-fs/src/lib.rs
+++ b/crates/kebab-source-fs/src/lib.rs
@@ -13,4 +13,4 @@ mod hash;
mod media;
mod walker;
-pub use connector::FsSourceConnector;
+pub use connector::{FsScanSkips, FsSourceConnector};
diff --git a/crates/kebab-source-fs/src/walker.rs b/crates/kebab-source-fs/src/walker.rs
index 0d18baa..e963c0b 100644
--- a/crates/kebab-source-fs/src/walker.rs
+++ b/crates/kebab-source-fs/src/walker.rs
@@ -35,7 +35,7 @@ use std::path::{Path, PathBuf};
use anyhow::{Context, Result};
use ignore::overrides::{Override, OverrideBuilder};
-use walkdir::{DirEntry, WalkDir};
+use walkdir::WalkDir;
/// Default-excludes baked into the connector. These are NOT configurable;
/// they cover noise that is never useful to ingest and would otherwise need
@@ -49,7 +49,96 @@ const DEFAULT_EXCLUDES: &[&str] = &[
"**/._*",
];
-/// Build the merged `Override` from all five filter sources, in order:
+/// Per-source `Override` matchers for skip-counter attribution (spec §5.5).
+///
+/// `combined` is the merged union of all sources — used for the actual
+/// "is this entry excluded?" decision in the walker. The three per-source
+/// matchers (`gitignore`, `kebabignore`, `builtin`) are used ONLY when
+/// classifying an already-excluded path for `IngestReport` counter wiring;
+/// they are never consulted for every walked file.
+///
+/// `default_and_config` covers DEFAULT_EXCLUDES + `config.workspace.exclude`
+/// — these do NOT map to any of the three named `IngestReport` counters.
+pub(crate) struct WalkOverrides {
+ /// Merged matcher — same as today's `Override`; used for the walk decision.
+ pub combined: Override,
+ /// Matcher built from `/.gitignore` patterns only.
+ pub gitignore: Override,
+ /// Matcher built from `/.kebabignore` patterns only.
+ pub kebabignore: Override,
+ /// Matcher built from `kebab_parse_code::BUILTIN_BLACKLIST` only.
+ pub builtin: Override,
+}
+
+/// Skip attribution category. Used by the connector when counting per-source
+/// skips for `IngestReport` (spec §5.5).
+///
+/// Priority order per spec §5.2: built-in > gitignore > kebabignore.
+/// A path matching multiple sources is attributed to the first match.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) enum SkipCategory {
+ BuiltinBlacklist,
+ Gitignore,
+ Kebabignore,
+ /// Matched DEFAULT_EXCLUDES or `config.workspace.exclude`. No dedicated
+ /// counter in `IngestReport` — lumped into the existing `skipped` field.
+ Other,
+}
+
+/// Build a single `Override` from a list of gitignore-style patterns, all
+/// registered as excludes (prepend `!`).
+///
+/// Empty pattern list → an `Override` that matches nothing (i.e. no
+/// exclusions). Callers must strip blanks / comments before passing.
+fn build_single_matcher(root: &Path, patterns: &[&str]) -> Result {
+ let mut builder = OverrideBuilder::new(root);
+ for pat in patterns {
+ builder
+ .add(&format!("!{pat}"))
+ .with_context(|| format!("invalid pattern: {pat}"))?;
+ }
+ builder.build().context("failed to compile override")
+}
+
+/// Build the builtin-blacklist `Override`, adding directory-level patterns in
+/// addition to the `/**`-suffix ones from `BUILTIN_BLACKLIST`.
+///
+/// BUILTIN_BLACKLIST uses `**/X/**` patterns which match files *inside* X but
+/// NOT the directory entry `X` itself (because `**/X/**` requires a path
+/// component after X). The walker prunes at the directory level (`is_dir=true`),
+/// so we need `**/X` (no trailing `/**`) to also match the directory itself
+/// for attribution purposes.
+fn build_builtin_matcher(root: &Path) -> Result {
+ let mut builder = OverrideBuilder::new(root);
+ for pat in kebab_parse_code::BUILTIN_BLACKLIST {
+ // Register the original pattern (matches files inside the dir).
+ builder
+ .add(&format!("!{pat}"))
+ .with_context(|| format!("builtin pattern: {pat}"))?;
+ // Also derive a directory-level match by stripping trailing `/**`.
+ // This makes `is_dir=true` checks on the directory itself work.
+ if let Some(dir_pat) = pat.strip_suffix("/**") {
+ builder
+ .add(&format!("!{dir_pat}"))
+ .with_context(|| format!("builtin dir pattern: {dir_pat}"))?;
+ }
+ }
+ builder.build().context("failed to compile builtin override")
+}
+
+/// Owned-string variant of `build_single_matcher` for caller-supplied
+/// `Vec` sources (config.workspace.exclude, .kebabignore).
+fn build_single_matcher_owned(root: &Path, patterns: &[String]) -> Result {
+ let mut builder = OverrideBuilder::new(root);
+ for pat in patterns {
+ builder
+ .add(&format!("!{pat}"))
+ .with_context(|| format!("invalid pattern: {pat}"))?;
+ }
+ builder.build().context("failed to compile override")
+}
+
+/// Build the merged `WalkOverrides` from all five filter sources, in order:
/// DEFAULT_EXCLUDES, `config.workspace.exclude`, `.kebabignore`,
/// built-in safety-net blacklist (`kebab_parse_code::BUILTIN_BLACKLIST`),
/// and `/.gitignore` (root-only, no nested cascade).
@@ -58,48 +147,82 @@ const DEFAULT_EXCLUDES: &[&str] = &[
/// leading `!` flips a positive match to a negative one in the
/// `OverrideBuilder` API). Order doesn't matter — the union is computed by
/// the underlying gitignore engine.
+///
+/// The three per-source matchers (`gitignore`, `kebabignore`, `builtin`) are
+/// built in addition to the combined one so the connector can attribute skips
+/// to the correct `IngestReport` counter without a second walker pass.
pub(crate) fn build_overrides(
root: &Path,
config_exclude: &[String],
kbignore_patterns: &[String],
-) -> Result {
- let mut builder = OverrideBuilder::new(root);
+) -> Result {
+ let gitignore_patterns = read_gitignore(root)?;
+
+ // Per-source matchers (for attribution only).
+ let gitignore =
+ build_single_matcher(root, &gitignore_patterns.iter().map(|s| s.as_str()).collect::>())?;
+ let kebabignore = build_single_matcher_owned(root, kbignore_patterns)?;
+ // Use the directory-aware builtin matcher so that `is_dir=true` checks on
+ // directory entries (e.g., `node_modules/`) are attributed to builtin rather
+ // than to an overlapping gitignore pattern.
+ let builtin = build_builtin_matcher(root)?;
+
+ // Combined matcher — union of all five sources.
+ let mut combined_builder = OverrideBuilder::new(root);
for pat in DEFAULT_EXCLUDES {
- builder
+ combined_builder
.add(&format!("!{pat}"))
.with_context(|| format!("invalid default-exclude pattern: {pat}"))?;
}
for pat in config_exclude {
- builder
+ combined_builder
.add(&format!("!{pat}"))
.with_context(|| format!("invalid workspace.exclude pattern: {pat}"))?;
}
for pat in kbignore_patterns {
- builder
+ combined_builder
.add(&format!("!{pat}"))
.with_context(|| format!("invalid .kebabignore pattern: {pat}"))?;
}
-
- // p10-1A-1: built-in safety-net blacklist (spec §5.2). 6 patterns
- // universal across ecosystems. User can negate via `.kebabignore`.
for pat in kebab_parse_code::BUILTIN_BLACKLIST {
- builder
+ combined_builder
.add(&format!("!{pat}"))
.with_context(|| format!("built-in blacklist pattern: {pat}"))?;
}
-
- // p10-1A-1: honor repo-root `.gitignore` (spec §5.2). Read once,
- // merge with same convention as user `.kebabignore`. Nested
- // cascade deferred to P+.
- let gitignore_patterns = read_gitignore(root)?;
for pat in &gitignore_patterns {
- builder
+ combined_builder
.add(&format!("!{pat}"))
.with_context(|| format!(".gitignore pattern: {pat}"))?;
}
+ let combined = combined_builder
+ .build()
+ .context("failed to compile combined override set")?;
- builder.build().context("failed to compile override set")
+ Ok(WalkOverrides {
+ combined,
+ gitignore,
+ kebabignore,
+ builtin,
+ })
+}
+
+/// Classify why a path was excluded, using per-source matchers in spec §5.2
+/// priority order: built-in > gitignore > kebabignore > other.
+///
+/// `rel` must be relative to the walker root (same as `Override::matched`
+/// expects). `is_dir` should match what the original walker saw.
+pub(crate) fn classify_skip(rel: &Path, is_dir: bool, ov: &WalkOverrides) -> SkipCategory {
+ if ov.builtin.matched(rel, is_dir).is_ignore() {
+ return SkipCategory::BuiltinBlacklist;
+ }
+ if ov.gitignore.matched(rel, is_dir).is_ignore() {
+ return SkipCategory::Gitignore;
+ }
+ if ov.kebabignore.matched(rel, is_dir).is_ignore() {
+ return SkipCategory::Kebabignore;
+ }
+ SkipCategory::Other
}
/// Read `/.gitignore` (single-file, root-only — nested cascade is P+).
@@ -161,51 +284,87 @@ pub(crate) fn read_kbignore(root: &Path) -> Result> {
.collect())
}
-/// Iterate every regular file under `root`, applying `overrides` and
-/// detecting symlink cycles. Returns absolute file paths.
+/// Skipped-path record emitted by `walk_files_with_skips`.
+///
+/// `path` is the absolute path of the excluded entry (dir or file).
+/// For excluded directories, this is the directory itself — individual
+/// files inside are not enumerated (the subtree is pruned).
+pub(crate) struct SkippedEntry {
+ pub path: PathBuf,
+ pub category: SkipCategory,
+}
+
+/// Iterate every regular file under `root`, applying `overrides.combined` and
+/// detecting symlink cycles. Returns:
+/// - `accepted`: absolute paths of files that passed all filters.
+/// - `skipped`: entries that were excluded, with attribution.
+///
+/// For excluded *directories*, the directory path itself is returned (not the
+/// individual files inside — the subtree is pruned in one step, matching the
+/// walker's `skip_current_dir` behavior).
///
/// Strategy:
/// - `walkdir::WalkDir::follow_links(true)` to traverse symlinks.
+/// - Manual per-entry check (instead of `filter_entry`) so we can capture
+/// the excluded paths for skip attribution.
/// - Maintain `visited: HashSet` of *canonical* paths. Before
/// descending into a directory entry, canonicalize and check the set;
/// if already present, skip. This breaks `a -> b -> a` cycles in O(n)
/// per entry without a custom recursive walker.
-/// - For each yielded entry, ask `overrides` whether it is excluded; if
-/// so, drop it. If the entry is a directory, also short-circuit
-/// `WalkDir`'s descent via `it.skip_current_dir()`.
-pub(crate) fn walk_files(root: &Path, overrides: &Override) -> Result> {
- let mut out = Vec::new();
+pub(crate) fn walk_files_with_skips(
+ root: &Path,
+ overrides: &WalkOverrides,
+) -> Result<(Vec, Vec)> {
+ let mut accepted = Vec::new();
+ let mut skipped: Vec = Vec::new();
let mut visited: HashSet = HashSet::new();
+ // Use a non-filtering iterator so we see excluded entries too.
let walker = WalkDir::new(root).follow_links(true).into_iter();
- let mut it = walker.filter_entry(|e| !is_excluded(e, root, overrides));
+ // We still use filter_entry for the *combined* override so that walkdir
+ // can short-circuit pruned directories. But we wrap it so we can capture
+ // the exclusion reason before discarding the entry.
+ //
+ // Problem: filter_entry discards without letting us see the entry first.
+ // Solution: use the raw iterator (no filter_entry) and manage skip_current_dir
+ // manually, which lets us record what was excluded before pruning.
+ let mut it = walker;
while let Some(res) = it.next() {
let entry = match res {
Ok(e) => e,
Err(err) => {
- // `walkdir` surfaces I/O errors AND its own cycle detector
- // (when follow_links is on it sometimes catches them).
- // Either way: log and skip; do not abort the whole scan.
tracing::warn!(error = %err, "walkdir entry error; skipping");
continue;
}
};
let path = entry.path();
+ let rel = match path.strip_prefix(root) {
+ Ok(p) => p,
+ Err(_) => path,
+ };
+ let is_dir = entry.file_type().is_dir();
+ let excluded = overrides.combined.matched(rel, is_dir).is_ignore();
- // Cycle guard: only canonicalize symlinks (cheap on the common case
- // of plain files/dirs) and on directories that are followed via a
- // symlink. `walkdir`'s `path_is_symlink()` is true when the entry's
- // *original* path is a symlink (it returns true for the link, not
- // for the resolved target). For non-symlinked directories we still
- // record the canonical path so a *later* symlink that points back
- // to one of them is detected.
- if entry.file_type().is_dir() {
+ if excluded {
+ let cat = classify_skip(rel, is_dir, overrides);
+ skipped.push(SkippedEntry {
+ path: path.to_path_buf(),
+ category: cat,
+ });
+ if is_dir {
+ // Prune the subtree — don't descend into excluded dirs.
+ it.skip_current_dir();
+ }
+ continue;
+ }
+
+ // Cycle guard for directories.
+ if is_dir {
match std::fs::canonicalize(path) {
Ok(canon) => {
if !visited.insert(canon) {
- // Already visited via another path → break cycle.
it.skip_current_dir();
continue;
}
@@ -222,26 +381,13 @@ pub(crate) fn walk_files(root: &Path, overrides: &Override) -> Result bool {
- // `Override::matched(path, is_dir)` uses the path *relative to* the
- // override builder's root. `walkdir` gives absolute paths when
- // `WalkDir::new` was given an absolute path — strip the root prefix
- // before consulting the override.
- let rel = match entry.path().strip_prefix(root) {
- Ok(p) => p,
- Err(_) => entry.path(),
- };
- overrides
- .matched(rel, entry.file_type().is_dir())
- .is_ignore()
-}
#[cfg(test)]
mod tests {
@@ -252,7 +398,7 @@ mod tests {
let dir = tempfile::tempdir().unwrap();
let ov = build_overrides(dir.path(), &[], &[]).unwrap();
// Default-excludes only; non-special files should not match.
- let m = ov.matched(Path::new("notes/alpha.md"), false);
+ let m = ov.combined.matched(Path::new("notes/alpha.md"), false);
assert!(!m.is_ignore());
}
@@ -260,13 +406,13 @@ mod tests {
fn default_excludes_ds_store_and_resource_forks() {
let dir = tempfile::tempdir().unwrap();
let ov = build_overrides(dir.path(), &[], &[]).unwrap();
- assert!(ov.matched(Path::new(".DS_Store"), false).is_ignore());
+ assert!(ov.combined.matched(Path::new(".DS_Store"), false).is_ignore());
assert!(
- ov.matched(Path::new("notes/.DS_Store"), false).is_ignore()
+ ov.combined.matched(Path::new("notes/.DS_Store"), false).is_ignore()
);
- assert!(ov.matched(Path::new("._foo.md"), false).is_ignore());
+ assert!(ov.combined.matched(Path::new("._foo.md"), false).is_ignore());
assert!(
- ov.matched(Path::new("notes/._sidecar"), false).is_ignore()
+ ov.combined.matched(Path::new("notes/._sidecar"), false).is_ignore()
);
}
@@ -279,13 +425,13 @@ mod tests {
&[],
)
.unwrap();
- assert!(ov.matched(Path::new("a.tmp"), false).is_ignore());
- assert!(ov.matched(Path::new("notes/x.tmp"), false).is_ignore());
+ assert!(ov.combined.matched(Path::new("a.tmp"), false).is_ignore());
+ assert!(ov.combined.matched(Path::new("notes/x.tmp"), false).is_ignore());
assert!(
- ov.matched(Path::new("node_modules/foo/bar.js"), false)
+ ov.combined.matched(Path::new("node_modules/foo/bar.js"), false)
.is_ignore()
);
- assert!(!ov.matched(Path::new("alpha.md"), false).is_ignore());
+ assert!(!ov.combined.matched(Path::new("alpha.md"), false).is_ignore());
}
#[test]
@@ -298,9 +444,9 @@ mod tests {
&["secret/**".to_string()],
)
.unwrap();
- assert!(ov.matched(Path::new("a.tmp"), false).is_ignore());
+ assert!(ov.combined.matched(Path::new("a.tmp"), false).is_ignore());
assert!(
- ov.matched(Path::new("secret/key.md"), false).is_ignore()
+ ov.combined.matched(Path::new("secret/key.md"), false).is_ignore()
);
}
@@ -337,8 +483,8 @@ mod tests {
let overrides = build_overrides(root, &[], &[]).unwrap();
// Override::matched expects paths relative to the builder's root.
- let m_in = overrides.matched(Path::new("src/main.rs"), false);
- let m_out = overrides.matched(Path::new("node_modules/foo/bar.js"), false);
+ let m_in = overrides.combined.matched(Path::new("src/main.rs"), false);
+ let m_out = overrides.combined.matched(Path::new("node_modules/foo/bar.js"), false);
assert!(!m_in.is_ignore(), "src/main.rs should NOT be ignored");
assert!(m_out.is_ignore(), "node_modules/foo/bar.js SHOULD be ignored");
@@ -367,10 +513,10 @@ mod tests {
"venv/x/y.txt",
"env/x/y.txt",
] {
- let m = overrides.matched(Path::new(blacklisted), false);
+ let m = overrides.combined.matched(Path::new(blacklisted), false);
assert!(m.is_ignore(), "{blacklisted} should be ignored");
}
- let m_ok = overrides.matched(Path::new("ok/z.txt"), false);
+ let m_ok = overrides.combined.matched(Path::new("ok/z.txt"), false);
assert!(!m_ok.is_ignore(), "ok/z.txt should not be ignored");
}
@@ -389,9 +535,9 @@ mod tests {
fs::write(root.join("dist/bundle.js"), "x").unwrap();
let overrides = build_overrides(root, &[], &[]).unwrap();
- assert!(overrides.matched(Path::new("a.log"), false).is_ignore());
- assert!(overrides.matched(Path::new("dist/bundle.js"), false).is_ignore());
- assert!(!overrides.matched(Path::new("src/main.rs"), false).is_ignore());
+ assert!(overrides.combined.matched(Path::new("a.log"), false).is_ignore());
+ assert!(overrides.combined.matched(Path::new("dist/bundle.js"), false).is_ignore());
+ assert!(!overrides.combined.matched(Path::new("src/main.rs"), false).is_ignore());
}
#[test]
@@ -407,8 +553,8 @@ mod tests {
// No .gitignore present — patterns from .gitignore should not affect overrides.
let overrides = build_overrides(root, &[], &[]).unwrap();
- assert!(!overrides.matched(Path::new("a.log"), false).is_ignore());
- assert!(!overrides.matched(Path::new("src/main.rs"), false).is_ignore());
+ assert!(!overrides.combined.matched(Path::new("a.log"), false).is_ignore());
+ assert!(!overrides.combined.matched(Path::new("src/main.rs"), false).is_ignore());
}
#[test]
@@ -424,4 +570,128 @@ mod tests {
let result = build_overrides(root, &[], &[]);
assert!(result.is_ok(), "should not error on negation pattern: {:?}", result.err());
}
+
+ // ── Skip attribution tests ────────────────────────────────────────────────
+
+ #[test]
+ fn classify_skip_attributes_builtin_over_gitignore() {
+ use std::fs;
+ use tempfile::TempDir;
+
+ let tmp = TempDir::new().unwrap();
+ let root = tmp.path();
+ // node_modules matches both BUILTIN_BLACKLIST and a hypothetical
+ // .gitignore entry. Builtin must win (priority order §5.2).
+ fs::write(root.join(".gitignore"), "node_modules/\n").unwrap();
+
+ let ov = build_overrides(root, &[], &[]).unwrap();
+ // node_modules/ dir itself
+ let cat = classify_skip(Path::new("node_modules"), true, &ov);
+ assert_eq!(cat, SkipCategory::BuiltinBlacklist, "builtin must have priority");
+ }
+
+ #[test]
+ fn classify_skip_attributes_gitignore_for_log_files() {
+ use std::fs;
+ use tempfile::TempDir;
+
+ let tmp = TempDir::new().unwrap();
+ let root = tmp.path();
+ fs::write(root.join(".gitignore"), "*.log\n").unwrap();
+
+ let ov = build_overrides(root, &[], &[]).unwrap();
+ let cat = classify_skip(Path::new("app.log"), false, &ov);
+ assert_eq!(cat, SkipCategory::Gitignore);
+ }
+
+ #[test]
+ fn classify_skip_attributes_kebabignore() {
+ use tempfile::TempDir;
+
+ let tmp = TempDir::new().unwrap();
+ let root = tmp.path();
+
+ let ov = build_overrides(root, &[], &["*.secret".to_string()]).unwrap();
+ let cat = classify_skip(Path::new("creds.secret"), false, &ov);
+ assert_eq!(cat, SkipCategory::Kebabignore);
+ }
+
+ #[test]
+ fn walk_files_with_skips_counts_gitignored_files() {
+ use std::fs;
+ use tempfile::TempDir;
+
+ let tmp = TempDir::new().unwrap();
+ let root = tmp.path();
+ fs::write(root.join(".gitignore"), "*.log\n").unwrap();
+ fs::write(root.join("ok.md"), "# ok").unwrap();
+ fs::write(root.join("skipme.log"), "x").unwrap();
+
+ let ov = build_overrides(root, &[], &[]).unwrap();
+ let (accepted, skipped_entries) = walk_files_with_skips(root, &ov).unwrap();
+
+ let accepted_names: Vec<_> = accepted
+ .iter()
+ .map(|p| p.file_name().unwrap().to_string_lossy().into_owned())
+ .collect();
+ assert!(
+ accepted_names.iter().any(|n| n == "ok.md"),
+ "ok.md should be accepted; got: {accepted_names:?}"
+ );
+ assert!(
+ !accepted_names.iter().any(|n| n == "skipme.log"),
+ "skipme.log should not be accepted; got: {accepted_names:?}"
+ );
+
+ let gitignore_skipped: Vec<_> = skipped_entries
+ .iter()
+ .filter(|e| e.category == SkipCategory::Gitignore)
+ .collect();
+ assert!(
+ gitignore_skipped.iter().any(|e| e.path.file_name()
+ .map(|n| n == "skipme.log")
+ .unwrap_or(false)),
+ "skipme.log should appear in gitignore_skipped; skipped: {:?}",
+ skipped_entries.iter().map(|e| &e.path).collect::>()
+ );
+ }
+
+ #[test]
+ fn walk_files_with_skips_counts_builtin_blacklist_dirs() {
+ use std::fs;
+ use tempfile::TempDir;
+
+ let tmp = TempDir::new().unwrap();
+ let root = tmp.path();
+ fs::create_dir_all(root.join("node_modules/foo")).unwrap();
+ fs::write(root.join("node_modules/foo/bar.js"), "x").unwrap();
+ fs::write(root.join("ok.md"), "# ok").unwrap();
+
+ let ov = build_overrides(root, &[], &[]).unwrap();
+ let (accepted, skipped_entries) = walk_files_with_skips(root, &ov).unwrap();
+
+ let accepted_names: Vec<_> = accepted
+ .iter()
+ .map(|p| p.file_name().unwrap().to_string_lossy().into_owned())
+ .collect();
+ assert!(
+ accepted_names.iter().any(|n| n == "ok.md"),
+ "ok.md must be accepted; got: {accepted_names:?}"
+ );
+
+ let builtin_skipped: Vec<_> = skipped_entries
+ .iter()
+ .filter(|e| e.category == SkipCategory::BuiltinBlacklist)
+ .collect();
+ assert!(
+ !builtin_skipped.is_empty(),
+ "node_modules/ should produce at least one BuiltinBlacklist skip"
+ );
+ assert!(
+ builtin_skipped.iter().any(|e| e.path.components()
+ .any(|c| c.as_os_str() == "node_modules")),
+ "skipped path should contain node_modules; got: {:?}",
+ builtin_skipped.iter().map(|e| &e.path).collect::>()
+ );
+ }
}