feat(p10-1a-1): wire IngestReport skip counters by category (gitignore/builtin/kebabignore)
Refactor walker to expose WalkOverrides (combined + per-source matchers), add walk_files_with_skips that returns accepted files alongside skip attribution, wire FsSourceConnector::scan_with_skips into kebab-app so IngestReport.skipped_gitignore, skipped_kebabignore, skipped_builtin_blacklist, and skip_examples are populated instead of left at zero. Priority order per spec §5.2 (builtin > gitignore > kebabignore) enforced in classify_skip, with a directory-aware builtin matcher so pruned directory entries are correctly attributed to builtin rather than a coincident gitignore entry. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -44,7 +44,7 @@ use kebab_core::{
|
|||||||
Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
|
Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker,
|
||||||
DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput,
|
DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput,
|
||||||
EmbeddingKind, ExtractContext, Extractor, IngestReport, Lang, LanguageModel, MediaType,
|
EmbeddingKind, ExtractContext, Extractor, IngestReport, Lang, LanguageModel, MediaType,
|
||||||
ParserVersion, RawAsset, SearchHit, SearchQuery, SkipExamples, SourceConnector, SourceScope,
|
ParserVersion, RawAsset, SearchHit, SearchQuery, SourceScope,
|
||||||
SourceUri, VectorRecord, VectorStore,
|
SourceUri, VectorRecord, VectorStore,
|
||||||
};
|
};
|
||||||
use kebab_llm_local::OllamaLanguageModel;
|
use kebab_llm_local::OllamaLanguageModel;
|
||||||
@@ -305,8 +305,8 @@ pub fn ingest_with_config_opts(
|
|||||||
);
|
);
|
||||||
let connector = FsSourceConnector::new(&app.config)
|
let connector = FsSourceConnector::new(&app.config)
|
||||||
.context("kb-app::ingest: build FsSourceConnector")?;
|
.context("kb-app::ingest: build FsSourceConnector")?;
|
||||||
let assets = connector
|
let (assets, fs_skips) = connector
|
||||||
.scan(&scope)
|
.scan_with_skips(&scope)
|
||||||
.context("kb-app::ingest: scan workspace")?;
|
.context("kb-app::ingest: scan workspace")?;
|
||||||
crate::ingest_progress::emit(
|
crate::ingest_progress::emit(
|
||||||
progress,
|
progress,
|
||||||
@@ -675,12 +675,12 @@ pub fn ingest_with_config_opts(
|
|||||||
errors: error_count,
|
errors: error_count,
|
||||||
duration_ms,
|
duration_ms,
|
||||||
skipped_by_extension,
|
skipped_by_extension,
|
||||||
skipped_gitignore: 0,
|
skipped_gitignore: fs_skips.skipped_gitignore,
|
||||||
skipped_kebabignore: 0,
|
skipped_kebabignore: fs_skips.skipped_kebabignore,
|
||||||
skipped_builtin_blacklist: 0,
|
skipped_builtin_blacklist: fs_skips.skipped_builtin_blacklist,
|
||||||
skipped_generated: 0,
|
skipped_generated: 0,
|
||||||
skipped_size_exceeded: 0,
|
skipped_size_exceeded: 0,
|
||||||
skip_examples: SkipExamples::default(),
|
skip_examples: fs_skips.skip_examples,
|
||||||
items: if summary_only { None } else { Some(items) },
|
items: if summary_only { None } else { Some(items) },
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,20 +10,20 @@
|
|||||||
//! }
|
//! }
|
||||||
//! ```
|
//! ```
|
||||||
|
|
||||||
use std::path::PathBuf;
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use time::OffsetDateTime;
|
use time::OffsetDateTime;
|
||||||
|
|
||||||
use kebab_config::Config;
|
use kebab_config::Config;
|
||||||
use kebab_core::{
|
use kebab_core::{
|
||||||
AssetStorage, Checksum, RawAsset, SourceConnector, SourceScope, SourceUri,
|
AssetStorage, Checksum, RawAsset, SkipExamples, SourceConnector, SourceScope, SourceUri,
|
||||||
id_for_asset, to_posix,
|
id_for_asset, to_posix,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::hash::hash_file;
|
use crate::hash::hash_file;
|
||||||
use crate::media::media_type_for;
|
use crate::media::media_type_for;
|
||||||
use crate::walker::{build_overrides, read_kbignore, walk_files};
|
use crate::walker::{SkipCategory, WalkOverrides, build_overrides, read_kbignore, walk_files_with_skips};
|
||||||
|
|
||||||
/// Local-filesystem `SourceConnector`. Constructed once from `Config`,
|
/// Local-filesystem `SourceConnector`. Constructed once from `Config`,
|
||||||
/// reused across `scan` calls.
|
/// reused across `scan` calls.
|
||||||
@@ -61,107 +61,179 @@ impl FsSourceConnector {
|
|||||||
copy_threshold_bytes,
|
copy_threshold_bytes,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
impl SourceConnector for FsSourceConnector {
|
/// Resolve the effective root and build the merged + per-source overrides.
|
||||||
fn scan(&self, scope: &SourceScope) -> Result<Vec<RawAsset>> {
|
fn resolve_scan_params(
|
||||||
// `SourceScope::root` overrides config root when non-empty. This
|
&self,
|
||||||
// matches the design's "scope is the per-call lens; config is the
|
scope: &SourceScope,
|
||||||
// default" split (§7.1).
|
) -> Result<(PathBuf, Vec<String>, WalkOverrides)> {
|
||||||
let root = if scope.root.as_os_str().is_empty() {
|
let root = if scope.root.as_os_str().is_empty() {
|
||||||
self.default_root.clone()
|
self.default_root.clone()
|
||||||
} else {
|
} else {
|
||||||
scope.root.clone()
|
scope.root.clone()
|
||||||
};
|
};
|
||||||
|
|
||||||
// Union: config.workspace.exclude ∪ scope.exclude ∪ .kebabignore.
|
|
||||||
// Per §6.2 the union of `.kebabignore` and `config.workspace.exclude`
|
|
||||||
// is the filter set. `scope.exclude` is added on top so a caller
|
|
||||||
// can layer a per-call narrowing.
|
|
||||||
let mut excludes = self.default_exclude.clone();
|
let mut excludes = self.default_exclude.clone();
|
||||||
excludes.extend(scope.exclude.iter().cloned());
|
excludes.extend(scope.exclude.iter().cloned());
|
||||||
// .kebabignore is re-read on every scan() so users can edit it without
|
|
||||||
// restarting any long-running process.
|
|
||||||
let kbignore = read_kbignore(&root)?;
|
let kbignore = read_kbignore(&root)?;
|
||||||
|
|
||||||
let overrides = build_overrides(&root, &excludes, &kbignore)?;
|
let overrides = build_overrides(&root, &excludes, &kbignore)?;
|
||||||
|
Ok((root, kbignore, overrides))
|
||||||
|
}
|
||||||
|
|
||||||
// TODO(P1-2/P1-3 router): apply SourceScope::include glob filter at the
|
/// Scan the workspace and return the accepted assets together with
|
||||||
// extractor router layer once that crate lands. SourceConnector emits all
|
/// per-category skip counts and sample paths for `IngestReport`.
|
||||||
// non-excluded files; routing by include-glob is a downstream concern
|
///
|
||||||
// (design §6.2 + §7.2 are silent on this split, treat it as router work).
|
/// This is the **preferred entry point** for `kebab-app`: it provides
|
||||||
//
|
/// all the information needed to populate `IngestReport.skipped_gitignore`,
|
||||||
// `scope.include` is intentionally ignored at this stage of the
|
/// `skipped_kebabignore`, `skipped_builtin_blacklist`, and `skip_examples`
|
||||||
// pipeline: per §6.2 the workspace-level include lives in
|
/// without a second walker pass.
|
||||||
// `WorkspaceCfg` and is enforced by the asset writer / extractors.
|
pub fn scan_with_skips(
|
||||||
// Surfacing it here would double-filter Markdown vs PDF before the
|
&self,
|
||||||
// extractor router gets to see them.
|
scope: &SourceScope,
|
||||||
if !scope.include.is_empty() {
|
) -> Result<(Vec<RawAsset>, FsScanSkips)> {
|
||||||
tracing::debug!(
|
let (root, _kbignore, overrides) = self.resolve_scan_params(scope)?;
|
||||||
count = scope.include.len(),
|
|
||||||
"FsSourceConnector ignores scope.include — handled by extractor router"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
let files = walk_files(&root, &overrides)?;
|
// Suppress unused-variable warning — kbignore patterns are already
|
||||||
|
// baked into `overrides`; we don't need them again here.
|
||||||
|
|
||||||
let mut assets = Vec::with_capacity(files.len());
|
log_scope_include_warning(scope);
|
||||||
for abs in &files {
|
|
||||||
// `to_posix` does NFC + leading `./` strip + `#` rejection.
|
let (files, skipped_entries) = walk_files_with_skips(&root, &overrides)?;
|
||||||
// Compute the workspace-relative path before handing to it so
|
|
||||||
// emitted `WorkspacePath` is always relative.
|
// Accumulate per-category skip counts and sample paths.
|
||||||
let rel = abs.strip_prefix(&root).unwrap_or(abs);
|
let mut fs_skips = FsScanSkips::default();
|
||||||
let workspace_path = match to_posix(rel) {
|
for entry in &skipped_entries {
|
||||||
Ok(p) => p,
|
match entry.category {
|
||||||
Err(e) => {
|
SkipCategory::BuiltinBlacklist => {
|
||||||
// A path containing `#` is the only documented reason
|
fs_skips.skipped_builtin_blacklist =
|
||||||
// `to_posix` fails today. Drop the file with a warning
|
fs_skips.skipped_builtin_blacklist.saturating_add(1);
|
||||||
// rather than aborting the entire scan — a single bad
|
push_sample(
|
||||||
// filename should not nuke a 10 000-file ingest.
|
&mut fs_skips.skip_examples.builtin_blacklist,
|
||||||
tracing::warn!(
|
&entry.path,
|
||||||
path = %abs.display(),
|
&root,
|
||||||
error = %e,
|
|
||||||
"skipping file: path is not a valid WorkspacePath",
|
|
||||||
);
|
);
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
};
|
SkipCategory::Gitignore => {
|
||||||
|
fs_skips.skipped_gitignore =
|
||||||
let media_type = media_type_for(abs);
|
fs_skips.skipped_gitignore.saturating_add(1);
|
||||||
let (byte_len, full_hex) = hash_file(abs)
|
push_sample(
|
||||||
.with_context(|| format!("hashing {}", abs.display()))?;
|
&mut fs_skips.skip_examples.gitignore,
|
||||||
let checksum = Checksum(full_hex.clone());
|
&entry.path,
|
||||||
let asset_id = id_for_asset(&full_hex);
|
&root,
|
||||||
|
);
|
||||||
// Storage variant signals *intent*, not an actual copy.
|
|
||||||
// P1-6 (asset writer) is responsible for the on-disk copy.
|
|
||||||
let stored = if byte_len > self.copy_threshold_bytes {
|
|
||||||
AssetStorage::Reference {
|
|
||||||
path: abs.clone(),
|
|
||||||
sha: checksum.clone(),
|
|
||||||
}
|
}
|
||||||
} else {
|
SkipCategory::Kebabignore => {
|
||||||
AssetStorage::Copied { path: abs.clone() }
|
fs_skips.skipped_kebabignore =
|
||||||
};
|
fs_skips.skipped_kebabignore.saturating_add(1);
|
||||||
|
// kebabignore intentionally NOT in skip_examples per spec §5.5.
|
||||||
assets.push(RawAsset {
|
}
|
||||||
asset_id,
|
SkipCategory::Other => {
|
||||||
source_uri: SourceUri::File(abs.clone()),
|
// DEFAULT_EXCLUDES or config.workspace.exclude — no dedicated
|
||||||
workspace_path,
|
// IngestReport counter; these are lumped into the existing
|
||||||
media_type,
|
// `skipped` field by kebab-app.
|
||||||
byte_len,
|
}
|
||||||
checksum,
|
}
|
||||||
discovered_at: OffsetDateTime::now_utc(),
|
|
||||||
stored,
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Determinism: sort by workspace_path. WorkspacePath is a String
|
let assets = build_assets(&files, &root, self.copy_threshold_bytes)?;
|
||||||
// newtype with stable lexicographic ordering. Two scans of the
|
Ok((assets, fs_skips))
|
||||||
// same tree must produce identical Vec<RawAsset> modulo the
|
}
|
||||||
// wall-clock `discovered_at` field.
|
}
|
||||||
assets.sort_by(|a, b| a.workspace_path.0.cmp(&b.workspace_path.0));
|
|
||||||
|
|
||||||
|
/// Per-category skip counts and sample paths returned alongside the asset list
|
||||||
|
/// by [`FsSourceConnector::scan_with_skips`].
|
||||||
|
///
|
||||||
|
/// Populated from the walker's per-source matchers without a second pass.
|
||||||
|
#[derive(Debug, Default)]
|
||||||
|
pub struct FsScanSkips {
|
||||||
|
pub skipped_gitignore: u32,
|
||||||
|
pub skipped_kebabignore: u32,
|
||||||
|
pub skipped_builtin_blacklist: u32,
|
||||||
|
/// Sample paths per spec §5.5 (≤ 5 per category). Paths are
|
||||||
|
/// workspace-relative POSIX strings when available, absolute otherwise.
|
||||||
|
pub skip_examples: SkipExamples,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Push a path into a sample vec (cap = 5) as a workspace-relative POSIX
|
||||||
|
/// string. Falls back to the lossy absolute path if relativisation fails.
|
||||||
|
fn push_sample(samples: &mut Vec<String>, abs: &Path, root: &Path) {
|
||||||
|
if samples.len() >= 5 {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
let rel = abs.strip_prefix(root).unwrap_or(abs);
|
||||||
|
// Best-effort POSIX string; any non-UTF8 char → replacement char.
|
||||||
|
let s = rel.to_string_lossy().replace('\\', "/");
|
||||||
|
samples.push(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert a list of absolute file paths to `Vec<RawAsset>`, sorted by
|
||||||
|
/// workspace-relative POSIX path for determinism.
|
||||||
|
fn build_assets(
|
||||||
|
files: &[PathBuf],
|
||||||
|
root: &Path,
|
||||||
|
copy_threshold_bytes: u64,
|
||||||
|
) -> Result<Vec<RawAsset>> {
|
||||||
|
let mut assets = Vec::with_capacity(files.len());
|
||||||
|
for abs in files {
|
||||||
|
let rel = abs.strip_prefix(root).unwrap_or(abs);
|
||||||
|
let workspace_path = match to_posix(rel) {
|
||||||
|
Ok(p) => p,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(
|
||||||
|
path = %abs.display(),
|
||||||
|
error = %e,
|
||||||
|
"skipping file: path is not a valid WorkspacePath",
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let media_type = media_type_for(abs);
|
||||||
|
let (byte_len, full_hex) = hash_file(abs)
|
||||||
|
.with_context(|| format!("hashing {}", abs.display()))?;
|
||||||
|
let checksum = Checksum(full_hex.clone());
|
||||||
|
let asset_id = id_for_asset(&full_hex);
|
||||||
|
|
||||||
|
let stored = if byte_len > copy_threshold_bytes {
|
||||||
|
AssetStorage::Reference {
|
||||||
|
path: abs.clone(),
|
||||||
|
sha: checksum.clone(),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
AssetStorage::Copied { path: abs.clone() }
|
||||||
|
};
|
||||||
|
|
||||||
|
assets.push(RawAsset {
|
||||||
|
asset_id,
|
||||||
|
source_uri: SourceUri::File(abs.clone()),
|
||||||
|
workspace_path,
|
||||||
|
media_type,
|
||||||
|
byte_len,
|
||||||
|
checksum,
|
||||||
|
discovered_at: OffsetDateTime::now_utc(),
|
||||||
|
stored,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
assets.sort_by(|a, b| a.workspace_path.0.cmp(&b.workspace_path.0));
|
||||||
|
Ok(assets)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn log_scope_include_warning(scope: &SourceScope) {
|
||||||
|
if !scope.include.is_empty() {
|
||||||
|
tracing::debug!(
|
||||||
|
count = scope.include.len(),
|
||||||
|
"FsSourceConnector ignores scope.include — handled by extractor router"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SourceConnector for FsSourceConnector {
|
||||||
|
fn scan(&self, scope: &SourceScope) -> Result<Vec<RawAsset>> {
|
||||||
|
// Delegate to scan_with_skips; discard the skip counts.
|
||||||
|
// Callers that need skip attribution should call scan_with_skips directly.
|
||||||
|
let (assets, _skips) = self.scan_with_skips(scope)?;
|
||||||
Ok(assets)
|
Ok(assets)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -401,4 +473,142 @@ mod tests {
|
|||||||
let v2 = conn2.scan(&SourceScope::default()).unwrap();
|
let v2 = conn2.scan(&SourceScope::default()).unwrap();
|
||||||
assert!(matches!(v2[0].stored, AssetStorage::Copied { .. }));
|
assert!(matches!(v2[0].stored, AssetStorage::Copied { .. }));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── IngestReport skip counter wiring tests ───────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn scan_with_skips_counts_gitignored_files() {
|
||||||
|
let dir = tempfile::tempdir().unwrap();
|
||||||
|
let root = dir.path();
|
||||||
|
std::fs::write(root.join(".gitignore"), "*.log\n").unwrap();
|
||||||
|
std::fs::write(root.join("ok.md"), b"# ok").unwrap();
|
||||||
|
std::fs::write(root.join("skipme.log"), b"x").unwrap();
|
||||||
|
|
||||||
|
let conn =
|
||||||
|
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
|
||||||
|
.unwrap();
|
||||||
|
let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap();
|
||||||
|
|
||||||
|
assert!(
|
||||||
|
skips.skipped_gitignore >= 1,
|
||||||
|
"skipped_gitignore should be >= 1; got {}",
|
||||||
|
skips.skipped_gitignore
|
||||||
|
);
|
||||||
|
assert!(
|
||||||
|
skips.skip_examples.gitignore.iter().any(|p| p.contains("skipme.log")),
|
||||||
|
"skip_examples.gitignore should contain 'skipme.log'; got: {:?}",
|
||||||
|
skips.skip_examples.gitignore
|
||||||
|
);
|
||||||
|
// kebabignore counter must be 0 — file matched gitignore, not kebabignore.
|
||||||
|
assert_eq!(skips.skipped_kebabignore, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn scan_with_skips_counts_builtin_blacklist_dirs() {
|
||||||
|
let dir = tempfile::tempdir().unwrap();
|
||||||
|
let root = dir.path();
|
||||||
|
std::fs::create_dir_all(root.join("node_modules/foo")).unwrap();
|
||||||
|
std::fs::write(root.join("node_modules/foo/bar.js"), b"x").unwrap();
|
||||||
|
std::fs::write(root.join("ok.md"), b"# ok").unwrap();
|
||||||
|
|
||||||
|
let conn =
|
||||||
|
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
|
||||||
|
.unwrap();
|
||||||
|
let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap();
|
||||||
|
|
||||||
|
assert!(
|
||||||
|
skips.skipped_builtin_blacklist >= 1,
|
||||||
|
"skipped_builtin_blacklist should be >= 1; got {}",
|
||||||
|
skips.skipped_builtin_blacklist
|
||||||
|
);
|
||||||
|
assert!(
|
||||||
|
skips.skip_examples.builtin_blacklist.iter().any(|p| p.contains("node_modules")),
|
||||||
|
"skip_examples.builtin_blacklist should contain a node_modules path; got: {:?}",
|
||||||
|
skips.skip_examples.builtin_blacklist
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn scan_with_skips_kebabignore_increments_counter_no_example() {
|
||||||
|
let dir = tempfile::tempdir().unwrap();
|
||||||
|
let root = dir.path();
|
||||||
|
std::fs::write(root.join(".kebabignore"), "*.secret\n").unwrap();
|
||||||
|
std::fs::write(root.join("ok.md"), b"x").unwrap();
|
||||||
|
std::fs::write(root.join("creds.secret"), b"pw").unwrap();
|
||||||
|
|
||||||
|
let conn =
|
||||||
|
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
|
||||||
|
.unwrap();
|
||||||
|
let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap();
|
||||||
|
|
||||||
|
assert!(
|
||||||
|
skips.skipped_kebabignore >= 1,
|
||||||
|
"skipped_kebabignore should be >= 1; got {}",
|
||||||
|
skips.skipped_kebabignore
|
||||||
|
);
|
||||||
|
// Per spec §5.5: kebabignore is intentionally NOT in skip_examples.
|
||||||
|
assert!(
|
||||||
|
skips.skip_examples.gitignore.is_empty(),
|
||||||
|
"gitignore examples should be empty; got: {:?}",
|
||||||
|
skips.skip_examples.gitignore
|
||||||
|
);
|
||||||
|
assert!(
|
||||||
|
skips.skip_examples.builtin_blacklist.is_empty(),
|
||||||
|
"builtin_blacklist examples should be empty; got: {:?}",
|
||||||
|
skips.skip_examples.builtin_blacklist
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn scan_with_skips_builtin_priority_over_gitignore() {
|
||||||
|
// node_modules/ matches both BUILTIN_BLACKLIST and a .gitignore entry.
|
||||||
|
// It must be attributed to builtin (spec §5.2 priority order).
|
||||||
|
let dir = tempfile::tempdir().unwrap();
|
||||||
|
let root = dir.path();
|
||||||
|
std::fs::write(root.join(".gitignore"), "node_modules/\n").unwrap();
|
||||||
|
std::fs::create_dir_all(root.join("node_modules/pkg")).unwrap();
|
||||||
|
std::fs::write(root.join("node_modules/pkg/index.js"), b"x").unwrap();
|
||||||
|
std::fs::write(root.join("ok.md"), b"x").unwrap();
|
||||||
|
|
||||||
|
let conn =
|
||||||
|
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
|
||||||
|
.unwrap();
|
||||||
|
let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap();
|
||||||
|
|
||||||
|
assert!(
|
||||||
|
skips.skipped_builtin_blacklist >= 1,
|
||||||
|
"builtin counter should be >= 1; got {}",
|
||||||
|
skips.skipped_builtin_blacklist
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
skips.skipped_gitignore, 0,
|
||||||
|
"gitignore counter must be 0 when builtin wins; got {}",
|
||||||
|
skips.skipped_gitignore
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn skip_examples_cap_at_five() {
|
||||||
|
// Write 7 .log files — skip_examples.gitignore must cap at 5.
|
||||||
|
let dir = tempfile::tempdir().unwrap();
|
||||||
|
let root = dir.path();
|
||||||
|
std::fs::write(root.join(".gitignore"), "*.log\n").unwrap();
|
||||||
|
for i in 0..7 {
|
||||||
|
std::fs::write(root.join(format!("f{i}.log")), b"x").unwrap();
|
||||||
|
}
|
||||||
|
std::fs::write(root.join("ok.md"), b"x").unwrap();
|
||||||
|
|
||||||
|
let conn =
|
||||||
|
FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap()))
|
||||||
|
.unwrap();
|
||||||
|
let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap();
|
||||||
|
|
||||||
|
assert_eq!(skips.skipped_gitignore, 7, "should count all 7");
|
||||||
|
assert_eq!(
|
||||||
|
skips.skip_examples.gitignore.len(),
|
||||||
|
5,
|
||||||
|
"skip_examples.gitignore must cap at 5; got: {:?}",
|
||||||
|
skips.skip_examples.gitignore
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,4 +13,4 @@ mod hash;
|
|||||||
mod media;
|
mod media;
|
||||||
mod walker;
|
mod walker;
|
||||||
|
|
||||||
pub use connector::FsSourceConnector;
|
pub use connector::{FsScanSkips, FsSourceConnector};
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ use std::path::{Path, PathBuf};
|
|||||||
|
|
||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use ignore::overrides::{Override, OverrideBuilder};
|
use ignore::overrides::{Override, OverrideBuilder};
|
||||||
use walkdir::{DirEntry, WalkDir};
|
use walkdir::WalkDir;
|
||||||
|
|
||||||
/// Default-excludes baked into the connector. These are NOT configurable;
|
/// Default-excludes baked into the connector. These are NOT configurable;
|
||||||
/// they cover noise that is never useful to ingest and would otherwise need
|
/// they cover noise that is never useful to ingest and would otherwise need
|
||||||
@@ -49,7 +49,96 @@ const DEFAULT_EXCLUDES: &[&str] = &[
|
|||||||
"**/._*",
|
"**/._*",
|
||||||
];
|
];
|
||||||
|
|
||||||
/// Build the merged `Override` from all five filter sources, in order:
|
/// Per-source `Override` matchers for skip-counter attribution (spec §5.5).
|
||||||
|
///
|
||||||
|
/// `combined` is the merged union of all sources — used for the actual
|
||||||
|
/// "is this entry excluded?" decision in the walker. The three per-source
|
||||||
|
/// matchers (`gitignore`, `kebabignore`, `builtin`) are used ONLY when
|
||||||
|
/// classifying an already-excluded path for `IngestReport` counter wiring;
|
||||||
|
/// they are never consulted for every walked file.
|
||||||
|
///
|
||||||
|
/// `default_and_config` covers DEFAULT_EXCLUDES + `config.workspace.exclude`
|
||||||
|
/// — these do NOT map to any of the three named `IngestReport` counters.
|
||||||
|
pub(crate) struct WalkOverrides {
|
||||||
|
/// Merged matcher — same as today's `Override`; used for the walk decision.
|
||||||
|
pub combined: Override,
|
||||||
|
/// Matcher built from `<root>/.gitignore` patterns only.
|
||||||
|
pub gitignore: Override,
|
||||||
|
/// Matcher built from `<root>/.kebabignore` patterns only.
|
||||||
|
pub kebabignore: Override,
|
||||||
|
/// Matcher built from `kebab_parse_code::BUILTIN_BLACKLIST` only.
|
||||||
|
pub builtin: Override,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Skip attribution category. Used by the connector when counting per-source
|
||||||
|
/// skips for `IngestReport` (spec §5.5).
|
||||||
|
///
|
||||||
|
/// Priority order per spec §5.2: built-in > gitignore > kebabignore.
|
||||||
|
/// A path matching multiple sources is attributed to the first match.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
|
pub(crate) enum SkipCategory {
|
||||||
|
BuiltinBlacklist,
|
||||||
|
Gitignore,
|
||||||
|
Kebabignore,
|
||||||
|
/// Matched DEFAULT_EXCLUDES or `config.workspace.exclude`. No dedicated
|
||||||
|
/// counter in `IngestReport` — lumped into the existing `skipped` field.
|
||||||
|
Other,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build a single `Override` from a list of gitignore-style patterns, all
|
||||||
|
/// registered as excludes (prepend `!`).
|
||||||
|
///
|
||||||
|
/// Empty pattern list → an `Override` that matches nothing (i.e. no
|
||||||
|
/// exclusions). Callers must strip blanks / comments before passing.
|
||||||
|
fn build_single_matcher(root: &Path, patterns: &[&str]) -> Result<Override> {
|
||||||
|
let mut builder = OverrideBuilder::new(root);
|
||||||
|
for pat in patterns {
|
||||||
|
builder
|
||||||
|
.add(&format!("!{pat}"))
|
||||||
|
.with_context(|| format!("invalid pattern: {pat}"))?;
|
||||||
|
}
|
||||||
|
builder.build().context("failed to compile override")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build the builtin-blacklist `Override`, adding directory-level patterns in
|
||||||
|
/// addition to the `/**`-suffix ones from `BUILTIN_BLACKLIST`.
|
||||||
|
///
|
||||||
|
/// BUILTIN_BLACKLIST uses `**/X/**` patterns which match files *inside* X but
|
||||||
|
/// NOT the directory entry `X` itself (because `**/X/**` requires a path
|
||||||
|
/// component after X). The walker prunes at the directory level (`is_dir=true`),
|
||||||
|
/// so we need `**/X` (no trailing `/**`) to also match the directory itself
|
||||||
|
/// for attribution purposes.
|
||||||
|
fn build_builtin_matcher(root: &Path) -> Result<Override> {
|
||||||
|
let mut builder = OverrideBuilder::new(root);
|
||||||
|
for pat in kebab_parse_code::BUILTIN_BLACKLIST {
|
||||||
|
// Register the original pattern (matches files inside the dir).
|
||||||
|
builder
|
||||||
|
.add(&format!("!{pat}"))
|
||||||
|
.with_context(|| format!("builtin pattern: {pat}"))?;
|
||||||
|
// Also derive a directory-level match by stripping trailing `/**`.
|
||||||
|
// This makes `is_dir=true` checks on the directory itself work.
|
||||||
|
if let Some(dir_pat) = pat.strip_suffix("/**") {
|
||||||
|
builder
|
||||||
|
.add(&format!("!{dir_pat}"))
|
||||||
|
.with_context(|| format!("builtin dir pattern: {dir_pat}"))?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
builder.build().context("failed to compile builtin override")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Owned-string variant of `build_single_matcher` for caller-supplied
|
||||||
|
/// `Vec<String>` sources (config.workspace.exclude, .kebabignore).
|
||||||
|
fn build_single_matcher_owned(root: &Path, patterns: &[String]) -> Result<Override> {
|
||||||
|
let mut builder = OverrideBuilder::new(root);
|
||||||
|
for pat in patterns {
|
||||||
|
builder
|
||||||
|
.add(&format!("!{pat}"))
|
||||||
|
.with_context(|| format!("invalid pattern: {pat}"))?;
|
||||||
|
}
|
||||||
|
builder.build().context("failed to compile override")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build the merged `WalkOverrides` from all five filter sources, in order:
|
||||||
/// DEFAULT_EXCLUDES, `config.workspace.exclude`, `.kebabignore`,
|
/// DEFAULT_EXCLUDES, `config.workspace.exclude`, `.kebabignore`,
|
||||||
/// built-in safety-net blacklist (`kebab_parse_code::BUILTIN_BLACKLIST`),
|
/// built-in safety-net blacklist (`kebab_parse_code::BUILTIN_BLACKLIST`),
|
||||||
/// and `<root>/.gitignore` (root-only, no nested cascade).
|
/// and `<root>/.gitignore` (root-only, no nested cascade).
|
||||||
@@ -58,48 +147,82 @@ const DEFAULT_EXCLUDES: &[&str] = &[
|
|||||||
/// leading `!` flips a positive match to a negative one in the
|
/// leading `!` flips a positive match to a negative one in the
|
||||||
/// `OverrideBuilder` API). Order doesn't matter — the union is computed by
|
/// `OverrideBuilder` API). Order doesn't matter — the union is computed by
|
||||||
/// the underlying gitignore engine.
|
/// the underlying gitignore engine.
|
||||||
|
///
|
||||||
|
/// The three per-source matchers (`gitignore`, `kebabignore`, `builtin`) are
|
||||||
|
/// built in addition to the combined one so the connector can attribute skips
|
||||||
|
/// to the correct `IngestReport` counter without a second walker pass.
|
||||||
pub(crate) fn build_overrides(
|
pub(crate) fn build_overrides(
|
||||||
root: &Path,
|
root: &Path,
|
||||||
config_exclude: &[String],
|
config_exclude: &[String],
|
||||||
kbignore_patterns: &[String],
|
kbignore_patterns: &[String],
|
||||||
) -> Result<Override> {
|
) -> Result<WalkOverrides> {
|
||||||
let mut builder = OverrideBuilder::new(root);
|
let gitignore_patterns = read_gitignore(root)?;
|
||||||
|
|
||||||
|
// Per-source matchers (for attribution only).
|
||||||
|
let gitignore =
|
||||||
|
build_single_matcher(root, &gitignore_patterns.iter().map(|s| s.as_str()).collect::<Vec<_>>())?;
|
||||||
|
let kebabignore = build_single_matcher_owned(root, kbignore_patterns)?;
|
||||||
|
// Use the directory-aware builtin matcher so that `is_dir=true` checks on
|
||||||
|
// directory entries (e.g., `node_modules/`) are attributed to builtin rather
|
||||||
|
// than to an overlapping gitignore pattern.
|
||||||
|
let builtin = build_builtin_matcher(root)?;
|
||||||
|
|
||||||
|
// Combined matcher — union of all five sources.
|
||||||
|
let mut combined_builder = OverrideBuilder::new(root);
|
||||||
|
|
||||||
for pat in DEFAULT_EXCLUDES {
|
for pat in DEFAULT_EXCLUDES {
|
||||||
builder
|
combined_builder
|
||||||
.add(&format!("!{pat}"))
|
.add(&format!("!{pat}"))
|
||||||
.with_context(|| format!("invalid default-exclude pattern: {pat}"))?;
|
.with_context(|| format!("invalid default-exclude pattern: {pat}"))?;
|
||||||
}
|
}
|
||||||
for pat in config_exclude {
|
for pat in config_exclude {
|
||||||
builder
|
combined_builder
|
||||||
.add(&format!("!{pat}"))
|
.add(&format!("!{pat}"))
|
||||||
.with_context(|| format!("invalid workspace.exclude pattern: {pat}"))?;
|
.with_context(|| format!("invalid workspace.exclude pattern: {pat}"))?;
|
||||||
}
|
}
|
||||||
for pat in kbignore_patterns {
|
for pat in kbignore_patterns {
|
||||||
builder
|
combined_builder
|
||||||
.add(&format!("!{pat}"))
|
.add(&format!("!{pat}"))
|
||||||
.with_context(|| format!("invalid .kebabignore pattern: {pat}"))?;
|
.with_context(|| format!("invalid .kebabignore pattern: {pat}"))?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// p10-1A-1: built-in safety-net blacklist (spec §5.2). 6 patterns
|
|
||||||
// universal across ecosystems. User can negate via `.kebabignore`.
|
|
||||||
for pat in kebab_parse_code::BUILTIN_BLACKLIST {
|
for pat in kebab_parse_code::BUILTIN_BLACKLIST {
|
||||||
builder
|
combined_builder
|
||||||
.add(&format!("!{pat}"))
|
.add(&format!("!{pat}"))
|
||||||
.with_context(|| format!("built-in blacklist pattern: {pat}"))?;
|
.with_context(|| format!("built-in blacklist pattern: {pat}"))?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// p10-1A-1: honor repo-root `.gitignore` (spec §5.2). Read once,
|
|
||||||
// merge with same convention as user `.kebabignore`. Nested
|
|
||||||
// cascade deferred to P+.
|
|
||||||
let gitignore_patterns = read_gitignore(root)?;
|
|
||||||
for pat in &gitignore_patterns {
|
for pat in &gitignore_patterns {
|
||||||
builder
|
combined_builder
|
||||||
.add(&format!("!{pat}"))
|
.add(&format!("!{pat}"))
|
||||||
.with_context(|| format!(".gitignore pattern: {pat}"))?;
|
.with_context(|| format!(".gitignore pattern: {pat}"))?;
|
||||||
}
|
}
|
||||||
|
let combined = combined_builder
|
||||||
|
.build()
|
||||||
|
.context("failed to compile combined override set")?;
|
||||||
|
|
||||||
builder.build().context("failed to compile override set")
|
Ok(WalkOverrides {
|
||||||
|
combined,
|
||||||
|
gitignore,
|
||||||
|
kebabignore,
|
||||||
|
builtin,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Classify why a path was excluded, using per-source matchers in spec §5.2
|
||||||
|
/// priority order: built-in > gitignore > kebabignore > other.
|
||||||
|
///
|
||||||
|
/// `rel` must be relative to the walker root (same as `Override::matched`
|
||||||
|
/// expects). `is_dir` should match what the original walker saw.
|
||||||
|
pub(crate) fn classify_skip(rel: &Path, is_dir: bool, ov: &WalkOverrides) -> SkipCategory {
|
||||||
|
if ov.builtin.matched(rel, is_dir).is_ignore() {
|
||||||
|
return SkipCategory::BuiltinBlacklist;
|
||||||
|
}
|
||||||
|
if ov.gitignore.matched(rel, is_dir).is_ignore() {
|
||||||
|
return SkipCategory::Gitignore;
|
||||||
|
}
|
||||||
|
if ov.kebabignore.matched(rel, is_dir).is_ignore() {
|
||||||
|
return SkipCategory::Kebabignore;
|
||||||
|
}
|
||||||
|
SkipCategory::Other
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Read `<root>/.gitignore` (single-file, root-only — nested cascade is P+).
|
/// Read `<root>/.gitignore` (single-file, root-only — nested cascade is P+).
|
||||||
@@ -161,51 +284,87 @@ pub(crate) fn read_kbignore(root: &Path) -> Result<Vec<String>> {
|
|||||||
.collect())
|
.collect())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Iterate every regular file under `root`, applying `overrides` and
|
/// Skipped-path record emitted by `walk_files_with_skips`.
|
||||||
/// detecting symlink cycles. Returns absolute file paths.
|
///
|
||||||
|
/// `path` is the absolute path of the excluded entry (dir or file).
|
||||||
|
/// For excluded directories, this is the directory itself — individual
|
||||||
|
/// files inside are not enumerated (the subtree is pruned).
|
||||||
|
pub(crate) struct SkippedEntry {
|
||||||
|
pub path: PathBuf,
|
||||||
|
pub category: SkipCategory,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Iterate every regular file under `root`, applying `overrides.combined` and
|
||||||
|
/// detecting symlink cycles. Returns:
|
||||||
|
/// - `accepted`: absolute paths of files that passed all filters.
|
||||||
|
/// - `skipped`: entries that were excluded, with attribution.
|
||||||
|
///
|
||||||
|
/// For excluded *directories*, the directory path itself is returned (not the
|
||||||
|
/// individual files inside — the subtree is pruned in one step, matching the
|
||||||
|
/// walker's `skip_current_dir` behavior).
|
||||||
///
|
///
|
||||||
/// Strategy:
|
/// Strategy:
|
||||||
/// - `walkdir::WalkDir::follow_links(true)` to traverse symlinks.
|
/// - `walkdir::WalkDir::follow_links(true)` to traverse symlinks.
|
||||||
|
/// - Manual per-entry check (instead of `filter_entry`) so we can capture
|
||||||
|
/// the excluded paths for skip attribution.
|
||||||
/// - Maintain `visited: HashSet<PathBuf>` of *canonical* paths. Before
|
/// - Maintain `visited: HashSet<PathBuf>` of *canonical* paths. Before
|
||||||
/// descending into a directory entry, canonicalize and check the set;
|
/// descending into a directory entry, canonicalize and check the set;
|
||||||
/// if already present, skip. This breaks `a -> b -> a` cycles in O(n)
|
/// if already present, skip. This breaks `a -> b -> a` cycles in O(n)
|
||||||
/// per entry without a custom recursive walker.
|
/// per entry without a custom recursive walker.
|
||||||
/// - For each yielded entry, ask `overrides` whether it is excluded; if
|
pub(crate) fn walk_files_with_skips(
|
||||||
/// so, drop it. If the entry is a directory, also short-circuit
|
root: &Path,
|
||||||
/// `WalkDir`'s descent via `it.skip_current_dir()`.
|
overrides: &WalkOverrides,
|
||||||
pub(crate) fn walk_files(root: &Path, overrides: &Override) -> Result<Vec<PathBuf>> {
|
) -> Result<(Vec<PathBuf>, Vec<SkippedEntry>)> {
|
||||||
let mut out = Vec::new();
|
let mut accepted = Vec::new();
|
||||||
|
let mut skipped: Vec<SkippedEntry> = Vec::new();
|
||||||
let mut visited: HashSet<PathBuf> = HashSet::new();
|
let mut visited: HashSet<PathBuf> = HashSet::new();
|
||||||
|
|
||||||
|
// Use a non-filtering iterator so we see excluded entries too.
|
||||||
let walker = WalkDir::new(root).follow_links(true).into_iter();
|
let walker = WalkDir::new(root).follow_links(true).into_iter();
|
||||||
let mut it = walker.filter_entry(|e| !is_excluded(e, root, overrides));
|
// We still use filter_entry for the *combined* override so that walkdir
|
||||||
|
// can short-circuit pruned directories. But we wrap it so we can capture
|
||||||
|
// the exclusion reason before discarding the entry.
|
||||||
|
//
|
||||||
|
// Problem: filter_entry discards without letting us see the entry first.
|
||||||
|
// Solution: use the raw iterator (no filter_entry) and manage skip_current_dir
|
||||||
|
// manually, which lets us record what was excluded before pruning.
|
||||||
|
let mut it = walker;
|
||||||
|
|
||||||
while let Some(res) = it.next() {
|
while let Some(res) = it.next() {
|
||||||
let entry = match res {
|
let entry = match res {
|
||||||
Ok(e) => e,
|
Ok(e) => e,
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
// `walkdir` surfaces I/O errors AND its own cycle detector
|
|
||||||
// (when follow_links is on it sometimes catches them).
|
|
||||||
// Either way: log and skip; do not abort the whole scan.
|
|
||||||
tracing::warn!(error = %err, "walkdir entry error; skipping");
|
tracing::warn!(error = %err, "walkdir entry error; skipping");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let path = entry.path();
|
let path = entry.path();
|
||||||
|
let rel = match path.strip_prefix(root) {
|
||||||
|
Ok(p) => p,
|
||||||
|
Err(_) => path,
|
||||||
|
};
|
||||||
|
let is_dir = entry.file_type().is_dir();
|
||||||
|
let excluded = overrides.combined.matched(rel, is_dir).is_ignore();
|
||||||
|
|
||||||
// Cycle guard: only canonicalize symlinks (cheap on the common case
|
if excluded {
|
||||||
// of plain files/dirs) and on directories that are followed via a
|
let cat = classify_skip(rel, is_dir, overrides);
|
||||||
// symlink. `walkdir`'s `path_is_symlink()` is true when the entry's
|
skipped.push(SkippedEntry {
|
||||||
// *original* path is a symlink (it returns true for the link, not
|
path: path.to_path_buf(),
|
||||||
// for the resolved target). For non-symlinked directories we still
|
category: cat,
|
||||||
// record the canonical path so a *later* symlink that points back
|
});
|
||||||
// to one of them is detected.
|
if is_dir {
|
||||||
if entry.file_type().is_dir() {
|
// Prune the subtree — don't descend into excluded dirs.
|
||||||
|
it.skip_current_dir();
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cycle guard for directories.
|
||||||
|
if is_dir {
|
||||||
match std::fs::canonicalize(path) {
|
match std::fs::canonicalize(path) {
|
||||||
Ok(canon) => {
|
Ok(canon) => {
|
||||||
if !visited.insert(canon) {
|
if !visited.insert(canon) {
|
||||||
// Already visited via another path → break cycle.
|
|
||||||
it.skip_current_dir();
|
it.skip_current_dir();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -222,26 +381,13 @@ pub(crate) fn walk_files(root: &Path, overrides: &Override) -> Result<Vec<PathBu
|
|||||||
}
|
}
|
||||||
|
|
||||||
if entry.file_type().is_file() {
|
if entry.file_type().is_file() {
|
||||||
out.push(path.to_path_buf());
|
accepted.push(path.to_path_buf());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(out)
|
Ok((accepted, skipped))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn is_excluded(entry: &DirEntry, root: &Path, overrides: &Override) -> bool {
|
|
||||||
// `Override::matched(path, is_dir)` uses the path *relative to* the
|
|
||||||
// override builder's root. `walkdir` gives absolute paths when
|
|
||||||
// `WalkDir::new` was given an absolute path — strip the root prefix
|
|
||||||
// before consulting the override.
|
|
||||||
let rel = match entry.path().strip_prefix(root) {
|
|
||||||
Ok(p) => p,
|
|
||||||
Err(_) => entry.path(),
|
|
||||||
};
|
|
||||||
overrides
|
|
||||||
.matched(rel, entry.file_type().is_dir())
|
|
||||||
.is_ignore()
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
@@ -252,7 +398,7 @@ mod tests {
|
|||||||
let dir = tempfile::tempdir().unwrap();
|
let dir = tempfile::tempdir().unwrap();
|
||||||
let ov = build_overrides(dir.path(), &[], &[]).unwrap();
|
let ov = build_overrides(dir.path(), &[], &[]).unwrap();
|
||||||
// Default-excludes only; non-special files should not match.
|
// Default-excludes only; non-special files should not match.
|
||||||
let m = ov.matched(Path::new("notes/alpha.md"), false);
|
let m = ov.combined.matched(Path::new("notes/alpha.md"), false);
|
||||||
assert!(!m.is_ignore());
|
assert!(!m.is_ignore());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -260,13 +406,13 @@ mod tests {
|
|||||||
fn default_excludes_ds_store_and_resource_forks() {
|
fn default_excludes_ds_store_and_resource_forks() {
|
||||||
let dir = tempfile::tempdir().unwrap();
|
let dir = tempfile::tempdir().unwrap();
|
||||||
let ov = build_overrides(dir.path(), &[], &[]).unwrap();
|
let ov = build_overrides(dir.path(), &[], &[]).unwrap();
|
||||||
assert!(ov.matched(Path::new(".DS_Store"), false).is_ignore());
|
assert!(ov.combined.matched(Path::new(".DS_Store"), false).is_ignore());
|
||||||
assert!(
|
assert!(
|
||||||
ov.matched(Path::new("notes/.DS_Store"), false).is_ignore()
|
ov.combined.matched(Path::new("notes/.DS_Store"), false).is_ignore()
|
||||||
);
|
);
|
||||||
assert!(ov.matched(Path::new("._foo.md"), false).is_ignore());
|
assert!(ov.combined.matched(Path::new("._foo.md"), false).is_ignore());
|
||||||
assert!(
|
assert!(
|
||||||
ov.matched(Path::new("notes/._sidecar"), false).is_ignore()
|
ov.combined.matched(Path::new("notes/._sidecar"), false).is_ignore()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -279,13 +425,13 @@ mod tests {
|
|||||||
&[],
|
&[],
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert!(ov.matched(Path::new("a.tmp"), false).is_ignore());
|
assert!(ov.combined.matched(Path::new("a.tmp"), false).is_ignore());
|
||||||
assert!(ov.matched(Path::new("notes/x.tmp"), false).is_ignore());
|
assert!(ov.combined.matched(Path::new("notes/x.tmp"), false).is_ignore());
|
||||||
assert!(
|
assert!(
|
||||||
ov.matched(Path::new("node_modules/foo/bar.js"), false)
|
ov.combined.matched(Path::new("node_modules/foo/bar.js"), false)
|
||||||
.is_ignore()
|
.is_ignore()
|
||||||
);
|
);
|
||||||
assert!(!ov.matched(Path::new("alpha.md"), false).is_ignore());
|
assert!(!ov.combined.matched(Path::new("alpha.md"), false).is_ignore());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -298,9 +444,9 @@ mod tests {
|
|||||||
&["secret/**".to_string()],
|
&["secret/**".to_string()],
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert!(ov.matched(Path::new("a.tmp"), false).is_ignore());
|
assert!(ov.combined.matched(Path::new("a.tmp"), false).is_ignore());
|
||||||
assert!(
|
assert!(
|
||||||
ov.matched(Path::new("secret/key.md"), false).is_ignore()
|
ov.combined.matched(Path::new("secret/key.md"), false).is_ignore()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -337,8 +483,8 @@ mod tests {
|
|||||||
|
|
||||||
let overrides = build_overrides(root, &[], &[]).unwrap();
|
let overrides = build_overrides(root, &[], &[]).unwrap();
|
||||||
// Override::matched expects paths relative to the builder's root.
|
// Override::matched expects paths relative to the builder's root.
|
||||||
let m_in = overrides.matched(Path::new("src/main.rs"), false);
|
let m_in = overrides.combined.matched(Path::new("src/main.rs"), false);
|
||||||
let m_out = overrides.matched(Path::new("node_modules/foo/bar.js"), false);
|
let m_out = overrides.combined.matched(Path::new("node_modules/foo/bar.js"), false);
|
||||||
|
|
||||||
assert!(!m_in.is_ignore(), "src/main.rs should NOT be ignored");
|
assert!(!m_in.is_ignore(), "src/main.rs should NOT be ignored");
|
||||||
assert!(m_out.is_ignore(), "node_modules/foo/bar.js SHOULD be ignored");
|
assert!(m_out.is_ignore(), "node_modules/foo/bar.js SHOULD be ignored");
|
||||||
@@ -367,10 +513,10 @@ mod tests {
|
|||||||
"venv/x/y.txt",
|
"venv/x/y.txt",
|
||||||
"env/x/y.txt",
|
"env/x/y.txt",
|
||||||
] {
|
] {
|
||||||
let m = overrides.matched(Path::new(blacklisted), false);
|
let m = overrides.combined.matched(Path::new(blacklisted), false);
|
||||||
assert!(m.is_ignore(), "{blacklisted} should be ignored");
|
assert!(m.is_ignore(), "{blacklisted} should be ignored");
|
||||||
}
|
}
|
||||||
let m_ok = overrides.matched(Path::new("ok/z.txt"), false);
|
let m_ok = overrides.combined.matched(Path::new("ok/z.txt"), false);
|
||||||
assert!(!m_ok.is_ignore(), "ok/z.txt should not be ignored");
|
assert!(!m_ok.is_ignore(), "ok/z.txt should not be ignored");
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -389,9 +535,9 @@ mod tests {
|
|||||||
fs::write(root.join("dist/bundle.js"), "x").unwrap();
|
fs::write(root.join("dist/bundle.js"), "x").unwrap();
|
||||||
|
|
||||||
let overrides = build_overrides(root, &[], &[]).unwrap();
|
let overrides = build_overrides(root, &[], &[]).unwrap();
|
||||||
assert!(overrides.matched(Path::new("a.log"), false).is_ignore());
|
assert!(overrides.combined.matched(Path::new("a.log"), false).is_ignore());
|
||||||
assert!(overrides.matched(Path::new("dist/bundle.js"), false).is_ignore());
|
assert!(overrides.combined.matched(Path::new("dist/bundle.js"), false).is_ignore());
|
||||||
assert!(!overrides.matched(Path::new("src/main.rs"), false).is_ignore());
|
assert!(!overrides.combined.matched(Path::new("src/main.rs"), false).is_ignore());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -407,8 +553,8 @@ mod tests {
|
|||||||
|
|
||||||
// No .gitignore present — patterns from .gitignore should not affect overrides.
|
// No .gitignore present — patterns from .gitignore should not affect overrides.
|
||||||
let overrides = build_overrides(root, &[], &[]).unwrap();
|
let overrides = build_overrides(root, &[], &[]).unwrap();
|
||||||
assert!(!overrides.matched(Path::new("a.log"), false).is_ignore());
|
assert!(!overrides.combined.matched(Path::new("a.log"), false).is_ignore());
|
||||||
assert!(!overrides.matched(Path::new("src/main.rs"), false).is_ignore());
|
assert!(!overrides.combined.matched(Path::new("src/main.rs"), false).is_ignore());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -424,4 +570,128 @@ mod tests {
|
|||||||
let result = build_overrides(root, &[], &[]);
|
let result = build_overrides(root, &[], &[]);
|
||||||
assert!(result.is_ok(), "should not error on negation pattern: {:?}", result.err());
|
assert!(result.is_ok(), "should not error on negation pattern: {:?}", result.err());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Skip attribution tests ────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn classify_skip_attributes_builtin_over_gitignore() {
|
||||||
|
use std::fs;
|
||||||
|
use tempfile::TempDir;
|
||||||
|
|
||||||
|
let tmp = TempDir::new().unwrap();
|
||||||
|
let root = tmp.path();
|
||||||
|
// node_modules matches both BUILTIN_BLACKLIST and a hypothetical
|
||||||
|
// .gitignore entry. Builtin must win (priority order §5.2).
|
||||||
|
fs::write(root.join(".gitignore"), "node_modules/\n").unwrap();
|
||||||
|
|
||||||
|
let ov = build_overrides(root, &[], &[]).unwrap();
|
||||||
|
// node_modules/ dir itself
|
||||||
|
let cat = classify_skip(Path::new("node_modules"), true, &ov);
|
||||||
|
assert_eq!(cat, SkipCategory::BuiltinBlacklist, "builtin must have priority");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn classify_skip_attributes_gitignore_for_log_files() {
|
||||||
|
use std::fs;
|
||||||
|
use tempfile::TempDir;
|
||||||
|
|
||||||
|
let tmp = TempDir::new().unwrap();
|
||||||
|
let root = tmp.path();
|
||||||
|
fs::write(root.join(".gitignore"), "*.log\n").unwrap();
|
||||||
|
|
||||||
|
let ov = build_overrides(root, &[], &[]).unwrap();
|
||||||
|
let cat = classify_skip(Path::new("app.log"), false, &ov);
|
||||||
|
assert_eq!(cat, SkipCategory::Gitignore);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn classify_skip_attributes_kebabignore() {
|
||||||
|
use tempfile::TempDir;
|
||||||
|
|
||||||
|
let tmp = TempDir::new().unwrap();
|
||||||
|
let root = tmp.path();
|
||||||
|
|
||||||
|
let ov = build_overrides(root, &[], &["*.secret".to_string()]).unwrap();
|
||||||
|
let cat = classify_skip(Path::new("creds.secret"), false, &ov);
|
||||||
|
assert_eq!(cat, SkipCategory::Kebabignore);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn walk_files_with_skips_counts_gitignored_files() {
|
||||||
|
use std::fs;
|
||||||
|
use tempfile::TempDir;
|
||||||
|
|
||||||
|
let tmp = TempDir::new().unwrap();
|
||||||
|
let root = tmp.path();
|
||||||
|
fs::write(root.join(".gitignore"), "*.log\n").unwrap();
|
||||||
|
fs::write(root.join("ok.md"), "# ok").unwrap();
|
||||||
|
fs::write(root.join("skipme.log"), "x").unwrap();
|
||||||
|
|
||||||
|
let ov = build_overrides(root, &[], &[]).unwrap();
|
||||||
|
let (accepted, skipped_entries) = walk_files_with_skips(root, &ov).unwrap();
|
||||||
|
|
||||||
|
let accepted_names: Vec<_> = accepted
|
||||||
|
.iter()
|
||||||
|
.map(|p| p.file_name().unwrap().to_string_lossy().into_owned())
|
||||||
|
.collect();
|
||||||
|
assert!(
|
||||||
|
accepted_names.iter().any(|n| n == "ok.md"),
|
||||||
|
"ok.md should be accepted; got: {accepted_names:?}"
|
||||||
|
);
|
||||||
|
assert!(
|
||||||
|
!accepted_names.iter().any(|n| n == "skipme.log"),
|
||||||
|
"skipme.log should not be accepted; got: {accepted_names:?}"
|
||||||
|
);
|
||||||
|
|
||||||
|
let gitignore_skipped: Vec<_> = skipped_entries
|
||||||
|
.iter()
|
||||||
|
.filter(|e| e.category == SkipCategory::Gitignore)
|
||||||
|
.collect();
|
||||||
|
assert!(
|
||||||
|
gitignore_skipped.iter().any(|e| e.path.file_name()
|
||||||
|
.map(|n| n == "skipme.log")
|
||||||
|
.unwrap_or(false)),
|
||||||
|
"skipme.log should appear in gitignore_skipped; skipped: {:?}",
|
||||||
|
skipped_entries.iter().map(|e| &e.path).collect::<Vec<_>>()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn walk_files_with_skips_counts_builtin_blacklist_dirs() {
|
||||||
|
use std::fs;
|
||||||
|
use tempfile::TempDir;
|
||||||
|
|
||||||
|
let tmp = TempDir::new().unwrap();
|
||||||
|
let root = tmp.path();
|
||||||
|
fs::create_dir_all(root.join("node_modules/foo")).unwrap();
|
||||||
|
fs::write(root.join("node_modules/foo/bar.js"), "x").unwrap();
|
||||||
|
fs::write(root.join("ok.md"), "# ok").unwrap();
|
||||||
|
|
||||||
|
let ov = build_overrides(root, &[], &[]).unwrap();
|
||||||
|
let (accepted, skipped_entries) = walk_files_with_skips(root, &ov).unwrap();
|
||||||
|
|
||||||
|
let accepted_names: Vec<_> = accepted
|
||||||
|
.iter()
|
||||||
|
.map(|p| p.file_name().unwrap().to_string_lossy().into_owned())
|
||||||
|
.collect();
|
||||||
|
assert!(
|
||||||
|
accepted_names.iter().any(|n| n == "ok.md"),
|
||||||
|
"ok.md must be accepted; got: {accepted_names:?}"
|
||||||
|
);
|
||||||
|
|
||||||
|
let builtin_skipped: Vec<_> = skipped_entries
|
||||||
|
.iter()
|
||||||
|
.filter(|e| e.category == SkipCategory::BuiltinBlacklist)
|
||||||
|
.collect();
|
||||||
|
assert!(
|
||||||
|
!builtin_skipped.is_empty(),
|
||||||
|
"node_modules/ should produce at least one BuiltinBlacklist skip"
|
||||||
|
);
|
||||||
|
assert!(
|
||||||
|
builtin_skipped.iter().any(|e| e.path.components()
|
||||||
|
.any(|c| c.as_os_str() == "node_modules")),
|
||||||
|
"skipped path should contain node_modules; got: {:?}",
|
||||||
|
builtin_skipped.iter().map(|e| &e.path).collect::<Vec<_>>()
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user