From 803d02b68b57747f3547a428ff041623315b9cde Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 20 May 2026 05:15:04 +0000 Subject: [PATCH] fix(dogfood): enforce workspace.include in walker (allow-list semantics) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit config.workspace.include was completely ignored by the walker — connector.rs log_scope_include_warning literally said "handled by extractor router" but no extractor router exists. Dogfooding (PR #142 1B + multi-root corpus kebab-docs + httpx + zod + lodash) showed user-set include of code+md still ingested 84 .png + 8 .pdf files. Fix: walker treats scope.include as an allow-list — empty Vec preserves backward-compat (all files pass), non-empty requires file path to match at least one pattern (AND with the existing exclude rules). Removed the misleading debug log. Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 1 + crates/kebab-source-fs/Cargo.toml | 1 + crates/kebab-source-fs/src/connector.rs | 12 +- crates/kebab-source-fs/src/walker.rs | 73 ++++++++++-- .../tests/include_allowlist.rs | 111 ++++++++++++++++++ 5 files changed, 175 insertions(+), 23 deletions(-) create mode 100644 crates/kebab-source-fs/tests/include_allowlist.rs diff --git a/Cargo.lock b/Cargo.lock index d457c88..0ba566b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4466,6 +4466,7 @@ version = "0.8.1" dependencies = [ "anyhow", "blake3", + "globset", "ignore", "kebab-config", "kebab-core", diff --git a/crates/kebab-source-fs/Cargo.toml b/crates/kebab-source-fs/Cargo.toml index 5c976cd..68ea7be 100644 --- a/crates/kebab-source-fs/Cargo.toml +++ b/crates/kebab-source-fs/Cargo.toml @@ -18,6 +18,7 @@ blake3 = { workspace = true } tracing = { workspace = true } walkdir = "2" ignore = "0.4" +globset = "0.4" [dev-dependencies] serde_json = { workspace = true } diff --git a/crates/kebab-source-fs/src/connector.rs b/crates/kebab-source-fs/src/connector.rs index c61da8d..fd73734 100644 --- a/crates/kebab-source-fs/src/connector.rs +++ b/crates/kebab-source-fs/src/connector.rs @@ -86,7 +86,7 @@ impl FsSourceConnector { excludes.extend(scope.exclude.iter().cloned()); let kbignore = read_kbignore(&root)?; - let overrides = build_overrides(&root, &excludes, &kbignore)?; + let overrides = build_overrides(&root, &excludes, &kbignore, &scope.include)?; Ok((root, overrides)) } @@ -103,8 +103,6 @@ impl FsSourceConnector { ) -> Result<(Vec, FsScanSkips)> { let (root, overrides) = self.resolve_scan_params(scope)?; - log_scope_include_warning(scope); - let (files, skipped_entries) = walk_files_with_skips(&root, &overrides)?; // Accumulate per-category skip counts and sample paths. @@ -284,14 +282,6 @@ fn build_assets( Ok(assets) } -fn log_scope_include_warning(scope: &SourceScope) { - if !scope.include.is_empty() { - tracing::debug!( - count = scope.include.len(), - "FsSourceConnector ignores scope.include — handled by extractor router" - ); - } -} impl SourceConnector for FsSourceConnector { fn scan(&self, scope: &SourceScope) -> Result> { diff --git a/crates/kebab-source-fs/src/walker.rs b/crates/kebab-source-fs/src/walker.rs index 8eaf0ff..9f6a540 100644 --- a/crates/kebab-source-fs/src/walker.rs +++ b/crates/kebab-source-fs/src/walker.rs @@ -44,6 +44,7 @@ use std::collections::HashSet; use std::path::{Path, PathBuf}; use anyhow::{Context, Result}; +use globset::{GlobBuilder, GlobSet, GlobSetBuilder}; use ignore::overrides::{Override, OverrideBuilder}; use walkdir::WalkDir; @@ -69,6 +70,11 @@ const DEFAULT_EXCLUDES: &[&str] = &[ /// /// `default_and_config` covers DEFAULT_EXCLUDES + `config.workspace.exclude` /// — these do NOT map to any of the three named `IngestReport` counters. +/// +/// `include` is the compiled `scope.include` allow-list. When the set is +/// empty (no patterns) every file passes; when non-empty a file must match +/// at least one pattern to be accepted (directories always pass, so the +/// walker can still descend into them). pub(crate) struct WalkOverrides { /// Merged matcher — same as today's `Override`; used for the walk decision. pub combined: Override, @@ -78,6 +84,8 @@ pub(crate) struct WalkOverrides { pub kebabignore: Override, /// Matcher built from `kebab_parse_code::BUILTIN_BLACKLIST` only. pub builtin: Override, + /// Compiled allow-list from `scope.include`. Empty set = pass all. + pub include: GlobSet, } /// Skip attribution category. Used by the connector when counting per-source @@ -161,10 +169,15 @@ fn build_single_matcher_owned(root: &Path, patterns: &[String]) -> Result Result { let gitignore_patterns = read_gitignore(root)?; @@ -209,14 +222,41 @@ pub(crate) fn build_overrides( .build() .context("failed to compile combined override set")?; + // Allow-list GlobSet: empty Vec → matches nothing (= pass all); non-empty + // → file must match at least one glob to be accepted. We compile with + // `case_insensitive=false` to keep the semantics consistent with the + // OverrideBuilder exclude patterns above. + let include = build_include_globset(include_patterns)?; + Ok(WalkOverrides { combined, gitignore, kebabignore, builtin, + include, }) } +/// Compile `scope.include` patterns into a `GlobSet` allow-list. +/// +/// Each pattern uses `GlobBuilder` with `literal_separator = true` so that +/// `**` can cross directory boundaries while `*` stops at `/`, matching the +/// gitignore convention used throughout the rest of the walker. +/// +/// An empty slice produces an empty `GlobSet` — callers interpret that as +/// "pass all files" (no allow-list constraint). +fn build_include_globset(patterns: &[String]) -> Result { + let mut builder = GlobSetBuilder::new(); + for pat in patterns { + let glob = GlobBuilder::new(pat) + .literal_separator(true) + .build() + .with_context(|| format!("invalid include pattern: {pat}"))?; + builder.add(glob); + } + builder.build().context("failed to compile include globset") +} + /// Classify why a path was excluded, using per-source matchers in spec §5.2 /// priority order: built-in > gitignore > kebabignore > other. /// @@ -391,6 +431,13 @@ pub(crate) fn walk_files_with_skips( } if entry.file_type().is_file() { + // Apply include allow-list: if non-empty, the file's path + // relative to root must match at least one pattern. + if !overrides.include.is_empty() && !overrides.include.is_match(rel) { + // Not in the allow-list — silently drop (no skip counter; + // the include filter is not a "skip" source in IngestReport). + continue; + } accepted.push(path.to_path_buf()); } } @@ -406,7 +453,7 @@ mod tests { #[test] fn empty_inputs_compile_into_an_override() { let dir = tempfile::tempdir().unwrap(); - let ov = build_overrides(dir.path(), &[], &[]).unwrap(); + let ov = build_overrides(dir.path(), &[], &[], &[]).unwrap(); // Default-excludes only; non-special files should not match. let m = ov.combined.matched(Path::new("notes/alpha.md"), false); assert!(!m.is_ignore()); @@ -415,7 +462,7 @@ mod tests { #[test] fn default_excludes_ds_store_and_resource_forks() { let dir = tempfile::tempdir().unwrap(); - let ov = build_overrides(dir.path(), &[], &[]).unwrap(); + let ov = build_overrides(dir.path(), &[], &[], &[]).unwrap(); assert!(ov.combined.matched(Path::new(".DS_Store"), false).is_ignore()); assert!( ov.combined.matched(Path::new("notes/.DS_Store"), false).is_ignore() @@ -433,6 +480,7 @@ mod tests { dir.path(), &["*.tmp".to_string(), "node_modules/**".to_string()], &[], + &[], ) .unwrap(); assert!(ov.combined.matched(Path::new("a.tmp"), false).is_ignore()); @@ -452,6 +500,7 @@ mod tests { dir.path(), &["*.tmp".to_string()], &["secret/**".to_string()], + &[], ) .unwrap(); assert!(ov.combined.matched(Path::new("a.tmp"), false).is_ignore()); @@ -491,7 +540,7 @@ mod tests { fs::write(root.join("src/main.rs"), "x").unwrap(); fs::write(root.join("node_modules/foo/bar.js"), "x").unwrap(); - let overrides = build_overrides(root, &[], &[]).unwrap(); + let overrides = build_overrides(root, &[], &[], &[]).unwrap(); // Override::matched expects paths relative to the builder's root. let m_in = overrides.combined.matched(Path::new("src/main.rs"), false); let m_out = overrides.combined.matched(Path::new("node_modules/foo/bar.js"), false); @@ -514,7 +563,7 @@ mod tests { fs::create_dir_all(root.join("ok")).unwrap(); fs::write(root.join("ok/z.txt"), "z").unwrap(); - let overrides = build_overrides(root, &[], &[]).unwrap(); + let overrides = build_overrides(root, &[], &[], &[]).unwrap(); // Override::matched expects paths relative to the builder's root. for blacklisted in [ "target/x/y.txt", @@ -544,7 +593,7 @@ mod tests { fs::create_dir_all(root.join("dist")).unwrap(); fs::write(root.join("dist/bundle.js"), "x").unwrap(); - let overrides = build_overrides(root, &[], &[]).unwrap(); + let overrides = build_overrides(root, &[], &[], &[]).unwrap(); assert!(overrides.combined.matched(Path::new("a.log"), false).is_ignore()); assert!(overrides.combined.matched(Path::new("dist/bundle.js"), false).is_ignore()); assert!(!overrides.combined.matched(Path::new("src/main.rs"), false).is_ignore()); @@ -562,7 +611,7 @@ mod tests { fs::write(root.join("src/main.rs"), "x").unwrap(); // No .gitignore present — patterns from .gitignore should not affect overrides. - let overrides = build_overrides(root, &[], &[]).unwrap(); + let overrides = build_overrides(root, &[], &[], &[]).unwrap(); assert!(!overrides.combined.matched(Path::new("a.log"), false).is_ignore()); assert!(!overrides.combined.matched(Path::new("src/main.rs"), false).is_ignore()); } @@ -577,7 +626,7 @@ mod tests { // semantics, but at minimum it must not produce double-`!` corruption. fs::write(root.join(".gitignore"), "!keep/\n").unwrap(); // Just verify build_overrides doesn't error. - let result = build_overrides(root, &[], &[]); + let result = build_overrides(root, &[], &[], &[]); assert!(result.is_ok(), "should not error on negation pattern: {:?}", result.err()); } @@ -594,7 +643,7 @@ mod tests { // .gitignore entry. Builtin must win (priority order §5.2). fs::write(root.join(".gitignore"), "node_modules/\n").unwrap(); - let ov = build_overrides(root, &[], &[]).unwrap(); + let ov = build_overrides(root, &[], &[], &[]).unwrap(); // node_modules/ dir itself let cat = classify_skip(Path::new("node_modules"), true, &ov); assert_eq!(cat, SkipCategory::BuiltinBlacklist, "builtin must have priority"); @@ -609,7 +658,7 @@ mod tests { let root = tmp.path(); fs::write(root.join(".gitignore"), "*.log\n").unwrap(); - let ov = build_overrides(root, &[], &[]).unwrap(); + let ov = build_overrides(root, &[], &[], &[]).unwrap(); let cat = classify_skip(Path::new("app.log"), false, &ov); assert_eq!(cat, SkipCategory::Gitignore); } @@ -621,7 +670,7 @@ mod tests { let tmp = TempDir::new().unwrap(); let root = tmp.path(); - let ov = build_overrides(root, &[], &["*.secret".to_string()]).unwrap(); + let ov = build_overrides(root, &[], &["*.secret".to_string()], &[]).unwrap(); let cat = classify_skip(Path::new("creds.secret"), false, &ov); assert_eq!(cat, SkipCategory::Kebabignore); } @@ -637,7 +686,7 @@ mod tests { fs::write(root.join("ok.md"), "# ok").unwrap(); fs::write(root.join("skipme.log"), "x").unwrap(); - let ov = build_overrides(root, &[], &[]).unwrap(); + let ov = build_overrides(root, &[], &[], &[]).unwrap(); let (accepted, skipped_entries) = walk_files_with_skips(root, &ov).unwrap(); let accepted_names: Vec<_> = accepted @@ -677,7 +726,7 @@ mod tests { fs::write(root.join("node_modules/foo/bar.js"), "x").unwrap(); fs::write(root.join("ok.md"), "# ok").unwrap(); - let ov = build_overrides(root, &[], &[]).unwrap(); + let ov = build_overrides(root, &[], &[], &[]).unwrap(); let (accepted, skipped_entries) = walk_files_with_skips(root, &ov).unwrap(); let accepted_names: Vec<_> = accepted diff --git a/crates/kebab-source-fs/tests/include_allowlist.rs b/crates/kebab-source-fs/tests/include_allowlist.rs new file mode 100644 index 0000000..3ccf7d1 --- /dev/null +++ b/crates/kebab-source-fs/tests/include_allowlist.rs @@ -0,0 +1,111 @@ +//! Integration test: `scope.include` enforces an allow-list. +//! +//! Semantics (gitignore convention): +//! - `include` is empty Vec → all files pass through (backward-compat). +//! - `include` is non-empty → only files matching at least one pattern +//! are accepted. `exclude` rules still apply after include. +//! +//! Layout (built per-test in a TempDir): +//! root/ +//! ├── a.md +//! ├── b.py +//! ├── c.png +//! └── d.pdf + +use std::fs; + +use kebab_config::Config; +use kebab_core::{SourceConnector, SourceScope}; +use kebab_source_fs::FsSourceConnector; + +fn cfg_with_root(root: &str) -> Config { + let mut c = Config::defaults(); + c.workspace.root = root.to_string(); + c.workspace.exclude.clear(); + // Disable size / generated caps so small test files always pass. + c.ingest.code.max_file_bytes = u64::MAX; + c.ingest.code.max_file_lines = u32::MAX; + c.ingest.code.skip_generated_header = false; + c +} + +fn setup_mixed_dir() -> tempfile::TempDir { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + fs::write(root.join("a.md"), b"md").unwrap(); + fs::write(root.join("b.py"), b"py").unwrap(); + fs::write(root.join("c.png"), b"\x89PNG").unwrap(); + fs::write(root.join("d.pdf"), b"%PDF").unwrap(); + dir +} + +/// Empty include → all 4 files pass (backward-compat). +#[test] +fn include_empty_accepts_all_files() { + let dir = setup_mixed_dir(); + let conn = FsSourceConnector::new(&cfg_with_root(dir.path().to_str().unwrap())).unwrap(); + let scope = SourceScope { + include: vec![], + ..SourceScope::default() + }; + let assets = conn.scan(&scope).unwrap(); + let names: Vec<_> = assets.iter().map(|a| a.workspace_path.0.clone()).collect(); + assert!(names.contains(&"a.md".to_string()), "a.md missing; got: {names:?}"); + assert!(names.contains(&"b.py".to_string()), "b.py missing; got: {names:?}"); + assert!(names.contains(&"c.png".to_string()), "c.png missing; got: {names:?}"); + assert!(names.contains(&"d.pdf".to_string()), "d.pdf missing; got: {names:?}"); + assert_eq!(names.len(), 4, "expected exactly 4 files; got: {names:?}"); +} + +/// Non-empty include → only md + py come back; png + pdf are excluded. +#[test] +fn include_nonempty_is_allowlist() { + let dir = setup_mixed_dir(); + let conn = FsSourceConnector::new(&cfg_with_root(dir.path().to_str().unwrap())).unwrap(); + let scope = SourceScope { + include: vec!["**/*.md".to_string(), "**/*.py".to_string()], + ..SourceScope::default() + }; + let assets = conn.scan(&scope).unwrap(); + let names: Vec<_> = assets.iter().map(|a| a.workspace_path.0.clone()).collect(); + assert!(names.contains(&"a.md".to_string()), "a.md should be accepted; got: {names:?}"); + assert!(names.contains(&"b.py".to_string()), "b.py should be accepted; got: {names:?}"); + assert!( + !names.contains(&"c.png".to_string()), + "c.png must be rejected by include allowlist; got: {names:?}" + ); + assert!( + !names.contains(&"d.pdf".to_string()), + "d.pdf must be rejected by include allowlist; got: {names:?}" + ); + assert_eq!(names.len(), 2, "expected exactly 2 files; got: {names:?}"); +} + +/// include + exclude are ANDed: a file matching include but also matching +/// exclude must be rejected. +#[test] +fn include_and_exclude_are_anded() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + fs::write(root.join("keep.md"), b"keep").unwrap(); + fs::write(root.join("drop.md"), b"drop").unwrap(); + fs::write(root.join("other.py"), b"py").unwrap(); + + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).unwrap(); + let scope = SourceScope { + include: vec!["**/*.md".to_string()], + exclude: vec!["drop.md".to_string()], + ..SourceScope::default() + }; + let assets = conn.scan(&scope).unwrap(); + let names: Vec<_> = assets.iter().map(|a| a.workspace_path.0.clone()).collect(); + assert!(names.contains(&"keep.md".to_string()), "keep.md should be accepted; got: {names:?}"); + assert!( + !names.contains(&"drop.md".to_string()), + "drop.md should be excluded (matched exclude); got: {names:?}" + ); + assert!( + !names.contains(&"other.py".to_string()), + "other.py should be excluded (not in include); got: {names:?}" + ); +}