From 4e8b84c4e04ed5e299ee962a7feb61ab9b94a9fc Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 20 May 2026 05:09:19 +0000 Subject: [PATCH 1/2] fix(dogfood): populate schema.v1.repo_breakdown (Task 9 follow-up) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dogfooding (PR #142 1B + multi-root corpus: kebab-docs + httpx + zod + lodash) revealed schema.v1.repo_breakdown is always {} despite the 1A-2 Task 9 having added the code_lang_breakdown sibling. The schema.rs:171 placeholder `BTreeMap::new()` was left in place. Mirror Task 9's code_lang_breakdown query for the repo field — same metadata_json JSON-path pattern. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/schema.rs | 4 +- crates/kebab-store-sqlite/src/store.rs | 107 +++++++++++++++++++++++++ 2 files changed, 110 insertions(+), 1 deletion(-) diff --git a/crates/kebab-app/src/schema.rs b/crates/kebab-app/src/schema.rs index ab7d144..8e085ec 100644 --- a/crates/kebab-app/src/schema.rs +++ b/crates/kebab-app/src/schema.rs @@ -168,7 +168,9 @@ fn collect_stats( stale_doc_count: counts.stale_doc_count, // p10-1A-2: populated by the store query added in this task. code_lang_breakdown: store.code_lang_breakdown()?, - repo_breakdown: std::collections::BTreeMap::new(), + // p10-1A-2 follow-up: dogfooding (2026-05-20) revealed this was a + // placeholder — mirror of code_lang_breakdown for the repo field. + repo_breakdown: store.repo_breakdown()?, }) } diff --git a/crates/kebab-store-sqlite/src/store.rs b/crates/kebab-store-sqlite/src/store.rs index 89dce6a..8db087b 100644 --- a/crates/kebab-store-sqlite/src/store.rs +++ b/crates/kebab-store-sqlite/src/store.rs @@ -701,6 +701,39 @@ impl SqliteStore { } Ok(out) } + + /// p10-1A-2 follow-up (dogfooding 2026-05-20): per-repo doc count for + /// `schema.v1`. + /// + /// Reads `metadata_json->'$.repo'`, groups by the value, and skips rows + /// where `repo` is NULL (documents without an explicit repo tag). + /// Returns `BTreeMap` — key is the repo name as stored in + /// frontmatter, value is the doc count. + pub fn repo_breakdown( + &self, + ) -> anyhow::Result> { + use anyhow::Context; + let conn = self.read_conn(); + let mut stmt = conn + .prepare( + "SELECT json_extract(metadata_json, '$.repo') AS rp, COUNT(*) \ + FROM documents \ + WHERE rp IS NOT NULL \ + GROUP BY rp", + ) + .context("prepare repo_breakdown")?; + let rows = stmt + .query_map([], |r| { + Ok((r.get::<_, String>(0)?, r.get::<_, i64>(1)? as u32)) + }) + .context("query repo_breakdown")?; + let mut out = std::collections::BTreeMap::new(); + for row in rows { + let (k, v) = row.context("read repo_breakdown row")?; + out.insert(k, v); + } + Ok(out) + } } /// Apply the design §5 / task-spec pragmas. Called once per connection. @@ -817,5 +850,79 @@ mod tests { // only one key total assert_eq!(bd.len(), 1, "expected exactly 1 entry, got: {bd:?}"); } + + /// p10-1A-2 follow-up: `repo_breakdown` counts docs by + /// `metadata_json.repo`. + /// + /// Inserts: + /// - one doc with `repo = "my-repo"` → must appear with count 1 + /// - one doc with `repo = null` → must NOT appear (NULL skipped) + /// + /// Uses a side rusqlite connection that bypasses the `assets` FK via + /// `PRAGMA foreign_keys = OFF` so the test is self-contained. + #[test] + fn repo_breakdown_counts_by_repo() { + let (dir, store) = open_fresh_store(); + + let db_path = dir.path().join("kebab.sqlite"); + let conn = rusqlite::Connection::open(&db_path).unwrap(); + conn.pragma_update(None, "foreign_keys", "OFF").unwrap(); + + // Doc 1: doc with repo = "my-repo" + conn.execute( + "INSERT INTO documents ( + doc_id, asset_id, workspace_path, + source_type, trust_level, parser_version, + doc_version, schema_version, + metadata_json, provenance_json, + created_at, updated_at + ) VALUES ( + 'doc-repo-1', 'asset-r1', 'my-repo/README.md', + 'markdown', 'primary', 'test-v1', + 1, 1, + '{\"repo\":\"my-repo\"}', '{}', + '2024-01-01T00:00:00Z', '2024-01-01T00:00:00Z' + )", + [], + ) + .unwrap(); + + // Doc 2: doc with repo absent (null in JSON) + conn.execute( + "INSERT INTO documents ( + doc_id, asset_id, workspace_path, + source_type, trust_level, parser_version, + doc_version, schema_version, + metadata_json, provenance_json, + created_at, updated_at + ) VALUES ( + 'doc-norepo-1', 'asset-r2', 'standalone/notes.md', + 'markdown', 'primary', 'test-v1', + 1, 1, + '{\"repo\":null}', '{}', + '2024-01-01T00:00:00Z', '2024-01-01T00:00:00Z' + )", + [], + ) + .unwrap(); + + drop(conn); // release side connection before querying via store + + let bd = store.repo_breakdown().unwrap(); + + // "my-repo" must appear with count 1 + assert_eq!( + bd.get("my-repo"), + Some(&1u32), + "expected my-repo=1 in repo_breakdown, got: {bd:?}" + ); + // null repo must NOT appear as any key + assert!( + !bd.contains_key("null"), + "null repo must not appear in breakdown, got: {bd:?}" + ); + // only one key total + assert_eq!(bd.len(), 1, "expected exactly 1 entry, got: {bd:?}"); + } } From 803d02b68b57747f3547a428ff041623315b9cde Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 20 May 2026 05:15:04 +0000 Subject: [PATCH 2/2] fix(dogfood): enforce workspace.include in walker (allow-list semantics) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit config.workspace.include was completely ignored by the walker — connector.rs log_scope_include_warning literally said "handled by extractor router" but no extractor router exists. Dogfooding (PR #142 1B + multi-root corpus kebab-docs + httpx + zod + lodash) showed user-set include of code+md still ingested 84 .png + 8 .pdf files. Fix: walker treats scope.include as an allow-list — empty Vec preserves backward-compat (all files pass), non-empty requires file path to match at least one pattern (AND with the existing exclude rules). Removed the misleading debug log. Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 1 + crates/kebab-source-fs/Cargo.toml | 1 + crates/kebab-source-fs/src/connector.rs | 12 +- crates/kebab-source-fs/src/walker.rs | 73 ++++++++++-- .../tests/include_allowlist.rs | 111 ++++++++++++++++++ 5 files changed, 175 insertions(+), 23 deletions(-) create mode 100644 crates/kebab-source-fs/tests/include_allowlist.rs diff --git a/Cargo.lock b/Cargo.lock index d457c88..0ba566b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4466,6 +4466,7 @@ version = "0.8.1" dependencies = [ "anyhow", "blake3", + "globset", "ignore", "kebab-config", "kebab-core", diff --git a/crates/kebab-source-fs/Cargo.toml b/crates/kebab-source-fs/Cargo.toml index 5c976cd..68ea7be 100644 --- a/crates/kebab-source-fs/Cargo.toml +++ b/crates/kebab-source-fs/Cargo.toml @@ -18,6 +18,7 @@ blake3 = { workspace = true } tracing = { workspace = true } walkdir = "2" ignore = "0.4" +globset = "0.4" [dev-dependencies] serde_json = { workspace = true } diff --git a/crates/kebab-source-fs/src/connector.rs b/crates/kebab-source-fs/src/connector.rs index c61da8d..fd73734 100644 --- a/crates/kebab-source-fs/src/connector.rs +++ b/crates/kebab-source-fs/src/connector.rs @@ -86,7 +86,7 @@ impl FsSourceConnector { excludes.extend(scope.exclude.iter().cloned()); let kbignore = read_kbignore(&root)?; - let overrides = build_overrides(&root, &excludes, &kbignore)?; + let overrides = build_overrides(&root, &excludes, &kbignore, &scope.include)?; Ok((root, overrides)) } @@ -103,8 +103,6 @@ impl FsSourceConnector { ) -> Result<(Vec, FsScanSkips)> { let (root, overrides) = self.resolve_scan_params(scope)?; - log_scope_include_warning(scope); - let (files, skipped_entries) = walk_files_with_skips(&root, &overrides)?; // Accumulate per-category skip counts and sample paths. @@ -284,14 +282,6 @@ fn build_assets( Ok(assets) } -fn log_scope_include_warning(scope: &SourceScope) { - if !scope.include.is_empty() { - tracing::debug!( - count = scope.include.len(), - "FsSourceConnector ignores scope.include — handled by extractor router" - ); - } -} impl SourceConnector for FsSourceConnector { fn scan(&self, scope: &SourceScope) -> Result> { diff --git a/crates/kebab-source-fs/src/walker.rs b/crates/kebab-source-fs/src/walker.rs index 8eaf0ff..9f6a540 100644 --- a/crates/kebab-source-fs/src/walker.rs +++ b/crates/kebab-source-fs/src/walker.rs @@ -44,6 +44,7 @@ use std::collections::HashSet; use std::path::{Path, PathBuf}; use anyhow::{Context, Result}; +use globset::{GlobBuilder, GlobSet, GlobSetBuilder}; use ignore::overrides::{Override, OverrideBuilder}; use walkdir::WalkDir; @@ -69,6 +70,11 @@ const DEFAULT_EXCLUDES: &[&str] = &[ /// /// `default_and_config` covers DEFAULT_EXCLUDES + `config.workspace.exclude` /// — these do NOT map to any of the three named `IngestReport` counters. +/// +/// `include` is the compiled `scope.include` allow-list. When the set is +/// empty (no patterns) every file passes; when non-empty a file must match +/// at least one pattern to be accepted (directories always pass, so the +/// walker can still descend into them). pub(crate) struct WalkOverrides { /// Merged matcher — same as today's `Override`; used for the walk decision. pub combined: Override, @@ -78,6 +84,8 @@ pub(crate) struct WalkOverrides { pub kebabignore: Override, /// Matcher built from `kebab_parse_code::BUILTIN_BLACKLIST` only. pub builtin: Override, + /// Compiled allow-list from `scope.include`. Empty set = pass all. + pub include: GlobSet, } /// Skip attribution category. Used by the connector when counting per-source @@ -161,10 +169,15 @@ fn build_single_matcher_owned(root: &Path, patterns: &[String]) -> Result Result { let gitignore_patterns = read_gitignore(root)?; @@ -209,14 +222,41 @@ pub(crate) fn build_overrides( .build() .context("failed to compile combined override set")?; + // Allow-list GlobSet: empty Vec → matches nothing (= pass all); non-empty + // → file must match at least one glob to be accepted. We compile with + // `case_insensitive=false` to keep the semantics consistent with the + // OverrideBuilder exclude patterns above. + let include = build_include_globset(include_patterns)?; + Ok(WalkOverrides { combined, gitignore, kebabignore, builtin, + include, }) } +/// Compile `scope.include` patterns into a `GlobSet` allow-list. +/// +/// Each pattern uses `GlobBuilder` with `literal_separator = true` so that +/// `**` can cross directory boundaries while `*` stops at `/`, matching the +/// gitignore convention used throughout the rest of the walker. +/// +/// An empty slice produces an empty `GlobSet` — callers interpret that as +/// "pass all files" (no allow-list constraint). +fn build_include_globset(patterns: &[String]) -> Result { + let mut builder = GlobSetBuilder::new(); + for pat in patterns { + let glob = GlobBuilder::new(pat) + .literal_separator(true) + .build() + .with_context(|| format!("invalid include pattern: {pat}"))?; + builder.add(glob); + } + builder.build().context("failed to compile include globset") +} + /// Classify why a path was excluded, using per-source matchers in spec §5.2 /// priority order: built-in > gitignore > kebabignore > other. /// @@ -391,6 +431,13 @@ pub(crate) fn walk_files_with_skips( } if entry.file_type().is_file() { + // Apply include allow-list: if non-empty, the file's path + // relative to root must match at least one pattern. + if !overrides.include.is_empty() && !overrides.include.is_match(rel) { + // Not in the allow-list — silently drop (no skip counter; + // the include filter is not a "skip" source in IngestReport). + continue; + } accepted.push(path.to_path_buf()); } } @@ -406,7 +453,7 @@ mod tests { #[test] fn empty_inputs_compile_into_an_override() { let dir = tempfile::tempdir().unwrap(); - let ov = build_overrides(dir.path(), &[], &[]).unwrap(); + let ov = build_overrides(dir.path(), &[], &[], &[]).unwrap(); // Default-excludes only; non-special files should not match. let m = ov.combined.matched(Path::new("notes/alpha.md"), false); assert!(!m.is_ignore()); @@ -415,7 +462,7 @@ mod tests { #[test] fn default_excludes_ds_store_and_resource_forks() { let dir = tempfile::tempdir().unwrap(); - let ov = build_overrides(dir.path(), &[], &[]).unwrap(); + let ov = build_overrides(dir.path(), &[], &[], &[]).unwrap(); assert!(ov.combined.matched(Path::new(".DS_Store"), false).is_ignore()); assert!( ov.combined.matched(Path::new("notes/.DS_Store"), false).is_ignore() @@ -433,6 +480,7 @@ mod tests { dir.path(), &["*.tmp".to_string(), "node_modules/**".to_string()], &[], + &[], ) .unwrap(); assert!(ov.combined.matched(Path::new("a.tmp"), false).is_ignore()); @@ -452,6 +500,7 @@ mod tests { dir.path(), &["*.tmp".to_string()], &["secret/**".to_string()], + &[], ) .unwrap(); assert!(ov.combined.matched(Path::new("a.tmp"), false).is_ignore()); @@ -491,7 +540,7 @@ mod tests { fs::write(root.join("src/main.rs"), "x").unwrap(); fs::write(root.join("node_modules/foo/bar.js"), "x").unwrap(); - let overrides = build_overrides(root, &[], &[]).unwrap(); + let overrides = build_overrides(root, &[], &[], &[]).unwrap(); // Override::matched expects paths relative to the builder's root. let m_in = overrides.combined.matched(Path::new("src/main.rs"), false); let m_out = overrides.combined.matched(Path::new("node_modules/foo/bar.js"), false); @@ -514,7 +563,7 @@ mod tests { fs::create_dir_all(root.join("ok")).unwrap(); fs::write(root.join("ok/z.txt"), "z").unwrap(); - let overrides = build_overrides(root, &[], &[]).unwrap(); + let overrides = build_overrides(root, &[], &[], &[]).unwrap(); // Override::matched expects paths relative to the builder's root. for blacklisted in [ "target/x/y.txt", @@ -544,7 +593,7 @@ mod tests { fs::create_dir_all(root.join("dist")).unwrap(); fs::write(root.join("dist/bundle.js"), "x").unwrap(); - let overrides = build_overrides(root, &[], &[]).unwrap(); + let overrides = build_overrides(root, &[], &[], &[]).unwrap(); assert!(overrides.combined.matched(Path::new("a.log"), false).is_ignore()); assert!(overrides.combined.matched(Path::new("dist/bundle.js"), false).is_ignore()); assert!(!overrides.combined.matched(Path::new("src/main.rs"), false).is_ignore()); @@ -562,7 +611,7 @@ mod tests { fs::write(root.join("src/main.rs"), "x").unwrap(); // No .gitignore present — patterns from .gitignore should not affect overrides. - let overrides = build_overrides(root, &[], &[]).unwrap(); + let overrides = build_overrides(root, &[], &[], &[]).unwrap(); assert!(!overrides.combined.matched(Path::new("a.log"), false).is_ignore()); assert!(!overrides.combined.matched(Path::new("src/main.rs"), false).is_ignore()); } @@ -577,7 +626,7 @@ mod tests { // semantics, but at minimum it must not produce double-`!` corruption. fs::write(root.join(".gitignore"), "!keep/\n").unwrap(); // Just verify build_overrides doesn't error. - let result = build_overrides(root, &[], &[]); + let result = build_overrides(root, &[], &[], &[]); assert!(result.is_ok(), "should not error on negation pattern: {:?}", result.err()); } @@ -594,7 +643,7 @@ mod tests { // .gitignore entry. Builtin must win (priority order §5.2). fs::write(root.join(".gitignore"), "node_modules/\n").unwrap(); - let ov = build_overrides(root, &[], &[]).unwrap(); + let ov = build_overrides(root, &[], &[], &[]).unwrap(); // node_modules/ dir itself let cat = classify_skip(Path::new("node_modules"), true, &ov); assert_eq!(cat, SkipCategory::BuiltinBlacklist, "builtin must have priority"); @@ -609,7 +658,7 @@ mod tests { let root = tmp.path(); fs::write(root.join(".gitignore"), "*.log\n").unwrap(); - let ov = build_overrides(root, &[], &[]).unwrap(); + let ov = build_overrides(root, &[], &[], &[]).unwrap(); let cat = classify_skip(Path::new("app.log"), false, &ov); assert_eq!(cat, SkipCategory::Gitignore); } @@ -621,7 +670,7 @@ mod tests { let tmp = TempDir::new().unwrap(); let root = tmp.path(); - let ov = build_overrides(root, &[], &["*.secret".to_string()]).unwrap(); + let ov = build_overrides(root, &[], &["*.secret".to_string()], &[]).unwrap(); let cat = classify_skip(Path::new("creds.secret"), false, &ov); assert_eq!(cat, SkipCategory::Kebabignore); } @@ -637,7 +686,7 @@ mod tests { fs::write(root.join("ok.md"), "# ok").unwrap(); fs::write(root.join("skipme.log"), "x").unwrap(); - let ov = build_overrides(root, &[], &[]).unwrap(); + let ov = build_overrides(root, &[], &[], &[]).unwrap(); let (accepted, skipped_entries) = walk_files_with_skips(root, &ov).unwrap(); let accepted_names: Vec<_> = accepted @@ -677,7 +726,7 @@ mod tests { fs::write(root.join("node_modules/foo/bar.js"), "x").unwrap(); fs::write(root.join("ok.md"), "# ok").unwrap(); - let ov = build_overrides(root, &[], &[]).unwrap(); + let ov = build_overrides(root, &[], &[], &[]).unwrap(); let (accepted, skipped_entries) = walk_files_with_skips(root, &ov).unwrap(); let accepted_names: Vec<_> = accepted diff --git a/crates/kebab-source-fs/tests/include_allowlist.rs b/crates/kebab-source-fs/tests/include_allowlist.rs new file mode 100644 index 0000000..3ccf7d1 --- /dev/null +++ b/crates/kebab-source-fs/tests/include_allowlist.rs @@ -0,0 +1,111 @@ +//! Integration test: `scope.include` enforces an allow-list. +//! +//! Semantics (gitignore convention): +//! - `include` is empty Vec → all files pass through (backward-compat). +//! - `include` is non-empty → only files matching at least one pattern +//! are accepted. `exclude` rules still apply after include. +//! +//! Layout (built per-test in a TempDir): +//! root/ +//! ├── a.md +//! ├── b.py +//! ├── c.png +//! └── d.pdf + +use std::fs; + +use kebab_config::Config; +use kebab_core::{SourceConnector, SourceScope}; +use kebab_source_fs::FsSourceConnector; + +fn cfg_with_root(root: &str) -> Config { + let mut c = Config::defaults(); + c.workspace.root = root.to_string(); + c.workspace.exclude.clear(); + // Disable size / generated caps so small test files always pass. + c.ingest.code.max_file_bytes = u64::MAX; + c.ingest.code.max_file_lines = u32::MAX; + c.ingest.code.skip_generated_header = false; + c +} + +fn setup_mixed_dir() -> tempfile::TempDir { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + fs::write(root.join("a.md"), b"md").unwrap(); + fs::write(root.join("b.py"), b"py").unwrap(); + fs::write(root.join("c.png"), b"\x89PNG").unwrap(); + fs::write(root.join("d.pdf"), b"%PDF").unwrap(); + dir +} + +/// Empty include → all 4 files pass (backward-compat). +#[test] +fn include_empty_accepts_all_files() { + let dir = setup_mixed_dir(); + let conn = FsSourceConnector::new(&cfg_with_root(dir.path().to_str().unwrap())).unwrap(); + let scope = SourceScope { + include: vec![], + ..SourceScope::default() + }; + let assets = conn.scan(&scope).unwrap(); + let names: Vec<_> = assets.iter().map(|a| a.workspace_path.0.clone()).collect(); + assert!(names.contains(&"a.md".to_string()), "a.md missing; got: {names:?}"); + assert!(names.contains(&"b.py".to_string()), "b.py missing; got: {names:?}"); + assert!(names.contains(&"c.png".to_string()), "c.png missing; got: {names:?}"); + assert!(names.contains(&"d.pdf".to_string()), "d.pdf missing; got: {names:?}"); + assert_eq!(names.len(), 4, "expected exactly 4 files; got: {names:?}"); +} + +/// Non-empty include → only md + py come back; png + pdf are excluded. +#[test] +fn include_nonempty_is_allowlist() { + let dir = setup_mixed_dir(); + let conn = FsSourceConnector::new(&cfg_with_root(dir.path().to_str().unwrap())).unwrap(); + let scope = SourceScope { + include: vec!["**/*.md".to_string(), "**/*.py".to_string()], + ..SourceScope::default() + }; + let assets = conn.scan(&scope).unwrap(); + let names: Vec<_> = assets.iter().map(|a| a.workspace_path.0.clone()).collect(); + assert!(names.contains(&"a.md".to_string()), "a.md should be accepted; got: {names:?}"); + assert!(names.contains(&"b.py".to_string()), "b.py should be accepted; got: {names:?}"); + assert!( + !names.contains(&"c.png".to_string()), + "c.png must be rejected by include allowlist; got: {names:?}" + ); + assert!( + !names.contains(&"d.pdf".to_string()), + "d.pdf must be rejected by include allowlist; got: {names:?}" + ); + assert_eq!(names.len(), 2, "expected exactly 2 files; got: {names:?}"); +} + +/// include + exclude are ANDed: a file matching include but also matching +/// exclude must be rejected. +#[test] +fn include_and_exclude_are_anded() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path(); + fs::write(root.join("keep.md"), b"keep").unwrap(); + fs::write(root.join("drop.md"), b"drop").unwrap(); + fs::write(root.join("other.py"), b"py").unwrap(); + + let conn = FsSourceConnector::new(&cfg_with_root(root.to_str().unwrap())).unwrap(); + let scope = SourceScope { + include: vec!["**/*.md".to_string()], + exclude: vec!["drop.md".to_string()], + ..SourceScope::default() + }; + let assets = conn.scan(&scope).unwrap(); + let names: Vec<_> = assets.iter().map(|a| a.workspace_path.0.clone()).collect(); + assert!(names.contains(&"keep.md".to_string()), "keep.md should be accepted; got: {names:?}"); + assert!( + !names.contains(&"drop.md".to_string()), + "drop.md should be excluded (matched exclude); got: {names:?}" + ); + assert!( + !names.contains(&"other.py".to_string()), + "other.py should be excluded (not in include); got: {names:?}" + ); +}