From 4e8b70a04b756fb178fadb6c9e87daa533f47fe0 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Fri, 15 May 2026 17:06:59 +0900
Subject: [PATCH] feat(p10-1a-1): apply generated-header + size-cap skip per
file
Wire kebab_parse_code::is_generated_file and is_oversized into
FsSourceConnector::scan_with_skips. Files that pass gitignore/builtin/
kebabignore matching are now checked for generated-file markers
(config-gated via ingest.code.skip_generated_header) and byte/line caps
(ingest.code.max_file_bytes / max_file_lines). FsScanSkips gains
skipped_generated + skipped_size_exceeded counters; kebab-app threads
them into IngestReport. Also fixes a pre-existing clippy::derivable_impls
warning by deriving Default for IngestCfg instead of hand-writing the impl.
Three new connector tests cover the generated-header, byte-cap, and
line-cap skip paths.
Co-Authored-By: Claude Sonnet 4.6
---
crates/kebab-app/src/lib.rs | 4 +-
crates/kebab-config/src/lib.rs | 10 +-
crates/kebab-source-fs/src/connector.rs | 168 +++++++++++++++++++++++-
3 files changed, 170 insertions(+), 12 deletions(-)
diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs
index 99f5597..e0a0375 100644
--- a/crates/kebab-app/src/lib.rs
+++ b/crates/kebab-app/src/lib.rs
@@ -678,8 +678,8 @@ pub fn ingest_with_config_opts(
skipped_gitignore: fs_skips.skipped_gitignore,
skipped_kebabignore: fs_skips.skipped_kebabignore,
skipped_builtin_blacklist: fs_skips.skipped_builtin_blacklist,
- skipped_generated: 0,
- skipped_size_exceeded: 0,
+ skipped_generated: fs_skips.skipped_generated,
+ skipped_size_exceeded: fs_skips.skipped_size_exceeded,
skip_examples: fs_skips.skip_examples,
items: if summary_only { None } else { Some(items) },
})
diff --git a/crates/kebab-config/src/lib.rs b/crates/kebab-config/src/lib.rs
index b9d1c4d..1117713 100644
--- a/crates/kebab-config/src/lib.rs
+++ b/crates/kebab-config/src/lib.rs
@@ -272,20 +272,12 @@ impl UiCfg {
/// p10-1A-1: top-level ingest configuration wrapper. Contains per-media-type
/// sub-sections; currently only `code` is defined.
-#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
#[serde(default)]
pub struct IngestCfg {
pub code: IngestCodeCfg,
}
-impl Default for IngestCfg {
- fn default() -> Self {
- Self {
- code: IngestCodeCfg::default(),
- }
- }
-}
-
/// p10-1A-1: settings for the code ingest pipeline. All fields have
/// reasonable defaults so the user need not set anything in `config.toml`
/// to get working code ingest.
diff --git a/crates/kebab-source-fs/src/connector.rs b/crates/kebab-source-fs/src/connector.rs
index 599673e..c61da8d 100644
--- a/crates/kebab-source-fs/src/connector.rs
+++ b/crates/kebab-source-fs/src/connector.rs
@@ -36,10 +36,16 @@ use crate::walker::{SkipCategory, WalkOverrides, build_overrides, read_kbignore,
/// construction time.
/// - `copy_threshold_bytes`: `config.storage.copy_threshold_mb * 1 MiB`
/// pre-multiplied so we don't recompute per file.
+/// - `skip_generated_header`: `config.ingest.code.skip_generated_header`.
+/// - `max_file_bytes`: `config.ingest.code.max_file_bytes`.
+/// - `max_file_lines`: `config.ingest.code.max_file_lines`.
pub struct FsSourceConnector {
default_root: PathBuf,
default_exclude: Vec<String>,
copy_threshold_bytes: u64,
+ skip_generated_header: bool,
+ max_file_bytes: u64,
+ max_file_lines: u32,
}
impl FsSourceConnector {
@@ -59,6 +65,9 @@ impl FsSourceConnector {
default_root: root,
default_exclude: config.workspace.exclude.clone(),
copy_threshold_bytes,
+ skip_generated_header: config.ingest.code.skip_generated_header,
+ max_file_bytes: config.ingest.code.max_file_bytes,
+ max_file_lines: config.ingest.code.max_file_lines,
})
}
@@ -133,7 +142,59 @@ impl FsSourceConnector {
}
}
- let assets = build_assets(&files, &root, self.copy_threshold_bytes)?;
+ // p10-1A-1: per-file generated-header + size-cap checks on files that passed
+ // the override (gitignore/builtin/kebabignore) matching; runs AFTER walk-level
+ // skip attribution, BEFORE parse dispatch. A check that errors keeps the file.
+ let mut accepted_files: Vec<PathBuf> = Vec::with_capacity(files.len());
+ for abs_path in files {
+ let rel_path = abs_path.strip_prefix(&root).unwrap_or(&abs_path);
+
+ // Generated-header sniff (config-gated).
+ if self.skip_generated_header
+ && kebab_parse_code::is_generated_file(&abs_path).unwrap_or(false)
+ {
+ fs_skips.skipped_generated =
+ fs_skips.skipped_generated.saturating_add(1);
+ push_sample(
+ &mut fs_skips.skip_examples.generated,
+ &abs_path,
+ &root,
+ );
+ tracing::debug!(
+ path = %rel_path.display(),
+ "skip: generated-file marker detected"
+ );
+ continue;
+ }
+
+ // Size-cap check (byte or line limit).
+ if kebab_parse_code::is_oversized(
+ &abs_path,
+ self.max_file_bytes,
+ self.max_file_lines,
+ )
+ .unwrap_or(false)
+ {
+ fs_skips.skipped_size_exceeded =
+ fs_skips.skipped_size_exceeded.saturating_add(1);
+ push_sample(
+ &mut fs_skips.skip_examples.size_exceeded,
+ &abs_path,
+ &root,
+ );
+ tracing::debug!(
+ path = %rel_path.display(),
+ max_bytes = self.max_file_bytes,
+ max_lines = self.max_file_lines,
+ "skip: file exceeds size cap"
+ );
+ continue;
+ }
+
+ accepted_files.push(abs_path);
+ }
+
+ let assets = build_assets(&accepted_files, &root, self.copy_threshold_bytes)?;
Ok((assets, fs_skips))
}
}
@@ -147,6 +208,12 @@ pub struct FsScanSkips {
pub skipped_gitignore: u32,
pub skipped_kebabignore: u32,
pub skipped_builtin_blacklist: u32,
+ /// p10-1A-1: files skipped because their first ~512 bytes contained a
+ /// generated-file marker (`@generated`, `do not edit`, …).
+ pub skipped_generated: u32,
+ /// p10-1A-1: files skipped because they exceeded `max_file_bytes` or
+ /// `max_file_lines` in `[ingest.code]`.
+ pub skipped_size_exceeded: u32,
/// Sample paths per spec §5.5 (≤ 5 per category). Paths are
/// workspace-relative POSIX strings when available, absolute otherwise.
pub skip_examples: SkipExamples,
@@ -608,4 +675,103 @@ mod tests {
skips.skip_examples.gitignore
);
}
+
+ // ── p10-1A-1: generated-header + size-cap skip tests ────────────────────
+
+ /// Helper: config rooted at `root` with default ingest.code settings.
+ fn cfg_with_root_defaults(root: &str) -> Config {
+ // cfg_with_root already uses Config::defaults() which has
+ // skip_generated_header=true, max_file_bytes=262144, max_file_lines=5000.
+ cfg_with_root(root)
+ }
+
+ /// Helper: config rooted at `root` with overridden byte/line size caps.
+ fn cfg_with_size_cap(root: &str, max_bytes: u64, max_lines: u32) -> Config {
+ let mut c = cfg_with_root(root);
+ c.ingest.code.max_file_bytes = max_bytes;
+ c.ingest.code.max_file_lines = max_lines;
+ c
+ }
+
+ #[test]
+ fn ingest_report_counts_generated_files() {
+ let dir = tempfile::tempdir().unwrap();
+ let root = dir.path();
+ std::fs::write(root.join("normal.md"), "# hi").unwrap();
+ std::fs::write(root.join("autogen.rs"), "// @generated\nfn x() {}\n").unwrap();
+
+ let conn = FsSourceConnector::new(
+ &cfg_with_root_defaults(root.to_str().unwrap()),
+ )
+ .unwrap();
+ let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap();
+
+ assert!(
+ skips.skipped_generated >= 1,
+ "skipped_generated should be >= 1; got {}",
+ skips.skipped_generated
+ );
+ assert!(
+ skips.skip_examples.generated.iter().any(|p| p.contains("autogen")),
+ "skip_examples.generated should contain 'autogen'; got: {:?}",
+ skips.skip_examples.generated
+ );
+ // The normal.md file must NOT be skipped.
+ let asset_paths: Vec<_> = _assets
+ .iter()
+ .map(|a| a.workspace_path.0.clone())
+ .collect();
+ assert!(
+ asset_paths.iter().any(|p| p.contains("normal")),
+ "normal.md should still be emitted; assets: {asset_paths:?}"
+ );
+ }
+
+ #[test]
+ fn ingest_report_counts_oversized_files_by_bytes() {
+ let dir = tempfile::tempdir().unwrap();
+ let root = dir.path();
+ std::fs::write(root.join("normal.md"), "# hi").unwrap();
+ // Write a 2000-byte, 1000-line file: over the 1024-byte cap, under the line cap.
+ let big: String = "x\n".repeat(1_000);
+ std::fs::write(root.join("huge.rs"), &big).unwrap();
+
+ let conn = FsSourceConnector::new(
+ &cfg_with_size_cap(root.to_str().unwrap(), 1024, 5_000),
+ )
+ .unwrap();
+ let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap();
+
+ assert!(
+ skips.skipped_size_exceeded >= 1,
+ "skipped_size_exceeded should be >= 1; got {}",
+ skips.skipped_size_exceeded
+ );
+ assert!(
+ skips.skip_examples.size_exceeded.iter().any(|p| p.contains("huge")),
+ "skip_examples.size_exceeded should contain 'huge'; got: {:?}",
+ skips.skip_examples.size_exceeded
+ );
+ }
+
+ #[test]
+ fn ingest_report_size_cap_by_line_count() {
+ let dir = tempfile::tempdir().unwrap();
+ let root = dir.path();
+ // 6000 two-byte lines (12000 bytes, under the byte cap) — only the 5000-line cap should trigger.
+ let body: String = "x\n".repeat(6_000);
+ std::fs::write(root.join("longfile.rs"), &body).unwrap();
+
+ let conn = FsSourceConnector::new(
+ &cfg_with_size_cap(root.to_str().unwrap(), 262_144, 5_000),
+ )
+ .unwrap();
+ let (_assets, skips) = conn.scan_with_skips(&SourceScope::default()).unwrap();
+
+ assert!(
+ skips.skipped_size_exceeded >= 1,
+ "skipped_size_exceeded should be >= 1 (line cap); got {}",
+ skips.skipped_size_exceeded
+ );
+ }
}