diff --git a/crates/kebab-source-fs/src/code_meta.rs b/crates/kebab-source-fs/src/code_meta.rs index afb7b53..d09ec7a 100644 --- a/crates/kebab-source-fs/src/code_meta.rs +++ b/crates/kebab-source-fs/src/code_meta.rs @@ -109,6 +109,15 @@ pub(crate) fn is_generated_file(path: &Path) -> Result { ) } +/// Returns true when `path`'s filename/extension is recognised as a code +/// file (per `code_lang_for_path`). Used by the walker to apply +/// `[ingest.code].max_file_bytes` / `max_file_lines` only to code files, +/// not to PDF/image/markdown (which have their own size controls in +/// their respective parsers). +pub(crate) fn is_code_file(path: &Path) -> bool { + code_lang_for_path(path).is_some() +} + /// Check if `path` exceeds `max_bytes` or `max_lines`. Byte cap first /// (cheap), then line cap (streaming with early exit). pub(crate) fn is_oversized(path: &Path, max_bytes: u64, max_lines: u32) -> Result { diff --git a/crates/kebab-source-fs/src/connector.rs b/crates/kebab-source-fs/src/connector.rs index 288b8a3..95ddc57 100644 --- a/crates/kebab-source-fs/src/connector.rs +++ b/crates/kebab-source-fs/src/connector.rs @@ -165,13 +165,16 @@ impl FsSourceConnector { continue; } - // Size-cap check (byte or line limit). - if crate::code_meta::is_oversized( - &abs_path, - self.max_file_bytes, - self.max_file_lines, - ) - .unwrap_or(false) + // v0.20.0 sub-item 1 bugfix (#2): size-cap applies ONLY to + // code files. PDF/image/markdown bypass — their parsers + // have their own size controls. spec §3.3. + if crate::code_meta::is_code_file(&abs_path) + && crate::code_meta::is_oversized( + &abs_path, + self.max_file_bytes, + self.max_file_lines, + ) + .unwrap_or(false) { fs_skips.skipped_size_exceeded = fs_skips.skipped_size_exceeded.saturating_add(1); @@ -184,7 +187,7 @@ impl FsSourceConnector { path = %rel_path.display(), max_bytes = self.max_file_bytes, max_lines = self.max_file_lines, - "skip: file exceeds size cap" + "skip: code file exceeds size cap" ); continue; } @@ -764,4 +767,28 @@ mod tests { skips.skipped_size_exceeded ); } + + #[test] + fn size_cap_skips_only_code_files() { + use tempfile::tempdir; + let tmp = tempdir().unwrap(); + let root = tmp.path().to_path_buf(); + + // 300 KB pdf / md / rs (each > 262 144 byte cap) + std::fs::write(root.join("paper.pdf"), vec![b'%'; 300_000]).unwrap(); + std::fs::write(root.join("notes.md"), vec![b'#'; 300_000]).unwrap(); + std::fs::write(root.join("big.rs"), vec![b'/'; 300_000]).unwrap(); + + let cfg = cfg_with_size_cap(root.to_str().unwrap(), 262_144, 5_000); + let connector = FsSourceConnector::new(&cfg).unwrap(); + let (assets, skips) = connector.scan_with_skips(&SourceScope::default()).unwrap(); + + let paths: Vec<_> = assets.iter().map(|a| a.workspace_path.0.as_str()).collect(); + assert!(paths.iter().any(|p| p.contains("paper.pdf")), "PDF must pass: {paths:?}"); + assert!(paths.iter().any(|p| p.contains("notes.md")), "MD must pass: {paths:?}"); + assert!(!paths.iter().any(|p| p.contains("big.rs")), "code file must skip: {paths:?}"); + + assert_eq!(skips.skip_examples.size_exceeded.len(), 1); + assert!(skips.skip_examples.size_exceeded[0].contains("big.rs")); + } }