fix(source-fs): apply size limit only to code files; PDF/image/markdown bypass walker cap (Bug #2)

v0.20.0 sub-item 1 dogfood report 의 Bug #2 — `[ingest.code].max_file_bytes`
가 walker 단계의 모든 file 에 일률 적용 → PDF/image/markdown 의 대부분 (256 KB+)
이 walker pre-extract skip. fix:

- `crates/kebab-source-fs/src/code_meta.rs`: `pub(crate) fn is_code_file(path)
  -> bool` helper 추가 (= `code_lang_for_path(path).is_some()`).
- `crates/kebab-source-fs/src/connector.rs:168-190`: walker size-cap check 가
  `is_code_file(&abs_path) && is_oversized(...)` short-circuit. PDF/image/
  markdown 는 walker bypass — parser 의 자체 size control (lopdf load_mem,
  image OCR max_pixels) 가 cover.
- `crates/kebab-source-fs/src/connector.rs` 기존 mod tests 안 추가:
  `size_cap_skips_only_code_files` — 300 KB PDF + MD + .rs 의 walker 결과
  검증. 기존 sibling test (huge.rs / longfile.rs, fixture 명 `.rs`) regression 0.

spec:  docs/superpowers/specs/2026-05-27-v0.20-sub1-bugfix-spec.md (§3)
plan:  docs/superpowers/plans/2026-05-27-v0.20-sub1-bugfix-plan.md (Step 1)
prior: b4d9e60 (PR #189)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-27 13:20:38 +00:00
parent b4d9e60816
commit d9acda517a
2 changed files with 44 additions and 8 deletions

View File

@@ -109,6 +109,15 @@ pub(crate) fn is_generated_file(path: &Path) -> Result<bool> {
)
}
/// Reports whether `path` is treated as a code file, i.e. whether
/// `code_lang_for_path` yields a language for it.
///
/// The walker consults this so that the `[ingest.code].max_file_bytes` /
/// `max_file_lines` limits are enforced for code files only; PDF, image
/// and markdown inputs are left to the size controls of their own parsers.
pub(crate) fn is_code_file(path: &Path) -> bool {
    let detected_lang = code_lang_for_path(path);
    detected_lang.is_some()
}
/// Check if `path` exceeds `max_bytes` or `max_lines`. Byte cap first
/// (cheap), then line cap (streaming with early exit).
pub(crate) fn is_oversized(path: &Path, max_bytes: u64, max_lines: u32) -> Result<bool> {

View File

@@ -165,13 +165,16 @@ impl FsSourceConnector {
continue;
}
// Size-cap check (byte or line limit).
if crate::code_meta::is_oversized(
&abs_path,
self.max_file_bytes,
self.max_file_lines,
)
.unwrap_or(false)
// v0.20.0 sub-item 1 bugfix (#2): size-cap applies ONLY to
// code files. PDF/image/markdown bypass — their parsers
// have their own size controls. spec §3.3.
if crate::code_meta::is_code_file(&abs_path)
&& crate::code_meta::is_oversized(
&abs_path,
self.max_file_bytes,
self.max_file_lines,
)
.unwrap_or(false)
{
fs_skips.skipped_size_exceeded =
fs_skips.skipped_size_exceeded.saturating_add(1);
@@ -184,7 +187,7 @@ impl FsSourceConnector {
path = %rel_path.display(),
max_bytes = self.max_file_bytes,
max_lines = self.max_file_lines,
"skip: file exceeds size cap"
"skip: code file exceeds size cap"
);
continue;
}
@@ -764,4 +767,28 @@ mod tests {
skips.skipped_size_exceeded
);
}
/// The walker size cap must drop an oversized `.rs` file while letting
/// equally oversized PDF and markdown files through.
#[test]
fn size_cap_skips_only_code_files() {
    let workdir = tempfile::tempdir().unwrap();
    let base = workdir.path().to_path_buf();

    // One 300_000-byte fixture per kind; each is above the 262_144-byte cap.
    for (file_name, fill) in [("paper.pdf", b'%'), ("notes.md", b'#'), ("big.rs", b'/')] {
        std::fs::write(base.join(file_name), vec![fill; 300_000]).unwrap();
    }

    let config = cfg_with_size_cap(base.to_str().unwrap(), 262_144, 5_000);
    let connector = FsSourceConnector::new(&config).unwrap();
    let (assets, skips) = connector
        .scan_with_skips(&SourceScope::default())
        .unwrap();

    let paths: Vec<&str> = assets
        .iter()
        .map(|asset| asset.workspace_path.0.as_str())
        .collect();
    // True when any scanned asset path mentions `needle`.
    let listed = |needle: &str| paths.iter().any(|p| p.contains(needle));

    assert!(listed("paper.pdf"), "PDF must pass: {paths:?}");
    assert!(listed("notes.md"), "MD must pass: {paths:?}");
    assert!(!listed("big.rs"), "code file must skip: {paths:?}");

    // Exactly one size-exceeded example is recorded, and it is the code file.
    let size_skipped = &skips.skip_examples.size_exceeded;
    assert_eq!(size_skipped.len(), 1);
    assert!(size_skipped[0].contains("big.rs"));
}
}