fix(source-fs): apply size limit only to code files; PDF/image/markdown bypass walker cap (Bug #2)
v0.20.0 sub-item 1 dogfood report 의 Bug #2 — `[ingest.code].max_file_bytes`
가 walker 단계의 모든 file 에 일률 적용 → PDF/image/markdown 의 대부분 (256 KB+)
이 walker pre-extract skip. fix:
- `crates/kebab-source-fs/src/code_meta.rs`: `pub(crate) fn is_code_file(path)
-> bool` helper 추가 (= `code_lang_for_path(path).is_some()`).
- `crates/kebab-source-fs/src/connector.rs:168-190`: walker size-cap check 가
`is_code_file(&abs_path) && is_oversized(...)` short-circuit. PDF/image/
markdown 는 walker bypass — parser 의 자체 size control (lopdf load_mem,
image OCR max_pixels) 가 cover.
- `crates/kebab-source-fs/src/connector.rs` 기존 mod tests 안 추가:
`size_cap_skips_only_code_files` — 300 KB PDF + MD + .rs 의 walker 결과
검증. 기존 sibling test (huge.rs / longfile.rs, fixture 명 `.rs`) regression 0.
spec: docs/superpowers/specs/2026-05-27-v0.20-sub1-bugfix-spec.md (§3)
plan: docs/superpowers/plans/2026-05-27-v0.20-sub1-bugfix-plan.md (Step 1)
prior: b4d9e60 (PR #189)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -109,6 +109,15 @@ pub(crate) fn is_generated_file(path: &Path) -> Result<bool> {
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns true when `path`'s filename/extension is recognised as a code
|
||||
/// file (per `code_lang_for_path`). Used by the walker to apply
|
||||
/// `[ingest.code].max_file_bytes` / `max_file_lines` only to code files,
|
||||
/// not to PDF/image/markdown (which have their own size controls in
|
||||
/// their respective parsers).
|
||||
pub(crate) fn is_code_file(path: &Path) -> bool {
|
||||
code_lang_for_path(path).is_some()
|
||||
}
|
||||
|
||||
/// Check if `path` exceeds `max_bytes` or `max_lines`. Byte cap first
|
||||
/// (cheap), then line cap (streaming with early exit).
|
||||
pub(crate) fn is_oversized(path: &Path, max_bytes: u64, max_lines: u32) -> Result<bool> {
|
||||
|
||||
@@ -165,13 +165,16 @@ impl FsSourceConnector {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Size-cap check (byte or line limit).
|
||||
if crate::code_meta::is_oversized(
|
||||
&abs_path,
|
||||
self.max_file_bytes,
|
||||
self.max_file_lines,
|
||||
)
|
||||
.unwrap_or(false)
|
||||
// v0.20.0 sub-item 1 bugfix (#2): size-cap applies ONLY to
|
||||
// code files. PDF/image/markdown bypass — their parsers
|
||||
// have their own size controls. spec §3.3.
|
||||
if crate::code_meta::is_code_file(&abs_path)
|
||||
&& crate::code_meta::is_oversized(
|
||||
&abs_path,
|
||||
self.max_file_bytes,
|
||||
self.max_file_lines,
|
||||
)
|
||||
.unwrap_or(false)
|
||||
{
|
||||
fs_skips.skipped_size_exceeded =
|
||||
fs_skips.skipped_size_exceeded.saturating_add(1);
|
||||
@@ -184,7 +187,7 @@ impl FsSourceConnector {
|
||||
path = %rel_path.display(),
|
||||
max_bytes = self.max_file_bytes,
|
||||
max_lines = self.max_file_lines,
|
||||
"skip: file exceeds size cap"
|
||||
"skip: code file exceeds size cap"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
@@ -764,4 +767,28 @@ mod tests {
|
||||
skips.skipped_size_exceeded
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn size_cap_skips_only_code_files() {
    let workdir = tempfile::tempdir().unwrap();
    let root = workdir.path().to_path_buf();

    // Three 300 000-byte fixtures, each above the 262 144-byte (256 KiB) cap
    // configured below: one PDF, one markdown file and one Rust source file.
    for (name, fill) in [("paper.pdf", b'%'), ("notes.md", b'#'), ("big.rs", b'/')] {
        std::fs::write(root.join(name), vec![fill; 300_000]).unwrap();
    }

    let cfg = cfg_with_size_cap(root.to_str().unwrap(), 262_144, 5_000);
    let connector = FsSourceConnector::new(&cfg).unwrap();
    let (assets, skips) = connector.scan_with_skips(&SourceScope::default()).unwrap();

    let paths: Vec<_> = assets.iter().map(|a| a.workspace_path.0.as_str()).collect();
    let listed = |needle: &str| paths.iter().any(|p| p.contains(needle));

    // Non-code files must come through the scan despite exceeding the cap;
    // the oversized code file must not.
    assert!(listed("paper.pdf"), "PDF must pass: {paths:?}");
    assert!(listed("notes.md"), "MD must pass: {paths:?}");
    assert!(!listed("big.rs"), "code file must skip: {paths:?}");

    // Exactly one size-exceeded skip example is expected, and it names the
    // code file.
    let size_examples = &skips.skip_examples.size_exceeded;
    assert_eq!(size_examples.len(), 1);
    assert!(size_examples[0].contains("big.rs"));
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user