From d9acda517a784c5b3c96e460ce771efebd3452ea Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 27 May 2026 13:20:38 +0000 Subject: [PATCH] fix(source-fs): apply size limit only to code files; PDF/image/markdown bypass walker cap (Bug #2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v0.20.0 sub-item 1 dogfood report 의 Bug #2 — `[ingest.code].max_file_bytes` 가 walker 단계의 모든 file 에 일률 적용 → PDF/image/markdown 의 대부분 (256 KB+) 이 walker pre-extract skip. fix: - `crates/kebab-source-fs/src/code_meta.rs`: `pub(crate) fn is_code_file(path) -> bool` helper 추가 (= `code_lang_for_path(path).is_some()`). - `crates/kebab-source-fs/src/connector.rs:168-190`: walker size-cap check 가 `is_code_file(&abs_path) && is_oversized(...)` short-circuit. PDF/image/ markdown 는 walker bypass — parser 의 자체 size control (lopdf load_mem, image OCR max_pixels) 가 cover. - `crates/kebab-source-fs/src/connector.rs` 기존 mod tests 안 추가: `size_cap_skips_only_code_files` — 300 KB PDF + MD + .rs 의 walker 결과 검증. 기존 sibling test (huge.rs / longfile.rs, fixture 명 `.rs`) regression 0. spec: docs/superpowers/specs/2026-05-27-v0.20-sub1-bugfix-spec.md (§3) plan: docs/superpowers/plans/2026-05-27-v0.20-sub1-bugfix-plan.md (Step 1) prior: b4d9e60 (PR #189) Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-source-fs/src/code_meta.rs | 9 ++++++ crates/kebab-source-fs/src/connector.rs | 43 ++++++++++++++++++++----- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/crates/kebab-source-fs/src/code_meta.rs b/crates/kebab-source-fs/src/code_meta.rs index afb7b53..d09ec7a 100644 --- a/crates/kebab-source-fs/src/code_meta.rs +++ b/crates/kebab-source-fs/src/code_meta.rs @@ -109,6 +109,15 @@ pub(crate) fn is_generated_file(path: &Path) -> Result { ) } +/// Returns true when `path`'s filename/extension is recognised as a code +/// file (per `code_lang_for_path`). Used by the walker to apply +/// `[ingest.code].max_file_bytes` / `max_file_lines` only to code files, +/// not to PDF/image/markdown (which have their own size controls in +/// their respective parsers). +pub(crate) fn is_code_file(path: &Path) -> bool { + code_lang_for_path(path).is_some() +} + /// Check if `path` exceeds `max_bytes` or `max_lines`. Byte cap first /// (cheap), then line cap (streaming with early exit). pub(crate) fn is_oversized(path: &Path, max_bytes: u64, max_lines: u32) -> Result { diff --git a/crates/kebab-source-fs/src/connector.rs b/crates/kebab-source-fs/src/connector.rs index 288b8a3..95ddc57 100644 --- a/crates/kebab-source-fs/src/connector.rs +++ b/crates/kebab-source-fs/src/connector.rs @@ -165,13 +165,16 @@ impl FsSourceConnector { continue; } - // Size-cap check (byte or line limit). - if crate::code_meta::is_oversized( - &abs_path, - self.max_file_bytes, - self.max_file_lines, - ) - .unwrap_or(false) + // v0.20.0 sub-item 1 bugfix (#2): size-cap applies ONLY to + // code files. PDF/image/markdown bypass — their parsers + // have their own size controls. spec §3.3. + if crate::code_meta::is_code_file(&abs_path) + && crate::code_meta::is_oversized( + &abs_path, + self.max_file_bytes, + self.max_file_lines, + ) + .unwrap_or(false) { fs_skips.skipped_size_exceeded = fs_skips.skipped_size_exceeded.saturating_add(1); @@ -184,7 +187,7 @@ impl FsSourceConnector { path = %rel_path.display(), max_bytes = self.max_file_bytes, max_lines = self.max_file_lines, - "skip: file exceeds size cap" + "skip: code file exceeds size cap" ); continue; } @@ -764,4 +767,28 @@ mod tests { skips.skipped_size_exceeded ); } + + #[test] + fn size_cap_skips_only_code_files() { + use tempfile::tempdir; + let tmp = tempdir().unwrap(); + let root = tmp.path().to_path_buf(); + + // 300 KB pdf / md / rs (each > 262 144 byte cap) + std::fs::write(root.join("paper.pdf"), vec![b'%'; 300_000]).unwrap(); + std::fs::write(root.join("notes.md"), vec![b'#'; 300_000]).unwrap(); + std::fs::write(root.join("big.rs"), vec![b'/'; 300_000]).unwrap(); + + let cfg = cfg_with_size_cap(root.to_str().unwrap(), 262_144, 5_000); + let connector = FsSourceConnector::new(&cfg).unwrap(); + let (assets, skips) = connector.scan_with_skips(&SourceScope::default()).unwrap(); + + let paths: Vec<_> = assets.iter().map(|a| a.workspace_path.0.as_str()).collect(); + assert!(paths.iter().any(|p| p.contains("paper.pdf")), "PDF must pass: {paths:?}"); + assert!(paths.iter().any(|p| p.contains("notes.md")), "MD must pass: {paths:?}"); + assert!(!paths.iter().any(|p| p.contains("big.rs")), "code file must skip: {paths:?}"); + + assert_eq!(skips.skip_examples.size_exceeded.len(), 1); + assert!(skips.skip_examples.size_exceeded[0].contains("big.rs")); + } }