refactor(source-fs): drop kebab-parse-code dep — 9 tree-sitter grammars drag 제거

kebab-source-fs 가 kebab-parse-code 의 9 tree-sitter grammars 를 drag 했던 무거운 의존성 제거. 4 surface (code_lang_for_path / is_generated_file / is_oversized / BUILTIN_BLACKLIST) 만 사용하지만 dep 그래프에서 9 grammar 전체 link → kebab-source-fs::code_meta 로 이전 + kebab-parse-code 측 cleanup.

핵심 변경:
- kebab-source-fs::code_meta 신설: 4 surface 이전 (BUILTIN_BLACKLIST `pub` for frozen contract + 3 helper fn `pub(crate)`). lib.rs 의 `pub use code_meta::BUILTIN_BLACKLIST` 1 줄 추가 (Option A — 다른 mod surface 무근거 확장 0).
- callsite migration: media.rs (1) + walker.rs (2) + connector.rs (2) 모두 `kebab_source_fs::code_meta::*` 로 갱신.
- kebab-parse-code 측 cleanup: skip.rs 삭제 + lang.rs narrow edit (code_lang_for_path body + unit test 2 + Path import 삭제, module_path_for_* 보존) + lib.rs 헤더 doc rewrite (migration breadcrumb 포함).
- tests/{lang,skip}.rs 13 test 이동 — 12 unit (`src/code_meta.rs::tests`) + 1 integration (`tests/code_meta.rs` for BUILTIN_BLACKLIST frozen contract).
- design §8 graph: edge 제거 + p10-2 inline note. ARCHITECTURE.md 산문 1 줄 갱신. kebab-core::metadata.rs:36 stale dep reference 정정.

G1+G5: cargo tree -p kebab-source-fs | grep tree-sitter = 0 줄.
G2+G3: workspace test 회귀 0 + 13 test 1:1 이동.
G4: design §8 + ARCHITECTURE.md 갱신.

Wire 영향: 없음 (internal Rust crate-API surface 만, user-facing 0). Cargo workspace.version bump 불필요.

Refs:
- docs/superpowers/specs/2026-05-26-source-fs-dep-lightening-spec.md (v3, 4-round APPROVE)
- docs/superpowers/plans/2026-05-26-source-fs-dep-lightening-plan.md (v4, 4-round ACCEPT)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-26 12:19:32 +00:00
parent b02ac8200e
commit bd48baa19a
18 changed files with 1518 additions and 322 deletions

View File

@@ -1,69 +1,6 @@
//! Canonical extension → language identifier mapping (spec §3.5).
//!
//! Lowercase canonical identifiers, matching tree-sitter parser conventions:
//! `rust`, `python`, `typescript`, `javascript`, `go`, `java`, `kotlin`, `c`,
//! `cpp`, `yaml`, `toml`, `json`, `shell`, `make`, `dockerfile`.
//! Manifest / build-file matches additionally yield `xml`, `groovy`, and
//! `go-mod`.
use std::path::Path;
/// Maps a file path to its canonical lowercase language identifier.
///
/// Returns `None` when neither the basename nor the extension is recognized.
///
/// Lookup order:
/// 1. Exact basename, Tier 1 (`Dockerfile`, `Makefile`, `GNUmakefile`).
/// 2. Exact basename, Tier 2 manifests (`Cargo.toml`, `package.json`,
///    `go.mod`, `pom.xml`, `build.gradle`, ...).
/// 3. `Dockerfile.<suffix>` with a non-empty suffix (e.g. `Dockerfile.dev`).
/// 4. Extension, compared ASCII case-insensitively (Tier 1 + Tier 2).
pub fn code_lang_for_path(path: &Path) -> Option<&'static str> {
    // Basename rules are case-sensitive and win over any extension rule.
    let by_name: Option<&'static str> = path
        .file_name()
        .and_then(|n| n.to_str())
        .and_then(|name| match name {
            // Tier 1 basenames
            "Dockerfile" => Some("dockerfile"),
            "Makefile" | "GNUmakefile" => Some("make"),
            // Tier 2 basenames (configuration / manifest files)
            "Cargo.toml" | "pyproject.toml" => Some("toml"),
            "package.json" | "tsconfig.json" => Some("json"),
            "go.mod" => Some("go-mod"),
            "pom.xml" => Some("xml"),
            "build.gradle" => Some("groovy"),
            // Tier 2 `Dockerfile.*` variants; a bare trailing dot does not count.
            other => other
                .strip_prefix("Dockerfile.")
                .filter(|suffix| !suffix.is_empty())
                .map(|_| "dockerfile"),
        });
    if by_name.is_some() {
        return by_name;
    }

    // Extension fallback; non-UTF-8 or missing extensions are unrecognized.
    let ext = path.extension().and_then(|e| e.to_str())?.to_ascii_lowercase();
    let lang = match ext.as_str() {
        // Tier 1 extensions
        "rs" => "rust",
        "py" | "pyi" => "python",
        "ts" | "tsx" | "mts" | "cts" => "typescript",
        "js" | "mjs" | "cjs" | "jsx" => "javascript",
        "go" => "go",
        "java" => "java",
        "kt" | "kts" => "kotlin",
        "c" | "h" => "c",
        "cpp" | "cc" | "cxx" | "hpp" | "hh" | "hxx" => "cpp",
        "sh" | "bash" | "zsh" => "shell",
        "mk" => "make",
        // Tier 2 extensions
        "yaml" | "yml" => "yaml",
        "toml" => "toml",
        "json" => "json",
        "xml" => "xml",
        "dockerfile" => "dockerfile",
        "gradle" => "groovy",
        _ => return None,
    };
    Some(lang)
}
//! Workspace-relative path → module-path conversion for P10-1B AST extractors
//! (Python dotted form / TS+JS slash form). 본 module 의 `code_lang_for_path`
//! 는 v0.18.0+ 부터 `kebab-source-fs::code_meta` 로 이동.
/// p10-1B: workspace-relative Python file path → dotted module-path prefix.
/// See plan §Task C for the exact rules + tasks/p10/p10-1b for the §3.4
@@ -142,28 +79,4 @@ mod tests {
assert_eq!(module_path_for_tsjs("a/b/c.ts"), "a/b/c");
assert_eq!(module_path_for_tsjs("packages/x/src/Foo.ts"), "packages/x/src/Foo");
}
/// Basename rules (Dockerfile family and manifest files) must resolve before
/// any extension-based rule is consulted.
#[test]
fn tier2_basename_takes_precedence_over_extension() {
    // Shorthand so every case reads as `input -> identifier`.
    let lang = |p: &str| code_lang_for_path(Path::new(p));

    // Dockerfile family: exact name, `Dockerfile.*` variant, `.dockerfile` extension.
    assert_eq!(lang("Dockerfile"), Some("dockerfile"));
    assert_eq!(lang("foo/Dockerfile.dev"), Some("dockerfile"));
    assert_eq!(lang("myapp.dockerfile"), Some("dockerfile"));

    // Manifest / build files matched by basename.
    assert_eq!(lang("repo/Cargo.toml"), Some("toml"));
    assert_eq!(lang("pyproject.toml"), Some("toml"));
    assert_eq!(lang("repo/package.json"), Some("json"));
    assert_eq!(lang("tsconfig.json"), Some("json"));
    assert_eq!(lang("go.mod"), Some("go-mod"));
    assert_eq!(lang("pom.xml"), Some("xml"));
    assert_eq!(lang("build.gradle"), Some("groovy"));
}
/// Files with no special basename fall back to the Tier 2 extension table.
#[test]
fn tier2_extension_fallback() {
    // Shorthand so every case reads as `input -> identifier`.
    let lang = |p: &str| code_lang_for_path(Path::new(p));

    assert_eq!(lang("k8s/deploy.yaml"), Some("yaml"));
    assert_eq!(lang("k8s/deploy.yml"), Some("yaml"));
    assert_eq!(lang("foo/bar.toml"), Some("toml"));
    assert_eq!(lang("foo/bar.json"), Some("json"));
    assert_eq!(lang("foo/bar.xml"), Some("xml"));
    assert_eq!(lang("foo/bar.gradle"), Some("groovy"));
}
}

View File

@@ -1,17 +1,10 @@
//! `kebab-parse-code` — language-aware parsing for code corpora.
//!
//! Phase 1A-1 ships infrastructure only:
//! Repo metadata (`detect_repo`) + per-language AST extractors (Rust = P10-1A-2, Python/TS/JS = P10-1B, Go = P10-1C-Go, Java+Kotlin = P10-1C-JK, C+C++ = P10-1D).
//!
//! - [`lang::code_lang_for_path`] — extension → language identifier.
//! - [`repo::detect_repo`] — `.git/` walk-up → repo / branch / commit metadata.
//! - [`skip::is_generated_file`] / [`skip::is_oversized`] — pre-ingest skip
//! helpers consulted by `kebab-source-fs`.
//! - [`skip::BUILTIN_BLACKLIST`] — 6-entry safety-net pattern list.
//! lang detect (`code_lang_for_path`) + pre-ingest skip helpers (`is_generated_file`, `is_oversized`, `BUILTIN_BLACKLIST`) 는 v0.18.0+ 부터 `kebab-source-fs::code_meta` 로 이동 — refactor 2026-05-26.
//!
//! Per-language parser modules (`rust`, `python`, `typescript`, …) land in
//! later phases (1A-2 onwards). The crate boundary follows other
//! `kebab-parse-*` crates per design §8: must NOT depend on store / embed
//! / llm / rag.
//! 본 crate 의 boundary 는 design §8 — store / embed / llm / rag / UI 의존 금지.
pub mod c;
pub mod cpp;
@@ -24,7 +17,6 @@ pub mod python;
pub mod repo;
pub mod rust;
pub(crate) mod scaffold;
pub mod skip;
pub mod typescript;
pub use c::{PARSER_VERSION as C_PARSER_VERSION, CAstExtractor};
@@ -33,9 +25,8 @@ pub use go::{PARSER_VERSION as GO_PARSER_VERSION, GoAstExtractor};
pub use java::{PARSER_VERSION as JAVA_PARSER_VERSION, JavaAstExtractor};
pub use javascript::{PARSER_VERSION as JS_PARSER_VERSION, JavascriptAstExtractor};
pub use kotlin::{PARSER_VERSION as KOTLIN_PARSER_VERSION, KotlinAstExtractor};
pub use lang::{code_lang_for_path, module_path_for_python, module_path_for_tsjs};
pub use lang::{module_path_for_python, module_path_for_tsjs};
pub use python::{PARSER_VERSION as PYTHON_PARSER_VERSION, PythonAstExtractor};
pub use repo::{RepoMeta, detect_repo};
pub use rust::{PARSER_VERSION as RUST_PARSER_VERSION, RustAstExtractor};
pub use skip::{BUILTIN_BLACKLIST, is_generated_file, is_oversized};
pub use typescript::{PARSER_VERSION as TS_PARSER_VERSION, TypescriptAstExtractor};

View File

@@ -1,65 +0,0 @@
//! Pre-ingest skip helpers (spec §5.2 + §5.3 + §5.4).
//!
//! - [`BUILTIN_BLACKLIST`] — 6 gitignore-style patterns universal across
//! ecosystems. Source of truth: spec §5.2.
//! - [`is_generated_file`] — reads at most the first 512 bytes and scans up
//! to the first 10 lines of that window for 7 case-insensitive markers.
//! - [`is_oversized`] — byte cap then line cap.
use anyhow::Result;
use std::fs::File;
use std::io::{BufRead, BufReader, Read};
use std::path::Path;
/// 6 built-in gitignore-style patterns. Applied in addition to `.gitignore`
/// + `.kebabignore`. User can override via `.kebabignore` negation
/// (`!pattern`).
pub const BUILTIN_BLACKLIST: &[&str] = &[
"**/node_modules/**",
"**/target/**",
"**/__pycache__/**",
"**/.venv/**",
"**/venv/**",
"**/env/**",
];
/// Reports whether `path` looks like a machine-generated source file.
///
/// Reads at most the first 512 bytes, keeps up to the first 10 lines of that
/// window, and searches them (ASCII case-insensitively) for any of 7 markers:
/// `@generated`, `code generated by`, `do not edit`, `do not modify`,
/// `automatically generated`, `auto-generated`, `autogenerated`.
///
/// The window is decoded lossily. A multi-byte UTF-8 character cut in half at
/// the 512-byte boundary, or a stray non-UTF-8 byte, therefore no longer
/// blanks out the whole window and hides a marker.
///
/// Returns `Ok(true)` on a match and `Ok(false)` otherwise (including for an
/// empty file).
///
/// # Errors
///
/// Returns an error if the file cannot be opened or read.
pub fn is_generated_file(path: &Path) -> Result<bool> {
    /// Size of the header window that is inspected.
    const HEAD_BYTES: usize = 512;
    /// Only this many leading lines of the window are searched.
    const HEAD_LINES: usize = 10;
    /// Lowercase markers; the header is lowercased before comparison.
    const MARKERS: [&str; 7] = [
        "@generated",
        "code generated by",
        "do not edit",
        "do not modify",
        "automatically generated",
        "auto-generated",
        "autogenerated",
    ];

    // `take` + `read_to_end` keeps reading until the window is full or EOF,
    // so a short first `read` cannot truncate the header.
    // The `usize -> u64` cast is lossless for this small constant.
    let mut head = Vec::with_capacity(HEAD_BYTES);
    File::open(path)?
        .take(HEAD_BYTES as u64)
        .read_to_end(&mut head)?;
    if head.is_empty() {
        return Ok(false);
    }

    // Lossy decoding: the markers are pure ASCII, so replacing invalid
    // sequences with U+FFFD can never create or destroy a match.
    let text = String::from_utf8_lossy(&head);
    let lower = text
        .lines()
        .take(HEAD_LINES)
        .collect::<Vec<_>>()
        .join("\n")
        .to_ascii_lowercase();

    Ok(MARKERS.iter().any(|marker| lower.contains(*marker)))
}
/// Reports whether the file at `path` exceeds `max_bytes` or `max_lines`.
///
/// The byte cap is checked first from file metadata (cheap). Only when the
/// file is within the byte cap are lines counted, streaming with an early
/// exit as soon as the count passes `max_lines`.
///
/// Lines are counted on raw bytes (`\n`-terminated segments; a final segment
/// without a trailing newline counts as a line). The check therefore does not
/// depend on the content being valid UTF-8, and one buffer is reused for all
/// lines rather than allocating a `String` per line.
///
/// # Errors
///
/// Returns an error if the metadata cannot be read or the file cannot be
/// opened or read.
pub fn is_oversized(path: &Path, max_bytes: u64, max_lines: u32) -> Result<bool> {
    if std::fs::metadata(path)?.len() > max_bytes {
        return Ok(true);
    }

    let mut reader = BufReader::new(File::open(path)?);
    let mut segment = Vec::new();
    let mut count: u32 = 0;
    // `read_until` returns 0 only at EOF; every non-empty read is one line.
    while reader.read_until(b'\n', &mut segment)? != 0 {
        count = count.saturating_add(1);
        if count > max_lines {
            return Ok(true);
        }
        segment.clear();
    }
    Ok(false)
}

View File

@@ -1,67 +0,0 @@
use kebab_parse_code::code_lang_for_path;
use std::path::Path;
/// Every recognized extension resolves to its canonical language identifier.
#[test]
fn known_extensions_map_to_canonical_identifiers() {
    // (file name, expected identifier), grouped by language family.
    let cases = [
        ("foo.rs", Some("rust")),
        ("foo.py", Some("python")),
        ("foo.pyi", Some("python")),
        ("foo.ts", Some("typescript")),
        ("foo.tsx", Some("typescript")),
        ("foo.mts", Some("typescript")), // ESM TS — same grammar
        ("foo.cts", Some("typescript")), // CommonJS TS — same grammar
        ("foo.js", Some("javascript")),
        ("foo.mjs", Some("javascript")),
        ("foo.cjs", Some("javascript")),
        ("foo.jsx", Some("javascript")),
        ("foo.go", Some("go")),
        ("foo.java", Some("java")),
        ("foo.kt", Some("kotlin")),
        ("foo.kts", Some("kotlin")),
        ("foo.c", Some("c")),
        ("foo.h", Some("c")),
        ("foo.cpp", Some("cpp")),
        ("foo.cc", Some("cpp")),
        ("foo.cxx", Some("cpp")),
        ("foo.hpp", Some("cpp")),
        ("foo.hh", Some("cpp")),
        ("foo.hxx", Some("cpp")),
        ("foo.yaml", Some("yaml")),
        ("foo.yml", Some("yaml")),
        ("foo.toml", Some("toml")),
        ("foo.json", Some("json")),
        ("foo.sh", Some("shell")),
        ("foo.bash", Some("shell")),
        ("foo.zsh", Some("shell")),
        ("foo.mk", Some("make")),
    ];

    cases.iter().for_each(|&(path, expected)| {
        let detected = code_lang_for_path(Path::new(path));
        assert_eq!(detected, expected, "path = {path}");
    });
}
/// Well-known extensionless file names (and the `.dockerfile` extension)
/// resolve to their identifiers.
#[test]
fn special_filenames_map_to_identifiers() {
    let lang = |p: &str| code_lang_for_path(Path::new(p));

    // Docker
    assert_eq!(lang("Dockerfile"), Some("dockerfile"));
    assert_eq!(lang("foo.dockerfile"), Some("dockerfile"));

    // Make
    assert_eq!(lang("Makefile"), Some("make"));
    assert_eq!(lang("GNUmakefile"), Some("make"));
}
/// Unrecognized or missing extensions yield `None`.
#[test]
fn unknown_extension_returns_none() {
    let lang = |p: &str| code_lang_for_path(Path::new(p));

    assert_eq!(lang("foo.docx"), None);
    assert_eq!(lang("foo"), None);
    assert_eq!(lang("foo.unknown"), None);
}
/// Extension matching ignores ASCII case.
#[test]
fn case_insensitive() {
    let lang = |p: &str| code_lang_for_path(Path::new(p));

    assert_eq!(lang("Foo.RS"), Some("rust"));
    assert_eq!(lang("FOO.YAML"), Some("yaml"));
}

View File

@@ -1,74 +0,0 @@
use kebab_parse_code::skip::{BUILTIN_BLACKLIST, is_generated_file, is_oversized};
use std::fs;
use tempfile::NamedTempFile;
/// Each of the seven markers, in varying case and comment style, flags the
/// file as generated.
#[test]
fn generated_header_markers_trigger_skip() {
    // One sample per recognized marker.
    let samples = [
        "// @generated\nfn foo() {}\n",
        "// Code generated by tonic-build. DO NOT EDIT.\nfn x() {}\n",
        "/* DO NOT EDIT */\nfn x() {}\n",
        "/* do not modify */\nfn x() {}\n",
        "// AUTOMATICALLY GENERATED\nfn x() {}\n",
        "# auto-generated\ndef x(): pass\n",
        "// autogenerated\nfn x() {}\n",
    ];

    samples.iter().for_each(|&content| {
        let tmp = NamedTempFile::new().unwrap();
        fs::write(tmp.path(), content).unwrap();
        let flagged = is_generated_file(tmp.path()).unwrap();
        assert!(flagged, "content: {content:?}");
    });
}
/// Ordinary hand-written code carries no marker and is not flagged.
#[test]
fn normal_code_is_not_flagged_generated() {
    let tmp = NamedTempFile::new().unwrap();
    let source = "fn main() {\n println!(\"hi\");\n}\n";
    fs::write(tmp.path(), source).unwrap();

    let flagged = is_generated_file(tmp.path()).unwrap();
    assert!(!flagged);
}
/// A zero-byte file is never considered generated.
#[test]
fn is_generated_returns_false_for_empty_file() {
    let tmp = NamedTempFile::new().unwrap();
    fs::write(tmp.path(), "").unwrap();

    let flagged = is_generated_file(tmp.path()).unwrap();
    assert!(!flagged);
}
/// A single-line file larger than the byte cap is oversized.
#[test]
fn oversized_by_bytes_returns_true() {
    let tmp = NamedTempFile::new().unwrap();
    // 300_000 bytes on one line: over the 262_144-byte cap, under the line cap.
    let payload: String = "x".repeat(300_000);
    fs::write(tmp.path(), &payload).unwrap();

    let oversized = is_oversized(tmp.path(), 262_144, 5_000).unwrap();
    assert!(oversized);
}
/// A small file with more lines than the line cap is oversized.
#[test]
fn oversized_by_lines_returns_true() {
    let tmp = NamedTempFile::new().unwrap();
    // 6_000 two-byte lines: under the byte cap, over the 5_000-line cap.
    let payload: String = "x\n".repeat(6_000);
    fs::write(tmp.path(), &payload).unwrap();

    let oversized = is_oversized(tmp.path(), 262_144, 5_000).unwrap();
    assert!(oversized);
}
/// A tiny file is within both caps.
#[test]
fn small_file_returns_false_for_oversize() {
    let tmp = NamedTempFile::new().unwrap();
    fs::write(tmp.path(), "fn foo() {}\n").unwrap();

    let oversized = is_oversized(tmp.path(), 262_144, 5_000).unwrap();
    assert!(!oversized);
}
/// Pins the built-in blacklist to exactly its six documented patterns.
#[test]
fn builtin_blacklist_has_exactly_six_entries() {
    let expected = [
        "**/node_modules/**",
        "**/target/**",
        "**/__pycache__/**",
        "**/.venv/**",
        "**/venv/**",
        "**/env/**",
    ];

    // Length check first, then membership of every expected pattern.
    assert_eq!(BUILTIN_BLACKLIST.len(), 6);
    expected.iter().for_each(|pat| {
        assert!(BUILTIN_BLACKLIST.contains(pat), "missing pattern: {pat}");
    });
}