refactor(source-fs): drop kebab-parse-code dep — 9 tree-sitter grammars drag 제거
kebab-source-fs 가 kebab-parse-code 의 9 tree-sitter grammars 를 drag 했던 무거운 의존성 제거. 4 surface (code_lang_for_path / is_generated_file / is_oversized / BUILTIN_BLACKLIST) 만 사용하지만 dep 그래프에서 9 grammar 전체 link → kebab-source-fs::code_meta 로 이전 + kebab-parse-code 측 cleanup.
핵심 변경:
- kebab-source-fs::code_meta 신설: 4 surface 이전 (BUILTIN_BLACKLIST `pub` for frozen contract + 3 helper fn `pub(crate)`). lib.rs 의 `pub use code_meta::BUILTIN_BLACKLIST` 1 줄 추가 (Option A — 다른 mod surface 무근거 확장 0).
- callsite migration: media.rs (1) + walker.rs (2) + connector.rs (2) 모두 `kebab_source_fs::code_meta::*` 로 갱신.
- kebab-parse-code 측 cleanup: skip.rs 삭제 + lang.rs narrow edit (code_lang_for_path body + unit test 2 + Path import 삭제, module_path_for_* 보존) + lib.rs 헤더 doc rewrite (migration breadcrumb 포함).
- tests/{lang,skip}.rs 13 test 이동 — 12 unit (`src/code_meta.rs::tests`) + 1 integration (`tests/code_meta.rs` for BUILTIN_BLACKLIST frozen contract).
- design §8 graph: edge 제거 + p10-2 inline note. ARCHITECTURE.md 산문 1 줄 갱신. kebab-core::metadata.rs:36 stale dep reference 정정.
G1+G5: cargo tree -p kebab-source-fs | grep tree-sitter = 0 줄.
G2+G3: workspace test 회귀 0 + 13 test 1:1 이동.
G4: design §8 + ARCHITECTURE.md 갱신.
Wire 영향: 없음 (internal Rust crate-API surface 만, user-facing 0). Cargo workspace.version bump 불필요.
Refs:
- docs/superpowers/specs/2026-05-26-source-fs-dep-lightening-spec.md (v3, 4-round APPROVE)
- docs/superpowers/plans/2026-05-26-source-fs-dep-lightening-plan.md (v4, 4-round ACCEPT)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,69 +1,6 @@
|
||||
//! Canonical extension → language identifier mapping (spec §3.5).
|
||||
//!
|
||||
//! Lowercase canonical identifiers, matching tree-sitter parser conventions:
|
||||
//! `rust`, `python`, `typescript`, `javascript`, `go`, `java`, `kotlin`, `c`,
|
||||
//! `cpp`, `yaml`, `toml`, `json`, `shell`, `make`, `dockerfile`.
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
/// Returns the canonical language identifier for a given file path, or
|
||||
/// `None` if the extension / filename is not recognized.
|
||||
///
|
||||
/// Matching priority:
|
||||
/// 1. Tier 1 basename exact match (e.g. `Dockerfile`, `Makefile`)
|
||||
/// 2. Tier 2 basename match (e.g. `Cargo.toml`, `package.json`, `build.gradle`)
|
||||
/// 3. Tier 2 `Dockerfile.*` prefix variant
|
||||
/// 4. Tier 1 + Tier 2 extension fallback (lowercase)
|
||||
pub fn code_lang_for_path(path: &Path) -> Option<&'static str> {
|
||||
if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
|
||||
// Tier 1 basename exact match
|
||||
match name {
|
||||
"Dockerfile" => return Some("dockerfile"),
|
||||
"Makefile" | "GNUmakefile" => return Some("make"),
|
||||
_ => {}
|
||||
}
|
||||
|
||||
// Tier 2 basename match (configuration / manifest files)
|
||||
match name {
|
||||
"Cargo.toml" | "pyproject.toml" => return Some("toml"),
|
||||
"package.json" | "tsconfig.json" => return Some("json"),
|
||||
"go.mod" => return Some("go-mod"),
|
||||
"pom.xml" => return Some("xml"),
|
||||
"build.gradle" => return Some("groovy"),
|
||||
_ => {}
|
||||
}
|
||||
|
||||
// Tier 2: `Dockerfile.*` prefix variant (e.g. `Dockerfile.dev`, `Dockerfile.prod`)
|
||||
if name.starts_with("Dockerfile.") && name.len() > "Dockerfile.".len() {
|
||||
return Some("dockerfile");
|
||||
}
|
||||
}
|
||||
|
||||
// Extension fallback (Tier 1 + Tier 2)
|
||||
let ext = path.extension()?.to_str()?.to_ascii_lowercase();
|
||||
match ext.as_str() {
|
||||
// Tier 1 extensions
|
||||
"rs" => Some("rust"),
|
||||
"py" | "pyi" => Some("python"),
|
||||
"ts" | "tsx" | "mts" | "cts" => Some("typescript"),
|
||||
"js" | "mjs" | "cjs" | "jsx" => Some("javascript"),
|
||||
"go" => Some("go"),
|
||||
"java" => Some("java"),
|
||||
"kt" | "kts" => Some("kotlin"),
|
||||
"c" | "h" => Some("c"),
|
||||
"cpp" | "cc" | "cxx" | "hpp" | "hh" | "hxx" => Some("cpp"),
|
||||
"sh" | "bash" | "zsh" => Some("shell"),
|
||||
"mk" => Some("make"),
|
||||
// Tier 2 extensions
|
||||
"yaml" | "yml" => Some("yaml"),
|
||||
"toml" => Some("toml"),
|
||||
"json" => Some("json"),
|
||||
"xml" => Some("xml"),
|
||||
"dockerfile" => Some("dockerfile"),
|
||||
"gradle" => Some("groovy"),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
//! Workspace-relative path → module-path conversion for P10-1B AST extractors
|
||||
//! (Python dotted form / TS+JS slash form). 본 module 의 `code_lang_for_path`
|
||||
//! 는 v0.18.0+ 부터 `kebab-source-fs::code_meta` 로 이동.
|
||||
|
||||
/// p10-1B: workspace-relative Python file path → dotted module-path prefix.
|
||||
/// See plan §Task C for the exact rules + tasks/p10/p10-1b for the §3.4
|
||||
@@ -142,28 +79,4 @@ mod tests {
|
||||
assert_eq!(module_path_for_tsjs("a/b/c.ts"), "a/b/c");
|
||||
assert_eq!(module_path_for_tsjs("packages/x/src/Foo.ts"), "packages/x/src/Foo");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tier2_basename_takes_precedence_over_extension() {
|
||||
assert_eq!(code_lang_for_path(Path::new("Dockerfile")), Some("dockerfile"));
|
||||
assert_eq!(code_lang_for_path(Path::new("foo/Dockerfile.dev")), Some("dockerfile"));
|
||||
assert_eq!(code_lang_for_path(Path::new("myapp.dockerfile")), Some("dockerfile"));
|
||||
assert_eq!(code_lang_for_path(Path::new("repo/Cargo.toml")), Some("toml"));
|
||||
assert_eq!(code_lang_for_path(Path::new("pyproject.toml")), Some("toml"));
|
||||
assert_eq!(code_lang_for_path(Path::new("repo/package.json")), Some("json"));
|
||||
assert_eq!(code_lang_for_path(Path::new("tsconfig.json")), Some("json"));
|
||||
assert_eq!(code_lang_for_path(Path::new("go.mod")), Some("go-mod"));
|
||||
assert_eq!(code_lang_for_path(Path::new("pom.xml")), Some("xml"));
|
||||
assert_eq!(code_lang_for_path(Path::new("build.gradle")), Some("groovy"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tier2_extension_fallback() {
|
||||
assert_eq!(code_lang_for_path(Path::new("k8s/deploy.yaml")), Some("yaml"));
|
||||
assert_eq!(code_lang_for_path(Path::new("k8s/deploy.yml")), Some("yaml"));
|
||||
assert_eq!(code_lang_for_path(Path::new("foo/bar.toml")), Some("toml"));
|
||||
assert_eq!(code_lang_for_path(Path::new("foo/bar.json")), Some("json"));
|
||||
assert_eq!(code_lang_for_path(Path::new("foo/bar.xml")), Some("xml"));
|
||||
assert_eq!(code_lang_for_path(Path::new("foo/bar.gradle")), Some("groovy"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,17 +1,10 @@
|
||||
//! `kebab-parse-code` — language-aware parsing for code corpora.
|
||||
//!
|
||||
//! Phase 1A-1 ships infrastructure only:
|
||||
//! Repo metadata (`detect_repo`) + per-language AST extractors (Rust = P10-1A-2, Python/TS/JS = P10-1B, Go = P10-1C-Go, Java+Kotlin = P10-1C-JK, C+C++ = P10-1D).
|
||||
//!
|
||||
//! - [`lang::code_lang_for_path`] — extension → language identifier.
|
||||
//! - [`repo::detect_repo`] — `.git/` walk-up → repo / branch / commit metadata.
|
||||
//! - [`skip::is_generated_file`] / [`skip::is_oversized`] — pre-ingest skip
|
||||
//! helpers consulted by `kebab-source-fs`.
|
||||
//! - [`skip::BUILTIN_BLACKLIST`] — 6-entry safety-net pattern list.
|
||||
//! lang detect (`code_lang_for_path`) + pre-ingest skip helpers (`is_generated_file`, `is_oversized`, `BUILTIN_BLACKLIST`) 는 v0.18.0+ 부터 `kebab-source-fs::code_meta` 로 이동 — refactor 2026-05-26.
|
||||
//!
|
||||
//! Per-language parser modules (`rust`, `python`, `typescript`, …) land in
|
||||
//! later phases (1A-2 onwards). The crate boundary follows other
|
||||
//! `kebab-parse-*` crates per design §8: must NOT depend on store / embed
|
||||
//! / llm / rag.
|
||||
//! 본 crate 의 boundary 는 design §8 — store / embed / llm / rag / UI 의존 금지.
|
||||
|
||||
pub mod c;
|
||||
pub mod cpp;
|
||||
@@ -24,7 +17,6 @@ pub mod python;
|
||||
pub mod repo;
|
||||
pub mod rust;
|
||||
pub(crate) mod scaffold;
|
||||
pub mod skip;
|
||||
pub mod typescript;
|
||||
|
||||
pub use c::{PARSER_VERSION as C_PARSER_VERSION, CAstExtractor};
|
||||
@@ -33,9 +25,8 @@ pub use go::{PARSER_VERSION as GO_PARSER_VERSION, GoAstExtractor};
|
||||
pub use java::{PARSER_VERSION as JAVA_PARSER_VERSION, JavaAstExtractor};
|
||||
pub use javascript::{PARSER_VERSION as JS_PARSER_VERSION, JavascriptAstExtractor};
|
||||
pub use kotlin::{PARSER_VERSION as KOTLIN_PARSER_VERSION, KotlinAstExtractor};
|
||||
pub use lang::{code_lang_for_path, module_path_for_python, module_path_for_tsjs};
|
||||
pub use lang::{module_path_for_python, module_path_for_tsjs};
|
||||
pub use python::{PARSER_VERSION as PYTHON_PARSER_VERSION, PythonAstExtractor};
|
||||
pub use repo::{RepoMeta, detect_repo};
|
||||
pub use rust::{PARSER_VERSION as RUST_PARSER_VERSION, RustAstExtractor};
|
||||
pub use skip::{BUILTIN_BLACKLIST, is_generated_file, is_oversized};
|
||||
pub use typescript::{PARSER_VERSION as TS_PARSER_VERSION, TypescriptAstExtractor};
|
||||
|
||||
@@ -1,65 +0,0 @@
|
||||
//! Pre-ingest skip helpers (spec §5.2 + §5.3 + §5.4).
|
||||
//!
|
||||
//! - [`BUILTIN_BLACKLIST`] — 6 gitignore-style patterns universal across
|
||||
//! ecosystems. Source of truth: spec §5.2.
|
||||
//! - [`is_generated_file`] — reads first ~512 bytes, checks for 7
|
||||
//! case-insensitive markers.
|
||||
//! - [`is_oversized`] — byte cap then line cap.
|
||||
|
||||
use anyhow::Result;
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader, Read};
|
||||
use std::path::Path;
|
||||
|
||||
/// 6 built-in gitignore-style patterns. Applied in addition to `.gitignore`
|
||||
/// + `.kebabignore`. User can override via `.kebabignore` negation
|
||||
/// (`!pattern`).
|
||||
pub const BUILTIN_BLACKLIST: &[&str] = &[
|
||||
"**/node_modules/**",
|
||||
"**/target/**",
|
||||
"**/__pycache__/**",
|
||||
"**/.venv/**",
|
||||
"**/venv/**",
|
||||
"**/env/**",
|
||||
];
|
||||
|
||||
/// Read first 512 bytes, check for any of 7 case-insensitive generated-file
|
||||
/// markers. Returns Ok(true) on match, Ok(false) otherwise.
|
||||
pub fn is_generated_file(path: &Path) -> Result<bool> {
|
||||
let mut buf = [0u8; 512];
|
||||
let mut f = File::open(path)?;
|
||||
let n = f.read(&mut buf)?;
|
||||
if n == 0 {
|
||||
return Ok(false);
|
||||
}
|
||||
let head = std::str::from_utf8(&buf[..n]).unwrap_or("");
|
||||
let lower: String = head.lines().take(10).collect::<Vec<_>>().join("\n").to_ascii_lowercase();
|
||||
Ok(
|
||||
lower.contains("@generated")
|
||||
|| lower.contains("code generated by")
|
||||
|| lower.contains("do not edit")
|
||||
|| lower.contains("do not modify")
|
||||
|| lower.contains("automatically generated")
|
||||
|| lower.contains("auto-generated")
|
||||
|| lower.contains("autogenerated"),
|
||||
)
|
||||
}
|
||||
|
||||
/// Check if `path` exceeds `max_bytes` or `max_lines`. Byte cap first
|
||||
/// (cheap), then line cap (streaming with early exit).
|
||||
pub fn is_oversized(path: &Path, max_bytes: u64, max_lines: u32) -> Result<bool> {
|
||||
let meta = std::fs::metadata(path)?;
|
||||
if meta.len() > max_bytes {
|
||||
return Ok(true);
|
||||
}
|
||||
let reader = BufReader::new(File::open(path)?);
|
||||
let mut count: u32 = 0;
|
||||
for line in reader.lines() {
|
||||
let _ = line?;
|
||||
count = count.saturating_add(1);
|
||||
if count > max_lines {
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
Ok(false)
|
||||
}
|
||||
@@ -1,67 +0,0 @@
|
||||
use kebab_parse_code::code_lang_for_path;
|
||||
use std::path::Path;
|
||||
|
||||
#[test]
|
||||
fn known_extensions_map_to_canonical_identifiers() {
|
||||
let cases = [
|
||||
("foo.rs", Some("rust")),
|
||||
("foo.py", Some("python")),
|
||||
("foo.pyi", Some("python")),
|
||||
("foo.ts", Some("typescript")),
|
||||
("foo.tsx", Some("typescript")),
|
||||
("foo.mts", Some("typescript")), // ESM TS — same grammar
|
||||
("foo.cts", Some("typescript")), // CommonJS TS — same grammar
|
||||
("foo.js", Some("javascript")),
|
||||
("foo.mjs", Some("javascript")),
|
||||
("foo.cjs", Some("javascript")),
|
||||
("foo.jsx", Some("javascript")),
|
||||
("foo.go", Some("go")),
|
||||
("foo.java", Some("java")),
|
||||
("foo.kt", Some("kotlin")),
|
||||
("foo.kts", Some("kotlin")),
|
||||
("foo.c", Some("c")),
|
||||
("foo.h", Some("c")),
|
||||
("foo.cpp", Some("cpp")),
|
||||
("foo.cc", Some("cpp")),
|
||||
("foo.cxx", Some("cpp")),
|
||||
("foo.hpp", Some("cpp")),
|
||||
("foo.hh", Some("cpp")),
|
||||
("foo.hxx", Some("cpp")),
|
||||
("foo.yaml", Some("yaml")),
|
||||
("foo.yml", Some("yaml")),
|
||||
("foo.toml", Some("toml")),
|
||||
("foo.json", Some("json")),
|
||||
("foo.sh", Some("shell")),
|
||||
("foo.bash", Some("shell")),
|
||||
("foo.zsh", Some("shell")),
|
||||
("foo.mk", Some("make")),
|
||||
];
|
||||
for (path, expected) in cases {
|
||||
assert_eq!(
|
||||
code_lang_for_path(Path::new(path)),
|
||||
expected,
|
||||
"path = {path}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn special_filenames_map_to_identifiers() {
|
||||
assert_eq!(code_lang_for_path(Path::new("Dockerfile")), Some("dockerfile"));
|
||||
assert_eq!(code_lang_for_path(Path::new("foo.dockerfile")), Some("dockerfile"));
|
||||
assert_eq!(code_lang_for_path(Path::new("Makefile")), Some("make"));
|
||||
assert_eq!(code_lang_for_path(Path::new("GNUmakefile")), Some("make"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unknown_extension_returns_none() {
|
||||
assert_eq!(code_lang_for_path(Path::new("foo.docx")), None);
|
||||
assert_eq!(code_lang_for_path(Path::new("foo")), None);
|
||||
assert_eq!(code_lang_for_path(Path::new("foo.unknown")), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn case_insensitive() {
|
||||
assert_eq!(code_lang_for_path(Path::new("Foo.RS")), Some("rust"));
|
||||
assert_eq!(code_lang_for_path(Path::new("FOO.YAML")), Some("yaml"));
|
||||
}
|
||||
@@ -1,74 +0,0 @@
|
||||
use kebab_parse_code::skip::{BUILTIN_BLACKLIST, is_generated_file, is_oversized};
|
||||
use std::fs;
|
||||
use tempfile::NamedTempFile;
|
||||
|
||||
#[test]
|
||||
fn generated_header_markers_trigger_skip() {
|
||||
let cases = [
|
||||
"// @generated\nfn foo() {}\n",
|
||||
"// Code generated by tonic-build. DO NOT EDIT.\nfn x() {}\n",
|
||||
"/* DO NOT EDIT */\nfn x() {}\n",
|
||||
"/* do not modify */\nfn x() {}\n",
|
||||
"// AUTOMATICALLY GENERATED\nfn x() {}\n",
|
||||
"# auto-generated\ndef x(): pass\n",
|
||||
"// autogenerated\nfn x() {}\n",
|
||||
];
|
||||
for content in cases {
|
||||
let f = NamedTempFile::new().unwrap();
|
||||
fs::write(f.path(), content).unwrap();
|
||||
assert!(is_generated_file(f.path()).unwrap(), "content: {content:?}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normal_code_is_not_flagged_generated() {
|
||||
let f = NamedTempFile::new().unwrap();
|
||||
fs::write(f.path(), "fn main() {\n println!(\"hi\");\n}\n").unwrap();
|
||||
assert!(!is_generated_file(f.path()).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn is_generated_returns_false_for_empty_file() {
|
||||
let f = NamedTempFile::new().unwrap();
|
||||
fs::write(f.path(), "").unwrap();
|
||||
assert!(!is_generated_file(f.path()).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn oversized_by_bytes_returns_true() {
|
||||
let f = NamedTempFile::new().unwrap();
|
||||
let body: String = "x".repeat(300_000);
|
||||
fs::write(f.path(), &body).unwrap();
|
||||
assert!(is_oversized(f.path(), 262_144, 5_000).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn oversized_by_lines_returns_true() {
|
||||
let f = NamedTempFile::new().unwrap();
|
||||
let body: String = "x\n".repeat(6_000);
|
||||
fs::write(f.path(), &body).unwrap();
|
||||
assert!(is_oversized(f.path(), 262_144, 5_000).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn small_file_returns_false_for_oversize() {
|
||||
let f = NamedTempFile::new().unwrap();
|
||||
fs::write(f.path(), "fn foo() {}\n").unwrap();
|
||||
assert!(!is_oversized(f.path(), 262_144, 5_000).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn builtin_blacklist_has_exactly_six_entries() {
|
||||
assert_eq!(BUILTIN_BLACKLIST.len(), 6);
|
||||
let expected = [
|
||||
"**/node_modules/**",
|
||||
"**/target/**",
|
||||
"**/__pycache__/**",
|
||||
"**/.venv/**",
|
||||
"**/venv/**",
|
||||
"**/env/**",
|
||||
];
|
||||
for pat in expected {
|
||||
assert!(BUILTIN_BLACKLIST.contains(&pat), "missing pattern: {pat}");
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user