feat(p10-1a-1): kebab-parse-code crate (lang + repo + skip)
Tasks 5-8: new `kebab-parse-code` crate with three infrastructure modules for the code ingest framework. Ships lang.rs (extension→language identifier mapping), repo.rs (.git walk-up via gix 0.70 for RepoMeta), and skip.rs (BUILTIN_BLACKLIST, is_generated_file, is_oversized). 14 integration tests across three test files, all passing; clippy -D warnings clean. Note: gix pinned to 0.70 (not 0.83 as originally suggested) because 0.83 fails to compile against Rust 1.94.1 due to non-exhaustive match patterns in gix-hash. 0.70 resolves cleanly and has identical head_name/head_id API. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
13
crates/kebab-parse-code/Cargo.toml
Normal file
13
crates/kebab-parse-code/Cargo.toml
Normal file
@@ -0,0 +1,13 @@
|
||||
[package]
|
||||
name = "kebab-parse-code"
|
||||
version = { workspace = true }
|
||||
edition = { workspace = true }
|
||||
license = { workspace = true }
|
||||
|
||||
[dependencies]
|
||||
anyhow = { workspace = true }
|
||||
gix = { workspace = true }
|
||||
kebab-core = { path = "../kebab-core" }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = { workspace = true }
|
||||
42
crates/kebab-parse-code/src/lang.rs
Normal file
42
crates/kebab-parse-code/src/lang.rs
Normal file
@@ -0,0 +1,42 @@
|
||||
//! Canonical extension → language identifier mapping (spec §3.5).
|
||||
//!
|
||||
//! Lowercase canonical identifiers, matching tree-sitter parser conventions:
|
||||
//! `rust`, `python`, `typescript`, `javascript`, `go`, `java`, `kotlin`, `c`,
|
||||
//! `cpp`, `yaml`, `toml`, `json`, `shell`, `make`, `dockerfile`.
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
/// Returns the canonical language identifier for `path`, or `None` when
/// neither its file name nor its extension is recognized.
///
/// Matching priority:
/// 1. exact, case-sensitive file name: `Dockerfile`, `Makefile`, `makefile`,
///    `GNUmakefile`
/// 2. extension, compared ASCII case-insensitively
///
/// An extension that is not valid UTF-8 yields `None`.
pub fn code_lang_for_path(path: &Path) -> Option<&'static str> {
    match path.file_name().and_then(|name| name.to_str()) {
        Some("Dockerfile") => return Some("dockerfile"),
        // GNU make looks for `GNUmakefile`, `makefile` and `Makefile` by
        // default, so all three spellings are treated as makefiles.
        Some("Makefile" | "makefile" | "GNUmakefile") => return Some("make"),
        _ => {}
    }

    let ext = path.extension()?.to_str()?.to_ascii_lowercase();
    let lang = match ext.as_str() {
        "rs" => "rust",
        "py" | "pyi" => "python",
        "ts" | "tsx" => "typescript",
        "js" | "mjs" | "cjs" | "jsx" => "javascript",
        "go" => "go",
        "java" => "java",
        "kt" | "kts" => "kotlin",
        "c" | "h" => "c",
        "cpp" | "cc" | "cxx" | "hpp" | "hh" | "hxx" => "cpp",
        "yaml" | "yml" => "yaml",
        "toml" => "toml",
        "json" => "json",
        "sh" | "bash" | "zsh" => "shell",
        "mk" => "make",
        "dockerfile" => "dockerfile",
        _ => return None,
    };
    Some(lang)
}
|
||||
22
crates/kebab-parse-code/src/lib.rs
Normal file
22
crates/kebab-parse-code/src/lib.rs
Normal file
@@ -0,0 +1,22 @@
|
||||
//! `kebab-parse-code` — language-aware parsing for code corpora.
|
||||
//!
|
||||
//! Phase 1A-1 ships infrastructure only:
|
||||
//!
|
||||
//! - [`lang::code_lang_for_path`] — extension → language identifier.
|
||||
//! - [`repo::detect_repo`] — `.git/` walk-up → repo / branch / commit metadata.
|
||||
//! - [`skip::is_generated_file`] / [`skip::is_oversized`] — pre-ingest skip
|
||||
//! helpers consulted by `kebab-source-fs`.
|
||||
//! - [`skip::BUILTIN_BLACKLIST`] — 6-entry safety-net pattern list.
|
||||
//!
|
||||
//! Per-language parser modules (`rust`, `python`, `typescript`, …) land in
|
||||
//! later phases (1A-2 onwards). The crate boundary follows other
|
||||
//! `kebab-parse-*` crates per design §8: must NOT depend on store / embed
|
||||
//! / llm / rag.
|
||||
|
||||
pub mod lang;
|
||||
pub mod repo;
|
||||
pub mod skip;
|
||||
|
||||
pub use lang::code_lang_for_path;
|
||||
pub use repo::{RepoMeta, detect_repo};
|
||||
pub use skip::{BUILTIN_BLACKLIST, is_generated_file, is_oversized};
|
||||
61
crates/kebab-parse-code/src/repo.rs
Normal file
61
crates/kebab-parse-code/src/repo.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
//! Git repo auto-detection (spec §5.1).
|
||||
//!
|
||||
//! Walks up from `path` looking for a `.git/` directory. If found, reads
|
||||
//! repo dir name, current branch, and HEAD commit using `gix` (pure Rust;
|
||||
//! no `git` binary on PATH required).
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
/// Metadata about the git repository that contains a path.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct RepoMeta {
    /// Name of the directory that holds the `.git` entry.
    pub name: String,
    /// Branch at HEAD, when it could be determined.
    pub branch: Option<String>,
    /// Commit id at HEAD, when it could be determined.
    pub commit: Option<String>,
}
|
||||
|
||||
/// Walk up from `path` until a `.git/` directory is found. Returns repo
|
||||
/// metadata, or `None` if no repo boundary is reached before the filesystem
|
||||
/// root.
|
||||
///
|
||||
/// - `name`: directory name containing `.git/`.
|
||||
/// - `branch`: current HEAD branch, or `"detached"` if detached HEAD, or
|
||||
/// `None` if branch can't be read.
|
||||
/// - `commit`: 40-hex commit SHA at HEAD, or `None` if empty repo / read
|
||||
/// failure.
|
||||
///
|
||||
/// `.git/` as a file (worktree marker / submodule) returns `None` for
|
||||
/// `branch` and `commit` and falls back to the parent dir name for `name`.
|
||||
pub fn detect_repo(path: &Path) -> Option<RepoMeta> {
|
||||
let mut cur = if path.is_dir() { path } else { path.parent()? };
|
||||
loop {
|
||||
let dotgit = cur.join(".git");
|
||||
if dotgit.is_dir() {
|
||||
let name = cur.file_name()?.to_string_lossy().into_owned();
|
||||
let (branch, commit) = read_head(cur);
|
||||
return Some(RepoMeta { name, branch, commit });
|
||||
} else if dotgit.is_file() {
|
||||
let name = cur.file_name()?.to_string_lossy().into_owned();
|
||||
return Some(RepoMeta { name, branch: None, commit: None });
|
||||
}
|
||||
cur = cur.parent()?;
|
||||
}
|
||||
}
|
||||
|
||||
fn read_head(repo_dir: &Path) -> (Option<String>, Option<String>) {
|
||||
match gix::open(repo_dir) {
|
||||
Ok(repo) => {
|
||||
let branch = repo
|
||||
.head_name()
|
||||
.ok()
|
||||
.flatten()
|
||||
.map(|n| n.shorten().to_string())
|
||||
.or_else(|| Some("detached".to_string()));
|
||||
let commit = repo
|
||||
.head_id()
|
||||
.ok()
|
||||
.map(|id| id.to_string());
|
||||
(branch, commit)
|
||||
}
|
||||
Err(_) => (None, None),
|
||||
}
|
||||
}
|
||||
65
crates/kebab-parse-code/src/skip.rs
Normal file
65
crates/kebab-parse-code/src/skip.rs
Normal file
@@ -0,0 +1,65 @@
|
||||
//! Pre-ingest skip helpers (spec §5.2 + §5.3 + §5.4).
|
||||
//!
|
||||
//! - [`BUILTIN_BLACKLIST`] — 6 gitignore-style patterns universal across
|
||||
//! ecosystems. Source of truth: spec §5.2.
|
||||
//! - [`is_generated_file`] — reads the first 512 bytes and checks the first
|
||||
//! 10 lines within them for 7 case-insensitive markers.
|
||||
//! - [`is_oversized`] — byte cap then line cap.
|
||||
|
||||
use anyhow::Result;
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader, Read};
|
||||
use std::path::Path;
|
||||
|
||||
/// The six built-in, gitignore-style exclusion patterns.
///
/// These apply on top of `.gitignore` and `.kebabignore`; a user can
/// re-include a path with a `.kebabignore` negation (`!pattern`).
pub const BUILTIN_BLACKLIST: &[&str] = &[
    // JavaScript / TypeScript dependencies
    "**/node_modules/**",
    // Cargo build output
    "**/target/**",
    // Python bytecode caches and virtual environments
    "**/__pycache__/**",
    "**/.venv/**",
    "**/venv/**",
    "**/env/**",
];
|
||||
|
||||
/// Report whether `path` looks like a generated file.
///
/// Reads at most the first 512 bytes and searches the first 10 lines within
/// that window for any of 7 markers, compared ASCII case-insensitively.
/// Returns `Ok(true)` on a match and `Ok(false)` otherwise, including for an
/// empty file.
///
/// The head is decoded lossily, so a 512-byte cut that lands inside a
/// multi-byte character (or any other invalid UTF-8) no longer discards the
/// whole head and hides a marker.
///
/// # Errors
///
/// Returns an error if the file cannot be opened or read.
pub fn is_generated_file(path: &Path) -> Result<bool> {
    const HEAD_BYTES: u64 = 512;
    const HEAD_LINES: usize = 10;
    const MARKERS: [&str; 7] = [
        "@generated",
        "code generated by",
        "do not edit",
        "do not modify",
        "automatically generated",
        "auto-generated",
        "autogenerated",
    ];

    // `take` + `read_to_end` keeps reading until the cap or EOF, so a short
    // `read` cannot truncate the inspected window.
    let mut head = Vec::new();
    File::open(path)?.take(HEAD_BYTES).read_to_end(&mut head)?;

    let text = String::from_utf8_lossy(&head).to_ascii_lowercase();
    // No marker contains a newline, so checking line by line is equivalent
    // to searching the joined lines.
    let found = text
        .lines()
        .take(HEAD_LINES)
        .any(|line| MARKERS.iter().any(|&marker| line.contains(marker)));
    Ok(found)
}
|
||||
|
||||
/// Report whether `path` exceeds `max_bytes` or `max_lines`.
///
/// The byte cap is checked first from file metadata (cheap). Only if it
/// passes is the file streamed to count lines, stopping as soon as the count
/// exceeds `max_lines`. A final line without a trailing newline counts as a
/// line.
///
/// Lines are counted on raw bytes, so files that are not valid UTF-8 are
/// measured instead of producing a decoding error.
///
/// # Errors
///
/// Returns an error if the metadata cannot be read or the file cannot be
/// opened or read.
pub fn is_oversized(path: &Path, max_bytes: u64, max_lines: u32) -> Result<bool> {
    if std::fs::metadata(path)?.len() > max_bytes {
        return Ok(true);
    }

    let mut reader = BufReader::new(File::open(path)?);
    // One buffer reused for every line instead of a fresh `String` per line.
    let mut line = Vec::new();
    let mut count: u32 = 0;
    loop {
        line.clear();
        if reader.read_until(b'\n', &mut line)? == 0 {
            // EOF reached without exceeding the line cap.
            return Ok(false);
        }
        count = count.saturating_add(1);
        if count > max_lines {
            return Ok(true);
        }
    }
}
|
||||
64
crates/kebab-parse-code/tests/lang.rs
Normal file
64
crates/kebab-parse-code/tests/lang.rs
Normal file
@@ -0,0 +1,64 @@
|
||||
use kebab_parse_code::code_lang_for_path;
|
||||
use std::path::Path;
|
||||
|
||||
/// Every supported extension resolves to its canonical language identifier.
#[test]
fn known_extensions_map_to_canonical_identifiers() {
    // Grouped by language; each extension is probed as `foo.<ext>`.
    let table: &[(&str, &[&str])] = &[
        ("rust", &["rs"]),
        ("python", &["py", "pyi"]),
        ("typescript", &["ts", "tsx"]),
        ("javascript", &["js", "mjs", "cjs", "jsx"]),
        ("go", &["go"]),
        ("java", &["java"]),
        ("kotlin", &["kt", "kts"]),
        ("c", &["c", "h"]),
        ("cpp", &["cpp", "cc", "cxx", "hpp", "hh", "hxx"]),
        ("yaml", &["yaml", "yml"]),
        ("toml", &["toml"]),
        ("json", &["json"]),
        ("shell", &["sh", "bash", "zsh"]),
        ("make", &["mk"]),
    ];
    for &(lang, extensions) in table {
        for ext in extensions {
            let path = format!("foo.{ext}");
            let detected = code_lang_for_path(Path::new(&path));
            assert_eq!(detected, Some(lang), "path = {path}");
        }
    }
}
|
||||
|
||||
/// Well-known file names (and the `.dockerfile` extension) are recognized.
#[test]
fn special_filenames_map_to_identifiers() {
    let expectations = [
        ("Dockerfile", "dockerfile"),
        ("foo.dockerfile", "dockerfile"),
        ("Makefile", "make"),
    ];
    for (file, lang) in expectations {
        assert_eq!(code_lang_for_path(Path::new(file)), Some(lang));
    }
}
|
||||
|
||||
/// Unsupported extensions and extension-less names yield `None`.
#[test]
fn unknown_extension_returns_none() {
    for file in ["foo.docx", "foo", "foo.unknown"] {
        assert_eq!(code_lang_for_path(Path::new(file)), None);
    }
}
|
||||
|
||||
/// Extension matching ignores ASCII case.
#[test]
fn case_insensitive() {
    let upper_cased = [("Foo.RS", "rust"), ("FOO.YAML", "yaml")];
    for (file, lang) in upper_cased {
        assert_eq!(code_lang_for_path(Path::new(file)), Some(lang));
    }
}
|
||||
62
crates/kebab-parse-code/tests/repo.rs
Normal file
62
crates/kebab-parse-code/tests/repo.rs
Normal file
@@ -0,0 +1,62 @@
|
||||
use kebab_parse_code::repo::detect_repo;
|
||||
use std::fs;
|
||||
use std::process::Command;
|
||||
use tempfile::TempDir;
|
||||
|
||||
/// Initialise a git repository at `root` with a single commit containing
/// `README.md`, using the `git` binary on `PATH`.
///
/// Panics if `git` cannot be spawned or if any git command exits
/// unsuccessfully, so a broken fixture fails here rather than surfacing later
/// as a confusing assertion failure.
fn init_git_repo(root: &std::path::Path) {
    let run = |args: &[&str]| {
        let status = Command::new("git")
            .args(args)
            .current_dir(root)
            .status()
            .expect("git command failed");
        // `status()` succeeding only means the process ran; check its exit
        // code too.
        assert!(status.success(), "git {args:?} exited with {status}");
    };
    run(&["init", "-q"]);
    run(&["config", "user.email", "test@test"]);
    run(&["config", "user.name", "test"]);
    fs::write(root.join("README.md"), "hi").unwrap();
    run(&["add", "README.md"]);
    run(&["commit", "-q", "-m", "init"]);
}
|
||||
|
||||
/// A file in a directory tree without any `.git` entry yields `None`.
#[test]
fn detect_repo_returns_none_outside_git() {
    let tmp = TempDir::new().unwrap();
    let dir = tmp.path().join("a/b");
    fs::create_dir_all(&dir).unwrap();
    let file = dir.join("c.txt");
    fs::write(&file, "x").unwrap();

    assert!(detect_repo(&file).is_none());
}
|
||||
|
||||
/// A file nested below the repository root resolves to that repository,
/// with a branch and a 40-character commit id.
#[test]
fn detect_repo_walks_up_to_git_dir() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path().join("myrepo");
    fs::create_dir_all(&root).unwrap();
    init_git_repo(&root);

    let source_dir = root.join("src/deep");
    fs::create_dir_all(&source_dir).unwrap();
    let file = source_dir.join("file.rs");
    fs::write(&file, "x").unwrap();

    let meta = detect_repo(&file).expect("should detect repo");
    assert_eq!(meta.name, "myrepo");
    assert!(meta.branch.is_some());
    assert!(meta.commit.is_some());
    assert_eq!(meta.commit.as_deref().map(str::len), Some(40));
}
|
||||
|
||||
/// Two files in the same repository report the same repo name and commit.
///
/// NOTE(review): despite the name, this only checks that repeated calls
/// agree; nothing here observes caching.
#[test]
fn detect_repo_caches_per_path_call_for_repeated_files_in_same_repo() {
    let tmp = TempDir::new().unwrap();
    let root = tmp.path().join("myrepo");
    fs::create_dir_all(&root).unwrap();
    init_git_repo(&root);

    let files = [root.join("a.rs"), root.join("b.rs")];
    for file in &files {
        fs::write(file, "x").unwrap();
    }

    let first = detect_repo(&files[0]).unwrap();
    let second = detect_repo(&files[1]).unwrap();
    assert_eq!(first.name, second.name);
    assert_eq!(first.commit, second.commit);
}
|
||||
74
crates/kebab-parse-code/tests/skip.rs
Normal file
74
crates/kebab-parse-code/tests/skip.rs
Normal file
@@ -0,0 +1,74 @@
|
||||
use kebab_parse_code::skip::{BUILTIN_BLACKLIST, is_generated_file, is_oversized};
|
||||
use std::fs;
|
||||
use tempfile::NamedTempFile;
|
||||
|
||||
/// Each of the seven markers, in assorted casing and comment syntaxes, marks
/// the file as generated.
#[test]
fn generated_header_markers_trigger_skip() {
    let samples = [
        "// @generated\nfn foo() {}\n",
        "// Code generated by tonic-build. DO NOT EDIT.\nfn x() {}\n",
        "/* DO NOT EDIT */\nfn x() {}\n",
        "/* do not modify */\nfn x() {}\n",
        "// AUTOMATICALLY GENERATED\nfn x() {}\n",
        "# auto-generated\ndef x(): pass\n",
        "// autogenerated\nfn x() {}\n",
    ];
    for content in samples {
        let tmp = NamedTempFile::new().unwrap();
        fs::write(tmp.path(), content).unwrap();
        let flagged = is_generated_file(tmp.path()).unwrap();
        assert!(flagged, "content: {content:?}");
    }
}
|
||||
|
||||
/// Ordinary source without any marker is not treated as generated.
#[test]
fn normal_code_is_not_flagged_generated() {
    let tmp = NamedTempFile::new().unwrap();
    let source = "fn main() {\n println!(\"hi\");\n}\n";
    fs::write(tmp.path(), source).unwrap();

    let flagged = is_generated_file(tmp.path()).unwrap();
    assert!(!flagged);
}
|
||||
|
||||
/// An empty file is never considered generated.
#[test]
fn is_generated_returns_false_for_empty_file() {
    let tmp = NamedTempFile::new().unwrap();
    fs::write(tmp.path(), "").unwrap();

    let flagged = is_generated_file(tmp.path()).unwrap();
    assert!(!flagged);
}
|
||||
|
||||
/// A 300 000-byte file exceeds the 262 144-byte cap.
#[test]
fn oversized_by_bytes_returns_true() {
    let tmp = NamedTempFile::new().unwrap();
    let payload = vec![b'x'; 300_000];
    fs::write(tmp.path(), &payload).unwrap();

    let oversized = is_oversized(tmp.path(), 262_144, 5_000).unwrap();
    assert!(oversized);
}
|
||||
|
||||
/// A 6 000-line file under the byte cap exceeds the 5 000-line cap.
#[test]
fn oversized_by_lines_returns_true() {
    let tmp = NamedTempFile::new().unwrap();
    let payload = "x\n".repeat(6_000);
    fs::write(tmp.path(), &payload).unwrap();

    let oversized = is_oversized(tmp.path(), 262_144, 5_000).unwrap();
    assert!(oversized);
}
|
||||
|
||||
/// A one-line file is within both caps.
#[test]
fn small_file_returns_false_for_oversize() {
    let tmp = NamedTempFile::new().unwrap();
    fs::write(tmp.path(), "fn foo() {}\n").unwrap();

    let oversized = is_oversized(tmp.path(), 262_144, 5_000).unwrap();
    assert!(!oversized);
}
|
||||
|
||||
/// The built-in blacklist holds exactly the six expected patterns.
#[test]
fn builtin_blacklist_has_exactly_six_entries() {
    assert_eq!(BUILTIN_BLACKLIST.len(), 6);

    let expected = [
        "**/node_modules/**",
        "**/target/**",
        "**/__pycache__/**",
        "**/.venv/**",
        "**/venv/**",
        "**/env/**",
    ];
    expected.iter().for_each(|pat| {
        assert!(BUILTIN_BLACKLIST.contains(pat), "missing pattern: {pat}");
    });
}
|
||||
Reference in New Issue
Block a user