From ff11f81f7f15ee66bdd073c15063f5e3fa5dd0e8 Mon Sep 17 00:00:00 2001
From: th-kim0823
Date: Fri, 15 May 2026 15:57:59 +0900
Subject: [PATCH] feat(p10-1a-1): kebab-parse-code crate (lang + repo + skip)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Tasks 5-8: new `kebab-parse-code` crate with three infrastructure modules
for the code ingest framework. Ships lang.rs (extension→language identifier
mapping), repo.rs (.git walk-up via gix 0.70 for RepoMeta), and skip.rs
(BUILTIN_BLACKLIST, is_generated_file, is_oversized). 14 integration tests
across three test files, all passing; clippy -D warnings clean.
Note: gix pinned to 0.70 (not 0.83 as originally suggested) because 0.83
fails to compile against Rust 1.94.1 due to non-exhaustive match patterns
in gix-hash. 0.70 resolves cleanly and has identical head_name/head_id API.
Co-Authored-By: Claude Sonnet 4.6
---
Cargo.lock | 660 ++++++++++++++++++++++++++
Cargo.toml | 5 +
crates/kebab-parse-code/Cargo.toml | 13 +
crates/kebab-parse-code/src/lang.rs | 42 ++
crates/kebab-parse-code/src/lib.rs | 22 +
crates/kebab-parse-code/src/repo.rs | 61 +++
crates/kebab-parse-code/src/skip.rs | 65 +++
crates/kebab-parse-code/tests/lang.rs | 64 +++
crates/kebab-parse-code/tests/repo.rs | 62 +++
crates/kebab-parse-code/tests/skip.rs | 74 +++
10 files changed, 1068 insertions(+)
create mode 100644 crates/kebab-parse-code/Cargo.toml
create mode 100644 crates/kebab-parse-code/src/lang.rs
create mode 100644 crates/kebab-parse-code/src/lib.rs
create mode 100644 crates/kebab-parse-code/src/repo.rs
create mode 100644 crates/kebab-parse-code/src/skip.rs
create mode 100644 crates/kebab-parse-code/tests/lang.rs
create mode 100644 crates/kebab-parse-code/tests/repo.rs
create mode 100644 crates/kebab-parse-code/tests/skip.rs
diff --git a/Cargo.lock b/Cargo.lock
index 332d306..499f233 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -755,6 +755,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab"
dependencies = [
"memchr",
+ "regex-automata",
"serde",
]
@@ -931,6 +932,15 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
+[[package]]
+name = "clru"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "197fd99cb113a8d5d9b6376f3aa817f32c1078f2343b714fff7d2ca44fdf67d5"
+dependencies = [
+ "hashbrown 0.16.1",
+]
+
[[package]]
name = "color_quant"
version = "1.1.0"
@@ -2140,6 +2150,12 @@ version = "2.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "117240f60069e65410b3ae1bb213295bd828f707b5bec6596a1afc8793ce0cbc"
+[[package]]
+name = "dunce"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
+
[[package]]
name = "dyn-clone"
version = "1.0.20"
@@ -2302,6 +2318,15 @@ dependencies = [
"tokenizers",
]
+[[package]]
+name = "faster-hex"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a2a2b11eda1d40935b26cf18f6833c526845ae8c41e58d09af6adeb6f0269183"
+dependencies = [
+ "serde",
+]
+
[[package]]
name = "fastrand"
version = "2.4.1"
@@ -2738,6 +2763,583 @@ dependencies = [
"weezl",
]
+[[package]]
+name = "gix"
+version = "0.70.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "736f14636705f3a56ea52b553e67282519418d9a35bb1e90b3a9637a00296b68"
+dependencies = [
+ "gix-actor",
+ "gix-commitgraph",
+ "gix-config",
+ "gix-date",
+ "gix-diff",
+ "gix-discover",
+ "gix-features",
+ "gix-fs",
+ "gix-glob",
+ "gix-hash",
+ "gix-hashtable",
+ "gix-index",
+ "gix-lock",
+ "gix-object",
+ "gix-odb",
+ "gix-pack",
+ "gix-path",
+ "gix-protocol",
+ "gix-ref",
+ "gix-refspec",
+ "gix-revision",
+ "gix-revwalk",
+ "gix-sec",
+ "gix-shallow",
+ "gix-tempfile",
+ "gix-trace",
+ "gix-traverse",
+ "gix-url",
+ "gix-utils",
+ "gix-validate 0.9.4",
+ "once_cell",
+ "smallvec",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-actor"
+version = "0.33.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20018a1a6332e065f1fcc8305c1c932c6b8c9985edea2284b3c79dc6fa3ee4b2"
+dependencies = [
+ "bstr",
+ "gix-date",
+ "gix-utils",
+ "itoa",
+ "thiserror 2.0.18",
+ "winnow 0.6.26",
+]
+
+[[package]]
+name = "gix-bitmap"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d982fc7ef0608e669851d0d2a6141dae74c60d5a27e8daa451f2a4857bbf41e2"
+dependencies = [
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-chunk"
+version = "0.4.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c356b3825677cb6ff579551bb8311a81821e184453cbd105e2fc5311b288eeb"
+dependencies = [
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-command"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb410b84d6575db45e62025a9118bdbf4d4b099ce7575a76161e898d9ca98df1"
+dependencies = [
+ "bstr",
+ "gix-path",
+ "gix-trace",
+ "shell-words",
+]
+
+[[package]]
+name = "gix-commitgraph"
+version = "0.26.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e23a8ec2d8a16026a10dafdb6ed51bcfd08f5d97f20fa52e200bc50cb72e4877"
+dependencies = [
+ "bstr",
+ "gix-chunk",
+ "gix-features",
+ "gix-hash",
+ "memmap2",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-config"
+version = "0.43.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "377c1efd2014d5d469e0b3cd2952c8097bce9828f634e04d5665383249f1d9e9"
+dependencies = [
+ "bstr",
+ "gix-config-value",
+ "gix-features",
+ "gix-glob",
+ "gix-path",
+ "gix-ref",
+ "gix-sec",
+ "memchr",
+ "once_cell",
+ "smallvec",
+ "thiserror 2.0.18",
+ "unicode-bom",
+ "winnow 0.6.26",
+]
+
+[[package]]
+name = "gix-config-value"
+version = "0.14.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8dc2c844c4cf141884678cabef736fd91dd73068b9146e6f004ba1a0457944b6"
+dependencies = [
+ "bitflags",
+ "bstr",
+ "gix-path",
+ "libc",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-date"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "daa30058ec7d3511fbc229e4f9e696a35abd07ec5b82e635eff864a2726217e4"
+dependencies = [
+ "bstr",
+ "itoa",
+ "jiff",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-diff"
+version = "0.50.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62afb7f4ca0acdf4e9dad92065b2eb1bf2993bcc5014b57bc796e3a365b17c4d"
+dependencies = [
+ "bstr",
+ "gix-hash",
+ "gix-object",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-discover"
+version = "0.38.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0c2414bdf04064e0f5a5aa029dfda1e663cf9a6c4bfc8759f2d369299bb65d8"
+dependencies = [
+ "bstr",
+ "dunce",
+ "gix-fs",
+ "gix-hash",
+ "gix-path",
+ "gix-ref",
+ "gix-sec",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-features"
+version = "0.40.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8bfdd4838a8d42bd482c9f0cb526411d003ee94cc7c7b08afe5007329c71d554"
+dependencies = [
+ "crc32fast",
+ "flate2",
+ "gix-hash",
+ "gix-trace",
+ "gix-utils",
+ "libc",
+ "once_cell",
+ "prodash",
+ "sha1_smol",
+ "thiserror 2.0.18",
+ "walkdir",
+]
+
+[[package]]
+name = "gix-fs"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "182e7fa7bfdf44ffb7cfe7451b373cdf1e00870ac9a488a49587a110c562063d"
+dependencies = [
+ "fastrand",
+ "gix-features",
+ "gix-utils",
+]
+
+[[package]]
+name = "gix-glob"
+version = "0.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e9c7249fa0a78f9b363aa58323db71e0a6161fd69860ed6f48dedf0ef3a314e"
+dependencies = [
+ "bitflags",
+ "bstr",
+ "gix-features",
+ "gix-path",
+]
+
+[[package]]
+name = "gix-hash"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e81c5ec48649b1821b3ed066a44efb95f1a268b35c1d91295e61252539fbe9f8"
+dependencies = [
+ "faster-hex",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-hashtable"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "189130bc372accd02e0520dc5ab1cef318dcc2bc829b76ab8d84bbe90ac212d1"
+dependencies = [
+ "gix-hash",
+ "hashbrown 0.14.5",
+ "parking_lot",
+]
+
+[[package]]
+name = "gix-index"
+version = "0.38.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "acd12e3626879369310fffe2ac61acc828613ef656b50c4ea984dd59d7dc85d8"
+dependencies = [
+ "bitflags",
+ "bstr",
+ "filetime",
+ "fnv",
+ "gix-bitmap",
+ "gix-features",
+ "gix-fs",
+ "gix-hash",
+ "gix-lock",
+ "gix-object",
+ "gix-traverse",
+ "gix-utils",
+ "gix-validate 0.9.4",
+ "hashbrown 0.14.5",
+ "itoa",
+ "libc",
+ "memmap2",
+ "rustix 0.38.44",
+ "smallvec",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-lock"
+version = "16.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9739815270ff6940968441824d162df9433db19211ca9ba8c3fc1b50b849c642"
+dependencies = [
+ "gix-tempfile",
+ "gix-utils",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-object"
+version = "0.47.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddc4b3a0044244f0fe22347fb7a79cca165e37829d668b41b85ff46a43e5fd68"
+dependencies = [
+ "bstr",
+ "gix-actor",
+ "gix-date",
+ "gix-features",
+ "gix-hash",
+ "gix-hashtable",
+ "gix-path",
+ "gix-utils",
+ "gix-validate 0.9.4",
+ "itoa",
+ "smallvec",
+ "thiserror 2.0.18",
+ "winnow 0.6.26",
+]
+
+[[package]]
+name = "gix-odb"
+version = "0.67.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3e93457df69cd09573608ce9fa4f443fbd84bc8d15d8d83adecd471058459c1b"
+dependencies = [
+ "arc-swap",
+ "gix-date",
+ "gix-features",
+ "gix-fs",
+ "gix-hash",
+ "gix-hashtable",
+ "gix-object",
+ "gix-pack",
+ "gix-path",
+ "gix-quote",
+ "parking_lot",
+ "tempfile",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-pack"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc13a475b3db735617017fb35f816079bf503765312d4b1913b18cf96f3fa515"
+dependencies = [
+ "clru",
+ "gix-chunk",
+ "gix-features",
+ "gix-hash",
+ "gix-hashtable",
+ "gix-object",
+ "gix-path",
+ "memmap2",
+ "smallvec",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-packetline"
+version = "0.18.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "123844a70cf4d5352441dc06bab0da8aef61be94ec239cb631e0ba01dc6d3a04"
+dependencies = [
+ "bstr",
+ "faster-hex",
+ "gix-trace",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-path"
+version = "0.10.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7cb06c3e4f8eed6e24fd915fa93145e28a511f4ea0e768bae16673e05ed3f366"
+dependencies = [
+ "bstr",
+ "gix-trace",
+ "gix-validate 0.10.1",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-protocol"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c61bd61afc6b67d213241e2100394c164be421e3f7228d3521b04f48ca5ba90"
+dependencies = [
+ "bstr",
+ "gix-date",
+ "gix-features",
+ "gix-hash",
+ "gix-ref",
+ "gix-shallow",
+ "gix-transport",
+ "gix-utils",
+ "maybe-async",
+ "thiserror 2.0.18",
+ "winnow 0.6.26",
+]
+
+[[package]]
+name = "gix-quote"
+version = "0.4.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e49357fccdb0c85c0d3a3292a9f6db32d9b3535959b5471bb9624908f4a066c6"
+dependencies = [
+ "bstr",
+ "gix-utils",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-ref"
+version = "0.50.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47adf4c5f933429f8554e95d0d92eee583cfe4b95d2bf665cd6fd4a1531ee20c"
+dependencies = [
+ "gix-actor",
+ "gix-features",
+ "gix-fs",
+ "gix-hash",
+ "gix-lock",
+ "gix-object",
+ "gix-path",
+ "gix-tempfile",
+ "gix-utils",
+ "gix-validate 0.9.4",
+ "memmap2",
+ "thiserror 2.0.18",
+ "winnow 0.6.26",
+]
+
+[[package]]
+name = "gix-refspec"
+version = "0.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59650228d8f612f68e7f7a25f517fcf386c5d0d39826085492e94766858b0a90"
+dependencies = [
+ "bstr",
+ "gix-hash",
+ "gix-revision",
+ "gix-validate 0.9.4",
+ "smallvec",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-revision"
+version = "0.32.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3fe28bbccca55da6d66e6c6efc6bb4003c29d407afd8178380293729733e6b53"
+dependencies = [
+ "bitflags",
+ "bstr",
+ "gix-commitgraph",
+ "gix-date",
+ "gix-hash",
+ "gix-hashtable",
+ "gix-object",
+ "gix-revwalk",
+ "gix-trace",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-revwalk"
+version = "0.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4ecb80c235b1e9ef2b99b23a81ea50dd569a88a9eb767179793269e0e616247"
+dependencies = [
+ "gix-commitgraph",
+ "gix-date",
+ "gix-hash",
+ "gix-hashtable",
+ "gix-object",
+ "smallvec",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-sec"
+version = "0.10.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47aeb0f13de9ef2f3033f5ff218de30f44db827ac9f1286f9ef050aacddd5888"
+dependencies = [
+ "bitflags",
+ "gix-path",
+ "libc",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "gix-shallow"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab72543011e303e52733c85bef784603ef39632ddf47f69723def52825e35066"
+dependencies = [
+ "bstr",
+ "gix-hash",
+ "gix-lock",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-tempfile"
+version = "16.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2558f423945ef24a8328c55d1fd6db06b8376b0e7013b1bb476cc4ffdf678501"
+dependencies = [
+ "gix-fs",
+ "libc",
+ "once_cell",
+ "parking_lot",
+ "tempfile",
+]
+
+[[package]]
+name = "gix-trace"
+version = "0.1.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f23569e55f2ffaf958617353b9734a7d52a7c19c439eeaa5e3efc217fd2270e"
+
+[[package]]
+name = "gix-transport"
+version = "0.45.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "11187418489477b1b5b862ae1aedbbac77e582f2c4b0ef54280f20cfe5b964d9"
+dependencies = [
+ "bstr",
+ "gix-command",
+ "gix-features",
+ "gix-packetline",
+ "gix-quote",
+ "gix-sec",
+ "gix-url",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-traverse"
+version = "0.44.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2bec70e53896586ef32a3efa7e4427b67308531ed186bb6120fb3eca0f0d61b4"
+dependencies = [
+ "bitflags",
+ "gix-commitgraph",
+ "gix-date",
+ "gix-hash",
+ "gix-hashtable",
+ "gix-object",
+ "gix-revwalk",
+ "smallvec",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-url"
+version = "0.29.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29218c768b53dd8f116045d87fec05b294c731a4b2bdd257eeca2084cc150b13"
+dependencies = [
+ "bstr",
+ "gix-features",
+ "gix-path",
+ "percent-encoding",
+ "thiserror 2.0.18",
+ "url",
+]
+
+[[package]]
+name = "gix-utils"
+version = "0.1.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff08f24e03ac8916c478c8419d7d3c33393da9bb41fa4c24455d5406aeefd35f"
+dependencies = [
+ "fastrand",
+ "unicode-normalization",
+]
+
+[[package]]
+name = "gix-validate"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34b5f1253109da6c79ed7cf6e1e38437080bb6d704c76af14c93e2f255234084"
+dependencies = [
+ "bstr",
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "gix-validate"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b1e63a5b516e970a594f870ed4571a8fdcb8a344e7bd407a20db8bd61dbfde4"
+dependencies = [
+ "bstr",
+ "thiserror 2.0.18",
+]
+
[[package]]
name = "glob"
version = "0.3.3"
@@ -3737,6 +4339,16 @@ dependencies = [
"unicode-normalization",
]
+[[package]]
+name = "kebab-parse-code"
+version = "0.6.0"
+dependencies = [
+ "anyhow",
+ "gix",
+ "kebab-core",
+ "tempfile",
+]
+
[[package]]
name = "kebab-parse-image"
version = "0.6.0"
@@ -4846,6 +5458,17 @@ dependencies = [
"thread-tree",
]
+[[package]]
+name = "maybe-async"
+version = "0.2.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "746873a384ad60adc5db74471dfaba74bd278afbdcfd81db93fafcdfc8b5ca0c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
[[package]]
name = "maybe-rayon"
version = "0.1.1"
@@ -5702,6 +6325,16 @@ dependencies = [
"unicode-ident",
]
+[[package]]
+name = "prodash"
+version = "29.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f04bb108f648884c23b98a0e940ebc2c93c0c3b89f04dbaf7eb8256ce617d1bc"
+dependencies = [
+ "log",
+ "parking_lot",
+]
+
[[package]]
name = "profiling"
version = "1.0.17"
@@ -6841,6 +7474,12 @@ dependencies = [
"unsafe-libyaml",
]
+[[package]]
+name = "sha1_smol"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbfa15b3dddfee50a0fff136974b3e1bde555604ba463834a7eb7deb6417705d"
+
[[package]]
name = "sha2"
version = "0.10.9"
@@ -6861,6 +7500,12 @@ dependencies = [
"lazy_static",
]
+[[package]]
+name = "shell-words"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc6fe69c597f9c37bfeeeeeb33da3530379845f10be461a66d16d03eca2ded77"
+
[[package]]
name = "shellexpand"
version = "3.1.2"
@@ -7889,6 +8534,12 @@ version = "2.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142"
+[[package]]
+name = "unicode-bom"
+version = "2.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7eec5d1121208364f6793f7d2e222bf75a915c19557537745b195b253dd64217"
+
[[package]]
name = "unicode-ident"
version = "1.0.24"
@@ -8587,6 +9238,15 @@ version = "0.53.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
+[[package]]
+name = "winnow"
+version = "0.6.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e90edd2ac1aa278a5c4599b1d89cf03074b610800f866d4026dc199d7929a28"
+dependencies = [
+ "memchr",
+]
+
[[package]]
name = "winnow"
version = "0.7.15"
diff --git a/Cargo.toml b/Cargo.toml
index 3731f22..7b2527c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,6 +23,7 @@ members = [
"crates/kebab-parse-pdf",
"crates/kebab-tui",
"crates/kebab-mcp",
+ "crates/kebab-parse-code",
]
[workspace.package]
@@ -81,6 +82,10 @@ rmcp = { version = "1.6", default-features = false, features = ["server"
# sync via reqwest::blocking — wiremock is dev-only there).
wiremock = "0.6"
base64 = "0.22"
+# Pure-Rust git library for repo metadata detection (kebab-parse-code).
+# No `git` binary required. Default features are disabled; only `revision`
+# is enabled, which covers the HEAD name + commit SHA queries used there.
+gix = { version = "0.70", default-features = false, features = ["revision"] }
# Disk-footprint trim for dev / test builds. Codegen, opt-level, and
# behavior are unchanged — only DWARF debug info is reduced (line
diff --git a/crates/kebab-parse-code/Cargo.toml b/crates/kebab-parse-code/Cargo.toml
new file mode 100644
index 0000000..ac76da0
--- /dev/null
+++ b/crates/kebab-parse-code/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "kebab-parse-code"
+version = { workspace = true }
+edition = { workspace = true }
+license = { workspace = true }
+
+[dependencies]
+anyhow = { workspace = true }
+gix = { workspace = true }
+kebab-core = { path = "../kebab-core" }
+
+[dev-dependencies]
+tempfile = { workspace = true }
diff --git a/crates/kebab-parse-code/src/lang.rs b/crates/kebab-parse-code/src/lang.rs
new file mode 100644
index 0000000..bd850f6
--- /dev/null
+++ b/crates/kebab-parse-code/src/lang.rs
@@ -0,0 +1,42 @@
+//! Canonical extension → language identifier mapping (spec §3.5).
+//!
+//! Lowercase canonical identifiers, matching tree-sitter parser conventions:
+//! `rust`, `python`, `typescript`, `javascript`, `go`, `java`, `kotlin`, `c`,
+//! `cpp`, `yaml`, `toml`, `json`, `shell`, `make`, `dockerfile`.
+
+use std::path::Path;
+
+/// Map `path` to its canonical language identifier.
+///
+/// Lookup order:
+/// 1. exact file name (`Dockerfile`, `Makefile`, `GNUmakefile`);
+/// 2. file extension, compared ASCII-case-insensitively.
+///
+/// Returns `None` when neither the name nor the extension is recognized.
+pub fn code_lang_for_path(path: &Path) -> Option<&'static str> {
+    match path.file_name().and_then(|n| n.to_str()) {
+        Some("Dockerfile") => return Some("dockerfile"),
+        Some("Makefile" | "GNUmakefile") => return Some("make"),
+        _ => {}
+    }
+    let lowered = path.extension()?.to_str()?.to_ascii_lowercase();
+    let lang = match lowered.as_str() {
+        "rs" => "rust",
+        "py" | "pyi" => "python",
+        "ts" | "tsx" => "typescript",
+        "js" | "mjs" | "cjs" | "jsx" => "javascript",
+        "go" => "go",
+        "java" => "java",
+        "kt" | "kts" => "kotlin",
+        "c" | "h" => "c",
+        "cpp" | "cc" | "cxx" | "hpp" | "hh" | "hxx" => "cpp",
+        "yaml" | "yml" => "yaml",
+        "toml" => "toml",
+        "json" => "json",
+        "sh" | "bash" | "zsh" => "shell",
+        "mk" => "make",
+        "dockerfile" => "dockerfile",
+        _ => return None,
+    };
+    Some(lang)
+}
diff --git a/crates/kebab-parse-code/src/lib.rs b/crates/kebab-parse-code/src/lib.rs
new file mode 100644
index 0000000..3b699c1
--- /dev/null
+++ b/crates/kebab-parse-code/src/lib.rs
@@ -0,0 +1,22 @@
+//! `kebab-parse-code` — language-aware parsing for code corpora.
+//!
+//! Phase 1A-1 ships infrastructure only:
+//!
+//! - [`lang::code_lang_for_path`] — extension → language identifier.
+//! - [`repo::detect_repo`] — `.git/` walk-up → repo / branch / commit metadata.
+//! - [`skip::is_generated_file`] / [`skip::is_oversized`] — pre-ingest skip
+//! helpers consulted by `kebab-source-fs`.
+//! - [`skip::BUILTIN_BLACKLIST`] — 6-entry safety-net pattern list.
+//!
+//! Per-language parser modules (`rust`, `python`, `typescript`, …) land in
+//! later phases (1A-2 onwards). The crate boundary follows other
+//! `kebab-parse-*` crates per design §8: must NOT depend on store / embed
+//! / llm / rag.
+
+pub mod lang;
+pub mod repo;
+pub mod skip;
+
+pub use lang::code_lang_for_path;
+pub use repo::{RepoMeta, detect_repo};
+pub use skip::{BUILTIN_BLACKLIST, is_generated_file, is_oversized};
diff --git a/crates/kebab-parse-code/src/repo.rs b/crates/kebab-parse-code/src/repo.rs
new file mode 100644
index 0000000..6798fbe
--- /dev/null
+++ b/crates/kebab-parse-code/src/repo.rs
@@ -0,0 +1,61 @@
+//! Git repo auto-detection (spec §5.1).
+//!
+//! Walks up from `path` looking for a `.git/` directory. If found, reads
+//! repo dir name, current branch, and HEAD commit using `gix` (pure Rust;
+//! no `git` binary on PATH required).
+
+use std::path::Path;
+
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct RepoMeta {
+    pub name: String,           // name of the directory that contains `.git`
+    pub branch: Option<String>, // HEAD branch name or "detached"; may be `None`
+    pub commit: Option<String>, // HEAD commit SHA as hex text; may be `None`
+}
+
+/// Walk up from `path` until a directory containing `.git` is found and
+/// return its metadata, or `None` if the filesystem root is reached first.
+///
+/// `path` is made absolute first so that a relative path can also walk above
+/// the current directory.
+///
+/// - `name`: name of the directory that contains `.git`.
+/// - `branch` / `commit`: HEAD branch name (or `"detached"`) and 40-hex HEAD
+///   commit SHA as produced by `read_head`; either may be `None`. Both are
+///   `None` when `.git` is a file (worktree marker / submodule).
+pub fn detect_repo(path: &Path) -> Option<RepoMeta> {
+    // Fall back to the path as given if it cannot be made absolute.
+    let start = std::path::absolute(path).unwrap_or_else(|_| path.to_path_buf());
+    let mut cur = if start.is_dir() { start.as_path() } else { start.parent()? };
+    loop {
+        let dotgit = cur.join(".git");
+        if dotgit.is_dir() {
+            let name = cur.file_name()?.to_string_lossy().into_owned();
+            let (branch, commit) = read_head(cur);
+            return Some(RepoMeta { name, branch, commit });
+        } else if dotgit.is_file() {
+            let name = cur.file_name()?.to_string_lossy().into_owned();
+            return Some(RepoMeta { name, branch: None, commit: None });
+        }
+        cur = cur.parent()?;
+    }
+}
+
+/// Best-effort HEAD lookup via `gix`; returns `(branch, commit)`.
+///
+/// `branch` is `"detached"` only when HEAD names no branch; a failed
+/// `head_name()` lookup yields `None` instead of being reported as detached.
+fn read_head(repo_dir: &Path) -> (Option<String>, Option<String>) {
+    let Ok(repo) = gix::open(repo_dir) else {
+        return (None, None);
+    };
+    let branch = match repo.head_name() {
+        Ok(Some(name)) => Some(name.shorten().to_string()),
+        // `Ok(None)`: HEAD does not point at a branch.
+        Ok(None) => Some("detached".to_string()),
+        Err(_) => None,
+    };
+    // A `head_id()` failure (e.g. a repository without commits) maps to `None`.
+    let commit = repo.head_id().ok().map(|id| id.to_string());
+    (branch, commit)
+}
diff --git a/crates/kebab-parse-code/src/skip.rs b/crates/kebab-parse-code/src/skip.rs
new file mode 100644
index 0000000..eafecf8
--- /dev/null
+++ b/crates/kebab-parse-code/src/skip.rs
@@ -0,0 +1,65 @@
+//! Pre-ingest skip helpers (spec §5.2 + §5.3 + §5.4).
+//!
+//! - [`BUILTIN_BLACKLIST`] — 6 gitignore-style patterns universal across
+//! ecosystems. Source of truth: spec §5.2.
+//! - [`is_generated_file`] — reads first ~512 bytes, checks for 7
+//! case-insensitive markers.
+//! - [`is_oversized`] — byte cap then line cap.
+
+use anyhow::Result;
+use std::fs::File;
+use std::io::{BufRead, BufReader, Read};
+use std::path::Path;
+
+/// The 6 built-in gitignore-style skip patterns (source of truth: spec §5.2).
+/// Intended to apply on top of `.gitignore` + `.kebabignore`; a user can
+/// re-include a path with a `.kebabignore` negation (`!pattern`).
+pub const BUILTIN_BLACKLIST: &[&str] = &[
+    "**/node_modules/**",
+    "**/target/**",
+    "**/__pycache__/**",
+    "**/.venv/**",
+    "**/venv/**",
+    "**/env/**", // NOTE(review): also matches any source dir named `env` — confirm intended
+];
+
+/// Scan the first 512 bytes (at most 10 lines) of `path` for any of 7
+/// case-insensitive generated-file markers. Returns `Ok(true)` on a match,
+/// `Ok(false)` otherwise (including an empty file), `Err` on open/read failure.
+pub fn is_generated_file(path: &Path) -> Result<bool> {
+    const MARKERS: [&str; 7] = [
+        "@generated",
+        "code generated by",
+        "do not edit",
+        "do not modify",
+        "automatically generated",
+        "auto-generated",
+        "autogenerated",
+    ];
+    let mut head = Vec::with_capacity(512);
+    File::open(path)?.take(512).read_to_end(&mut head)?;
+    // Decode lossily: the 512-byte cut can split a multi-byte character, and
+    // strict decoding would then discard the ASCII markers in front of it.
+    let text = String::from_utf8_lossy(&head);
+    let lower = text.lines().take(10).collect::<Vec<_>>().join("\n").to_ascii_lowercase();
+    Ok(MARKERS.iter().any(|marker| lower.contains(*marker)))
+}
+
+/// Check whether `path` exceeds `max_bytes` or `max_lines`.
+///
+/// The byte cap is checked first from file metadata. Lines are then streamed
+/// as raw `\n`-separated segments (no UTF-8 requirement) with an early exit.
+pub fn is_oversized(path: &Path, max_bytes: u64, max_lines: u32) -> Result<bool> {
+    if std::fs::metadata(path)?.len() > max_bytes {
+        return Ok(true);
+    }
+    let mut count: u32 = 0;
+    for segment in BufReader::new(File::open(path)?).split(b'\n') {
+        segment?;
+        count = count.saturating_add(1);
+        if count > max_lines {
+            return Ok(true);
+        }
+    }
+    Ok(false)
+}
diff --git a/crates/kebab-parse-code/tests/lang.rs b/crates/kebab-parse-code/tests/lang.rs
new file mode 100644
index 0000000..73a1551
--- /dev/null
+++ b/crates/kebab-parse-code/tests/lang.rs
@@ -0,0 +1,64 @@
+use kebab_parse_code::code_lang_for_path;
+use std::path::Path;
+
+#[test]
+fn known_extensions_map_to_canonical_identifiers() {
+    // Each pair is (file name, expected language identifier); the expected
+    // value is wrapped in `Some` when compared below.
+    let expectations: [(&str, &str); 29] = [
+        ("foo.rs", "rust"),
+        ("foo.py", "python"),
+        ("foo.pyi", "python"),
+        ("foo.ts", "typescript"),
+        ("foo.tsx", "typescript"),
+        ("foo.js", "javascript"),
+        ("foo.mjs", "javascript"),
+        ("foo.cjs", "javascript"),
+        ("foo.jsx", "javascript"),
+        ("foo.go", "go"),
+        ("foo.java", "java"),
+        ("foo.kt", "kotlin"),
+        ("foo.kts", "kotlin"),
+        ("foo.c", "c"),
+        ("foo.h", "c"),
+        ("foo.cpp", "cpp"),
+        ("foo.cc", "cpp"),
+        ("foo.cxx", "cpp"),
+        ("foo.hpp", "cpp"),
+        ("foo.hh", "cpp"),
+        ("foo.hxx", "cpp"),
+        ("foo.yaml", "yaml"),
+        ("foo.yml", "yaml"),
+        ("foo.toml", "toml"),
+        ("foo.json", "json"),
+        ("foo.sh", "shell"),
+        ("foo.bash", "shell"),
+        ("foo.zsh", "shell"),
+        ("foo.mk", "make"),
+    ];
+    // Check every pair individually so a failure names the offending path.
+    for (path, lang) in expectations {
+        let detected = code_lang_for_path(Path::new(path));
+        assert_eq!(detected, Some(lang), "path = {path}");
+    }
+}
+
+#[test]
+fn special_filenames_map_to_identifiers() {
+    let docker = ["Dockerfile", "foo.dockerfile"].map(|f| code_lang_for_path(Path::new(f)));
+    assert_eq!(docker, [Some("dockerfile"); 2]);
+    assert_eq!(code_lang_for_path(Path::new("Makefile")), Some("make"));
+}
+
+#[test]
+fn unknown_extension_returns_none() {
+    for file in ["foo.docx", "foo", "foo.unknown"] {
+        assert_eq!(code_lang_for_path(Path::new(file)), None);
+    }
+}
+
+#[test]
+fn case_insensitive() {
+    let upper = ["Foo.RS", "FOO.YAML"].map(|f| code_lang_for_path(Path::new(f)));
+    assert_eq!(upper, [Some("rust"), Some("yaml")]);
+}
diff --git a/crates/kebab-parse-code/tests/repo.rs b/crates/kebab-parse-code/tests/repo.rs
new file mode 100644
index 0000000..68365a1
--- /dev/null
+++ b/crates/kebab-parse-code/tests/repo.rs
@@ -0,0 +1,62 @@
+use kebab_parse_code::repo::detect_repo;
+use std::fs;
+use std::process::Command;
+use tempfile::TempDir;
+
+/// Create a one-commit git repository in `root` using the `git` CLI.
+/// Panics if `git` cannot be spawned or any command exits unsuccessfully.
+fn init_git_repo(root: &std::path::Path) {
+    let run = |args: &[&str]| {
+        let status = Command::new("git").args(args).current_dir(root).status();
+        // A spawned-but-failed command must not let the test continue silently.
+        assert!(status.expect("git command failed").success(), "git {args:?} failed");
+    };
+    run(&["init", "-q"]);
+    run(&["config", "user.email", "test@test"]);
+    run(&["config", "user.name", "test"]);
+    fs::write(root.join("README.md"), "hi").unwrap();
+    run(&["add", "README.md"]);
+    run(&["commit", "-q", "-m", "init"]);
+}
+
+#[test]
+fn detect_repo_returns_none_outside_git() {
+    let sandbox = TempDir::new().unwrap();
+    let dir = sandbox.path().join("a/b");
+    fs::create_dir_all(&dir).unwrap();
+    fs::write(dir.join("c.txt"), "x").unwrap();
+    assert_eq!(detect_repo(&dir.join("c.txt")), None);
+}
+
+#[test]
+fn detect_repo_walks_up_to_git_dir() {
+    let sandbox = TempDir::new().unwrap();
+    let root = sandbox.path().join("myrepo");
+    fs::create_dir_all(&root).unwrap();
+    init_git_repo(&root);
+    // Place the probe file two directories below the repository root.
+    let src_dir = root.join("src/deep");
+    fs::create_dir_all(&src_dir).unwrap();
+    fs::write(src_dir.join("file.rs"), "x").unwrap();
+
+    let found = detect_repo(&src_dir.join("file.rs")).expect("should detect repo");
+    assert_eq!(found.name, "myrepo");
+    assert!(found.branch.is_some());
+    assert_eq!(found.commit.map(|sha| sha.len()), Some(40));
+}
+
+#[test]
+fn detect_repo_caches_per_path_call_for_repeated_files_in_same_repo() {
+    let sandbox = TempDir::new().unwrap();
+    let root = sandbox.path().join("myrepo");
+    fs::create_dir_all(&root).unwrap();
+    init_git_repo(&root);
+    // No cache is involved: both calls re-detect and must agree on name + commit.
+    let [first, second] = ["a.rs", "b.rs"].map(|file| {
+        let path = root.join(file);
+        fs::write(&path, "x").unwrap();
+        detect_repo(&path).unwrap()
+    });
+    assert_eq!(first.name, second.name);
+    assert_eq!(first.commit, second.commit);
+}
diff --git a/crates/kebab-parse-code/tests/skip.rs b/crates/kebab-parse-code/tests/skip.rs
new file mode 100644
index 0000000..b85dafe
--- /dev/null
+++ b/crates/kebab-parse-code/tests/skip.rs
@@ -0,0 +1,74 @@
+use kebab_parse_code::skip::{BUILTIN_BLACKLIST, is_generated_file, is_oversized};
+use std::fs;
+use tempfile::NamedTempFile;
+
+#[test]
+fn generated_header_markers_trigger_skip() {
+    let samples = [
+        "// @generated\nfn foo() {}\n",
+        "// Code generated by tonic-build. DO NOT EDIT.\nfn x() {}\n",
+        "/* DO NOT EDIT */\nfn x() {}\n",
+        "/* do not modify */\nfn x() {}\n",
+        "// AUTOMATICALLY GENERATED\nfn x() {}\n",
+        "# auto-generated\ndef x(): pass\n",
+        "// autogenerated\nfn x() {}\n",
+    ];
+    samples.into_iter().for_each(|content| {
+        let f = NamedTempFile::new().unwrap();
+        fs::write(f.path(), content).unwrap();
+        assert!(is_generated_file(f.path()).unwrap(), "content: {content:?}");
+    });
+}
+
+#[test]
+fn normal_code_is_not_flagged_generated() {
+    let plain = NamedTempFile::new().unwrap();
+    fs::write(plain.path(), "fn main() {\n println!(\"hi\");\n}\n").unwrap();
+    assert!(matches!(is_generated_file(plain.path()), Ok(false)));
+}
+
+#[test]
+fn is_generated_returns_false_for_empty_file() {
+    let empty = NamedTempFile::new().unwrap();
+    fs::write(empty.path(), "").unwrap();
+    assert!(matches!(is_generated_file(empty.path()), Ok(false)));
+}
+
+#[test]
+fn oversized_by_bytes_returns_true() {
+    let big = NamedTempFile::new().unwrap();
+    // 300 000 bytes on a single line: only the 262 144-byte cap can trip.
+    fs::write(big.path(), "x".repeat(300_000)).unwrap();
+    assert!(matches!(is_oversized(big.path(), 262_144, 5_000), Ok(true)));
+}
+
+#[test]
+fn oversized_by_lines_returns_true() {
+    let tall = NamedTempFile::new().unwrap();
+    // 6 000 two-byte lines stay under the byte cap but exceed the 5 000-line cap.
+    fs::write(tall.path(), "x\n".repeat(6_000)).unwrap();
+    assert!(matches!(is_oversized(tall.path(), 262_144, 5_000), Ok(true)));
+}
+
+#[test]
+fn small_file_returns_false_for_oversize() {
+    let small = NamedTempFile::new().unwrap();
+    fs::write(small.path(), "fn foo() {}\n").unwrap();
+    assert!(matches!(is_oversized(small.path(), 262_144, 5_000), Ok(false)));
+}
+
+#[test]
+fn builtin_blacklist_has_exactly_six_entries() {
+    let expected = [
+        "**/node_modules/**",
+        "**/target/**",
+        "**/__pycache__/**",
+        "**/.venv/**",
+        "**/venv/**",
+        "**/env/**",
+    ];
+    assert_eq!(BUILTIN_BLACKLIST.len(), 6);
+    expected.iter().for_each(|pat| {
+        assert!(BUILTIN_BLACKLIST.contains(pat), "missing pattern: {pat}");
+    });
+}