From ff11f81f7f15ee66bdd073c15063f5e3fa5dd0e8 Mon Sep 17 00:00:00 2001 From: th-kim0823 Date: Fri, 15 May 2026 15:57:59 +0900 Subject: [PATCH] feat(p10-1a-1): kebab-parse-code crate (lang + repo + skip) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tasks 5-8: new `kebab-parse-code` crate with three infrastructure modules for the code ingest framework. Ships lang.rs (extension→language identifier mapping), repo.rs (.git walk-up via gix 0.70 for RepoMeta), and skip.rs (BUILTIN_BLACKLIST, is_generated_file, is_oversized). 14 integration tests across three test files, all passing; clippy -D warnings clean. Note: gix pinned to 0.70 (not 0.83 as originally suggested) because 0.83 fails to compile against Rust 1.94.1 due to non-exhaustive match patterns in gix-hash. 0.70 resolves cleanly and has identical head_name/head_id API. Co-Authored-By: Claude Sonnet 4.6 --- Cargo.lock | 660 ++++++++++++++++++++++++++ Cargo.toml | 5 + crates/kebab-parse-code/Cargo.toml | 13 + crates/kebab-parse-code/src/lang.rs | 42 ++ crates/kebab-parse-code/src/lib.rs | 22 + crates/kebab-parse-code/src/repo.rs | 61 +++ crates/kebab-parse-code/src/skip.rs | 65 +++ crates/kebab-parse-code/tests/lang.rs | 64 +++ crates/kebab-parse-code/tests/repo.rs | 62 +++ crates/kebab-parse-code/tests/skip.rs | 74 +++ 10 files changed, 1068 insertions(+) create mode 100644 crates/kebab-parse-code/Cargo.toml create mode 100644 crates/kebab-parse-code/src/lang.rs create mode 100644 crates/kebab-parse-code/src/lib.rs create mode 100644 crates/kebab-parse-code/src/repo.rs create mode 100644 crates/kebab-parse-code/src/skip.rs create mode 100644 crates/kebab-parse-code/tests/lang.rs create mode 100644 crates/kebab-parse-code/tests/repo.rs create mode 100644 crates/kebab-parse-code/tests/skip.rs diff --git a/Cargo.lock b/Cargo.lock index 332d306..499f233 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -755,6 +755,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" dependencies = [ "memchr", + "regex-automata", "serde", ] @@ -931,6 +932,15 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" +[[package]] +name = "clru" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "197fd99cb113a8d5d9b6376f3aa817f32c1078f2343b714fff7d2ca44fdf67d5" +dependencies = [ + "hashbrown 0.16.1", +] + [[package]] name = "color_quant" version = "1.1.0" @@ -2140,6 +2150,12 @@ version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "117240f60069e65410b3ae1bb213295bd828f707b5bec6596a1afc8793ce0cbc" +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "dyn-clone" version = "1.0.20" @@ -2302,6 +2318,15 @@ dependencies = [ "tokenizers", ] +[[package]] +name = "faster-hex" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2a2b11eda1d40935b26cf18f6833c526845ae8c41e58d09af6adeb6f0269183" +dependencies = [ + "serde", +] + [[package]] name = "fastrand" version = "2.4.1" @@ -2738,6 +2763,583 @@ dependencies = [ "weezl", ] +[[package]] +name = "gix" +version = "0.70.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736f14636705f3a56ea52b553e67282519418d9a35bb1e90b3a9637a00296b68" +dependencies = [ + "gix-actor", + "gix-commitgraph", + "gix-config", + "gix-date", + "gix-diff", + "gix-discover", + "gix-features", + "gix-fs", + "gix-glob", + "gix-hash", + "gix-hashtable", + "gix-index", + "gix-lock", + "gix-object", + "gix-odb", + "gix-pack", + "gix-path", + "gix-protocol", + "gix-ref", + "gix-refspec", + "gix-revision", + "gix-revwalk", + "gix-sec", + "gix-shallow", + "gix-tempfile", + "gix-trace", + "gix-traverse", + "gix-url", + "gix-utils", + "gix-validate 0.9.4", + "once_cell", + "smallvec", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-actor" +version = "0.33.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20018a1a6332e065f1fcc8305c1c932c6b8c9985edea2284b3c79dc6fa3ee4b2" +dependencies = [ + "bstr", + "gix-date", + "gix-utils", + "itoa", + "thiserror 2.0.18", + "winnow 0.6.26", +] + +[[package]] +name = "gix-bitmap" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d982fc7ef0608e669851d0d2a6141dae74c60d5a27e8daa451f2a4857bbf41e2" +dependencies = [ + "thiserror 2.0.18", +] + +[[package]] +name = "gix-chunk" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c356b3825677cb6ff579551bb8311a81821e184453cbd105e2fc5311b288eeb" +dependencies = [ + "thiserror 2.0.18", +] + +[[package]] +name = "gix-command" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb410b84d6575db45e62025a9118bdbf4d4b099ce7575a76161e898d9ca98df1" +dependencies = [ + "bstr", + "gix-path", + "gix-trace", + "shell-words", +] + +[[package]] +name = "gix-commitgraph" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e23a8ec2d8a16026a10dafdb6ed51bcfd08f5d97f20fa52e200bc50cb72e4877" +dependencies = [ + "bstr", + "gix-chunk", + "gix-features", + "gix-hash", + "memmap2", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-config" +version = "0.43.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "377c1efd2014d5d469e0b3cd2952c8097bce9828f634e04d5665383249f1d9e9" +dependencies = [ + "bstr", + "gix-config-value", + "gix-features", + "gix-glob", + "gix-path", + "gix-ref", + "gix-sec", + "memchr", + "once_cell", + "smallvec", + "thiserror 2.0.18", + "unicode-bom", + "winnow 0.6.26", +] + +[[package]] +name = "gix-config-value" +version = "0.14.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8dc2c844c4cf141884678cabef736fd91dd73068b9146e6f004ba1a0457944b6" +dependencies = [ + "bitflags", + "bstr", + "gix-path", + "libc", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-date" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daa30058ec7d3511fbc229e4f9e696a35abd07ec5b82e635eff864a2726217e4" +dependencies = [ + "bstr", + "itoa", + "jiff", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-diff" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62afb7f4ca0acdf4e9dad92065b2eb1bf2993bcc5014b57bc796e3a365b17c4d" +dependencies = [ + "bstr", + "gix-hash", + "gix-object", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-discover" +version = "0.38.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0c2414bdf04064e0f5a5aa029dfda1e663cf9a6c4bfc8759f2d369299bb65d8" +dependencies = [ + "bstr", + "dunce", + "gix-fs", + "gix-hash", + "gix-path", + "gix-ref", + "gix-sec", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-features" +version = "0.40.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bfdd4838a8d42bd482c9f0cb526411d003ee94cc7c7b08afe5007329c71d554" +dependencies = [ + "crc32fast", + "flate2", + "gix-hash", + "gix-trace", + "gix-utils", + "libc", + "once_cell", + "prodash", + "sha1_smol", + "thiserror 2.0.18", + "walkdir", +] + +[[package]] +name = "gix-fs" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "182e7fa7bfdf44ffb7cfe7451b373cdf1e00870ac9a488a49587a110c562063d" +dependencies = [ + "fastrand", + "gix-features", + "gix-utils", +] + +[[package]] +name = "gix-glob" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e9c7249fa0a78f9b363aa58323db71e0a6161fd69860ed6f48dedf0ef3a314e" +dependencies = [ + "bitflags", + "bstr", + "gix-features", + "gix-path", +] + +[[package]] +name = "gix-hash" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e81c5ec48649b1821b3ed066a44efb95f1a268b35c1d91295e61252539fbe9f8" +dependencies = [ + "faster-hex", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-hashtable" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "189130bc372accd02e0520dc5ab1cef318dcc2bc829b76ab8d84bbe90ac212d1" +dependencies = [ + "gix-hash", + "hashbrown 0.14.5", + "parking_lot", +] + +[[package]] +name = "gix-index" +version = "0.38.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acd12e3626879369310fffe2ac61acc828613ef656b50c4ea984dd59d7dc85d8" +dependencies = [ + "bitflags", + "bstr", + "filetime", + "fnv", + "gix-bitmap", + "gix-features", + "gix-fs", + "gix-hash", + "gix-lock", + "gix-object", + "gix-traverse", + "gix-utils", + "gix-validate 0.9.4", + "hashbrown 0.14.5", + "itoa", + "libc", + "memmap2", + "rustix 0.38.44", + "smallvec", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-lock" +version = "16.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9739815270ff6940968441824d162df9433db19211ca9ba8c3fc1b50b849c642" +dependencies = [ + "gix-tempfile", + "gix-utils", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-object" +version = "0.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddc4b3a0044244f0fe22347fb7a79cca165e37829d668b41b85ff46a43e5fd68" +dependencies = [ + "bstr", + "gix-actor", + "gix-date", + "gix-features", + "gix-hash", + "gix-hashtable", + "gix-path", + "gix-utils", + "gix-validate 0.9.4", + "itoa", + "smallvec", + "thiserror 2.0.18", + "winnow 0.6.26", +] + +[[package]] +name = "gix-odb" +version = "0.67.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e93457df69cd09573608ce9fa4f443fbd84bc8d15d8d83adecd471058459c1b" +dependencies = [ + "arc-swap", + "gix-date", + "gix-features", + "gix-fs", + "gix-hash", + "gix-hashtable", + "gix-object", + "gix-pack", + "gix-path", + "gix-quote", + "parking_lot", + "tempfile", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-pack" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc13a475b3db735617017fb35f816079bf503765312d4b1913b18cf96f3fa515" +dependencies = [ + "clru", + "gix-chunk", + "gix-features", + "gix-hash", + "gix-hashtable", + "gix-object", + "gix-path", + "memmap2", + "smallvec", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-packetline" +version = "0.18.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "123844a70cf4d5352441dc06bab0da8aef61be94ec239cb631e0ba01dc6d3a04" +dependencies = [ + "bstr", + "faster-hex", + "gix-trace", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-path" +version = "0.10.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cb06c3e4f8eed6e24fd915fa93145e28a511f4ea0e768bae16673e05ed3f366" +dependencies = [ + "bstr", + "gix-trace", + "gix-validate 0.10.1", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-protocol" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c61bd61afc6b67d213241e2100394c164be421e3f7228d3521b04f48ca5ba90" +dependencies = [ + "bstr", + "gix-date", + "gix-features", + "gix-hash", + "gix-ref", + "gix-shallow", + "gix-transport", + "gix-utils", + "maybe-async", + "thiserror 2.0.18", + "winnow 0.6.26", +] + +[[package]] +name = "gix-quote" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e49357fccdb0c85c0d3a3292a9f6db32d9b3535959b5471bb9624908f4a066c6" +dependencies = [ + "bstr", + "gix-utils", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-ref" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47adf4c5f933429f8554e95d0d92eee583cfe4b95d2bf665cd6fd4a1531ee20c" +dependencies = [ + "gix-actor", + "gix-features", + "gix-fs", + "gix-hash", + "gix-lock", + "gix-object", + "gix-path", + "gix-tempfile", + "gix-utils", + "gix-validate 0.9.4", + "memmap2", + "thiserror 2.0.18", + "winnow 0.6.26", +] + +[[package]] +name = "gix-refspec" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59650228d8f612f68e7f7a25f517fcf386c5d0d39826085492e94766858b0a90" +dependencies = [ + "bstr", + "gix-hash", + "gix-revision", + "gix-validate 0.9.4", + "smallvec", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-revision" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fe28bbccca55da6d66e6c6efc6bb4003c29d407afd8178380293729733e6b53" +dependencies = [ + "bitflags", + "bstr", + "gix-commitgraph", + "gix-date", + "gix-hash", + "gix-hashtable", + "gix-object", + "gix-revwalk", + "gix-trace", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-revwalk" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4ecb80c235b1e9ef2b99b23a81ea50dd569a88a9eb767179793269e0e616247" +dependencies = [ + "gix-commitgraph", + "gix-date", + "gix-hash", + "gix-hashtable", + "gix-object", + "smallvec", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-sec" +version = "0.10.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47aeb0f13de9ef2f3033f5ff218de30f44db827ac9f1286f9ef050aacddd5888" +dependencies = [ + "bitflags", + "gix-path", + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "gix-shallow" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab72543011e303e52733c85bef784603ef39632ddf47f69723def52825e35066" +dependencies = [ + "bstr", + "gix-hash", + "gix-lock", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-tempfile" +version = "16.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2558f423945ef24a8328c55d1fd6db06b8376b0e7013b1bb476cc4ffdf678501" +dependencies = [ + "gix-fs", + "libc", + "once_cell", + "parking_lot", + "tempfile", +] + +[[package]] +name = "gix-trace" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f23569e55f2ffaf958617353b9734a7d52a7c19c439eeaa5e3efc217fd2270e" + +[[package]] +name = "gix-transport" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11187418489477b1b5b862ae1aedbbac77e582f2c4b0ef54280f20cfe5b964d9" +dependencies = [ + "bstr", + "gix-command", + "gix-features", + "gix-packetline", + "gix-quote", + "gix-sec", + "gix-url", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-traverse" +version = "0.44.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bec70e53896586ef32a3efa7e4427b67308531ed186bb6120fb3eca0f0d61b4" +dependencies = [ + "bitflags", + "gix-commitgraph", + "gix-date", + "gix-hash", + "gix-hashtable", + "gix-object", + "gix-revwalk", + "smallvec", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-url" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29218c768b53dd8f116045d87fec05b294c731a4b2bdd257eeca2084cc150b13" +dependencies = [ + "bstr", + "gix-features", + "gix-path", + "percent-encoding", + "thiserror 2.0.18", + "url", +] + +[[package]] +name = "gix-utils" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff08f24e03ac8916c478c8419d7d3c33393da9bb41fa4c24455d5406aeefd35f" +dependencies = [ + "fastrand", + "unicode-normalization", +] + +[[package]] +name = "gix-validate" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34b5f1253109da6c79ed7cf6e1e38437080bb6d704c76af14c93e2f255234084" +dependencies = [ + "bstr", + "thiserror 2.0.18", +] + +[[package]] +name = "gix-validate" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b1e63a5b516e970a594f870ed4571a8fdcb8a344e7bd407a20db8bd61dbfde4" +dependencies = [ + "bstr", + "thiserror 2.0.18", +] + [[package]] name = "glob" version = "0.3.3" @@ -3737,6 +4339,16 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "kebab-parse-code" +version = "0.6.0" +dependencies = [ + "anyhow", + "gix", + "kebab-core", + "tempfile", +] + [[package]] name = "kebab-parse-image" version = "0.6.0" @@ -4846,6 +5458,17 @@ dependencies = [ "thread-tree", ] +[[package]] +name = "maybe-async" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "746873a384ad60adc5db74471dfaba74bd278afbdcfd81db93fafcdfc8b5ca0c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "maybe-rayon" version = "0.1.1" @@ -5702,6 +6325,16 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "prodash" +version = "29.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f04bb108f648884c23b98a0e940ebc2c93c0c3b89f04dbaf7eb8256ce617d1bc" +dependencies = [ + "log", + "parking_lot", +] + [[package]] name = "profiling" version = "1.0.17" @@ -6841,6 +7474,12 @@ dependencies = [ "unsafe-libyaml", ] +[[package]] +name = "sha1_smol" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbfa15b3dddfee50a0fff136974b3e1bde555604ba463834a7eb7deb6417705d" + [[package]] name = "sha2" version = "0.10.9" @@ -6861,6 +7500,12 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "shell-words" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc6fe69c597f9c37bfeeeeeb33da3530379845f10be461a66d16d03eca2ded77" + [[package]] name = "shellexpand" version = "3.1.2" @@ -7889,6 +8534,12 @@ version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" +[[package]] +name = "unicode-bom" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eec5d1121208364f6793f7d2e222bf75a915c19557537745b195b253dd64217" + [[package]] name = "unicode-ident" version = "1.0.24" @@ -8587,6 +9238,15 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" +[[package]] +name = "winnow" +version = "0.6.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e90edd2ac1aa278a5c4599b1d89cf03074b610800f866d4026dc199d7929a28" +dependencies = [ + "memchr", +] + [[package]] name = "winnow" version = "0.7.15" diff --git a/Cargo.toml b/Cargo.toml index 3731f22..7b2527c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ members = [ "crates/kebab-parse-pdf", "crates/kebab-tui", "crates/kebab-mcp", + "crates/kebab-parse-code", ] [workspace.package] @@ -81,6 +82,10 @@ rmcp = { version = "1.6", default-features = false, features = ["server" # sync via reqwest::blocking — wiremock is dev-only there). wiremock = "0.6" base64 = "0.22" +# Pure-Rust git library for repo metadata detection (kebab-parse-code). +# No `git` binary required. Default features include thread-safety + most +# object-reading capabilities needed for HEAD name + commit SHA queries. +gix = { version = "0.70", default-features = false, features = ["revision"] } # Disk-footprint trim for dev / test builds. Codegen, opt-level, and # behavior are unchanged — only DWARF debug info is reduced (line diff --git a/crates/kebab-parse-code/Cargo.toml b/crates/kebab-parse-code/Cargo.toml new file mode 100644 index 0000000..ac76da0 --- /dev/null +++ b/crates/kebab-parse-code/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "kebab-parse-code" +version = { workspace = true } +edition = { workspace = true } +license = { workspace = true } + +[dependencies] +anyhow = { workspace = true } +gix = { workspace = true } +kebab-core = { path = "../kebab-core" } + +[dev-dependencies] +tempfile = { workspace = true } diff --git a/crates/kebab-parse-code/src/lang.rs b/crates/kebab-parse-code/src/lang.rs new file mode 100644 index 0000000..bd850f6 --- /dev/null +++ b/crates/kebab-parse-code/src/lang.rs @@ -0,0 +1,42 @@ +//! Canonical extension → language identifier mapping (spec §3.5). +//! +//! Lowercase canonical identifiers, matching tree-sitter parser conventions: +//! `rust`, `python`, `typescript`, `javascript`, `go`, `java`, `kotlin`, `c`, +//! `cpp`, `yaml`, `toml`, `json`, `shell`, `make`, `dockerfile`. + +use std::path::Path; + +/// Returns the canonical language identifier for a given file path, or +/// `None` if the extension / filename is not recognized. +/// +/// Matching priority: +/// 1. exact filename match (e.g. `Dockerfile`, `Makefile`) +/// 2. lowercase extension match +pub fn code_lang_for_path(path: &Path) -> Option<&'static str> { + if let Some(name) = path.file_name().and_then(|n| n.to_str()) { + match name { + "Dockerfile" => return Some("dockerfile"), + "Makefile" | "GNUmakefile" => return Some("make"), + _ => {} + } + } + let ext = path.extension()?.to_str()?.to_ascii_lowercase(); + match ext.as_str() { + "rs" => Some("rust"), + "py" | "pyi" => Some("python"), + "ts" | "tsx" => Some("typescript"), + "js" | "mjs" | "cjs" | "jsx" => Some("javascript"), + "go" => Some("go"), + "java" => Some("java"), + "kt" | "kts" => Some("kotlin"), + "c" | "h" => Some("c"), + "cpp" | "cc" | "cxx" | "hpp" | "hh" | "hxx" => Some("cpp"), + "yaml" | "yml" => Some("yaml"), + "toml" => Some("toml"), + "json" => Some("json"), + "sh" | "bash" | "zsh" => Some("shell"), + "mk" => Some("make"), + "dockerfile" => Some("dockerfile"), + _ => None, + } +} diff --git a/crates/kebab-parse-code/src/lib.rs b/crates/kebab-parse-code/src/lib.rs new file mode 100644 index 0000000..3b699c1 --- /dev/null +++ b/crates/kebab-parse-code/src/lib.rs @@ -0,0 +1,22 @@ +//! `kebab-parse-code` — language-aware parsing for code corpora. +//! +//! Phase 1A-1 ships infrastructure only: +//! +//! - [`lang::code_lang_for_path`] — extension → language identifier. +//! - [`repo::detect_repo`] — `.git/` walk-up → repo / branch / commit metadata. +//! - [`skip::is_generated_file`] / [`skip::is_oversized`] — pre-ingest skip +//! helpers consulted by `kebab-source-fs`. +//! - [`skip::BUILTIN_BLACKLIST`] — 6-entry safety-net pattern list. +//! +//! Per-language parser modules (`rust`, `python`, `typescript`, …) land in +//! later phases (1A-2 onwards). The crate boundary follows other +//! `kebab-parse-*` crates per design §8: must NOT depend on store / embed +//! / llm / rag. + +pub mod lang; +pub mod repo; +pub mod skip; + +pub use lang::code_lang_for_path; +pub use repo::{RepoMeta, detect_repo}; +pub use skip::{BUILTIN_BLACKLIST, is_generated_file, is_oversized}; diff --git a/crates/kebab-parse-code/src/repo.rs b/crates/kebab-parse-code/src/repo.rs new file mode 100644 index 0000000..6798fbe --- /dev/null +++ b/crates/kebab-parse-code/src/repo.rs @@ -0,0 +1,61 @@ +//! Git repo auto-detection (spec §5.1). +//! +//! Walks up from `path` looking for a `.git/` directory. If found, reads +//! repo dir name, current branch, and HEAD commit using `gix` (pure Rust; +//! no `git` binary on PATH required). + +use std::path::Path; + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct RepoMeta { + pub name: String, + pub branch: Option, + pub commit: Option, +} + +/// Walk up from `path` until a `.git/` directory is found. Returns repo +/// metadata, or `None` if no repo boundary is reached before the filesystem +/// root. +/// +/// - `name`: directory name containing `.git/`. +/// - `branch`: current HEAD branch, or `"detached"` if detached HEAD, or +/// `None` if branch can't be read. +/// - `commit`: 40-hex commit SHA at HEAD, or `None` if empty repo / read +/// failure. +/// +/// `.git/` as a file (worktree marker / submodule) returns `None` for +/// `branch` and `commit` and falls back to the parent dir name for `name`. +pub fn detect_repo(path: &Path) -> Option { + let mut cur = if path.is_dir() { path } else { path.parent()? }; + loop { + let dotgit = cur.join(".git"); + if dotgit.is_dir() { + let name = cur.file_name()?.to_string_lossy().into_owned(); + let (branch, commit) = read_head(cur); + return Some(RepoMeta { name, branch, commit }); + } else if dotgit.is_file() { + let name = cur.file_name()?.to_string_lossy().into_owned(); + return Some(RepoMeta { name, branch: None, commit: None }); + } + cur = cur.parent()?; + } +} + +fn read_head(repo_dir: &Path) -> (Option, Option) { + match gix::open(repo_dir) { + Ok(repo) => { + let branch = repo + .head_name() + .ok() + .flatten() + .map(|n| n.shorten().to_string()) + .or_else(|| Some("detached".to_string())); + let commit = repo + .head_id() + .ok() + .map(|id| id.to_string()); + (branch, commit) + } + Err(_) => (None, None), + } +} diff --git a/crates/kebab-parse-code/src/skip.rs b/crates/kebab-parse-code/src/skip.rs new file mode 100644 index 0000000..eafecf8 --- /dev/null +++ b/crates/kebab-parse-code/src/skip.rs @@ -0,0 +1,65 @@ +//! Pre-ingest skip helpers (spec §5.2 + §5.3 + §5.4). +//! +//! - [`BUILTIN_BLACKLIST`] — 6 gitignore-style patterns universal across +//! ecosystems. Source of truth: spec §5.2. +//! - [`is_generated_file`] — reads first ~512 bytes, checks for 7 +//! case-insensitive markers. +//! - [`is_oversized`] — byte cap then line cap. + +use anyhow::Result; +use std::fs::File; +use std::io::{BufRead, BufReader, Read}; +use std::path::Path; + +/// 6 built-in gitignore-style patterns. Applied in addition to `.gitignore` +/// + `.kebabignore`. User can override via `.kebabignore` negation +/// (`!pattern`). +pub const BUILTIN_BLACKLIST: &[&str] = &[ + "**/node_modules/**", + "**/target/**", + "**/__pycache__/**", + "**/.venv/**", + "**/venv/**", + "**/env/**", +]; + +/// Read first 512 bytes, check for any of 7 case-insensitive generated-file +/// markers. Returns Ok(true) on match, Ok(false) otherwise. +pub fn is_generated_file(path: &Path) -> Result { + let mut buf = [0u8; 512]; + let mut f = File::open(path)?; + let n = f.read(&mut buf)?; + if n == 0 { + return Ok(false); + } + let head = std::str::from_utf8(&buf[..n]).unwrap_or(""); + let lower: String = head.lines().take(10).collect::>().join("\n").to_ascii_lowercase(); + Ok( + lower.contains("@generated") + || lower.contains("code generated by") + || lower.contains("do not edit") + || lower.contains("do not modify") + || lower.contains("automatically generated") + || lower.contains("auto-generated") + || lower.contains("autogenerated"), + ) +} + +/// Check if `path` exceeds `max_bytes` or `max_lines`. Byte cap first +/// (cheap), then line cap (streaming with early exit). +pub fn is_oversized(path: &Path, max_bytes: u64, max_lines: u32) -> Result { + let meta = std::fs::metadata(path)?; + if meta.len() > max_bytes { + return Ok(true); + } + let reader = BufReader::new(File::open(path)?); + let mut count: u32 = 0; + for line in reader.lines() { + let _ = line?; + count = count.saturating_add(1); + if count > max_lines { + return Ok(true); + } + } + Ok(false) +} diff --git a/crates/kebab-parse-code/tests/lang.rs b/crates/kebab-parse-code/tests/lang.rs new file mode 100644 index 0000000..73a1551 --- /dev/null +++ b/crates/kebab-parse-code/tests/lang.rs @@ -0,0 +1,64 @@ +use kebab_parse_code::code_lang_for_path; +use std::path::Path; + +#[test] +fn known_extensions_map_to_canonical_identifiers() { + let cases = [ + ("foo.rs", Some("rust")), + ("foo.py", Some("python")), + ("foo.pyi", Some("python")), + ("foo.ts", Some("typescript")), + ("foo.tsx", Some("typescript")), + ("foo.js", Some("javascript")), + ("foo.mjs", Some("javascript")), + ("foo.cjs", Some("javascript")), + ("foo.jsx", Some("javascript")), + ("foo.go", Some("go")), + ("foo.java", Some("java")), + ("foo.kt", Some("kotlin")), + ("foo.kts", Some("kotlin")), + ("foo.c", Some("c")), + ("foo.h", Some("c")), + ("foo.cpp", Some("cpp")), + ("foo.cc", Some("cpp")), + ("foo.cxx", Some("cpp")), + ("foo.hpp", Some("cpp")), + ("foo.hh", Some("cpp")), + ("foo.hxx", Some("cpp")), + ("foo.yaml", Some("yaml")), + ("foo.yml", Some("yaml")), + ("foo.toml", Some("toml")), + ("foo.json", Some("json")), + ("foo.sh", Some("shell")), + ("foo.bash", Some("shell")), + ("foo.zsh", Some("shell")), + ("foo.mk", Some("make")), + ]; + for (path, expected) in cases { + assert_eq!( + code_lang_for_path(Path::new(path)), + expected, + "path = {path}" + ); + } +} + +#[test] +fn special_filenames_map_to_identifiers() { + assert_eq!(code_lang_for_path(Path::new("Dockerfile")), Some("dockerfile")); + assert_eq!(code_lang_for_path(Path::new("foo.dockerfile")), Some("dockerfile")); + assert_eq!(code_lang_for_path(Path::new("Makefile")), Some("make")); +} + +#[test] +fn unknown_extension_returns_none() { + assert_eq!(code_lang_for_path(Path::new("foo.docx")), None); + assert_eq!(code_lang_for_path(Path::new("foo")), None); + assert_eq!(code_lang_for_path(Path::new("foo.unknown")), None); +} + +#[test] +fn case_insensitive() { + assert_eq!(code_lang_for_path(Path::new("Foo.RS")), Some("rust")); + assert_eq!(code_lang_for_path(Path::new("FOO.YAML")), Some("yaml")); +} diff --git a/crates/kebab-parse-code/tests/repo.rs b/crates/kebab-parse-code/tests/repo.rs new file mode 100644 index 0000000..68365a1 --- /dev/null +++ b/crates/kebab-parse-code/tests/repo.rs @@ -0,0 +1,62 @@ +use kebab_parse_code::repo::detect_repo; +use std::fs; +use std::process::Command; +use tempfile::TempDir; + +fn init_git_repo(root: &std::path::Path) { + let run = |args: &[&str]| { + Command::new("git") + .args(args) + .current_dir(root) + .status() + .expect("git command failed"); + }; + run(&["init", "-q"]); + run(&["config", "user.email", "test@test"]); + run(&["config", "user.name", "test"]); + fs::write(root.join("README.md"), "hi").unwrap(); + run(&["add", "README.md"]); + run(&["commit", "-q", "-m", "init"]); +} + +#[test] +fn detect_repo_returns_none_outside_git() { + let tmp = TempDir::new().unwrap(); + let nested = tmp.path().join("a/b/c.txt"); + fs::create_dir_all(nested.parent().unwrap()).unwrap(); + fs::write(&nested, "x").unwrap(); + assert!(detect_repo(&nested).is_none()); +} + +#[test] +fn detect_repo_walks_up_to_git_dir() { + let tmp = TempDir::new().unwrap(); + let repo_root = tmp.path().join("myrepo"); + fs::create_dir_all(&repo_root).unwrap(); + init_git_repo(&repo_root); + let nested = repo_root.join("src/deep/file.rs"); + fs::create_dir_all(nested.parent().unwrap()).unwrap(); + fs::write(&nested, "x").unwrap(); + + let meta = detect_repo(&nested).expect("should detect repo"); + assert_eq!(meta.name, "myrepo"); + assert!(meta.branch.is_some()); + assert!(meta.commit.is_some()); + assert_eq!(meta.commit.as_ref().unwrap().len(), 40); +} + +#[test] +fn detect_repo_caches_per_path_call_for_repeated_files_in_same_repo() { + let tmp = TempDir::new().unwrap(); + let repo_root = tmp.path().join("myrepo"); + fs::create_dir_all(&repo_root).unwrap(); + init_git_repo(&repo_root); + let f1 = repo_root.join("a.rs"); + let f2 = repo_root.join("b.rs"); + fs::write(&f1, "x").unwrap(); + fs::write(&f2, "x").unwrap(); + let m1 = detect_repo(&f1).unwrap(); + let m2 = detect_repo(&f2).unwrap(); + assert_eq!(m1.name, m2.name); + assert_eq!(m1.commit, m2.commit); +} diff --git a/crates/kebab-parse-code/tests/skip.rs b/crates/kebab-parse-code/tests/skip.rs new file mode 100644 index 0000000..b85dafe --- /dev/null +++ b/crates/kebab-parse-code/tests/skip.rs @@ -0,0 +1,74 @@ +use kebab_parse_code::skip::{BUILTIN_BLACKLIST, is_generated_file, is_oversized}; +use std::fs; +use tempfile::NamedTempFile; + +#[test] +fn generated_header_markers_trigger_skip() { + let cases = [ + "// @generated\nfn foo() {}\n", + "// Code generated by tonic-build. DO NOT EDIT.\nfn x() {}\n", + "/* DO NOT EDIT */\nfn x() {}\n", + "/* do not modify */\nfn x() {}\n", + "// AUTOMATICALLY GENERATED\nfn x() {}\n", + "# auto-generated\ndef x(): pass\n", + "// autogenerated\nfn x() {}\n", + ]; + for content in cases { + let f = NamedTempFile::new().unwrap(); + fs::write(f.path(), content).unwrap(); + assert!(is_generated_file(f.path()).unwrap(), "content: {content:?}"); + } +} + +#[test] +fn normal_code_is_not_flagged_generated() { + let f = NamedTempFile::new().unwrap(); + fs::write(f.path(), "fn main() {\n println!(\"hi\");\n}\n").unwrap(); + assert!(!is_generated_file(f.path()).unwrap()); +} + +#[test] +fn is_generated_returns_false_for_empty_file() { + let f = NamedTempFile::new().unwrap(); + fs::write(f.path(), "").unwrap(); + assert!(!is_generated_file(f.path()).unwrap()); +} + +#[test] +fn oversized_by_bytes_returns_true() { + let f = NamedTempFile::new().unwrap(); + let body: String = "x".repeat(300_000); + fs::write(f.path(), &body).unwrap(); + assert!(is_oversized(f.path(), 262_144, 5_000).unwrap()); +} + +#[test] +fn oversized_by_lines_returns_true() { + let f = NamedTempFile::new().unwrap(); + let body: String = "x\n".repeat(6_000); + fs::write(f.path(), &body).unwrap(); + assert!(is_oversized(f.path(), 262_144, 5_000).unwrap()); +} + +#[test] +fn small_file_returns_false_for_oversize() { + let f = NamedTempFile::new().unwrap(); + fs::write(f.path(), "fn foo() {}\n").unwrap(); + assert!(!is_oversized(f.path(), 262_144, 5_000).unwrap()); +} + +#[test] +fn builtin_blacklist_has_exactly_six_entries() { + assert_eq!(BUILTIN_BLACKLIST.len(), 6); + let expected = [ + "**/node_modules/**", + "**/target/**", + "**/__pycache__/**", + "**/.venv/**", + "**/venv/**", + "**/env/**", + ]; + for pat in expected { + assert!(BUILTIN_BLACKLIST.contains(&pat), "missing pattern: {pat}"); + } +}