feat(dogfood): route .mts/.cts to typescript + .mdx to markdown

Dogfooding (PR #142 1B + multi-root: kebab-docs + httpx + zod + lodash)
showed 28 files that were skipped by extension even though existing
extractors can already handle them:
- .mts (ESM TypeScript) / .cts (CommonJS TypeScript) — same grammar as
  .ts in tree-sitter-typescript 0.23 (LANGUAGE_TYPESCRIPT covers every
  non-JSX variant; LANGUAGE_TSX stays for .tsx only)
- .mdx (Markdown + JSX) — routed as MediaType::Markdown; the md parser
  folds JSX islands through as raw passthrough

Changes:
- crates/kebab-source-fs/src/media.rs: 'mts'|'cts' → Code(typescript),
  'mdx' → Markdown. +2 unit tests.
- crates/kebab-parse-code/src/lang.rs: code_lang_for_path matches mts/cts;
  module_path_for_tsjs strips .mts/.cts as well. Test cases extended.
- crates/kebab-parse-code/src/typescript.rs: doc comment on select_grammar
  refreshed to mention .mts/.cts.
- crates/kebab-parse-code/tests/lang.rs: 2 new assertions.

Verified: kebab-source-fs (44 tests), kebab-parse-code lib (20 tests) + lang (4 tests) all pass; clippy is clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-20 06:24:21 +00:00
parent 5a90940f1c
commit 5497c6e7b5
4 changed files with 27 additions and 7 deletions

View File

@@ -24,7 +24,7 @@ pub fn code_lang_for_path(path: &Path) -> Option<&'static str> {
match ext.as_str() {
"rs" => Some("rust"),
"py" | "pyi" => Some("python"),
"ts" | "tsx" => Some("typescript"),
"ts" | "tsx" | "mts" | "cts" => Some("typescript"),
"js" | "mjs" | "cjs" | "jsx" => Some("javascript"),
"go" => Some("go"),
"java" => Some("java"),
@@ -82,7 +82,7 @@ pub fn module_path_for_python(workspace_path: &str) -> String {
/// (no slash replacement, no source-root strip). See plan §Task C.
pub fn module_path_for_tsjs(workspace_path: &str) -> String {
let p = workspace_path;
for ext in [".tsx", ".ts", ".jsx", ".mjs", ".cjs", ".js"] {
for ext in [".tsx", ".mts", ".cts", ".ts", ".jsx", ".mjs", ".cjs", ".js"] {
if let Some(stripped) = p.strip_suffix(ext) {
return stripped.to_string();
}
@@ -110,7 +110,7 @@ mod tests {
#[test]
fn module_path_for_tsjs_keeps_slashes_and_strips_ext() {
for ext in ["ts", "tsx", "js", "jsx", "mjs", "cjs"] {
for ext in ["ts", "tsx", "mts", "cts", "js", "jsx", "mjs", "cjs"] {
let p = format!("src/search/retriever/Retriever.{ext}");
assert_eq!(module_path_for_tsjs(&p), "src/search/retriever/Retriever");
}

View File

@@ -173,8 +173,9 @@ impl Extractor for TypescriptAstExtractor {
}
/// Select the tree-sitter grammar based on the workspace path's
/// extension. `.tsx` → TSX grammar; everything else (`.ts`, `.d.ts`,
/// missing extension) → TypeScript grammar.
/// extension. `.tsx` → TSX grammar; everything else (`.ts`, `.mts`,
/// `.cts`, `.d.ts`, missing extension) → TypeScript grammar (the JSX-
/// agnostic variants all share one grammar in tree-sitter-typescript 0.23).
fn select_grammar(workspace_path: &str) -> tree_sitter::Language {
if workspace_path.ends_with(".tsx") {
tree_sitter_typescript::LANGUAGE_TSX.into()

View File

@@ -9,6 +9,8 @@ fn known_extensions_map_to_canonical_identifiers() {
("foo.pyi", Some("python")),
("foo.ts", Some("typescript")),
("foo.tsx", Some("typescript")),
("foo.mts", Some("typescript")), // ESM TS — same grammar
("foo.cts", Some("typescript")), // CommonJS TS — same grammar
("foo.js", Some("javascript")),
("foo.mjs", Some("javascript")),
("foo.cjs", Some("javascript")),

View File

@@ -19,7 +19,9 @@ pub(crate) fn media_type_for(path: &Path) -> MediaType {
.unwrap_or_default();
match ext.as_str() {
"md" => MediaType::Markdown,
// Markdown + MDX (markdown + JSX, treated as plain markdown — the
// JSX islands are folded into raw passthrough by the md parser).
"md" | "mdx" => MediaType::Markdown,
"pdf" => MediaType::Pdf,
"png" => MediaType::Image(ImageType::Png),
@@ -40,7 +42,8 @@ pub(crate) fn media_type_for(path: &Path) -> MediaType {
// p10-1B: Python / TS / JS AST chunkers active.
"py" | "pyi" => MediaType::Code("python".into()),
"ts" | "tsx" => MediaType::Code("typescript".into()),
// .mts / .cts are TypeScript ESM / CommonJS variants — same grammar.
"ts" | "tsx" | "mts" | "cts" => MediaType::Code("typescript".into()),
"js" | "mjs" | "cjs" | "jsx" => MediaType::Code("javascript".into()),
// Empty string (no extension) and any other extension: bucket as
@@ -102,6 +105,20 @@ mod tests {
assert_eq!(media_type_for(Path::new("a/b.rs")), MediaType::Code("rust".into()));
}
#[test]
fn ts_variants_mts_cts() {
    // Both the ESM (`.mts`) and CommonJS (`.cts`) TypeScript extensions
    // must be classified as TypeScript code.
    let expected = MediaType::Code("typescript".into());
    for file in ["a/b.mts", "a/b.cts"] {
        assert_eq!(media_type_for(Path::new(file)), expected);
    }
}
#[test]
fn mdx_routes_to_markdown() {
    // An `.mdx` path (markdown with JSX islands) is expected to be
    // classified as plain Markdown rather than as an unknown type.
    let mdx_path = Path::new("docs/page.mdx");
    let detected = media_type_for(mdx_path);
    assert_eq!(detected, MediaType::Markdown);
}
#[test]
fn unknown_and_missing_extension() {
assert_eq!(