From de63f161ac3187641e51f3150f10d372ca1a3316 Mon Sep 17 00:00:00 2001 From: altair823 Date: Wed, 20 May 2026 00:54:27 +0000 Subject: [PATCH] feat(p10-1b): tree-sitter-typescript AST extractor (TS + TSX via grammar selection) Adds `kebab_parse_code::typescript::TypescriptAstExtractor` (PARSER_VERSION `code-typescript-v1`), mirroring the Python extractor (P10-1B Task E) and the Rust scaffold (P10-1A-2). One `Block::Code` per top-level AST semantic unit (free fn / class / each method / interface / type alias / enum, recursively per nested class), each carrying `SourceSpan::Code` with the unit's dotted symbol path prefixed by `module_path_for_tsjs`. Grammar selection per `tree-sitter-typescript` 0.23: the workspace path's `.tsx` extension routes to `LANGUAGE_TSX`, everything else to `LANGUAGE_TYPESCRIPT`. The `export_statement` arm unwraps a `declaration` field (`function_declaration` / `class_declaration` / `interface_declaration` / `type_alias_declaration` / `enum_declaration`) using the OUTER statement's line range so `export ` is folded in; for `export default function () {}` and `export default class {}` (where the inner node sits under the `value` field as `function_expression` / `class` with no `name`), the symbol leaf is `default`. Bare value exports / re-exports fall into glue. Glue grouping reuses the Python post-pass: `` only when the entire group is imports + bare re-exports; demoted to `` if the file produced any real unit. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-parse-code/src/lib.rs | 2 + crates/kebab-parse-code/src/typescript.rs | 601 ++++++++++++++++++ .../kebab-parse-code/tests/fixtures/sample.ts | 11 + .../tests/fixtures/sample.tsx | 4 + 4 files changed, 618 insertions(+) create mode 100644 crates/kebab-parse-code/src/typescript.rs create mode 100644 crates/kebab-parse-code/tests/fixtures/sample.ts create mode 100644 crates/kebab-parse-code/tests/fixtures/sample.tsx diff --git a/crates/kebab-parse-code/src/lib.rs b/crates/kebab-parse-code/src/lib.rs index eabcab0..921795a 100644 --- a/crates/kebab-parse-code/src/lib.rs +++ b/crates/kebab-parse-code/src/lib.rs @@ -18,9 +18,11 @@ pub mod python; pub mod repo; pub mod rust; pub mod skip; +pub mod typescript; pub use lang::{code_lang_for_path, module_path_for_python, module_path_for_tsjs}; pub use python::{PARSER_VERSION as PYTHON_PARSER_VERSION, PythonAstExtractor}; pub use repo::{RepoMeta, detect_repo}; pub use rust::{PARSER_VERSION as RUST_PARSER_VERSION, RustAstExtractor}; pub use skip::{BUILTIN_BLACKLIST, is_generated_file, is_oversized}; +pub use typescript::{PARSER_VERSION as TS_PARSER_VERSION, TypescriptAstExtractor}; diff --git a/crates/kebab-parse-code/src/typescript.rs b/crates/kebab-parse-code/src/typescript.rs new file mode 100644 index 0000000..d261ba1 --- /dev/null +++ b/crates/kebab-parse-code/src/typescript.rs @@ -0,0 +1,601 @@ +//! `kebab-parse-code::typescript` — tree-sitter TypeScript / TSX AST +//! extractor (P10-1B Task H). +//! +//! Implements [`kebab_core::Extractor`] for [`MediaType::Code("typescript")`]. +//! Walks the tree-sitter parse tree (one of two grammars selected by the +//! workspace path's extension — `.tsx` uses [`tree_sitter_typescript::LANGUAGE_TSX`], +//! everything else uses [`tree_sitter_typescript::LANGUAGE_TYPESCRIPT`]) and +//! emits one [`Block::Code`] per top-level AST semantic unit (free fn, +//! class, each method, interface, type alias, enum, recursively per +//! nested class), each carrying [`SourceSpan::Code`] with the unit's +//! dotted symbol path prefixed by [`module_path_for_tsjs`]. +//! +//! Glue declarations (`import_statement`, bare `export_statement` +//! re-exports, `lexical_declaration` / `variable_declaration` at the +//! module level, namespace / module declarations, etc.) collapse into +//! one grouped `` (or ``) unit. +//! +//! `export_statement` is unwrapped: an `export function|class|interface +//! |type|enum` is treated as the inner declaration arm but the unit's +//! line range comes from the OUTER `export_statement` so the `export ` +//! prefix is folded in. `export default function () {}` / `export +//! default class {}` (no `name` field) emits `default` as the symbol +//! name. +//! +//! Scope follows 1A-2 / 1B Task E: AST unit extraction + dotted symbol +//! paths + line ranges. Per design §3.4 / §9.1 / §9 versioning. + +use anyhow::Result; +use kebab_core::{ + Block, CanonicalDocument, CodeBlock, CommonBlock, Extractor, Lang, MediaType, Metadata, + ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType, TrustLevel, + id_for_block, id_for_doc, +}; +use serde_json::Map; +use time::OffsetDateTime; + +pub const PARSER_VERSION: &str = "code-typescript-v1"; + +/// TypeScript / TSX AST extractor. Per-unit blocks via +/// tree-sitter-typescript 0.23 (`LANGUAGE_TYPESCRIPT` / `LANGUAGE_TSX` +/// — two `LanguageFn`s, selected by extension) parsed by tree-sitter +/// 0.26. +pub struct TypescriptAstExtractor; + +impl TypescriptAstExtractor { + pub fn new() -> Self { + Self + } +} + +impl Default for TypescriptAstExtractor { + fn default() -> Self { + Self::new() + } +} + +impl Extractor for TypescriptAstExtractor { + fn supports(&self, m: &MediaType) -> bool { + matches!(m, MediaType::Code(l) if l == "typescript") + } + + fn parser_version(&self) -> ParserVersion { + ParserVersion(PARSER_VERSION.to_string()) + } + + fn extract( + &self, + ctx: &kebab_core::ExtractContext<'_>, + bytes: &[u8], + ) -> Result { + let asset = ctx.asset; + if !self.supports(&asset.media_type) { + anyhow::bail!( + "kebab-parse-code: unsupported media_type for TypescriptAstExtractor: {:?}", + asset.media_type + ); + } + + let parser_version = self.parser_version(); + let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, &parser_version); + + let source = String::from_utf8(bytes.to_vec()).map_err(|e| { + anyhow::anyhow!("kebab-parse-code: TypeScript source is not valid UTF-8: {e}") + })?; + + let mod_prefix = crate::lang::module_path_for_tsjs(&asset.workspace_path.0); + let language = select_grammar(&asset.workspace_path.0); + let blocks = build_blocks(&source, &doc_id, &mod_prefix, language)?; + let unit_count = blocks.len() as u32; + + let now = OffsetDateTime::now_utc(); + let mut events: Vec = Vec::with_capacity(2); + events.push(ProvenanceEvent { + at: asset.discovered_at, + agent: "kb-source-fs".to_string(), + kind: ProvenanceKind::Discovered, + note: None, + }); + events.push(ProvenanceEvent { + at: now, + agent: "kb-parse-code".to_string(), + kind: ProvenanceKind::Parsed, + note: Some(format!( + "parser_version={}; unit_count={}", + parser_version.0, unit_count + )), + }); + + let title = { + let fname = filename_from_workspace_path(&asset.workspace_path.0); + strip_extension(&fname) + }; + + // Resolve the file's absolute path for repo detection. If the + // source URI carries a relative path, anchor it at the workspace + // root so the `.git/` walk-up starts from the right place. + let abs_path = match &asset.source_uri { + kebab_core::SourceUri::File(p) => { + if p.is_absolute() { + p.clone() + } else { + ctx.workspace_root.join(p) + } + } + kebab_core::SourceUri::Kb(_) => ctx.workspace_root.to_path_buf(), + }; + let (repo, git_branch, git_commit) = match crate::repo::detect_repo(&abs_path) { + Some(r) => (Some(r.name), r.branch, r.commit), + None => (None, None, None), + }; + + let metadata = Metadata { + aliases: Vec::new(), + tags: Vec::new(), + created_at: asset.discovered_at, + updated_at: asset.discovered_at, + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Map::new(), + repo, + git_branch, + git_commit, + code_lang: Some("typescript".to_string()), + }; + + tracing::debug!( + target: "kebab-parse-code", + "extracted TypeScript doc_id={} workspace_path={} units={}", + doc_id.0, + asset.workspace_path.0, + unit_count + ); + + Ok(CanonicalDocument { + doc_id, + source_asset_id: asset.asset_id.clone(), + workspace_path: asset.workspace_path.clone(), + title, + lang: Lang("und".to_string()), + blocks, + metadata, + provenance: Provenance { events }, + parser_version, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + }) + } +} + +/// Select the tree-sitter grammar based on the workspace path's +/// extension. `.tsx` → TSX grammar; everything else (`.ts`, `.d.ts`, +/// missing extension) → TypeScript grammar. +fn select_grammar(workspace_path: &str) -> tree_sitter::Language { + if workspace_path.ends_with(".tsx") { + tree_sitter_typescript::LANGUAGE_TSX.into() + } else { + tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into() + } +} + +fn filename_from_workspace_path(p: &str) -> String { + p.rsplit('/').next().unwrap_or(p).to_string() +} + +fn strip_extension(filename: &str) -> String { + match filename.rfind('.') { + Some(0) => filename.to_string(), + Some(idx) => filename[..idx].to_string(), + None => filename.to_string(), + } +} + +/// Join (mod_prefix, mod_path, name) into a dotted TS symbol. +/// +/// Note: TS uses `.` as the join separator between mod_prefix / +/// class-nesting / leaf — even though `mod_prefix` itself may contain +/// `/` (e.g. `src/search/Retriever`), the JOIN between segments stays +/// `.`. So a class method symbol looks like `src/search/Foo.search`. +fn join_symbol(mod_prefix: &str, mod_path: &[String], name: &str) -> String { + let mut parts: Vec<&str> = Vec::with_capacity(mod_path.len() + 2); + if !mod_prefix.is_empty() { + parts.push(mod_prefix); + } + for p in mod_path { + parts.push(p.as_str()); + } + parts.push(name); + parts.join(".") +} + +fn build_blocks( + source: &str, + doc_id: &kebab_core::DocumentId, + mod_prefix: &str, + language: tree_sitter::Language, +) -> anyhow::Result> { + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&language) + .map_err(|e| anyhow::anyhow!("set tree-sitter-typescript language: {e}"))?; + let tree = parser + .parse(source.as_bytes(), None) + .ok_or_else(|| anyhow::anyhow!("tree-sitter failed to parse TypeScript source"))?; + let lines: Vec<&str> = source.split('\n').collect(); + + // units: (symbol, line_start, line_end, is_real_semantic_unit). + // Glue groups are pushed with a sentinel symbol + is_real=false so a + // post-pass can decide `` vs `` (same algorithm + // as 1A Gap 1 / 1B Python). + let mut units: Vec<(String, u32, u32, bool)> = Vec::new(); + // (is_module_only_kind 0/1, s, e). `is_module_only_kind` flags + // `import_statement` and bare re-export `export_statement`s — used by + // the glue flush to pick `` vs `` provisional + // label (1A's `is_mod_decl` analog). + let mut glue: Vec<(usize, u32, u32)> = Vec::new(); + + /// Walk preceding `comment` siblings to extend the unit's line range + /// upward, folding leading doc / line comments into the unit. TS + /// class / method decorators live INSIDE the parent declaration (as + /// children, surfaced via the `decorator` field) — for 1B 1차 we do + /// not specially unwrap them; this matches the plan §Task H note. + fn unit_start(n: &tree_sitter::Node) -> u32 { + let mut start = n.start_position().row as u32 + 1; + let mut prev = n.prev_sibling(); + while let Some(p) = prev { + if p.kind() == "comment" { + start = p.start_position().row as u32 + 1; + prev = p.prev_sibling(); + } else { + break; + } + } + start + } + fn name_text<'a>(n: &tree_sitter::Node, src: &'a str) -> Option<&'a str> { + n.child_by_field_name("name") + .map(|c| &src[c.start_byte()..c.end_byte()]) + } + /// Walk a class body, emitting one unit per `method_definition`. + /// Class names already pushed onto `mod_path` by the caller, so + /// method symbols come out as `..`. + fn walk_class_body( + body: tree_sitter::Node, + src: &str, + mod_prefix: &str, + mod_path: &[String], + units: &mut Vec<(String, u32, u32, bool)>, + ) { + let mut cur = body.walk(); + for child in body.named_children(&mut cur) { + if child.kind() == "method_definition" { + if let Some(name) = name_text(&child, src) { + let s = unit_start(&child); + let e = child.end_position().row as u32 + 1; + let sym = join_symbol(mod_prefix, mod_path, name); + units.push((sym, s, e, true)); + } + } + } + } + fn walk( + node: tree_sitter::Node, + src: &str, + mod_prefix: &str, + mod_path: &[String], + units: &mut Vec<(String, u32, u32, bool)>, + glue: &mut Vec<(usize, u32, u32)>, + ) { + let mut cur = node.walk(); + for child in node.named_children(&mut cur) { + let s = unit_start(&child); + let e = child.end_position().row as u32 + 1; + match child.kind() { + "function_declaration" => { + if let Some(name) = name_text(&child, src) { + glue.retain(|(_, gs, _)| *gs < s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = join_symbol(mod_prefix, mod_path, name); + units.push((sym, s, e, true)); + } + } + "class_declaration" => { + if let Some(name) = name_text(&child, src) { + glue.retain(|(_, gs, _)| *gs < s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = join_symbol(mod_prefix, mod_path, name); + units.push((sym, s, e, true)); + if let Some(body) = child.child_by_field_name("body") { + let mut np = mod_path.to_vec(); + np.push(name.to_string()); + walk_class_body(body, src, mod_prefix, &np, units); + } + } + } + "interface_declaration" + | "type_alias_declaration" + | "enum_declaration" => { + if let Some(name) = name_text(&child, src) { + glue.retain(|(_, gs, _)| *gs < s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = join_symbol(mod_prefix, mod_path, name); + units.push((sym, s, e, true)); + } + } + "export_statement" => { + // Try field "declaration" first (export class / + // function / interface / type / enum). If absent, + // fall back to "value" — `export default function + // () {}` / `export default class {}` expose the + // anonymous function_expression / class under the + // `value` field (TS grammar 0.23). + let outer_s = s; // includes `export ` prefix line + let outer_e = e; + if let Some(inner) = child.child_by_field_name("declaration") { + let inner_kind = inner.kind(); + match inner_kind { + "function_declaration" + | "class_declaration" + | "interface_declaration" + | "type_alias_declaration" + | "enum_declaration" => { + let name_opt = name_text(&inner, src).map(|s| s.to_string()); + if let Some(name) = name_opt { + glue.retain(|(_, gs, _)| *gs < outer_s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = + join_symbol(mod_prefix, mod_path, &name); + units.push((sym, outer_s, outer_e, true)); + if inner_kind == "class_declaration" { + if let Some(body) = + inner.child_by_field_name("body") + { + let mut np = mod_path.to_vec(); + np.push(name); + walk_class_body( + body, src, mod_prefix, &np, units, + ); + } + } + } else { + // `export default function foo() {}` + // path is covered by name_opt = + // Some(_) above; the no-name path + // here is `export default` with a + // function_declaration that + // somehow lacks `name`. Emit + // `default` defensively. + glue.retain(|(_, gs, _)| *gs < outer_s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = + join_symbol(mod_prefix, mod_path, "default"); + units.push((sym, outer_s, outer_e, true)); + } + } + // `lexical_declaration` etc. wrapped in + // export: treat as glue (assigned arrow + // fns / consts don't get their own unit). + _ => { + glue.push((0, s, e)); + } + } + } else if let Some(value) = child.child_by_field_name("value") { + // `export default `. We emit a unit only + // for the function / class shapes (named or + // anonymous); other value shapes are glue. + match value.kind() { + "function_expression" + | "function_declaration" + | "class" + | "class_declaration" => { + let name_opt = + name_text(&value, src).map(|s| s.to_string()); + let leaf = name_opt + .as_deref() + .unwrap_or("default") + .to_string(); + glue.retain(|(_, gs, _)| *gs < outer_s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = join_symbol(mod_prefix, mod_path, &leaf); + units.push((sym, outer_s, outer_e, true)); + // Recurse into class body if we have one. + if matches!( + value.kind(), + "class" | "class_declaration" + ) { + if let Some(body) = + value.child_by_field_name("body") + { + let mut np = mod_path.to_vec(); + np.push(leaf); + walk_class_body( + body, src, mod_prefix, &np, units, + ); + } + } + } + _ => { + glue.push((0, s, e)); + } + } + } else { + // Bare `export { x };` / `export * from "..."` — + // a re-export, glue with module-only flag set + // (we have no `declaration` / `value` field for + // it). + glue.push((1, s, e)); + } + } + "import_statement" => { + glue.push((1, s, e)); + } + "lexical_declaration" | "variable_declaration" => { + glue.push((0, s, e)); + } + // Namespace / module declarations (rare in app code, + // common in `.d.ts`): treat as glue per plan §Task H + // (1B 1차 scope; documented under spec Risks). + "internal_module" | "module" | "ambient_declaration" => { + glue.push((0, s, e)); + } + _ => {} + } + } + flush_glue(glue, units, mod_prefix, mod_path); + } + fn flush_glue( + glue: &mut Vec<(usize, u32, u32)>, + units: &mut Vec<(String, u32, u32, bool)>, + mod_prefix: &str, + mod_path: &[String], + ) { + if glue.is_empty() { + return; + } + let s = glue.iter().map(|(_, a, _)| *a).min().unwrap(); + let e = glue.iter().map(|(_, _, b)| *b).max().unwrap(); + let only_module = glue.iter().all(|(is_mod, _, _)| *is_mod == 1); + let label = if only_module { "" } else { "" }; + units.push((join_symbol(mod_prefix, mod_path, label), s, e, false)); + glue.clear(); + } + + walk( + tree.root_node(), + source, + mod_prefix, + &[], + &mut units, + &mut glue, + ); + + // `` is correct only when the file produced no real unit. + // Otherwise the import-only group becomes `` (same + // post-pass as 1A Gap 1 / Python). + let has_real_unit = units.iter().any(|(_, _, _, is_real)| *is_real); + if has_real_unit { + for (sym, _, _, is_real) in units.iter_mut() { + if !*is_real && sym.ends_with("") { + let pre = &sym[..sym.len() - "".len()]; + *sym = format!("{pre}"); + } + } + } + + let total_lines = lines.len() as u32; + let mut blocks = Vec::with_capacity(units.len()); + for (ordinal, (symbol, ls, le, _is_real)) in units.into_iter().enumerate() { + let line_start = ls.max(1); + let line_end = le.min(total_lines.max(1)); + let span = SourceSpan::Code { + line_start, + line_end, + symbol: Some(symbol), + lang: Some("typescript".to_string()), + }; + let block_id = id_for_block(doc_id, "code", &[], ordinal as u32, &span); + let code = lines[(line_start as usize - 1)..=(line_end as usize - 1)].join("\n"); + blocks.push(Block::Code(CodeBlock { + common: CommonBlock { + block_id, + heading_path: Vec::new(), + source_span: span, + }, + lang: Some("typescript".to_string()), + code, + })); + } + Ok(blocks) +} + +#[cfg(test)] +mod tests { + use super::*; + use kebab_core::{Block, MediaType, SourceSpan}; + + fn extract_fixture(name: &str, workspace_path: &str) -> kebab_core::CanonicalDocument { + let bytes = std::fs::read(format!( + concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/{}"), + name + )) + .unwrap(); + let asset = crate::rust::tests_support::fixed_code_asset(workspace_path, "typescript"); + let cfg = kebab_core::ExtractConfig::default(); + let root = std::path::PathBuf::from("/tmp"); + let ctx = kebab_core::ExtractContext { + asset: &asset, + workspace_root: &root, + config: &cfg, + }; + TypescriptAstExtractor::new() + .extract(&ctx, &bytes) + .unwrap() + } + + fn symbols(doc: &kebab_core::CanonicalDocument) -> Vec { + let mut s: Vec = doc + .blocks + .iter() + .filter_map(|b| match b { + Block::Code(c) => match &c.common.source_span { + SourceSpan::Code { symbol, lang, .. } => { + assert_eq!(lang.as_deref(), Some("typescript")); + symbol.clone() + } + _ => None, + }, + _ => None, + }) + .collect(); + s.sort(); + s + } + + #[test] + fn extractor_supports_only_media_code_typescript() { + let e = TypescriptAstExtractor::new(); + assert!(e.supports(&MediaType::Code("typescript".into()))); + assert!(!e.supports(&MediaType::Code("rust".into()))); + assert!(!e.supports(&MediaType::Markdown)); + } + + #[test] + fn ts_units_match_design_3_4_symbols() { + // workspace_path `src/sample.ts` → mod_prefix `src/sample` + let doc = extract_fixture("sample.ts", "src/sample.ts"); + let syms = symbols(&doc); + assert!(syms.iter().any(|s| s == "src/sample.add"), "got {syms:?}"); + assert!(syms.iter().any(|s| s == "src/sample.Greet")); + assert!(syms.iter().any(|s| s == "src/sample.Maybe")); + assert!(syms.iter().any(|s| s == "src/sample.Retriever")); + assert!(syms.iter().any(|s| s == "src/sample.Retriever.search")); + assert!(syms.iter().any(|s| s == "src/sample.Retriever.create")); + assert!(syms.iter().any(|s| s == "src/sample.default")); + assert!(syms.iter().any(|s| s == "src/sample.")); + } + + #[test] + fn tsx_uses_tsx_grammar_and_emits_units() { + let doc = extract_fixture("sample.tsx", "src/sample.tsx"); + let syms = symbols(&doc); + assert!( + syms.iter().any(|s| s == "src/sample.Hello"), + "got {syms:?}" + ); + assert!( + syms.iter().any(|s| s == "src/sample."), + "arrow fn + import should roll into top-level glue" + ); + } + + #[test] + fn deterministic_across_runs() { + let a = extract_fixture("sample.ts", "src/sample.ts"); + for _ in 0..30 { + assert_eq!(extract_fixture("sample.ts", "src/sample.ts").blocks, a.blocks); + } + } +} diff --git a/crates/kebab-parse-code/tests/fixtures/sample.ts b/crates/kebab-parse-code/tests/fixtures/sample.ts new file mode 100644 index 0000000..f390b89 --- /dev/null +++ b/crates/kebab-parse-code/tests/fixtures/sample.ts @@ -0,0 +1,11 @@ +// sample.ts +import { x } from "./other"; +const ANSWER = 42; +export interface Greet { hello(): string; } +export type Maybe = T | null; +export function add(a: number, b: number): number { return a + b; } +export class Retriever { + search(q: string): string[] { return []; } + static create(): Retriever { return new Retriever(); } +} +export default function () { return 1; } diff --git a/crates/kebab-parse-code/tests/fixtures/sample.tsx b/crates/kebab-parse-code/tests/fixtures/sample.tsx new file mode 100644 index 0000000..0fd1a5e --- /dev/null +++ b/crates/kebab-parse-code/tests/fixtures/sample.tsx @@ -0,0 +1,4 @@ +// sample.tsx +import React from "react"; +export function Hello({ name }: { name: string }) { return {name}; } +export const App = () => ; // arrow fn assigned → glue