diff --git a/Cargo.lock b/Cargo.lock index 9d34e2b..7fc5b74 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4127,7 +4127,7 @@ dependencies = [ [[package]] name = "kebab-app" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "base64 0.22.1", @@ -4172,7 +4172,7 @@ dependencies = [ [[package]] name = "kebab-chunk" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "blake3", @@ -4187,7 +4187,7 @@ dependencies = [ [[package]] name = "kebab-cli" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "clap", @@ -4208,7 +4208,7 @@ dependencies = [ [[package]] name = "kebab-config" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "dirs 5.0.1", @@ -4223,7 +4223,7 @@ dependencies = [ [[package]] name = "kebab-core" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "blake3", @@ -4237,7 +4237,7 @@ dependencies = [ [[package]] name = "kebab-embed" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "blake3", @@ -4251,7 +4251,7 @@ dependencies = [ [[package]] name = "kebab-embed-local" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "fastembed", @@ -4264,7 +4264,7 @@ dependencies = [ [[package]] name = "kebab-eval" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "kebab-app", @@ -4283,7 +4283,7 @@ dependencies = [ [[package]] name = "kebab-llm" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "kebab-core", @@ -4292,7 +4292,7 @@ dependencies = [ [[package]] name = "kebab-llm-local" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "kebab-config", @@ -4309,7 +4309,7 @@ dependencies = [ [[package]] name = "kebab-mcp" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "kebab-app", @@ -4327,7 +4327,7 @@ dependencies = [ [[package]] name = "kebab-normalize" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "kebab-core", @@ -4342,7 +4342,7 @@ dependencies = [ [[package]] name = "kebab-parse-code" -version = "0.7.0" 
+version = "0.8.0" dependencies = [ "anyhow", "gix", @@ -4352,12 +4352,15 @@ dependencies = [ "time", "tracing", "tree-sitter", + "tree-sitter-javascript", + "tree-sitter-python", "tree-sitter-rust", + "tree-sitter-typescript", ] [[package]] name = "kebab-parse-image" -version = "0.7.0" +version = "0.8.0" dependencies = [ "ab_glyph", "anyhow", @@ -4381,7 +4384,7 @@ dependencies = [ [[package]] name = "kebab-parse-md" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "kebab-core", @@ -4398,7 +4401,7 @@ dependencies = [ [[package]] name = "kebab-parse-pdf" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "blake3", @@ -4411,7 +4414,7 @@ dependencies = [ [[package]] name = "kebab-parse-types" -version = "0.7.0" +version = "0.8.0" dependencies = [ "kebab-core", "serde", @@ -4419,7 +4422,7 @@ dependencies = [ [[package]] name = "kebab-rag" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "blake3", @@ -4440,7 +4443,7 @@ dependencies = [ [[package]] name = "kebab-search" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "globset", @@ -4459,7 +4462,7 @@ dependencies = [ [[package]] name = "kebab-source-fs" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "blake3", @@ -4477,7 +4480,7 @@ dependencies = [ [[package]] name = "kebab-store-sqlite" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "blake3", @@ -4498,7 +4501,7 @@ dependencies = [ [[package]] name = "kebab-store-vector" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "arrow", @@ -4522,7 +4525,7 @@ dependencies = [ [[package]] name = "kebab-tui" -version = "0.7.0" +version = "0.8.0" dependencies = [ "anyhow", "crossterm", @@ -8523,12 +8526,32 @@ dependencies = [ "tree-sitter-language", ] +[[package]] +name = "tree-sitter-javascript" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68204f2abc0627a90bdf06e605f5c470aa26fdcb2081ea553a04bdad756693f5" +dependencies 
= [ + "cc", + "tree-sitter-language", +] + [[package]] name = "tree-sitter-language" version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "009994f150cc0cd50ff54917d5bc8bffe8cad10ca10d81c34da2ec421ae61782" +[[package]] +name = "tree-sitter-python" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bf85fd39652e740bf60f46f4cda9492c3a9ad75880575bf14960f775cb74a1c" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "tree-sitter-rust" version = "0.24.2" @@ -8539,6 +8562,16 @@ dependencies = [ "tree-sitter-language", ] +[[package]] +name = "tree-sitter-typescript" +version = "0.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c5f76ed8d947a75cc446d5fccd8b602ebf0cde64ccf2ffa434d873d7a575eff" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "try-lock" version = "0.2.5" diff --git a/Cargo.toml b/Cargo.toml index a20dfcb..feab7f5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,7 @@ edition = "2024" rust-version = "1.85" license = "MIT OR Apache-2.0" repository = "https://github.com/altair823/kebab" -version = "0.7.0" +version = "0.8.0" [workspace.dependencies] anyhow = "1" @@ -90,6 +90,10 @@ gix = { version = "0.70", default-features = false, features = ["revisi # chunker stays tree-sitter-free — AST work is parser-side per design §6.3. tree-sitter = "0.26" tree-sitter-rust = "0.24" +# Python / TS / JS grammars for code ingest (kebab-parse-code, p10-1B). +tree-sitter-python = "0.25.0" +tree-sitter-typescript = "0.23.2" +tree-sitter-javascript = "0.25.0" # Disk-footprint trim for dev / test builds. Codegen, opt-level, and # behavior are unchanged — only DWARF debug info is reduced (line diff --git a/HANDOFF.md b/HANDOFF.md index dfc17ed..b1080ad 100644 --- a/HANDOFF.md +++ b/HANDOFF.md @@ -20,7 +20,7 @@ P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) 머지 완료. 
| **P7** | PDF text + page citation | `kebab-parse-pdf` | P5 | ✅ 완료 (3/3 component, page-level chunker + ingest wiring) | | **P8** | 음성 transcription + timestamp citation | `kebab-parse-audio` | P5 | ⏸ 보류 (whisper-rs 시스템 dep brainstorm 필요) | | **P9** | TUI + desktop app | `kebab-tui`, `kebab-desktop` | P5 | 🟡 진행 (4/5 component — P9-1/2/3/4 완료 [Library / Search / Ask / Inspect], P9-5 desktop 예정 · 도그푸딩 피드백 **20/20 ✅**) | -| **P10** | code ingest framework | `kebab-parse-code` | P5 | 🟡 진행 중 — 1A-1 ✅ (wire schema + parse-code skeleton + filter flags), 1A-2 ✅ (Rust AST chunker, tree-sitter-rust, `code-rust-ast-v1` — kebab 자기 dogfooding 가능, v0.7.0) | +| **P10** | code ingest framework | `kebab-parse-code` | P5 | 🟡 진행 중 — 1A-1 ✅ (wire schema + parse-code skeleton + filter flags), 1A-2 ✅ (Rust AST chunker, tree-sitter-rust, `code-rust-ast-v1` — v0.7.0), **1B 🟡 PR 오픈** (Python `code-python-ast-v1` + TypeScript `code-ts-ast-v1` + JavaScript `code-js-ast-v1` — 3 언어 dogfooding 가능, v0.8.0 대기) | P0~P5 직렬. P6~P9 P5 이후 병렬 가능. @@ -32,6 +32,7 @@ P0~P5 직렬. P6~P9 P5 이후 병렬 가능. 머지 후 발견된 모든 deviation / hotfix 의 dated 로그는 [tasks/HOTFIXES.md](tasks/HOTFIXES.md). 본 요약은 \"누군가가 인수받을 때 알아두면 시간을 많이 절약하는\" 항목만: +- **2026-05-20 P10-1B (Rust 1A symbol path 비일관 + expression-level 함수 미방출)** — (a) Rust `code-rust-ast-v1` 은 file-scope nesting 만 (workspace path prefix 없음), 1B 의 Python/TypeScript/JavaScript 는 workspace 경로 → module path prefix 사용 (비일관 수용, retrofit = chunker_version bump + reindex 필요, 사용자 명시 요청까지 보류); (b) TS/JS 의 `const foo = () => {...}` 같은 expression-level 함수는 `` glue 로 처리됨 (declaration-level 단위만 1B 1차 범위). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-20) 두 항목. - **2026-05-19 P10-1A-2 (code_rust_ast_v1.rs + SourceType)** — `AST_CHUNK_MAX_LINES` 상수가 `IngestCodeCfg.ast_chunk_max_lines` 를 읽지 않고 모듈 상수 200 고정 (Chunker trait 이 per-medium config 미노출); `SourceType::Code` variant 부재로 code 파일이 `SourceType::Note` 로 분류됨 — 두 항목 모두 `tasks/HOTFIXES.md` (2026-05-19) 에 기록. 
- **2026-05-07 fb-26 (progress.rs)** — `Aborted` unconditional writeln (TTY duplicate) + `Completed` TTY no summary fixed; `KEBAB_PROGRESS=plain` env + quiet suppression added - **2026-05-07 fb-28 (main.rs)** — `--readonly` (KEBAB_READONLY) blocks Ingest/IngestFile/IngestStdin/Reset; `--quiet` suppresses progress stderr; error.v1 code: "readonly_mode" diff --git a/README.md b/README.md index 183b673..7e9ece5 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ cargo install --git https://gitea.altair823.xyz/altair823-org/kebab.git --bin ke # 첫 실행 — XDG 경로에 데이터 디렉토리 + config.toml 생성 kebab init -# config 손보고 — workspace.root, 모델 endpoint 등 설정 (지원 형식: md / png / jpg / pdf / rs) +# config 손보고 — workspace.root, 모델 endpoint 등 설정 (지원 형식: md / png / jpg / pdf / rs / py / ts / js) ${EDITOR:-vi} ~/.config/kebab/config.toml # 색인 (Markdown / 이미지 / PDF 모두 한 번에) @@ -70,7 +70,7 @@ kebab doctor | 명령 | 동작 | |------|------| | `kebab init` | XDG 경로에 데이터 디렉토리 + config.toml 생성 | -| `kebab ingest []` | Markdown / 이미지 / PDF / Rust 소스코드 색인 (idempotent). TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit. Markdown title 이 frontmatter 에 없어도 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (parser_version `md-frontmatter-v2`) — 기존 색인된 doc 도 다음 ingest 에서 새 title 로 갱신. **Incremental** (p9-fb-23): 두 번째 이후의 ingest 는 변하지 않은 doc (blake3 + parser/chunker/embedder version 모두 동일) 의 parse/chunk/embed/vector upsert 를 자동 스킵. final summary 에 `N unchanged` 카운트 표시. `--force-reingest` 로 skip 무시 강제 재처리. **지원 형식** (extractor 자동 결정 — config 에 명시 불가): Markdown (`.md`), 이미지 (`.png` / `.jpg` / `.jpeg`, OCR + caption), PDF (`.pdf`), **Rust 소스코드** (`.rs`, tree-sitter AST chunker `code-rust-ast-v1` — p10-1A-2). 
다른 확장자는 자동 skip — `IngestItem.warnings` 에 사유 (`"unsupported media type: .docx"` 등), `IngestReport.skipped_by_extension` 에 카운트 분류, CLI / TUI summary 에 breakdown 표시. 코드 chunk 는 `citation.kind = "code"` 에 `citation.lang = "rust"` + `symbol` + line range 를 담고, SearchHit top-level 에 `code_lang = "rust"` + `repo` (`.git/` walk-up 의 디렉토리 이름) 가 backfill 됨. `--code-lang rust` / `--media code` filter 로 코드 전용 검색 가능 (p10-1A-1 filter flags). | +| `kebab ingest []` | Markdown / 이미지 / PDF / 소스코드 (Rust / Python / TypeScript / JavaScript) 색인 (idempotent). TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit. Markdown title 이 frontmatter 에 없어도 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (parser_version `md-frontmatter-v2`) — 기존 색인된 doc 도 다음 ingest 에서 새 title 로 갱신. **Incremental** (p9-fb-23): 두 번째 이후의 ingest 는 변하지 않은 doc (blake3 + parser/chunker/embedder version 모두 동일) 의 parse/chunk/embed/vector upsert 를 자동 스킵. final summary 에 `N unchanged` 카운트 표시. `--force-reingest` 로 skip 무시 강제 재처리. **지원 형식** (extractor 자동 결정 — config 에 명시 불가): Markdown (`.md`), 이미지 (`.png` / `.jpg` / `.jpeg`, OCR + caption), PDF (`.pdf`), **소스코드** (`.rs` → `code-rust-ast-v1`, `.py` → `code-python-ast-v1`, `.ts`/`.tsx` → `code-ts-ast-v1`, `.js`/`.mjs`/`.cjs`/`.jsx` → `code-js-ast-v1` — 모두 tree-sitter AST chunker). 다른 확장자는 자동 skip — `IngestItem.warnings` 에 사유 (`"unsupported media type: .docx"` 등), `IngestReport.skipped_by_extension` 에 카운트 분류, CLI / TUI summary 에 breakdown 표시. 코드 chunk 는 `citation.kind = "code"` 에 `citation.lang = "<lang>"` + `symbol` + line range 를 담고, SearchHit top-level 에 `code_lang` + `repo` (`.git/` walk-up 의 디렉토리 이름) 가 backfill 됨. `--code-lang rust` / `--code-lang python` / `--code-lang typescript` / `--code-lang javascript` / `--media code` filter 로 언어별·코드 전용 검색 가능 (p10-1A-1 filter flags). 
Python symbol 은 workspace 경로 → dotted module path prefix (예: `kebab_eval.metrics.compute_mrr`), TS/JS symbol 은 slash-style module path prefix (예: `src/Foo.Foo.search`). | | `kebab search --mode {lexical,vector,hybrid} "" [--no-cache] [--max-tokens N] [--snippet-chars N] [--cursor ] [--tag T] [--lang L] [--path-glob G] [--trust-min LEVEL] [--media TYPE] [--ingested-after RFC3339] [--doc-id ID] [--trace] [--bulk] [--repo NAME ...] [--code-lang LIST]` | 검색. hybrid는 RRF fusion, citation 포함. 같은 process 안에서 동일 query (NFKC + trim + lowercase 정규화) 반복 시 in-process LRU 캐시 hit (capacity = `[search] cache_capacity`, default 256). `--no-cache` 로 강제 bypass — 디버깅용. ingest commit 발생 시 `kv['corpus_revision']` bump 으로 모든 entry 자동 stale. **`--max-tokens` / `--snippet-chars` / `--cursor` (p9-fb-34)** — agent budget controls. `--json` 출력은 `search_response.v1` wrapper (`{hits, next_cursor, truncated}`) — pre-fb-34 의 bare array 와 호환 안 됨. mismatched cursor → `error.v1.code = stale_cursor`. **filter flags (p9-fb-36):** `--tag` 는 반복 가능 flag (`--tag rust --tag async`) 로 OR 매칭, `--media` 는 `,` 구분 다중 값 OR 매칭, 나머지 flags 간은 AND 조합. `--trust-min` 은 `primary\|secondary\|generated` 중 하나 (해당 level 이상 포함). `--ingested-after` 는 RFC3339 UTC — 파싱 실패 시 `error.v1.code = config_invalid` (exit 2). `--media md` 는 `markdown` alias 로 정규화. 알 수 없는 `--media` 값은 무조건 empty hits (오류 아님). **`--trace` (p9-fb-37)** — `search_response.v1.trace` 에 lexical / vector pre-fusion 후보 + RRF union + per-stage timing (`lexical_ms` / `vector_ms` / `fusion_ms` / `total_ms`) 노출. trace 요청은 캐시 우회 (`--no-cache` 없이도 항상 cold). **`--bulk` (p9-fb-42)** — stdin ndjson 으로 N query 한 번에 실행. `--json` 면 stdout per-query ndjson (`bulk_search_item.v1`) + stderr summary (`bulk_summary: total=N succeeded=S failed=F`). Cap 100. agent 가 query decomposition 후 sub-query 일괄 실행 시 single round-trip — App instance 재사용으로 캐시 / embedder cold-start 비용 한 번만. Per-query failure 는 item 의 `error` (error.v1) 에 격리, 다른 query 계속 진행. 
**code corpus filters (p10-1A-1):** `--repo` 는 반복 가능 (`--repo kebab --repo other`) OR 매칭. `--code-lang` 는 반복 또는 comma 다중 값 (`--code-lang rust,python`), 알 수 없는 값은 빈 hits. `--media code` 는 Tier 1/2/3 모든 code chunk 포함. 1A-1 시점에서는 indexed 된 code chunk 가 없어 filter 가 항상 빈 결과 — 1A-2 (Rust AST chunker) 머지 이후 실효. | | `kebab list docs` | 색인된 문서 목록 | | `kebab inspect doc ` / `kebab inspect chunk ` | raw record 보기 | @@ -132,7 +132,7 @@ flowchart TB subgraph Pipeline["도메인 + 파이프라인"] parse["parse-md / parse-pdf / parse-image / parse-code"] - chunker["chunker (md-heading-v1, pdf-page-v1, code-rust-ast-v1)"] + chunker["chunker (md-heading-v1, pdf-page-v1, code-rust-ast-v1, code-python-ast-v1, code-ts-ast-v1, code-js-ast-v1)"] embedder["embedder (fastembed multilingual-e5-large)"] retriever["retriever (lexical / vector / hybrid RRF)"] rag["RAG pipeline"] diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 39f43e8..dd837f7 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -39,7 +39,7 @@ use std::sync::Arc; use anyhow::{Context, anyhow}; use serde::{Deserialize, Serialize}; -use kebab_chunk::{CodeRustAstV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; +use kebab_chunk::{CodeJsAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTsAstV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; use kebab_core::{ Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker, DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, @@ -50,7 +50,7 @@ use kebab_core::{ use kebab_llm_local::OllamaLanguageModel; use kebab_normalize::build_canonical_document; use kebab_parse_image::{ImageExtractor, OllamaVisionOcr, apply_caption, apply_ocr}; -use kebab_parse_code::RustAstExtractor; +use kebab_parse_code::{JavascriptAstExtractor, PythonAstExtractor, RustAstExtractor, TypescriptAstExtractor}; use kebab_parse_pdf::PdfTextExtractor; use kebab_parse_md::{BodyHints, parse_blocks, parse_frontmatter}; use 
kebab_source_fs::FsSourceConnector; @@ -918,8 +918,10 @@ fn ingest_one_asset( force_reingest, ); } - // p10-1A-2 Task 8: Rust code ingest. - MediaType::Code(lang) if lang == "rust" => { + // p10-1A-2 / 1B: code ingest dispatch. + MediaType::Code(lang) + if matches!(lang.as_str(), "rust" | "python" | "typescript" | "javascript") => + { return ingest_one_code_asset( app, asset, @@ -928,6 +930,7 @@ fn ingest_one_asset( vector_store, existing_doc_ids, force_reingest, + lang.as_str(), ); } // p10-1A-2: non-Rust Code, Audio, and Other are not yet wired; @@ -1642,6 +1645,7 @@ fn ingest_one_pdf_asset( /// /// All other steps (incremental skip, byte read, ExtractContext, put_*, /// embed, purge_vector_orphans) are identical to the PDF function. +#[allow(clippy::too_many_arguments)] fn ingest_one_code_asset( app: &App, asset: &RawAsset, @@ -1650,6 +1654,7 @@ fn ingest_one_code_asset( vector_store: Option<&Arc>, existing_doc_ids: &std::collections::HashSet, force_reingest: bool, + code_lang: &str, // <-- NEW (p10-1b Task D) ) -> anyhow::Result { let path = match &asset.source_uri { SourceUri::File(p) => p.clone(), @@ -1671,17 +1676,30 @@ fn ingest_one_code_asset( }); } }; - // p10-1A-2 task 8: incremental-ingest early-skip for the code flow. - // Code docs use `code-rust-v1` as the parser_version and - // `CodeRustAstV1Chunker` as the chunker — both pinned per-medium - // today (no config knob). - let code_parser_version = - ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string()); + + // p10-1b Task D/G/J: parser_version per-lang. 
+ let parser_version = match code_lang { + "rust" => ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string()), + "python" => ParserVersion(kebab_parse_code::PYTHON_PARSER_VERSION.to_string()), + "typescript" => ParserVersion(kebab_parse_code::TS_PARSER_VERSION.to_string()), + "javascript" => ParserVersion(kebab_parse_code::JS_PARSER_VERSION.to_string()), + other => anyhow::bail!("unsupported code_lang: {other}"), + }; + + // p10-1b Task D/G/J/L: chunker_version per-lang. + let chunker_version = match code_lang { + "rust" => CodeRustAstV1Chunker.chunker_version(), + "python" => CodePythonAstV1Chunker.chunker_version(), + "typescript" => CodeTsAstV1Chunker.chunker_version(), + "javascript" => CodeJsAstV1Chunker.chunker_version(), + other => anyhow::bail!("unreachable chunker_version: {other}"), + }; + if let Some(item) = try_skip_unchanged( app, asset, - &code_parser_version, - &CodeRustAstV1Chunker.chunker_version(), + &parser_version, + &chunker_version, embedder.map(|e| e.model_version()).as_ref(), force_reingest, )? { @@ -1697,20 +1715,44 @@ fn ingest_one_code_asset( workspace_root: &workspace_root, config: &extract_config, }; - let mut canonical = RustAstExtractor::new() - .extract(&ctx, &bytes) - .context("kb-parse-code::RustAstExtractor::extract")?; - // Per-medium chunker selection: Rust code always uses code-rust-ast-v1 - // regardless of `config.chunking.chunker_version`. - let chunker = CodeRustAstV1Chunker; - let chunks = chunker - .chunk(&canonical, chunk_policy) - .context("kb-chunk::CodeRustAstV1Chunker::chunk")?; + // p10-1b Task D/G/J/L: extractor per-lang. 
+ let mut canonical = match code_lang { + "rust" => RustAstExtractor::new() + .extract(&ctx, &bytes) + .context("kb-parse-code::RustAstExtractor::extract (code:rust)")?, + "python" => PythonAstExtractor::new() + .extract(&ctx, &bytes) + .context("kb-parse-code::PythonAstExtractor::extract (code:python)")?, + "typescript" => TypescriptAstExtractor::new() + .extract(&ctx, &bytes) + .context("kb-parse-code::TypescriptAstExtractor::extract (code:typescript)")?, + "javascript" => JavascriptAstExtractor::new() + .extract(&ctx, &bytes) + .context("kb-parse-code::JavascriptAstExtractor::extract (code:javascript)")?, + other => anyhow::bail!("unreachable (extract): {other}"), + }; + + // p10-1b Task D/G/J/L: chunker per-lang. + let chunks = match code_lang { + "rust" => CodeRustAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeRustAstV1Chunker::chunk (code:rust)")?, + "python" => CodePythonAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::CodePythonAstV1Chunker::chunk (code:python)")?, + "typescript" => CodeTsAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeTsAstV1Chunker::chunk (code:typescript)")?, + "javascript" => CodeJsAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeJsAstV1Chunker::chunk (code:javascript)")?, + other => anyhow::bail!("unreachable (chunk): {other}"), + }; // Stamp chunker + embedding versions so incremental skip detection has // data on the second run. 
- canonical.last_chunker_version = Some(chunker.chunker_version()); + canonical.last_chunker_version = Some(chunker_version.clone()); if let Some(emb) = embedder { canonical.last_embedding_version = Some(emb.model_version()); } @@ -1794,7 +1836,7 @@ fn ingest_one_code_asset( block_count: u32::try_from(canonical.blocks.len()).ok(), chunk_count: u32::try_from(chunks.len()).ok(), parser_version: Some(canonical.parser_version.clone()), - chunker_version: Some(chunker.chunker_version()), + chunker_version: Some(chunker_version), warnings, error: None, }) diff --git a/crates/kebab-app/tests/code_ingest_smoke.rs b/crates/kebab-app/tests/code_ingest_smoke.rs index d6611f1..c18ecea 100644 --- a/crates/kebab-app/tests/code_ingest_smoke.rs +++ b/crates/kebab-app/tests/code_ingest_smoke.rs @@ -159,6 +159,237 @@ fn rust_code_search_hit_has_repo() { ); } +/// p10-1b Task G: a `.py` file in a sub-directory is ingested and the +/// resulting `Citation::Code` hit must carry `lang="python"`, +/// `symbol="kebab_eval.metrics.compute_mrr"`, and `line_start >= 1`. +/// The sub-directory (`kebab_eval/`) ensures `module_path_for_python` +/// produces a non-empty prefix so the fully-qualified symbol assertion +/// exercises the prefix wiring end-to-end. 
+#[test] +fn python_file_ingests_and_searches_as_code_citation() { + let env = TestEnv::lexical_only(); + + let module_dir = env.workspace_root.join("kebab_eval"); + std::fs::create_dir_all(&module_dir).unwrap(); + std::fs::write( + module_dir.join("metrics.py"), + "\"\"\"compute metrics.\"\"\"\ndef compute_mrr(scores):\n return sum(scores) / max(len(scores), 1)\n", + ) + .unwrap(); + + let report = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); + + assert!(report.new >= 1, "python file ingested: {report:?}"); + + let items = report.items.as_ref().expect("items present"); + let py_item = items + .iter() + .find(|i| i.doc_path.0.ends_with("metrics.py")) + .expect("metrics.py item"); + assert_eq!( + py_item.parser_version.as_ref().map(|p| p.0.as_str()), + Some("code-python-v1"), + "parser_version must be code-python-v1" + ); + assert_eq!( + py_item.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-python-ast-v1"), + "chunker_version must be code-python-ast-v1" + ); + + let hits = kebab_app::search_with_config(env.config.clone(), lexical_query("compute_mrr")) + .expect("search must succeed"); + + let h = hits + .iter() + .find(|h| matches!(&h.citation, Citation::Code { .. })) + .expect("at least one Citation::Code hit for 'compute_mrr'"); + + match &h.citation { + Citation::Code { + lang, + symbol, + line_start, + .. 
+ } => { + assert_eq!( + lang.as_deref(), + Some("python"), + "citation.lang must be 'python'" + ); + assert_eq!( + symbol.as_deref(), + Some("kebab_eval.metrics.compute_mrr"), + "citation.symbol must be 'kebab_eval.metrics.compute_mrr'" + ); + assert!(*line_start >= 1, "line_start must be >=1"); + } + _ => unreachable!(), + } + + assert_eq!( + h.code_lang.as_deref(), + Some("python"), + "SearchHit.code_lang must be 'python'" + ); +} + +/// p10-1b Task J: a `.ts` file in a sub-directory is ingested and the +/// resulting `Citation::Code` hit must carry `lang="typescript"`, +/// `symbol="src/Foo.Foo.bar"`, and `line_start >= 1`. +/// The sub-directory (`src/`) ensures `module_path_for_tsjs` produces +/// a non-empty prefix so the fully-qualified symbol assertion exercises +/// the prefix wiring end-to-end. +#[test] +fn typescript_file_ingests_and_searches_as_code_citation() { + let env = TestEnv::lexical_only(); + + let src_dir = env.workspace_root.join("src"); + std::fs::create_dir_all(&src_dir).unwrap(); + std::fs::write( + src_dir.join("Foo.ts"), + "export class Foo {\n bar(): number { return 42; }\n}\n", + ) + .unwrap(); + + let report = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); + + assert!(report.new >= 1, "ts file ingested: {report:?}"); + + let items = report.items.as_ref().expect("items present"); + let ts_item = items + .iter() + .find(|i| i.doc_path.0.ends_with("Foo.ts")) + .expect("Foo.ts item"); + assert_eq!( + ts_item.parser_version.as_ref().map(|p| p.0.as_str()), + Some("code-ts-v1"), + "parser_version must be code-ts-v1" + ); + assert_eq!( + ts_item.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-ts-ast-v1"), + "chunker_version must be code-ts-ast-v1" + ); + + let hits = kebab_app::search_with_config(env.config.clone(), lexical_query("bar")) + .expect("search must succeed"); + + let h = hits + .iter() + .find(|h| matches!(&h.citation, Citation::Code { .. 
})) + .expect("at least one Citation::Code hit for 'bar'"); + + match &h.citation { + Citation::Code { + lang, + symbol, + line_start, + .. + } => { + assert_eq!( + lang.as_deref(), + Some("typescript"), + "citation.lang must be 'typescript'" + ); + assert_eq!( + symbol.as_deref(), + Some("src/Foo.Foo.bar"), + "citation.symbol must be 'src/Foo.Foo.bar'" + ); + assert!(*line_start >= 1, "line_start must be >=1"); + } + _ => unreachable!(), + } + + assert_eq!( + h.code_lang.as_deref(), + Some("typescript"), + "SearchHit.code_lang must be 'typescript'" + ); +} + +/// p10-1b Task L: a `.js` file in a sub-directory is ingested and the +/// resulting `Citation::Code` hit must carry `lang="javascript"`, +/// `symbol="src/Bar.Bar.baz"`, and `line_start >= 1`. +/// The sub-directory (`src/`) ensures `module_path_for_tsjs` produces +/// a non-empty prefix so the fully-qualified symbol assertion exercises +/// the prefix wiring end-to-end. +#[test] +fn javascript_file_ingests_and_searches_as_code_citation() { + let env = TestEnv::lexical_only(); + + let src_dir = env.workspace_root.join("src"); + std::fs::create_dir_all(&src_dir).unwrap(); + std::fs::write( + src_dir.join("Bar.js"), + "export class Bar {\n baz() { return 7; }\n}\n", + ) + .unwrap(); + + let report = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); + + assert!(report.new >= 1, "js file ingested: {report:?}"); + + let items = report.items.as_ref().expect("items present"); + let js_item = items + .iter() + .find(|i| i.doc_path.0.ends_with("Bar.js")) + .expect("Bar.js item"); + assert_eq!( + js_item.parser_version.as_ref().map(|p| p.0.as_str()), + Some("code-js-v1"), + "parser_version must be code-js-v1" + ); + assert_eq!( + js_item.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-js-ast-v1"), + "chunker_version must be code-js-ast-v1" + ); + + let hits = kebab_app::search_with_config(env.config.clone(), lexical_query("baz")) + .expect("search 
must succeed"); + + let h = hits + .iter() + .find(|h| matches!(&h.citation, Citation::Code { .. })) + .expect("at least one Citation::Code hit for 'baz'"); + + match &h.citation { + Citation::Code { + lang, + symbol, + line_start, + .. + } => { + assert_eq!( + lang.as_deref(), + Some("javascript"), + "citation.lang must be 'javascript'" + ); + assert_eq!( + symbol.as_deref(), + Some("src/Bar.Bar.baz"), + "citation.symbol must be 'src/Bar.Bar.baz'" + ); + assert!(*line_start >= 1, "line_start must be >=1"); + } + _ => unreachable!(), + } + + assert_eq!( + h.code_lang.as_deref(), + Some("javascript"), + "SearchHit.code_lang must be 'javascript'" + ); +} + /// Re-ingesting the same `.rs` file without changes must report /// `Unchanged` (incremental-skip path exercised). #[test] diff --git a/crates/kebab-chunk/src/code_js_ast_v1.rs b/crates/kebab-chunk/src/code_js_ast_v1.rs new file mode 100644 index 0000000..7fe93cd --- /dev/null +++ b/crates/kebab-chunk/src/code_js_ast_v1.rs @@ -0,0 +1,322 @@ +//! `code-js-ast-v1` — maps a tree-sitter-derived JavaScript AST +//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with +//! `SourceSpan::Code`) to chunks 1:1. A unit longer than +//! `AST_CHUNK_MAX_LINES` is split into ` [part i/N]` sub-chunks +//! at blank-line paragraph boundaries (design §9.1 oversize fallback). +//! +//! tree-sitter is intentionally NOT a dependency here: AST work is +//! parser-side (`kebab-parse-code`, design §6.3). This chunker only +//! consumes the `CanonicalDocument`. +//! +//! `AST_CHUNK_MAX_LINES` is a constant matching +//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium +//! config threading needs a chunker registry (P+); same deviation +//! pattern as `pdf-page-v1`'s pinned `chunker_version` +//! (`tasks/HOTFIXES.md`). 
+ +use kebab_core::{ + Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId, + SourceSpan, id_for_chunk, +}; + +const VERSION_LABEL: &str = "code-js-ast-v1"; +const BYTES_PER_TOKEN: usize = 3; +const POLICY_HASH_HEX_LEN: usize = 16; +const AST_CHUNK_MAX_LINES: u32 = 200; + +#[derive(Clone, Copy, Debug, Default)] +pub struct CodeJsAstV1Chunker; + +impl Chunker for CodeJsAstV1Chunker { + fn chunker_version(&self) -> ChunkerVersion { + ChunkerVersion(VERSION_LABEL.to_string()) + } + + fn policy_hash(&self, policy: &ChunkPolicy) -> String { + let bytes = serde_json_canonicalizer::to_vec(policy) + .expect("canonical JSON serialization of ChunkPolicy must not fail"); + let hex = blake3::hash(&bytes).to_hex().to_string(); + hex[..POLICY_HASH_HEX_LEN].to_string() + } + + fn chunk( + &self, + doc: &CanonicalDocument, + policy: &ChunkPolicy, + ) -> anyhow::Result> { + for b in &doc.blocks { + let c = match b { + Block::Code(c) => c, + _ => anyhow::bail!( + "CodeJsAstV1Chunker only handles code docs (got non-Code block)" + ), + }; + if !matches!(c.common.source_span, SourceSpan::Code { .. 
}) { + anyhow::bail!( + "CodeJsAstV1Chunker only handles code docs (got non-Code source_span)" + ); + } + } + + let base_policy_hash = self.policy_hash(policy); + let chunker_version = self.chunker_version(); + let mut out: Vec = Vec::new(); + + for b in &doc.blocks { + let cb = match b { + Block::Code(c) => c, + _ => unreachable!("validated above"), + }; + let (ls, le, symbol, lang) = match &cb.common.source_span { + SourceSpan::Code { line_start, line_end, symbol, lang } => { + (*line_start, *line_end, symbol.clone(), lang.clone()) + } + _ => unreachable!("validated above"), + }; + let block_ids: Vec = vec![cb.common.block_id.clone()]; + let span_lines = le.saturating_sub(ls) + 1; + + if span_lines <= AST_CHUNK_MAX_LINES { + let span = SourceSpan::Code { + line_start: ls, + line_end: le, + symbol: symbol.clone(), + lang: lang.clone(), + }; + out.push(make_chunk( + doc, &chunker_version, &block_ids, &base_policy_hash, + None, span, cb.code.clone(), + )); + } else { + let parts = split_oversize(&cb.code); + let n = parts.len(); + for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() { + let part_ls = ls + off_start; + let part_le = ls + off_end; + let part_sym = symbol + .as_ref() + .map(|s| format!("{s} [part {}/{n}]", i + 1)); + let span = SourceSpan::Code { + line_start: part_ls, + line_end: part_le, + symbol: part_sym, + lang: lang.clone(), + }; + out.push(make_chunk( + doc, &chunker_version, &block_ids, &base_policy_hash, + Some(part_ls), span, text, + )); + } + } + } + + tracing::debug!( + target: "kebab-chunk", + doc_id = %doc.doc_id, + chunks = out.len(), + "code-js-ast-v1 chunked", + ); + Ok(out) + } +} + +#[allow(clippy::too_many_arguments)] +fn make_chunk( + doc: &CanonicalDocument, + chunker_version: &ChunkerVersion, + block_ids: &[BlockId], + base_policy_hash: &str, + split_key: Option, + span: SourceSpan, + text: String, +) -> Chunk { + let id_hash = match split_key { + Some(k) => format!("{base_policy_hash}#L{k}"), + None => 
base_policy_hash.to_string(), + }; + let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash); + let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN); + Chunk { + chunk_id, + doc_id: DocumentId(doc.doc_id.0.clone()), + block_ids: block_ids.to_vec(), + text, + heading_path: Vec::new(), + source_spans: vec![span], + token_estimate, + chunker_version: chunker_version.clone(), + policy_hash: base_policy_hash.to_string(), + } +} + +/// Split an oversize unit at blank-line paragraph boundaries, greedily +/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate. +/// Returns `(line_offset_start, line_offset_end, text)` where offsets are +/// 0-based within the unit (caller adds the unit's absolute `line_start`). +fn split_oversize(code: &str) -> Vec<(u32, u32, String)> { + let lines: Vec<&str> = code.split('\n').collect(); + let total = lines.len() as u32; + let mut out: Vec<(u32, u32, String)> = Vec::new(); + let mut start: u32 = 0; + while start < total { + let mut end = (start + AST_CHUNK_MAX_LINES).min(total); + let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5); + if end < total { + if let Some(b) = (floor.min(end)..end) + .rev() + .find(|&i| lines[i as usize].trim().is_empty()) + { + end = b + 1; + } + } + let text = lines[start as usize..end as usize].join("\n"); + out.push((start, end.saturating_sub(1), text)); + start = end; + } + if out.is_empty() { + out.push((0, total.saturating_sub(1), code.to_string())); + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + use kebab_core::{ + Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, + SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance, + SourceType, TrustLevel, WorkspacePath, + }; + use time::OffsetDateTime; + + fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument { + let wp = WorkspacePath("crates/x/src/a.js".into()); + let aid = AssetId("a".repeat(64)); + let pv = 
ParserVersion("code-js-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + let blocks = units + .iter() + .enumerate() + .map(|(i, (sym, ls, le, code))| { + let span = SourceSpan::Code { + line_start: *ls, + line_end: *le, + symbol: Some((*sym).to_string()), + lang: Some("javascript".into()), + }; + let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); + Block::Code(CodeBlock { + common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span }, + lang: Some("javascript".into()), + code: (*code).to_string(), + }) + }) + .collect(); + CanonicalDocument { + doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(), + lang: Lang("und".into()), blocks, + metadata: Metadata { + aliases: vec![], tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, trust_level: TrustLevel::Primary, + user_id_alias: None, user: Default::default(), + repo: Some("kebab".into()), git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), code_lang: Some("javascript".into()), + }, + provenance: Provenance { events: vec![] }, + parser_version: pv, schema_version: 1, doc_version: 1, + last_chunker_version: None, last_embedding_version: None, + } + } + fn policy() -> ChunkPolicy { + ChunkPolicy { target_tokens: 500, overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion(VERSION_LABEL.into()) } + } + + #[test] + fn chunker_version_is_code_js_ast_v1() { + assert_eq!(CodeJsAstV1Chunker.chunker_version(), + ChunkerVersion("code-js-ast-v1".into())); + } + + #[test] + fn one_chunk_per_unit_preserves_code_span() { + let doc = code_doc(&[ + ("parse", 1, 3, "function parse() {\n // x\n}"), + ("Foo.double", 5, 7, "function double() {\n //\n return 0;\n}"), + ]); + let chunks = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap(); + assert_eq!(chunks.len(), 2); + for c in &chunks { + 
assert_eq!(c.source_spans.len(), 1); + assert!(matches!(c.source_spans[0], SourceSpan::Code { .. })); + assert_eq!(c.heading_path, Vec::::new()); + assert_eq!(c.chunker_version.0, "code-js-ast-v1"); + } + match &chunks[0].source_spans[0] { + SourceSpan::Code { symbol, line_start, line_end, .. } => { + assert_eq!(symbol.as_deref(), Some("parse")); + assert_eq!((*line_start, *line_end), (1, 3)); + } + _ => unreachable!(), + } + } + + #[test] + fn oversize_unit_splits_into_parts_with_unique_ids() { + let body = (0..500).map(|i| format!(" const x{i} = {i};")).collect::>().join("\n"); + let code = format!("function big() {{\n{body}\n}}"); + let doc = code_doc(&[("big", 1, 502, &code)]); + let chunks = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap(); + assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len()); + for c in &chunks { + match &c.source_spans[0] { + SourceSpan::Code { symbol, .. } => { + assert!(symbol.as_deref().unwrap().starts_with("big [part "), + "part-numbered symbol, got {symbol:?}"); + } + _ => unreachable!(), + } + } + let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect(); + let n = ids.len(); ids.sort(); ids.dedup(); + assert_eq!(ids.len(), n, "chunk_ids unique across split parts"); + } + + #[test] + fn non_code_doc_errors() { + use kebab_core::TextBlock; + let mut doc = code_doc(&[("parse", 1, 1, "function parse() {}")]); + doc.blocks = vec![Block::Paragraph(TextBlock { + common: CommonBlock { + block_id: kebab_core::BlockId("b".into()), + heading_path: vec![], + source_span: SourceSpan::Line { start: 1, end: 1 }, + }, + text: "x".into(), inlines: vec![], + })]; + let err = CodeJsAstV1Chunker.chunk(&doc, &policy()).unwrap_err(); + assert!(err.to_string().contains("CodeJsAstV1Chunker")); + } + + #[test] + fn deterministic_chunk_ids_1000() { + let doc = code_doc(&[("parse", 1, 2, "function parse() {}\n")]); + let base: Vec = CodeJsAstV1Chunker.chunk(&doc, &policy()) + .unwrap().into_iter().map(|c| 
c.chunk_id.0).collect(); + for _ in 0..1000 { + let again: Vec = CodeJsAstV1Chunker.chunk(&doc, &policy()) + .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + assert_eq!(again, base); + } + } + + #[test] + fn policy_hash_matches_md_heading_v1() { + let p = policy(); + assert_eq!(CodeJsAstV1Chunker.policy_hash(&p), + crate::MdHeadingV1Chunker.policy_hash(&p)); + } +} diff --git a/crates/kebab-chunk/src/code_python_ast_v1.rs b/crates/kebab-chunk/src/code_python_ast_v1.rs new file mode 100644 index 0000000..e814130 --- /dev/null +++ b/crates/kebab-chunk/src/code_python_ast_v1.rs @@ -0,0 +1,322 @@ +//! `code-python-ast-v1` — maps a tree-sitter-derived Python AST +//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with +//! `SourceSpan::Code`) to chunks 1:1. A unit longer than +//! `AST_CHUNK_MAX_LINES` is split into ` [part i/N]` sub-chunks +//! at blank-line paragraph boundaries (design §9.1 oversize fallback). +//! +//! tree-sitter is intentionally NOT a dependency here: AST work is +//! parser-side (`kebab-parse-code`, design §6.3). This chunker only +//! consumes the `CanonicalDocument`. +//! +//! `AST_CHUNK_MAX_LINES` is a constant matching +//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium +//! config threading needs a chunker registry (P+); same deviation +//! pattern as `pdf-page-v1`'s pinned `chunker_version` +//! (`tasks/HOTFIXES.md`). 
+ +use kebab_core::{ + Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId, + SourceSpan, id_for_chunk, +}; + +const VERSION_LABEL: &str = "code-python-ast-v1"; +const BYTES_PER_TOKEN: usize = 3; +const POLICY_HASH_HEX_LEN: usize = 16; +const AST_CHUNK_MAX_LINES: u32 = 200; + +#[derive(Clone, Copy, Debug, Default)] +pub struct CodePythonAstV1Chunker; + +impl Chunker for CodePythonAstV1Chunker { + fn chunker_version(&self) -> ChunkerVersion { + ChunkerVersion(VERSION_LABEL.to_string()) + } + + fn policy_hash(&self, policy: &ChunkPolicy) -> String { + let bytes = serde_json_canonicalizer::to_vec(policy) + .expect("canonical JSON serialization of ChunkPolicy must not fail"); + let hex = blake3::hash(&bytes).to_hex().to_string(); + hex[..POLICY_HASH_HEX_LEN].to_string() + } + + fn chunk( + &self, + doc: &CanonicalDocument, + policy: &ChunkPolicy, + ) -> anyhow::Result> { + for b in &doc.blocks { + let c = match b { + Block::Code(c) => c, + _ => anyhow::bail!( + "CodePythonAstV1Chunker only handles code docs (got non-Code block)" + ), + }; + if !matches!(c.common.source_span, SourceSpan::Code { .. 
}) { + anyhow::bail!( + "CodePythonAstV1Chunker only handles code docs (got non-Code source_span)" + ); + } + } + + let base_policy_hash = self.policy_hash(policy); + let chunker_version = self.chunker_version(); + let mut out: Vec = Vec::new(); + + for b in &doc.blocks { + let cb = match b { + Block::Code(c) => c, + _ => unreachable!("validated above"), + }; + let (ls, le, symbol, lang) = match &cb.common.source_span { + SourceSpan::Code { line_start, line_end, symbol, lang } => { + (*line_start, *line_end, symbol.clone(), lang.clone()) + } + _ => unreachable!("validated above"), + }; + let block_ids: Vec = vec![cb.common.block_id.clone()]; + let span_lines = le.saturating_sub(ls) + 1; + + if span_lines <= AST_CHUNK_MAX_LINES { + let span = SourceSpan::Code { + line_start: ls, + line_end: le, + symbol: symbol.clone(), + lang: lang.clone(), + }; + out.push(make_chunk( + doc, &chunker_version, &block_ids, &base_policy_hash, + None, span, cb.code.clone(), + )); + } else { + let parts = split_oversize(&cb.code); + let n = parts.len(); + for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() { + let part_ls = ls + off_start; + let part_le = ls + off_end; + let part_sym = symbol + .as_ref() + .map(|s| format!("{s} [part {}/{n}]", i + 1)); + let span = SourceSpan::Code { + line_start: part_ls, + line_end: part_le, + symbol: part_sym, + lang: lang.clone(), + }; + out.push(make_chunk( + doc, &chunker_version, &block_ids, &base_policy_hash, + Some(part_ls), span, text, + )); + } + } + } + + tracing::debug!( + target: "kebab-chunk", + doc_id = %doc.doc_id, + chunks = out.len(), + "code-python-ast-v1 chunked", + ); + Ok(out) + } +} + +#[allow(clippy::too_many_arguments)] +fn make_chunk( + doc: &CanonicalDocument, + chunker_version: &ChunkerVersion, + block_ids: &[BlockId], + base_policy_hash: &str, + split_key: Option, + span: SourceSpan, + text: String, +) -> Chunk { + let id_hash = match split_key { + Some(k) => format!("{base_policy_hash}#L{k}"), + None => 
base_policy_hash.to_string(), + }; + let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash); + let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN); + Chunk { + chunk_id, + doc_id: DocumentId(doc.doc_id.0.clone()), + block_ids: block_ids.to_vec(), + text, + heading_path: Vec::new(), + source_spans: vec![span], + token_estimate, + chunker_version: chunker_version.clone(), + policy_hash: base_policy_hash.to_string(), + } +} + +/// Split an oversize unit at blank-line paragraph boundaries, greedily +/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate. +/// Returns `(line_offset_start, line_offset_end, text)` where offsets are +/// 0-based within the unit (caller adds the unit's absolute `line_start`). +fn split_oversize(code: &str) -> Vec<(u32, u32, String)> { + let lines: Vec<&str> = code.split('\n').collect(); + let total = lines.len() as u32; + let mut out: Vec<(u32, u32, String)> = Vec::new(); + let mut start: u32 = 0; + while start < total { + let mut end = (start + AST_CHUNK_MAX_LINES).min(total); + let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5); + if end < total { + if let Some(b) = (floor.min(end)..end) + .rev() + .find(|&i| lines[i as usize].trim().is_empty()) + { + end = b + 1; + } + } + let text = lines[start as usize..end as usize].join("\n"); + out.push((start, end.saturating_sub(1), text)); + start = end; + } + if out.is_empty() { + out.push((0, total.saturating_sub(1), code.to_string())); + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + use kebab_core::{ + Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, + SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance, + SourceType, TrustLevel, WorkspacePath, + }; + use time::OffsetDateTime; + + fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument { + let wp = WorkspacePath("crates/x/src/a.py".into()); + let aid = AssetId("a".repeat(64)); + let pv = 
ParserVersion("code-python-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + let blocks = units + .iter() + .enumerate() + .map(|(i, (sym, ls, le, code))| { + let span = SourceSpan::Code { + line_start: *ls, + line_end: *le, + symbol: Some((*sym).to_string()), + lang: Some("python".into()), + }; + let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); + Block::Code(CodeBlock { + common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span }, + lang: Some("python".into()), + code: (*code).to_string(), + }) + }) + .collect(); + CanonicalDocument { + doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(), + lang: Lang("und".into()), blocks, + metadata: Metadata { + aliases: vec![], tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, trust_level: TrustLevel::Primary, + user_id_alias: None, user: Default::default(), + repo: Some("kebab".into()), git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), code_lang: Some("python".into()), + }, + provenance: Provenance { events: vec![] }, + parser_version: pv, schema_version: 1, doc_version: 1, + last_chunker_version: None, last_embedding_version: None, + } + } + fn policy() -> ChunkPolicy { + ChunkPolicy { target_tokens: 500, overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion(VERSION_LABEL.into()) } + } + + #[test] + fn chunker_version_is_code_python_ast_v1() { + assert_eq!(CodePythonAstV1Chunker.chunker_version(), + ChunkerVersion("code-python-ast-v1".into())); + } + + #[test] + fn one_chunk_per_unit_preserves_code_span() { + let doc = code_doc(&[ + ("parse", 1, 3, "def parse():\n pass\n # x"), + ("Foo.double", 5, 7, "def double():\n #\n pass"), + ]); + let chunks = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap(); + assert_eq!(chunks.len(), 2); + for c in &chunks { + 
assert_eq!(c.source_spans.len(), 1); + assert!(matches!(c.source_spans[0], SourceSpan::Code { .. })); + assert_eq!(c.heading_path, Vec::::new()); + assert_eq!(c.chunker_version.0, "code-python-ast-v1"); + } + match &chunks[0].source_spans[0] { + SourceSpan::Code { symbol, line_start, line_end, .. } => { + assert_eq!(symbol.as_deref(), Some("parse")); + assert_eq!((*line_start, *line_end), (1, 3)); + } + _ => unreachable!(), + } + } + + #[test] + fn oversize_unit_splits_into_parts_with_unique_ids() { + let body = (0..500).map(|i| format!(" x{i} = {i}")).collect::>().join("\n"); + let code = format!("def big():\n{body}\n"); + let doc = code_doc(&[("big", 1, 502, &code)]); + let chunks = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap(); + assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len()); + for c in &chunks { + match &c.source_spans[0] { + SourceSpan::Code { symbol, .. } => { + assert!(symbol.as_deref().unwrap().starts_with("big [part "), + "part-numbered symbol, got {symbol:?}"); + } + _ => unreachable!(), + } + } + let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect(); + let n = ids.len(); ids.sort(); ids.dedup(); + assert_eq!(ids.len(), n, "chunk_ids unique across split parts"); + } + + #[test] + fn non_code_doc_errors() { + use kebab_core::TextBlock; + let mut doc = code_doc(&[("parse", 1, 1, "def parse(): pass")]); + doc.blocks = vec![Block::Paragraph(TextBlock { + common: CommonBlock { + block_id: kebab_core::BlockId("b".into()), + heading_path: vec![], + source_span: SourceSpan::Line { start: 1, end: 1 }, + }, + text: "x".into(), inlines: vec![], + })]; + let err = CodePythonAstV1Chunker.chunk(&doc, &policy()).unwrap_err(); + assert!(err.to_string().contains("CodePythonAstV1Chunker")); + } + + #[test] + fn deterministic_chunk_ids_1000() { + let doc = code_doc(&[("parse", 1, 2, "def parse(): pass\n")]); + let base: Vec = CodePythonAstV1Chunker.chunk(&doc, &policy()) + .unwrap().into_iter().map(|c| 
c.chunk_id.0).collect(); + for _ in 0..1000 { + let again: Vec = CodePythonAstV1Chunker.chunk(&doc, &policy()) + .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + assert_eq!(again, base); + } + } + + #[test] + fn policy_hash_matches_md_heading_v1() { + let p = policy(); + assert_eq!(CodePythonAstV1Chunker.policy_hash(&p), + crate::MdHeadingV1Chunker.policy_hash(&p)); + } +} diff --git a/crates/kebab-chunk/src/code_ts_ast_v1.rs b/crates/kebab-chunk/src/code_ts_ast_v1.rs new file mode 100644 index 0000000..4f273d7 --- /dev/null +++ b/crates/kebab-chunk/src/code_ts_ast_v1.rs @@ -0,0 +1,322 @@ +//! `code-ts-ast-v1` — maps a tree-sitter-derived TypeScript AST +//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with +//! `SourceSpan::Code`) to chunks 1:1. A unit longer than +//! `AST_CHUNK_MAX_LINES` is split into ` [part i/N]` sub-chunks +//! at blank-line paragraph boundaries (design §9.1 oversize fallback). +//! +//! tree-sitter is intentionally NOT a dependency here: AST work is +//! parser-side (`kebab-parse-code`, design §6.3). This chunker only +//! consumes the `CanonicalDocument`. +//! +//! `AST_CHUNK_MAX_LINES` is a constant matching +//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium +//! config threading needs a chunker registry (P+); same deviation +//! pattern as `pdf-page-v1`'s pinned `chunker_version` +//! (`tasks/HOTFIXES.md`). 
+ +use kebab_core::{ + Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId, + SourceSpan, id_for_chunk, +}; + +const VERSION_LABEL: &str = "code-ts-ast-v1"; +const BYTES_PER_TOKEN: usize = 3; +const POLICY_HASH_HEX_LEN: usize = 16; +const AST_CHUNK_MAX_LINES: u32 = 200; + +#[derive(Clone, Copy, Debug, Default)] +pub struct CodeTsAstV1Chunker; + +impl Chunker for CodeTsAstV1Chunker { + fn chunker_version(&self) -> ChunkerVersion { + ChunkerVersion(VERSION_LABEL.to_string()) + } + + fn policy_hash(&self, policy: &ChunkPolicy) -> String { + let bytes = serde_json_canonicalizer::to_vec(policy) + .expect("canonical JSON serialization of ChunkPolicy must not fail"); + let hex = blake3::hash(&bytes).to_hex().to_string(); + hex[..POLICY_HASH_HEX_LEN].to_string() + } + + fn chunk( + &self, + doc: &CanonicalDocument, + policy: &ChunkPolicy, + ) -> anyhow::Result> { + for b in &doc.blocks { + let c = match b { + Block::Code(c) => c, + _ => anyhow::bail!( + "CodeTsAstV1Chunker only handles code docs (got non-Code block)" + ), + }; + if !matches!(c.common.source_span, SourceSpan::Code { .. 
}) { + anyhow::bail!( + "CodeTsAstV1Chunker only handles code docs (got non-Code source_span)" + ); + } + } + + let base_policy_hash = self.policy_hash(policy); + let chunker_version = self.chunker_version(); + let mut out: Vec = Vec::new(); + + for b in &doc.blocks { + let cb = match b { + Block::Code(c) => c, + _ => unreachable!("validated above"), + }; + let (ls, le, symbol, lang) = match &cb.common.source_span { + SourceSpan::Code { line_start, line_end, symbol, lang } => { + (*line_start, *line_end, symbol.clone(), lang.clone()) + } + _ => unreachable!("validated above"), + }; + let block_ids: Vec = vec![cb.common.block_id.clone()]; + let span_lines = le.saturating_sub(ls) + 1; + + if span_lines <= AST_CHUNK_MAX_LINES { + let span = SourceSpan::Code { + line_start: ls, + line_end: le, + symbol: symbol.clone(), + lang: lang.clone(), + }; + out.push(make_chunk( + doc, &chunker_version, &block_ids, &base_policy_hash, + None, span, cb.code.clone(), + )); + } else { + let parts = split_oversize(&cb.code); + let n = parts.len(); + for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() { + let part_ls = ls + off_start; + let part_le = ls + off_end; + let part_sym = symbol + .as_ref() + .map(|s| format!("{s} [part {}/{n}]", i + 1)); + let span = SourceSpan::Code { + line_start: part_ls, + line_end: part_le, + symbol: part_sym, + lang: lang.clone(), + }; + out.push(make_chunk( + doc, &chunker_version, &block_ids, &base_policy_hash, + Some(part_ls), span, text, + )); + } + } + } + + tracing::debug!( + target: "kebab-chunk", + doc_id = %doc.doc_id, + chunks = out.len(), + "code-ts-ast-v1 chunked", + ); + Ok(out) + } +} + +#[allow(clippy::too_many_arguments)] +fn make_chunk( + doc: &CanonicalDocument, + chunker_version: &ChunkerVersion, + block_ids: &[BlockId], + base_policy_hash: &str, + split_key: Option, + span: SourceSpan, + text: String, +) -> Chunk { + let id_hash = match split_key { + Some(k) => format!("{base_policy_hash}#L{k}"), + None => 
base_policy_hash.to_string(), + }; + let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash); + let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN); + Chunk { + chunk_id, + doc_id: DocumentId(doc.doc_id.0.clone()), + block_ids: block_ids.to_vec(), + text, + heading_path: Vec::new(), + source_spans: vec![span], + token_estimate, + chunker_version: chunker_version.clone(), + policy_hash: base_policy_hash.to_string(), + } +} + +/// Split an oversize unit at blank-line paragraph boundaries, greedily +/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate. +/// Returns `(line_offset_start, line_offset_end, text)` where offsets are +/// 0-based within the unit (caller adds the unit's absolute `line_start`). +fn split_oversize(code: &str) -> Vec<(u32, u32, String)> { + let lines: Vec<&str> = code.split('\n').collect(); + let total = lines.len() as u32; + let mut out: Vec<(u32, u32, String)> = Vec::new(); + let mut start: u32 = 0; + while start < total { + let mut end = (start + AST_CHUNK_MAX_LINES).min(total); + let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5); + if end < total { + if let Some(b) = (floor.min(end)..end) + .rev() + .find(|&i| lines[i as usize].trim().is_empty()) + { + end = b + 1; + } + } + let text = lines[start as usize..end as usize].join("\n"); + out.push((start, end.saturating_sub(1), text)); + start = end; + } + if out.is_empty() { + out.push((0, total.saturating_sub(1), code.to_string())); + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + use kebab_core::{ + Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, + SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance, + SourceType, TrustLevel, WorkspacePath, + }; + use time::OffsetDateTime; + + fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument { + let wp = WorkspacePath("crates/x/src/a.ts".into()); + let aid = AssetId("a".repeat(64)); + let pv = 
ParserVersion("code-ts-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + let blocks = units + .iter() + .enumerate() + .map(|(i, (sym, ls, le, code))| { + let span = SourceSpan::Code { + line_start: *ls, + line_end: *le, + symbol: Some((*sym).to_string()), + lang: Some("typescript".into()), + }; + let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); + Block::Code(CodeBlock { + common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span }, + lang: Some("typescript".into()), + code: (*code).to_string(), + }) + }) + .collect(); + CanonicalDocument { + doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(), + lang: Lang("und".into()), blocks, + metadata: Metadata { + aliases: vec![], tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, trust_level: TrustLevel::Primary, + user_id_alias: None, user: Default::default(), + repo: Some("kebab".into()), git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), code_lang: Some("typescript".into()), + }, + provenance: Provenance { events: vec![] }, + parser_version: pv, schema_version: 1, doc_version: 1, + last_chunker_version: None, last_embedding_version: None, + } + } + fn policy() -> ChunkPolicy { + ChunkPolicy { target_tokens: 500, overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion(VERSION_LABEL.into()) } + } + + #[test] + fn chunker_version_is_code_ts_ast_v1() { + assert_eq!(CodeTsAstV1Chunker.chunker_version(), + ChunkerVersion("code-ts-ast-v1".into())); + } + + #[test] + fn one_chunk_per_unit_preserves_code_span() { + let doc = code_doc(&[ + ("parse", 1, 3, "function parse(): void {\n // x\n}"), + ("Foo.double", 5, 7, "function double(): number {\n //\n return 0;\n}"), + ]); + let chunks = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap(); + assert_eq!(chunks.len(), 2); + for c in 
&chunks { + assert_eq!(c.source_spans.len(), 1); + assert!(matches!(c.source_spans[0], SourceSpan::Code { .. })); + assert_eq!(c.heading_path, Vec::::new()); + assert_eq!(c.chunker_version.0, "code-ts-ast-v1"); + } + match &chunks[0].source_spans[0] { + SourceSpan::Code { symbol, line_start, line_end, .. } => { + assert_eq!(symbol.as_deref(), Some("parse")); + assert_eq!((*line_start, *line_end), (1, 3)); + } + _ => unreachable!(), + } + } + + #[test] + fn oversize_unit_splits_into_parts_with_unique_ids() { + let body = (0..500).map(|i| format!(" const x{i} = {i};")).collect::>().join("\n"); + let code = format!("function big(): void {{\n{body}\n}}"); + let doc = code_doc(&[("big", 1, 502, &code)]); + let chunks = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap(); + assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len()); + for c in &chunks { + match &c.source_spans[0] { + SourceSpan::Code { symbol, .. } => { + assert!(symbol.as_deref().unwrap().starts_with("big [part "), + "part-numbered symbol, got {symbol:?}"); + } + _ => unreachable!(), + } + } + let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect(); + let n = ids.len(); ids.sort(); ids.dedup(); + assert_eq!(ids.len(), n, "chunk_ids unique across split parts"); + } + + #[test] + fn non_code_doc_errors() { + use kebab_core::TextBlock; + let mut doc = code_doc(&[("parse", 1, 1, "function parse(): void {}")]); + doc.blocks = vec![Block::Paragraph(TextBlock { + common: CommonBlock { + block_id: kebab_core::BlockId("b".into()), + heading_path: vec![], + source_span: SourceSpan::Line { start: 1, end: 1 }, + }, + text: "x".into(), inlines: vec![], + })]; + let err = CodeTsAstV1Chunker.chunk(&doc, &policy()).unwrap_err(); + assert!(err.to_string().contains("CodeTsAstV1Chunker")); + } + + #[test] + fn deterministic_chunk_ids_1000() { + let doc = code_doc(&[("parse", 1, 2, "function parse(): void {}\n")]); + let base: Vec = CodeTsAstV1Chunker.chunk(&doc, &policy()) + 
.unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + for _ in 0..1000 { + let again: Vec = CodeTsAstV1Chunker.chunk(&doc, &policy()) + .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + assert_eq!(again, base); + } + } + + #[test] + fn policy_hash_matches_md_heading_v1() { + let p = policy(); + assert_eq!(CodeTsAstV1Chunker.policy_hash(&p), + crate::MdHeadingV1Chunker.policy_hash(&p)); + } +} diff --git a/crates/kebab-chunk/src/lib.rs b/crates/kebab-chunk/src/lib.rs index 88a8eb7..194e835 100644 --- a/crates/kebab-chunk/src/lib.rs +++ b/crates/kebab-chunk/src/lib.rs @@ -15,10 +15,16 @@ //! embedder, the retriever, the LLM, the RAG layer, or the UI layers. //! It consumes `CanonicalDocument` purely through `kb-core` types. +mod code_js_ast_v1; +mod code_python_ast_v1; mod code_rust_ast_v1; +mod code_ts_ast_v1; mod md_heading_v1; mod pdf_page_v1; +pub use code_js_ast_v1::CodeJsAstV1Chunker; +pub use code_python_ast_v1::CodePythonAstV1Chunker; pub use code_rust_ast_v1::CodeRustAstV1Chunker; +pub use code_ts_ast_v1::CodeTsAstV1Chunker; pub use md_heading_v1::MdHeadingV1Chunker; pub use pdf_page_v1::PdfPageV1Chunker; diff --git a/crates/kebab-chunk/tests/code_js_ast_snapshot.rs b/crates/kebab-chunk/tests/code_js_ast_snapshot.rs new file mode 100644 index 0000000..9cb818d --- /dev/null +++ b/crates/kebab-chunk/tests/code_js_ast_snapshot.rs @@ -0,0 +1,221 @@ +//! Snapshot test pinning the `Vec` JSON for a +//! representative JavaScript code `CanonicalDocument`. +//! +//! This is an integration test. `kebab-parse-code` is intentionally NOT +//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side). +//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code` +//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s +//! internal `code_doc` test helper. +//! +//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline. 
+ +use std::path::PathBuf; + +use kebab_chunk::CodeJsAstV1Chunker; +use kebab_core::{ + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, + Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath, + id_for_block, id_for_doc, +}; +use serde_json::Value; +use time::OffsetDateTime; + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") +} + +fn fixed_doc() -> CanonicalDocument { + let wp = WorkspacePath("src/bar.js".into()); + let aid = AssetId("b".repeat(64)); + // Pin parser_version so doc_id / block_ids are reproducible. + let pv = ParserVersion("code-js-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + + // Build a >200-line function body to force split_oversize. + let big_body: String = { + let header = "function bigTransform(items) {\n"; + let body: String = (0..210u32) + .map(|i| format!(" const v{i} = items[{i}] !== undefined ? items[{i}] : null;\n")) + .collect(); + let footer = " return items;\n}"; + format!("{header}{body}{footer}") + }; + let big_line_count = big_body.lines().count() as u32; + let big_line_end = 48 + big_line_count - 1; + + // Representative units: + // 0. require/import block (lines 1–5, ≤200) + // 1. free fn `add` (lines 7–12, ≤200) + // 2. class `EventBus` (lines 14–20, ≤200) + // 3. class `BaseHandler` (lines 22–30, ≤200) + // 4. method `EventBus.emit` (lines 32–38, ≤200) + // 5. method `EventBus.on` (lines 40–46, ≤200) + // 6. 
bigTransform (>200 lines) to force split_oversize + let raw_units: Vec<(&str, u32, u32, String)> = vec![ + ( + "requires", + 1, + 5, + "const fs = require('fs');\nconst path = require('path');\nconst { EventEmitter } = require('events');\nconst assert = require('assert');\nconst crypto = require('crypto');".to_string(), + ), + ( + "add", + 7, + 12, + "export function add(a, b) {\n if (typeof a !== 'number') throw new TypeError('a');\n if (typeof b !== 'number') throw new TypeError('b');\n const result = a + b;\n assert(isFinite(result));\n return result;\n}".to_string(), + ), + ( + "EventBus", + 14, + 20, + "class EventBus {\n constructor() {\n this._handlers = new Map();\n this._history = [];\n this._maxHistory = 100;\n this._seq = 0;\n }\n}".to_string(), + ), + ( + "BaseHandler", + 22, + 30, + "class BaseHandler {\n handle(event) {\n throw new Error('not implemented');\n }\n batchHandle(events) {\n const results = [];\n for (const ev of events) {\n results.push(this.handle(ev));\n }\n return results;\n }\n}".to_string(), + ), + ( + "EventBus.emit", + 32, + 38, + "class EventBus {\n emit(name, payload) {\n const handlers = this._handlers.get(name) ?? 
[];\n for (const h of handlers) {\n h(payload);\n }\n return this;\n }\n}".to_string(), + ), + ( + "EventBus.on", + 40, + 46, + "class EventBus {\n on(name, handler) {\n if (!this._handlers.has(name)) {\n this._handlers.set(name, []);\n }\n this._handlers.get(name).push(handler);\n return this;\n }\n}".to_string(), + ), + ("bigTransform", 48, big_line_end, big_body), + ]; + + let blocks: Vec = raw_units + .iter() + .enumerate() + .map(|(i, (sym, ls, le, code))| { + let span = SourceSpan::Code { + line_start: *ls, + line_end: *le, + symbol: Some((*sym).to_string()), + lang: Some("javascript".into()), + }; + let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); + Block::Code(CodeBlock { + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, + lang: Some("javascript".into()), + code: code.clone(), + }) + }) + .collect(); + + CanonicalDocument { + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "bar.js".into(), + lang: Lang("und".into()), + blocks, + metadata: Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("javascript".into()), + }, + provenance: Provenance { events: vec![] }, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + } +} + +fn fixed_policy() -> ChunkPolicy { + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion("code-js-ast-v1".into()), + } +} + +#[test] +fn code_js_ast_chunks_snapshot() { + let doc = fixed_doc(); + let policy = fixed_policy(); + + let chunks = 
CodeJsAstV1Chunker.chunk(&doc, &policy).expect("chunk"); + let actual = serde_json::to_value(&chunks).unwrap(); + + let dir = fixtures_dir(); + let baseline_path = dir.join("code-sample.js.chunks.snapshot.json"); + let baseline_text = match std::fs::read_to_string(&baseline_path) { + Ok(s) => s, + Err(_) if std::env::var("UPDATE_SNAPSHOTS").is_ok() => { + std::fs::create_dir_all(&dir).unwrap(); + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap(); + return; + } + Err(e) => panic!( + "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}", + baseline_path.display() + ), + }; + let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json"); + + if actual != expected { + if std::env::var("UPDATE_SNAPSHOTS").is_ok() { + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap(); + eprintln!("updated baseline {}", baseline_path.display()); + return; + } + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + panic!( + "code-js-ast-v1 chunks snapshot drift\n\ + --- expected ({}) ---\n{baseline_text}\n\ + --- actual ---\n{pretty}\n\ + If intentional, re-run with UPDATE_SNAPSHOTS=1.", + baseline_path.display() + ); + } +} + +/// Determinism cross-check: re-running the same pipeline yields the same +/// chunk_ids byte-for-byte. 
+#[test] +fn code_js_ast_chunks_are_deterministic() { + let policy = fixed_policy(); + let baseline: Vec = CodeJsAstV1Chunker + .chunk(&fixed_doc(), &policy) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); + for _ in 0..5 { + let again: Vec = CodeJsAstV1Chunker + .chunk(&fixed_doc(), &policy) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); + assert_eq!(again, baseline); + } +} diff --git a/crates/kebab-chunk/tests/code_python_ast_snapshot.rs b/crates/kebab-chunk/tests/code_python_ast_snapshot.rs new file mode 100644 index 0000000..2a164b1 --- /dev/null +++ b/crates/kebab-chunk/tests/code_python_ast_snapshot.rs @@ -0,0 +1,221 @@ +//! Snapshot test pinning the `Vec` JSON for a +//! representative Python code `CanonicalDocument`. +//! +//! This is an integration test. `kebab-parse-code` is intentionally NOT +//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side). +//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code` +//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s +//! internal `code_doc` test helper. +//! +//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline. + +use std::path::PathBuf; + +use kebab_chunk::CodePythonAstV1Chunker; +use kebab_core::{ + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, + Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath, + id_for_block, id_for_doc, +}; +use serde_json::Value; +use time::OffsetDateTime; + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") +} + +fn fixed_doc() -> CanonicalDocument { + let wp = WorkspacePath("kebab_eval/metrics.py".into()); + let aid = AssetId("b".repeat(64)); + // Pin parser_version so doc_id / block_ids are reproducible. 
+ let pv = ParserVersion("code-python-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + + // Build a >200-line function body to force split_oversize. + let big_body: String = { + let header = "def big_compute(data):\n"; + let body: String = (0..210u32) + .map(|i| format!(" v{i} = data[{i}] if {i} < len(data) else 0\n")) + .collect(); + let footer = " return sum(data)"; + format!("{header}{body}{footer}") + }; + let big_line_count = big_body.lines().count() as u32; + let big_line_end = 48 + big_line_count - 1; + + // Representative units: + // 0. import block (lines 1–5, ≤200) + // 1. free fn `compute_mrr` (lines 7–12, ≤200) + // 2. class `MetricsCollector` (lines 14–20, ≤200) + // 3. class `BaseEvaluator` (lines 22–30, ≤200) + // 4. method `run` (lines 32–38, ≤200) + // 5. method `report` (lines 40–46, ≤200) + // 6. big_compute (>200 lines) to force split_oversize + let raw_units: Vec<(&str, u32, u32, String)> = vec![ + ( + "imports", + 1, + 5, + "import os\nimport sys\nfrom typing import List\nfrom pathlib import Path\nfrom collections import defaultdict".to_string(), + ), + ( + "compute_mrr", + 7, + 12, + "def compute_mrr(scores):\n if not scores:\n return 0.0\n return sum(\n 1.0 / r for r in scores\n ) / len(scores)".to_string(), + ), + ( + "MetricsCollector", + 14, + 20, + "class MetricsCollector:\n def __init__(self):\n self.scores = []\n self.labels = []\n self.counts = defaultdict(int)\n self.totals = defaultdict(float)\n self.tags = []".to_string(), + ), + ( + "BaseEvaluator", + 22, + 30, + "class BaseEvaluator:\n def evaluate(self, data):\n raise NotImplementedError\n def batch_evaluate(self, items):\n results = []\n for item in items:\n results.append(self.evaluate(item))\n return results\n def name(self):\n return type(self).__name__".to_string(), + ), + ( + "MetricsCollector.run", + 32, + 38, + "class MetricsCollector:\n def run(self, inputs):\n for inp in inputs:\n score = self._score(inp)\n self.scores.append(\n score\n )".to_string(), + ), + 
( + "MetricsCollector.report", + 40, + 46, + "class MetricsCollector:\n def report(self):\n return {\n 'mean': sum(self.scores) / max(len(self.scores), 1),\n 'count': len(self.scores),\n 'tags': self.tags,\n }".to_string(), + ), + ("big_compute", 48, big_line_end, big_body), + ]; + + let blocks: Vec = raw_units + .iter() + .enumerate() + .map(|(i, (sym, ls, le, code))| { + let span = SourceSpan::Code { + line_start: *ls, + line_end: *le, + symbol: Some((*sym).to_string()), + lang: Some("python".into()), + }; + let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); + Block::Code(CodeBlock { + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, + lang: Some("python".into()), + code: code.clone(), + }) + }) + .collect(); + + CanonicalDocument { + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "metrics.py".into(), + lang: Lang("und".into()), + blocks, + metadata: Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("python".into()), + }, + provenance: Provenance { events: vec![] }, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + } +} + +fn fixed_policy() -> ChunkPolicy { + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion("code-python-ast-v1".into()), + } +} + +#[test] +fn code_python_ast_chunks_snapshot() { + let doc = fixed_doc(); + let policy = fixed_policy(); + + let chunks = CodePythonAstV1Chunker.chunk(&doc, &policy).expect("chunk"); + let actual = 
serde_json::to_value(&chunks).unwrap(); + + let dir = fixtures_dir(); + let baseline_path = dir.join("code-sample.py.chunks.snapshot.json"); + let baseline_text = match std::fs::read_to_string(&baseline_path) { + Ok(s) => s, + Err(_) if std::env::var("UPDATE_SNAPSHOTS").is_ok() => { + std::fs::create_dir_all(&dir).unwrap(); + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap(); + return; + } + Err(e) => panic!( + "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}", + baseline_path.display() + ), + }; + let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json"); + + if actual != expected { + if std::env::var("UPDATE_SNAPSHOTS").is_ok() { + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap(); + eprintln!("updated baseline {}", baseline_path.display()); + return; + } + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + panic!( + "code-python-ast-v1 chunks snapshot drift\n\ + --- expected ({}) ---\n{baseline_text}\n\ + --- actual ---\n{pretty}\n\ + If intentional, re-run with UPDATE_SNAPSHOTS=1.", + baseline_path.display() + ); + } +} + +/// Determinism cross-check: re-running the same pipeline yields the same +/// chunk_ids byte-for-byte. 
+#[test] +fn code_python_ast_chunks_are_deterministic() { + let policy = fixed_policy(); + let baseline: Vec = CodePythonAstV1Chunker + .chunk(&fixed_doc(), &policy) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); + for _ in 0..5 { + let again: Vec = CodePythonAstV1Chunker + .chunk(&fixed_doc(), &policy) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); + assert_eq!(again, baseline); + } +} diff --git a/crates/kebab-chunk/tests/code_ts_ast_snapshot.rs b/crates/kebab-chunk/tests/code_ts_ast_snapshot.rs new file mode 100644 index 0000000..bca0301 --- /dev/null +++ b/crates/kebab-chunk/tests/code_ts_ast_snapshot.rs @@ -0,0 +1,221 @@ +//! Snapshot test pinning the `Vec` JSON for a +//! representative TypeScript code `CanonicalDocument`. +//! +//! This is an integration test. `kebab-parse-code` is intentionally NOT +//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side). +//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code` +//! units, which is the same pattern used in `code_rust_ast_v1.rs`'s +//! internal `code_doc` test helper. +//! +//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline. + +use std::path::PathBuf; + +use kebab_chunk::CodeTsAstV1Chunker; +use kebab_core::{ + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, + Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath, + id_for_block, id_for_doc, +}; +use serde_json::Value; +use time::OffsetDateTime; + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") +} + +fn fixed_doc() -> CanonicalDocument { + let wp = WorkspacePath("src/Foo.ts".into()); + let aid = AssetId("b".repeat(64)); + // Pin parser_version so doc_id / block_ids are reproducible. + let pv = ParserVersion("code-ts-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + + // Build a >200-line method body to force split_oversize. 
+ let big_body: String = { + let header = "export class BigProcessor {\n process(items: string[]): string[] {\n"; + let body: String = (0..210u32) + .map(|i| format!(" const v{i} = items[{i}] ?? '';\n")) + .collect(); + let footer = " return items;\n }\n}"; + format!("{header}{body}{footer}") + }; + let big_line_count = big_body.lines().count() as u32; + let big_line_end = 48 + big_line_count - 1; + + // Representative units: + // 0. import block (lines 1–5, ≤200) + // 1. free fn `parseInput` (lines 7–12, ≤200) + // 2. interface `Frobable` (lines 14–20, ≤200) + // 3. class `Foo` (lines 22–30, ≤200) + // 4. method `Foo.double` (lines 32–38, ≤200) + // 5. method `Foo.triple` (lines 40–46, ≤200) + // 6. BigProcessor (>200 lines) to force split_oversize + let raw_units: Vec<(&str, u32, u32, String)> = vec![ + ( + "imports", + 1, + 5, + "import { readFileSync } from 'fs';\nimport { join } from 'path';\nimport type { Config } from './config';\nimport { Logger } from './logger';\nimport { EventEmitter } from 'events';".to_string(), + ), + ( + "parseInput", + 7, + 12, + "export function parseInput(raw: string): number | null {\n const trimmed = raw.trim();\n const n = Number(trimmed);\n if (isNaN(n)) return null;\n return n;\n}".to_string(), + ), + ( + "Frobable", + 14, + 20, + "export interface Frobable {\n frob(): string;\n frobTwice(): string;\n readonly name: string;\n readonly tags: string[];\n count: number;\n reset(): void;\n}".to_string(), + ), + ( + "Foo", + 22, + 30, + "export class Foo implements Frobable {\n constructor(\n public readonly name: string,\n public value: number,\n public tags: string[] = [],\n ) {}\n frob(): string { return this.name; }\n frobTwice(): string { return this.name.repeat(2); }\n reset(): void { this.value = 0; }\n}".to_string(), + ), + ( + "Foo.double", + 32, + 38, + "export class Foo {\n double(): number {\n const result = this.value * 2;\n if (result > Number.MAX_SAFE_INTEGER) {\n return Number.MAX_SAFE_INTEGER;\n }\n return 
result;\n }\n}".to_string(), + ), + ( + "Foo.triple", + 40, + 46, + "export class Foo {\n triple(): number {\n const result = this.value * 3;\n if (result > Number.MAX_SAFE_INTEGER) {\n return Number.MAX_SAFE_INTEGER;\n }\n return result;\n }\n}".to_string(), + ), + ("BigProcessor", 48, big_line_end, big_body), + ]; + + let blocks: Vec = raw_units + .iter() + .enumerate() + .map(|(i, (sym, ls, le, code))| { + let span = SourceSpan::Code { + line_start: *ls, + line_end: *le, + symbol: Some((*sym).to_string()), + lang: Some("typescript".into()), + }; + let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); + Block::Code(CodeBlock { + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, + lang: Some("typescript".into()), + code: code.clone(), + }) + }) + .collect(); + + CanonicalDocument { + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "Foo.ts".into(), + lang: Lang("und".into()), + blocks, + metadata: Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("typescript".into()), + }, + provenance: Provenance { events: vec![] }, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + } +} + +fn fixed_policy() -> ChunkPolicy { + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion("code-ts-ast-v1".into()), + } +} + +#[test] +fn code_ts_ast_chunks_snapshot() { + let doc = fixed_doc(); + let policy = fixed_policy(); + + let chunks = CodeTsAstV1Chunker.chunk(&doc, &policy).expect("chunk"); + let 
actual = serde_json::to_value(&chunks).unwrap(); + + let dir = fixtures_dir(); + let baseline_path = dir.join("code-sample.ts.chunks.snapshot.json"); + let baseline_text = match std::fs::read_to_string(&baseline_path) { + Ok(s) => s, + Err(_) if std::env::var("UPDATE_SNAPSHOTS").is_ok() => { + std::fs::create_dir_all(&dir).unwrap(); + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap(); + return; + } + Err(e) => panic!( + "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}", + baseline_path.display() + ), + }; + let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json"); + + if actual != expected { + if std::env::var("UPDATE_SNAPSHOTS").is_ok() { + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap(); + eprintln!("updated baseline {}", baseline_path.display()); + return; + } + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + panic!( + "code-ts-ast-v1 chunks snapshot drift\n\ + --- expected ({}) ---\n{baseline_text}\n\ + --- actual ---\n{pretty}\n\ + If intentional, re-run with UPDATE_SNAPSHOTS=1.", + baseline_path.display() + ); + } +} + +/// Determinism cross-check: re-running the same pipeline yields the same +/// chunk_ids byte-for-byte. 
+#[test]
+fn code_ts_ast_chunks_are_deterministic() {
+    let policy = fixed_policy();
+    let baseline: Vec<String> = CodeTsAstV1Chunker
+        .chunk(&fixed_doc(), &policy)
+        .unwrap()
+        .into_iter()
+        .map(|c| c.chunk_id.0)
+        .collect();
+    // `fixed_doc()` is rebuilt on every pass, so doc and block ids are
+    // re-derived each time as well.
+    for _ in 0..5 {
+        let again: Vec<String> = CodeTsAstV1Chunker
+            .chunk(&fixed_doc(), &policy)
+            .unwrap()
+            .into_iter()
+            .map(|c| c.chunk_id.0)
+            .collect();
+        assert_eq!(again, baseline);
+    }
+}
diff --git a/crates/kebab-chunk/tests/fixtures/code-sample.js.chunks.snapshot.json b/crates/kebab-chunk/tests/fixtures/code-sample.js.chunks.snapshot.json new file mode 100644 index 0000000..fb33e50 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/code-sample.js.chunks.snapshot.json @@ -0,0 +1,170 @@ +[ + { + "block_ids": [ + "cc724d960aebe9fb36062d24f4626c66" + ], + "chunk_id": "c9cf256456a47671fd6ceda800db8c05", + "chunker_version": "code-js-ast-v1", + "doc_id": "99d3bdd276489c2d51975eefb16cb64d", + "heading_path": [], + "policy_hash": "0d0cd223ca3431b2", + "source_spans": [ + { + "kind": "code", + "lang": "javascript", + "line_end": 5, + "line_start": 1, + "symbol": "requires" + } + ], + "text": "const fs = require('fs');\nconst path = require('path');\nconst { EventEmitter } = require('events');\nconst assert = require('assert');\nconst crypto = require('crypto');", + "token_estimate": 56 + }, + { + "block_ids": [ + "f0d00af94e8b3e0fe4249f66d27caedd" + ], + "chunk_id": "93756a717b518ce2a94d1390b1b6d4f5", + "chunker_version": "code-js-ast-v1", + "doc_id": "99d3bdd276489c2d51975eefb16cb64d", + "heading_path": [], + "policy_hash": "0d0cd223ca3431b2", + "source_spans": [ + { + "kind": "code", + "lang": "javascript", + "line_end": 12, + "line_start": 7, + "symbol": "add" + } + ], + "text": "export function add(a, b) {\n if (typeof a !== 'number') throw new TypeError('a');\n if (typeof b !== 'number') throw new TypeError('b');\n const result = a + b;\n assert(isFinite(result));\n return result;\n}", + "token_estimate": 70 + }, + { + "block_ids": [ +
"19e5e07c316d04e18ec0b10598c20ec7" + ], + "chunk_id": "1de13c1d85ba9c05e4f05f2d9c32820d", + "chunker_version": "code-js-ast-v1", + "doc_id": "99d3bdd276489c2d51975eefb16cb64d", + "heading_path": [], + "policy_hash": "0d0cd223ca3431b2", + "source_spans": [ + { + "kind": "code", + "lang": "javascript", + "line_end": 20, + "line_start": 14, + "symbol": "EventBus" + } + ], + "text": "class EventBus {\n constructor() {\n this._handlers = new Map();\n this._history = [];\n this._maxHistory = 100;\n this._seq = 0;\n }\n}", + "token_estimate": 48 + }, + { + "block_ids": [ + "8e016bd376edde2c49320c5094d01b67" + ], + "chunk_id": "db80b34645e2b9148a2ebd8967d44a64", + "chunker_version": "code-js-ast-v1", + "doc_id": "99d3bdd276489c2d51975eefb16cb64d", + "heading_path": [], + "policy_hash": "0d0cd223ca3431b2", + "source_spans": [ + { + "kind": "code", + "lang": "javascript", + "line_end": 30, + "line_start": 22, + "symbol": "BaseHandler" + } + ], + "text": "class BaseHandler {\n handle(event) {\n throw new Error('not implemented');\n }\n batchHandle(events) {\n const results = [];\n for (const ev of events) {\n results.push(this.handle(ev));\n }\n return results;\n }\n}", + "token_estimate": 77 + }, + { + "block_ids": [ + "e06656d11af2c1d7928856766382d168" + ], + "chunk_id": "7b0422e3646997d0cd2e694a6c4ca2e7", + "chunker_version": "code-js-ast-v1", + "doc_id": "99d3bdd276489c2d51975eefb16cb64d", + "heading_path": [], + "policy_hash": "0d0cd223ca3431b2", + "source_spans": [ + { + "kind": "code", + "lang": "javascript", + "line_end": 38, + "line_start": 32, + "symbol": "EventBus.emit" + } + ], + "text": "class EventBus {\n emit(name, payload) {\n const handlers = this._handlers.get(name) ?? 
[];\n for (const h of handlers) {\n h(payload);\n }\n return this;\n }\n}", + "token_estimate": 58 + }, + { + "block_ids": [ + "601a8af776f0634cfb4ccfa97e612afc" + ], + "chunk_id": "640269e1e8c1d052868f1f88558f28a2", + "chunker_version": "code-js-ast-v1", + "doc_id": "99d3bdd276489c2d51975eefb16cb64d", + "heading_path": [], + "policy_hash": "0d0cd223ca3431b2", + "source_spans": [ + { + "kind": "code", + "lang": "javascript", + "line_end": 46, + "line_start": 40, + "symbol": "EventBus.on" + } + ], + "text": "class EventBus {\n on(name, handler) {\n if (!this._handlers.has(name)) {\n this._handlers.set(name, []);\n }\n this._handlers.get(name).push(handler);\n return this;\n }\n}", + "token_estimate": 62 + }, + { + "block_ids": [ + "2bc61a811414be749c17290832857c7f" + ], + "chunk_id": "073802021b95a24f0c905b36dd9905c3", + "chunker_version": "code-js-ast-v1", + "doc_id": "99d3bdd276489c2d51975eefb16cb64d", + "heading_path": [], + "policy_hash": "0d0cd223ca3431b2", + "source_spans": [ + { + "kind": "code", + "lang": "javascript", + "line_end": 247, + "line_start": 48, + "symbol": "bigTransform [part 1/2]" + } + ], + "text": "function bigTransform(items) {\n const v0 = items[0] !== undefined ? items[0] : null;\n const v1 = items[1] !== undefined ? items[1] : null;\n const v2 = items[2] !== undefined ? items[2] : null;\n const v3 = items[3] !== undefined ? items[3] : null;\n const v4 = items[4] !== undefined ? items[4] : null;\n const v5 = items[5] !== undefined ? items[5] : null;\n const v6 = items[6] !== undefined ? items[6] : null;\n const v7 = items[7] !== undefined ? items[7] : null;\n const v8 = items[8] !== undefined ? items[8] : null;\n const v9 = items[9] !== undefined ? items[9] : null;\n const v10 = items[10] !== undefined ? items[10] : null;\n const v11 = items[11] !== undefined ? items[11] : null;\n const v12 = items[12] !== undefined ? items[12] : null;\n const v13 = items[13] !== undefined ? items[13] : null;\n const v14 = items[14] !== undefined ? 
items[14] : null;\n const v15 = items[15] !== undefined ? items[15] : null;\n const v16 = items[16] !== undefined ? items[16] : null;\n const v17 = items[17] !== undefined ? items[17] : null;\n const v18 = items[18] !== undefined ? items[18] : null;\n const v19 = items[19] !== undefined ? items[19] : null;\n const v20 = items[20] !== undefined ? items[20] : null;\n const v21 = items[21] !== undefined ? items[21] : null;\n const v22 = items[22] !== undefined ? items[22] : null;\n const v23 = items[23] !== undefined ? items[23] : null;\n const v24 = items[24] !== undefined ? items[24] : null;\n const v25 = items[25] !== undefined ? items[25] : null;\n const v26 = items[26] !== undefined ? items[26] : null;\n const v27 = items[27] !== undefined ? items[27] : null;\n const v28 = items[28] !== undefined ? items[28] : null;\n const v29 = items[29] !== undefined ? items[29] : null;\n const v30 = items[30] !== undefined ? items[30] : null;\n const v31 = items[31] !== undefined ? items[31] : null;\n const v32 = items[32] !== undefined ? items[32] : null;\n const v33 = items[33] !== undefined ? items[33] : null;\n const v34 = items[34] !== undefined ? items[34] : null;\n const v35 = items[35] !== undefined ? items[35] : null;\n const v36 = items[36] !== undefined ? items[36] : null;\n const v37 = items[37] !== undefined ? items[37] : null;\n const v38 = items[38] !== undefined ? items[38] : null;\n const v39 = items[39] !== undefined ? items[39] : null;\n const v40 = items[40] !== undefined ? items[40] : null;\n const v41 = items[41] !== undefined ? items[41] : null;\n const v42 = items[42] !== undefined ? items[42] : null;\n const v43 = items[43] !== undefined ? items[43] : null;\n const v44 = items[44] !== undefined ? items[44] : null;\n const v45 = items[45] !== undefined ? items[45] : null;\n const v46 = items[46] !== undefined ? items[46] : null;\n const v47 = items[47] !== undefined ? items[47] : null;\n const v48 = items[48] !== undefined ? 
items[48] : null;\n const v49 = items[49] !== undefined ? items[49] : null;\n const v50 = items[50] !== undefined ? items[50] : null;\n const v51 = items[51] !== undefined ? items[51] : null;\n const v52 = items[52] !== undefined ? items[52] : null;\n const v53 = items[53] !== undefined ? items[53] : null;\n const v54 = items[54] !== undefined ? items[54] : null;\n const v55 = items[55] !== undefined ? items[55] : null;\n const v56 = items[56] !== undefined ? items[56] : null;\n const v57 = items[57] !== undefined ? items[57] : null;\n const v58 = items[58] !== undefined ? items[58] : null;\n const v59 = items[59] !== undefined ? items[59] : null;\n const v60 = items[60] !== undefined ? items[60] : null;\n const v61 = items[61] !== undefined ? items[61] : null;\n const v62 = items[62] !== undefined ? items[62] : null;\n const v63 = items[63] !== undefined ? items[63] : null;\n const v64 = items[64] !== undefined ? items[64] : null;\n const v65 = items[65] !== undefined ? items[65] : null;\n const v66 = items[66] !== undefined ? items[66] : null;\n const v67 = items[67] !== undefined ? items[67] : null;\n const v68 = items[68] !== undefined ? items[68] : null;\n const v69 = items[69] !== undefined ? items[69] : null;\n const v70 = items[70] !== undefined ? items[70] : null;\n const v71 = items[71] !== undefined ? items[71] : null;\n const v72 = items[72] !== undefined ? items[72] : null;\n const v73 = items[73] !== undefined ? items[73] : null;\n const v74 = items[74] !== undefined ? items[74] : null;\n const v75 = items[75] !== undefined ? items[75] : null;\n const v76 = items[76] !== undefined ? items[76] : null;\n const v77 = items[77] !== undefined ? items[77] : null;\n const v78 = items[78] !== undefined ? items[78] : null;\n const v79 = items[79] !== undefined ? items[79] : null;\n const v80 = items[80] !== undefined ? items[80] : null;\n const v81 = items[81] !== undefined ? items[81] : null;\n const v82 = items[82] !== undefined ? 
items[82] : null;\n const v83 = items[83] !== undefined ? items[83] : null;\n const v84 = items[84] !== undefined ? items[84] : null;\n const v85 = items[85] !== undefined ? items[85] : null;\n const v86 = items[86] !== undefined ? items[86] : null;\n const v87 = items[87] !== undefined ? items[87] : null;\n const v88 = items[88] !== undefined ? items[88] : null;\n const v89 = items[89] !== undefined ? items[89] : null;\n const v90 = items[90] !== undefined ? items[90] : null;\n const v91 = items[91] !== undefined ? items[91] : null;\n const v92 = items[92] !== undefined ? items[92] : null;\n const v93 = items[93] !== undefined ? items[93] : null;\n const v94 = items[94] !== undefined ? items[94] : null;\n const v95 = items[95] !== undefined ? items[95] : null;\n const v96 = items[96] !== undefined ? items[96] : null;\n const v97 = items[97] !== undefined ? items[97] : null;\n const v98 = items[98] !== undefined ? items[98] : null;\n const v99 = items[99] !== undefined ? items[99] : null;\n const v100 = items[100] !== undefined ? items[100] : null;\n const v101 = items[101] !== undefined ? items[101] : null;\n const v102 = items[102] !== undefined ? items[102] : null;\n const v103 = items[103] !== undefined ? items[103] : null;\n const v104 = items[104] !== undefined ? items[104] : null;\n const v105 = items[105] !== undefined ? items[105] : null;\n const v106 = items[106] !== undefined ? items[106] : null;\n const v107 = items[107] !== undefined ? items[107] : null;\n const v108 = items[108] !== undefined ? items[108] : null;\n const v109 = items[109] !== undefined ? items[109] : null;\n const v110 = items[110] !== undefined ? items[110] : null;\n const v111 = items[111] !== undefined ? items[111] : null;\n const v112 = items[112] !== undefined ? items[112] : null;\n const v113 = items[113] !== undefined ? items[113] : null;\n const v114 = items[114] !== undefined ? items[114] : null;\n const v115 = items[115] !== undefined ? 
items[115] : null;\n const v116 = items[116] !== undefined ? items[116] : null;\n const v117 = items[117] !== undefined ? items[117] : null;\n const v118 = items[118] !== undefined ? items[118] : null;\n const v119 = items[119] !== undefined ? items[119] : null;\n const v120 = items[120] !== undefined ? items[120] : null;\n const v121 = items[121] !== undefined ? items[121] : null;\n const v122 = items[122] !== undefined ? items[122] : null;\n const v123 = items[123] !== undefined ? items[123] : null;\n const v124 = items[124] !== undefined ? items[124] : null;\n const v125 = items[125] !== undefined ? items[125] : null;\n const v126 = items[126] !== undefined ? items[126] : null;\n const v127 = items[127] !== undefined ? items[127] : null;\n const v128 = items[128] !== undefined ? items[128] : null;\n const v129 = items[129] !== undefined ? items[129] : null;\n const v130 = items[130] !== undefined ? items[130] : null;\n const v131 = items[131] !== undefined ? items[131] : null;\n const v132 = items[132] !== undefined ? items[132] : null;\n const v133 = items[133] !== undefined ? items[133] : null;\n const v134 = items[134] !== undefined ? items[134] : null;\n const v135 = items[135] !== undefined ? items[135] : null;\n const v136 = items[136] !== undefined ? items[136] : null;\n const v137 = items[137] !== undefined ? items[137] : null;\n const v138 = items[138] !== undefined ? items[138] : null;\n const v139 = items[139] !== undefined ? items[139] : null;\n const v140 = items[140] !== undefined ? items[140] : null;\n const v141 = items[141] !== undefined ? items[141] : null;\n const v142 = items[142] !== undefined ? items[142] : null;\n const v143 = items[143] !== undefined ? items[143] : null;\n const v144 = items[144] !== undefined ? items[144] : null;\n const v145 = items[145] !== undefined ? items[145] : null;\n const v146 = items[146] !== undefined ? items[146] : null;\n const v147 = items[147] !== undefined ? 
items[147] : null;\n const v148 = items[148] !== undefined ? items[148] : null;\n const v149 = items[149] !== undefined ? items[149] : null;\n const v150 = items[150] !== undefined ? items[150] : null;\n const v151 = items[151] !== undefined ? items[151] : null;\n const v152 = items[152] !== undefined ? items[152] : null;\n const v153 = items[153] !== undefined ? items[153] : null;\n const v154 = items[154] !== undefined ? items[154] : null;\n const v155 = items[155] !== undefined ? items[155] : null;\n const v156 = items[156] !== undefined ? items[156] : null;\n const v157 = items[157] !== undefined ? items[157] : null;\n const v158 = items[158] !== undefined ? items[158] : null;\n const v159 = items[159] !== undefined ? items[159] : null;\n const v160 = items[160] !== undefined ? items[160] : null;\n const v161 = items[161] !== undefined ? items[161] : null;\n const v162 = items[162] !== undefined ? items[162] : null;\n const v163 = items[163] !== undefined ? items[163] : null;\n const v164 = items[164] !== undefined ? items[164] : null;\n const v165 = items[165] !== undefined ? items[165] : null;\n const v166 = items[166] !== undefined ? items[166] : null;\n const v167 = items[167] !== undefined ? items[167] : null;\n const v168 = items[168] !== undefined ? items[168] : null;\n const v169 = items[169] !== undefined ? items[169] : null;\n const v170 = items[170] !== undefined ? items[170] : null;\n const v171 = items[171] !== undefined ? items[171] : null;\n const v172 = items[172] !== undefined ? items[172] : null;\n const v173 = items[173] !== undefined ? items[173] : null;\n const v174 = items[174] !== undefined ? items[174] : null;\n const v175 = items[175] !== undefined ? items[175] : null;\n const v176 = items[176] !== undefined ? items[176] : null;\n const v177 = items[177] !== undefined ? items[177] : null;\n const v178 = items[178] !== undefined ? items[178] : null;\n const v179 = items[179] !== undefined ? 
items[179] : null;\n const v180 = items[180] !== undefined ? items[180] : null;\n const v181 = items[181] !== undefined ? items[181] : null;\n const v182 = items[182] !== undefined ? items[182] : null;\n const v183 = items[183] !== undefined ? items[183] : null;\n const v184 = items[184] !== undefined ? items[184] : null;\n const v185 = items[185] !== undefined ? items[185] : null;\n const v186 = items[186] !== undefined ? items[186] : null;\n const v187 = items[187] !== undefined ? items[187] : null;\n const v188 = items[188] !== undefined ? items[188] : null;\n const v189 = items[189] !== undefined ? items[189] : null;\n const v190 = items[190] !== undefined ? items[190] : null;\n const v191 = items[191] !== undefined ? items[191] : null;\n const v192 = items[192] !== undefined ? items[192] : null;\n const v193 = items[193] !== undefined ? items[193] : null;\n const v194 = items[194] !== undefined ? items[194] : null;\n const v195 = items[195] !== undefined ? items[195] : null;\n const v196 = items[196] !== undefined ? items[196] : null;\n const v197 = items[197] !== undefined ? items[197] : null;\n const v198 = items[198] !== undefined ? items[198] : null;", + "token_estimate": 3947 + }, + { + "block_ids": [ + "2bc61a811414be749c17290832857c7f" + ], + "chunk_id": "62cdc27761f4a7767e6caceae2517977", + "chunker_version": "code-js-ast-v1", + "doc_id": "99d3bdd276489c2d51975eefb16cb64d", + "heading_path": [], + "policy_hash": "0d0cd223ca3431b2", + "source_spans": [ + { + "kind": "code", + "lang": "javascript", + "line_end": 260, + "line_start": 248, + "symbol": "bigTransform [part 2/2]" + } + ], + "text": " const v199 = items[199] !== undefined ? items[199] : null;\n const v200 = items[200] !== undefined ? items[200] : null;\n const v201 = items[201] !== undefined ? items[201] : null;\n const v202 = items[202] !== undefined ? items[202] : null;\n const v203 = items[203] !== undefined ? items[203] : null;\n const v204 = items[204] !== undefined ? 
items[204] : null;\n const v205 = items[205] !== undefined ? items[205] : null;\n const v206 = items[206] !== undefined ? items[206] : null;\n const v207 = items[207] !== undefined ? items[207] : null;\n const v208 = items[208] !== undefined ? items[208] : null;\n const v209 = items[209] !== undefined ? items[209] : null;\n return items;\n}", + "token_estimate": 230 + } +] diff --git a/crates/kebab-chunk/tests/fixtures/code-sample.py.chunks.snapshot.json b/crates/kebab-chunk/tests/fixtures/code-sample.py.chunks.snapshot.json new file mode 100644 index 0000000..1b9d86e --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/code-sample.py.chunks.snapshot.json @@ -0,0 +1,170 @@ +[ + { + "block_ids": [ + "bd1be1fd8b8f77e2874755010b36e617" + ], + "chunk_id": "20e05d99069f939104cdc69c7ef22889", + "chunker_version": "code-python-ast-v1", + "doc_id": "97ddfbda5585eb82ed09b0d7e95c0c03", + "heading_path": [], + "policy_hash": "383e9a070f636294", + "source_spans": [ + { + "kind": "code", + "lang": "python", + "line_end": 5, + "line_start": 1, + "symbol": "imports" + } + ], + "text": "import os\nimport sys\nfrom typing import List\nfrom pathlib import Path\nfrom collections import defaultdict", + "token_estimate": 35 + }, + { + "block_ids": [ + "2fe948bb529221e94c5139951cc65acf" + ], + "chunk_id": "99cef84788f2cbad3de6fb7c27b81c48", + "chunker_version": "code-python-ast-v1", + "doc_id": "97ddfbda5585eb82ed09b0d7e95c0c03", + "heading_path": [], + "policy_hash": "383e9a070f636294", + "source_spans": [ + { + "kind": "code", + "lang": "python", + "line_end": 12, + "line_start": 7, + "symbol": "compute_mrr" + } + ], + "text": "def compute_mrr(scores):\n if not scores:\n return 0.0\n return sum(\n 1.0 / r for r in scores\n ) / len(scores)", + "token_estimate": 44 + }, + { + "block_ids": [ + "ff944bad66bea107fd2500c35d7ddf68" + ], + "chunk_id": "28a3abdd51390c9c9bb89aa8b3ff3f46", + "chunker_version": "code-python-ast-v1", + "doc_id": "97ddfbda5585eb82ed09b0d7e95c0c03", + 
"heading_path": [], + "policy_hash": "383e9a070f636294", + "source_spans": [ + { + "kind": "code", + "lang": "python", + "line_end": 20, + "line_start": 14, + "symbol": "MetricsCollector" + } + ], + "text": "class MetricsCollector:\n def __init__(self):\n self.scores = []\n self.labels = []\n self.counts = defaultdict(int)\n self.totals = defaultdict(float)\n self.tags = []", + "token_estimate": 67 + }, + { + "block_ids": [ + "1e75f40c64ba21ad0bada0f5d35dc232" + ], + "chunk_id": "031086ad8c4b880d02cb52527382425c", + "chunker_version": "code-python-ast-v1", + "doc_id": "97ddfbda5585eb82ed09b0d7e95c0c03", + "heading_path": [], + "policy_hash": "383e9a070f636294", + "source_spans": [ + { + "kind": "code", + "lang": "python", + "line_end": 30, + "line_start": 22, + "symbol": "BaseEvaluator" + } + ], + "text": "class BaseEvaluator:\n def evaluate(self, data):\n raise NotImplementedError\n def batch_evaluate(self, items):\n results = []\n for item in items:\n results.append(self.evaluate(item))\n return results\n def name(self):\n return type(self).__name__", + "token_estimate": 99 + }, + { + "block_ids": [ + "33d08d6405adb459e90b8d67bab5cc80" + ], + "chunk_id": "a431bd5ab64b2f12634c0d4f4b3e0841", + "chunker_version": "code-python-ast-v1", + "doc_id": "97ddfbda5585eb82ed09b0d7e95c0c03", + "heading_path": [], + "policy_hash": "383e9a070f636294", + "source_spans": [ + { + "kind": "code", + "lang": "python", + "line_end": 38, + "line_start": 32, + "symbol": "MetricsCollector.run" + } + ], + "text": "class MetricsCollector:\n def run(self, inputs):\n for inp in inputs:\n score = self._score(inp)\n self.scores.append(\n score\n )", + "token_estimate": 61 + }, + { + "block_ids": [ + "af3d89eb1be6e11dfd14af3c86a8ba9c" + ], + "chunk_id": "00b756d5bcc43858bb98aa609f22ab6c", + "chunker_version": "code-python-ast-v1", + "doc_id": "97ddfbda5585eb82ed09b0d7e95c0c03", + "heading_path": [], + "policy_hash": "383e9a070f636294", + "source_spans": [ + { + "kind": "code", + "lang": 
"python", + "line_end": 46, + "line_start": 40, + "symbol": "MetricsCollector.report" + } + ], + "text": "class MetricsCollector:\n def report(self):\n return {\n 'mean': sum(self.scores) / max(len(self.scores), 1),\n 'count': len(self.scores),\n 'tags': self.tags,\n }", + "token_estimate": 69 + }, + { + "block_ids": [ + "c86acf6ae110d7f5681093c93ee0e5e5" + ], + "chunk_id": "90071017de40b5dd57e9d6001657cf14", + "chunker_version": "code-python-ast-v1", + "doc_id": "97ddfbda5585eb82ed09b0d7e95c0c03", + "heading_path": [], + "policy_hash": "383e9a070f636294", + "source_spans": [ + { + "kind": "code", + "lang": "python", + "line_end": 247, + "line_start": 48, + "symbol": "big_compute [part 1/2]" + } + ], + "text": "def big_compute(data):\n v0 = data[0] if 0 < len(data) else 0\n v1 = data[1] if 1 < len(data) else 0\n v2 = data[2] if 2 < len(data) else 0\n v3 = data[3] if 3 < len(data) else 0\n v4 = data[4] if 4 < len(data) else 0\n v5 = data[5] if 5 < len(data) else 0\n v6 = data[6] if 6 < len(data) else 0\n v7 = data[7] if 7 < len(data) else 0\n v8 = data[8] if 8 < len(data) else 0\n v9 = data[9] if 9 < len(data) else 0\n v10 = data[10] if 10 < len(data) else 0\n v11 = data[11] if 11 < len(data) else 0\n v12 = data[12] if 12 < len(data) else 0\n v13 = data[13] if 13 < len(data) else 0\n v14 = data[14] if 14 < len(data) else 0\n v15 = data[15] if 15 < len(data) else 0\n v16 = data[16] if 16 < len(data) else 0\n v17 = data[17] if 17 < len(data) else 0\n v18 = data[18] if 18 < len(data) else 0\n v19 = data[19] if 19 < len(data) else 0\n v20 = data[20] if 20 < len(data) else 0\n v21 = data[21] if 21 < len(data) else 0\n v22 = data[22] if 22 < len(data) else 0\n v23 = data[23] if 23 < len(data) else 0\n v24 = data[24] if 24 < len(data) else 0\n v25 = data[25] if 25 < len(data) else 0\n v26 = data[26] if 26 < len(data) else 0\n v27 = data[27] if 27 < len(data) else 0\n v28 = data[28] if 28 < len(data) else 0\n v29 = data[29] if 29 < len(data) else 0\n v30 = data[30] if 30 < 
len(data) else 0\n v31 = data[31] if 31 < len(data) else 0\n v32 = data[32] if 32 < len(data) else 0\n v33 = data[33] if 33 < len(data) else 0\n v34 = data[34] if 34 < len(data) else 0\n v35 = data[35] if 35 < len(data) else 0\n v36 = data[36] if 36 < len(data) else 0\n v37 = data[37] if 37 < len(data) else 0\n v38 = data[38] if 38 < len(data) else 0\n v39 = data[39] if 39 < len(data) else 0\n v40 = data[40] if 40 < len(data) else 0\n v41 = data[41] if 41 < len(data) else 0\n v42 = data[42] if 42 < len(data) else 0\n v43 = data[43] if 43 < len(data) else 0\n v44 = data[44] if 44 < len(data) else 0\n v45 = data[45] if 45 < len(data) else 0\n v46 = data[46] if 46 < len(data) else 0\n v47 = data[47] if 47 < len(data) else 0\n v48 = data[48] if 48 < len(data) else 0\n v49 = data[49] if 49 < len(data) else 0\n v50 = data[50] if 50 < len(data) else 0\n v51 = data[51] if 51 < len(data) else 0\n v52 = data[52] if 52 < len(data) else 0\n v53 = data[53] if 53 < len(data) else 0\n v54 = data[54] if 54 < len(data) else 0\n v55 = data[55] if 55 < len(data) else 0\n v56 = data[56] if 56 < len(data) else 0\n v57 = data[57] if 57 < len(data) else 0\n v58 = data[58] if 58 < len(data) else 0\n v59 = data[59] if 59 < len(data) else 0\n v60 = data[60] if 60 < len(data) else 0\n v61 = data[61] if 61 < len(data) else 0\n v62 = data[62] if 62 < len(data) else 0\n v63 = data[63] if 63 < len(data) else 0\n v64 = data[64] if 64 < len(data) else 0\n v65 = data[65] if 65 < len(data) else 0\n v66 = data[66] if 66 < len(data) else 0\n v67 = data[67] if 67 < len(data) else 0\n v68 = data[68] if 68 < len(data) else 0\n v69 = data[69] if 69 < len(data) else 0\n v70 = data[70] if 70 < len(data) else 0\n v71 = data[71] if 71 < len(data) else 0\n v72 = data[72] if 72 < len(data) else 0\n v73 = data[73] if 73 < len(data) else 0\n v74 = data[74] if 74 < len(data) else 0\n v75 = data[75] if 75 < len(data) else 0\n v76 = data[76] if 76 < len(data) else 0\n v77 = data[77] if 77 < len(data) else 0\n v78 = 
data[78] if 78 < len(data) else 0\n v79 = data[79] if 79 < len(data) else 0\n v80 = data[80] if 80 < len(data) else 0\n v81 = data[81] if 81 < len(data) else 0\n v82 = data[82] if 82 < len(data) else 0\n v83 = data[83] if 83 < len(data) else 0\n v84 = data[84] if 84 < len(data) else 0\n v85 = data[85] if 85 < len(data) else 0\n v86 = data[86] if 86 < len(data) else 0\n v87 = data[87] if 87 < len(data) else 0\n v88 = data[88] if 88 < len(data) else 0\n v89 = data[89] if 89 < len(data) else 0\n v90 = data[90] if 90 < len(data) else 0\n v91 = data[91] if 91 < len(data) else 0\n v92 = data[92] if 92 < len(data) else 0\n v93 = data[93] if 93 < len(data) else 0\n v94 = data[94] if 94 < len(data) else 0\n v95 = data[95] if 95 < len(data) else 0\n v96 = data[96] if 96 < len(data) else 0\n v97 = data[97] if 97 < len(data) else 0\n v98 = data[98] if 98 < len(data) else 0\n v99 = data[99] if 99 < len(data) else 0\n v100 = data[100] if 100 < len(data) else 0\n v101 = data[101] if 101 < len(data) else 0\n v102 = data[102] if 102 < len(data) else 0\n v103 = data[103] if 103 < len(data) else 0\n v104 = data[104] if 104 < len(data) else 0\n v105 = data[105] if 105 < len(data) else 0\n v106 = data[106] if 106 < len(data) else 0\n v107 = data[107] if 107 < len(data) else 0\n v108 = data[108] if 108 < len(data) else 0\n v109 = data[109] if 109 < len(data) else 0\n v110 = data[110] if 110 < len(data) else 0\n v111 = data[111] if 111 < len(data) else 0\n v112 = data[112] if 112 < len(data) else 0\n v113 = data[113] if 113 < len(data) else 0\n v114 = data[114] if 114 < len(data) else 0\n v115 = data[115] if 115 < len(data) else 0\n v116 = data[116] if 116 < len(data) else 0\n v117 = data[117] if 117 < len(data) else 0\n v118 = data[118] if 118 < len(data) else 0\n v119 = data[119] if 119 < len(data) else 0\n v120 = data[120] if 120 < len(data) else 0\n v121 = data[121] if 121 < len(data) else 0\n v122 = data[122] if 122 < len(data) else 0\n v123 = data[123] if 123 < len(data) else 0\n 
v124 = data[124] if 124 < len(data) else 0\n v125 = data[125] if 125 < len(data) else 0\n v126 = data[126] if 126 < len(data) else 0\n v127 = data[127] if 127 < len(data) else 0\n v128 = data[128] if 128 < len(data) else 0\n v129 = data[129] if 129 < len(data) else 0\n v130 = data[130] if 130 < len(data) else 0\n v131 = data[131] if 131 < len(data) else 0\n v132 = data[132] if 132 < len(data) else 0\n v133 = data[133] if 133 < len(data) else 0\n v134 = data[134] if 134 < len(data) else 0\n v135 = data[135] if 135 < len(data) else 0\n v136 = data[136] if 136 < len(data) else 0\n v137 = data[137] if 137 < len(data) else 0\n v138 = data[138] if 138 < len(data) else 0\n v139 = data[139] if 139 < len(data) else 0\n v140 = data[140] if 140 < len(data) else 0\n v141 = data[141] if 141 < len(data) else 0\n v142 = data[142] if 142 < len(data) else 0\n v143 = data[143] if 143 < len(data) else 0\n v144 = data[144] if 144 < len(data) else 0\n v145 = data[145] if 145 < len(data) else 0\n v146 = data[146] if 146 < len(data) else 0\n v147 = data[147] if 147 < len(data) else 0\n v148 = data[148] if 148 < len(data) else 0\n v149 = data[149] if 149 < len(data) else 0\n v150 = data[150] if 150 < len(data) else 0\n v151 = data[151] if 151 < len(data) else 0\n v152 = data[152] if 152 < len(data) else 0\n v153 = data[153] if 153 < len(data) else 0\n v154 = data[154] if 154 < len(data) else 0\n v155 = data[155] if 155 < len(data) else 0\n v156 = data[156] if 156 < len(data) else 0\n v157 = data[157] if 157 < len(data) else 0\n v158 = data[158] if 158 < len(data) else 0\n v159 = data[159] if 159 < len(data) else 0\n v160 = data[160] if 160 < len(data) else 0\n v161 = data[161] if 161 < len(data) else 0\n v162 = data[162] if 162 < len(data) else 0\n v163 = data[163] if 163 < len(data) else 0\n v164 = data[164] if 164 < len(data) else 0\n v165 = data[165] if 165 < len(data) else 0\n v166 = data[166] if 166 < len(data) else 0\n v167 = data[167] if 167 < len(data) else 0\n v168 = data[168] if 
168 < len(data) else 0\n v169 = data[169] if 169 < len(data) else 0\n v170 = data[170] if 170 < len(data) else 0\n v171 = data[171] if 171 < len(data) else 0\n v172 = data[172] if 172 < len(data) else 0\n v173 = data[173] if 173 < len(data) else 0\n v174 = data[174] if 174 < len(data) else 0\n v175 = data[175] if 175 < len(data) else 0\n v176 = data[176] if 176 < len(data) else 0\n v177 = data[177] if 177 < len(data) else 0\n v178 = data[178] if 178 < len(data) else 0\n v179 = data[179] if 179 < len(data) else 0\n v180 = data[180] if 180 < len(data) else 0\n v181 = data[181] if 181 < len(data) else 0\n v182 = data[182] if 182 < len(data) else 0\n v183 = data[183] if 183 < len(data) else 0\n v184 = data[184] if 184 < len(data) else 0\n v185 = data[185] if 185 < len(data) else 0\n v186 = data[186] if 186 < len(data) else 0\n v187 = data[187] if 187 < len(data) else 0\n v188 = data[188] if 188 < len(data) else 0\n v189 = data[189] if 189 < len(data) else 0\n v190 = data[190] if 190 < len(data) else 0\n v191 = data[191] if 191 < len(data) else 0\n v192 = data[192] if 192 < len(data) else 0\n v193 = data[193] if 193 < len(data) else 0\n v194 = data[194] if 194 < len(data) else 0\n v195 = data[195] if 195 < len(data) else 0\n v196 = data[196] if 196 < len(data) else 0\n v197 = data[197] if 197 < len(data) else 0\n v198 = data[198] if 198 < len(data) else 0", + "token_estimate": 3015 + }, + { + "block_ids": [ + "c86acf6ae110d7f5681093c93ee0e5e5" + ], + "chunk_id": "efc6599ac90e8de5fe8f63896a85d747", + "chunker_version": "code-python-ast-v1", + "doc_id": "97ddfbda5585eb82ed09b0d7e95c0c03", + "heading_path": [], + "policy_hash": "383e9a070f636294", + "source_spans": [ + { + "kind": "code", + "lang": "python", + "line_end": 259, + "line_start": 248, + "symbol": "big_compute [part 2/2]" + } + ], + "text": " v199 = data[199] if 199 < len(data) else 0\n v200 = data[200] if 200 < len(data) else 0\n v201 = data[201] if 201 < len(data) else 0\n v202 = data[202] if 202 < len(data) 
else 0\n v203 = data[203] if 203 < len(data) else 0\n v204 = data[204] if 204 < len(data) else 0\n v205 = data[205] if 205 < len(data) else 0\n v206 = data[206] if 206 < len(data) else 0\n v207 = data[207] if 207 < len(data) else 0\n v208 = data[208] if 208 < len(data) else 0\n v209 = data[209] if 209 < len(data) else 0\n return sum(data)", + "token_estimate": 179 + } +] diff --git a/crates/kebab-chunk/tests/fixtures/code-sample.ts.chunks.snapshot.json b/crates/kebab-chunk/tests/fixtures/code-sample.ts.chunks.snapshot.json new file mode 100644 index 0000000..446b98d --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/code-sample.ts.chunks.snapshot.json @@ -0,0 +1,170 @@ +[ + { + "block_ids": [ + "29c56554514c80a92a9d12410056e168" + ], + "chunk_id": "fc30b9a92970ee5fb940c2b12db2c005", + "chunker_version": "code-ts-ast-v1", + "doc_id": "ff6591709852ab9c57be6e50145b9800", + "heading_path": [], + "policy_hash": "dd45402f76b4e339", + "source_spans": [ + { + "kind": "code", + "lang": "typescript", + "line_end": 5, + "line_start": 1, + "symbol": "imports" + } + ], + "text": "import { readFileSync } from 'fs';\nimport { join } from 'path';\nimport type { Config } from './config';\nimport { Logger } from './logger';\nimport { EventEmitter } from 'events';", + "token_estimate": 59 + }, + { + "block_ids": [ + "e3f542c4928032926a1e21a159686a34" + ], + "chunk_id": "d5988988e20b69da53307b43f2d400ee", + "chunker_version": "code-ts-ast-v1", + "doc_id": "ff6591709852ab9c57be6e50145b9800", + "heading_path": [], + "policy_hash": "dd45402f76b4e339", + "source_spans": [ + { + "kind": "code", + "lang": "typescript", + "line_end": 12, + "line_start": 7, + "symbol": "parseInput" + } + ], + "text": "export function parseInput(raw: string): number | null {\n const trimmed = raw.trim();\n const n = Number(trimmed);\n if (isNaN(n)) return null;\n return n;\n}", + "token_estimate": 53 + }, + { + "block_ids": [ + "77d7f5ea7af7be27611adcbcee7c2e8f" + ], + "chunk_id": 
"f1147cabe4dff8bc33b56f8ff0b397e9", + "chunker_version": "code-ts-ast-v1", + "doc_id": "ff6591709852ab9c57be6e50145b9800", + "heading_path": [], + "policy_hash": "dd45402f76b4e339", + "source_spans": [ + { + "kind": "code", + "lang": "typescript", + "line_end": 20, + "line_start": 14, + "symbol": "Frobable" + } + ], + "text": "export interface Frobable {\n frob(): string;\n frobTwice(): string;\n readonly name: string;\n readonly tags: string[];\n count: number;\n reset(): void;\n}", + "token_estimate": 52 + }, + { + "block_ids": [ + "ee878891c19c9bacebe2e2d262c2ea77" + ], + "chunk_id": "bc07691ba8e249a360fe0e056eeff9ac", + "chunker_version": "code-ts-ast-v1", + "doc_id": "ff6591709852ab9c57be6e50145b9800", + "heading_path": [], + "policy_hash": "dd45402f76b4e339", + "source_spans": [ + { + "kind": "code", + "lang": "typescript", + "line_end": 30, + "line_start": 22, + "symbol": "Foo" + } + ], + "text": "export class Foo implements Frobable {\n constructor(\n public readonly name: string,\n public value: number,\n public tags: string[] = [],\n ) {}\n frob(): string { return this.name; }\n frobTwice(): string { return this.name.repeat(2); }\n reset(): void { this.value = 0; }\n}", + "token_estimate": 95 + }, + { + "block_ids": [ + "df08aa572f5c85d0e5d28d6490acc7bc" + ], + "chunk_id": "42b7bec354bbb69ded1c8da40d30250c", + "chunker_version": "code-ts-ast-v1", + "doc_id": "ff6591709852ab9c57be6e50145b9800", + "heading_path": [], + "policy_hash": "dd45402f76b4e339", + "source_spans": [ + { + "kind": "code", + "lang": "typescript", + "line_end": 38, + "line_start": 32, + "symbol": "Foo.double" + } + ], + "text": "export class Foo {\n double(): number {\n const result = this.value * 2;\n if (result > Number.MAX_SAFE_INTEGER) {\n return Number.MAX_SAFE_INTEGER;\n }\n return result;\n }\n}", + "token_estimate": 63 + }, + { + "block_ids": [ + "91aadf18fa97c1d7c94019e0968bc9c8" + ], + "chunk_id": "f218eaf2cb72f10a78c6a2090f72c215", + "chunker_version": "code-ts-ast-v1", + 
"doc_id": "ff6591709852ab9c57be6e50145b9800", + "heading_path": [], + "policy_hash": "dd45402f76b4e339", + "source_spans": [ + { + "kind": "code", + "lang": "typescript", + "line_end": 46, + "line_start": 40, + "symbol": "Foo.triple" + } + ], + "text": "export class Foo {\n triple(): number {\n const result = this.value * 3;\n if (result > Number.MAX_SAFE_INTEGER) {\n return Number.MAX_SAFE_INTEGER;\n }\n return result;\n }\n}", + "token_estimate": 63 + }, + { + "block_ids": [ + "d719400f1d79b522d0a1267331966be0" + ], + "chunk_id": "7bd082ae93cc75e683b2a9eb3f911ee9", + "chunker_version": "code-ts-ast-v1", + "doc_id": "ff6591709852ab9c57be6e50145b9800", + "heading_path": [], + "policy_hash": "dd45402f76b4e339", + "source_spans": [ + { + "kind": "code", + "lang": "typescript", + "line_end": 247, + "line_start": 48, + "symbol": "BigProcessor [part 1/2]" + } + ], + "text": "export class BigProcessor {\n process(items: string[]): string[] {\n const v0 = items[0] ?? '';\n const v1 = items[1] ?? '';\n const v2 = items[2] ?? '';\n const v3 = items[3] ?? '';\n const v4 = items[4] ?? '';\n const v5 = items[5] ?? '';\n const v6 = items[6] ?? '';\n const v7 = items[7] ?? '';\n const v8 = items[8] ?? '';\n const v9 = items[9] ?? '';\n const v10 = items[10] ?? '';\n const v11 = items[11] ?? '';\n const v12 = items[12] ?? '';\n const v13 = items[13] ?? '';\n const v14 = items[14] ?? '';\n const v15 = items[15] ?? '';\n const v16 = items[16] ?? '';\n const v17 = items[17] ?? '';\n const v18 = items[18] ?? '';\n const v19 = items[19] ?? '';\n const v20 = items[20] ?? '';\n const v21 = items[21] ?? '';\n const v22 = items[22] ?? '';\n const v23 = items[23] ?? '';\n const v24 = items[24] ?? '';\n const v25 = items[25] ?? '';\n const v26 = items[26] ?? '';\n const v27 = items[27] ?? '';\n const v28 = items[28] ?? '';\n const v29 = items[29] ?? '';\n const v30 = items[30] ?? '';\n const v31 = items[31] ?? '';\n const v32 = items[32] ?? '';\n const v33 = items[33] ?? 
'';\n const v34 = items[34] ?? '';\n const v35 = items[35] ?? '';\n const v36 = items[36] ?? '';\n const v37 = items[37] ?? '';\n const v38 = items[38] ?? '';\n const v39 = items[39] ?? '';\n const v40 = items[40] ?? '';\n const v41 = items[41] ?? '';\n const v42 = items[42] ?? '';\n const v43 = items[43] ?? '';\n const v44 = items[44] ?? '';\n const v45 = items[45] ?? '';\n const v46 = items[46] ?? '';\n const v47 = items[47] ?? '';\n const v48 = items[48] ?? '';\n const v49 = items[49] ?? '';\n const v50 = items[50] ?? '';\n const v51 = items[51] ?? '';\n const v52 = items[52] ?? '';\n const v53 = items[53] ?? '';\n const v54 = items[54] ?? '';\n const v55 = items[55] ?? '';\n const v56 = items[56] ?? '';\n const v57 = items[57] ?? '';\n const v58 = items[58] ?? '';\n const v59 = items[59] ?? '';\n const v60 = items[60] ?? '';\n const v61 = items[61] ?? '';\n const v62 = items[62] ?? '';\n const v63 = items[63] ?? '';\n const v64 = items[64] ?? '';\n const v65 = items[65] ?? '';\n const v66 = items[66] ?? '';\n const v67 = items[67] ?? '';\n const v68 = items[68] ?? '';\n const v69 = items[69] ?? '';\n const v70 = items[70] ?? '';\n const v71 = items[71] ?? '';\n const v72 = items[72] ?? '';\n const v73 = items[73] ?? '';\n const v74 = items[74] ?? '';\n const v75 = items[75] ?? '';\n const v76 = items[76] ?? '';\n const v77 = items[77] ?? '';\n const v78 = items[78] ?? '';\n const v79 = items[79] ?? '';\n const v80 = items[80] ?? '';\n const v81 = items[81] ?? '';\n const v82 = items[82] ?? '';\n const v83 = items[83] ?? '';\n const v84 = items[84] ?? '';\n const v85 = items[85] ?? '';\n const v86 = items[86] ?? '';\n const v87 = items[87] ?? '';\n const v88 = items[88] ?? '';\n const v89 = items[89] ?? '';\n const v90 = items[90] ?? '';\n const v91 = items[91] ?? '';\n const v92 = items[92] ?? '';\n const v93 = items[93] ?? '';\n const v94 = items[94] ?? '';\n const v95 = items[95] ?? '';\n const v96 = items[96] ?? '';\n const v97 = items[97] ?? 
'';\n const v98 = items[98] ?? '';\n const v99 = items[99] ?? '';\n const v100 = items[100] ?? '';\n const v101 = items[101] ?? '';\n const v102 = items[102] ?? '';\n const v103 = items[103] ?? '';\n const v104 = items[104] ?? '';\n const v105 = items[105] ?? '';\n const v106 = items[106] ?? '';\n const v107 = items[107] ?? '';\n const v108 = items[108] ?? '';\n const v109 = items[109] ?? '';\n const v110 = items[110] ?? '';\n const v111 = items[111] ?? '';\n const v112 = items[112] ?? '';\n const v113 = items[113] ?? '';\n const v114 = items[114] ?? '';\n const v115 = items[115] ?? '';\n const v116 = items[116] ?? '';\n const v117 = items[117] ?? '';\n const v118 = items[118] ?? '';\n const v119 = items[119] ?? '';\n const v120 = items[120] ?? '';\n const v121 = items[121] ?? '';\n const v122 = items[122] ?? '';\n const v123 = items[123] ?? '';\n const v124 = items[124] ?? '';\n const v125 = items[125] ?? '';\n const v126 = items[126] ?? '';\n const v127 = items[127] ?? '';\n const v128 = items[128] ?? '';\n const v129 = items[129] ?? '';\n const v130 = items[130] ?? '';\n const v131 = items[131] ?? '';\n const v132 = items[132] ?? '';\n const v133 = items[133] ?? '';\n const v134 = items[134] ?? '';\n const v135 = items[135] ?? '';\n const v136 = items[136] ?? '';\n const v137 = items[137] ?? '';\n const v138 = items[138] ?? '';\n const v139 = items[139] ?? '';\n const v140 = items[140] ?? '';\n const v141 = items[141] ?? '';\n const v142 = items[142] ?? '';\n const v143 = items[143] ?? '';\n const v144 = items[144] ?? '';\n const v145 = items[145] ?? '';\n const v146 = items[146] ?? '';\n const v147 = items[147] ?? '';\n const v148 = items[148] ?? '';\n const v149 = items[149] ?? '';\n const v150 = items[150] ?? '';\n const v151 = items[151] ?? '';\n const v152 = items[152] ?? '';\n const v153 = items[153] ?? '';\n const v154 = items[154] ?? '';\n const v155 = items[155] ?? '';\n const v156 = items[156] ?? '';\n const v157 = items[157] ?? 
'';\n const v158 = items[158] ?? '';\n const v159 = items[159] ?? '';\n const v160 = items[160] ?? '';\n const v161 = items[161] ?? '';\n const v162 = items[162] ?? '';\n const v163 = items[163] ?? '';\n const v164 = items[164] ?? '';\n const v165 = items[165] ?? '';\n const v166 = items[166] ?? '';\n const v167 = items[167] ?? '';\n const v168 = items[168] ?? '';\n const v169 = items[169] ?? '';\n const v170 = items[170] ?? '';\n const v171 = items[171] ?? '';\n const v172 = items[172] ?? '';\n const v173 = items[173] ?? '';\n const v174 = items[174] ?? '';\n const v175 = items[175] ?? '';\n const v176 = items[176] ?? '';\n const v177 = items[177] ?? '';\n const v178 = items[178] ?? '';\n const v179 = items[179] ?? '';\n const v180 = items[180] ?? '';\n const v181 = items[181] ?? '';\n const v182 = items[182] ?? '';\n const v183 = items[183] ?? '';\n const v184 = items[184] ?? '';\n const v185 = items[185] ?? '';\n const v186 = items[186] ?? '';\n const v187 = items[187] ?? '';\n const v188 = items[188] ?? '';\n const v189 = items[189] ?? '';\n const v190 = items[190] ?? '';\n const v191 = items[191] ?? '';\n const v192 = items[192] ?? '';\n const v193 = items[193] ?? '';\n const v194 = items[194] ?? '';\n const v195 = items[195] ?? '';\n const v196 = items[196] ?? '';\n const v197 = items[197] ?? '';", + "token_estimate": 2259 + }, + { + "block_ids": [ + "d719400f1d79b522d0a1267331966be0" + ], + "chunk_id": "fd63728143d8936de0faab714fbd4165", + "chunker_version": "code-ts-ast-v1", + "doc_id": "ff6591709852ab9c57be6e50145b9800", + "heading_path": [], + "policy_hash": "dd45402f76b4e339", + "source_spans": [ + { + "kind": "code", + "lang": "typescript", + "line_end": 262, + "line_start": 248, + "symbol": "BigProcessor [part 2/2]" + } + ], + "text": " const v198 = items[198] ?? '';\n const v199 = items[199] ?? '';\n const v200 = items[200] ?? '';\n const v201 = items[201] ?? '';\n const v202 = items[202] ?? '';\n const v203 = items[203] ?? 
'';\n const v204 = items[204] ?? '';\n const v205 = items[205] ?? '';\n const v206 = items[206] ?? '';\n const v207 = items[207] ?? '';\n const v208 = items[208] ?? '';\n const v209 = items[209] ?? '';\n return items;\n }\n}", + "token_estimate": 148 + } +] diff --git a/crates/kebab-parse-code/Cargo.toml b/crates/kebab-parse-code/Cargo.toml index 5ef1c69..b17617c 100644 --- a/crates/kebab-parse-code/Cargo.toml +++ b/crates/kebab-parse-code/Cargo.toml @@ -16,6 +16,9 @@ time = { workspace = true } tracing = { workspace = true } tree-sitter = { workspace = true } tree-sitter-rust = { workspace = true } +tree-sitter-python = { workspace = true } +tree-sitter-typescript = { workspace = true } +tree-sitter-javascript = { workspace = true } [dev-dependencies] tempfile = { workspace = true } diff --git a/crates/kebab-parse-code/src/javascript.rs b/crates/kebab-parse-code/src/javascript.rs new file mode 100644 index 0000000..f2e2a16 --- /dev/null +++ b/crates/kebab-parse-code/src/javascript.rs @@ -0,0 +1,574 @@ +//! `kebab-parse-code::javascript` — tree-sitter JavaScript / JSX AST +//! extractor (P10-1B Task K). +//! +//! Implements [`kebab_core::Extractor`] for [`MediaType::Code("javascript")`]. +//! Walks the tree-sitter parse tree (single grammar +//! [`tree_sitter_javascript::LANGUAGE`] — the JS grammar handles `.jsx` +//! as well, no second grammar needed) and emits one [`Block::Code`] per +//! top-level AST semantic unit (free fn, class, each method, +//! recursively per nested class), each carrying [`SourceSpan::Code`] +//! with the unit's dotted symbol path prefixed by +//! [`module_path_for_tsjs`]. +//! +//! Glue declarations (`import_statement`, bare `export_statement` +//! re-exports, `lexical_declaration` / `variable_declaration` at the +//! module level, etc.) collapse into one grouped `` (or +//! ``) unit. +//! +//! `export_statement` is unwrapped: an `export function|class` is +//! treated as the inner declaration arm but the unit's line range +//! 
comes from the OUTER `export_statement` so the `export ` prefix is +//! folded in. `export default function () {}` / `export default class +//! {}` (no `name` field) emits `default` as the symbol name. +//! +//! Differs from `typescript.rs` only by: single-grammar (no +//! TS/TSX selection) and no `interface_declaration` / +//! `type_alias_declaration` / `enum_declaration` arms (TS-only). All +//! other walker behavior (export unwrap with `value`-field quirk for +//! default-exported anonymous function/class, class-body method walk, +//! glue flush, post-pass `` → `` rewrite) is +//! identical. +//! +//! Scope follows 1A-2 / 1B Task K: AST unit extraction + dotted symbol +//! paths + line ranges. Per design §3.4 / §9.1 / §9 versioning. + +use anyhow::Result; +use kebab_core::{ + Block, CanonicalDocument, CodeBlock, CommonBlock, Extractor, Lang, MediaType, Metadata, + ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType, TrustLevel, + id_for_block, id_for_doc, +}; +use serde_json::Map; +use time::OffsetDateTime; + +use crate::scaffold::{filename_from_workspace_path, join_symbol, strip_extension}; + +pub const PARSER_VERSION: &str = "code-js-v1"; + +/// JavaScript / JSX AST extractor. Per-unit blocks via +/// tree-sitter-javascript 0.25 (single `LANGUAGE` `LanguageFn` — the +/// JS grammar covers `.jsx` natively, no second grammar) parsed by +/// tree-sitter 0.26. 
+pub struct JavascriptAstExtractor; + +impl JavascriptAstExtractor { + pub fn new() -> Self { + Self + } +} + +impl Default for JavascriptAstExtractor { + fn default() -> Self { + Self::new() + } +} + +impl Extractor for JavascriptAstExtractor { + fn supports(&self, m: &MediaType) -> bool { + matches!(m, MediaType::Code(l) if l == "javascript") + } + + fn parser_version(&self) -> ParserVersion { + ParserVersion(PARSER_VERSION.to_string()) + } + + fn extract( + &self, + ctx: &kebab_core::ExtractContext<'_>, + bytes: &[u8], + ) -> Result { + let asset = ctx.asset; + if !self.supports(&asset.media_type) { + anyhow::bail!( + "kebab-parse-code: unsupported media_type for JavascriptAstExtractor: {:?}", + asset.media_type + ); + } + + let parser_version = self.parser_version(); + let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, &parser_version); + + let source = String::from_utf8(bytes.to_vec()).map_err(|e| { + anyhow::anyhow!("kebab-parse-code: JavaScript source is not valid UTF-8: {e}") + })?; + + let mod_prefix = crate::lang::module_path_for_tsjs(&asset.workspace_path.0); + let language: tree_sitter::Language = tree_sitter_javascript::LANGUAGE.into(); + let blocks = build_blocks(&source, &doc_id, &mod_prefix, language)?; + let unit_count = blocks.len() as u32; + + let now = OffsetDateTime::now_utc(); + let mut events: Vec = Vec::with_capacity(2); + events.push(ProvenanceEvent { + at: asset.discovered_at, + agent: "kb-source-fs".to_string(), + kind: ProvenanceKind::Discovered, + note: None, + }); + events.push(ProvenanceEvent { + at: now, + agent: "kb-parse-code".to_string(), + kind: ProvenanceKind::Parsed, + note: Some(format!( + "parser_version={}; unit_count={}", + parser_version.0, unit_count + )), + }); + + let title = { + let fname = filename_from_workspace_path(&asset.workspace_path.0); + strip_extension(&fname) + }; + + // Resolve the file's absolute path for repo detection. 
If the + // source URI carries a relative path, anchor it at the workspace + // root so the `.git/` walk-up starts from the right place. + let abs_path = match &asset.source_uri { + kebab_core::SourceUri::File(p) => { + if p.is_absolute() { + p.clone() + } else { + ctx.workspace_root.join(p) + } + } + kebab_core::SourceUri::Kb(_) => ctx.workspace_root.to_path_buf(), + }; + let (repo, git_branch, git_commit) = match crate::repo::detect_repo(&abs_path) { + Some(r) => (Some(r.name), r.branch, r.commit), + None => (None, None, None), + }; + + let metadata = Metadata { + aliases: Vec::new(), + tags: Vec::new(), + created_at: asset.discovered_at, + updated_at: asset.discovered_at, + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Map::new(), + repo, + git_branch, + git_commit, + code_lang: Some("javascript".to_string()), + }; + + tracing::debug!( + target: "kebab-parse-code", + "extracted JavaScript doc_id={} workspace_path={} units={}", + doc_id.0, + asset.workspace_path.0, + unit_count + ); + + Ok(CanonicalDocument { + doc_id, + source_asset_id: asset.asset_id.clone(), + workspace_path: asset.workspace_path.clone(), + title, + lang: Lang("und".to_string()), + blocks, + metadata, + provenance: Provenance { events }, + parser_version, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + }) + } +} + +fn build_blocks( + source: &str, + doc_id: &kebab_core::DocumentId, + mod_prefix: &str, + language: tree_sitter::Language, +) -> anyhow::Result> { + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&language) + .map_err(|e| anyhow::anyhow!("set tree-sitter-javascript language: {e}"))?; + let tree = parser + .parse(source.as_bytes(), None) + .ok_or_else(|| anyhow::anyhow!("tree-sitter failed to parse JavaScript source"))?; + let lines: Vec<&str> = source.split('\n').collect(); + + // units: (symbol, line_start, line_end, is_real_semantic_unit). 
+ // Glue groups are pushed with a sentinel symbol + is_real=false so a + // post-pass can decide `` vs `` (same algorithm + // as 1A Gap 1 / 1B Python / 1B TS). + let mut units: Vec<(String, u32, u32, bool)> = Vec::new(); + // (is_module_only_kind 0/1, s, e). `is_module_only_kind` flags + // `import_statement` and bare re-export `export_statement`s — used by + // the glue flush to pick `` vs `` provisional + // label (1A's `is_mod_decl` analog). + let mut glue: Vec<(usize, u32, u32)> = Vec::new(); + + /// Walk preceding `comment` siblings to extend the unit's line range + /// upward, folding leading doc / line comments into the unit. + fn unit_start(n: &tree_sitter::Node) -> u32 { + let mut start = n.start_position().row as u32 + 1; + let mut prev = n.prev_sibling(); + while let Some(p) = prev { + if p.kind() == "comment" { + start = p.start_position().row as u32 + 1; + prev = p.prev_sibling(); + } else { + break; + } + } + start + } + fn name_text<'a>(n: &tree_sitter::Node, src: &'a str) -> Option<&'a str> { + n.child_by_field_name("name") + .map(|c| &src[c.start_byte()..c.end_byte()]) + } + /// Walk a class body, emitting one unit per `method_definition`. + /// Class names already pushed onto `mod_path` by the caller, so + /// method symbols come out as `..`. 
+ fn walk_class_body( + body: tree_sitter::Node, + src: &str, + mod_prefix: &str, + mod_path: &[String], + units: &mut Vec<(String, u32, u32, bool)>, + ) { + let mut cur = body.walk(); + for child in body.named_children(&mut cur) { + if child.kind() == "method_definition" { + if let Some(name) = name_text(&child, src) { + let s = unit_start(&child); + let e = child.end_position().row as u32 + 1; + let sym = join_symbol(mod_prefix, mod_path, name); + units.push((sym, s, e, true)); + } + } + } + } + fn walk( + node: tree_sitter::Node, + src: &str, + mod_prefix: &str, + mod_path: &[String], + units: &mut Vec<(String, u32, u32, bool)>, + glue: &mut Vec<(usize, u32, u32)>, + ) { + let mut cur = node.walk(); + for child in node.named_children(&mut cur) { + let s = unit_start(&child); + let e = child.end_position().row as u32 + 1; + match child.kind() { + "function_declaration" => { + if let Some(name) = name_text(&child, src) { + glue.retain(|(_, gs, _)| *gs < s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = join_symbol(mod_prefix, mod_path, name); + units.push((sym, s, e, true)); + } + } + "class_declaration" => { + if let Some(name) = name_text(&child, src) { + glue.retain(|(_, gs, _)| *gs < s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = join_symbol(mod_prefix, mod_path, name); + units.push((sym, s, e, true)); + if let Some(body) = child.child_by_field_name("body") { + let mut np = mod_path.to_vec(); + np.push(name.to_string()); + walk_class_body(body, src, mod_prefix, &np, units); + } + } + } + "export_statement" => { + // Try field "declaration" first (export class / + // function). If absent, fall back to "value" — + // `export default function () {}` / `export default + // class {}` expose the anonymous function_expression + // / class under the `value` field (same grammar + // quirk as TS 0.23). 
+ let outer_s = s; // includes `export ` prefix line + let outer_e = e; + if let Some(inner) = child.child_by_field_name("declaration") { + let inner_kind = inner.kind(); + match inner_kind { + "function_declaration" | "class_declaration" => { + let name_opt = name_text(&inner, src).map(|s| s.to_string()); + if let Some(name) = name_opt { + glue.retain(|(_, gs, _)| *gs < outer_s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = join_symbol(mod_prefix, mod_path, &name); + units.push((sym, outer_s, outer_e, true)); + if inner_kind == "class_declaration" { + if let Some(body) = inner.child_by_field_name("body") { + let mut np = mod_path.to_vec(); + np.push(name); + walk_class_body(body, src, mod_prefix, &np, units); + } + } + } else { + // Defensive: `export default` with a + // function_declaration that somehow + // lacks `name`. Emit `default`. + glue.retain(|(_, gs, _)| *gs < outer_s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = join_symbol(mod_prefix, mod_path, "default"); + units.push((sym, outer_s, outer_e, true)); + } + } + // `lexical_declaration` etc. wrapped in + // export: treat as glue (assigned arrow + // fns / consts don't get their own unit). + _ => { + glue.push((0, s, e)); + } + } + } else if let Some(value) = child.child_by_field_name("value") { + // `export default `. We emit a unit only + // for the function / class shapes (named or + // anonymous); other value shapes are glue. + match value.kind() { + "function_expression" + | "function_declaration" + | "class" + | "class_declaration" => { + let name_opt = name_text(&value, src).map(|s| s.to_string()); + let leaf = + name_opt.as_deref().unwrap_or("default").to_string(); + glue.retain(|(_, gs, _)| *gs < outer_s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = join_symbol(mod_prefix, mod_path, &leaf); + units.push((sym, outer_s, outer_e, true)); + // Recurse into class body if we have one. 
+ if matches!(value.kind(), "class" | "class_declaration") { + if let Some(body) = value.child_by_field_name("body") { + let mut np = mod_path.to_vec(); + np.push(leaf); + walk_class_body(body, src, mod_prefix, &np, units); + } + } + } + _ => { + glue.push((0, s, e)); + } + } + } else { + // Bare `export { x };` / `export * from "..."` — + // a re-export, glue with module-only flag set + // (we have no `declaration` / `value` field for + // it). + glue.push((1, s, e)); + } + } + "import_statement" => { + glue.push((1, s, e)); + } + "lexical_declaration" | "variable_declaration" => { + glue.push((0, s, e)); + } + _ => {} + } + } + flush_glue(glue, units, mod_prefix, mod_path); + } + fn flush_glue( + glue: &mut Vec<(usize, u32, u32)>, + units: &mut Vec<(String, u32, u32, bool)>, + mod_prefix: &str, + mod_path: &[String], + ) { + if glue.is_empty() { + return; + } + let s = glue.iter().map(|(_, a, _)| *a).min().unwrap(); + let e = glue.iter().map(|(_, _, b)| *b).max().unwrap(); + let only_module = glue.iter().all(|(is_mod, _, _)| *is_mod == 1); + let label = if only_module { "" } else { "" }; + units.push((join_symbol(mod_prefix, mod_path, label), s, e, false)); + glue.clear(); + } + + walk( + tree.root_node(), + source, + mod_prefix, + &[], + &mut units, + &mut glue, + ); + + // `` is correct only when the file produced no real unit. + // Otherwise the import-only group becomes `` (same + // post-pass as 1A Gap 1 / Python / TS). 
+ let has_real_unit = units.iter().any(|(_, _, _, is_real)| *is_real); + if has_real_unit { + for (sym, _, _, is_real) in units.iter_mut() { + if !*is_real && sym.ends_with("") { + let pre = &sym[..sym.len() - "".len()]; + *sym = format!("{pre}"); + } + } + } + + let total_lines = lines.len() as u32; + let mut blocks = Vec::with_capacity(units.len()); + for (ordinal, (symbol, ls, le, _is_real)) in units.into_iter().enumerate() { + let line_start = ls.max(1); + let line_end = le.min(total_lines.max(1)); + let span = SourceSpan::Code { + line_start, + line_end, + symbol: Some(symbol), + lang: Some("javascript".to_string()), + }; + let block_id = id_for_block(doc_id, "code", &[], ordinal as u32, &span); + let code = lines[(line_start as usize - 1)..=(line_end as usize - 1)].join("\n"); + blocks.push(Block::Code(CodeBlock { + common: CommonBlock { + block_id, + heading_path: Vec::new(), + source_span: span, + }, + lang: Some("javascript".to_string()), + code, + })); + } + Ok(blocks) +} + +#[cfg(test)] +mod tests { + use super::*; + use kebab_core::{Block, MediaType, SourceSpan}; + + fn extract_fixture(workspace_path: &str) -> kebab_core::CanonicalDocument { + let bytes = std::fs::read( + concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/sample.js"), + ) + .unwrap(); + let asset = crate::rust::tests_support::fixed_code_asset(workspace_path, "javascript"); + let cfg = kebab_core::ExtractConfig::default(); + let root = std::path::PathBuf::from("/tmp"); + let ctx = kebab_core::ExtractContext { + asset: &asset, + workspace_root: &root, + config: &cfg, + }; + JavascriptAstExtractor::new().extract(&ctx, &bytes).unwrap() + } + fn symbols(doc: &kebab_core::CanonicalDocument) -> Vec { + let mut s: Vec = doc + .blocks + .iter() + .filter_map(|b| match b { + Block::Code(c) => match &c.common.source_span { + SourceSpan::Code { symbol, lang, .. 
} => { + assert_eq!(lang.as_deref(), Some("javascript")); + symbol.clone() + } + _ => None, + }, + _ => None, + }) + .collect(); + s.sort(); + s + } + #[test] + fn extractor_supports_only_media_code_javascript() { + let e = JavascriptAstExtractor::new(); + assert!(e.supports(&MediaType::Code("javascript".into()))); + assert!(!e.supports(&MediaType::Code("typescript".into()))); + assert!(!e.supports(&MediaType::Markdown)); + } + #[test] + fn js_units_match_design_3_4_symbols() { + let doc = extract_fixture("src/sample.js"); + let syms = symbols(&doc); + assert!(syms.iter().any(|s| s == "src/sample.add"), "got {syms:?}"); + assert!(syms.iter().any(|s| s == "src/sample.Retriever")); + assert!(syms.iter().any(|s| s == "src/sample.Retriever.search")); + assert!(syms.iter().any(|s| s == "src/sample.Retriever.create")); + assert!(syms.iter().any(|s| s == "src/sample.default")); + assert!(syms.iter().any(|s| s == "src/sample.")); + } + #[test] + fn jsx_via_js_grammar() { + // tree-sitter-javascript handles .jsx via the same single grammar. + let bytes = b"export function App() { return null; }\n"; + let asset = crate::rust::tests_support::fixed_code_asset("src/App.jsx", "javascript"); + let cfg = kebab_core::ExtractConfig::default(); + let root = std::path::PathBuf::from("/tmp"); + let ctx = kebab_core::ExtractContext { + asset: &asset, + workspace_root: &root, + config: &cfg, + }; + let doc = JavascriptAstExtractor::new().extract(&ctx, bytes).unwrap(); + let syms = symbols(&doc); + assert!(syms.iter().any(|s| s == "src/App.App"), "got {syms:?}"); + } + #[test] + fn deterministic_across_runs() { + let a = extract_fixture("src/sample.js"); + for _ in 0..30 { + assert_eq!(extract_fixture("src/sample.js").blocks, a.blocks); + } + } + + /// In tree-sitter-javascript, `decorator` is a CHILD of + /// `method_definition` (stored in the `decorator` field), so + /// `method_definition.start_row` already covers the decorator line + /// without any sibling walk. 
Verify that the emitted unit already + /// includes the decorator line and line_start is 2 (the @Log() line). + #[test] + fn js_class_method_decorator_already_folded_by_grammar() { + // Line 1 (1-indexed): "class Foo {" + // Line 2: " @Log()" <- decorator (child of method_definition in JS grammar) + // Line 3: " bar() { return 1; }" + // Line 4: "}" + let bytes = b"class Foo {\n @Log()\n bar() { return 1; }\n}\n"; + let asset = crate::rust::tests_support::fixed_code_asset("src/foo.js", "javascript"); + let cfg = kebab_core::ExtractConfig::default(); + let root = std::path::PathBuf::from("/tmp"); + let ctx = kebab_core::ExtractContext { + asset: &asset, + workspace_root: &root, + config: &cfg, + }; + let doc = JavascriptAstExtractor::new().extract(&ctx, bytes).unwrap(); + + let bar_block = doc + .blocks + .iter() + .find_map(|b| match b { + Block::Code(c) => match &c.common.source_span { + SourceSpan::Code { symbol, .. } + if symbol.as_deref() == Some("src/foo.Foo.bar") => + { + Some(c) + } + _ => None, + }, + _ => None, + }) + .expect("src/foo.Foo.bar block should be present"); + + // JS grammar: method_definition.start_row == decorator row, so + // no sibling walk change needed -- decorator is already included. + assert!( + bar_block.code.contains("@Log()"), + "JS method unit must include decorator (grammar folds it natively); got: {:?}", + bar_block.code + ); + match &bar_block.common.source_span { + SourceSpan::Code { line_start, .. } => { + assert_eq!( + *line_start, 2, + "JS line_start must cover the @Log() decorator line (got {line_start})" + ); + } + _ => unreachable!(), + } + } +} diff --git a/crates/kebab-parse-code/src/lang.rs b/crates/kebab-parse-code/src/lang.rs index bd850f6..1128ac7 100644 --- a/crates/kebab-parse-code/src/lang.rs +++ b/crates/kebab-parse-code/src/lang.rs @@ -40,3 +40,73 @@ pub fn code_lang_for_path(path: &Path) -> Option<&'static str> { _ => None, } } + +/// p10-1B: workspace-relative Python file path → dotted module-path prefix. 
+/// See plan §Task C for the exact rules + tasks/p10/p10-1b for the §3.4 +/// design contract. +pub fn module_path_for_python(workspace_path: &str) -> String { + let mut p: &str = workspace_path; + if let Some(rest) = p.strip_prefix("crates/") { + if let Some(slash) = rest.find('/') { + let after = &rest[slash + 1..]; + if let Some(stripped) = after.strip_prefix("src/") { + p = stripped; + } + } + } else if let Some(stripped) = p.strip_prefix("src/") { + p = stripped; + } else if let Some(stripped) = p.strip_prefix("lib/") { + p = stripped; + } + let p = match p.strip_suffix(".py") { + Some(s) => s, + None => p.strip_suffix(".pyi").unwrap_or(p), + }; + let p = if let Some(parent) = p.strip_suffix("/__init__") { + parent + } else if p == "__init__" { + "" + } else { + p + }; + p.replace('/', ".") +} + +/// p10-1B: workspace-relative TS/JS file path → path-style prefix +/// (no slash replacement, no source-root strip). See plan §Task C. +pub fn module_path_for_tsjs(workspace_path: &str) -> String { + let p = workspace_path; + for ext in [".tsx", ".ts", ".jsx", ".mjs", ".cjs", ".js"] { + if let Some(stripped) = p.strip_suffix(ext) { + return stripped.to_string(); + } + } + p.to_string() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn module_path_for_python_strips_src_roots_and_extensions() { + assert_eq!(module_path_for_python("kebab_eval/metrics.py"), "kebab_eval.metrics"); + assert_eq!(module_path_for_python("kebab_eval/__init__.py"), "kebab_eval"); + assert_eq!(module_path_for_python("src/foo/bar.py"), "foo.bar"); + assert_eq!(module_path_for_python("crates/x/src/foo/bar.py"), "foo.bar"); + assert_eq!(module_path_for_python("a/b/c.pyi"), "a.b.c"); + assert_eq!(module_path_for_python("standalone.py"), "standalone"); + assert_eq!(module_path_for_python("src/__init__.py"), ""); + } + + #[test] + fn module_path_for_tsjs_keeps_slashes_and_strips_ext() { + for ext in ["ts", "tsx", "js", "jsx", "mjs", "cjs"] { + let p = 
format!("src/search/retriever/Retriever.{ext}"); + assert_eq!(module_path_for_tsjs(&p), "src/search/retriever/Retriever"); + } + assert_eq!(module_path_for_tsjs("foo.ts"), "foo"); + assert_eq!(module_path_for_tsjs("a/b/c.ts"), "a/b/c"); + assert_eq!(module_path_for_tsjs("packages/x/src/Foo.ts"), "packages/x/src/Foo"); + } +} diff --git a/crates/kebab-parse-code/src/lib.rs b/crates/kebab-parse-code/src/lib.rs index d9faefb..5118784 100644 --- a/crates/kebab-parse-code/src/lib.rs +++ b/crates/kebab-parse-code/src/lib.rs @@ -13,12 +13,19 @@ //! `kebab-parse-*` crates per design §8: must NOT depend on store / embed //! / llm / rag. +pub mod javascript; pub mod lang; +pub mod python; pub mod repo; pub mod rust; +pub(crate) mod scaffold; pub mod skip; +pub mod typescript; -pub use lang::code_lang_for_path; +pub use javascript::{PARSER_VERSION as JS_PARSER_VERSION, JavascriptAstExtractor}; +pub use lang::{code_lang_for_path, module_path_for_python, module_path_for_tsjs}; +pub use python::{PARSER_VERSION as PYTHON_PARSER_VERSION, PythonAstExtractor}; pub use repo::{RepoMeta, detect_repo}; pub use rust::{PARSER_VERSION as RUST_PARSER_VERSION, RustAstExtractor}; pub use skip::{BUILTIN_BLACKLIST, is_generated_file, is_oversized}; +pub use typescript::{PARSER_VERSION as TS_PARSER_VERSION, TypescriptAstExtractor}; diff --git a/crates/kebab-parse-code/src/python.rs b/crates/kebab-parse-code/src/python.rs new file mode 100644 index 0000000..e2b1ae7 --- /dev/null +++ b/crates/kebab-parse-code/src/python.rs @@ -0,0 +1,437 @@ +//! `kebab-parse-code::python` — tree-sitter Python AST extractor (P10-1B Task E). +//! +//! Implements [`kebab_core::Extractor`] for [`MediaType::Code("python")`]. +//! Walks the tree-sitter parse tree and emits one [`Block::Code`] per +//! top-level AST semantic unit (free fn, class, each method, recursively +//! per nested class), each carrying [`SourceSpan::Code`] with the unit's +//! 
dotted self-reference symbol path prefixed by `module_path_for_python` +//! (design §3.4). Glue declarations (`import` / `import from` / +//! `expression_statement` / `assignment` / `global_statement` / +//! `future_import_statement`) collapse into one grouped `` +//! (or ``) unit. +//! +//! Decorators are folded into the decorated unit's line range via the +//! `decorated_definition` unwrap arm (analog of the Rust `attribute_item` +//! re-absorption in 1A — see §9.1). +//! +//! Scope follows 1A: AST unit extraction + dotted symbol paths + line +//! ranges. Per design §3.4 / §9.1 / §9 versioning. + +use anyhow::Result; +use kebab_core::{ + Block, CanonicalDocument, CodeBlock, CommonBlock, Extractor, Lang, MediaType, Metadata, + ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType, TrustLevel, + id_for_block, id_for_doc, +}; +use serde_json::Map; +use time::OffsetDateTime; + +use crate::scaffold::{filename_from_workspace_path, join_symbol, strip_extension}; + +pub const PARSER_VERSION: &str = "code-python-v1"; + +/// Python AST extractor. Per-unit blocks via tree-sitter-python 0.25 +/// (`LANGUAGE: LanguageFn`) parsed by tree-sitter 0.26. 
+pub struct PythonAstExtractor; + +impl PythonAstExtractor { + pub fn new() -> Self { + Self + } +} + +impl Default for PythonAstExtractor { + fn default() -> Self { + Self::new() + } +} + +impl Extractor for PythonAstExtractor { + fn supports(&self, m: &MediaType) -> bool { + matches!(m, MediaType::Code(l) if l == "python") + } + + fn parser_version(&self) -> ParserVersion { + ParserVersion(PARSER_VERSION.to_string()) + } + + fn extract( + &self, + ctx: &kebab_core::ExtractContext<'_>, + bytes: &[u8], + ) -> Result { + let asset = ctx.asset; + if !self.supports(&asset.media_type) { + anyhow::bail!( + "kebab-parse-code: unsupported media_type for PythonAstExtractor: {:?}", + asset.media_type + ); + } + + let parser_version = self.parser_version(); + let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, &parser_version); + + let source = String::from_utf8(bytes.to_vec()).map_err(|e| { + anyhow::anyhow!("kebab-parse-code: Python source is not valid UTF-8: {e}") + })?; + + let mod_prefix = crate::lang::module_path_for_python(&asset.workspace_path.0); + let blocks = build_blocks(&source, &doc_id, &mod_prefix)?; + let unit_count = blocks.len() as u32; + + let now = OffsetDateTime::now_utc(); + let mut events: Vec = Vec::with_capacity(2); + events.push(ProvenanceEvent { + at: asset.discovered_at, + agent: "kb-source-fs".to_string(), + kind: ProvenanceKind::Discovered, + note: None, + }); + events.push(ProvenanceEvent { + at: now, + agent: "kb-parse-code".to_string(), + kind: ProvenanceKind::Parsed, + note: Some(format!( + "parser_version={}; unit_count={}", + parser_version.0, unit_count + )), + }); + + let title = { + let fname = filename_from_workspace_path(&asset.workspace_path.0); + strip_extension(&fname) + }; + + // Resolve the file's absolute path for repo detection. If the + // source URI carries a relative path, anchor it at the workspace + // root so the `.git/` walk-up starts from the right place. 
+ let abs_path = match &asset.source_uri { + kebab_core::SourceUri::File(p) => { + if p.is_absolute() { + p.clone() + } else { + ctx.workspace_root.join(p) + } + } + kebab_core::SourceUri::Kb(_) => ctx.workspace_root.to_path_buf(), + }; + let (repo, git_branch, git_commit) = match crate::repo::detect_repo(&abs_path) { + Some(r) => (Some(r.name), r.branch, r.commit), + None => (None, None, None), + }; + + let metadata = Metadata { + aliases: Vec::new(), + tags: Vec::new(), + created_at: asset.discovered_at, + updated_at: asset.discovered_at, + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Map::new(), + repo, + git_branch, + git_commit, + code_lang: Some("python".to_string()), + }; + + tracing::debug!( + target: "kebab-parse-code", + "extracted Python doc_id={} workspace_path={} units={}", + doc_id.0, + asset.workspace_path.0, + unit_count + ); + + Ok(CanonicalDocument { + doc_id, + source_asset_id: asset.asset_id.clone(), + workspace_path: asset.workspace_path.clone(), + title, + lang: Lang("und".to_string()), + blocks, + metadata, + provenance: Provenance { events }, + parser_version, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + }) + } +} + +fn build_blocks( + source: &str, + doc_id: &kebab_core::DocumentId, + mod_prefix: &str, +) -> anyhow::Result> { + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&tree_sitter_python::LANGUAGE.into()) + .map_err(|e| anyhow::anyhow!("set tree-sitter-python language: {e}"))?; + let tree = parser + .parse(source.as_bytes(), None) + .ok_or_else(|| anyhow::anyhow!("tree-sitter failed to parse Python source"))?; + let lines: Vec<&str> = source.split('\n').collect(); + + // units: (symbol, line_start, line_end, is_real_semantic_unit). + // Glue groups are pushed with a sentinel symbol + is_real=false so a + // post-pass can decide `` vs `` (same algorithm + // as 1A Gap 1). 
+ let mut units: Vec<(String, u32, u32, bool)> = Vec::new(); + // (is_import 0/1, s, e). `is_import` flags `import_statement` / + // `import_from_statement` / `future_import_statement` — used by the + // glue flush to pick `` vs `` provisional label + // (1A's `is_mod_decl` analog). + let mut glue: Vec<(usize, u32, u32)> = Vec::new(); + + fn node_name<'a>(n: &tree_sitter::Node, src: &'a str) -> Option<&'a str> { + n.child_by_field_name("name") + .map(|c| &src[c.start_byte()..c.end_byte()]) + } + /// Walk preceding `comment` siblings to extend the unit's line range + /// upward, folding leading doc / line comments into the unit. Note + /// that Python decorators are NOT preceding siblings — they live + /// INSIDE a `decorated_definition` parent — so they are handled by + /// the unwrap arm below, not here. + fn unit_start(n: &tree_sitter::Node) -> u32 { + let mut start = n.start_position().row as u32 + 1; + let mut prev = n.prev_sibling(); + while let Some(p) = prev { + if p.kind() == "comment" { + start = p.start_position().row as u32 + 1; + prev = p.prev_sibling(); + } else { + break; + } + } + start + } + fn walk( + node: tree_sitter::Node, + src: &str, + mod_prefix: &str, + mod_path: &[String], + units: &mut Vec<(String, u32, u32, bool)>, + glue: &mut Vec<(usize, u32, u32)>, + ) { + let mut cur = node.walk(); + for child in node.named_children(&mut cur) { + // Default unit line range — overridden by the + // `decorated_definition` unwrap arm so decorator lines are + // included. 
+ let s = unit_start(&child); + let e = child.end_position().row as u32 + 1; + match child.kind() { + "function_definition" => { + if let Some(name) = node_name(&child, src) { + glue.retain(|(_, gs, _)| *gs < s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = join_symbol(mod_prefix, mod_path, name); + units.push((sym, s, e, true)); + } + } + "class_definition" => { + if let Some(name) = node_name(&child, src) { + glue.retain(|(_, gs, _)| *gs < s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = join_symbol(mod_prefix, mod_path, name); + units.push((sym, s, e, true)); + // Recurse into the class body with the class + // name pushed onto mod_path; methods become + // `<...>..` and nested + // classes recurse further with both names. + if let Some(body) = child.child_by_field_name("body") { + let mut np = mod_path.to_vec(); + np.push(name.to_string()); + walk(body, src, mod_prefix, &np, units, glue); + debug_assert!( + glue.is_empty(), + "inner walk must flush its glue before returning" + ); + } + } + } + "decorated_definition" => { + // Unwrap: the inner definition supplies the symbol + // name, but the unit's line range comes from the + // OUTER `decorated_definition` so decorator lines + // are folded in (analog of `attribute_item` + // re-absorption in 1A — see plan §Task E note (b)). 
+ if let Some(inner) = child.child_by_field_name("definition") { + let outer_s = s; // already includes decorators + let outer_e = e; + match inner.kind() { + "function_definition" => { + if let Some(name) = node_name(&inner, src) { + glue.retain(|(_, gs, _)| *gs < outer_s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = join_symbol(mod_prefix, mod_path, name); + units.push((sym, outer_s, outer_e, true)); + } + } + "class_definition" => { + if let Some(name) = node_name(&inner, src) { + glue.retain(|(_, gs, _)| *gs < outer_s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = join_symbol(mod_prefix, mod_path, name); + units.push((sym, outer_s, outer_e, true)); + if let Some(body) = inner.child_by_field_name("body") { + let mut np = mod_path.to_vec(); + np.push(name.to_string()); + walk(body, src, mod_prefix, &np, units, glue); + debug_assert!( + glue.is_empty(), + "inner walk must flush its glue before returning" + ); + } + } + } + _ => {} + } + } + } + "import_statement" | "import_from_statement" | "future_import_statement" => { + glue.push((1, s, e)); + } + "expression_statement" | "assignment" | "global_statement" => { + glue.push((0, s, e)); + } + _ => {} + } + } + flush_glue(glue, units, mod_prefix, mod_path); + } + fn flush_glue( + glue: &mut Vec<(usize, u32, u32)>, + units: &mut Vec<(String, u32, u32, bool)>, + mod_prefix: &str, + mod_path: &[String], + ) { + if glue.is_empty() { + return; + } + let s = glue.iter().map(|(_, a, _)| *a).min().unwrap(); + let e = glue.iter().map(|(_, _, b)| *b).max().unwrap(); + // Provisional label: `` only if the group is exclusively + // imports (1A's `only_mod_decls` analog). The post-pass below + // demotes any `` to `` if the file produced + // any real unit. 
+ let only_imports = glue.iter().all(|(is_import, _, _)| *is_import == 1); + let label = if only_imports { "" } else { "" }; + units.push((join_symbol(mod_prefix, mod_path, label), s, e, false)); + glue.clear(); + } + + walk(tree.root_node(), source, mod_prefix, &[], &mut units, &mut glue); + + // `` is correct only when the file produced no real unit. + // Otherwise the import-only group becomes `` (same + // algorithm as 1A Gap 1). Match on the suffix so a class-nested + // glue group (which doesn't exist in current Python AST but is + // future-proofed) still demotes correctly. + let has_real_unit = units.iter().any(|(_, _, _, is_real)| *is_real); + if has_real_unit { + for (sym, _, _, is_real) in units.iter_mut() { + if !*is_real && sym.ends_with("") { + let pre = &sym[..sym.len() - "".len()]; + *sym = format!("{pre}"); + } + } + } + + let total_lines = lines.len() as u32; + let mut blocks = Vec::with_capacity(units.len()); + for (ordinal, (symbol, ls, le, _is_real)) in units.into_iter().enumerate() { + let line_start = ls.max(1); + let line_end = le.min(total_lines.max(1)); + let span = SourceSpan::Code { + line_start, + line_end, + symbol: Some(symbol), + lang: Some("python".to_string()), + }; + let block_id = id_for_block(doc_id, "code", &[], ordinal as u32, &span); + let code = lines[(line_start as usize - 1)..=(line_end as usize - 1)].join("\n"); + blocks.push(Block::Code(CodeBlock { + common: CommonBlock { + block_id, + heading_path: Vec::new(), + source_span: span, + }, + lang: Some("python".to_string()), + code, + })); + } + Ok(blocks) +} + +#[cfg(test)] +mod tests { + use super::*; + use kebab_core::{Block, MediaType, SourceSpan}; + + fn extract_fixture() -> kebab_core::CanonicalDocument { + let bytes = std::fs::read( + concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/sample.py"), + ) + .unwrap(); + let asset = crate::rust::tests_support::fixed_code_asset( + "kebab_eval/metrics.py", "python", + ); + let cfg = kebab_core::ExtractConfig::default(); + 
let root = std::path::PathBuf::from("/tmp"); + let ctx = kebab_core::ExtractContext { + asset: &asset, workspace_root: &root, config: &cfg, + }; + PythonAstExtractor::new().extract(&ctx, &bytes).unwrap() + } + + #[test] + fn extractor_supports_only_media_code_python() { + let e = PythonAstExtractor::new(); + assert!(e.supports(&MediaType::Code("python".into()))); + assert!(!e.supports(&MediaType::Code("rust".into()))); + assert!(!e.supports(&MediaType::Markdown)); + } + + #[test] + fn python_units_carry_module_prefixed_symbols() { + let doc = extract_fixture(); + let mut syms: Vec = doc.blocks.iter().map(|b| match b { + Block::Code(c) => match &c.common.source_span { + SourceSpan::Code { symbol, lang, .. } => { + assert_eq!(lang.as_deref(), Some("python")); + symbol.clone().unwrap() + } + _ => panic!("expected SourceSpan::Code"), + }, + other => panic!("expected Block::Code, got {other:?}"), + }).collect(); + syms.sort(); + assert!(syms.iter().any(|s| s == "kebab_eval.metrics.free")); + assert!(syms.iter().any(|s| s == "kebab_eval.metrics.Foo")); + assert!(syms.iter().any(|s| s == "kebab_eval.metrics.Foo.double")); + assert!(syms.iter().any(|s| s == "kebab_eval.metrics.Foo.name")); + assert!(syms.iter().any(|s| s == "kebab_eval.metrics.Outer")); + assert!(syms.iter().any(|s| s == "kebab_eval.metrics.Outer.Inner")); + assert!(syms.iter().any(|s| s == "kebab_eval.metrics.Outer.Inner.helper")); + assert!(syms.iter().any(|s| s == "kebab_eval.metrics.with_decorator")); + assert!(syms.iter().any(|s| s == "kebab_eval.metrics.")); + // The `@no_type_check` decorator on `free` is folded into its + // unit's line range (decorated_definition unwrap). 
+ let free_src = doc.blocks.iter().find_map(|b| match b { + Block::Code(c) if matches!(&c.common.source_span, + SourceSpan::Code{symbol,..} if symbol.as_deref()==Some("kebab_eval.metrics.free")) => Some(c.code.clone()), + _ => None, + }).unwrap(); + assert!(free_src.contains("@no_type_check"), "decorator folded in: {free_src}"); + } + + #[test] + fn deterministic_across_runs() { + let a = extract_fixture(); + for _ in 0..50 { assert_eq!(extract_fixture().blocks, a.blocks); } + } +} diff --git a/crates/kebab-parse-code/src/rust.rs b/crates/kebab-parse-code/src/rust.rs index 7dcf8cc..4b932a6 100644 --- a/crates/kebab-parse-code/src/rust.rs +++ b/crates/kebab-parse-code/src/rust.rs @@ -30,6 +30,8 @@ use kebab_core::{ use serde_json::Map; use time::OffsetDateTime; +use crate::scaffold::{filename_from_workspace_path, strip_extension}; + pub const PARSER_VERSION: &str = "code-rust-v1"; /// Rust AST extractor. Per-unit blocks via tree-sitter-rust 0.24 @@ -162,18 +164,6 @@ impl Extractor for RustAstExtractor { } } -fn filename_from_workspace_path(p: &str) -> String { - p.rsplit('/').next().unwrap_or(p).to_string() -} - -fn strip_extension(filename: &str) -> String { - match filename.rfind('.') { - Some(0) => filename.to_string(), - Some(idx) => filename[..idx].to_string(), - None => filename.to_string(), - } -} - fn build_blocks( source: &str, doc_id: &kebab_core::DocumentId, @@ -393,7 +383,7 @@ mod tests { concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/sample.rs"), ) .unwrap(); - let asset = kebab_parse_code_test_support::fixed_rust_asset("crates/x/src/sample.rs"); + let asset = tests_support::fixed_code_asset("crates/x/src/sample.rs", "rust"); let cfg = kebab_core::ExtractConfig::default(); let root = std::path::PathBuf::from("/tmp"); let ctx = kebab_core::ExtractContext { asset: &asset, workspace_root: &root, config: &cfg }; @@ -444,7 +434,7 @@ mod tests { /// Run the extractor on an in-memory Rust source string (no fixture /// file) and return (symbol, code) for 
every emitted block. fn extract_inline(source: &str) -> Vec<(String, String)> { - let asset = kebab_parse_code_test_support::fixed_rust_asset("crates/x/src/inline.rs"); + let asset = tests_support::fixed_code_asset("crates/x/src/inline.rs", "rust"); let cfg = kebab_core::ExtractConfig::default(); let root = std::path::PathBuf::from("/tmp"); let ctx = kebab_core::ExtractContext { asset: &asset, workspace_root: &root, config: &cfg }; @@ -531,20 +521,23 @@ mod tests { } #[cfg(test)] -mod kebab_parse_code_test_support { +pub(crate) mod tests_support { use kebab_core::*; use time::OffsetDateTime; - pub fn fixed_rust_asset(path: &str) -> RawAsset { + /// Test-only `RawAsset` builder for any tree-sitter language. Shared + /// across `rust.rs` / `python.rs` / future TS+JS extractor tests so all + /// in-crate code-extractor tests use a single canonical fixture shape. + pub fn fixed_code_asset(workspace_path: &str, code_lang: &str) -> RawAsset { RawAsset { asset_id: AssetId("a".repeat(64)), - source_uri: SourceUri::File(std::path::PathBuf::from(path)), - workspace_path: WorkspacePath(path.to_string()), - media_type: MediaType::Code("rust".to_string()), + source_uri: SourceUri::File(std::path::PathBuf::from(workspace_path)), + workspace_path: WorkspacePath(workspace_path.to_string()), + media_type: MediaType::Code(code_lang.to_string()), byte_len: 0, checksum: Checksum("b".repeat(64)), discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), stored: AssetStorage::Reference { - path: std::path::PathBuf::from(path), + path: std::path::PathBuf::from(workspace_path), sha: Checksum("b".repeat(64)), }, } diff --git a/crates/kebab-parse-code/src/scaffold.rs b/crates/kebab-parse-code/src/scaffold.rs new file mode 100644 index 0000000..6900fed --- /dev/null +++ b/crates/kebab-parse-code/src/scaffold.rs @@ -0,0 +1,45 @@ +//! `kebab-parse-code::scaffold` — shared pure helpers used by all +//! per-language extractor modules. +//! +//! 
These are `pub(crate)` utilities extracted from the four extractor +//! modules (rust / python / typescript / javascript) where identical +//! copies existed. Keeping them here is the single source of truth. + +/// Extract the last path component (filename) from a `/`-separated +/// workspace path string. +/// For a path like `crates/x/src/foo.rs` this returns `foo.rs`. +pub(crate) fn filename_from_workspace_path(p: &str) -> String { + p.rsplit('/').next().unwrap_or(p).to_string() +} + +/// Strip the last dot-extension from a filename string. +/// A leading dot (hidden-file convention) is preserved as-is. +/// `foo.rs` → `foo`, `.hidden` → `.hidden`, `noext` → `noext`. +pub(crate) fn strip_extension(filename: &str) -> String { + match filename.rfind('.') { + Some(0) => filename.to_string(), + Some(idx) => filename[..idx].to_string(), + None => filename.to_string(), + } +} + +/// Join `(mod_prefix, mod_path, name)` into a dotted symbol string. +/// +/// Used by Python / TypeScript / JavaScript extractors. Rust uses +/// `::` separators instead and builds symbols inline; this helper +/// covers the `.`-joined languages. +/// +/// Empty `mod_prefix` (e.g. file is `__init__.py` at workspace root) +/// drops the leading prefix segment; empty `mod_path` (file top-level) +/// drops the class-nesting middle segment. +pub(crate) fn join_symbol(mod_prefix: &str, mod_path: &[String], name: &str) -> String { + let mut parts: Vec<&str> = Vec::with_capacity(mod_path.len() + 2); + if !mod_prefix.is_empty() { + parts.push(mod_prefix); + } + for p in mod_path { + parts.push(p.as_str()); + } + parts.push(name); + parts.join(".") +} diff --git a/crates/kebab-parse-code/src/typescript.rs b/crates/kebab-parse-code/src/typescript.rs new file mode 100644 index 0000000..2fb4e97 --- /dev/null +++ b/crates/kebab-parse-code/src/typescript.rs @@ -0,0 +1,690 @@ +//! `kebab-parse-code::typescript` — tree-sitter TypeScript / TSX AST +//! extractor (P10-1B Task H). +//! +//! 
Implements [`kebab_core::Extractor`] for [`MediaType::Code("typescript")`]. +//! Walks the tree-sitter parse tree (one of two grammars selected by the +//! workspace path's extension — `.tsx` uses [`tree_sitter_typescript::LANGUAGE_TSX`], +//! everything else uses [`tree_sitter_typescript::LANGUAGE_TYPESCRIPT`]) and +//! emits one [`Block::Code`] per top-level AST semantic unit (free fn, +//! class, each method, interface, type alias, enum, recursively per +//! nested class), each carrying [`SourceSpan::Code`] with the unit's +//! dotted symbol path prefixed by [`module_path_for_tsjs`]. +//! +//! Glue declarations (`import_statement`, bare `export_statement` +//! re-exports, `lexical_declaration` / `variable_declaration` at the +//! module level, namespace / module declarations, etc.) collapse into +//! one grouped `` (or ``) unit. +//! +//! `export_statement` is unwrapped: an `export function|class|interface +//! |type|enum` is treated as the inner declaration arm but the unit's +//! line range comes from the OUTER `export_statement` so the `export ` +//! prefix is folded in. `export default function () {}` / `export +//! default class {}` (no `name` field) emits `default` as the symbol +//! name. +//! +//! Scope follows 1A-2 / 1B Task E: AST unit extraction + dotted symbol +//! paths + line ranges. Per design §3.4 / §9.1 / §9 versioning. + +use anyhow::Result; +use kebab_core::{ + Block, CanonicalDocument, CodeBlock, CommonBlock, Extractor, Lang, MediaType, Metadata, + ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType, TrustLevel, + id_for_block, id_for_doc, +}; +use serde_json::Map; +use time::OffsetDateTime; + +use crate::scaffold::{filename_from_workspace_path, join_symbol, strip_extension}; + +pub const PARSER_VERSION: &str = "code-ts-v1"; + +/// TypeScript / TSX AST extractor. 
Per-unit blocks via +/// tree-sitter-typescript 0.23 (`LANGUAGE_TYPESCRIPT` / `LANGUAGE_TSX` +/// — two `LanguageFn`s, selected by extension) parsed by tree-sitter +/// 0.26. +pub struct TypescriptAstExtractor; + +impl TypescriptAstExtractor { + pub fn new() -> Self { + Self + } +} + +impl Default for TypescriptAstExtractor { + fn default() -> Self { + Self::new() + } +} + +impl Extractor for TypescriptAstExtractor { + fn supports(&self, m: &MediaType) -> bool { + matches!(m, MediaType::Code(l) if l == "typescript") + } + + fn parser_version(&self) -> ParserVersion { + ParserVersion(PARSER_VERSION.to_string()) + } + + fn extract( + &self, + ctx: &kebab_core::ExtractContext<'_>, + bytes: &[u8], + ) -> Result { + let asset = ctx.asset; + if !self.supports(&asset.media_type) { + anyhow::bail!( + "kebab-parse-code: unsupported media_type for TypescriptAstExtractor: {:?}", + asset.media_type + ); + } + + let parser_version = self.parser_version(); + let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, &parser_version); + + let source = String::from_utf8(bytes.to_vec()).map_err(|e| { + anyhow::anyhow!("kebab-parse-code: TypeScript source is not valid UTF-8: {e}") + })?; + + let mod_prefix = crate::lang::module_path_for_tsjs(&asset.workspace_path.0); + let language = select_grammar(&asset.workspace_path.0); + let blocks = build_blocks(&source, &doc_id, &mod_prefix, language)?; + let unit_count = blocks.len() as u32; + + let now = OffsetDateTime::now_utc(); + let mut events: Vec = Vec::with_capacity(2); + events.push(ProvenanceEvent { + at: asset.discovered_at, + agent: "kb-source-fs".to_string(), + kind: ProvenanceKind::Discovered, + note: None, + }); + events.push(ProvenanceEvent { + at: now, + agent: "kb-parse-code".to_string(), + kind: ProvenanceKind::Parsed, + note: Some(format!( + "parser_version={}; unit_count={}", + parser_version.0, unit_count + )), + }); + + let title = { + let fname = filename_from_workspace_path(&asset.workspace_path.0); + 
strip_extension(&fname) + }; + + // Resolve the file's absolute path for repo detection. If the + // source URI carries a relative path, anchor it at the workspace + // root so the `.git/` walk-up starts from the right place. + let abs_path = match &asset.source_uri { + kebab_core::SourceUri::File(p) => { + if p.is_absolute() { + p.clone() + } else { + ctx.workspace_root.join(p) + } + } + kebab_core::SourceUri::Kb(_) => ctx.workspace_root.to_path_buf(), + }; + let (repo, git_branch, git_commit) = match crate::repo::detect_repo(&abs_path) { + Some(r) => (Some(r.name), r.branch, r.commit), + None => (None, None, None), + }; + + let metadata = Metadata { + aliases: Vec::new(), + tags: Vec::new(), + created_at: asset.discovered_at, + updated_at: asset.discovered_at, + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Map::new(), + repo, + git_branch, + git_commit, + code_lang: Some("typescript".to_string()), + }; + + tracing::debug!( + target: "kebab-parse-code", + "extracted TypeScript doc_id={} workspace_path={} units={}", + doc_id.0, + asset.workspace_path.0, + unit_count + ); + + Ok(CanonicalDocument { + doc_id, + source_asset_id: asset.asset_id.clone(), + workspace_path: asset.workspace_path.clone(), + title, + lang: Lang("und".to_string()), + blocks, + metadata, + provenance: Provenance { events }, + parser_version, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + }) + } +} + +/// Select the tree-sitter grammar based on the workspace path's +/// extension. `.tsx` → TSX grammar; everything else (`.ts`, `.d.ts`, +/// missing extension) → TypeScript grammar. 
+fn select_grammar(workspace_path: &str) -> tree_sitter::Language { + if workspace_path.ends_with(".tsx") { + tree_sitter_typescript::LANGUAGE_TSX.into() + } else { + tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into() + } +} + +fn build_blocks( + source: &str, + doc_id: &kebab_core::DocumentId, + mod_prefix: &str, + language: tree_sitter::Language, +) -> anyhow::Result> { + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&language) + .map_err(|e| anyhow::anyhow!("set tree-sitter-typescript language: {e}"))?; + let tree = parser + .parse(source.as_bytes(), None) + .ok_or_else(|| anyhow::anyhow!("tree-sitter failed to parse TypeScript source"))?; + let lines: Vec<&str> = source.split('\n').collect(); + + // units: (symbol, line_start, line_end, is_real_semantic_unit). + // Glue groups are pushed with a sentinel symbol + is_real=false so a + // post-pass can decide `` vs `` (same algorithm + // as 1A Gap 1 / 1B Python). + let mut units: Vec<(String, u32, u32, bool)> = Vec::new(); + // (is_module_only_kind 0/1, s, e). `is_module_only_kind` flags + // `import_statement` and bare re-export `export_statement`s — used by + // the glue flush to pick `` vs `` provisional + // label (1A's `is_mod_decl` analog). + let mut glue: Vec<(usize, u32, u32)> = Vec::new(); + + /// Walk preceding `comment` and `decorator` siblings to extend the + /// unit's line range upward, folding leading doc/line comments and + /// decorators into the unit. + /// + /// In tree-sitter-typescript 0.23, TS class-method decorators (and + /// class-level decorators) are **`class_body` siblings** that + /// immediately precede the `method_definition` node — they are NOT + /// children of `method_definition`. 
(Contrast with + /// tree-sitter-javascript, where the `decorator` IS stored inside + /// `method_definition` as a named child via the `decorator` field, so + /// `method_definition.start_row` already covers the decorator line + /// there — no sibling walk needed in `javascript.rs`.) + /// + /// Extending backward over `decorator` siblings here matches Python's + /// `decorated_definition` arm behavior: the decorator line is folded + /// into the emitted unit's line range. + fn unit_start(n: &tree_sitter::Node) -> u32 { + let mut start = n.start_position().row as u32 + 1; + let mut prev = n.prev_sibling(); + while let Some(p) = prev { + if p.kind() == "comment" || p.kind() == "decorator" { + start = p.start_position().row as u32 + 1; + prev = p.prev_sibling(); + } else { + break; + } + } + start + } + fn name_text<'a>(n: &tree_sitter::Node, src: &'a str) -> Option<&'a str> { + n.child_by_field_name("name") + .map(|c| &src[c.start_byte()..c.end_byte()]) + } + /// Walk a class body, emitting one unit per `method_definition`. + /// Class names already pushed onto `mod_path` by the caller, so + /// method symbols come out as `..`. 
+ fn walk_class_body( + body: tree_sitter::Node, + src: &str, + mod_prefix: &str, + mod_path: &[String], + units: &mut Vec<(String, u32, u32, bool)>, + ) { + let mut cur = body.walk(); + for child in body.named_children(&mut cur) { + if child.kind() == "method_definition" { + if let Some(name) = name_text(&child, src) { + let s = unit_start(&child); + let e = child.end_position().row as u32 + 1; + let sym = join_symbol(mod_prefix, mod_path, name); + units.push((sym, s, e, true)); + } + } + } + } + fn walk( + node: tree_sitter::Node, + src: &str, + mod_prefix: &str, + mod_path: &[String], + units: &mut Vec<(String, u32, u32, bool)>, + glue: &mut Vec<(usize, u32, u32)>, + ) { + let mut cur = node.walk(); + for child in node.named_children(&mut cur) { + let s = unit_start(&child); + let e = child.end_position().row as u32 + 1; + match child.kind() { + "function_declaration" => { + if let Some(name) = name_text(&child, src) { + glue.retain(|(_, gs, _)| *gs < s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = join_symbol(mod_prefix, mod_path, name); + units.push((sym, s, e, true)); + } + } + "class_declaration" => { + if let Some(name) = name_text(&child, src) { + glue.retain(|(_, gs, _)| *gs < s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = join_symbol(mod_prefix, mod_path, name); + units.push((sym, s, e, true)); + if let Some(body) = child.child_by_field_name("body") { + let mut np = mod_path.to_vec(); + np.push(name.to_string()); + walk_class_body(body, src, mod_prefix, &np, units); + } + } + } + "interface_declaration" + | "type_alias_declaration" + | "enum_declaration" => { + if let Some(name) = name_text(&child, src) { + glue.retain(|(_, gs, _)| *gs < s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = join_symbol(mod_prefix, mod_path, name); + units.push((sym, s, e, true)); + } + } + "export_statement" => { + // Try field "declaration" first (export class / + // function / interface / type / enum). 
If absent, + // fall back to "value" — `export default function + // () {}` / `export default class {}` expose the + // anonymous function_expression / class under the + // `value` field (TS grammar 0.23). + let outer_s = s; // includes `export ` prefix line + let outer_e = e; + if let Some(inner) = child.child_by_field_name("declaration") { + let inner_kind = inner.kind(); + match inner_kind { + "function_declaration" + | "class_declaration" + | "interface_declaration" + | "type_alias_declaration" + | "enum_declaration" => { + let name_opt = name_text(&inner, src).map(|s| s.to_string()); + if let Some(name) = name_opt { + glue.retain(|(_, gs, _)| *gs < outer_s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = + join_symbol(mod_prefix, mod_path, &name); + units.push((sym, outer_s, outer_e, true)); + if inner_kind == "class_declaration" { + if let Some(body) = + inner.child_by_field_name("body") + { + let mut np = mod_path.to_vec(); + np.push(name); + walk_class_body( + body, src, mod_prefix, &np, units, + ); + } + } + } else { + // `export default function foo() {}` + // path is covered by name_opt = + // Some(_) above; the no-name path + // here is `export default` with a + // function_declaration that + // somehow lacks `name`. Emit + // `default` defensively. + glue.retain(|(_, gs, _)| *gs < outer_s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = + join_symbol(mod_prefix, mod_path, "default"); + units.push((sym, outer_s, outer_e, true)); + } + } + // `lexical_declaration` etc. wrapped in + // export: treat as glue (assigned arrow + // fns / consts don't get their own unit). + _ => { + glue.push((0, s, e)); + } + } + } else if let Some(value) = child.child_by_field_name("value") { + // `export default `. We emit a unit only + // for the function / class shapes (named or + // anonymous); other value shapes are glue. 
+ match value.kind() { + "function_expression" + | "function_declaration" + | "class" + | "class_declaration" => { + let name_opt = + name_text(&value, src).map(|s| s.to_string()); + let leaf = name_opt + .as_deref() + .unwrap_or("default") + .to_string(); + glue.retain(|(_, gs, _)| *gs < outer_s); + flush_glue(glue, units, mod_prefix, mod_path); + let sym = join_symbol(mod_prefix, mod_path, &leaf); + units.push((sym, outer_s, outer_e, true)); + // Recurse into class body if we have one. + if matches!( + value.kind(), + "class" | "class_declaration" + ) { + if let Some(body) = + value.child_by_field_name("body") + { + let mut np = mod_path.to_vec(); + np.push(leaf); + walk_class_body( + body, src, mod_prefix, &np, units, + ); + } + } + } + _ => { + glue.push((0, s, e)); + } + } + } else { + // Bare `export { x };` / `export * from "..."` — + // a re-export, glue with module-only flag set + // (we have no `declaration` / `value` field for + // it). + glue.push((1, s, e)); + } + } + "import_statement" => { + glue.push((1, s, e)); + } + "lexical_declaration" | "variable_declaration" => { + glue.push((0, s, e)); + } + // Namespace / module declarations (rare in app code, + // common in `.d.ts`): treat as glue per plan §Task H + // (1B 1차 scope; documented under spec Risks). 
+ "internal_module" | "module" | "ambient_declaration" => { + glue.push((0, s, e)); + } + _ => {} + } + } + flush_glue(glue, units, mod_prefix, mod_path); + } + fn flush_glue( + glue: &mut Vec<(usize, u32, u32)>, + units: &mut Vec<(String, u32, u32, bool)>, + mod_prefix: &str, + mod_path: &[String], + ) { + if glue.is_empty() { + return; + } + let s = glue.iter().map(|(_, a, _)| *a).min().unwrap(); + let e = glue.iter().map(|(_, _, b)| *b).max().unwrap(); + let only_module = glue.iter().all(|(is_mod, _, _)| *is_mod == 1); + let label = if only_module { "" } else { "" }; + units.push((join_symbol(mod_prefix, mod_path, label), s, e, false)); + glue.clear(); + } + + walk( + tree.root_node(), + source, + mod_prefix, + &[], + &mut units, + &mut glue, + ); + + // `` is correct only when the file produced no real unit. + // Otherwise the import-only group becomes `` (same + // post-pass as 1A Gap 1 / Python). + let has_real_unit = units.iter().any(|(_, _, _, is_real)| *is_real); + if has_real_unit { + for (sym, _, _, is_real) in units.iter_mut() { + if !*is_real && sym.ends_with("") { + let pre = &sym[..sym.len() - "".len()]; + *sym = format!("{pre}"); + } + } + } + + let total_lines = lines.len() as u32; + let mut blocks = Vec::with_capacity(units.len()); + for (ordinal, (symbol, ls, le, _is_real)) in units.into_iter().enumerate() { + let line_start = ls.max(1); + let line_end = le.min(total_lines.max(1)); + let span = SourceSpan::Code { + line_start, + line_end, + symbol: Some(symbol), + lang: Some("typescript".to_string()), + }; + let block_id = id_for_block(doc_id, "code", &[], ordinal as u32, &span); + let code = lines[(line_start as usize - 1)..=(line_end as usize - 1)].join("\n"); + blocks.push(Block::Code(CodeBlock { + common: CommonBlock { + block_id, + heading_path: Vec::new(), + source_span: span, + }, + lang: Some("typescript".to_string()), + code, + })); + } + Ok(blocks) +} + +#[cfg(test)] +mod tests { + use super::*; + use kebab_core::{Block, MediaType, 
SourceSpan}; + + fn extract_fixture(name: &str, workspace_path: &str) -> kebab_core::CanonicalDocument { + let bytes = std::fs::read(format!( + concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/{}"), + name + )) + .unwrap(); + let asset = crate::rust::tests_support::fixed_code_asset(workspace_path, "typescript"); + let cfg = kebab_core::ExtractConfig::default(); + let root = std::path::PathBuf::from("/tmp"); + let ctx = kebab_core::ExtractContext { + asset: &asset, + workspace_root: &root, + config: &cfg, + }; + TypescriptAstExtractor::new() + .extract(&ctx, &bytes) + .unwrap() + } + + fn symbols(doc: &kebab_core::CanonicalDocument) -> Vec { + let mut s: Vec = doc + .blocks + .iter() + .filter_map(|b| match b { + Block::Code(c) => match &c.common.source_span { + SourceSpan::Code { symbol, lang, .. } => { + assert_eq!(lang.as_deref(), Some("typescript")); + symbol.clone() + } + _ => None, + }, + _ => None, + }) + .collect(); + s.sort(); + s + } + + #[test] + fn extractor_supports_only_media_code_typescript() { + let e = TypescriptAstExtractor::new(); + assert!(e.supports(&MediaType::Code("typescript".into()))); + assert!(!e.supports(&MediaType::Code("rust".into()))); + assert!(!e.supports(&MediaType::Markdown)); + } + + #[test] + fn ts_units_match_design_3_4_symbols() { + // workspace_path `src/sample.ts` → mod_prefix `src/sample` + let doc = extract_fixture("sample.ts", "src/sample.ts"); + let syms = symbols(&doc); + assert!(syms.iter().any(|s| s == "src/sample.add"), "got {syms:?}"); + assert!(syms.iter().any(|s| s == "src/sample.Greet")); + assert!(syms.iter().any(|s| s == "src/sample.Maybe")); + assert!(syms.iter().any(|s| s == "src/sample.Retriever")); + assert!(syms.iter().any(|s| s == "src/sample.Retriever.search")); + assert!(syms.iter().any(|s| s == "src/sample.Retriever.create")); + assert!(syms.iter().any(|s| s == "src/sample.default")); + assert!(syms.iter().any(|s| s == "src/sample.")); + } + + #[test] + fn tsx_uses_tsx_grammar_and_emits_units() { + 
let doc = extract_fixture("sample.tsx", "src/sample.tsx"); + let syms = symbols(&doc); + assert!( + syms.iter().any(|s| s == "src/sample.Hello"), + "got {syms:?}" + ); + assert!( + syms.iter().any(|s| s == "src/sample."), + "arrow fn + import should roll into top-level glue" + ); + } + + #[test] + fn deterministic_across_runs() { + let a = extract_fixture("sample.ts", "src/sample.ts"); + for _ in 0..30 { + assert_eq!(extract_fixture("sample.ts", "src/sample.ts").blocks, a.blocks); + } + } + + /// Regression: TS class-method decorators are `class_body` preceding + /// siblings (not children of `method_definition`). The `unit_start` + /// backward walk must fold the decorator line into the emitted unit's + /// line range, matching Python's `decorated_definition` behavior. + #[test] + fn class_method_decorator_folded_into_method_unit() { + // Line 1 (1-indexed): "class Foo {" + // Line 2: " @Log()" <- decorator + // Line 3: " bar() { return 1; }" + // Line 4: "}" + let bytes = b"class Foo {\n @Log()\n bar() { return 1; }\n}\n"; + let asset = crate::rust::tests_support::fixed_code_asset("src/foo.ts", "typescript"); + let cfg = kebab_core::ExtractConfig::default(); + let root = std::path::PathBuf::from("/tmp"); + let ctx = kebab_core::ExtractContext { + asset: &asset, + workspace_root: &root, + config: &cfg, + }; + let doc = TypescriptAstExtractor::new().extract(&ctx, bytes).unwrap(); + + let bar_block = doc + .blocks + .iter() + .find_map(|b| match b { + Block::Code(c) => match &c.common.source_span { + SourceSpan::Code { symbol, .. } + if symbol.as_deref() == Some("src/foo.Foo.bar") => + { + Some(c) + } + _ => None, + }, + _ => None, + }) + .expect("src/foo.Foo.bar block should be present"); + + // After the fix, the unit MUST include the @Log() decorator line. 
+ assert!( + bar_block.code.contains("@Log()"), + "decorator must be folded into class-method unit (Python parity); got code: {:?}", + bar_block.code + ); + + // line_start must be 2 (the @Log() line), NOT 3 (the bar() line). + match &bar_block.common.source_span { + SourceSpan::Code { line_start, .. } => { + assert_eq!( + *line_start, 2, + "line_start must cover the @Log() decorator line (got {line_start})" + ); + } + _ => unreachable!(), + } + } + + /// Class-level decorator (preceding sibling of `class_declaration` in + /// the module root): same `unit_start` backward walk folds it in. + /// Line 1: "@Injectable()" + /// Line 2: "class Service {" + /// Line 3: "}" + #[test] + fn ts_class_decorator_folded_into_class_unit() { + let bytes = b"@Injectable()\nclass Service {\n}\n"; + let asset = crate::rust::tests_support::fixed_code_asset("src/svc.ts", "typescript"); + let cfg = kebab_core::ExtractConfig::default(); + let root = std::path::PathBuf::from("/tmp"); + let ctx = kebab_core::ExtractContext { + asset: &asset, + workspace_root: &root, + config: &cfg, + }; + let doc = TypescriptAstExtractor::new().extract(&ctx, bytes).unwrap(); + + let svc_block = doc + .blocks + .iter() + .find_map(|b| match b { + Block::Code(c) => match &c.common.source_span { + SourceSpan::Code { symbol, .. } + if symbol.as_deref() == Some("src/svc.Service") => + { + Some(c) + } + _ => None, + }, + _ => None, + }) + .expect("src/svc.Service block should be present"); + + assert!( + svc_block.code.contains("@Injectable()"), + "class-level decorator must be folded into the class unit; got code: {:?}", + svc_block.code + ); + match &svc_block.common.source_span { + SourceSpan::Code { line_start, .. 
} => { + assert_eq!( + *line_start, 1, + "line_start must cover the @Injectable() line (got {line_start})" + ); + } + _ => unreachable!(), + } + } +} diff --git a/crates/kebab-parse-code/tests/fixtures/sample.js b/crates/kebab-parse-code/tests/fixtures/sample.js new file mode 100644 index 0000000..3f95e42 --- /dev/null +++ b/crates/kebab-parse-code/tests/fixtures/sample.js @@ -0,0 +1,9 @@ +// sample.js +import { x } from "./other"; +const ANSWER = 42; +export function add(a, b) { return a + b; } +export class Retriever { + search(q) { return []; } + static create() { return new Retriever(); } +} +export default function () { return 1; } diff --git a/crates/kebab-parse-code/tests/fixtures/sample.py b/crates/kebab-parse-code/tests/fixtures/sample.py new file mode 100644 index 0000000..b19d906 --- /dev/null +++ b/crates/kebab-parse-code/tests/fixtures/sample.py @@ -0,0 +1,26 @@ +"""sample fixture.""" +import os + +ANSWER = 42 + +@no_type_check +def free(x): + """free fn.""" + return x + 1 + +class Foo: + """doc.""" + def double(self, n): + return n * 2 + + @classmethod + def name(cls): + return "foo" + +class Outer: + class Inner: + def helper(self): + return True + +def with_decorator(): + pass diff --git a/crates/kebab-parse-code/tests/fixtures/sample.ts b/crates/kebab-parse-code/tests/fixtures/sample.ts new file mode 100644 index 0000000..f390b89 --- /dev/null +++ b/crates/kebab-parse-code/tests/fixtures/sample.ts @@ -0,0 +1,11 @@ +// sample.ts +import { x } from "./other"; +const ANSWER = 42; +export interface Greet { hello(): string; } +export type Maybe = T | null; +export function add(a: number, b: number): number { return a + b; } +export class Retriever { + search(q: string): string[] { return []; } + static create(): Retriever { return new Retriever(); } +} +export default function () { return 1; } diff --git a/crates/kebab-parse-code/tests/fixtures/sample.tsx b/crates/kebab-parse-code/tests/fixtures/sample.tsx new file mode 100644 index 0000000..0fd1a5e --- 
/dev/null +++ b/crates/kebab-parse-code/tests/fixtures/sample.tsx @@ -0,0 +1,4 @@ +// sample.tsx +import React from "react"; +export function Hello({ name }: { name: string }) { return {name}; } +export const App = () => ; // arrow fn assigned → glue diff --git a/crates/kebab-source-fs/src/media.rs b/crates/kebab-source-fs/src/media.rs index 68db875..c84ce7f 100644 --- a/crates/kebab-source-fs/src/media.rs +++ b/crates/kebab-source-fs/src/media.rs @@ -38,6 +38,11 @@ pub(crate) fn media_type_for(path: &Path) -> MediaType { // recognized code langs stay Other until their phase (1B+). "rs" => MediaType::Code("rust".to_string()), + // p10-1B: Python / TS / JS AST chunkers active. + "py" | "pyi" => MediaType::Code("python".into()), + "ts" | "tsx" => MediaType::Code("typescript".into()), + "js" | "mjs" | "cjs" | "jsx" => MediaType::Code("javascript".into()), + // Empty string (no extension) and any other extension: bucket as // Other and let downstream extractors decide if they support it. _ => MediaType::Other(ext), @@ -81,11 +86,22 @@ mod tests { media_type_for(Path::new("crates/kebab-core/src/lib.rs")), MediaType::Code("rust".to_string()) ); - // non-Rust code extensions stay Other in 1A - assert_eq!(media_type_for(Path::new("a/b.py")), MediaType::Other("py".to_string())); assert_eq!(media_type_for(Path::new("Cargo.toml")), MediaType::Other("toml".to_string())); } + #[test] + fn py_ts_js_files_map_to_media_code() { + assert_eq!(media_type_for(Path::new("a/b.py")), MediaType::Code("python".into())); + assert_eq!(media_type_for(Path::new("a/b.pyi")), MediaType::Code("python".into())); + assert_eq!(media_type_for(Path::new("a/b.ts")), MediaType::Code("typescript".into())); + assert_eq!(media_type_for(Path::new("a/b.tsx")), MediaType::Code("typescript".into())); + assert_eq!(media_type_for(Path::new("a/b.js")), MediaType::Code("javascript".into())); + assert_eq!(media_type_for(Path::new("a/b.mjs")), MediaType::Code("javascript".into())); + 
assert_eq!(media_type_for(Path::new("a/b.cjs")), MediaType::Code("javascript".into())); + assert_eq!(media_type_for(Path::new("a/b.jsx")), MediaType::Code("javascript".into())); + assert_eq!(media_type_for(Path::new("a/b.rs")), MediaType::Code("rust".into())); + } + #[test] fn unknown_and_missing_extension() { assert_eq!( diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index d5952a7..30f165c 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -22,7 +22,8 @@ Cargo workspace, 함수 호출 기반 모듈러 모놀리스. UI binary (`kebab- | OCR | Ollama vision LM (default `gemma4:e4b`) — `OcrEngine` trait 으로 Tesseract / Apple Vision 등 future swap (HOTFIXES P6-2) | | Image caption | Ollama vision LM, runtime gate `image.caption.enabled` (default OFF) | | PDF parser | `lopdf` per-page 텍스트, `chunker_version = "pdf-page-v1"` 가 PDF 자산에 하드코딩 (HOTFIXES P7-3) | -| code parser | `tree-sitter` + `tree-sitter-rust` — **parser-side** (`kebab-parse-code`), chunker-side 아님 (design §6.3). `chunker_version = "code-rust-ast-v1"`. `ast_chunk_max_lines = 200` 상수 고정 (HOTFIXES 2026-05-19 — Chunker trait 이 per-medium config 미노출). | +| code parser | `tree-sitter` + `tree-sitter-rust` / `tree-sitter-python` / `tree-sitter-typescript` / `tree-sitter-javascript` — **parser-side** (`kebab-parse-code`), chunker-side 아님 (design §6.3). chunker versions: Rust = `code-rust-ast-v1`, Python = `code-python-ast-v1`, TypeScript = `code-ts-ast-v1`, JavaScript = `code-js-ast-v1`. `ast_chunk_max_lines = 200` 상수 고정 (HOTFIXES 2026-05-19 — Chunker trait 이 per-medium config 미노출). | +| 1B symbol path | workspace path → module path: Python = dotted prefix (`kebab_eval.metrics.compute_mrr`), TypeScript/JavaScript = slash-style prefix (`src/Foo.Foo.search`). Rust 1A-2 는 file-scope nesting 만 (workspace prefix 없음, 비일관 수용 — HOTFIXES 2026-05-20). 
| | TUI | Ratatui + crossterm — P9-1 Library 패널, P9-2/3/4 진행 예정 | | Desktop | Tauri 2 + `pdfjs-dist` (native PDF render backend 금지) — P9-5 | | citation 형식 | URI fragment (`path#L12-L34` / `path#p=12` / `path#xywh=0,0,100,50`, W3C Media Fragments) | @@ -51,7 +52,7 @@ flowchart TB ppdf["kebab-parse-pdf"] pimg["kebab-parse-image"] paud["kebab-parse-audio
(P8 보류)"] - pcode["kebab-parse-code
(P10-1A-2)"] + pcode["kebab-parse-code
(P10-1A-2 + P10-1B)"] ptypes["kebab-parse-types"] norm["kebab-normalize"] chunk["kebab-chunk"] @@ -126,6 +127,8 @@ flowchart TB UI → store/llm/parse 직접 의존 금지. 모든 user-facing 진입은 `kebab-app` facade 만 통한다 (frozen 설계 §8). `kebab-cli` 가 `--config ` flag 를 honor 하려면 `kebab_app::*_with_config(cfg, …)` companion 을 통해 Config 을 명시적으로 thread 하는 패턴 — 자세한 이유는 [tasks/HOTFIXES.md](../tasks/HOTFIXES.md) 의 `--config` 항목. +`kebab-parse-code` 의 외부 tree-sitter grammar crate 의존: P10-1A-2 에서 `tree-sitter-rust` 추가, P10-1B 에서 `tree-sitter-python` / `tree-sitter-typescript` / `tree-sitter-javascript` 추가. 모두 `kebab-parse-code` 에만 격리 (facade 룰 — UI crate / chunker 가 직접 import 금지). + ## 디렉토리 구조 ```text @@ -162,7 +165,7 @@ kebab/ │ ├── kebab-source-fs/ # 워크스페이스 walk + checksum (P1-1) │ ├── kebab-parse-md/ # Markdown frontmatter + blocks (P1-2/3) │ ├── kebab-normalize/ # ParsedBlock → CanonicalDocument (P1-4) -│ ├── kebab-chunk/ # heading-aware + pdf-page-v1 + code-rust-ast-v1 chunker (P1-5, P7-2, P10-1A-2) +│ ├── kebab-chunk/ # heading-aware + pdf-page-v1 + code-rust-ast-v1 + code-python-ast-v1 + code-ts-ast-v1 + code-js-ast-v1 chunker (P1-5, P7-2, P10-1A-2, P10-1B) │ ├── kebab-store-sqlite/ # SQLite + FTS5 (V001/V002/V003) (P1-6, P2-1, P3-3) │ ├── kebab-search/ # Lexical + Vector + Hybrid retriever (P2-2, P3-4) │ ├── kebab-embed/ kebab-embed-local/ # Embedder trait + fastembed adapter (P3-1, P3-2) @@ -172,7 +175,7 @@ kebab/ │ ├── kebab-eval/ # golden query runner + metrics (P5-1, P5-2) │ ├── kebab-parse-image/ # ImageExtractor + Ollama OCR + caption (P6) │ ├── kebab-parse-pdf/ # lopdf per-page text extractor (P7-1) -│ ├── kebab-parse-code/ # tree-sitter Rust AST extractor (P10-1A-2); chunker lives in kebab-chunk +│ ├── kebab-parse-code/ # tree-sitter AST extractors: Rust (P10-1A-2), Python + TypeScript + JavaScript (P10-1B); chunker lives in kebab-chunk │ ├── kebab-app/ # facade (P0 시그니처 + P3-5/P6-4/P7-3 본체) │ ├── kebab-tui/ # Ratatui shell + Library 패널 (P9-1) │ ├── kebab-mcp/ # stdio MCP 
server — tools: schema, doctor, search, ask (P9-FB-30) diff --git a/docs/SMOKE.md b/docs/SMOKE.md index c42a25d..2e44e25 100644 --- a/docs/SMOKE.md +++ b/docs/SMOKE.md @@ -340,6 +340,67 @@ extra_skip_globs = [] # 사용자 추가 skip 패턴 - `.rs` 파일은 `SourceType::Note` 로 분류됨 (kebab-core `SourceType::Code` variant 미존재). `--media code` filter 는 정상 동작 — `MediaType::Code("rust")` 로 별도 분류됨. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-19 `SourceType::Code` 항목). - `.gitignore` 가 honor 됨 — `target/` / `node_modules/` 등은 built-in 안전망으로 자동 skip. +## P10-1B Python / TypeScript / JavaScript 코드 색인 + +P10-1A-2 와 동일한 격리 KB 설정으로 Python / TypeScript / JavaScript 3 언어를 검증한다. 설정 블록은 P10-1A-2 와 동일 (`[ingest.code]` 절 포함). + +```bash +# 1) 워크스페이스에 Python / TS / JS 파일 추가 (소규모 샘플로 충분) +mkdir -p /tmp/kebab-smoke/workspace/sample_code +# Python 예시 +cat > /tmp/kebab-smoke/workspace/sample_code/metrics.py <<'EOF' +def compute_mrr(results): + """Mean Reciprocal Rank.""" + total = 0.0 + for i, hit in enumerate(results, 1): + if hit: + total += 1.0 / i + break + return total +EOF +# TypeScript 예시 +cat > /tmp/kebab-smoke/workspace/sample_code/searcher.ts <<'EOF' +export class Searcher { + search(query: string): string[] { + return []; + } +} +EOF +# JavaScript 예시 +cat > /tmp/kebab-smoke/workspace/sample_code/utils.js <<'EOF' +function formatResult(hit) { + return `${hit.score}: ${hit.path}`; +} +module.exports = { formatResult }; +EOF + +# 2) ingest +KB ingest + +# 3) 언어별 검색 (symbol + module path prefix 확인) +KB search --mode hybrid "compute_mrr" --code-lang python --json | \ + jq '{hits: [.hits[] | {symbol: .citation.symbol, lang: .citation.lang}]}' +KB search --mode hybrid "search" --code-lang typescript --json | \ + jq '{hits: [.hits[] | {symbol: .citation.symbol, lang: .citation.lang}]}' +KB search --mode hybrid "formatResult" --code-lang javascript --json | \ + jq '{hits: [.hits[] | {symbol: .citation.symbol, lang: .citation.lang}]}' + +# 4) schema stats 에 3 언어 카운트 확인 +KB --json schema | jq 
'.stats.code_lang_breakdown' +# 기대: {"python": N, "typescript": N, "javascript": N, "rust": M, ...} +``` + +**Symbol path 컨벤션 (2026-05-20 기준)**: + +- **Python**: workspace 경로 → dotted module path prefix. `sample_code/metrics.py` 의 `compute_mrr` → symbol `sample_code.metrics.compute_mrr`. +- **TypeScript / JavaScript**: workspace 경로 → slash-style module path prefix. `sample_code/searcher.ts` 의 `search` → `sample_code/searcher.Searcher.search`. `.tsx` / `.mjs` / `.cjs` / `.jsx` 도 동일 처리. +- **Rust** (1A-2): file-scope nesting 만, workspace path prefix 없음 (예: `Foo::double`). Python/TS/JS 와 비일관 — HOTFIXES 2026-05-20 참조. + +**알려진 동작**: + +- `const foo = () => {...}` 같은 expression-level 함수는 `` glue 로 잡힘 (declaration-level 단위만 1B 1차 범위). 자세한 내용: `tasks/HOTFIXES.md` (2026-05-20). +- `.gitignore` honor — `node_modules/` / `__pycache__/` / `.venv/` 등 built-in 안전망 자동 skip. + ## 검증 체크리스트 - `kebab doctor` 가 `--config` path 를 honor 하고 그 안의 `storage.data_dir` 를 출력 (XDG default 가 아님). @@ -371,6 +432,7 @@ rm -rf /tmp/kebab-smoke # 통째로 정리 - (P7-3) `config.chunking.chunker_version` 는 markdown 만 represent — PDF 자산은 `pdf-page-v1` 하드코딩. `config.toml` 의 `chunker_version = "md-heading-v1"` 을 봐도 PDF 는 영향 안 받음. HOTFIXES `2026-05-02 P7-3` entry 참조 (P+ chunker registry task 까지 유지). - (P7-3) 한 PDF 가 N 페이지면 `kebab ingest` 가 N 개 (또는 그 이상의, 페이지 길면 multi-chunk) 의 chunk 를 한 transaction 안에서 commit. 500 페이지 책 → 500+ chunk 한 번에 → embedding throughput 가 bottleneck. 임베딩 활성 워크스페이스에서 큰 PDF 를 처음 ingest 하면 분-단위 시간 + WAL 크기 증가 가능 — P+ 스케일 hardening task 까지 정상 동작이지만 비용은 측정 가능. - (P10-1A-2) `.rs` 파일을 워크스페이스에 두면 `kebab ingest` 결과에 `new` 카운터에 포함. `kebab search --mode hybrid "<함수명>" --code-lang rust --json` 가 `citation.kind = "code"`, `citation.lang = "rust"` (SearchHit top-level `code_lang` 도 동일), `citation.symbol` (함수/타입 이름), `citation.line_start` / `citation.line_end` 를 반환하면 wiring 정상. `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"rust": N` 이 나오면 chunk 가 색인됨. 
+- (P10-1B) `.py` / `.ts` / `.tsx` / `.js` / `.mjs` / `.cjs` / `.jsx` 파일을 워크스페이스에 두면 `kebab ingest` 결과에 `new` 카운터에 포함. `--code-lang python` / `--code-lang typescript` / `--code-lang javascript` 검색이 `citation.symbol` 에 module path prefix 를 포함한 결과를 반환하면 wiring 정상. `kebab schema --json | jq .stats.code_lang_breakdown` 에 해당 언어 카운트 등장 확인. - (P7-3 + follow-up) 동일 path 에 byte 가 다른 PDF 를 두 번째 ingest 하면 `purge_vector_orphans_for_workspace_path` 가 옛 chunk_id 를 LanceDB 에서 먼저 삭제, 이어서 `purge_orphan_at_workspace_path` 가 옛 doc / chunks / embedding_records 를 SQLite 에서 sweep. 새 byte 가 새 `doc_id` 로 색인됨. `IngestReport` 에 그 자산만 `new+=1` (다른 자산은 `updated`). 두 store 모두 정합 — 옛 본문 검색 시 옛 chunks 가 더 이상 surface 되지 않음. ### Embedding upgrade (fb-39b) diff --git a/docs/superpowers/plans/2026-05-20-p10-1b-py-ts-js-ast-chunkers.md b/docs/superpowers/plans/2026-05-20-p10-1b-py-ts-js-ast-chunkers.md new file mode 100644 index 0000000..4610091 --- /dev/null +++ b/docs/superpowers/plans/2026-05-20-p10-1b-py-ts-js-ast-chunkers.md @@ -0,0 +1,741 @@ +# p10-1B Python + TS/JS AST Chunkers Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development to implement this plan task-by-task. + +**Goal:** Activate Python / TypeScript / JavaScript code ingest end-to-end on top of 1A-2's infrastructure — 3 new tree-sitter grammars, 3 new Extractors, 3 new chunkers (`code-{python,ts,js}-ast-v1`), a `module_path_for_*` helper for workspace-path → module-path conversion, and a small app-dispatch generalization. Wire `code_lang` filter / breakdown / Citation::Code surface activate automatically. + +**Architecture:** Mirror 1A-2 exactly per language. Each Extractor in `kebab-parse-code/src/{python,typescript,javascript}.rs` calls its tree-sitter grammar and emits one `Block::Code` per top-level AST semantic unit with `SourceSpan::Code { line_start, line_end, symbol, lang }`. 
Symbol = `module_path` (from workspace_path) `+` per-language join (`.` for Python, `/.../basename.symbol` for TS/JS). Each chunker is a near-duplicate of `code-rust-ast-v1` (1:1 + oversize split). App dispatch becomes `match lang { "rust" | "python" | "typescript" | "javascript" }`. + +**Tech Stack:** Rust 2024 workspace, `tree-sitter` 0.26, `tree-sitter-python` / `tree-sitter-typescript` / `tree-sitter-javascript`, existing 1A-2 infrastructure (citation_helper Code arm, backfill, schema breakdown). + +**Memory note:** Host was OOM-killed earlier in this branch's history. Prefer `cargo test -p ` and `cargo check -p `; the only `cargo test --workspace -j 1` call is the Task L full-suite gate. Never run cargo invocations in parallel. + +--- + +## Pre-flight + +Branch `feat/p10-1b-py-ts-js` already exists on main (`git checkout feat/p10-1b-py-ts-js`). + +- [ ] **Disk hygiene**: `cargo clean`. + +Reference files (read before touching the corresponding 1B file): +- 1A-2 Rust extractor: `crates/kebab-parse-code/src/rust.rs` — the scaffold every per-lang extractor mirrors. +- 1A-2 Rust chunker: `crates/kebab-chunk/src/code_rust_ast_v1.rs` — the scaffold every per-lang chunker mirrors. +- 1A-2 app dispatch: `crates/kebab-app/src/lib.rs` `ingest_one_code_asset` (~line 1645). +- 1A-2 source-fs routing: `crates/kebab-source-fs/src/media.rs:39` (the `"rs" =>` arm). +- 1A-2 lang dispatch: `crates/kebab-parse-code/src/lang.rs::code_lang_for_path`. + +--- + +## Task A: Workspace deps + +**Files:** +- Modify: `Cargo.toml` (workspace `[workspace.dependencies]`, after the existing `tree-sitter-rust` entry) +- Modify: `crates/kebab-parse-code/Cargo.toml` (`[dependencies]`) + +- [ ] **Step 1**: Resolve versions: `cargo add tree-sitter-python tree-sitter-typescript tree-sitter-javascript -p kebab-parse-code`. + +- [ ] **Step 2**: Lift the three resolved versions into `[workspace.dependencies]` in the root `Cargo.toml`, immediately after the `tree-sitter-rust` line. 
Single-line comment first: + +```toml +# Python / TS / JS grammars for code ingest (kebab-parse-code, p10-1B). +tree-sitter-python = "" +tree-sitter-typescript = "" +tree-sitter-javascript = "" +``` + +Then change the crate's `[dependencies]` entries to `{ workspace = true }` matching the existing `tree-sitter` / `tree-sitter-rust` style. + +- [ ] **Step 3**: `cargo build -p kebab-parse-code` → clean (unused deps OK; warnings appear when actually imported in later tasks). + +- [ ] **Step 4**: Commit. + +```bash +git add Cargo.toml Cargo.lock crates/kebab-parse-code/Cargo.toml +git commit -m "build(p10-1b): add tree-sitter-python/-typescript/-javascript workspace deps + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +## Task B: source-fs media routing for `.py`/`.pyi`/`.ts`/`.tsx`/`.js`/`.mjs`/`.cjs`/`.jsx` + +**Files:** +- Modify: `crates/kebab-source-fs/src/media.rs` (add 3 arms next to the existing `"rs"` arm at L39) +- Test: same file's test module + +- [ ] **Step 1 (failing test)**: + +```rust +#[test] +fn py_ts_js_files_map_to_media_code() { + assert_eq!(media_type_for(Path::new("a/b.py")), MediaType::Code("python".into())); + assert_eq!(media_type_for(Path::new("a/b.pyi")), MediaType::Code("python".into())); + assert_eq!(media_type_for(Path::new("a/b.ts")), MediaType::Code("typescript".into())); + assert_eq!(media_type_for(Path::new("a/b.tsx")), MediaType::Code("typescript".into())); + assert_eq!(media_type_for(Path::new("a/b.js")), MediaType::Code("javascript".into())); + assert_eq!(media_type_for(Path::new("a/b.mjs")), MediaType::Code("javascript".into())); + assert_eq!(media_type_for(Path::new("a/b.cjs")), MediaType::Code("javascript".into())); + assert_eq!(media_type_for(Path::new("a/b.jsx")), MediaType::Code("javascript".into())); + // Rust 1A-2 arm still works + assert_eq!(media_type_for(Path::new("a/b.rs")), MediaType::Code("rust".into())); +} +``` + +- [ ] **Step 2**: Run → FAIL. 
+ +- [ ] **Step 3**: Add the three arms before the `_ => MediaType::Other(ext)` fallback. Match existing style and order extensions logically (most common first within each language): + +```rust + // p10-1B: Python / TS / JS AST chunkers active. + "py" | "pyi" => MediaType::Code("python".into()), + "ts" | "tsx" => MediaType::Code("typescript".into()), + "js" | "mjs" | "cjs" | "jsx" => MediaType::Code("javascript".into()), +``` + +- [ ] **Step 4**: Run → PASS. Then `cargo test -p kebab-source-fs` → no regression. + +- [ ] **Step 5**: `cargo clippy -p kebab-source-fs --all-targets -- -D warnings` clean. Commit. + +```bash +git add crates/kebab-source-fs/ +git commit -m "feat(p10-1b): route .py/.pyi/.ts/.tsx/.js/.mjs/.cjs/.jsx to MediaType::Code + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +## Task C: `module_path_for_python` + `module_path_for_tsjs` helpers + +**Files:** +- Modify: `crates/kebab-parse-code/src/lang.rs` (add 2 pub fns + tests) +- Modify: `crates/kebab-parse-code/src/lib.rs` (re-export the 2 fns) + +These convert a `WorkspacePath` into a module-path prefix for symbol formatting. Single source of truth — used by all per-language extractors. + +### Rules + +**`module_path_for_python(workspace_path: &str) -> String`**: +1. Strip a leading well-known "source root" prefix from a small allowlist if present (in order): `crates//src/`, `src/`, `lib/`. (Use a single small `for` loop over the allowlist; stop at first prefix match.) Rationale: avoid noisy `crates.x.src.foo.bar` symbols when the user has a conventional layout, while leaving non-conventional paths untouched. +2. Strip trailing `.py` or `.pyi` extension. If the basename (after extension strip) is `__init__`, drop it (and the preceding `/`) so `pkg/__init__.py` → `pkg`. +3. Replace `/` with `.`. +4. Result is the dotted module prefix. Symbols are joined with `.` (e.g. `module_path + "." + sym`). 
Empty result (file is at workspace root without prefix) → use empty string → symbol is the unit name alone. + +**`module_path_for_tsjs(workspace_path: &str) -> String`**: +1. Strip extension if it's one of `.ts` / `.tsx` / `.js` / `.jsx` / `.mjs` / `.cjs`. +2. Do NOT replace `/` (TS/JS convention is path-like). Do NOT strip any source root (TS/JS layouts vary too widely). +3. Result is the path-style prefix (e.g. `src/search/retriever/Retriever`). Symbols join with `.` (`prefix + "." + sym`, e.g. `src/search/retriever/Retriever.search`). + +- [ ] **Step 1 (failing tests)** — add to existing `mod tests` (or create one) in `lang.rs`: + +```rust +#[test] +fn module_path_for_python_strips_src_roots_and_extensions() { + assert_eq!(module_path_for_python("kebab_eval/metrics.py"), "kebab_eval.metrics"); + assert_eq!(module_path_for_python("kebab_eval/__init__.py"), "kebab_eval"); + assert_eq!(module_path_for_python("src/foo/bar.py"), "foo.bar"); + assert_eq!(module_path_for_python("crates/x/src/foo/bar.py"), "foo.bar"); + assert_eq!(module_path_for_python("a/b/c.pyi"), "a.b.c"); + assert_eq!(module_path_for_python("standalone.py"), "standalone"); + assert_eq!(module_path_for_python("src/__init__.py"), ""); +} + +#[test] +fn module_path_for_tsjs_keeps_slashes_and_strips_ext() { + for ext in ["ts", "tsx", "js", "jsx", "mjs", "cjs"] { + let p = format!("src/search/retriever/Retriever.{ext}"); + assert_eq!(module_path_for_tsjs(&p), "src/search/retriever/Retriever"); + } + assert_eq!(module_path_for_tsjs("foo.ts"), "foo"); + assert_eq!(module_path_for_tsjs("a/b/c.ts"), "a/b/c"); + // No `src/` strip — TS layouts vary. + assert_eq!(module_path_for_tsjs("packages/x/src/Foo.ts"), "packages/x/src/Foo"); +} +``` + +- [ ] **Step 2**: Run → FAIL (helpers not defined). + +- [ ] **Step 3**: Implement both in `lang.rs`. Suggested implementation (refine if a test points out a missed edge case): + +```rust +/// p10-1B: workspace-relative Python file path → dotted module-path prefix. 
+/// See plan §Task C for the exact rules. +pub fn module_path_for_python(workspace_path: &str) -> String { + let mut p: &str = workspace_path; + // Strip a known source-root prefix. Allowlist + `starts_with` over a + // pattern with a glob in the middle would be a pain; treat + // `crates/*/src/` by string-walking. + if let Some(rest) = p.strip_prefix("crates/") { + if let Some(slash) = rest.find('/') { + let after = &rest[slash + 1..]; + if let Some(stripped) = after.strip_prefix("src/") { + p = stripped; + } + } + } else if let Some(stripped) = p.strip_prefix("src/") { + p = stripped; + } else if let Some(stripped) = p.strip_prefix("lib/") { + p = stripped; + } + // Strip extension. + let p = p + .strip_suffix(".py") + .or_else(|| p.strip_suffix(".pyi")) + .unwrap_or(p); + // __init__ → drop it (and the preceding `/`). + let p = if let Some(parent) = p.strip_suffix("/__init__") { + parent + } else if p == "__init__" { + "" + } else { + p + }; + p.replace('/', ".") +} + +/// p10-1B: workspace-relative TS/JS file path → path-style prefix +/// (no slash replacement). See plan §Task C. +pub fn module_path_for_tsjs(workspace_path: &str) -> String { + let p = workspace_path; + for ext in [".tsx", ".ts", ".jsx", ".mjs", ".cjs", ".js"] { + if let Some(stripped) = p.strip_suffix(ext) { + return stripped.to_string(); + } + } + p.to_string() +} +``` + +- [ ] **Step 4**: Re-export both from `lib.rs` (next to the existing `pub use lang::code_lang_for_path`): + +```rust +pub use lang::{code_lang_for_path, module_path_for_python, module_path_for_tsjs}; +``` + +- [ ] **Step 5**: Run → PASS. clippy clean. + +- [ ] **Step 6**: Commit. 
+ +```bash +git add crates/kebab-parse-code/ +git commit -m "feat(p10-1b): module_path_for_python / _tsjs helpers (workspace path → module prefix) + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +## Task D: App dispatch generalization + +**Files:** +- Modify: `crates/kebab-app/src/lib.rs` + +Today's `ingest_one_code_asset` (~L1645) hardcodes `RustAstExtractor` + `CodeRustAstV1Chunker`. 1B needs to dispatch by `lang`. Cleanest minimal change: keep the same function signature but take `code_lang: &str` and `match` it internally onto an `Extractor` + `Chunker` pair. Rust path keeps the same observable behavior. + +Two equivalent dispatch shapes — pick the one with the smallest diff: + +**Shape 1 (recommended — fewest lines changed):** factor extractor invocation + chunker invocation into a small `match code_lang` *inside* `ingest_one_code_asset`. The `parser_version` constant lookup also branches. Everything else (read bytes, ExtractContext, put_*, embed, IngestItem) stays a single non-branched flow. + +**Shape 2:** introduce a tiny enum `CodeLangKind { Rust, Python, Typescript, Javascript }` + an `impl CodeLangKind { fn extract(...) -> CanonicalDocument; fn chunk(...) -> Vec; fn parser_version() -> ParserVersion; fn chunker_version() -> ChunkerVersion; }`. More structure, but better insulates the function body. + +Use Shape 1 for this task (less risk). A future C/D phase can refactor to Shape 2 if the dispatch grows. 
+ +- [ ] **Step 1 (failing test)** — add a Python smoke as the failing test (TS/JS land later in this PR; one failing-then-passing TDD cycle is enough to lock the dispatch contract): + +In `crates/kebab-app/tests/code_ingest_smoke.rs` add: + +```rust +#[test] +fn python_file_ingests_and_searches_as_code_citation() { + // Mirror rust_file_ingests_and_searches_as_code_citation exactly, + // but write `kebab_eval/metrics.py` (in the temp workspace root) with: + // def compute_mrr(): return 1.0 + // and assert h.code_lang == Some("python"), citation.lang == Some("python"), + // citation.symbol == Some("kebab_eval.metrics.compute_mrr"), parser_version "code-python-v1", + // chunker_version "code-python-ast-v1". + // ... +} +``` + +(Spec shape ONLY — the actual extractor + chunker land in Tasks E + F. This test compiles but FAILS at runtime until those land. Mark it `#[ignore]` if it would otherwise break TDD ordering — un-`#[ignore]` it in Task G's commit. Alternative: skip this step here and rely on the per-extractor unit tests in Task E + Task F; that is the cleaner TDD ordering. Choose either; document the choice in the commit message.) + +- [ ] **Step 2**: Update `ingest_one_asset` dispatch match arm to accept all four code languages with a `lang` capture passed through: + +```rust + // p10-1A-2 / 1B: code ingest dispatch. + MediaType::Code(lang) + if matches!(lang.as_str(), "rust" | "python" | "typescript" | "javascript") => + { + return ingest_one_code_asset( + app, asset, chunk_policy, embedder, vector_store, + existing_doc_ids, force_reingest, lang.as_str(), + ); + } +``` + +(Keep the trailing `MediaType::Code(_) | MediaType::Audio(_) | MediaType::Other(_)` or-pattern as the Skipped fallback — non-allowlisted code langs route there.) + +- [ ] **Step 3**: Update `ingest_one_code_asset` signature to take `code_lang: &str` and dispatch internally. Keep all I/O / persistence / embed code unchanged. 
Per the Shape-1 recipe: + - `let parser_version = match code_lang { "rust" => ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.into()), "python" => ParserVersion(kebab_parse_code::PYTHON_PARSER_VERSION.into()), "typescript" => ParserVersion(kebab_parse_code::TS_PARSER_VERSION.into()), "javascript" => ParserVersion(kebab_parse_code::JS_PARSER_VERSION.into()), _ => unreachable!(), };` + - The `try_skip_unchanged` call's chunker_version arg branches the same way (different chunker per lang). + - The extract call branches: `match code_lang { "rust" => RustAstExtractor::new().extract(...), "python" => PythonAstExtractor::new().extract(...), ... }`. + - The chunk call branches: `match code_lang { "rust" => CodeRustAstV1Chunker.chunk(...), "python" => CodePythonAstV1Chunker.chunk(...), ... }`. + - All other lines (purge_vector_orphans / put_asset_with_bytes / put_document / put_blocks / put_chunks / embed branch / IngestItem) unchanged. + +At this point Python/TS/JS extractors + chunkers don't exist yet → compile FAILS on the references. Acceptable — Task E/F/G/H/I add them. To stage compile-cleanly: gate the Python/TS/JS arms behind `unimplemented!()` for now (returns an error path) and let Tasks F/G/H/I/J/K replace them. Recommended: leave the dispatch fully written but use `anyhow::bail!("not yet activated in this commit")` for the three non-Rust arms, with a `TODO(p10-1b Task X)` comment per arm. They flip to real calls when each language's extractor + chunker land. + +- [ ] **Step 4**: `cargo test -p kebab-app --lib` (lib-only is enough — integration tests for the non-Rust paths land later). Existing Rust path tests must stay green. + +- [ ] **Step 5**: clippy clean, commit. + +```bash +git add crates/kebab-app/ +git commit -m "refactor(p10-1b): generalize ingest_one_code_asset for multi-language dispatch + +Rust path unchanged (verified by existing code_ingest_smoke tests). Python/TS/JS arms +bail with TODO; per-lang extractor + chunker land in subsequent tasks. 
+ +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +## Task E: Python Extractor (`kebab-parse-code/src/python.rs`) + +**Files:** +- Create: `crates/kebab-parse-code/src/python.rs` +- Modify: `crates/kebab-parse-code/src/lib.rs` (`pub mod python` + re-exports `PYTHON_PARSER_VERSION` and `PythonAstExtractor`) +- Create: `crates/kebab-parse-code/tests/fixtures/sample.py` + +Scaffold MIRRORS `crates/kebab-parse-code/src/rust.rs` line-for-line (read it first). Only the AST walk + the symbol prefix differ. + +### Python AST mapping + +tree-sitter-python language: `tree_sitter_python::LANGUAGE` (LanguageFn). Set via `parser.set_language(&tree_sitter_python::LANGUAGE.into())`. + +Walk `module` (root) named children. Maintain `mod_path: Vec` — but for Python we DO NOT push class names onto `mod_path` (class members get `Class.method` form via the class arm directly; nested classes recurse with the class name appended). + +| node kind | unit | symbol (joined with `.`) | +|-----------|------|--------------------------| +| `function_definition` (name field) | 1 | `<module_prefix>.<name>` (or `<name>` if module_prefix empty) | +| `class_definition` (name) — emit ONE unit for the class definition itself (symbol `<module_prefix>.<Class>`), then recurse into its `block` body: each inner `function_definition` → unit with symbol `<module_prefix>.<Class>.<method>`; nested `class_definition` recurses with parent class prepended. | 1 per class + 1 per method (etc.) | as above | +| `decorated_definition` | unwrap — process its inner `definition` (either function_definition or class_definition) as if at the same level. The unit's span starts at the `decorated_definition` node itself, so its `decorator` children are folded into the unit (decorators are children of this node, not preceding siblings). 
| n/a | n/a | +| `import_statement`, `import_from_statement`, `expression_statement`, `assignment`, `global_statement`, `future_import_statement` at module level | glue | `` (with `module_prefix` prefix if non-empty: `.`) | + +`unit_start` (backward extension) covers `comment` siblings + `decorator` siblings (decorators in tree-sitter-python appear as children of `decorated_definition`, NOT as siblings — so the `unwrap decorated_definition` arm above is what brings them in; `comment` siblings still need backward extension). Adapt `unit_start` for the Python flavor: extend over `comment` siblings only (decorators are already covered by unwrapping `decorated_definition`). + +Module-prefix application: at extract time, compute `let mod_prefix = kebab_parse_code::module_path_for_python(&asset.workspace_path.0);`. The walk builds symbols using `mod_prefix` (joined with `.` if non-empty; the bare name if empty). Glue group: if `mod_prefix` non-empty, symbol = `format!("{mod_prefix}.")`; else ``. `` glue label (file contains only `import`s and no real unit) follows the same prefix rule. + +### Scaffold differences from rust.rs + +- `pub const PARSER_VERSION: &str = "code-python-v1";` +- `pub struct PythonAstExtractor;` + `new()`/`Default`. +- `fn supports(&self, m: &MediaType) -> bool { matches!(m, MediaType::Code(l) if l == "python") }` +- Agent string `"kb-parse-code"` (unchanged). +- `metadata.code_lang = Some("python".to_string())`. +- `repo` / `git_branch` / `git_commit` from `crate::repo::detect_repo` (same as Rust). +- The AST walk is its own `build_blocks` function — DO NOT generalize across languages in this task (each grammar's node names differ enough that polymorphism hurts more than helps; a future refactor task can extract common helpers if patterns converge). 
+ +### Step list (TDD) + +- [ ] **Step 1**: Create `tests/fixtures/sample.py`: + +```python +"""sample fixture.""" +import os + +ANSWER = 42 + +@staticmethod +def free(x): + """free fn.""" + return x + 1 + +class Foo: + """doc.""" + def double(self, n): + return n * 2 + + @classmethod + def name(cls): + return "foo" + +class Outer: + class Inner: + def helper(self): + return True + +def with_decorator(): + pass +``` + +- [ ] **Step 2 (failing test)** in `python.rs`: + +```rust +#[cfg(test)] +mod tests { + use super::*; + use kebab_core::{Block, MediaType, SourceSpan}; + fn extract_fixture() -> kebab_core::CanonicalDocument { + let bytes = std::fs::read(concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/sample.py")).unwrap(); + // Reuse the test-support helper added in Task 6 of 1A-2 (rust.rs tests): + // adjust `fixed_rust_asset` to a generic `fixed_code_asset(workspace_path, code_lang)` + // OR inline a per-test asset constructor that matches its kebab-core types. + let asset = crate::rust::tests_support::fixed_code_asset( + "kebab_eval/metrics.py", "python"); + let cfg = kebab_core::ExtractConfig::default(); + let root = std::path::PathBuf::from("/tmp"); + let ctx = kebab_core::ExtractContext { asset: &asset, workspace_root: &root, config: &cfg }; + PythonAstExtractor::new().extract(&ctx, &bytes).unwrap() + } + #[test] + fn extractor_supports_only_media_code_python() { + let e = PythonAstExtractor::new(); + assert!(e.supports(&MediaType::Code("python".into()))); + assert!(!e.supports(&MediaType::Code("rust".into()))); + assert!(!e.supports(&MediaType::Markdown)); + } + #[test] + fn python_units_carry_module_prefixed_symbols() { + let doc = extract_fixture(); + let mut syms: Vec = doc.blocks.iter().map(|b| match b { + Block::Code(c) => match &c.common.source_span { + SourceSpan::Code { symbol, lang, .. 
} => { + assert_eq!(lang.as_deref(), Some("python")); + symbol.clone().unwrap() + } + _ => panic!("expected SourceSpan::Code"), + }, + other => panic!("expected Block::Code, got {other:?}"), + }).collect(); + syms.sort(); + // workspace_path `kebab_eval/metrics.py` → mod_prefix `kebab_eval.metrics` + assert!(syms.iter().any(|s| s == "kebab_eval.metrics.free")); + assert!(syms.iter().any(|s| s == "kebab_eval.metrics.Foo")); + assert!(syms.iter().any(|s| s == "kebab_eval.metrics.Foo.double")); + assert!(syms.iter().any(|s| s == "kebab_eval.metrics.Foo.name")); + assert!(syms.iter().any(|s| s == "kebab_eval.metrics.Outer")); + assert!(syms.iter().any(|s| s == "kebab_eval.metrics.Outer.Inner")); + assert!(syms.iter().any(|s| s == "kebab_eval.metrics.Outer.Inner.helper")); + assert!(syms.iter().any(|s| s == "kebab_eval.metrics.with_decorator")); + assert!(syms.iter().any(|s| s == "kebab_eval.metrics.")); // import + assignment + } + #[test] + fn deterministic_across_runs() { + let a = extract_fixture(); + for _ in 0..50 { assert_eq!(extract_fixture().blocks, a.blocks); } + } +} +``` + +(`tests_support::fixed_code_asset` — promote 1A-2's `fixed_rust_asset` to a generic helper that takes the lang string and sets `media_type: MediaType::Code(lang.to_string())`. Move it to a new `pub(crate) mod tests_support` in `rust.rs` so it's reachable from `python.rs::tests`, OR duplicate it inline — pick the smaller diff. Keep the helper `#[cfg(test)]`.) + +- [ ] **Step 3**: Run → FAIL (`PythonAstExtractor` undefined). + +- [ ] **Step 4**: Implement `python.rs`. Scaffold mirrors `rust.rs`; the AST walk follows the table above. The `mod_path: Vec` for Python tracks **class nesting** (so methods get `Class.method`, nested classes get `Outer.Inner`). `Vec` empty at function-level. Glue grouping mirrors Rust's. Apply `mod_prefix` from `module_path_for_python(&asset.workspace_path.0)` to all unit symbols: `if mod_prefix.is_empty() { sym } else { format!("{mod_prefix}.{sym}") }`. 
The `` / `` label inherits the same prefixing. + +- [ ] **Step 5**: Wire into `lib.rs`: + +```rust +pub mod python; +pub use python::{PARSER_VERSION as PYTHON_PARSER_VERSION, PythonAstExtractor}; +``` + +- [ ] **Step 6**: `cargo test -p kebab-parse-code python` → all pass. + +- [ ] **Step 7**: clippy clean, commit. + +```bash +git add crates/kebab-parse-code/ +git commit -m "feat(p10-1b): tree-sitter-python AST extractor (PythonAstExtractor) + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +## Task F: Python chunker (`code-python-ast-v1`) + +**Files:** +- Create: `crates/kebab-chunk/src/code_python_ast_v1.rs` +- Modify: `crates/kebab-chunk/src/lib.rs` (`mod` + `pub use`) + +NEAR-DUPLICATE of `crates/kebab-chunk/src/code_rust_ast_v1.rs`. ONLY differences: +- `const VERSION_LABEL: &str = "code-python-ast-v1";` +- struct name `CodePythonAstV1Chunker` +- The validation message says "code-python-ast-v1 only handles..." + +`split_oversize` + `make_chunk` + `AST_CHUNK_MAX_LINES` + `BYTES_PER_TOKEN` + `POLICY_HASH_HEX_LEN` IDENTICAL (these are language-agnostic). + +- [ ] **Step 1 (failing tests)**: Copy the entire `#[cfg(test)] mod tests` from `code_rust_ast_v1.rs` and substitute `Rust` → `Python` / `code-rust-ast-v1` → `code-python-ast-v1`. Use the same in-memory `code_doc` helper — it doesn't care about the actual language. Add one extra test specifically asserting the `policy_hash` equals the Rust chunker's (cross-chunker fingerprint identity is a 1A-2 invariant — must hold for new chunkers too). + +- [ ] **Step 2**: Run → FAIL. + +- [ ] **Step 3**: Copy `code_rust_ast_v1.rs` to `code_python_ast_v1.rs` and apply the substitutions above. Keep the `tree-sitter is intentionally NOT a dependency here` comment (still true). + +- [ ] **Step 4**: Wire into `lib.rs`: + +```rust +mod code_python_ast_v1; +pub use code_python_ast_v1::CodePythonAstV1Chunker; +``` + +- [ ] **Step 5**: `cargo test -p kebab-chunk code_python_ast` → pass. 
Full per-crate suite stays green. + +- [ ] **Step 6**: clippy clean, commit. + +```bash +git add crates/kebab-chunk/ +git commit -m "feat(p10-1b): code-python-ast-v1 chunker (1:1 + oversize split) + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +## Task G: Activate Python in app dispatch + +**Files:** +- Modify: `crates/kebab-app/src/lib.rs` (replace the Python `bail!` arm with real calls) +- Modify: `crates/kebab-app/tests/code_ingest_smoke.rs` (un-`#[ignore]` the Python test, OR add it now if you deferred in Task D) + +- [ ] **Step 1**: Replace the Python arm's `bail!` with `PythonAstExtractor::new().extract(...)` + `CodePythonAstV1Chunker.chunk(...)` calls (mirror the Rust arm exactly). Set parser_version / chunker_version per Python. + +- [ ] **Step 2**: Un-ignore / add `python_file_ingests_and_searches_as_code_citation`. Test asserts the full pipeline produces a `Citation::Code { lang: Some("python"), symbol: Some("kebab_eval.metrics.compute_mrr"), .. }` for a `kebab_eval/metrics.py` written into the temp workspace. + +- [ ] **Step 3**: `cargo test -p kebab-app --test code_ingest_smoke python_file_ingests` → pass. Existing Rust test stays green. + +- [ ] **Step 4**: clippy clean, commit. + +```bash +git add crates/kebab-app/ +git commit -m "feat(p10-1b): activate Python in ingest_one_code_asset dispatch + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +## Task H: TypeScript Extractor (`kebab-parse-code/src/typescript.rs`) + +**Files:** +- Create: `crates/kebab-parse-code/src/typescript.rs` +- Modify: `crates/kebab-parse-code/src/lib.rs` +- Create: `crates/kebab-parse-code/tests/fixtures/sample.ts` + `sample.tsx` + +Scaffold mirrors `rust.rs`/`python.rs`. Grammar selection: `tree_sitter_typescript::LANGUAGE_TYPESCRIPT` for `.ts`, `LANGUAGE_TSX` for `.tsx`. Decide inside `extract` by inspecting `asset.workspace_path.0` extension (a tiny helper local to this module is fine). 
+ +### TypeScript AST mapping + +| node kind | unit | symbol (joined with `.`) | +|-----------|------|--------------------------| +| `function_declaration` (name) | 1 | `.` | +| `class_declaration` (name) — recurse into `class_body`: each `method_definition` (name) → unit `..` | 1 + 1 per method | as above | +| `interface_declaration` (name), `type_alias_declaration` (name), `enum_declaration` (name) | 1 | `.` | +| `export_statement` wrapping any of the above | unwrap to inner declaration; if the inner is `class_declaration` / `function_declaration` / `interface_declaration` / `type_alias_declaration` / `enum_declaration`, treat as that arm. If `export_statement` itself contains a default (i.e., `export default function () {...}` with no name field), emit unit symbol `.default`. | unwrapped as above, OR `.default` for nameless default | +| `lexical_declaration` / `variable_declaration` at top level (`const`/`let`/`var`) | glue | `` (prefixed) | +| `import_statement`, `export_statement` of bare values | glue | as above | + +`mod_path` for TS is empty (TS modules are file-level, not nested class/namespace at the symbol level — interfaces/types DO live in module scope but their names are unit-level, not parent context). Skip TS `namespace` / `module` declarations: emit them as glue for 1B (the explicit-namespace case is rare in modern TS; documented in 1B Risks). + +Module prefix: `mod_prefix = module_path_for_tsjs(&asset.workspace_path.0)`. Join with `.` for symbol. 
+ +### Steps + +- [ ] **Step 1 (fixtures)**: + +```typescript +// sample.ts +import { x } from "./other"; +const ANSWER = 42; +export interface Greet { hello(): string; } +export type Maybe = T | null; +export function add(a: number, b: number): number { return a + b; } +export class Retriever { + search(q: string): string[] { return []; } + static create(): Retriever { return new Retriever(); } +} +export default function () { return 1; } +``` + +```tsx +// sample.tsx +import React from "react"; +export function Hello({ name }: { name: string }) { return {name}; } +export const App = () => ; // arrow fn assigned → glue in 1B +``` + +- [ ] **Step 2 (failing tests)**: 2 fixture-based tests asserting per-fixture symbols. Asserted symbols (sample.ts): + - `src/sample.add` (if workspace_path is `src/sample.ts`) + - `src/sample.Greet`, `src/sample.Maybe`, `src/sample.Retriever`, `src/sample.Retriever.search`, `src/sample.Retriever.create`, `src/sample.default`, `src/sample.`. +- For sample.tsx (workspace_path `src/sample.tsx`): `src/sample.Hello`, `src/sample.` (App arrow fn rolled into glue). +- Also: `extractor_supports_only_media_code_typescript`, `deterministic_across_runs`. + +- [ ] **Step 3**: Run → FAIL. + +- [ ] **Step 4**: Implement `typescript.rs` mirroring `rust.rs` scaffold. Grammar selection by file extension. AST walk per the table above. Module prefix application same shape as Python (prefix joined with `.`). + +- [ ] **Step 5**: Wire into `lib.rs`: + +```rust +pub mod typescript; +pub use typescript::{PARSER_VERSION as TS_PARSER_VERSION, TypescriptAstExtractor}; +``` + +- [ ] **Step 6**: Tests pass, clippy clean, commit. + +--- + +## Task I: TS chunker (`code-ts-ast-v1`) + +Pattern identical to Task F — duplicate `code_rust_ast_v1.rs` with substitutions (`VERSION_LABEL = "code-ts-ast-v1"`, struct `CodeTsAstV1Chunker`, error message). Test module copies the Rust chunker tests with name substitutions + adds `policy_hash_matches_md_heading_v1`. 
+ +Commit: + +``` +feat(p10-1b): code-ts-ast-v1 chunker (1:1 + oversize split) +``` + +--- + +## Task J: Activate TypeScript in app dispatch + +Mirror Task G. Replace TS `bail!` arm with real calls. Add `typescript_file_ingests_and_searches_as_code_citation` integration test using a `src/Foo.ts` fixture. + +Commit: + +``` +feat(p10-1b): activate TypeScript in ingest_one_code_asset dispatch +``` + +--- + +## Task K: JavaScript Extractor (`javascript.rs`) + +Mirror Task H. tree-sitter-javascript exposes a single LanguageFn, so no per-extension grammar selection is needed. AST mapping is the same as TS but without `interface_declaration` / `type_alias_declaration` / `enum_declaration`. Module prefix via `module_path_for_tsjs`. + +Test fixture `sample.js`: + +```javascript +// sample.js +import { x } from "./other"; +const ANSWER = 42; +export function add(a, b) { return a + b; } +export class Retriever { + search(q) { return []; } + static create() { return new Retriever(); } +} +export default function () { return 1; } +``` + +Asserted symbols: `src/sample.add`, `src/sample.Retriever`, `src/sample.Retriever.search`, `src/sample.Retriever.create`, `src/sample.default`, `src/sample.`. + +Wire into `lib.rs`: + +```rust +pub mod javascript; +pub use javascript::{PARSER_VERSION as JS_PARSER_VERSION, JavascriptAstExtractor}; +``` + +Commit: + +``` +feat(p10-1b): tree-sitter-javascript AST extractor (JavascriptAstExtractor) +``` + +--- + +## Task L: JS chunker (`code-js-ast-v1`) + Activate JS in app dispatch + +Combine Task F + Task G shape for JS in a single commit (less ceremony than splitting since the diffs are tiny): + +- Chunker: duplicate-with-substitution from `code_rust_ast_v1.rs`. `VERSION_LABEL = "code-js-ast-v1"`, struct `CodeJsAstV1Chunker`. +- App dispatch: replace JS `bail!` with real calls. +- Integration test: `javascript_file_ingests_and_searches_as_code_citation`. 
+ +Commit: + +``` +feat(p10-1b): code-js-ast-v1 chunker + activate JS in app dispatch +``` + +--- + +## Task M: Snapshots + full-suite gate + manual SMOKE + +**Files:** +- Create: `crates/kebab-chunk/tests/code_python_ast_snapshot.rs` + fixture `tests/fixtures/code-sample.py` + baseline `code-sample.chunks.snapshot.json` +- Create: same for TS (`code_ts_ast_snapshot.rs` + fixture `.ts` + baseline) +- Create: same for JS (`code_js_ast_snapshot.rs` + fixture `.js` + baseline) + +Mirror `crates/kebab-chunk/tests/code_rust_ast_snapshot.rs` exactly for each language. Build the `CanonicalDocument` IN-MEMORY (no `kebab-parse-code` dep crossing the chunk boundary). + +- [ ] **Step 1**: Add the 3 snapshot tests. Generate baselines: `UPDATE_SNAPSHOTS=1 cargo test -p kebab-chunk --test code_python_ast_snapshot --test code_ts_ast_snapshot --test code_js_ast_snapshot` (cargo accepts only one positional test-name filter, so select the three integration-test targets with repeated `--test` flags). Re-run without env var → PASS. + +- [ ] **Step 2**: Full-suite gate (memory-conscious): + - `cargo clippy --workspace --all-targets -- -D warnings` (one invocation, no parallel). + - `cargo test --workspace --no-fail-fast -j 1` (the `-j 1` is mandatory). If the pre-existing `runner_lexical_is_deterministic_per_query_payload` flake reappears (unlikely — was fixed in PR #141 on main and merged before 1B branch was cut), re-run that single test once. 
+ +- [ ] **Step 3**: Manual SMOKE (mirror `docs/SMOKE.md` P10-1A-2 flow for each language): + +```bash +cargo build --release +rm -rf /tmp/kebab-1bsmoke && mkdir -p /tmp/kebab-1bsmoke/ws/{kebab_eval,src} +echo 'def compute_mrr(): return 1.0' > /tmp/kebab-1bsmoke/ws/kebab_eval/metrics.py +echo 'export function add(a,b){return a+b;}' > /tmp/kebab-1bsmoke/ws/src/foo.ts +echo 'export function sub(a,b){return a-b;}' > /tmp/kebab-1bsmoke/ws/src/bar.js +# (match isolated config block format from docs/SMOKE.md) +./target/release/kebab --config /tmp/kebab-1bsmoke/config.toml ingest --json | jq '.items[].parser_version' | sort -u +./target/release/kebab --config /tmp/kebab-1bsmoke/config.toml search "compute_mrr" --code-lang python --json | jq '.hits[0]' +./target/release/kebab --config /tmp/kebab-1bsmoke/config.toml schema --json | jq '.stats.code_lang_breakdown' +``` + +Expected: parser_versions include `code-python-v1`, `code-ts-v1`, `code-js-v1`. Search returns `Citation::Code { lang: "python", symbol: "kebab_eval.metrics.compute_mrr" }`. `code_lang_breakdown` includes all four langs (rust may be 0 unless you also added a .rs). + +- [ ] **Step 4**: Commit (snapshot files + any harness tweaks). + +```bash +git add crates/kebab-chunk/tests/ +git commit -m "test(p10-1b): per-language chunker snapshots + full-suite gate" +``` + +--- + +## Task N: Docs + HOTFIXES + version bump + +- README: 지원 형식 / 명령 table row adds Python / TypeScript / JavaScript next to Rust. Mermaid stays unchanged (no new external surface crosses the diagram). +- HANDOFF: P10 row notes 1B merged (3 langs active). Add a one-line entry under 머지 후 결정 cross-linking the HOTFIXES entries. +- ARCHITECTURE: dependency-graph edge `pcode → core` is already present. Mention the new tree-sitter-{python,typescript,javascript} dependencies of `pcode` in the description text. 
Locked-in decisions table: add "1B symbol path: workspace path → module path (Python dotted, TS/JS slash-style); Rust 1A keeps file-scope nesting only — HOTFIXES 2026-05-20". +- SMOKE: add 1B section mirroring the 1A-2 P10 section structure (config block, ingest / search / schema verification commands) for Python and TS/JS. Compact — one shared section for all three. +- tasks/INDEX + tasks/p10/INDEX: flip 1B row 🟡→🟢 (on PR open; ✅ on merge). +- tasks/HOTFIXES.md: TWO dated 2026-05-20 entries: + 1. **Rust 1A-2 symbol path is file-scope-only; 1B+ uses workspace path → module prefix**. Cross-link to design §3.4. Acceptable inconsistency for now (cost of 1A retrofit = chunker_version bump + reindex for every existing Rust corpus). User-requested retrofit triggers a separate task. + 2. **Expression-level functions (arrow fn / function expression assigned to const) NOT emitted as separate units in 1B 1차**. They fold into the `` glue. Documented limit; future phase may add `lexical_declaration` → inner-expression unwrap. + Cross-link both in `tasks/p10/p10-1b-py-ts-js-ast-chunkers.md` Risks/notes. +- `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` §10.1: add a one-liner — "p10-1B 활성화 (Python / TypeScript / JavaScript)". +- `Cargo.toml`: workspace version `0.7.0 → 0.8.0`. `cargo build --release` refreshes Cargo.lock. +- One commit: + +```bash +git add -A +git commit -m "docs(p10-1b): README/HANDOFF/ARCHITECTURE/SMOKE/INDEX + HOTFIXES; chore: bump version 0.7.0 → 0.8.0 + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +## Finalize + +- `gitea-pr` open the PR (gitea-ops skill) — title `feat(p10-1B): Python + TS/JS AST chunkers — tree-sitter-{python,typescript,javascript} 코드 색인 활성화`. +- **Review loop mode** (fixed per workflow memory) until APPROVE → merge → main pull → branch cleanup → `cargo clean` → `gitea-release v0.8.0`. 
+ +--- + +## Self-review checklist (filled by plan author) + +- **Spec coverage**: every row of design §1B has a task; §3.4 symbol path covered by Task C + per-language extractors + integration tests; §6.1/§6.2 module structure covered by Tasks E/F/H/I/K/L; §9.1 Tier-1 + oversize fallback inherited from 1A-2 chunker pattern (Tasks F/I/L); §3.5 code_lang already in 1A-2 helper, extended in Task B routing; §5 dispatch covered by Task D; cascade rule (versioning §9) — chunker versions are per-language, fixture snapshots lock behavior. +- **No placeholders**: all novel logic (module_path helpers, app dispatch generalization, Python AST walk rules) given concretely with full code or exact deltas vs 1A-2. The per-language chunkers are explicit "duplicate code_rust_ast_v1.rs with substitution X/Y/Z" — concrete and verifiable, not vague. +- **Type consistency**: parser_version constants (`code-{rust,python,ts,js}-v1`) and chunker_version labels (`code-{rust,python,ts,js}-ast-v1`) used consistently across Tasks D/E/F/G/H/I/J/K/L. `module_path_for_python` / `module_path_for_tsjs` referenced consistently as the source of truth for prefixing. diff --git a/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md b/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md index 4ec2113..dc137fb 100644 --- a/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md +++ b/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md @@ -1543,6 +1543,8 @@ transitional 형태) 의 source of truth. **p10-1A-2 surface 활성화 (2026-05-19)**: Rust 소스코드 ingest (`code-rust-ast-v1` chunker, `tree-sitter-rust`) 가 활성화됨. `.rs` 파일을 워크스페이스에 두면 `kebab ingest` 가 AST 단위로 chunk 생성 + `citation.kind = "code"` 로 검색 가능. `kebab schema --json` 의 `stats.code_lang_breakdown` 에 `"rust": N` 이 표시됨. 본 activation 으로 kebab 자기 crate 를 dogfooding KB 에 색인 가능. `SourceSpan::Code` (§3.4) 와 `MediaType::Code` (§3.5) 는 1A-1 에서 이미 spec 에 반영됨. 
두 deferred deviation (`AST_CHUNK_MAX_LINES` 상수 고정, `SourceType::Code` 미존재) 은 `tasks/HOTFIXES.md` (2026-05-19) 에 기록. +**p10-1B 활성화 (Python / TypeScript / JavaScript) (2026-05-20)**: Python (`code-python-ast-v1`, `.py`), TypeScript (`code-ts-ast-v1`, `.ts`/`.tsx`), JavaScript (`code-js-ast-v1`, `.js`/`.mjs`/`.cjs`/`.jsx`) AST chunker 활성화. symbol path 는 workspace 경로 → module path prefix: Python = dotted (예: `kebab_eval.metrics.compute_mrr`), TypeScript/JavaScript = slash-style (예: `src/Foo.Foo.search`). Rust 1A-2 의 file-scope-only symbol 과 비일관 수용 (HOTFIXES 2026-05-20). expression-level 함수 (`const foo = () => {}`) 는 glue 처리 (HOTFIXES 2026-05-20). + ### 10.2 MCP server transport (fb-30) `kebab mcp` 가 stdio JSON-RPC server. Rust SDK = `rmcp 1.6`. Tool surface diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md index 2350e6a..f64d4ca 100644 --- a/tasks/HOTFIXES.md +++ b/tasks/HOTFIXES.md @@ -14,6 +14,40 @@ historical contract that was implemented; this file accumulates the deltas so phase 5+ readers can find the live behavior without diffing git history. +## 2026-05-20 — p10-1B: Rust 1A-2 symbol path is file-scope-only; 1B+ uses workspace path → module prefix + +**무엇이 바뀌었나**: P10-1A-2 의 Rust `code-rust-ast-v1` chunker 가 생성하는 symbol 은 file-scope mod-path nesting 만 사용한다 (예: `Foo::double`). P10-1B 이후 Python / TypeScript / JavaScript 의 symbol 은 workspace 경로 → module path prefix 를 포함한다 (예: `kebab_eval.metrics.compute_mrr`, `src/Foo.Foo.search`). + +**원인**: 1A-2 는 symbol path 컨벤션이 확정되기 전에 구현됐고, 1B spec 에서 workspace path → module prefix 를 명시적 결정으로 확정했다 (p10-1b-py-ts-js-ast-chunkers.md §동결된 설계 결정). 1A-2 retrofit = `chunker_version` bump + Rust corpus 전체 re-ingest 비용이 수반됨. + +**사용자 가시적 영향**: Rust 코드 검색 시 symbol 이 `::` 형태 (workspace prefix 없음). Python/TypeScript/JavaScript 는 `.` / `.` 형태. 비일관이지만 각각은 일관되게 동작. + +**proper fix**: Rust AST chunker 에 `module_path_for_rust(workspace_path)` helper 추가 + `chunker_version = "code-rust-ast-v2"` bump → 사용자가 명시 요청할 때까지 보류. 
+ +**cross-link**: `tasks/p10/p10-1b-py-ts-js-ast-chunkers.md` Risks / notes 섹션, design §3.4. + +## 2026-05-20 — p10-1B: module_path_for_python / _tsjs do not sanitize non-ASCII / 공백 / 특수문자 in workspace path + +**동작**: `module_path_for_python` 와 `module_path_for_tsjs` 가 workspace path 의 비-ASCII / 공백 / 따옴표 / 백슬래시 같은 특수문자를 그대로 prefix 에 통과시킨다. 예: `kebab eval/metrics.py` (공백 포함) → module prefix `kebab eval.metrics` — 라이브러리 코드는 동작하지만 symbol 텍스트에 공백이 들어간다. + +**이유**: 1B 1차 단순화. 대다수 코드 베이스가 ASCII identifier + `/` 구분자만 사용하므로 사용자 경험상 영향 미미. + +**해결**: 후속 phase 에서 path-sanitize 추가 검토. NFKC normalize 후 `[^A-Za-z0-9_.\-/]` → `_` 변환 식. 적용 시 chunker_version bump 트리거 (re-ingest cascade 필요). + +**cross-link**: `tasks/p10/p10-1b-py-ts-js-ast-chunkers.md` Risks / notes 섹션 line 55. + +## 2026-05-20 — p10-1B: expression-level functions (arrow fn, function expression assigned to const) NOT emitted as units in 1B 1차 + +**무엇이 바뀌었나**: TypeScript / JavaScript 의 `const foo = () => {...}` 또는 `const bar = function() {...}` 같은 expression-level 함수 할당은 `code-ts-ast-v1` / `code-js-ast-v1` 에서 독립 unit 으로 방출되지 않는다. 해당 코드는 가장 가까운 surrounding declaration-level unit (또는 `` glue) 에 흡수된다. + +**원인**: `function_declaration` / `class_declaration` / `method_definition` / `interface_declaration` 같은 declaration-level 노드만 unit 으로 선택. `lexical_declaration` (= `const / let / var`) 안의 function / arrow expression 은 별도 unwrap 없이 pass-through. 1B 1차 단순화. + +**사용자 가시적 영향**: expression-level 함수 이름으로 검색 시 함수 body 를 포함하는 glue chunk 가 반환되지만, symbol 이 함수 이름 자체를 가리키지는 않는다. 함수명이 함수 본문 텍스트에 등장하므로 lexical / hybrid 검색으로 일반적으로 찾을 수 있다. + +**proper fix**: `lexical_declaration` visitor 에서 binding value 가 `arrow_function` / `function` expression 인 경우 해당 identifier name 을 symbol 로 사용하는 unwrap 추가. 후속 phase 에서 검토. + +**cross-link**: `tasks/p10/p10-1b-py-ts-js-ast-chunkers.md` Risks / notes 섹션. 
+ ## 2026-05-19 — p10-1A-2: AST_CHUNK_MAX_LINES constant vs config deviation **무엇이 바뀌었나**: `kebab-chunk/src/code_rust_ast_v1.rs` 가 `IngestCodeCfg.ast_chunk_max_lines` config 값을 읽지 않고 모듈 상수 `AST_CHUNK_MAX_LINES = 200` 으로 고정함. diff --git a/tasks/INDEX.md b/tasks/INDEX.md index 4f39c79..549b403 100644 --- a/tasks/INDEX.md +++ b/tasks/INDEX.md @@ -140,8 +140,8 @@ P0~P5 는 직렬. P6~P9 는 P5 이후 병렬 가능. - P10 — [p10/](p10/) — code ingest (multi-task, sub-indexed in [p10/INDEX.md](p10/INDEX.md)) - [p10-1A-1 code ingest framework](p10/p10-1a-1-code-ingest-framework.md) — ✅ 머지 - - [p10-1A-2 Rust AST chunker](p10/p10-1a-2-rust-ast-chunker.md) — 🟡 PR 오픈 (코드 완성, 머지 대기) - - p10-1B Python + TS/JS AST chunkers — ⏳ + - [p10-1A-2 Rust AST chunker](p10/p10-1a-2-rust-ast-chunker.md) — ✅ 머지 + - [p10-1B Python + TS/JS AST chunkers](p10/p10-1b-py-ts-js-ast-chunkers.md) — 🟡 PR 오픈 (코드 완성, 머지 대기) - p10-1C Go + Java + Kotlin AST chunkers — ⏳ - p10-1D C + C++ AST chunkers — ⏳ - p10-2 Tier 2 resource-aware — ⏳ diff --git a/tasks/p10/INDEX.md b/tasks/p10/INDEX.md index db727e7..2e389f5 100644 --- a/tasks/p10/INDEX.md +++ b/tasks/p10/INDEX.md @@ -3,8 +3,8 @@ | ID | Subject | Status | |----|---------|--------| | 1A-1 | code ingest framework (wire schema, parse-code crate skeleton, filter flags, skip policy, config 절) | ✅ 머지 | -| 1A-2 | Rust AST chunker | 🟡 PR 오픈 (코드 완성, 머지 대기) | -| 1B | Python + TS/JS AST chunkers | ⏳ | +| 1A-2 | Rust AST chunker | ✅ 머지 | +| 1B | Python + TS/JS AST chunkers | 🟡 PR 오픈 (코드 완성, 머지 대기) | | 1C | Go + Java + Kotlin AST chunkers | ⏳ | | 1D | C + C++ AST chunkers | ⏳ | | 2 | Tier 2 resource-aware (k8s / Dockerfile / manifest) | ⏳ | diff --git a/tasks/p10/p10-1b-py-ts-js-ast-chunkers.md b/tasks/p10/p10-1b-py-ts-js-ast-chunkers.md new file mode 100644 index 0000000..8437531 --- /dev/null +++ b/tasks/p10/p10-1b-py-ts-js-ast-chunkers.md @@ -0,0 +1,60 @@ +# p10-1B — Python + TS/JS AST chunkers + +**Status:** 🟡 진행 중 +**Contract sections:** §3.3 (chunker_version `code-python-ast-v1` 
/ `code-ts-ast-v1` / `code-js-ast-v1`), §3.4 (symbol path — Python `pkg.module.Class.method`, TS/JS `module/Class.method` / `module/default`), §3.5 (code_lang `python` / `typescript` / `javascript`), §5 (확장자 라우팅 활성화), §6.1 (`kebab-parse-code/src/{python,typescript,javascript}.rs`), §6.2 (`kebab-chunk/src/code_{python,ts,js}_ast_v1.rs`), §9.1 (Tier 1 AST per-language + oversize fallback). +**Design:** [2026-05-15-kebab-code-ingest-design.md](../../docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md) §1B. +**Plan:** [2026-05-20-p10-1b-py-ts-js-ast-chunkers.md](../../docs/superpowers/plans/2026-05-20-p10-1b-py-ts-js-ast-chunkers.md). + +## Goal + +1A-2 가 깐 인프라 (`SourceSpan::Code`, `MediaType::Code(String)`, `Citation::Code` 매핑, `citation_helper` arm, `backfill_code_lang` + `backfill_repo`, `schema.v1.code_lang_breakdown`, `[ingest.code]` 절, HOTFIXES) 위에 **Python + TypeScript + JavaScript** 3 언어의 extractor + chunker 를 활성화. design §1B 묶음과 일치하는 단일 PR. 머지 시점부터 Python / TS / JS 프로젝트도 dogfooding 가능. + +## 동결된 설계 결정 (이 task 로 확정) + +- **Symbol path 의 module prefix = workspace 경로 → module path 변환** (design §3.4 예시 충실, 사용자 명시 결정): + - **Python**: `crates/x/src/foo/bar.py` 같은 workspace_path 를 `/`/`__init__.py` 처리 + `.py`·`.pyi` strip + `/` → `.` 변환 후 dotted prefix 로 사용. 예시: `kebab_eval/metrics.py` 의 `def compute_mrr()` → symbol `kebab_eval.metrics.compute_mrr`. `pkg/__init__.py` 는 module `pkg` 자체. 변환은 `kebab-parse-code::lang::module_path_for_python(workspace_path)` 단일 함수 (source of truth). + - **TS/JS**: `src/search/retriever/Retriever.ts` → `src/search/retriever/Retriever` prefix + `/` 구분자 보존 + `.ts`/`.tsx`/`.js`/`.jsx`/`.mjs`/`.cjs` strip. 예시: `src/search/retriever/Retriever.ts` 의 method `search` → `src/search/retriever/Retriever.search`. `module/default` 는 `export default function/class` 경우. 변환은 `module_path_for_tsjs(workspace_path)`. + - **Rust 1A-2 는 retrofit 하지 않음** — 1A 는 file-scope nesting 만 사용 (workspace prefix 없음). 
비일관 수용; HOTFIXES 2026-05-20 에 기록 + 사용자가 명시 요청 시 retrofit (chunker_version bump + re-ingest cascade 필요). +- **TypeScript grammar selection**: `tree-sitter-typescript` crate 의 `LANGUAGE_TYPESCRIPT` 는 `.ts`, `LANGUAGE_TSX` 는 `.tsx` 에 사용. 파일 확장자로 선택. `code-ts-ast-v1` 하나의 chunker 가 둘 다 처리 (parser_version `code-ts-v1`). +- **JavaScript grammar**: `tree-sitter-javascript` 단일 LanguageFn 가 `.js` / `.mjs` / `.cjs` / `.jsx` 모두 처리. 별도 분기 불필요. +- **Expression-level 함수 (arrow fn / function expression assigned to const)**: 1B 1차에서는 *declaration-level 만* unit (function_declaration / class_declaration / method_definition / interface_declaration / type_alias_declaration / decorated_definition 등). `const foo = () => {...}` 같은 expression-level 은 glue 로 잡힘. HOTFIXES 2026-05-20 기록; 후속 phase 에서 lexical_declaration 안의 함수 표현식 unwrap 추가 검토. +- **App dispatch 일반화**: 현재 `ingest_one_code_asset` 은 RustAstExtractor + CodeRustAstV1Chunker 하드코딩. 1B 에서 `lang: &str` 받아 dispatch (Rust 도 동일 함수로 흡수) — Extractor 와 Chunker 를 trait object 가 아니라 enum/match 로 선택 (kebab-app 만 변경, kebab-core/Chunker trait 불변). frozen design 영향 없음. +- frozen design 자체는 변경 없음 (§3.4 의 symbol path 예시는 이미 본 결정과 일치). §10.1 (post-merge surface) 에 1B 활성화 한 줄 추가. + +## Acceptance criteria + +- `cargo test --workspace --no-fail-fast -j 1` passes (메모리 의식적으로는 per-crate; full-suite gate 는 plan 의 Task M Step 2 에서 1회). +- `cargo clippy --workspace --all-targets -- -D warnings` passes. +- 3 언어 각각의 fixture (`tests/fixtures/sample.{py,ts,js}`) ingest → chunk snapshot 안정 + `Citation::Code` 의 symbol/line 이 §3.4 컨벤션 (workspace path → module path) 과 일치. +- 격리 TempDir KB 에 Python/TS/JS 파일 하나씩 두고 `kebab search --code-lang {python|typescript|javascript} --json` 가 정상 결과 반환. +- `kebab schema --json | jq .stats.code_lang_breakdown` 에 `python`, `typescript`, `javascript` 카운트 등장. +- README + HANDOFF + ARCHITECTURE + SMOKE + tasks/INDEX + tasks/p10/INDEX 갱신. +- frozen design §10.1 한 줄 추가 (1B 활성화). 
+- HOTFIXES 2026-05-20 에 (a) Rust 1A-2 symbol path 비일관 (1B 와 다름), (b) expression-level 함수 단위 제외 — 두 편차 기록. +- workspace `Cargo.toml` minor bump (0.7.0 → 0.8.0) — 도그푸딩 가능 surface 확장. + +## Allowed dependencies + +- `kebab-parse-code` 에 `tree-sitter-python`, `tree-sitter-typescript`, `tree-sitter-javascript` 추가 (workspace deps 경유). 기존 `kebab-core` / `anyhow` / `gix` / `tree-sitter` / `tree-sitter-rust` / `serde_json` / `time` / `tracing` 유지. +- `kebab-chunk` 의 새 모듈 3개 (`code_python_ast_v1.rs` / `code_ts_ast_v1.rs` / `code_js_ast_v1.rs`) — 1A-2 chunker 와 동일 dep (kebab-core + serde_json_canonicalizer + blake3 + anyhow + tracing). tree-sitter 절대 import 금지. +- `kebab-app` 변경 — 새 crate dep 없음. +- `kebab-source-fs` — 확장자 추가만, 새 dep 없음. + +## Forbidden dependencies + +- `kebab-chunk` 가 `tree-sitter-*` 직접 import 금지 (AST 는 parser-side). +- UI crate (cli / mcp / tui) 가 `kebab-parse-code` 직접 import 금지. +- `kebab-parse-code` 가 store / embed / llm / rag 직접 import 금지 (design §8 inheritance). + +## Risks / notes + +- tree-sitter-typescript 의 `LANGUAGE_TYPESCRIPT` 와 `LANGUAGE_TSX` 가 별도 LanguageFn — 잘못 선택하면 TSX JSX 가 parse 실패. 파일 확장자 기반 선택을 단일 함수에서 결정 (테스트로 고정). +- tree-sitter-python 의 `decorated_definition` 노드 처리 — 데코레이터가 wrap 하는 형태라 `function_definition` / `class_definition` 가 child. unwrap 필요 (decorator 라인은 unit_start backward extension 으로 자연스럽게 포함됨). +- Python `pkg/__init__.py` 의 module path = `pkg` 자체 (basename 제거). `module_path_for_python` 가 이걸 처리. +- TS/JS 의 `export default function/class` — name 이 없을 수 있음 (`export default function () {...}`). symbol `module/default` 로 표기 (design §3.4). +- `module_path_for_python` / `module_path_for_tsjs` 가 workspace_path 의 비-ASCII / 공백 / 특수문자 처리 필요. 1B 1차에서는 그대로 전달 (sanitize 없음); HOTFIXES 에 path-sanitize 부재 기록. +- 1A-2 `ingest_one_code_asset` 일반화로 인한 dispatch 코드 변경 — Rust 기존 동작 byte-identical 유지를 통합 테스트로 확인. +- 머지 후 deviation 은 `tasks/HOTFIXES.md` 에 dated 로그 + 본 spec `Risks / notes` 에 one-line cross-link. 
+- **[HOTFIXES 2026-05-20]** Rust 1A-2 symbol 은 file-scope nesting 만 (workspace prefix 없음); 1B 의 Python/TypeScript/JavaScript 와 비일관 — retrofit 은 사용자 명시 요청 시. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-20, "Rust 1A-2 symbol path"). +- **[HOTFIXES 2026-05-20]** TypeScript/JavaScript 의 expression-level 함수 (`const foo = () => {}` 등) 는 `` glue 로 처리됨, 독립 unit 미방출 — 후속 phase 에서 `lexical_declaration` unwrap 검토. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-20, "expression-level functions"). +- **[HOTFIXES 2026-05-20]** `module_path_for_python` / `module_path_for_tsjs` 가 path-sanitize 안 함 (특수문자/공백 그대로 prefix 에 들어감) — 후속 phase 에서 NFKC + 사용금지 문자 변환 검토. 자세한 내용: `tasks/HOTFIXES.md` (2026-05-20, "module_path_for_python / _tsjs do not sanitize").