diff --git a/Cargo.lock b/Cargo.lock index 34e4a91..73f9b16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4127,7 +4127,7 @@ dependencies = [ [[package]] name = "kebab-app" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "base64 0.22.1", @@ -4172,7 +4172,7 @@ dependencies = [ [[package]] name = "kebab-chunk" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "blake3", @@ -4188,7 +4188,7 @@ dependencies = [ [[package]] name = "kebab-cli" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "clap", @@ -4209,7 +4209,7 @@ dependencies = [ [[package]] name = "kebab-config" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "dirs 5.0.1", @@ -4224,7 +4224,7 @@ dependencies = [ [[package]] name = "kebab-core" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "blake3", @@ -4238,7 +4238,7 @@ dependencies = [ [[package]] name = "kebab-embed" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "blake3", @@ -4252,7 +4252,7 @@ dependencies = [ [[package]] name = "kebab-embed-local" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "fastembed", @@ -4265,7 +4265,7 @@ dependencies = [ [[package]] name = "kebab-eval" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "kebab-app", @@ -4284,7 +4284,7 @@ dependencies = [ [[package]] name = "kebab-llm" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "kebab-core", @@ -4293,7 +4293,7 @@ dependencies = [ [[package]] name = "kebab-llm-local" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "kebab-config", @@ -4310,7 +4310,7 @@ dependencies = [ [[package]] name = "kebab-mcp" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "kebab-app", @@ -4328,7 +4328,7 @@ dependencies = [ [[package]] name = "kebab-normalize" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "kebab-core", @@ -4343,7 +4343,7 @@ dependencies = [ [[package]] name = 
"kebab-parse-code" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "gix", @@ -4353,6 +4353,8 @@ dependencies = [ "time", "tracing", "tree-sitter", + "tree-sitter-c", + "tree-sitter-cpp", "tree-sitter-go", "tree-sitter-java", "tree-sitter-javascript", @@ -4364,7 +4366,7 @@ dependencies = [ [[package]] name = "kebab-parse-image" -version = "0.15.0" +version = "0.16.0" dependencies = [ "ab_glyph", "anyhow", @@ -4388,7 +4390,7 @@ dependencies = [ [[package]] name = "kebab-parse-md" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "kebab-core", @@ -4405,7 +4407,7 @@ dependencies = [ [[package]] name = "kebab-parse-pdf" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "blake3", @@ -4418,7 +4420,7 @@ dependencies = [ [[package]] name = "kebab-parse-types" -version = "0.15.0" +version = "0.16.0" dependencies = [ "kebab-core", "serde", @@ -4426,7 +4428,7 @@ dependencies = [ [[package]] name = "kebab-rag" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "blake3", @@ -4447,7 +4449,7 @@ dependencies = [ [[package]] name = "kebab-search" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "globset", @@ -4466,7 +4468,7 @@ dependencies = [ [[package]] name = "kebab-source-fs" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "blake3", @@ -4485,7 +4487,7 @@ dependencies = [ [[package]] name = "kebab-store-sqlite" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "blake3", @@ -4506,7 +4508,7 @@ dependencies = [ [[package]] name = "kebab-store-vector" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "arrow", @@ -4530,7 +4532,7 @@ dependencies = [ [[package]] name = "kebab-tui" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "crossterm", @@ -8531,6 +8533,26 @@ dependencies = [ "tree-sitter-language", ] +[[package]] +name = "tree-sitter-c" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "a9b2eb57a55fed6b00812912e730b7a275cf4fe98bfd6a5d76263d4438371728" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-cpp" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df2196ea9d47b4ab4a31b9297eaa5a5d19a0b121dceb9f118f6790ad0ab94743" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "tree-sitter-go" version = "0.25.0" diff --git a/Cargo.toml b/Cargo.toml index 571d7ff..3a5c1d5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,7 @@ edition = "2024" rust-version = "1.85" license = "MIT OR Apache-2.0" repository = "https://github.com/altair823/kebab" -version = "0.15.0" +version = "0.16.0" [workspace.dependencies] anyhow = "1" @@ -99,6 +99,9 @@ tree-sitter-go = "0.25.0" # JVM family grammars for code ingest (kebab-parse-code, p10-1C-JK). tree-sitter-java = "0.23.5" tree-sitter-kotlin-ng = "1.1.0" # bare tree-sitter-kotlin requires ts <0.23; -ng uses tree-sitter-language 0.1 (ts 0.26 compat) +# C/C++ family grammars for code ingest (kebab-parse-code, p10-1D). +tree-sitter-c = "0.24.2" +tree-sitter-cpp = "0.23.4" # Disk-footprint trim for dev / test builds. Codegen, opt-level, and # behavior are unchanged — only DWARF debug info is reduced (line diff --git a/HANDOFF.md b/HANDOFF.md index bb8338b..386bf0f 100644 --- a/HANDOFF.md +++ b/HANDOFF.md @@ -4,7 +4,7 @@ ## 한 줄 요약 -P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) 머지 완료. `kebab ingest` 가 markdown / image / PDF / 소스코드 (Rust / Python / TS / JS / Go / Java / Kotlin) / Tier 2 리소스 파일 (yaml/k8s / dockerfile / toml / json / xml / groovy / go-mod) + Tier 3 paragraph fallback (shell / 비-k8s YAML / AST 실패 케이스) 처리. `kebab search` / `kebab ask` 가 매체 가로질러 결과 + page / code citation 반환. `kebab tui` 가 4 패널 (Library + Search + Ask + Inspect) 제공. P10-3 (Tier 3 paragraph fallback) 완료 — 다음 후보 = P10-1D (C/C++) 또는 P9-5 (desktop tauri) 또는 보류 중인 P8 (audio). 
+P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) 머지 완료. `kebab ingest` 가 markdown / image / PDF / 소스코드 (Rust / Python / TS / JS / Go / Java / Kotlin) / Tier 2 리소스 파일 (yaml/k8s / dockerfile / toml / json / xml / groovy / go-mod) + Tier 3 paragraph fallback (shell / 비-k8s YAML / AST 실패 케이스) 처리. `kebab search` / `kebab ask` 가 매체 가로질러 결과 + page / code citation 반환. `kebab tui` 가 4 패널 (Library + Search + Ask + Inspect) 제공. P10-3 (Tier 3 paragraph fallback) 완료. P10-1D (C + C++) 완료로 Tier 1 chunker family 마무리 — 다음 후보 = P9-5 (desktop tauri) 또는 보류 중인 P8 (audio). ## Phase 로드맵 @@ -20,7 +20,7 @@ P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) 머지 완료. | **P7** | PDF text + page citation | `kebab-parse-pdf` | P5 | ✅ 완료 (3/3 component, page-level chunker + ingest wiring) | | **P8** | 음성 transcription + timestamp citation | `kebab-parse-audio` | P5 | ⏸ 보류 (whisper-rs 시스템 dep brainstorm 필요) | | **P9** | TUI + desktop app | `kebab-tui`, `kebab-desktop` | P5 | 🟡 진행 (4/5 component — P9-1/2/3/4 완료 [Library / Search / Ask / Inspect], P9-5 desktop 예정 · 도그푸딩 피드백 **20/20 ✅**) | -| **P10** | code ingest framework | `kebab-parse-code` | P5 | 🟡 진행 중 — 1A-1 ✅ (wire schema + parse-code skeleton + filter flags), 1A-2 ✅ (Rust AST chunker, `code-rust-ast-v1` — v0.7.0), 1B ✅ (Python/TS/JS AST chunkers — v0.8.0 이후), **1C-Go ✅ (Go AST chunker, `code-go-ast-v1` — v0.12.0)**, **1C-JavaKotlin ✅ (Java + Kotlin AST chunkers, `code-java-ast-v1` / `code-kotlin-ast-v1` — v0.13.0)**, **2 ✅ (Tier 2 resource-aware: yaml/k8s + dockerfile + manifest, `k8s-manifest-resource-v1` / `dockerfile-file-v1` / `manifest-file-v1` — v0.14.0)**, **3 ✅ (Tier 3 paragraph fallback: code-text-paragraph-v1 — v0.15.0)** | +| **P10** | code ingest framework | `kebab-parse-code` | P5 | 🟡 진행 중 — 1A-1 ✅ (wire schema + parse-code skeleton + filter flags), 1A-2 ✅ (Rust AST chunker, `code-rust-ast-v1` — v0.7.0), 1B ✅ (Python/TS/JS AST chunkers — v0.8.0 이후), **1C-Go ✅ (Go AST chunker, `code-go-ast-v1` — 
v0.12.0)**, **1C-JavaKotlin ✅ (Java + Kotlin AST chunkers, `code-java-ast-v1` / `code-kotlin-ast-v1` — v0.13.0)**, **2 ✅ (Tier 2 resource-aware: yaml/k8s + dockerfile + manifest, `k8s-manifest-resource-v1` / `dockerfile-file-v1` / `manifest-file-v1` — v0.14.0)**, **3 ✅ (Tier 3 paragraph fallback: code-text-paragraph-v1 — v0.15.0)**, **1D ✅ (C + C++ AST chunkers, code-c-ast-v1 + code-cpp-ast-v1 — v0.16.0)** | P0~P5 직렬. P6~P9 P5 이후 병렬 가능. diff --git a/README.md b/README.md index 828ac89..1455e14 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ kebab doctor | 명령 | 동작 | |------|------| | `kebab init` | XDG 경로에 데이터 디렉토리 + config.toml 생성 | -| `kebab ingest []` | Markdown / 이미지 / PDF / Rust 소스코드 색인 (idempotent). TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit. Markdown title 이 frontmatter 에 없어도 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (parser_version `md-frontmatter-v2`) — 기존 색인된 doc 도 다음 ingest 에서 새 title 로 갱신. **Incremental** (p9-fb-23): 두 번째 이후의 ingest 는 변하지 않은 doc (blake3 + parser/chunker/embedder version 모두 동일) 의 parse/chunk/embed/vector upsert 를 자동 스킵. final summary 에 `N unchanged` 카운트 표시. `--force-reingest` 로 skip 무시 강제 재처리. 
**지원 형식** (extractor 자동 결정 — config 에 명시 불가): Markdown (`.md`), 이미지 (`.png` / `.jpg` / `.jpeg`, OCR + caption), PDF (`.pdf`), **소스코드** (`.rs` → `code-rust-ast-v1`, `.py` → `code-python-ast-v1`, `.ts`/`.tsx` → `code-ts-ast-v1`, `.js`/`.mjs`/`.cjs`/`.jsx` → `code-js-ast-v1`, `.go` → `code-go-ast-v1`, `.java` → `code-java-ast-v1`, `.kt`/`.kts` → `code-kotlin-ast-v1` — 모두 tree-sitter AST chunker; **Tier 2 리소스 파일**: `.yaml`/`.yml` → `k8s-manifest-resource-v1` (apiVersion+kind 파싱), `Dockerfile`/`Dockerfile.*`/`*.dockerfile` → `dockerfile-file-v1` (전체 파일), `Cargo.toml`/`pyproject.toml`/`.toml`/`package.json`/`tsconfig.json`/`.json`/`pom.xml`/`.xml`/`build.gradle`/`.gradle`/`go.mod` → `manifest-file-v1` (전체 파일) — yaml (k8s) / dockerfile / toml / json / xml / groovy / go-mod 지원); **Tier 3 paragraph fallback** (`.sh`/`.bash`/`.zsh` → `code-text-paragraph-v1`, blank-line paragraph split + 80-line/20-overlap line-window. Tier 1/2 가 0 chunk 또는 Err 시 자동 fallback — 비-k8s YAML 같은 케이스 picked up. symbol = None, lang 은 원본 보존.). 다른 확장자는 자동 skip — `IngestItem.warnings` 에 사유 (`"unsupported media type: .docx"` 등), `IngestReport.skipped_by_extension` 에 카운트 분류, CLI / TUI summary 에 breakdown 표시. 코드 chunk 는 `citation.kind = "code"` 에 `citation.lang = ""` + `symbol` + line range 를 담고, SearchHit top-level 에 `code_lang` + `repo` (`.git/` walk-up 의 디렉토리 이름) 가 backfill 됨. `--code-lang rust` / `--code-lang python` / `--code-lang typescript` / `--code-lang javascript` / `--code-lang go` / `--code-lang java` / `--code-lang kotlin` / `--code-lang yaml` / `--code-lang dockerfile` / `--code-lang toml` / `--code-lang json` / `--code-lang xml` / `--code-lang groovy` / `--code-lang go-mod` / `--code-lang shell` / `--media code` filter 로 언어별·코드 전용 검색 가능 (p10-1A-1 filter flags). 
Python symbol 은 workspace 경로 → dotted module path prefix (예: `kebab_eval.metrics.compute_mrr`), TS/JS symbol 은 slash-style module path prefix (예: `src/Foo.Foo.search`), Go symbol 은 `package.Func` / `package.(*Receiver).Method` 형식, Java / Kotlin symbol 은 `com.foo.Foo.bar` 형식 (패키지 + 클래스 + 메서드/필드). | +| `kebab ingest []` | Markdown / 이미지 / PDF / Rust 소스코드 색인 (idempotent). TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit. Markdown title 이 frontmatter 에 없어도 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (parser_version `md-frontmatter-v2`) — 기존 색인된 doc 도 다음 ingest 에서 새 title 로 갱신. **Incremental** (p9-fb-23): 두 번째 이후의 ingest 는 변하지 않은 doc (blake3 + parser/chunker/embedder version 모두 동일) 의 parse/chunk/embed/vector upsert 를 자동 스킵. final summary 에 `N unchanged` 카운트 표시. `--force-reingest` 로 skip 무시 강제 재처리. **지원 형식** (extractor 자동 결정 — config 에 명시 불가): Markdown (`.md`), 이미지 (`.png` / `.jpg` / `.jpeg`, OCR + caption), PDF (`.pdf`), **소스코드** (`.rs` → `code-rust-ast-v1`, `.py` → `code-python-ast-v1`, `.ts`/`.tsx` → `code-ts-ast-v1`, `.js`/`.mjs`/`.cjs`/`.jsx` → `code-js-ast-v1`, `.go` → `code-go-ast-v1`, `.java` → `code-java-ast-v1`, `.kt`/`.kts` → `code-kotlin-ast-v1`, `.c`/`.h` → `code-c-ast-v1`, `.cpp`/`.cc`/`.cxx`/`.hpp`/`.hh`/`.hxx` → `code-cpp-ast-v1` — 모두 tree-sitter AST chunker; **Tier 2 리소스 파일**: `.yaml`/`.yml` → `k8s-manifest-resource-v1` (apiVersion+kind 파싱), `Dockerfile`/`Dockerfile.*`/`*.dockerfile` → `dockerfile-file-v1` (전체 파일), `Cargo.toml`/`pyproject.toml`/`.toml`/`package.json`/`tsconfig.json`/`.json`/`pom.xml`/`.xml`/`build.gradle`/`.gradle`/`go.mod` → `manifest-file-v1` (전체 파일) — yaml (k8s) / dockerfile / toml / json / xml / groovy / go-mod 지원); **Tier 3 paragraph fallback** (`.sh`/`.bash`/`.zsh` → `code-text-paragraph-v1`, blank-line paragraph split + 80-line/20-overlap line-window. 
Tier 1/2 가 0 chunk 또는 Err 시 자동 fallback — 비-k8s YAML 같은 케이스 picked up. symbol = None, lang 은 원본 보존.). 다른 확장자는 자동 skip — `IngestItem.warnings` 에 사유 (`"unsupported media type: .docx"` 등), `IngestReport.skipped_by_extension` 에 카운트 분류, CLI / TUI summary 에 breakdown 표시. 코드 chunk 는 `citation.kind = "code"` 에 `citation.lang = ""` + `symbol` + line range 를 담고, SearchHit top-level 에 `code_lang` + `repo` (`.git/` walk-up 의 디렉토리 이름) 가 backfill 됨. `--code-lang rust` / `--code-lang python` / `--code-lang typescript` / `--code-lang javascript` / `--code-lang go` / `--code-lang java` / `--code-lang kotlin` / `--code-lang yaml` / `--code-lang dockerfile` / `--code-lang toml` / `--code-lang json` / `--code-lang xml` / `--code-lang groovy` / `--code-lang go-mod` / `--code-lang shell` / `--code-lang c` / `--code-lang cpp` / `--media code` filter 로 언어별·코드 전용 검색 가능 (p10-1A-1 filter flags). Python symbol 은 workspace 경로 → dotted module path prefix (예: `kebab_eval.metrics.compute_mrr`), TS/JS symbol 은 slash-style module path prefix (예: `src/Foo.Foo.search`), Go symbol 은 `package.Func` / `package.(*Receiver).Method` 형식, Java / Kotlin symbol 은 `com.foo.Foo.bar` 형식 (패키지 + 클래스 + 메서드/필드). | | `kebab search --mode {lexical,vector,hybrid} "" [--no-cache] [--max-tokens N] [--snippet-chars N] [--cursor ] [--tag T] [--lang L] [--path-glob G] [--trust-min LEVEL] [--media TYPE] [--ingested-after RFC3339] [--doc-id ID] [--trace] [--bulk] [--repo NAME ...] [--code-lang LIST]` | 검색. hybrid는 RRF fusion, citation 포함. 같은 process 안에서 동일 query (NFKC + trim + lowercase 정규화) 반복 시 in-process LRU 캐시 hit (capacity = `[search] cache_capacity`, default 256). `--no-cache` 로 강제 bypass — 디버깅용. ingest commit 발생 시 `kv['corpus_revision']` bump 으로 모든 entry 자동 stale. **`--max-tokens` / `--snippet-chars` / `--cursor` (p9-fb-34)** — agent budget controls. `--json` 출력은 `search_response.v1` wrapper (`{hits, next_cursor, truncated}`) — pre-fb-34 의 bare array 와 호환 안 됨. mismatched cursor → `error.v1.code = stale_cursor`. 
**filter flags (p9-fb-36):** `--tag` 는 반복 가능 flag (`--tag rust --tag async`) 로 OR 매칭, `--media` 는 `,` 구분 다중 값 OR 매칭, 나머지 flags 간은 AND 조합. `--trust-min` 은 `primary\|secondary\|generated` 중 하나 (해당 level 이상 포함). `--ingested-after` 는 RFC3339 UTC — 파싱 실패 시 `error.v1.code = config_invalid` (exit 2). `--media md` 는 `markdown` alias 로 정규화. 알 수 없는 `--media` 값은 무조건 empty hits (오류 아님). **`--trace` (p9-fb-37)** — `search_response.v1.trace` 에 lexical / vector pre-fusion 후보 + RRF union + per-stage timing (`lexical_ms` / `vector_ms` / `fusion_ms` / `total_ms`) 노출. trace 요청은 캐시 우회 (`--no-cache` 없이도 항상 cold). **`--bulk` (p9-fb-42)** — stdin ndjson 으로 N query 한 번에 실행. `--json` 면 stdout per-query ndjson (`bulk_search_item.v1`) + stderr summary (`bulk_summary: total=N succeeded=S failed=F`). Cap 100. agent 가 query decomposition 후 sub-query 일괄 실행 시 single round-trip — App instance 재사용으로 캐시 / embedder cold-start 비용 한 번만. Per-query failure 는 item 의 `error` (error.v1) 에 격리, 다른 query 계속 진행. **code corpus filters (p10-1A-1):** `--repo` 는 반복 가능 (`--repo kebab --repo other`) OR 매칭. `--code-lang` 는 반복 또는 comma 다중 값 (`--code-lang rust,python`), 알 수 없는 값은 빈 hits. `--media code` 는 Tier 1/2/3 모든 code chunk 포함. 1A-1 시점에서는 indexed 된 code chunk 가 없어 filter 가 항상 빈 결과 — 1A-2 (Rust AST chunker) 머지 이후 실효. 
| | `kebab list docs` | 색인된 문서 목록 | | `kebab inspect doc ` / `kebab inspect chunk ` | raw record 보기 | @@ -132,7 +132,7 @@ flowchart TB subgraph Pipeline["도메인 + 파이프라인"] parse["parse-md / parse-pdf / parse-image / parse-code"] - chunker["chunker (md-heading-v1, pdf-page-v1, code-{rust,python,ts,js,go,java,kotlin}-ast-v1, k8s-manifest-resource-v1, dockerfile-file-v1, manifest-file-v1, code-text-paragraph-v1)"] + chunker["chunker (md-heading-v1, pdf-page-v1, code-{rust,python,ts,js,go,java,kotlin,c,cpp}-ast-v1, k8s-manifest-resource-v1, dockerfile-file-v1, manifest-file-v1, code-text-paragraph-v1)"] embedder["embedder (fastembed multilingual-e5-large)"] retriever["retriever (lexical / vector / hybrid RRF)"] rag["RAG pipeline"] diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 907d93b..37013e3 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -39,7 +39,7 @@ use std::sync::Arc; use anyhow::{Context, anyhow}; use serde::{Deserialize, Serialize}; -use kebab_chunk::{CodeGoAstV1Chunker, CodeJavaAstV1Chunker, CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTextParagraphV1Chunker, CodeTsAstV1Chunker, DockerfileFileV1Chunker, K8sManifestResourceV1Chunker, ManifestFileV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; +use kebab_chunk::{CodeCAstV1Chunker, CodeCppAstV1Chunker, CodeGoAstV1Chunker, CodeJavaAstV1Chunker, CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTextParagraphV1Chunker, CodeTsAstV1Chunker, DockerfileFileV1Chunker, K8sManifestResourceV1Chunker, ManifestFileV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; use kebab_core::{ Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker, DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, @@ -50,7 +50,7 @@ use kebab_core::{ use kebab_llm_local::OllamaLanguageModel; use kebab_normalize::build_canonical_document; use 
kebab_parse_image::{ImageExtractor, OllamaVisionOcr, apply_caption, apply_ocr}; -use kebab_parse_code::{GoAstExtractor, JavaAstExtractor, JavascriptAstExtractor, KotlinAstExtractor, PythonAstExtractor, RustAstExtractor, TypescriptAstExtractor}; +use kebab_parse_code::{CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor, JavascriptAstExtractor, KotlinAstExtractor, PythonAstExtractor, RustAstExtractor, TypescriptAstExtractor}; use kebab_parse_pdf::PdfTextExtractor; use kebab_parse_md::{BodyHints, parse_blocks, parse_frontmatter}; use kebab_source_fs::FsSourceConnector; @@ -795,6 +795,7 @@ fn try_skip_unchanged( current_chunker_version: &ChunkerVersion, current_embedding_version: Option<&kebab_core::EmbeddingVersion>, force_reingest: bool, + fallback_chunker_version: Option<&ChunkerVersion>, // p10-3 fix ) -> anyhow::Result> { if force_reingest { return Ok(None); @@ -829,6 +830,50 @@ fn try_skip_unchanged( if existing_doc.source_asset_id != asset.asset_id { return Ok(None); } + // p10-3 fix: detect "stored doc was previously Tier 3 fallback". + // When a Tier 1/2 extractor emits empty chunks, the fallback wrapper + // retries with CodeTextParagraphV1Chunker and stores + // last_chunker_version = "code-text-paragraph-v1" + parser_version = "none-v1". + // On the next ingest the caller computes current_parser_version / + // current_chunker_version from the Tier 1/2 dispatch (e.g. + // "k8s-manifest-resource-v1"), which can never match the stored + // fallback values, causing spurious re-ingests. Detect this case + // and bypass the parser/chunker equality checks — only the embedder + // version still must match. + let stored_is_tier3_fallback = fallback_chunker_version.is_some_and(|fbv| { + existing_doc.last_chunker_version.as_ref() == Some(fbv) + && existing_doc.parser_version.0 == "none-v1" + }); + + if stored_is_tier3_fallback { + // Embedder version still must match. 
+ let embedder_match = existing_doc.last_embedding_version.as_ref() + == current_embedding_version; + if !embedder_match { + return Ok(None); + } + let candidate_doc_id = existing_doc.doc_id.clone(); + tracing::debug!( + target: "kebab-app::ingest", + path = %asset.workspace_path.0, + doc_id = %candidate_doc_id.0, + "skip-unchanged: tier 3 fallback state detected; bypassing parser/chunker equality" + ); + return Ok(Some(kebab_core::IngestItem { + kind: kebab_core::IngestItemKind::Unchanged, + doc_id: Some(candidate_doc_id), + doc_path: asset.workspace_path.clone(), + asset_id: Some(asset.asset_id.clone()), + byte_len: Some(asset.byte_len), + block_count: u32::try_from(existing_doc.blocks.len()).ok(), + chunk_count: None, + parser_version: Some(existing_doc.parser_version.clone()), + chunker_version: existing_doc.last_chunker_version.clone(), + warnings: Vec::new(), + error: None, + })); + } + // 2. Parser unchanged: parser_version is baked into id_for_doc so // a version bump yields a different doc_id and the row above // would have been missing. Checking here explicitly keeps the @@ -948,12 +993,12 @@ fn ingest_one_asset( force_reingest, ); } - // p10-1A-2 / 1B: code ingest dispatch. p10-2: Tier 2 langs added. p10-3: shell added. + // p10-1A-2 / 1B: code ingest dispatch. p10-2: Tier 2 langs added. p10-3: shell added. p10-1D: c/cpp added. MediaType::Code(lang) if matches!(lang.as_str(), "rust" | "python" | "typescript" | "javascript" | "go" | "java" | "kotlin" | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" - | "shell") => + | "shell" | "c" | "cpp") => { return ingest_one_code_asset( app, @@ -1017,6 +1062,7 @@ fn ingest_one_asset( &MdHeadingV1Chunker.chunker_version(), embedder.map(|e| e.model_version()).as_ref(), force_reingest, + None, )? { return Ok(item); } @@ -1211,6 +1257,7 @@ fn ingest_one_image_asset( &MdHeadingV1Chunker.chunker_version(), embedder.map(|e| e.model_version()).as_ref(), force_reingest, + None, )? 
{ return Ok(item); } @@ -1657,6 +1704,7 @@ fn ingest_one_pdf_asset( &PdfPageV1Chunker.chunker_version(), embedder.map(|e| e.model_version()).as_ref(), force_reingest, + None, )? { return Ok(item); } @@ -1838,6 +1886,9 @@ fn ingest_one_code_asset( => ParserVersion("none-v1".to_string()), // p10-3: shell direct routes to Tier 3 (no parse step). "shell" => ParserVersion("none-v1".to_string()), + // p10-1D: C + C++ AST extractors. + "c" => ParserVersion(kebab_parse_code::C_PARSER_VERSION.to_string()), + "cpp" => ParserVersion(kebab_parse_code::CPP_PARSER_VERSION.to_string()), other => anyhow::bail!("unsupported code_lang: {other}"), }; @@ -1857,9 +1908,24 @@ fn ingest_one_code_asset( => ManifestFileV1Chunker.chunker_version(), // p10-3: "shell" => CodeTextParagraphV1Chunker.chunker_version(), + // p10-1D: C + C++ AST chunkers. + "c" => CodeCAstV1Chunker.chunker_version(), + "cpp" => CodeCppAstV1Chunker.chunker_version(), other => anyhow::bail!("unreachable chunker_version: {other}"), }; + // p10-3 fix: if this lang can fall back to Tier 3, compute the fallback + // chunker_version so try_skip_unchanged can detect the stored-as-Tier-3 + // state and skip parser/chunker equality checks. + let tier3_fallback_cv: Option = match code_lang { + "rust" | "python" | "typescript" | "javascript" + | "go" | "java" | "kotlin" + | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" + | "c" | "cpp" // p10-1D + => Some(CodeTextParagraphV1Chunker.chunker_version()), + _ => None, + }; + if let Some(item) = try_skip_unchanged( app, asset, @@ -1867,6 +1933,7 @@ fn ingest_one_code_asset( &chunker_version, embedder.map(|e| e.model_version()).as_ref(), force_reingest, + tier3_fallback_cv.as_ref(), )? { return Ok(item); } @@ -1911,6 +1978,13 @@ fn ingest_one_code_asset( } // p10-3: shell reuses the same synthesizer. "shell" => synthesize_tier2_document(asset, &bytes, "shell", &parser_version), + // p10-1D: C + C++ AST extractors. 
+ "c" => CAstExtractor::new() + .extract(&ctx, &bytes) + .context("kebab-parse-code::CAstExtractor::extract (code:c)"), + "cpp" => CppAstExtractor::new() + .extract(&ctx, &bytes) + .context("kebab-parse-code::CppAstExtractor::extract (code:cpp)"), other => anyhow::bail!("unreachable (extract): {other}"), }; @@ -1987,6 +2061,13 @@ fn ingest_one_code_asset( "shell" => CodeTextParagraphV1Chunker .chunk(&canonical, chunk_policy) .context("kb-chunk::CodeTextParagraphV1Chunker::chunk (code:shell)"), + // p10-1D: C + C++ AST chunkers. + "c" => CodeCAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kebab-chunk::CodeCAstV1Chunker::chunk (code:c)"), + "cpp" => CodeCppAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kebab-chunk::CodeCppAstV1Chunker::chunk (code:cpp)"), other => anyhow::bail!("unreachable (chunk): {other}"), } }; diff --git a/crates/kebab-app/tests/code_ingest_smoke.rs b/crates/kebab-app/tests/code_ingest_smoke.rs index a462666..e5f2338 100644 --- a/crates/kebab-app/tests/code_ingest_smoke.rs +++ b/crates/kebab-app/tests/code_ingest_smoke.rs @@ -1064,3 +1064,270 @@ fn rust_file_re_ingest_is_unchanged() { ); assert_eq!(item2.doc_id, item1.doc_id); } + +/// p10-3 fix regression: a docker-compose YAML that falls back to Tier 3 +/// (k8s chunker returns empty, CodeTextParagraphV1Chunker retries) must +/// report Unchanged on the second ingest rather than re-processing. +/// Before the fix, try_skip_unchanged returned None because the stored +/// last_chunker_version ("code-text-paragraph-v1" / parser_version +/// "none-v1") never matched the caller's dispatch values. 
+#[test] +fn tier3_yaml_fallback_reingest_is_unchanged() { + let env = TestEnv::lexical_only(); + + std::fs::write( + env.workspace_root.join("docker-compose.yml"), + "version: '3'\nservices:\n api:\n image: nginx:latest\n", + ) + .unwrap(); + + let report1 = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("first ingest"); + let item1 = report1 + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("docker-compose.yml")) + .expect("docker-compose.yml in first report"); + assert!( + matches!(item1.kind, IngestItemKind::New), + "first ingest must be New, got {:?}", item1.kind + ); + assert_eq!( + item1.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-text-paragraph-v1"), + "first ingest must use Tier 3 fallback chunker" + ); + + let report2 = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("second ingest"); + let item2 = report2 + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("docker-compose.yml")) + .expect("docker-compose.yml in second report"); + assert!( + matches!(item2.kind, IngestItemKind::Unchanged), + "second ingest must be Unchanged, got {:?}", item2.kind + ); +} + +/// p10-1d Task G: a `.c` file with a single top-level function is ingested +/// and the resulting `Citation::Code` hit must carry `lang="c"`, +/// `symbol="parse_record"` (function name only — no nesting in C), and +/// `chunker_version = "code-c-ast-v1"`. 
+#[test] +fn tier1_c_ingest_searchable() { + let env = TestEnv::lexical_only(); + + std::fs::write( + env.workspace_root.join("parser.c"), + "#include \n\nint parse_record(const char *line) {\n if (line == NULL) return -1;\n return 0;\n}\n", + ) + .unwrap(); + + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); + assert_eq!(report.errors, 0, "no ingest errors: {report:?}"); + assert!(report.new >= 1, "c file ingested: {report:?}"); + + let c_item = report + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("parser.c")) + .expect("parser.c item present"); + assert_eq!( + c_item.parser_version.as_ref().map(|p| p.0.as_str()), + Some("code-c-v1"), + "parser_version must be code-c-v1" + ); + assert_eq!( + c_item.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-c-ast-v1"), + "chunker_version must be code-c-ast-v1" + ); + + let query = kebab_core::SearchQuery { + text: "parse_record".to_string(), + mode: kebab_core::SearchMode::Lexical, + k: 10, + filters: kebab_core::SearchFilters { + code_lang: vec!["c".to_string()], + ..Default::default() + }, + }; + let hits = kebab_app::search_with_config(env.config.clone(), query) + .expect("search must succeed"); + + let h = hits + .iter() + .find(|h| matches!(&h.citation, Citation::Code { .. })) + .expect("at least one Citation::Code hit for 'parse_record'"); + + match &h.citation { + Citation::Code { + lang, + symbol, + line_start, + .. 
+ } => { + assert_eq!(lang.as_deref(), Some("c"), "citation.lang must be 'c'"); + assert_eq!( + symbol.as_deref(), + Some("parse_record"), + "C symbol must be function name only (no nesting)" + ); + assert!(*line_start >= 1, "line_start must be >=1"); + } + _ => unreachable!(), + } + + assert_eq!( + h.code_lang.as_deref(), + Some("c"), + "SearchHit.code_lang must be 'c'" + ); + assert_eq!( + h.chunker_version.0.as_str(), + "code-c-ast-v1", + "C chunks must be stamped with code-c-ast-v1" + ); +} + +/// p10-1d Task G: a `.cpp` file with nested namespace + class is ingested +/// and the resulting `Citation::Code` hit must carry `lang="cpp"`, a +/// `symbol` that starts with `"kebab::chunk::Foo"` (namespace::Class or +/// namespace::Class::method), and `chunker_version = "code-cpp-ast-v1"`. +#[test] +fn tier1_cpp_ingest_searchable() { + let env = TestEnv::lexical_only(); + + std::fs::write( + env.workspace_root.join("chunker.cpp"), + "namespace kebab {\nnamespace chunk {\nclass Foo {\npublic:\n void bar() { /* impl */ }\n};\n}\n}\n", + ) + .unwrap(); + + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); + assert_eq!(report.errors, 0, "no ingest errors: {report:?}"); + assert!(report.new >= 1, "cpp file ingested: {report:?}"); + + let cpp_item = report + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("chunker.cpp")) + .expect("chunker.cpp item present"); + assert_eq!( + cpp_item.parser_version.as_ref().map(|p| p.0.as_str()), + Some("code-cpp-v1"), + "parser_version must be code-cpp-v1" + ); + assert_eq!( + cpp_item.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-cpp-ast-v1"), + "chunker_version must be code-cpp-ast-v1" + ); + + let query = kebab_core::SearchQuery { + text: "bar".to_string(), + mode: kebab_core::SearchMode::Lexical, + k: 10, + filters: kebab_core::SearchFilters { + code_lang: vec!["cpp".to_string()], + ..Default::default() + }, + }; 
+ let hits = kebab_app::search_with_config(env.config.clone(), query) + .expect("search must succeed"); + + let h = hits + .iter() + .find(|h| matches!(&h.citation, Citation::Code { .. })) + .expect("at least one Citation::Code hit for 'bar'"); + + match &h.citation { + Citation::Code { + lang, + symbol, + line_start, + .. + } => { + assert_eq!(lang.as_deref(), Some("cpp"), "citation.lang must be 'cpp'"); + // Symbol could be "kebab::chunk::Foo" (class) or "kebab::chunk::Foo::bar" + // (method) depending on which chunk ranks first. + assert!( + symbol.as_deref().is_some_and(|s| s.starts_with("kebab::chunk::Foo")), + "C++ symbol must start with namespace::Class prefix, got {:?}", symbol + ); + assert!(*line_start >= 1, "line_start must be >=1"); + } + _ => unreachable!(), + } + + assert_eq!( + h.code_lang.as_deref(), + Some("cpp"), + "SearchHit.code_lang must be 'cpp'" + ); + assert_eq!( + h.chunker_version.0.as_str(), + "code-cpp-ast-v1", + "C++ chunks must be stamped with code-cpp-ast-v1" + ); +} + +/// p10-3 fix regression: a shell file (direct Tier 3, not a fallback) +/// must also report Unchanged on re-ingest. Shell is not in the +/// fallback-capable lang list, so no fallback chunker version is passed and +/// `stored_is_tier3_fallback` is false; the normal parser/chunker equality +/// path ("none-v1" + code-text-paragraph-v1) must match on its own. 
+#[test] +fn tier3_shell_reingest_is_unchanged() { + let env = TestEnv::lexical_only(); + + std::fs::write( + env.workspace_root.join("deploy.sh"), + "#!/usr/bin/env bash\nset -e\necho hello\n", + ) + .unwrap(); + + let report1 = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("first ingest"); + let item1 = report1 + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("deploy.sh")) + .expect("deploy.sh in first report"); + assert!( + matches!(item1.kind, IngestItemKind::New), + "first ingest must be New, got {:?}", item1.kind + ); + + let report2 = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("second ingest"); + let item2 = report2 + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("deploy.sh")) + .expect("deploy.sh in second report"); + assert!( + matches!(item2.kind, IngestItemKind::Unchanged), + "shell reingest must be Unchanged, got {:?}", item2.kind + ); +} diff --git a/crates/kebab-chunk/src/code_c_ast_v1.rs b/crates/kebab-chunk/src/code_c_ast_v1.rs new file mode 100644 index 0000000..22dbcf2 --- /dev/null +++ b/crates/kebab-chunk/src/code_c_ast_v1.rs @@ -0,0 +1,322 @@ +//! `code-c-ast-v1` — maps a tree-sitter-derived C AST +//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with +//! `SourceSpan::Code`) to chunks 1:1. A unit longer than +//! `AST_CHUNK_MAX_LINES` is split into ` [part i/N]` sub-chunks +//! at blank-line paragraph boundaries (design §9.1 oversize fallback). +//! +//! tree-sitter is intentionally NOT a dependency here: AST work is +//! parser-side (`kebab-parse-code`, design §6.3). This chunker only +//! consumes the `CanonicalDocument`. +//! +//! `AST_CHUNK_MAX_LINES` is a constant matching +//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium +//! config threading needs a chunker registry (P+); same deviation +//! 
pattern as `pdf-page-v1`'s pinned `chunker_version` +//! (`tasks/HOTFIXES.md`). + +use kebab_core::{ + Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId, + SourceSpan, id_for_chunk, +}; + +const VERSION_LABEL: &str = "code-c-ast-v1"; +const BYTES_PER_TOKEN: usize = 3; +const POLICY_HASH_HEX_LEN: usize = 16; +const AST_CHUNK_MAX_LINES: u32 = 200; + +#[derive(Clone, Copy, Debug, Default)] +pub struct CodeCAstV1Chunker; + +impl Chunker for CodeCAstV1Chunker { + fn chunker_version(&self) -> ChunkerVersion { + ChunkerVersion(VERSION_LABEL.to_string()) + } + + fn policy_hash(&self, policy: &ChunkPolicy) -> String { + let bytes = serde_json_canonicalizer::to_vec(policy) + .expect("canonical JSON serialization of ChunkPolicy must not fail"); + let hex = blake3::hash(&bytes).to_hex().to_string(); + hex[..POLICY_HASH_HEX_LEN].to_string() + } + + fn chunk( + &self, + doc: &CanonicalDocument, + policy: &ChunkPolicy, + ) -> anyhow::Result> { + for b in &doc.blocks { + let c = match b { + Block::Code(c) => c, + _ => anyhow::bail!( + "CodeCAstV1Chunker only handles code docs (got non-Code block)" + ), + }; + if !matches!(c.common.source_span, SourceSpan::Code { .. 
}) { + anyhow::bail!( + "CodeCAstV1Chunker only handles code docs (got non-Code source_span)" + ); + } + } + + let base_policy_hash = self.policy_hash(policy); + let chunker_version = self.chunker_version(); + let mut out: Vec = Vec::new(); + + for b in &doc.blocks { + let cb = match b { + Block::Code(c) => c, + _ => unreachable!("validated above"), + }; + let (ls, le, symbol, lang) = match &cb.common.source_span { + SourceSpan::Code { line_start, line_end, symbol, lang } => { + (*line_start, *line_end, symbol.clone(), lang.clone()) + } + _ => unreachable!("validated above"), + }; + let block_ids: Vec = vec![cb.common.block_id.clone()]; + let span_lines = le.saturating_sub(ls) + 1; + + if span_lines <= AST_CHUNK_MAX_LINES { + let span = SourceSpan::Code { + line_start: ls, + line_end: le, + symbol: symbol.clone(), + lang: lang.clone(), + }; + out.push(make_chunk( + doc, &chunker_version, &block_ids, &base_policy_hash, + None, span, cb.code.clone(), + )); + } else { + let parts = split_oversize(&cb.code); + let n = parts.len(); + for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() { + let part_ls = ls + off_start; + let part_le = ls + off_end; + let part_sym = symbol + .as_ref() + .map(|s| format!("{s} [part {}/{n}]", i + 1)); + let span = SourceSpan::Code { + line_start: part_ls, + line_end: part_le, + symbol: part_sym, + lang: lang.clone(), + }; + out.push(make_chunk( + doc, &chunker_version, &block_ids, &base_policy_hash, + Some(part_ls), span, text, + )); + } + } + } + + tracing::debug!( + target: "kebab-chunk", + doc_id = %doc.doc_id, + chunks = out.len(), + "code-c-ast-v1 chunked", + ); + Ok(out) + } +} + +#[allow(clippy::too_many_arguments)] +fn make_chunk( + doc: &CanonicalDocument, + chunker_version: &ChunkerVersion, + block_ids: &[BlockId], + base_policy_hash: &str, + split_key: Option, + span: SourceSpan, + text: String, +) -> Chunk { + let id_hash = match split_key { + Some(k) => format!("{base_policy_hash}#L{k}"), + None => 
base_policy_hash.to_string(), + }; + let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash); + let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN); + Chunk { + chunk_id, + doc_id: DocumentId(doc.doc_id.0.clone()), + block_ids: block_ids.to_vec(), + text, + heading_path: Vec::new(), + source_spans: vec![span], + token_estimate, + chunker_version: chunker_version.clone(), + policy_hash: base_policy_hash.to_string(), + } +} + +/// Split an oversize unit at blank-line paragraph boundaries, greedily +/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate. +/// Returns `(line_offset_start, line_offset_end, text)` where offsets are +/// 0-based within the unit (caller adds the unit's absolute `line_start`). +fn split_oversize(code: &str) -> Vec<(u32, u32, String)> { + let lines: Vec<&str> = code.split('\n').collect(); + let total = lines.len() as u32; + let mut out: Vec<(u32, u32, String)> = Vec::new(); + let mut start: u32 = 0; + while start < total { + let mut end = (start + AST_CHUNK_MAX_LINES).min(total); + let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5); + if end < total { + if let Some(b) = (floor.min(end)..end) + .rev() + .find(|&i| lines[i as usize].trim().is_empty()) + { + end = b + 1; + } + } + let text = lines[start as usize..end as usize].join("\n"); + out.push((start, end.saturating_sub(1), text)); + start = end; + } + if out.is_empty() { + out.push((0, total.saturating_sub(1), code.to_string())); + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + use kebab_core::{ + Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, + SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance, + SourceType, TrustLevel, WorkspacePath, + }; + use time::OffsetDateTime; + + fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument { + let wp = WorkspacePath("crates/x/src/a.c".into()); + let aid = AssetId("a".repeat(64)); + let pv = 
ParserVersion("code-c-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + let blocks = units + .iter() + .enumerate() + .map(|(i, (sym, ls, le, code))| { + let span = SourceSpan::Code { + line_start: *ls, + line_end: *le, + symbol: Some((*sym).to_string()), + lang: Some("c".into()), + }; + let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); + Block::Code(CodeBlock { + common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span }, + lang: Some("c".into()), + code: (*code).to_string(), + }) + }) + .collect(); + CanonicalDocument { + doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(), + lang: Lang("und".into()), blocks, + metadata: Metadata { + aliases: vec![], tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, trust_level: TrustLevel::Primary, + user_id_alias: None, user: Default::default(), + repo: Some("kebab".into()), git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), code_lang: Some("c".into()), + }, + provenance: Provenance { events: vec![] }, + parser_version: pv, schema_version: 1, doc_version: 1, + last_chunker_version: None, last_embedding_version: None, + } + } + fn policy() -> ChunkPolicy { + ChunkPolicy { target_tokens: 500, overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion(VERSION_LABEL.into()) } + } + + #[test] + fn chunker_version_is_code_c_ast_v1() { + assert_eq!(CodeCAstV1Chunker.chunker_version(), + ChunkerVersion("code-c-ast-v1".into())); + } + + #[test] + fn one_chunk_per_unit_preserves_code_span() { + let doc = code_doc(&[ + ("parse", 1, 3, "int parse() {\n\t// x\n}"), + ("print", 5, 7, "void print() {\n\t//\n\treturn;\n}"), + ]); + let chunks = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap(); + assert_eq!(chunks.len(), 2); + for c in &chunks { + assert_eq!(c.source_spans.len(), 1); + 
assert!(matches!(c.source_spans[0], SourceSpan::Code { .. })); + assert_eq!(c.heading_path, Vec::::new()); + assert_eq!(c.chunker_version.0, "code-c-ast-v1"); + } + match &chunks[0].source_spans[0] { + SourceSpan::Code { symbol, line_start, line_end, .. } => { + assert_eq!(symbol.as_deref(), Some("parse")); + assert_eq!((*line_start, *line_end), (1, 3)); + } + _ => unreachable!(), + } + } + + #[test] + fn oversize_unit_splits_into_parts_with_unique_ids() { + let body = (0..500).map(|i| format!("\tx{i} = {i};\n")).collect::>().join(""); + let code = format!("int big() {{\n{body}\n}}"); + let doc = code_doc(&[("big", 1, 502, &code)]); + let chunks = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap(); + assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len()); + for c in &chunks { + match &c.source_spans[0] { + SourceSpan::Code { symbol, .. } => { + assert!(symbol.as_deref().unwrap().starts_with("big [part "), + "part-numbered symbol, got {symbol:?}"); + } + _ => unreachable!(), + } + } + let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect(); + let n = ids.len(); ids.sort(); ids.dedup(); + assert_eq!(ids.len(), n, "chunk_ids unique across split parts"); + } + + #[test] + fn non_code_doc_errors() { + use kebab_core::TextBlock; + let mut doc = code_doc(&[("parse", 1, 1, "int parse() {}")]); + doc.blocks = vec![Block::Paragraph(TextBlock { + common: CommonBlock { + block_id: kebab_core::BlockId("b".into()), + heading_path: vec![], + source_span: SourceSpan::Line { start: 1, end: 1 }, + }, + text: "x".into(), inlines: vec![], + })]; + let err = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap_err(); + assert!(err.to_string().contains("CodeCAstV1Chunker")); + } + + #[test] + fn deterministic_chunk_ids_1000() { + let doc = code_doc(&[("parse", 1, 2, "int parse() {}\n")]); + let base: Vec = CodeCAstV1Chunker.chunk(&doc, &policy()) + .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + for _ in 0..1000 { + let again: Vec = 
CodeCAstV1Chunker.chunk(&doc, &policy()) + .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + assert_eq!(again, base); + } + } + + #[test] + fn policy_hash_matches_md_heading_v1() { + let p = policy(); + assert_eq!(CodeCAstV1Chunker.policy_hash(&p), + crate::MdHeadingV1Chunker.policy_hash(&p)); + } +} diff --git a/crates/kebab-chunk/src/code_cpp_ast_v1.rs b/crates/kebab-chunk/src/code_cpp_ast_v1.rs new file mode 100644 index 0000000..f9272d3 --- /dev/null +++ b/crates/kebab-chunk/src/code_cpp_ast_v1.rs @@ -0,0 +1,322 @@ +//! `code-cpp-ast-v1` — maps a tree-sitter-derived C++ AST +//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with +//! `SourceSpan::Code`) to chunks 1:1. A unit longer than +//! `AST_CHUNK_MAX_LINES` is split into ` [part i/N]` sub-chunks +//! at blank-line paragraph boundaries (design §9.1 oversize fallback). +//! +//! tree-sitter is intentionally NOT a dependency here: AST work is +//! parser-side (`kebab-parse-code`, design §6.3). This chunker only +//! consumes the `CanonicalDocument`. +//! +//! `AST_CHUNK_MAX_LINES` is a constant matching +//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium +//! config threading needs a chunker registry (P+); same deviation +//! pattern as `pdf-page-v1`'s pinned `chunker_version` +//! (`tasks/HOTFIXES.md`). 
+ +use kebab_core::{ + Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId, + SourceSpan, id_for_chunk, +}; + +const VERSION_LABEL: &str = "code-cpp-ast-v1"; +const BYTES_PER_TOKEN: usize = 3; +const POLICY_HASH_HEX_LEN: usize = 16; +const AST_CHUNK_MAX_LINES: u32 = 200; + +#[derive(Clone, Copy, Debug, Default)] +pub struct CodeCppAstV1Chunker; + +impl Chunker for CodeCppAstV1Chunker { + fn chunker_version(&self) -> ChunkerVersion { + ChunkerVersion(VERSION_LABEL.to_string()) + } + + fn policy_hash(&self, policy: &ChunkPolicy) -> String { + let bytes = serde_json_canonicalizer::to_vec(policy) + .expect("canonical JSON serialization of ChunkPolicy must not fail"); + let hex = blake3::hash(&bytes).to_hex().to_string(); + hex[..POLICY_HASH_HEX_LEN].to_string() + } + + fn chunk( + &self, + doc: &CanonicalDocument, + policy: &ChunkPolicy, + ) -> anyhow::Result> { + for b in &doc.blocks { + let c = match b { + Block::Code(c) => c, + _ => anyhow::bail!( + "CodeCppAstV1Chunker only handles code docs (got non-Code block)" + ), + }; + if !matches!(c.common.source_span, SourceSpan::Code { .. 
}) { + anyhow::bail!( + "CodeCppAstV1Chunker only handles code docs (got non-Code source_span)" + ); + } + } + + let base_policy_hash = self.policy_hash(policy); + let chunker_version = self.chunker_version(); + let mut out: Vec = Vec::new(); + + for b in &doc.blocks { + let cb = match b { + Block::Code(c) => c, + _ => unreachable!("validated above"), + }; + let (ls, le, symbol, lang) = match &cb.common.source_span { + SourceSpan::Code { line_start, line_end, symbol, lang } => { + (*line_start, *line_end, symbol.clone(), lang.clone()) + } + _ => unreachable!("validated above"), + }; + let block_ids: Vec = vec![cb.common.block_id.clone()]; + let span_lines = le.saturating_sub(ls) + 1; + + if span_lines <= AST_CHUNK_MAX_LINES { + let span = SourceSpan::Code { + line_start: ls, + line_end: le, + symbol: symbol.clone(), + lang: lang.clone(), + }; + out.push(make_chunk( + doc, &chunker_version, &block_ids, &base_policy_hash, + None, span, cb.code.clone(), + )); + } else { + let parts = split_oversize(&cb.code); + let n = parts.len(); + for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() { + let part_ls = ls + off_start; + let part_le = ls + off_end; + let part_sym = symbol + .as_ref() + .map(|s| format!("{s} [part {}/{n}]", i + 1)); + let span = SourceSpan::Code { + line_start: part_ls, + line_end: part_le, + symbol: part_sym, + lang: lang.clone(), + }; + out.push(make_chunk( + doc, &chunker_version, &block_ids, &base_policy_hash, + Some(part_ls), span, text, + )); + } + } + } + + tracing::debug!( + target: "kebab-chunk", + doc_id = %doc.doc_id, + chunks = out.len(), + "code-cpp-ast-v1 chunked", + ); + Ok(out) + } +} + +#[allow(clippy::too_many_arguments)] +fn make_chunk( + doc: &CanonicalDocument, + chunker_version: &ChunkerVersion, + block_ids: &[BlockId], + base_policy_hash: &str, + split_key: Option, + span: SourceSpan, + text: String, +) -> Chunk { + let id_hash = match split_key { + Some(k) => format!("{base_policy_hash}#L{k}"), + None => 
base_policy_hash.to_string(), + }; + let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash); + let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN); + Chunk { + chunk_id, + doc_id: DocumentId(doc.doc_id.0.clone()), + block_ids: block_ids.to_vec(), + text, + heading_path: Vec::new(), + source_spans: vec![span], + token_estimate, + chunker_version: chunker_version.clone(), + policy_hash: base_policy_hash.to_string(), + } +} + +/// Split an oversize unit at blank-line paragraph boundaries, greedily +/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate. +/// Returns `(line_offset_start, line_offset_end, text)` where offsets are +/// 0-based within the unit (caller adds the unit's absolute `line_start`). +fn split_oversize(code: &str) -> Vec<(u32, u32, String)> { + let lines: Vec<&str> = code.split('\n').collect(); + let total = lines.len() as u32; + let mut out: Vec<(u32, u32, String)> = Vec::new(); + let mut start: u32 = 0; + while start < total { + let mut end = (start + AST_CHUNK_MAX_LINES).min(total); + let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5); + if end < total { + if let Some(b) = (floor.min(end)..end) + .rev() + .find(|&i| lines[i as usize].trim().is_empty()) + { + end = b + 1; + } + } + let text = lines[start as usize..end as usize].join("\n"); + out.push((start, end.saturating_sub(1), text)); + start = end; + } + if out.is_empty() { + out.push((0, total.saturating_sub(1), code.to_string())); + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + use kebab_core::{ + Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, + SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance, + SourceType, TrustLevel, WorkspacePath, + }; + use time::OffsetDateTime; + + fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument { + let wp = WorkspacePath("crates/x/src/a.cpp".into()); + let aid = AssetId("a".repeat(64)); + let pv = 
ParserVersion("code-cpp-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + let blocks = units + .iter() + .enumerate() + .map(|(i, (sym, ls, le, code))| { + let span = SourceSpan::Code { + line_start: *ls, + line_end: *le, + symbol: Some((*sym).to_string()), + lang: Some("cpp".into()), + }; + let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); + Block::Code(CodeBlock { + common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span }, + lang: Some("cpp".into()), + code: (*code).to_string(), + }) + }) + .collect(); + CanonicalDocument { + doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(), + lang: Lang("und".into()), blocks, + metadata: Metadata { + aliases: vec![], tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, trust_level: TrustLevel::Primary, + user_id_alias: None, user: Default::default(), + repo: Some("kebab".into()), git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), code_lang: Some("cpp".into()), + }, + provenance: Provenance { events: vec![] }, + parser_version: pv, schema_version: 1, doc_version: 1, + last_chunker_version: None, last_embedding_version: None, + } + } + fn policy() -> ChunkPolicy { + ChunkPolicy { target_tokens: 500, overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion(VERSION_LABEL.into()) } + } + + #[test] + fn chunker_version_is_code_cpp_ast_v1() { + assert_eq!(CodeCppAstV1Chunker.chunker_version(), + ChunkerVersion("code-cpp-ast-v1".into())); + } + + #[test] + fn one_chunk_per_unit_preserves_code_span() { + let doc = code_doc(&[ + ("parse", 1, 3, "int parse() {\n\t// x\n}"), + ("print", 5, 7, "void print() {\n\t//\n\treturn;\n}"), + ]); + let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap(); + assert_eq!(chunks.len(), 2); + for c in &chunks { + assert_eq!(c.source_spans.len(), 
1); + assert!(matches!(c.source_spans[0], SourceSpan::Code { .. })); + assert_eq!(c.heading_path, Vec::::new()); + assert_eq!(c.chunker_version.0, "code-cpp-ast-v1"); + } + match &chunks[0].source_spans[0] { + SourceSpan::Code { symbol, line_start, line_end, .. } => { + assert_eq!(symbol.as_deref(), Some("parse")); + assert_eq!((*line_start, *line_end), (1, 3)); + } + _ => unreachable!(), + } + } + + #[test] + fn oversize_unit_splits_into_parts_with_unique_ids() { + let body = (0..500).map(|i| format!("\tx{i} = {i};\n")).collect::>().join(""); + let code = format!("int big() {{\n{body}\n}}"); + let doc = code_doc(&[("big", 1, 502, &code)]); + let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap(); + assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len()); + for c in &chunks { + match &c.source_spans[0] { + SourceSpan::Code { symbol, .. } => { + assert!(symbol.as_deref().unwrap().starts_with("big [part "), + "part-numbered symbol, got {symbol:?}"); + } + _ => unreachable!(), + } + } + let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect(); + let n = ids.len(); ids.sort(); ids.dedup(); + assert_eq!(ids.len(), n, "chunk_ids unique across split parts"); + } + + #[test] + fn non_code_doc_errors() { + use kebab_core::TextBlock; + let mut doc = code_doc(&[("parse", 1, 1, "int parse() {}")]); + doc.blocks = vec![Block::Paragraph(TextBlock { + common: CommonBlock { + block_id: kebab_core::BlockId("b".into()), + heading_path: vec![], + source_span: SourceSpan::Line { start: 1, end: 1 }, + }, + text: "x".into(), inlines: vec![], + })]; + let err = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap_err(); + assert!(err.to_string().contains("CodeCppAstV1Chunker")); + } + + #[test] + fn deterministic_chunk_ids_1000() { + let doc = code_doc(&[("parse", 1, 2, "int parse() {}\n")]); + let base: Vec = CodeCppAstV1Chunker.chunk(&doc, &policy()) + .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + for _ in 0..1000 { + let 
again: Vec = CodeCppAstV1Chunker.chunk(&doc, &policy()) + .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + assert_eq!(again, base); + } + } + + #[test] + fn policy_hash_matches_md_heading_v1() { + let p = policy(); + assert_eq!(CodeCppAstV1Chunker.policy_hash(&p), + crate::MdHeadingV1Chunker.policy_hash(&p)); + } +} diff --git a/crates/kebab-chunk/src/lib.rs b/crates/kebab-chunk/src/lib.rs index eee3f69..1be8bd2 100644 --- a/crates/kebab-chunk/src/lib.rs +++ b/crates/kebab-chunk/src/lib.rs @@ -15,6 +15,8 @@ //! embedder, the retriever, the LLM, the RAG layer, or the UI layers. //! It consumes `CanonicalDocument` purely through `kb-core` types. +mod code_c_ast_v1; +mod code_cpp_ast_v1; mod code_go_ast_v1; mod code_java_ast_v1; mod code_js_ast_v1; @@ -30,6 +32,8 @@ pub mod dockerfile_file_v1; pub mod manifest_file_v1; pub mod code_text_paragraph_v1; +pub use code_c_ast_v1::CodeCAstV1Chunker; +pub use code_cpp_ast_v1::CodeCppAstV1Chunker; pub use code_go_ast_v1::CodeGoAstV1Chunker; pub use code_java_ast_v1::CodeJavaAstV1Chunker; pub use code_js_ast_v1::CodeJsAstV1Chunker; diff --git a/crates/kebab-chunk/tests/code_c_ast_snapshot.rs b/crates/kebab-chunk/tests/code_c_ast_snapshot.rs new file mode 100644 index 0000000..62162b0 --- /dev/null +++ b/crates/kebab-chunk/tests/code_c_ast_snapshot.rs @@ -0,0 +1,196 @@ +//! Snapshot test pinning the `Vec` JSON for a +//! representative C code `CanonicalDocument`. +//! +//! This is an integration test. `kebab-parse-code` is intentionally NOT +//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side). +//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code` +//! units, which is the same pattern used in `code_go_ast_v1.rs`'s +//! internal `code_doc` test helper. +//! +//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline. 
+ +use std::path::PathBuf; + +use kebab_chunk::CodeCAstV1Chunker; +use kebab_core::{ + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, + Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath, + id_for_block, id_for_doc, +}; +use serde_json::Value; +use time::OffsetDateTime; + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") +} + +fn fixed_doc() -> CanonicalDocument { + let wp = WorkspacePath("projects/record.c".into()); + let aid = AssetId("c".repeat(64)); + // Pin parser_version so doc_id / block_ids are reproducible. + let pv = ParserVersion("code-c-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + + // Representative units: + // 0. imports + defines (lines 1–4, ≤200) + // 1. status_t enum typedef (lines 6–9, ≤200) + // 2. record_t struct typedef (lines 11–16, ≤200) + // 3. static counter decl glue (line 18, ≤200) + // 4. parse_record fn (lines 20–23, ≤200) + // 5. print_record fn (lines 25–27, ≤200) + // 6. 
main fn (lines 29–33, ≤200) + let raw_units: Vec<(&str, u32, u32, String)> = vec![ + ( + "", + 1, + 18, + "#include \n#include \n\n#define MAX_BUF 4096\n\ntypedef enum {\n OK = 0,\n ERR_PARSE,\n ERR_IO,\n} status_t;\n\ntypedef struct {\n int id;\n char name[64];\n status_t status;\n} record_t;\n\nstatic int counter = 0;".to_string(), + ), + ( + "parse_record", + 20, + 23, + "int parse_record(const char *line, record_t *out) {\n if (line == NULL || out == NULL) return ERR_PARSE;\n return OK;\n}".to_string(), + ), + ( + "print_record", + 25, + 27, + "void print_record(const record_t *r) {\n printf(\"[%d] %s (status=%d)\\n\", r->id, r->name, r->status);\n}".to_string(), + ), + ( + "main", + 29, + 33, + "int main(void) {\n record_t r = { .id = 1, .name = \"foo\", .status = OK };\n print_record(&r);\n return 0;\n}".to_string(), + ), + ]; + + let blocks: Vec = raw_units + .iter() + .enumerate() + .map(|(i, (sym, ls, le, code))| { + let span = SourceSpan::Code { + line_start: *ls, + line_end: *le, + symbol: Some((*sym).to_string()), + lang: Some("c".into()), + }; + let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); + Block::Code(CodeBlock { + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, + lang: Some("c".into()), + code: code.clone(), + }) + }) + .collect(); + + CanonicalDocument { + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "record.c".into(), + lang: Lang("und".into()), + blocks, + metadata: Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("c".into()), + }, + provenance: Provenance { events: vec![] }, + 
parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + } +} + +fn fixed_policy() -> ChunkPolicy { + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion("code-c-ast-v1".into()), + } +} + +#[test] +fn code_c_ast_chunks_snapshot() { + let doc = fixed_doc(); + let policy = fixed_policy(); + + let chunks = CodeCAstV1Chunker.chunk(&doc, &policy).expect("chunk"); + let actual = serde_json::to_value(&chunks).unwrap(); + + let dir = fixtures_dir(); + let baseline_path = dir.join("code-sample.c.chunks.snapshot.json"); + let baseline_text = match std::fs::read_to_string(&baseline_path) { + Ok(s) => s, + Err(_) if std::env::var("UPDATE_SNAPSHOTS").is_ok() => { + std::fs::create_dir_all(&dir).unwrap(); + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap(); + return; + } + Err(e) => panic!( + "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}", + baseline_path.display() + ), + }; + let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json"); + + if actual != expected { + if std::env::var("UPDATE_SNAPSHOTS").is_ok() { + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap(); + eprintln!("updated baseline {}", baseline_path.display()); + return; + } + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + panic!( + "code-c-ast-v1 chunks snapshot drift\n\ + --- expected ({}) ---\n{baseline_text}\n\ + --- actual ---\n{pretty}\n\ + If intentional, re-run with UPDATE_SNAPSHOTS=1.", + baseline_path.display() + ); + } +} + +/// Determinism cross-check: re-running the same pipeline yields the same +/// chunk_ids byte-for-byte. 
+#[test] +fn code_c_ast_chunks_are_deterministic() { + let policy = fixed_policy(); + let baseline: Vec = CodeCAstV1Chunker + .chunk(&fixed_doc(), &policy) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); + for _ in 0..5 { + let again: Vec = CodeCAstV1Chunker + .chunk(&fixed_doc(), &policy) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); + assert_eq!(again, baseline); + } +} diff --git a/crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs b/crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs new file mode 100644 index 0000000..0b7724f --- /dev/null +++ b/crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs @@ -0,0 +1,200 @@ +//! Snapshot test pinning the `Vec` JSON for a +//! representative C++ code `CanonicalDocument`. +//! +//! This is an integration test. `kebab-parse-code` is intentionally NOT +//! a dev-dep (design §6.3 / §8 boundary: AST extraction is parser-side). +//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code` +//! units, which is the same pattern used in `code_c_ast_v1.rs`'s +//! internal `code_doc` test helper. +//! +//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline. + +use std::path::PathBuf; + +use kebab_chunk::CodeCppAstV1Chunker; +use kebab_core::{ + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, + Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath, + id_for_block, id_for_doc, +}; +use serde_json::Value; +use time::OffsetDateTime; + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") +} + +fn fixed_doc() -> CanonicalDocument { + let wp = WorkspacePath("projects/record.cpp".into()); + let aid = AssetId("c".repeat(64)); + // Pin parser_version so doc_id / block_ids are reproducible. + let pv = ParserVersion("code-cpp-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + + // Representative units (C++ specific): + // 0. 
includes + namespace opening (lines 1–4, ≤200) + // 1. class definition (lines 6–20, ≤200) + // 2. template function (lines 22–25, ≤200) + // 3. namespace closing + free fn (lines 27–29, ≤200) + // 4. main fn (lines 31–34, ≤200) + let raw_units: Vec<(&str, u32, u32, String)> = vec![ + ( + "", + 1, + 4, + "#include \n#include \n\nnamespace kebab {".to_string(), + ), + ( + "kebab::chunk::MdHeadingV1Chunker", + 6, + 20, + "class MdHeadingV1Chunker {\npublic:\n MdHeadingV1Chunker() = default;\n ~MdHeadingV1Chunker() = default;\n\n std::string chunk_doc(const std::string& doc) {\n return doc;\n }\n\n int operator()(int x) const {\n return x * 2;\n }\n\nprivate:\n int counter_ = 0;\n};".to_string(), + ), + ( + "kebab::identity", + 22, + 25, + "template \nT identity(T value) {\n return value;\n}".to_string(), + ), + ( + "kebab::global_helper", + 27, + 29, + "void global_helper() {\n // free function in kebab namespace\n}".to_string(), + ), + ( + "main", + 31, + 34, + "int main() {\n kebab::chunk::MdHeadingV1Chunker c;\n return 0;\n}".to_string(), + ), + ]; + + let blocks: Vec = raw_units + .iter() + .enumerate() + .map(|(i, (sym, ls, le, code))| { + let span = SourceSpan::Code { + line_start: *ls, + line_end: *le, + symbol: Some((*sym).to_string()), + lang: Some("cpp".into()), + }; + let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); + Block::Code(CodeBlock { + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, + lang: Some("cpp".into()), + code: code.clone(), + }) + }) + .collect(); + + CanonicalDocument { + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "record.cpp".into(), + lang: Lang("und".into()), + blocks, + metadata: Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, 
+ user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("cpp".into()), + }, + provenance: Provenance { events: vec![] }, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + } +} + +fn fixed_policy() -> ChunkPolicy { + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion("code-cpp-ast-v1".into()), + } +} + +#[test] +fn code_cpp_ast_chunks_snapshot() { + let doc = fixed_doc(); + let policy = fixed_policy(); + + let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy).expect("chunk"); + let actual = serde_json::to_value(&chunks).unwrap(); + + let dir = fixtures_dir(); + let baseline_path = dir.join("code-sample.cpp.chunks.snapshot.json"); + let baseline_text = match std::fs::read_to_string(&baseline_path) { + Ok(s) => s, + Err(_) if std::env::var("UPDATE_SNAPSHOTS").is_ok() => { + std::fs::create_dir_all(&dir).unwrap(); + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap(); + return; + } + Err(e) => panic!( + "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}", + baseline_path.display() + ), + }; + let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json"); + + if actual != expected { + if std::env::var("UPDATE_SNAPSHOTS").is_ok() { + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap(); + eprintln!("updated baseline {}", baseline_path.display()); + return; + } + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + panic!( + "code-cpp-ast-v1 chunks snapshot drift\n\ + --- expected ({}) ---\n{baseline_text}\n\ + --- actual ---\n{pretty}\n\ + If intentional, re-run with UPDATE_SNAPSHOTS=1.", + baseline_path.display() + ); + } +} + +/// 
Determinism cross-check: re-running the same pipeline yields the same +/// chunk_ids byte-for-byte. +#[test] +fn code_cpp_ast_chunks_are_deterministic() { + let policy = fixed_policy(); + let baseline: Vec = CodeCppAstV1Chunker + .chunk(&fixed_doc(), &policy) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); + for _ in 0..5 { + let again: Vec = CodeCppAstV1Chunker + .chunk(&fixed_doc(), &policy) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); + assert_eq!(again, baseline); + } +} diff --git a/crates/kebab-chunk/tests/fixtures/code-sample.c.chunks.snapshot.json b/crates/kebab-chunk/tests/fixtures/code-sample.c.chunks.snapshot.json new file mode 100644 index 0000000..832c474 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/code-sample.c.chunks.snapshot.json @@ -0,0 +1,86 @@ +[ + { + "block_ids": [ + "8149e12ca002489acb4a0f74c97a061a" + ], + "chunk_id": "ec3cf06ae56c8e9796bbc9196438b7c5", + "chunker_version": "code-c-ast-v1", + "doc_id": "6bec42dd593920a060541db16c4e8e45", + "heading_path": [], + "policy_hash": "ecfad2ec1223662d", + "source_spans": [ + { + "kind": "code", + "lang": "c", + "line_end": 18, + "line_start": 1, + "symbol": "" + } + ], + "text": "#include \n#include \n\n#define MAX_BUF 4096\n\ntypedef enum {\n OK = 0,\n ERR_PARSE,\n ERR_IO,\n} status_t;\n\ntypedef struct {\n int id;\n char name[64];\n status_t status;\n} record_t;\n\nstatic int counter = 0;", + "token_estimate": 78 + }, + { + "block_ids": [ + "1baaa89f21a47b2f32d6396a24a85454" + ], + "chunk_id": "c2d7a81c898106733ef2e703774a6a4a", + "chunker_version": "code-c-ast-v1", + "doc_id": "6bec42dd593920a060541db16c4e8e45", + "heading_path": [], + "policy_hash": "ecfad2ec1223662d", + "source_spans": [ + { + "kind": "code", + "lang": "c", + "line_end": 23, + "line_start": 20, + "symbol": "parse_record" + } + ], + "text": "int parse_record(const char *line, record_t *out) {\n if (line == NULL || out == NULL) return ERR_PARSE;\n return OK;\n}", + "token_estimate": 
41 + }, + { + "block_ids": [ + "8d0e14cbcc6d1e92d7878ab796ea68b8" + ], + "chunk_id": "0e4d7b131ab64eba03b51903b5d8f96d", + "chunker_version": "code-c-ast-v1", + "doc_id": "6bec42dd593920a060541db16c4e8e45", + "heading_path": [], + "policy_hash": "ecfad2ec1223662d", + "source_spans": [ + { + "kind": "code", + "lang": "c", + "line_end": 27, + "line_start": 25, + "symbol": "print_record" + } + ], + "text": "void print_record(const record_t *r) {\n printf(\"[%d] %s (status=%d)\\n\", r->id, r->name, r->status);\n}", + "token_estimate": 35 + }, + { + "block_ids": [ + "9c2ede84423871b615d48c38fefb1853" + ], + "chunk_id": "e076f8edb2ff141d7e99b4106bb95157", + "chunker_version": "code-c-ast-v1", + "doc_id": "6bec42dd593920a060541db16c4e8e45", + "heading_path": [], + "policy_hash": "ecfad2ec1223662d", + "source_spans": [ + { + "kind": "code", + "lang": "c", + "line_end": 33, + "line_start": 29, + "symbol": "main" + } + ], + "text": "int main(void) {\n record_t r = { .id = 1, .name = \"foo\", .status = OK };\n print_record(&r);\n return 0;\n}", + "token_estimate": 38 + } +] diff --git a/crates/kebab-chunk/tests/fixtures/code-sample.cpp.chunks.snapshot.json b/crates/kebab-chunk/tests/fixtures/code-sample.cpp.chunks.snapshot.json new file mode 100644 index 0000000..257d6e9 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/code-sample.cpp.chunks.snapshot.json @@ -0,0 +1,107 @@ +[ + { + "block_ids": [ + "53292605459065d170cd36c118e20546" + ], + "chunk_id": "50a5b324300d9082eac4ce2a422810e1", + "chunker_version": "code-cpp-ast-v1", + "doc_id": "fff1e1f0a7ff70ef682937470e5d1d28", + "heading_path": [], + "policy_hash": "71f3c07bb9ec1d09", + "source_spans": [ + { + "kind": "code", + "lang": "cpp", + "line_end": 4, + "line_start": 1, + "symbol": "" + } + ], + "text": "#include \n#include \n\nnamespace kebab {", + "token_estimate": 18 + }, + { + "block_ids": [ + "f349acad94c9fa4cf9ad1c0a93e83610" + ], + "chunk_id": "0e6bc7c522665af8a4b0f66afb9d29c8", + "chunker_version": 
"code-cpp-ast-v1", + "doc_id": "fff1e1f0a7ff70ef682937470e5d1d28", + "heading_path": [], + "policy_hash": "71f3c07bb9ec1d09", + "source_spans": [ + { + "kind": "code", + "lang": "cpp", + "line_end": 20, + "line_start": 6, + "symbol": "kebab::chunk::MdHeadingV1Chunker" + } + ], + "text": "class MdHeadingV1Chunker {\npublic:\n MdHeadingV1Chunker() = default;\n ~MdHeadingV1Chunker() = default;\n\n std::string chunk_doc(const std::string& doc) {\n return doc;\n }\n\n int operator()(int x) const {\n return x * 2;\n }\n\nprivate:\n int counter_ = 0;\n};", + "token_estimate": 95 + }, + { + "block_ids": [ + "8b9811387717d0bd4abf84abcc35b8b1" + ], + "chunk_id": "d9326d252905b665b2adb9a416c20451", + "chunker_version": "code-cpp-ast-v1", + "doc_id": "fff1e1f0a7ff70ef682937470e5d1d28", + "heading_path": [], + "policy_hash": "71f3c07bb9ec1d09", + "source_spans": [ + { + "kind": "code", + "lang": "cpp", + "line_end": 25, + "line_start": 22, + "symbol": "kebab::identity" + } + ], + "text": "template \nT identity(T value) {\n return value;\n}", + "token_estimate": 21 + }, + { + "block_ids": [ + "1754cb6b971f6a4cb292f144a4f0570b" + ], + "chunk_id": "56ee5f991de4a413c016da8dc4acfc35", + "chunker_version": "code-cpp-ast-v1", + "doc_id": "fff1e1f0a7ff70ef682937470e5d1d28", + "heading_path": [], + "policy_hash": "71f3c07bb9ec1d09", + "source_spans": [ + { + "kind": "code", + "lang": "cpp", + "line_end": 29, + "line_start": 27, + "symbol": "kebab::global_helper" + } + ], + "text": "void global_helper() {\n // free function in kebab namespace\n}", + "token_estimate": 22 + }, + { + "block_ids": [ + "14b5f3393d6d25f822f5b70763d24acd" + ], + "chunk_id": "c0d7c043cdd575c530db3909b54cc906", + "chunker_version": "code-cpp-ast-v1", + "doc_id": "fff1e1f0a7ff70ef682937470e5d1d28", + "heading_path": [], + "policy_hash": "71f3c07bb9ec1d09", + "source_spans": [ + { + "kind": "code", + "lang": "cpp", + "line_end": 34, + "line_start": 31, + "symbol": "main" + } + ], + "text": "int main() {\n 
kebab::chunk::MdHeadingV1Chunker c;\n return 0;\n}", + "token_estimate": 23 + } +] diff --git a/crates/kebab-chunk/tests/fixtures/sample.c b/crates/kebab-chunk/tests/fixtures/sample.c new file mode 100644 index 0000000..ded7945 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/sample.c @@ -0,0 +1,33 @@ +#include +#include + +#define MAX_BUF 4096 + +typedef enum { + OK = 0, + ERR_PARSE, + ERR_IO, +} status_t; + +typedef struct { + int id; + char name[64]; + status_t status; +} record_t; + +static int counter = 0; + +int parse_record(const char *line, record_t *out) { + if (line == NULL || out == NULL) return ERR_PARSE; + return OK; +} + +void print_record(const record_t *r) { + printf("[%d] %s (status=%d)\n", r->id, r->name, r->status); +} + +int main(void) { + record_t r = { .id = 1, .name = "foo", .status = OK }; + print_record(&r); + return 0; +} diff --git a/crates/kebab-chunk/tests/fixtures/sample.cpp b/crates/kebab-chunk/tests/fixtures/sample.cpp new file mode 100644 index 0000000..2b95a60 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/sample.cpp @@ -0,0 +1,40 @@ +#include +#include + +namespace kebab { +namespace chunk { + +class MdHeadingV1Chunker { +public: + MdHeadingV1Chunker() = default; + ~MdHeadingV1Chunker() = default; + + std::string chunk_doc(const std::string& doc) { + return doc; + } + + int operator()(int x) const { + return x * 2; + } + +private: + int counter_ = 0; +}; + +template +T identity(T value) { + return value; +} + +} // namespace chunk + +void global_helper() { + // free function in kebab namespace +} + +} // namespace kebab + +int main() { + kebab::chunk::MdHeadingV1Chunker c; + return 0; +} diff --git a/crates/kebab-parse-code/Cargo.toml b/crates/kebab-parse-code/Cargo.toml index caaceaf..cfdbca6 100644 --- a/crates/kebab-parse-code/Cargo.toml +++ b/crates/kebab-parse-code/Cargo.toml @@ -22,6 +22,8 @@ tree-sitter-javascript = { workspace = true } tree-sitter-go = { workspace = true } tree-sitter-java = { workspace = true 
} tree-sitter-kotlin-ng = { workspace = true } +tree-sitter-c = { workspace = true } +tree-sitter-cpp = { workspace = true } [dev-dependencies] tempfile = { workspace = true } diff --git a/crates/kebab-parse-code/src/c.rs b/crates/kebab-parse-code/src/c.rs new file mode 100644 index 0000000..4b88b58 --- /dev/null +++ b/crates/kebab-parse-code/src/c.rs @@ -0,0 +1,337 @@ +//! `kebab-parse-code::c` — tree-sitter C AST extractor (P10-1D Task B). +//! +//! Implements [`kebab_core::Extractor`] for [`MediaType::Code("c")`]. +//! Walks the tree-sitter parse tree and emits one [`Block::Code`] per +//! top-level AST semantic unit: +//! +//! - `function_definition` → 1 unit, symbol = function name (extracted +//! from the declarator's innermost `identifier`, handles pointer-returning +//! functions where the declarator is wrapped in `pointer_declarator`). +//! - `struct_specifier` (named) → 1 unit, symbol = struct name. +//! - `enum_specifier` (named) → 1 unit, symbol = enum name. +//! - `union_specifier` (named) → 1 unit, symbol = union name. +//! +//! Everything else (`declaration`, `preproc_*`, `type_definition`, +//! `linkage_specification`, etc.) collapses into a single `` +//! glue chunk. If the file produces zero units **and** zero glue, the +//! `` post-pass emits one unit covering the whole file (1A-2 +//! pattern). +//! +//! C symbol = function name only — no namespace, no class nesting +//! (design §3.4 C row). Per design §3.4 / §9.1 / §9 versioning. + +use anyhow::Result; +use kebab_core::{ + Block, CanonicalDocument, CodeBlock, CommonBlock, Extractor, Lang, MediaType, Metadata, + ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType, TrustLevel, + id_for_block, id_for_doc, +}; +use serde_json::Map; +use time::OffsetDateTime; + +use crate::scaffold::{filename_from_workspace_path, strip_extension}; + +pub const PARSER_VERSION: &str = "code-c-v1"; + +/// C AST extractor. 
Per-unit blocks via tree-sitter-c 0.24.2 +/// (`LANGUAGE: LanguageFn`) parsed by tree-sitter 0.26. +pub struct CAstExtractor; + +impl CAstExtractor { + pub fn new() -> Self { + Self + } +} + +impl Default for CAstExtractor { + fn default() -> Self { + Self::new() + } +} + +impl Extractor for CAstExtractor { + fn supports(&self, m: &MediaType) -> bool { + matches!(m, MediaType::Code(l) if l == "c") + } + + fn parser_version(&self) -> ParserVersion { + ParserVersion(PARSER_VERSION.to_string()) + } + + fn extract( + &self, + ctx: &kebab_core::ExtractContext<'_>, + bytes: &[u8], + ) -> Result { + let asset = ctx.asset; + if !self.supports(&asset.media_type) { + anyhow::bail!( + "kebab-parse-code: unsupported media_type for CAstExtractor: {:?}", + asset.media_type + ); + } + + let parser_version = self.parser_version(); + let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, &parser_version); + + let source = String::from_utf8(bytes.to_vec()) + .map_err(|e| anyhow::anyhow!("kebab-parse-code: C source is not valid UTF-8: {e}"))?; + + let blocks = build_blocks(&source, &doc_id)?; + let unit_count = blocks.len() as u32; + + let now = OffsetDateTime::now_utc(); + let mut events: Vec = Vec::with_capacity(2); + events.push(ProvenanceEvent { + at: asset.discovered_at, + agent: "kb-source-fs".to_string(), + kind: ProvenanceKind::Discovered, + note: None, + }); + events.push(ProvenanceEvent { + at: now, + agent: "kb-parse-code".to_string(), + kind: ProvenanceKind::Parsed, + note: Some(format!( + "parser_version={}; unit_count={}", + parser_version.0, unit_count + )), + }); + + let title = { + let fname = filename_from_workspace_path(&asset.workspace_path.0); + strip_extension(&fname) + }; + + let abs_path = match &asset.source_uri { + kebab_core::SourceUri::File(p) => { + if p.is_absolute() { + p.clone() + } else { + ctx.workspace_root.join(p) + } + } + kebab_core::SourceUri::Kb(_) => ctx.workspace_root.to_path_buf(), + }; + let (repo, git_branch, git_commit) = match 
crate::repo::detect_repo(&abs_path) { + Some(r) => (Some(r.name), r.branch, r.commit), + None => (None, None, None), + }; + + let metadata = Metadata { + aliases: Vec::new(), + tags: Vec::new(), + created_at: asset.discovered_at, + updated_at: asset.discovered_at, + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Map::new(), + repo, + git_branch, + git_commit, + code_lang: Some("c".to_string()), + }; + + tracing::debug!( + target: "kebab-parse-code", + "extracted C doc_id={} workspace_path={} units={}", + doc_id.0, + asset.workspace_path.0, + unit_count + ); + + Ok(CanonicalDocument { + doc_id, + source_asset_id: asset.asset_id.clone(), + workspace_path: asset.workspace_path.clone(), + title, + lang: Lang("und".to_string()), + blocks, + metadata, + provenance: Provenance { events }, + parser_version, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + }) + } +} + +/// Walk down the declarator chain of a `function_definition` to find +/// the innermost `identifier` — the function name. +/// +/// The tree for `int *foo(int x) { ... }` looks like: +/// ```text +/// function_definition +/// type: primitive_type "int" +/// declarator: pointer_declarator +/// declarator: function_declarator +/// declarator: identifier "foo" +/// parameters: parameter_list +/// body: compound_statement +/// ``` +/// We walk `declarator` fields recursively until we reach an `identifier` +/// or run out of nodes. Returns `None` if no identifier is found +/// (malformed / unsupported declarator shape). +fn extract_fn_name<'a>(decl_node: tree_sitter::Node, src: &'a str) -> Option<&'a str> { + let mut cur = decl_node; + loop { + match cur.kind() { + "identifier" => return Some(&src[cur.start_byte()..cur.end_byte()]), + // pointer_declarator, function_declarator, array_declarator, + // attributed_declarator, parenthesized_declarator — + // all carry a `declarator` field pointing deeper. 
+ _ => { + if let Some(inner) = cur.child_by_field_name("declarator") { + cur = inner; + } else { + // No further `declarator` field; give up. + return None; + } + } + } + } +} + +fn build_blocks( + source: &str, + doc_id: &kebab_core::DocumentId, +) -> anyhow::Result> { + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&tree_sitter_c::LANGUAGE.into()) + .map_err(|e| anyhow::anyhow!("set tree-sitter-c language: {e}"))?; + let tree = parser + .parse(source.as_bytes(), None) + .ok_or_else(|| anyhow::anyhow!("tree-sitter failed to parse C source"))?; + let lines: Vec<&str> = source.split('\n').collect(); + + let root = tree.root_node(); + + // units: (symbol, line_start, line_end, is_real_semantic_unit). + // Glue is accumulated as (start, end) pairs and flushed into one + // "" block (or "" if no real unit exists). + let mut units: Vec<(String, u32, u32, bool)> = Vec::new(); + let mut glue: Vec<(u32, u32)> = Vec::new(); + + /// Walk preceding `comment` siblings to extend the unit's line range + /// upward, folding doc / line comments into the unit (1B pattern). + fn unit_start(n: &tree_sitter::Node) -> u32 { + let mut start = n.start_position().row as u32 + 1; + let mut prev = n.prev_sibling(); + while let Some(p) = prev { + if p.kind() == "comment" { + start = p.start_position().row as u32 + 1; + prev = p.prev_sibling(); + } else { + break; + } + } + start + } + + let mut cur = root.walk(); + for child in root.named_children(&mut cur) { + let s = unit_start(&child); + let e = child.end_position().row as u32 + 1; + + match child.kind() { + "function_definition" => { + if let Some(decl) = child.child_by_field_name("declarator") { + if let Some(name) = extract_fn_name(decl, source) { + flush_glue(&mut glue, &mut units); + units.push((name.to_string(), s, e, true)); + } else { + // Could not extract name — treat as glue. 
+ glue.push((s, e)); + } + } else { + glue.push((s, e)); + } + } + "struct_specifier" | "enum_specifier" | "union_specifier" => { + if let Some(name_node) = child.child_by_field_name("name") { + let name = &source[name_node.start_byte()..name_node.end_byte()]; + flush_glue(&mut glue, &mut units); + units.push((name.to_string(), s, e, true)); + } else { + // Anonymous struct/enum/union — glue. + glue.push((s, e)); + } + } + // Everything else: preprocessor directives, declarations + // (typedef / global var / fn prototype), type_definition, + // linkage_specification, etc. — all collapse into glue. + _ => { + glue.push((s, e)); + } + } + } + flush_glue(&mut glue, &mut units); + + // Post-pass: if the file has no real semantic unit (only glue, or + // completely empty), rename the single glue unit to "" and + // emit it. If there are zero units AND zero glue, synthesise a + // one-line "" covering the whole file. + let has_real_unit = units.iter().any(|(_, _, _, is_real)| *is_real); + + if units.is_empty() { + // Completely empty file or whitespace/comments only. + let total = lines.len() as u32; + units.push(( + "".to_string(), + 1, + total.max(1), + false, + )); + } + // If there is only glue (no real unit) the single pushed "" + // label should be "" — rename it now. 
+ if !has_real_unit { + for (sym, _, _, _) in units.iter_mut() { + if sym == "" { + *sym = "".to_string(); + } + } + } + + let total_lines = lines.len() as u32; + let mut blocks = Vec::with_capacity(units.len()); + for (ordinal, (symbol, ls, le, _is_real)) in units.into_iter().enumerate() { + let line_start = ls.max(1); + let line_end = le.min(total_lines.max(1)); + let span = SourceSpan::Code { + line_start, + line_end, + symbol: Some(symbol), + lang: Some("c".to_string()), + }; + let block_id = id_for_block(doc_id, "code", &[], ordinal as u32, &span); + let code = lines[(line_start as usize - 1)..=(line_end as usize - 1)].join("\n"); + blocks.push(Block::Code(CodeBlock { + common: CommonBlock { + block_id, + heading_path: Vec::new(), + source_span: span, + }, + lang: Some("c".to_string()), + code, + })); + } + Ok(blocks) +} + +fn flush_glue(glue: &mut Vec<(u32, u32)>, units: &mut Vec<(String, u32, u32, bool)>) { + if glue.is_empty() { + return; + } + let s = glue.iter().map(|(a, _)| *a).min().unwrap(); + let e = glue.iter().map(|(_, b)| *b).max().unwrap(); + units.push(("".to_string(), s, e, false)); + glue.clear(); +} + +// Tests for CAstExtractor (snapshot + unit assertions) are added in Task D +// alongside the C fixture file. This module is intentionally empty until then. diff --git a/crates/kebab-parse-code/src/cpp.rs b/crates/kebab-parse-code/src/cpp.rs new file mode 100644 index 0000000..81bf1f9 --- /dev/null +++ b/crates/kebab-parse-code/src/cpp.rs @@ -0,0 +1,883 @@ +//! `kebab-parse-code::cpp` — tree-sitter C++ AST extractor (P10-1D Task C). +//! +//! Implements [`kebab_core::Extractor`] for [`MediaType::Code("cpp")`]. +//! Walks the tree-sitter parse tree and emits one [`Block::Code`] per +//! top-level AST semantic unit, each carrying [`SourceSpan::Code`] with +//! the unit's `::` separated symbol path (design §3.4 C++ row). +//! +//! ## Symbol formation +//! +//! Symbol = `namespace::Class::method` via recursive `build_blocks`: +//! +//! 
- `namespace_definition` (named) → push namespace name, recurse into body. +//! - Anonymous namespace (`namespace { ... }`) → push ``, recurse. +//! - `nested_namespace_specifier` (`outer::inner`) → push all segments, recurse. +//! - `class_specifier` / `struct_specifier` (named) → emit class unit + recurse +//! into body with class name pushed. +//! - `function_definition` → emit method/function unit. Symbol is built from +//! the prefix chain + the extracted declarator name component. +//! - Out-of-class method def (`void Foo::bar() {}`) — the declarator's inner +//! node is a `qualified_identifier`; its scope chain is prepended to the +//! current prefix to form the full symbol. +//! - `template_declaration` → recurse into named children with same prefix; +//! the inner function/class body is matched by its own arm. Template params +//! are NOT included in the symbol. +//! - `enum_specifier` (named) → emit type unit. +//! - `concept_definition` (C++20) → emit type unit. +//! - `linkage_specification` (extern "C") → recurse into body with same prefix. +//! +//! ## Constructor / destructor / operator overload +//! +//! - Constructor: `function_declarator > identifier` matching the class name. +//! Symbol = `Class::Class` (name duplicated, same convention as Java). +//! - Destructor: `function_declarator > destructor_name`. Symbol = `Class::~Foo`. +//! - Operator overload: `function_declarator > operator_name`. Symbol = `Class::operator+`. +//! - Conversion operator: `function_definition.declarator` is `operator_cast`. +//! Symbol = `Class::operator ` (e.g. `Class::operator bool`). +//! +//! ## Glue +//! +//! Everything not in the unit list collapses into a single `` glue +//! chunk (preproc, declarations, using, typedef, etc.). If the file produces +//! zero units AND zero glue, the `` post-pass emits one unit covering +//! the whole file. +//! +//! Per design §3.4 / §9.1 / §9 versioning. 
+ +use anyhow::Result; +use kebab_core::{ + Block, CanonicalDocument, CodeBlock, CommonBlock, Extractor, Lang, MediaType, Metadata, + ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType, TrustLevel, + id_for_block, id_for_doc, +}; +use serde_json::Map; +use time::OffsetDateTime; + +use crate::scaffold::{filename_from_workspace_path, strip_extension}; + +pub const PARSER_VERSION: &str = "code-cpp-v1"; + +/// C++ AST extractor. Per-unit blocks via tree-sitter-cpp 0.23.4 +/// (`LANGUAGE: LanguageFn`) parsed by tree-sitter 0.26. +pub struct CppAstExtractor; + +impl CppAstExtractor { + pub fn new() -> Self { + Self + } +} + +impl Default for CppAstExtractor { + fn default() -> Self { + Self::new() + } +} + +impl Extractor for CppAstExtractor { + fn supports(&self, m: &MediaType) -> bool { + matches!(m, MediaType::Code(l) if l == "cpp") + } + + fn parser_version(&self) -> ParserVersion { + ParserVersion(PARSER_VERSION.to_string()) + } + + fn extract( + &self, + ctx: &kebab_core::ExtractContext<'_>, + bytes: &[u8], + ) -> Result { + let asset = ctx.asset; + if !self.supports(&asset.media_type) { + anyhow::bail!( + "kebab-parse-code: unsupported media_type for CppAstExtractor: {:?}", + asset.media_type + ); + } + + let parser_version = self.parser_version(); + let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, &parser_version); + + let source = String::from_utf8(bytes.to_vec()).map_err(|e| { + anyhow::anyhow!("kebab-parse-code: C++ source is not valid UTF-8: {e}") + })?; + + let blocks = build_blocks_top(&source, &doc_id)?; + let unit_count = blocks.len() as u32; + + let now = OffsetDateTime::now_utc(); + let mut events: Vec = Vec::with_capacity(2); + events.push(ProvenanceEvent { + at: asset.discovered_at, + agent: "kb-source-fs".to_string(), + kind: ProvenanceKind::Discovered, + note: None, + }); + events.push(ProvenanceEvent { + at: now, + agent: "kb-parse-code".to_string(), + kind: ProvenanceKind::Parsed, + note: 
Some(format!( + "parser_version={}; unit_count={}", + parser_version.0, unit_count + )), + }); + + let title = { + let fname = filename_from_workspace_path(&asset.workspace_path.0); + strip_extension(&fname) + }; + + let abs_path = match &asset.source_uri { + kebab_core::SourceUri::File(p) => { + if p.is_absolute() { + p.clone() + } else { + ctx.workspace_root.join(p) + } + } + kebab_core::SourceUri::Kb(_) => ctx.workspace_root.to_path_buf(), + }; + let (repo, git_branch, git_commit) = match crate::repo::detect_repo(&abs_path) { + Some(r) => (Some(r.name), r.branch, r.commit), + None => (None, None, None), + }; + + let metadata = Metadata { + aliases: Vec::new(), + tags: Vec::new(), + created_at: asset.discovered_at, + updated_at: asset.discovered_at, + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Map::new(), + repo, + git_branch, + git_commit, + code_lang: Some("cpp".to_string()), + }; + + tracing::debug!( + target: "kebab-parse-code", + "extracted C++ doc_id={} workspace_path={} units={}", + doc_id.0, + asset.workspace_path.0, + unit_count + ); + + Ok(CanonicalDocument { + doc_id, + source_asset_id: asset.asset_id.clone(), + workspace_path: asset.workspace_path.clone(), + title, + lang: Lang("und".to_string()), + blocks, + metadata, + provenance: Provenance { events }, + parser_version, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + }) + } +} + +// --------------------------------------------------------------------------- +// Core block-building logic +// --------------------------------------------------------------------------- + +/// Top-level entry: parse source, walk the `translation_unit` root, assemble +/// units + glue, apply the `` post-pass, and emit `Block::Code`s. 
+fn build_blocks_top( + source: &str, + doc_id: &kebab_core::DocumentId, +) -> anyhow::Result> { + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&tree_sitter_cpp::LANGUAGE.into()) + .map_err(|e| anyhow::anyhow!("set tree-sitter-cpp language: {e}"))?; + let tree = parser + .parse(source.as_bytes(), None) + .ok_or_else(|| anyhow::anyhow!("tree-sitter failed to parse C++ source"))?; + let lines: Vec<&str> = source.split('\n').collect(); + let root = tree.root_node(); + + // units: (symbol, line_start, line_end, is_real_semantic_unit). + // Glue is accumulated as (start, end) pairs and flushed into one + // "" block (or "" if no real unit exists). + let mut units: Vec<(String, u32, u32, bool)> = Vec::new(); + let mut glue: Vec<(u32, u32)> = Vec::new(); + + build_blocks(root, source, &[], &mut units, &mut glue); + flush_glue(&mut glue, &mut units); + + // Post-pass: if the file has no real semantic unit (only glue, or + // completely empty), rename the single glue unit to "". + // If there are zero units AND zero glue, synthesize a one-line + // "" covering the whole file. 
+ let has_real_unit = units.iter().any(|(_, _, _, is_real)| *is_real); + + if units.is_empty() { + let total = lines.len() as u32; + units.push(("".to_string(), 1, total.max(1), false)); + } + if !has_real_unit { + for (sym, _, _, _) in units.iter_mut() { + if sym == "" { + *sym = "".to_string(); + } + } + } + + let total_lines = lines.len() as u32; + let mut blocks = Vec::with_capacity(units.len()); + for (ordinal, (symbol, ls, le, _is_real)) in units.into_iter().enumerate() { + let line_start = ls.max(1); + let line_end = le.min(total_lines.max(1)); + let span = SourceSpan::Code { + line_start, + line_end, + symbol: Some(symbol), + lang: Some("cpp".to_string()), + }; + let block_id = id_for_block(doc_id, "code", &[], ordinal as u32, &span); + let code = lines[(line_start as usize - 1)..=(line_end as usize - 1)].join("\n"); + blocks.push(Block::Code(CodeBlock { + common: CommonBlock { + block_id, + heading_path: Vec::new(), + source_span: span, + }, + lang: Some("cpp".to_string()), + code, + })); + } + Ok(blocks) +} + +/// Walk preceding `comment` siblings to extend the unit's line range upward, +/// folding leading doc / line comments into the unit (1B pattern). +fn unit_start(n: &tree_sitter::Node) -> u32 { + let mut start = n.start_position().row as u32 + 1; + let mut prev = n.prev_sibling(); + while let Some(p) = prev { + if p.kind() == "comment" { + start = p.start_position().row as u32 + 1; + prev = p.prev_sibling(); + } else { + break; + } + } + start +} + +fn flush_glue(glue: &mut Vec<(u32, u32)>, units: &mut Vec<(String, u32, u32, bool)>) { + if glue.is_empty() { + return; + } + let s = glue.iter().map(|(a, _)| *a).min().unwrap(); + let e = glue.iter().map(|(_, b)| *b).max().unwrap(); + units.push(("".to_string(), s, e, false)); + glue.clear(); +} + +/// Walk a scope node (translation_unit, declaration_list, field_declaration_list) +/// emitting unit + glue blocks. `prefix` is the current namespace/class chain +/// (e.g. `["kebab", "Chunk", "Foo"]`). 
+/// +/// After returning, any pending glue in `glue` is NOT flushed — callers +/// responsible for flushing at the scope boundary (top-level flush in +/// `build_blocks_top`). Within recursive scope bodies (namespace/class) we +/// do flush before returning so that glue doesn't leak across scopes. +fn build_blocks( + node: tree_sitter::Node, + source: &str, + prefix: &[String], + units: &mut Vec<(String, u32, u32, bool)>, + glue: &mut Vec<(u32, u32)>, +) { + let mut cur = node.walk(); + for child in node.named_children(&mut cur) { + let s = unit_start(&child); + let e = child.end_position().row as u32 + 1; + + match child.kind() { + "namespace_definition" => { + // Flush pending glue before starting this namespace block. + flush_glue(glue, units); + + let name_node = child.child_by_field_name("name"); + let body = child + .child_by_field_name("body") + .unwrap_or(child); + + match name_node { + None => { + // Anonymous namespace: push "", recurse. + let mut new_prefix = prefix.to_vec(); + new_prefix.push("".to_string()); + build_blocks(body, source, &new_prefix, units, glue); + flush_glue(glue, units); + } + Some(nn) => match nn.kind() { + "namespace_identifier" => { + let name = &source[nn.start_byte()..nn.end_byte()]; + let mut new_prefix = prefix.to_vec(); + new_prefix.push(name.to_string()); + build_blocks(body, source, &new_prefix, units, glue); + flush_glue(glue, units); + } + "nested_namespace_specifier" => { + // e.g. `namespace outer::inner { ... }` + // All named children are namespace_identifier nodes. + let mut new_prefix = prefix.to_vec(); + let mut nc = nn.walk(); + for seg in nn.named_children(&mut nc) { + new_prefix.push(source[seg.start_byte()..seg.end_byte()].to_string()); + } + build_blocks(body, source, &new_prefix, units, glue); + flush_glue(glue, units); + } + _ => { + // Unknown name kind — treat entire namespace as glue. 
+ glue.push((s, e)); + } + }, + } + } + + "class_specifier" | "struct_specifier" => { + let name_node = child.child_by_field_name("name"); + let Some(nn) = name_node else { + // Anonymous class/struct — glue. + glue.push((s, e)); + continue; + }; + let name = match nn.kind() { + "type_identifier" => &source[nn.start_byte()..nn.end_byte()], + _ => { + // template_type or qualified_identifier — use full text + // as the symbol segment (includes template args). + &source[nn.start_byte()..nn.end_byte()] + } + }; + + flush_glue(glue, units); + let sym = build_symbol(prefix, &[name]); + units.push((sym, s, e, true)); + + if let Some(body) = child.child_by_field_name("body") { + let mut new_prefix = prefix.to_vec(); + new_prefix.push(name.to_string()); + build_blocks(body, source, &new_prefix, units, glue); + flush_glue(glue, units); + } + } + + "function_definition" => { + let decl = child.child_by_field_name("declarator"); + let Some(decl_node) = decl else { + glue.push((s, e)); + continue; + }; + + match extract_fn_symbol(decl_node, source, prefix) { + Some(sym) => { + flush_glue(glue, units); + units.push((sym, s, e, true)); + } + None => { + glue.push((s, e)); + } + } + } + + "template_declaration" => { + // Unwrap: recurse into named children with same prefix. + // The inner function/class/concept will be matched by their own + // arms. template_parameter_list is not a unit; it will fall + // through to glue (it's not a named child of the template_declaration + // that matches any of our arms). + build_blocks(child, source, prefix, units, glue); + // Do NOT flush glue here — template body may be part of a glue group. + } + + "enum_specifier" => { + if let Some(nn) = child.child_by_field_name("name") { + let name = &source[nn.start_byte()..nn.end_byte()]; + flush_glue(glue, units); + let sym = build_symbol(prefix, &[name]); + units.push((sym, s, e, true)); + } else { + // Anonymous enum — glue. + glue.push((s, e)); + } + } + + "concept_definition" => { + // C++20. 
Has required "name" field (identifier). + if let Some(nn) = child.child_by_field_name("name") { + let name = &source[nn.start_byte()..nn.end_byte()]; + flush_glue(glue, units); + let sym = build_symbol(prefix, &[name]); + units.push((sym, s, e, true)); + } else { + glue.push((s, e)); + } + } + + "linkage_specification" => { + // extern "C" { ... } — glue-wrapper, but recurse into body + // with same prefix so inner definitions are extracted. + let body = child.child_by_field_name("body").unwrap_or(child); + // The linkage_spec itself is glue; inner defs handled by recursion. + // Don't emit the wrapper as a unit; but also don't push it as glue + // since recursion will push its inner children individually. + build_blocks(body, source, prefix, units, glue); + } + + // Everything else: preproc, declarations, using, typedef, etc. + _ => { + glue.push((s, e)); + } + } + } +} + +/// Join prefix + extras into a `::` separated symbol. +fn build_symbol(prefix: &[String], extras: &[&str]) -> String { + let mut parts: Vec<&str> = prefix.iter().map(String::as_str).collect(); + parts.extend_from_slice(extras); + parts.join("::") +} + +/// Extract the symbol for a `function_definition` given its top-level +/// `declarator` node. Returns `None` if the name cannot be determined. +/// +/// The declarator chain may be: +/// - `function_declarator` (plain fn or method) +/// - `pointer_declarator` wrapping `function_declarator` (fn returning pointer) +/// - `reference_declarator` wrapping `function_declarator` (fn returning ref) +/// - `operator_cast` (conversion operator — e.g. 
`operator bool`) +/// +/// The inner `function_declarator.declarator` is one of: +/// - `identifier` → free fn or constructor, symbol = `prefix::name` +/// - `field_identifier` → method in class body, symbol = `prefix::name` +/// - `destructor_name` → `~Foo`, symbol = `prefix::~Foo` +/// - `operator_name` → `operator+` etc., symbol = `prefix::operator+` +/// - `qualified_identifier` → out-of-class def `Foo::bar` or `ns::Foo::bar`; +/// the scope chain is extracted and prepended to prefix. +/// +/// For `qualified_identifier`, the scope hierarchy (which may itself be a +/// `qualified_identifier`) is flattened into a list of segments. These +/// segments REPLACE the current prefix (since out-of-class defs carry their +/// full scope explicitly). Example: `void ns::Foo::bar() {}` at top level +/// with prefix=[] → segments=[ns, Foo, bar] → symbol = `ns::Foo::bar`. +fn extract_fn_symbol( + decl_node: tree_sitter::Node, + source: &str, + prefix: &[String], +) -> Option { + // Walk down pointer/reference wrapper layers to reach the + // function_declarator (or operator_cast at definition level). + let fn_decl = unwrap_to_fn_declarator(decl_node, source)?; + + match fn_decl.kind() { + "operator_cast" => { + // e.g. `operator bool() const` — the function_definition.declarator + // IS the operator_cast (no function_declarator wrapper). + // Symbol = `prefix::operator `. + let type_node = fn_decl.child_by_field_name("type")?; + let type_text = &source[type_node.start_byte()..type_node.end_byte()]; + Some(build_symbol(prefix, &[&format!("operator {type_text}")])) + } + "function_declarator" => { + let inner = fn_decl.child_by_field_name("declarator")?; + extract_name_node(inner, source, prefix) + } + _ => None, + } +} + +/// Walk pointer_declarator / reference_declarator chains down to the +/// first `function_declarator` or `operator_cast` node. +/// +/// Returns `None` if no such node is found (e.g. a function definition +/// whose declarator is malformed or unknown). 
fn unwrap_to_fn_declarator<'a>(
    mut node: tree_sitter::Node<'a>,
    _source: &str,
) -> Option<tree_sitter::Node<'a>> {
    loop {
        match node.kind() {
            "function_declarator" | "operator_cast" => return Some(node),
            "pointer_declarator" => {
                // The wrapped declarator is exposed through the `declarator` field.
                node = node.child_by_field_name("declarator")?;
            }
            "reference_declarator" | "rvalue_reference_declarator" => {
                // The wrapped declarator is not read through a `declarator`
                // field here; step to the first named child instead.
                let mut walker = node.walk();
                node = node.named_children(&mut walker).next()?;
            }
            _ => return None,
        }
    }
}

/// Given the innermost name node of a function_declarator, produce the symbol.
///
/// Returns `None` for node kinds that are not handled, for a template name
/// without a `name` field, and for a qualified name that flattens to nothing.
fn extract_name_node(
    inner: tree_sitter::Node,
    source: &str,
    prefix: &[String],
) -> Option<String> {
    match inner.kind() {
        // The node's full text is the leaf name: a plain name, `~Foo`
        // (destructor text includes the `~`), or `operator+` / `operator->` /
        // `operator()`.
        "identifier" | "field_identifier" | "destructor_name" | "operator_name" => {
            let name = &source[inner.start_byte()..inner.end_byte()];
            Some(build_symbol(prefix, &[name]))
        }
        "template_function" | "template_method" => {
            // Template function like `foo<T>()`. Use the `name` field
            // (the identifier / field_identifier before `<`).
            let name_node = inner.child_by_field_name("name")?;
            let name = &source[name_node.start_byte()..name_node.end_byte()];
            Some(build_symbol(prefix, &[name]))
        }
        "qualified_identifier" => {
            // Out-of-class method definition. Flatten the nested
            // qualified_identifier chain into ordered segments.
            // Example: `ns::Foo::method`
            //   qualified_identifier {
            //     scope: namespace_identifier "ns"
            //     name: qualified_identifier {
            //       scope: namespace_identifier "Foo"
            //       name: identifier "method"
            //     }
            //   }
            //   → ["ns", "Foo", "method"]
            //
            // These segments are combined with the current prefix so that a
            // top-level out-of-class def `void Foo::bar() {}` inside a
            // namespace body with prefix=["ns"] produces `ns::Foo::bar`.
            let mut segments: Vec<String> = Vec::new();
            flatten_qualified_id(inner, source, &mut segments);
            if segments.is_empty() {
                return None;
            }
            // prefix + all segments (scope chain + leaf), joined by the same
            // helper every other arm uses.
            let extras: Vec<&str> = segments.iter().map(String::as_str).collect();
            Some(build_symbol(prefix, &extras))
        }
        _ => None,
    }
}

/// Recursively flatten a `qualified_identifier` node into ordered string
/// segments. For `ns::Foo::method` this produces `["ns", "Foo", "method"]`.
///
/// Segments are appended to `out`; nothing already in `out` is removed.
fn flatten_qualified_id(node: tree_sitter::Node, source: &str, out: &mut Vec<String>) {
    // A qualified_identifier is read through two fields:
    //   scope: the qualifier on the left of `::` (absent for global-scope `::foo`)
    //   name:  identifier | field_identifier | destructor_name |
    //          operator_name | qualified_identifier | template_function |
    //          template_method | ...
    let scope_node = node.child_by_field_name("scope");
    let name_node = node.child_by_field_name("name");

    // The scope's full source text becomes one segment.
    if let Some(s) = scope_node {
        out.push(source[s.start_byte()..s.end_byte()].to_string());
    }

    match name_node {
        Some(n) if n.kind() == "qualified_identifier" => {
            // Recurse: more nesting.
            flatten_qualified_id(n, source, out);
        }
        Some(n) => {
            // Leaf name — push its text.
+ out.push(source[n.start_byte()..n.end_byte()].to_string()); + } + None => {} + } +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +pub(crate) mod tests_support { + use kebab_core::*; + use std::path::PathBuf; + use time::OffsetDateTime; + + pub fn fixed_code_asset(workspace_path: &str, lang: &str) -> RawAsset { + RawAsset { + asset_id: AssetId("a".repeat(64)), + source_uri: SourceUri::File(PathBuf::from(workspace_path)), + workspace_path: WorkspacePath(workspace_path.to_string()), + media_type: MediaType::Code(lang.to_string()), + byte_len: 0, + checksum: Checksum("b".repeat(64)), + discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + stored: AssetStorage::Reference { + path: PathBuf::from(workspace_path), + sha: Checksum("b".repeat(64)), + }, + } + } + + pub fn extract_cpp(src: &str, path: &str) -> kebab_core::CanonicalDocument { + use super::CppAstExtractor; + use kebab_core::Extractor; + let asset = fixed_code_asset(path, "cpp"); + let cfg = ExtractConfig::default(); + let root = PathBuf::from("/tmp"); + let ctx = ExtractContext { + asset: &asset, + workspace_root: &root, + config: &cfg, + }; + CppAstExtractor::new().extract(&ctx, src.as_bytes()).unwrap() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use kebab_core::{Block, MediaType, SourceSpan}; + + fn syms(doc: &kebab_core::CanonicalDocument) -> Vec { + let mut s: Vec = doc + .blocks + .iter() + .filter_map(|b| match b { + Block::Code(c) => match &c.common.source_span { + SourceSpan::Code { symbol, .. 
} => symbol.clone(), + _ => None, + }, + _ => None, + }) + .collect(); + s.sort(); + s + } + + #[test] + fn extractor_supports_only_media_code_cpp() { + let e = CppAstExtractor::new(); + assert!(e.supports(&MediaType::Code("cpp".into()))); + assert!(!e.supports(&MediaType::Code("c".into()))); + assert!(!e.supports(&MediaType::Code("rust".into()))); + assert!(!e.supports(&MediaType::Markdown)); + } + + #[test] + fn free_function() { + let src = "void foo() {}\n"; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!(s.iter().any(|x| x == "foo"), "got {s:?}"); + } + + #[test] + fn namespace_and_class() { + let src = r#" +namespace ns { + class Foo { + public: + void method() {} + Foo() {} + ~Foo() {} + int operator+(const Foo& o) { return 0; } + }; +} +"#; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!(s.iter().any(|x| x == "ns::Foo"), "ns::Foo missing: {s:?}"); + assert!(s.iter().any(|x| x == "ns::Foo::method"), "method missing: {s:?}"); + assert!(s.iter().any(|x| x == "ns::Foo::Foo"), "ctor missing: {s:?}"); + assert!(s.iter().any(|x| x == "ns::Foo::~Foo"), "dtor missing: {s:?}"); + assert!(s.iter().any(|x| x == "ns::Foo::operator+"), "op+ missing: {s:?}"); + } + + #[test] + fn anonymous_namespace() { + let src = r#" +namespace { + void hidden_fn() {} +} +"#; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!( + s.iter().any(|x| x == "::hidden_fn"), + "anon fn missing: {s:?}" + ); + } + + #[test] + fn nested_namespace_specifier() { + let src = r#" +namespace outer::inner { + void fn_in_nested() {} +} +"#; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!( + s.iter().any(|x| x == "outer::inner::fn_in_nested"), + "nested ns fn missing: {s:?}" + ); + } + + #[test] + fn out_of_class_method_def() { + let src = r#" +void ns::Foo::method() { } +"#; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = 
syms(&doc); + assert!( + s.iter().any(|x| x == "ns::Foo::method"), + "out-of-class method missing: {s:?}" + ); + } + + #[test] + fn template_declaration() { + let src = r#" +template +class Bar { + void tmpl_method() {} +}; + +template +void tmpl_free_fn(T x) {} +"#; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!(s.iter().any(|x| x == "Bar"), "Bar class missing: {s:?}"); + assert!( + s.iter().any(|x| x == "Bar::tmpl_method"), + "Bar::tmpl_method missing: {s:?}" + ); + assert!( + s.iter().any(|x| x == "tmpl_free_fn"), + "tmpl_free_fn missing: {s:?}" + ); + } + + #[test] + fn enum_and_concept() { + let src = r#" +enum class Color { Red, Green }; + +template +concept Printable = requires(T t) { t.print(); }; +"#; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!(s.iter().any(|x| x == "Color"), "Color missing: {s:?}"); + assert!(s.iter().any(|x| x == "Printable"), "Printable missing: {s:?}"); + } + + #[test] + fn extern_c_block() { + let src = r#" +extern "C" { + void c_fn1() {} + void c_fn2() {} +} +"#; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!(s.iter().any(|x| x == "c_fn1"), "c_fn1 missing: {s:?}"); + assert!(s.iter().any(|x| x == "c_fn2"), "c_fn2 missing: {s:?}"); + } + + #[test] + fn conversion_operator() { + let src = r#" +class Foo { + operator bool() const { return true; } +}; +"#; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!( + s.iter().any(|x| x == "Foo::operator bool"), + "conversion op missing: {s:?}" + ); + } + + #[test] + fn empty_file_produces_module() { + let src = ""; + let doc = tests_support::extract_cpp(src, "x/empty.cpp"); + let s = syms(&doc); + assert_eq!(s, vec![""], "expected : got {s:?}"); + } + + #[test] + fn glue_only_produces_module() { + let src = "#include \nusing namespace std;\n"; + let doc = tests_support::extract_cpp(src, "x/glue.cpp"); + let s = syms(&doc); + 
assert!(s.iter().any(|x| x == ""), "expected : got {s:?}"); + } + + #[test] + fn ptr_returning_function() { + let src = "int* ptr_fn(int x) { return &x; }\n"; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!(s.iter().any(|x| x == "ptr_fn"), "ptr_fn missing: {s:?}"); + } + + #[test] + fn ref_returning_operator() { + let src = r#" +class Foo { + Foo& operator=(const Foo& o) { return *this; } +}; +"#; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!( + s.iter().any(|x| x == "Foo::operator="), + "operator= missing: {s:?}" + ); + } + + #[test] + fn deterministic_across_runs() { + let src = r#" +namespace ns { + class Foo { + void method() {} + }; +} +void free_fn() {} +"#; + let a = tests_support::extract_cpp(src, "x/foo.cpp"); + for _ in 0..20 { + assert_eq!(tests_support::extract_cpp(src, "x/foo.cpp").blocks, a.blocks); + } + } +} diff --git a/crates/kebab-parse-code/src/lib.rs b/crates/kebab-parse-code/src/lib.rs index 854ba27..7659fdb 100644 --- a/crates/kebab-parse-code/src/lib.rs +++ b/crates/kebab-parse-code/src/lib.rs @@ -13,6 +13,8 @@ //! `kebab-parse-*` crates per design §8: must NOT depend on store / embed //! / llm / rag. +pub mod c; +pub mod cpp; pub mod go; pub mod java; pub mod javascript; @@ -25,6 +27,8 @@ pub(crate) mod scaffold; pub mod skip; pub mod typescript; +pub use c::{PARSER_VERSION as C_PARSER_VERSION, CAstExtractor}; +pub use cpp::{PARSER_VERSION as CPP_PARSER_VERSION, CppAstExtractor}; pub use go::{PARSER_VERSION as GO_PARSER_VERSION, GoAstExtractor}; pub use java::{PARSER_VERSION as JAVA_PARSER_VERSION, JavaAstExtractor}; pub use javascript::{PARSER_VERSION as JS_PARSER_VERSION, JavascriptAstExtractor}; diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 39a0941..dfa38db 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -22,7 +22,7 @@ Cargo workspace, 함수 호출 기반 모듈러 모놀리스. 
UI binary (`kebab- | OCR | Ollama vision LM (default `gemma4:e4b`) — `OcrEngine` trait 으로 Tesseract / Apple Vision 등 future swap (HOTFIXES P6-2) | | Image caption | Ollama vision LM, runtime gate `image.caption.enabled` (default OFF) | | PDF parser | `lopdf` per-page 텍스트, `chunker_version = "pdf-page-v1"` 가 PDF 자산에 하드코딩 (HOTFIXES P7-3) | -| code parser | `tree-sitter` + `tree-sitter-rust` / `tree-sitter-python` / `tree-sitter-typescript` / `tree-sitter-javascript` / `tree-sitter-go` / `tree-sitter-java` / `tree-sitter-kotlin-ng` — **parser-side** (`kebab-parse-code`), chunker-side 아님 (design §6.3). chunker versions: Rust = `code-rust-ast-v1`, Python = `code-python-ast-v1`, TypeScript = `code-ts-ast-v1`, JavaScript = `code-js-ast-v1`, Go = `code-go-ast-v1`, Java = `code-java-ast-v1`, Kotlin = `code-kotlin-ast-v1`. `ast_chunk_max_lines = 200` 상수 고정 (HOTFIXES 2026-05-19 — Chunker trait 이 per-medium config 미노출). Kotlin grammar 은 `tree-sitter-kotlin-ng` 사용 — bare `tree-sitter-kotlin` 은 tree-sitter 0.21–0.23 에 고착되어 있어 사용 불가. **Tier 2 (p10-2)**: YAML/k8s → `serde_yaml` + `k8s-manifest-resource-v1` (apiVersion+kind per resource), Dockerfile → `dockerfile-file-v1` (whole-file), Cargo.toml/go.mod/.json/.xml/.groovy → `manifest-file-v1` (whole-file). Tier 2 chunkers live in `kebab-chunk`; no tree-sitter grammar needed (structure from file type, not AST). **Tier 3 (p10-3)**: shell scripts (`.sh`/`.bash`/`.zsh`) direct → `code-text-paragraph-v1` (blank-line paragraph segmentation + 80-line / 20-overlap line-window for oversize). Same chunker also serves as fallback when Tier 1/2 emit 0 chunks or Err — non-k8s YAML / invalid YAML / AST extractor failures all picked up. symbol = None; lang preserved from input doc. | +| code parser | `tree-sitter` + `tree-sitter-rust` / `tree-sitter-python` / `tree-sitter-typescript` / `tree-sitter-javascript` / `tree-sitter-go` / `tree-sitter-java` / `tree-sitter-kotlin-ng` — **parser-side** (`kebab-parse-code`), chunker-side 아님 (design §6.3). 
chunker versions: Rust = `code-rust-ast-v1`, Python = `code-python-ast-v1`, TypeScript = `code-ts-ast-v1`, JavaScript = `code-js-ast-v1`, Go = `code-go-ast-v1`, Java = `code-java-ast-v1`, Kotlin = `code-kotlin-ast-v1`. `ast_chunk_max_lines = 200` 상수 고정 (HOTFIXES 2026-05-19 — Chunker trait 이 per-medium config 미노출). Kotlin grammar 은 `tree-sitter-kotlin-ng` 사용 — bare `tree-sitter-kotlin` 은 tree-sitter 0.21–0.23 에 고착되어 있어 사용 불가. **Tier 2 (p10-2)**: YAML/k8s → `serde_yaml` + `k8s-manifest-resource-v1` (apiVersion+kind per resource), Dockerfile → `dockerfile-file-v1` (whole-file), Cargo.toml/go.mod/.json/.xml/.groovy → `manifest-file-v1` (whole-file). Tier 2 chunkers live in `kebab-chunk`; no tree-sitter grammar needed (structure from file type, not AST). **Tier 3 (p10-3)**: shell scripts (`.sh`/`.bash`/`.zsh`) direct → `code-text-paragraph-v1` (blank-line paragraph segmentation + 80-line / 20-overlap line-window for oversize). Same chunker also serves as fallback when Tier 1/2 emit 0 chunks or Err — non-k8s YAML / invalid YAML / AST extractor failures all picked up. symbol = None; lang preserved from input doc. **Tier 1 family complete (p10-1D)**: C (`tree-sitter-c`, `code-c-ast-v1`, `.c`/`.h`) + C++ (`tree-sitter-cpp`, `code-cpp-ast-v1`, `.cpp`/`.cc`/`.cxx`/`.hpp`/`.hh`/`.hxx`). C symbol = function name only; C++ symbol = `namespace::Class::method` (recursive nesting). `.h` 가 C++ syntax 만나면 tree-sitter-c parse 실패 → Tier 3 fallback. | | 1B symbol path | workspace path → module path: Python = dotted prefix (`kebab_eval.metrics.compute_mrr`), TypeScript/JavaScript = slash-style prefix (`src/Foo.Foo.search`). Rust 1A-2 는 file-scope nesting 만 (workspace prefix 없음, 비일관 수용 — HOTFIXES 2026-05-20). | | TUI | Ratatui + crossterm — P9-1 Library 패널, P9-2/3/4 진행 예정 | | Desktop | Tauri 2 + `pdfjs-dist` (native PDF render backend 금지) — P9-5 | @@ -52,7 +52,7 @@ flowchart TB ppdf["kebab-parse-pdf"] pimg["kebab-parse-image"] paud["kebab-parse-audio
(P8 보류)"] - pcode["kebab-parse-code
(P10-1A-2 + P10-1B + P10-1C-Go + P10-1C-JK + P10-2 + P10-3)"] + pcode["kebab-parse-code
(P10-1A-2 + P10-1B + P10-1C-Go + P10-1C-JK + P10-2 + P10-3 + P10-1D)"] ptypes["kebab-parse-types"] norm["kebab-normalize"] chunk["kebab-chunk"] @@ -127,7 +127,7 @@ flowchart TB UI → store/llm/parse 직접 의존 금지. 모든 user-facing 진입은 `kebab-app` facade 만 통한다 (frozen 설계 §8). `kebab-cli` 가 `--config ` flag 를 honor 하려면 `kebab_app::*_with_config(cfg, …)` companion 을 통해 Config 을 명시적으로 thread 하는 패턴 — 자세한 이유는 [tasks/HOTFIXES.md](../tasks/HOTFIXES.md) 의 `--config` 항목. -`kebab-parse-code` 의 외부 tree-sitter grammar crate 의존: P10-1A-2 에서 `tree-sitter-rust` 추가, P10-1B 에서 `tree-sitter-python` / `tree-sitter-typescript` / `tree-sitter-javascript` 추가, P10-1C-Go 에서 `tree-sitter-go` 추가, P10-1C-JK 에서 `tree-sitter-java` / `tree-sitter-kotlin-ng` 추가. 모두 `kebab-parse-code` 에만 격리 (facade 룰 — UI crate / chunker 가 직접 import 금지). Kotlin 은 `tree-sitter-kotlin-ng` 사용 (bare `tree-sitter-kotlin` 은 tree-sitter 0.21–0.23 에 고착 — 사용 불가). +`kebab-parse-code` 의 외부 tree-sitter grammar crate 의존: P10-1A-2 에서 `tree-sitter-rust` 추가, P10-1B 에서 `tree-sitter-python` / `tree-sitter-typescript` / `tree-sitter-javascript` 추가, P10-1C-Go 에서 `tree-sitter-go` 추가, P10-1C-JK 에서 `tree-sitter-java` / `tree-sitter-kotlin-ng` 추가, P10-1D 에서 `tree-sitter-c` / `tree-sitter-cpp` 추가. 모두 `kebab-parse-code` 에만 격리 (facade 룰 — UI crate / chunker 가 직접 import 금지). Kotlin 은 `tree-sitter-kotlin-ng` 사용 (bare `tree-sitter-kotlin` 은 tree-sitter 0.21–0.23 에 고착 — 사용 불가). 
## 디렉토리 구조 @@ -165,9 +165,11 @@ kebab/ │ ├── kebab-source-fs/ # 워크스페이스 walk + checksum (P1-1) │ ├── kebab-parse-md/ # Markdown frontmatter + blocks (P1-2/3) │ ├── kebab-normalize/ # ParsedBlock → CanonicalDocument (P1-4) -│ ├── kebab-chunk/ # heading-aware + pdf-page-v1 + code-*-ast-v1 (Tier 1) + k8s-manifest-resource-v1 + dockerfile-file-v1 + manifest-file-v1 + tier2_shared (P10-2) + code-text-paragraph-v1 (P10-3) chunker (P1-5, P7-2, P10-1A-2, P10-1B, P10-1C-Go, P10-1C-JK, P10-2, P10-3) +│ ├── kebab-chunk/ # heading-aware + pdf-page-v1 + code-*-ast-v1 (Tier 1) + k8s-manifest-resource-v1 + dockerfile-file-v1 + manifest-file-v1 + tier2_shared (P10-2) + code-text-paragraph-v1 (P10-3) chunker (P1-5, P7-2, P10-1A-2, P10-1B, P10-1C-Go, P10-1C-JK, P10-2, P10-3, P10-1D) │ │ └── src/ -│ │ ├── code_*_ast_v1.rs # Tier 1 AST chunkers (rust/python/ts/js/go/java/kotlin) +│ │ ├── code_*_ast_v1.rs # Tier 1 AST chunkers (rust/python/ts/js/go/java/kotlin/c/cpp) +│ │ ├── code_c_ast_v1.rs # Tier 1 (p10-1D): C top-level fn / struct / enum / union +│ │ ├── code_cpp_ast_v1.rs # Tier 1 (p10-1D): C++ namespace::Class::method (recursive nesting) │ │ ├── k8s_manifest_resource_v1.rs # Tier 2 (p10-2): YAML multi-doc, apiVersion+kind per resource │ │ ├── dockerfile_file_v1.rs # Tier 2 (p10-2): whole-file Dockerfile │ │ ├── manifest_file_v1.rs # Tier 2 (p10-2): whole-file Cargo.toml / go.mod / .json / .xml / .groovy @@ -182,7 +184,7 @@ kebab/ │ ├── kebab-eval/ # golden query runner + metrics (P5-1, P5-2) │ ├── kebab-parse-image/ # ImageExtractor + Ollama OCR + caption (P6) │ ├── kebab-parse-pdf/ # lopdf per-page text extractor (P7-1) -│ ├── kebab-parse-code/ # tree-sitter AST extractors: Rust (P10-1A-2), Python + TypeScript + JavaScript (P10-1B), Go (P10-1C-Go), Java + Kotlin (P10-1C-JK — java.rs + kotlin.rs); chunker lives in kebab-chunk +│ ├── kebab-parse-code/ # tree-sitter AST extractors: Rust (P10-1A-2), Python + TypeScript + JavaScript (P10-1B), Go (P10-1C-Go), Java + Kotlin (P10-1C-JK — 
java.rs + kotlin.rs), C + C++ (P10-1D — c.rs + cpp.rs); chunker lives in kebab-chunk │ ├── kebab-app/ # facade (P0 시그니처 + P3-5/P6-4/P7-3 본체) │ ├── kebab-tui/ # Ratatui shell + Library 패널 (P9-1) │ ├── kebab-mcp/ # stdio MCP server — tools: schema, doctor, search, ask (P9-FB-30) diff --git a/docs/SMOKE.md b/docs/SMOKE.md index 52380b7..961ec0a 100644 --- a/docs/SMOKE.md +++ b/docs/SMOKE.md @@ -548,6 +548,54 @@ KB --json schema | jq '.stats.code_lang_breakdown' **Tier 3 citation.symbol 컨벤션**: 항상 `null`. 의미 단위 식별 안 함. `lang` 은 원본 lang 보존 (shell → `"shell"`, yaml → `"yaml"` 등). +## P10-1D C + C++ AST chunkers + +P10-3 와 동일한 격리 KB 설정. `.c` 와 `.cpp` 파일이 각자의 AST chunker 로 처리된다. + +```bash +# 1) C 파일 — top-level function symbol +cat > /tmp/kebab-smoke/workspace/parser.c <<'EOF' +#include <stddef.h> + +int parse_record(const char *line) { + if (line == NULL) return -1; + return 0; +} +EOF + +# 2) C++ 파일 — namespace::Class::method symbol +cat > /tmp/kebab-smoke/workspace/chunker.cpp <<'EOF' +namespace kebab { +namespace chunk { + +class Foo { +public: + void bar() { /* impl */ } +}; + +} // namespace chunk +} // namespace kebab +EOF + +# 3) ingest +KB ingest + +# 4) 언어별 검색 (citation.symbol 확인) +KB search --mode hybrid "parse_record" --code-lang c --json | \ + jq '{hits: [.hits[] | {symbol: .citation.symbol, lang: .citation.lang}]}' +# 기대: symbol = "parse_record" (function name only), lang = "c" + +KB search --mode hybrid "bar" --code-lang cpp --json | \ + jq '{hits: [.hits[] | {symbol: .citation.symbol, lang: .citation.lang}]}' +# 기대: symbol = "kebab::chunk::Foo" 또는 "kebab::chunk::Foo::bar" (namespace::Class[::method]), lang = "cpp" + +# 5) schema stats 에 C/C++ 카운트 확인 +KB --json schema | jq '.stats.code_lang_breakdown' +# 기대: {"c": N, "cpp": M, ...} +``` + +**Tier 1 (p10-1D) citation.symbol 컨벤션**: C 는 function name only (`parse_record` 같이 nesting 없음). C++ 는 `namespace::Class::method` (recursive namespace + class nesting).
`.h` 파일이 C++ syntax (namespace / template / class) 만나면 tree-sitter-c parse 실패 → p10-3 Tier 3 fallback (`code-text-paragraph-v1`) 으로 자동 picked up. + ## 검증 체크리스트 - `kebab doctor` 가 `--config` path 를 honor 하고 그 안의 `storage.data_dir` 를 출력 (XDG default 가 아님). @@ -584,6 +632,7 @@ rm -rf /tmp/kebab-smoke # 통째로 정리 - (P10-1C-JK) `.java` 파일은 `code-java-ast-v1`, `.kt`/`.kts` 파일은 `code-kotlin-ast-v1` 로 처리. `--code-lang java` / `--code-lang kotlin` 검색이 `citation.symbol` 에 `com.foo.Foo.bar` 형식 결과를 반환하면 wiring 정상. `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"java": N` / `"kotlin": N` 등장 확인. - (P10-2) `.yaml`/`.yml` 파일은 apiVersion+kind 파싱으로 k8s resource 별 chunk 생성 (`k8s-manifest-resource-v1`). `Dockerfile`/`Dockerfile.*` 는 전체 파일 단일 chunk (`dockerfile-file-v1`). `.toml`/`.json`/`.xml`/`.groovy`/`go.mod` 는 전체 파일 단일 chunk (`manifest-file-v1`). `--code-lang yaml` / `--code-lang dockerfile` / `--code-lang toml` 검색이 `citation.symbol` 에 각각 `Deployment/default/my-app` / `` / `` 형식 결과를 반환하면 wiring 정상. `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"yaml": N` / `"dockerfile": N` / `"toml": N` 등장 확인. - (P10-3) `.sh`/`.bash`/`.zsh` 파일은 direct Tier 3 (`code-text-paragraph-v1`). 비-k8s YAML (apiVersion+kind 없는 yaml) 은 k8s chunker 가 0 chunk → Tier 3 fallback 으로 picked up. `--code-lang shell` / `--code-lang yaml` 검색이 `citation.symbol = null`, `chunker_version = "code-text-paragraph-v1"` 결과를 반환하면 wiring 정상. `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"shell": N` 등장 확인. +- (P10-1D) `.c` / `.h` 파일은 `code-c-ast-v1` (function name only symbol). `.cpp`/`.cc`/`.cxx`/`.hpp`/`.hh`/`.hxx` 는 `code-cpp-ast-v1` (`namespace::Class::method` symbol). `--code-lang c` / `--code-lang cpp` 검색 동작 + `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"c": N` / `"cpp": M` 등장 확인. `.h` 파일이 C++ 내용 (namespace 등) 갖고 있으면 자동으로 Tier 3 (`code-text-paragraph-v1`) fallback 으로 picked up. 
- (P7-3 + follow-up) 동일 path 에 byte 가 다른 PDF 를 두 번째 ingest 하면 `purge_vector_orphans_for_workspace_path` 가 옛 chunk_id 를 LanceDB 에서 먼저 삭제, 이어서 `purge_orphan_at_workspace_path` 가 옛 doc / chunks / embedding_records 를 SQLite 에서 sweep. 새 byte 가 새 `doc_id` 로 색인됨. `IngestReport` 에 그 자산만 `new+=1` (다른 자산은 `updated`). 두 store 모두 정합 — 옛 본문 검색 시 옛 chunks 가 더 이상 surface 되지 않음. ### Embedding upgrade (fb-39b) diff --git a/docs/superpowers/plans/2026-05-21-p10-1d-c-cpp-ast-chunker.md b/docs/superpowers/plans/2026-05-21-p10-1d-c-cpp-ast-chunker.md new file mode 100644 index 0000000..89c74e7 --- /dev/null +++ b/docs/superpowers/plans/2026-05-21-p10-1d-c-cpp-ast-chunker.md @@ -0,0 +1,930 @@ +# p10-1D C + C++ AST Chunkers Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Activate C + C++ code ingest end-to-end. P10 Tier 1 chunker family final entry. + +**Architecture:** Same shape as 1B (multi-language single PR) and 1C-JK (JVM family). 2 new tree-sitter grammars + 2 extractors + 2 chunkers + media routing (delegated via `code_lang_for_path`, no change) + app dispatch arms. C symbol = function name only; C++ symbol = `namespace::Class::method` via recursive class/namespace nesting (Java/Kotlin + Python hybrid). + +**Tech Stack:** Rust 2024 workspace, `tree-sitter` 0.26 (already), `tree-sitter-c` + `tree-sitter-cpp` (NEW). 1A-2/1B/1C/p10-2/p10-3 infrastructure unchanged. + +**Memory note:** Host has been OOM'd previously (재부팅 사례). Per-crate cargo only. ONE full-suite + clippy invocation in Task J. NO `cargo test --workspace` outside that gate. + +--- + +## Pre-flight + +Branch `feat/p10-1d-c-cpp` already exists (spec commit `8add684`). + +- [ ] **Disk hygiene**: `df -h /` 점검. 80% 넘으면 `cargo clean`. 
+ +Reference files: +- 1C-JK extractor: `crates/kebab-parse-code/src/{java,kotlin}.rs` — closest template for source-side identifier prefix (package vs namespace). +- 1B Python extractor: `crates/kebab-parse-code/src/python.rs` — class-nesting recursion model (relevant for C++ class nesting). +- 1A-2 chunker: `crates/kebab-chunk/src/code_rust_ast_v1.rs` — duplicate-with-substitution pattern. +- 1B/1C/p10-2/p10-3 dispatch generalization: `crates/kebab-app/src/lib.rs::ingest_one_code_asset` (~L1796–2116). Current allowlist + 4-arm match. +- spec: `tasks/p10/p10-1d-c-cpp-ast-chunker.md`. + +--- + +## Task A: Workspace deps (tree-sitter-c + tree-sitter-cpp) + +**Files:** +- Modify: `Cargo.toml` (`[workspace.dependencies]`, after `tree-sitter-kotlin-ng`) +- Modify: `crates/kebab-parse-code/Cargo.toml` + +- [ ] **Step 1**: `cargo add tree-sitter-c tree-sitter-cpp -p kebab-parse-code`. If either crate's actively-maintained name differs (e.g. `tree-sitter-cpp` vs `tree-sitter-cpp-ng`), verify on crates.io. The `tree-sitter-c` 0.24 / `tree-sitter-cpp` 0.23 line is the most common; verify compatibility with workspace `tree-sitter = "0.26"` (likely already supported via the `tree-sitter-language` shim). + +- [ ] **Step 2**: Lift the two resolved versions into `[workspace.dependencies]` (after `tree-sitter-kotlin-ng`): + +```toml +# C/C++ family grammars for code ingest (kebab-parse-code, p10-1D). +tree-sitter-c = "" +tree-sitter-cpp = "" +``` + +Switch crate's `Cargo.toml` entries to `{ workspace = true }`. + +- [ ] **Step 3**: `cargo build -p kebab-parse-code` → clean. Unused dep warning is fine. + +- [ ] **Step 4**: Commit: + +```bash +git add Cargo.toml Cargo.lock crates/kebab-parse-code/Cargo.toml +git commit -m "$(cat <<'EOF' +build(p10-1d): add tree-sitter-c + tree-sitter-cpp workspace deps + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +If a crate's resolved name has a non-obvious fork suffix (e.g. `tree-sitter-cpp-ng`), document it in the commit body. 
+ +--- + +## Task B: C AST extractor (`kebab-parse-code/src/c.rs`) + +**Files:** +- Create: `crates/kebab-parse-code/src/c.rs` +- Modify: `crates/kebab-parse-code/src/lib.rs` (pub mod + `C_PARSER_VERSION` const) + +- [ ] **Step 1**: Create `crates/kebab-parse-code/src/c.rs`. Mirror `crates/kebab-parse-code/src/go.rs` (closest template — single-language, no namespace/package nesting, top-level units). Replace tree-sitter-go with tree-sitter-c: + +```rust +//! p10-1D: C AST extractor. + +use crate::traits::{Extractor, ExtractContext}; +use anyhow::{Context, Result}; +use kebab_core::{Block, BlockId, CanonicalDocument, CodeBlock, CommonBlock, /*..*/, SourceSpan, id_for_block, id_for_doc}; +use tree_sitter::Parser; + +pub const C_PARSER_VERSION: &str = concat!("tree-sitter-c-", env!("CARGO_PKG_VERSION")); +// Or use the tree-sitter-c crate version: better to hardcode for stability. +// Look at how go.rs / rust.rs / etc. set their PARSER_VERSION. + +pub struct CAstExtractor { + parser: Parser, +} + +impl CAstExtractor { + pub fn new() -> Self { + let mut parser = Parser::new(); + parser.set_language(&tree_sitter_c::LANGUAGE.into()).expect("load tree-sitter-c"); + Self { parser } + } +} + +impl Extractor for CAstExtractor { + fn extract(&mut self, ctx: &ExtractContext, bytes: &[u8]) -> Result { + // ... mirror go.rs: + // 1. parse the tree + // 2. iterate source_file's named_children + // 3. for each top-level node: + // - function_definition → emit unit (symbol = fn name) + // - struct_specifier (named) → emit unit (symbol = struct name) + // - enum_specifier (named) → emit unit (symbol = enum name) + // - union_specifier (named) → emit unit (symbol = union name) + // - declaration → glue + // - preproc_include / preproc_def / preproc_function_def / preproc_ifdef → glue + // - else → glue + // 4. glue chunk if any glue accumulated + // 5. post-pass if 0 units + // ... 
+ todo!("mirror go.rs structure with C-specific node-kind names") + } +} +``` + +**ACTION**: Read `crates/kebab-parse-code/src/go.rs` in full first. It's the closest template — single-language, no namespace prefix to thread through (C is even simpler than Go since there's no `package`). Port the structure: parse → iterate top-level → match on node-kind → emit units or accumulate glue. + +Node-kind name reference (tree-sitter-c): `function_definition`, `struct_specifier`, `enum_specifier`, `union_specifier`, `declaration`, `preproc_*`. Confirm by checking the crate's `node-types.json` if uncertain. + +**Function name extraction**: `function_definition` has a `declarator` field. The innermost `identifier` of that declarator is the function name. Mirror how go.rs extracts function names — it uses tree-sitter field traversal. + +- [ ] **Step 2**: Register the module in `crates/kebab-parse-code/src/lib.rs`: + +```rust +pub mod c; +pub use c::{CAstExtractor, C_PARSER_VERSION}; +``` + +- [ ] **Step 3**: Build: + +```bash +cargo build -p kebab-parse-code 2>&1 | tail -5 +``` + +Expected: clean. + +- [ ] **Step 4**: Commit (no test yet — Task D adds the snapshot test): + +```bash +git add crates/kebab-parse-code/src/c.rs crates/kebab-parse-code/src/lib.rs +git commit -m "$(cat <<'EOF' +feat(p10-1d): C AST extractor (tree-sitter-c) + +Top-level units: function_definition (symbol = fn name), struct_specifier, +enum_specifier, union_specifier (each emits 1 unit with the symbol being +the named identifier). Preprocessor directives + top-level declarations +group into a glue chunk. Empty file or zero units → +post-pass. + +C symbol = function name only — no namespace, no class nesting (design §3.4). 
+ +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task C: C++ AST extractor (`kebab-parse-code/src/cpp.rs`) + +**Files:** +- Create: `crates/kebab-parse-code/src/cpp.rs` +- Modify: `crates/kebab-parse-code/src/lib.rs` + +- [ ] **Step 1**: Create `crates/kebab-parse-code/src/cpp.rs`. The closest template is `crates/kebab-parse-code/src/java.rs` (1C-JK) — it handles package prefix + class nesting via recursion. C++ adds namespace nesting (multiple levels possible). + +Pseudocode: + +```rust +//! p10-1D: C++ AST extractor. + +use crate::traits::{Extractor, ExtractContext}; +use anyhow::{Context, Result}; +use kebab_core::{/* ... */}; +use tree_sitter::{Node, Parser}; + +pub const CPP_PARSER_VERSION: &str = "tree-sitter-cpp-"; + +pub struct CppAstExtractor { parser: Parser } + +impl CppAstExtractor { + pub fn new() -> Self { + let mut parser = Parser::new(); + parser.set_language(&tree_sitter_cpp::LANGUAGE.into()).expect("load tree-sitter-cpp"); + Self { parser } + } + + fn visit(&self, node: Node, source: &[u8], prefix: &[&str], units: &mut Vec<(String, Node)>, glue: &mut Vec) { + // prefix is the namespace/class chain so far (e.g. ["kebab", "chunk", "MdHeadingV1Chunker"]). + for child in node.named_children(&mut node.walk()) { + match child.kind() { + "namespace_definition" => { + let name = child.child_by_field_name("name") + .and_then(|n| n.utf8_text(source).ok()) + .unwrap_or(""); + let mut new_prefix = prefix.to_vec(); + new_prefix.push(name); + let body = child.child_by_field_name("body").unwrap_or(child); + self.visit(body, source, &new_prefix, units, glue); + } + "class_specifier" | "struct_specifier" if child.child_by_field_name("name").is_some() => { + let name = child.child_by_field_name("name") + .and_then(|n| n.utf8_text(source).ok()) + .unwrap_or(""); + // Emit the class itself as a unit. + let symbol = build_symbol(prefix, &[], name); // e.g. 
"kebab::chunk::Foo" + units.push((symbol, child)); + // Recurse for nested classes / methods. + let mut new_prefix = prefix.to_vec(); + new_prefix.push(name); + let body = child.child_by_field_name("body").unwrap_or(child); + self.visit(body, source, &new_prefix, units, glue); + } + "function_definition" => { + // declarator may be qualified_identifier (out-of-class def) or plain identifier. + let symbol = extract_fn_symbol(child, source, prefix); + units.push((symbol, child)); + // Do NOT recurse into function body — inner classes/lambdas left to a future revision. + } + "template_declaration" => { + // Recurse: unwrap to inner declarator (function_definition or class_specifier) + // and treat it as if it were directly there. Template params NOT in symbol. + self.visit(child, source, prefix, units, glue); + } + "enum_specifier" if child.child_by_field_name("name").is_some() => { + let name = child.child_by_field_name("name").and_then(|n| n.utf8_text(source).ok()).unwrap_or(""); + let symbol = build_symbol(prefix, &[], name); + units.push((symbol, child)); + } + "concept_definition" => { + let name = /* extract */; + let symbol = build_symbol(prefix, &[], &name); + units.push((symbol, child)); + } + _ => glue.push(child), + } + } + } +} + +fn build_symbol(prefix: &[&str], extras: &[&str], leaf: &str) -> String { + // Join with :: + let mut parts: Vec<&str> = prefix.iter().copied().collect(); + parts.extend_from_slice(extras); + parts.push(leaf); + parts.join("::") +} + +fn extract_fn_symbol(node: Node, source: &[u8], prefix: &[&str]) -> String { + // function_definition.declarator may be a function_declarator wrapping a + // qualified_identifier (out-of-class def like `void Foo::bar(){}`) or a + // plain identifier (free fn or in-namespace fn). + // Need to walk down to the leaf identifier and any qualifier chain. + // For qualified_identifier "Foo::bar::baz", break into ["Foo", "bar"] qualifier + "baz" leaf. + // ... 
+ todo!("walk declarator → qualified_identifier → assemble symbol with prefix") +} + +// Extractor impl: parse, visit(root, ...), emit chunks-of-blocks per (symbol, node) pair + glue + fallback. +``` + +This is the most intricate extractor in p10-1D. **Action**: read `crates/kebab-parse-code/src/java.rs` for the recursion pattern, then `crates/kebab-parse-code/src/python.rs` for the class-nesting pattern, and combine. tree-sitter-cpp's node-types.json (or a quick `tree-sitter parse` against a sample file) confirms exact node-kind names. + +- [ ] **Step 2**: Register in `crates/kebab-parse-code/src/lib.rs`: + +```rust +pub mod cpp; +pub use cpp::{CppAstExtractor, CPP_PARSER_VERSION}; +``` + +- [ ] **Step 3**: Build: + +```bash +cargo build -p kebab-parse-code 2>&1 | tail -5 +``` + +Expected: clean. + +- [ ] **Step 4**: Commit: + +```bash +git add crates/kebab-parse-code/src/cpp.rs crates/kebab-parse-code/src/lib.rs +git commit -m "$(cat <<'EOF' +feat(p10-1d): C++ AST extractor (tree-sitter-cpp) + +Symbol = namespace::Class::method via recursive visit. namespace_definition +pushes namespace name (anonymous → ). class_specifier / struct_specifier +(named) emit class unit + recurse with class name pushed. function_definition +emits method unit (symbol may include qualified_identifier prefix for +out-of-class definitions). template_declaration unwraps to inner declarator +(template params NOT in symbol). enum_specifier + concept_definition emit +type-level units. extern "C" block content + using/include/define → glue. + +Constructor / destructor symbols use Class::Class / Class::~Class +convention. Operator overloads keep operator+ form. 
+ +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task D: C chunker + snapshot test + +**Files:** +- Create: `crates/kebab-chunk/src/code_c_ast_v1.rs` +- Create: `crates/kebab-chunk/tests/fixtures/sample.c` +- Create: `crates/kebab-chunk/tests/code_c_ast_snapshot.rs` +- Modify: `crates/kebab-chunk/src/lib.rs` + +- [ ] **Step 1**: Create `crates/kebab-chunk/src/code_c_ast_v1.rs`. **Mirror `crates/kebab-chunk/src/code_go_ast_v1.rs`** (closest 1-extractor pattern, no nesting): + +```rust +//! p10-1D: C AST chunker. + +use crate::tier2_shared::build_chunk; +use crate::{Chunker, ChunkPolicy}; +use anyhow::Result; +use kebab_core::{Block, Chunk, Document}; + +pub const VERSION_LABEL: &str = "code-c-ast-v1"; + +pub struct CodeCAstV1Chunker; + +impl Chunker for CodeCAstV1Chunker { + fn chunker_version(&self) -> &'static str { VERSION_LABEL } + fn policy_hash(&self, policy: &ChunkPolicy) -> String { + crate::tier2_shared::policy_hash(policy) + } + fn chunk(&self, doc: &Document, policy: &ChunkPolicy) -> Result<Vec<Chunk>> { + // Mirror code_go_ast_v1.rs's body — iterate doc.blocks, each Block::Code + // contributes 1 chunk via build_chunk. Apply oversize fallback per block + // via tier2_shared::push_chunks_with_oversize. + // ... + todo!("mirror code_go_ast_v1.rs verbatim, substituting VERSION_LABEL") + } +} +``` + +Read `code_go_ast_v1.rs` and port verbatim — the language-agnostic body iterates `doc.blocks` and emits chunks. Only the `VERSION_LABEL` and (potentially) symbol formatting helper change.
+ +- [ ] **Step 2**: Create `tests/fixtures/sample.c` (~30 lines, includes top-level fn, struct, enum, preprocessor): + +```c +#include +#include + +#define MAX_BUF 4096 + +typedef enum { + OK = 0, + ERR_PARSE, + ERR_IO, +} status_t; + +typedef struct { + int id; + char name[64]; + status_t status; +} record_t; + +static int counter = 0; + +int parse_record(const char *line, record_t *out) { + if (line == NULL || out == NULL) return ERR_PARSE; + return OK; +} + +void print_record(const record_t *r) { + printf("[%d] %s (status=%d)\n", r->id, r->name, r->status); +} + +int main(void) { + record_t r = { .id = 1, .name = "foo", .status = OK }; + print_record(&r); + return 0; +} +``` + +Expected snapshot: 3 function units (`parse_record`, `print_record`, `main`) + 1 enum unit (`status_t`) + 1 struct unit (`record_t`) + 1 `` glue (preproc + global var). Total ~6 chunks. Caveat: the fixture declares both types as `typedef enum { ... } status_t;` / `typedef struct { ... } record_t;`, i.e. anonymous specifiers wrapped in a top-level `type_definition`, and the frozen design routes top-level typedefs to glue (only *named* `enum_specifier` / `struct_specifier` emit units). As written the extractor would therefore yield ~4 chunks (3 functions + glue); to get the `status_t` / `record_t` units, either add a `type_definition` arm to the extractor (symbol = typedef name) or switch the fixture to named `enum` / `struct` declarations. + +- [ ] **Step 3**: Create `tests/code_c_ast_snapshot.rs` mirroring `tests/code_go_ast_snapshot.rs`. Assertions: + +```rust +// Pseudocode: +// 1. Load fixture sample.c +// 2. Run CAstExtractor → Document +// 3. Run CodeCAstV1Chunker.chunk(&doc, &policy) +// 4. Assert chunks.len() == expected (6). +// 5. Assert symbols (from chunks[i].source_spans[0]::SourceSpan::Code.symbol) match expected list: +// ["status_t", "record_t", "parse_record", "print_record", "main", ""] +// (order matches AST traversal order — verify by running once.) +// 6. Assert all chunks have lang = Some("c"). +``` + +- [ ] **Step 4**: Register module in `crates/kebab-chunk/src/lib.rs`: + +```rust +pub mod code_c_ast_v1; +pub use code_c_ast_v1::CodeCAstV1Chunker; +``` + +- [ ] **Step 5**: Run test: + +```bash +cargo test -p kebab-chunk --test code_c_ast_snapshot -- --nocapture 2>&1 | tail -25 +``` + +Expected: PASS. If chunk count or symbol order differs from expectation, INSPECT the actual output and update the test's expected list to match (run once to learn, codify on second run).
+ +- [ ] **Step 6**: Clippy + commit: + +```bash +cargo clippy -p kebab-chunk --all-targets -- -D warnings +git add crates/kebab-chunk/src/code_c_ast_v1.rs \ + crates/kebab-chunk/src/lib.rs \ + crates/kebab-chunk/tests/fixtures/sample.c \ + crates/kebab-chunk/tests/code_c_ast_snapshot.rs +git commit -m "$(cat <<'EOF' +feat(p10-1d): code-c-ast-v1 chunker + snapshot test + +Mirrors code-go-ast-v1's chunker pattern (1 chunk per AST unit + +glue + oversize fallback). Snapshot test against tests/fixtures/sample.c +(function + struct + enum + preprocessor) verifies symbol order + lang=c +stamping. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task E: C++ chunker + snapshot test + +**Files:** +- Create: `crates/kebab-chunk/src/code_cpp_ast_v1.rs` +- Create: `crates/kebab-chunk/tests/fixtures/sample.cpp` +- Create: `crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs` +- Modify: `crates/kebab-chunk/src/lib.rs` + +- [ ] **Step 1**: Create `code_cpp_ast_v1.rs`. **Mirror `code_c_ast_v1.rs`** verbatim, only VERSION_LABEL differs: + +```rust +pub const VERSION_LABEL: &str = "code-cpp-ast-v1"; + +pub struct CodeCppAstV1Chunker; + +impl Chunker for CodeCppAstV1Chunker { + fn chunker_version(&self) -> &'static str { VERSION_LABEL } + // ... identical body — both languages use the same Block::Code → Chunk emission ... +} +``` + +The actual symbol-formatting work happens in the EXTRACTOR (Task C). The chunker's job is to iterate blocks the extractor produced and emit Chunks. Both C and C++ chunkers are essentially identical bodies. 
+ +- [ ] **Step 2**: Create `tests/fixtures/sample.cpp` (~50 lines, includes nested namespace + class + method + free fn + template): + +```cpp +#include +#include + +namespace kebab { +namespace chunk { + +class MdHeadingV1Chunker { +public: + MdHeadingV1Chunker() = default; + ~MdHeadingV1Chunker() = default; + + std::string chunk_doc(const std::string& doc) { + return doc; + } + + int operator()(int x) const { + return x * 2; + } + +private: + int counter_ = 0; +}; + +template <typename T> +T identity(T value) { + return value; +} + +} // namespace chunk + +void global_helper() { + // free function in kebab namespace +} + +} // namespace kebab + +int main() { + kebab::chunk::MdHeadingV1Chunker c; + return 0; +} +``` + +Expected snapshot symbols (verify on first run, then codify): +- `kebab::chunk::MdHeadingV1Chunker` (class unit) +- `kebab::chunk::MdHeadingV1Chunker::MdHeadingV1Chunker` (constructor) +- `kebab::chunk::MdHeadingV1Chunker::~MdHeadingV1Chunker` (destructor) +- `kebab::chunk::MdHeadingV1Chunker::chunk_doc` +- `kebab::chunk::MdHeadingV1Chunker::operator()` +- `kebab::chunk::identity` (template fn) +- `kebab::global_helper` +- `main` (free fn, no namespace) +- `` (include + using) + +~9 chunks total. + +- [ ] **Step 3**: Create `tests/code_cpp_ast_snapshot.rs` mirroring `code_c_ast_snapshot.rs`. Assert symbol list matches expected (run once to learn the actual order, codify). + +- [ ] **Step 4**: Register module in `lib.rs`: + +```rust +pub mod code_cpp_ast_v1; +pub use code_cpp_ast_v1::CodeCppAstV1Chunker; +``` + +- [ ] **Step 5**: Run test: + +```bash +cargo test -p kebab-chunk --test code_cpp_ast_snapshot -- --nocapture 2>&1 | tail -30 +``` + +Expected: PASS.
+ +- [ ] **Step 6**: Clippy + commit: + +```bash +cargo clippy -p kebab-chunk --all-targets -- -D warnings +git add crates/kebab-chunk/src/code_cpp_ast_v1.rs \ + crates/kebab-chunk/src/lib.rs \ + crates/kebab-chunk/tests/fixtures/sample.cpp \ + crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs +git commit -m "$(cat <<'EOF' +feat(p10-1d): code-cpp-ast-v1 chunker + snapshot test + +Identical chunker body to code-c-ast-v1; per-language work happens in the +CppAstExtractor (Task C). Snapshot fixture covers nested namespace + +class + ctor/dtor + method + operator overload + template fn + free fn + +top-level main, verifying namespace::Class::method symbol convention per +design §3.4. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task F: ingest_one_code_asset dispatch + tier3 fallback list extension + +**Files:** +- Modify: `crates/kebab-app/src/lib.rs` + +- [ ] **Step 1**: Top-of-file `use kebab_chunk::{...}` extend with `CodeCAstV1Chunker` + `CodeCppAstV1Chunker`: + +```rust +use kebab_chunk::{ + /* existing items */, + CodeCAstV1Chunker, + CodeCppAstV1Chunker, +}; +``` + +- [ ] **Step 2**: Allowlist (around line 953) extend: + +```rust +if matches!(lang.as_str(), + "rust" | "python" | "typescript" | "javascript" | "go" | "java" | "kotlin" + | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" + | "shell" + | "c" | "cpp") +``` + +- [ ] **Step 3**: `parser_version` match — add C/C++ arms (Tier 1, so they DO get a real parser version): + +```rust +let parser_version = match code_lang { + // ... existing 7 Tier 1 + Tier 2 + shell arms ... + "c" => ParserVersion(kebab_parse_code::C_PARSER_VERSION.to_string()), + "cpp" => ParserVersion(kebab_parse_code::CPP_PARSER_VERSION.to_string()), + other => anyhow::bail!("unsupported code_lang: {other}"), +}; +``` + +- [ ] **Step 4**: `chunker_version` match — add C/C++ arms: + +```rust +let chunker_version = match code_lang { + // ... existing arms ... 
+ "c" => CodeCAstV1Chunker.chunker_version(), + "cpp" => CodeCppAstV1Chunker.chunker_version(), + other => anyhow::bail!("unreachable chunker_version: {other}"), +}; +``` + +- [ ] **Step 5**: `canonical_result` extract match — add C/C++ arms: + +```rust +let canonical_result: anyhow::Result = match code_lang { + "rust" => RustAstExtractor::new().extract(&ctx, &bytes).context("..."), + // ... existing ... + "c" => CAstExtractor::new().extract(&ctx, &bytes) + .context("kb-parse-code::CAstExtractor::extract (code:c)"), + "cpp" => CppAstExtractor::new().extract(&ctx, &bytes) + .context("kb-parse-code::CppAstExtractor::extract (code:cpp)"), + // ... Tier 2 + shell ... + other => anyhow::bail!("unreachable (extract): {other}"), +}; +``` + +(Add `use kebab_parse_code::{CAstExtractor, CppAstExtractor};` at the top if not already wildcard-imported.) + +- [ ] **Step 6**: `chunks_result` match — add C/C++ arms: + +```rust +let chunks_result: anyhow::Result> = if extract_fell_back { + // ... existing ... +} else { + match code_lang { + "rust" => CodeRustAstV1Chunker.chunk(&canonical, chunk_policy).context("..."), + // ... existing ... + "c" => CodeCAstV1Chunker.chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeCAstV1Chunker::chunk (code:c)"), + "cpp" => CodeCppAstV1Chunker.chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeCppAstV1Chunker::chunk (code:cpp)"), + // ... existing ... 
+ other => anyhow::bail!("unreachable (chunk): {other}"), + } +}; +``` + +- [ ] **Step 7**: `tier3_fallback_cv` (p10-3 Critical fix) — C/C++ are fallback-eligible (extract may fail on `.h` C++ headers or malformed code): + +```rust +let tier3_fallback_cv = match code_lang { + "rust" | "python" | "typescript" | "javascript" + | "go" | "java" | "kotlin" + | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" + | "c" | "cpp" // p10-1d: + => Some(CodeTextParagraphV1Chunker.chunker_version()), + _ => None, +}; +``` + +(The exact location of this match is in `ingest_one_code_asset` between ~lines 1921-1927 per the p10-3 critical fix.) + +- [ ] **Step 8**: Build: + +```bash +cargo build -p kebab-app 2>&1 | tail -5 +``` + +Expected: clean. + +- [ ] **Step 9**: Per-crate test (no regression): + +```bash +cargo test -p kebab-app --lib -- --nocapture 2>&1 | tail -10 +``` + +Expected: 52 PASS (existing baseline). + +- [ ] **Step 10**: Clippy + commit: + +```bash +cargo clippy -p kebab-app --all-targets -- -D warnings +git add crates/kebab-app/src/lib.rs +git commit -m "$(cat <<'EOF' +feat(p10-1d): activate C + C++ in ingest_one_code_asset dispatch + +Extends 4-arm match (parser_version / chunker_version / extract / chunks) ++ allowlist + tier3_fallback_cv list with "c" + "cpp" arms. C uses +CAstExtractor + CodeCAstV1Chunker; C++ uses CppAstExtractor + +CodeCppAstV1Chunker. Both langs are Tier 3-fallback-eligible (e.g. .h +file with C++ syntax may fail tree-sitter-c parse → Tier 3 paragraph +fallback). 
+ +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task G: code_ingest_smoke integration tests (C + C++) + +**Files:** +- Modify: `crates/kebab-app/tests/code_ingest_smoke.rs` + +- [ ] **Step 1**: Append 2 tests at the end of the file (mirror the existing tier1 tests `c_ast_v1_*` if present; if not, mirror `rust_ast_v1_*` or `go_ast_v1_*`): + +```rust +#[test] +fn tier1_c_ingest_searchable() { + let env = TestEnv::lexical_only(); + let workspace = env.workspace_root(); + std::fs::write( + workspace.join("parser.c"), + "#include \n\nint parse_record(const char *line) {\n if (line == NULL) return -1;\n return 0;\n}\n", + ) + .unwrap(); + + let report = env.ingest().expect("ingest"); + assert!(report.new_docs >= 1, "expected at least 1 new doc"); + + let hits = env.search_code_lang("c", "parse_record").expect("search"); + assert!(!hits.is_empty(), "expected at least 1 c hit"); + + match &hits[0].citation { + Citation::Code { symbol, lang, .. } => { + assert_eq!(symbol.as_deref(), Some("parse_record"), "C symbol must be function name only"); + assert_eq!(lang.as_deref(), Some("c")); + } + other => panic!("expected Citation::Code, got {other:?}"), + } + assert_eq!( + hits[0].chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-c-ast-v1"), + ); +} + +#[test] +fn tier1_cpp_ingest_searchable() { + let env = TestEnv::lexical_only(); + let workspace = env.workspace_root(); + std::fs::write( + workspace.join("chunker.cpp"), + "namespace kebab {\nnamespace chunk {\nclass Foo {\npublic:\n void bar() { /* impl */ }\n};\n}\n}\n", + ) + .unwrap(); + + let report = env.ingest().expect("ingest"); + assert!(report.new_docs >= 1); + + let hits = env.search_code_lang("cpp", "bar").expect("search"); + assert!(!hits.is_empty(), "expected at least 1 cpp hit"); + + match &hits[0].citation { + Citation::Code { symbol, lang, .. } => { + // Symbol could be "kebab::chunk::Foo::bar" or "kebab::chunk::Foo" depending on which chunk hits first. 
+ assert!( + symbol.as_deref().map_or(false, |s| s.starts_with("kebab::chunk::Foo")), + "C++ symbol must start with namespace::Class prefix, got {:?}", symbol + ); + assert_eq!(lang.as_deref(), Some("cpp")); + } + other => panic!("expected Citation::Code, got {other:?}"), + } + assert_eq!( + hits[0].chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-cpp-ast-v1"), + ); +} +``` + +- [ ] **Step 2**: Run tests: + +```bash +cargo test -p kebab-app --test code_ingest_smoke tier1_c_ingest tier1_cpp_ingest -- --nocapture 2>&1 | tail -30 +``` + +Expected: 2 PASS. + +- [ ] **Step 3**: Full smoke regression: + +```bash +cargo test -p kebab-app --test code_ingest_smoke -- --nocapture 2>&1 | tail -30 +``` + +Expected: 18 PASS (16 existing + 2 new). + +- [ ] **Step 4**: Clippy + commit: + +```bash +cargo clippy -p kebab-app --tests -- -D warnings +git add crates/kebab-app/tests/code_ingest_smoke.rs +git commit -m "$(cat <<'EOF' +test(p10-1d): integration smoke tests for C + C++ + +Verifies end-to-end ingest + search + Citation::Code shape: +- tier1_c_ingest_searchable: .c file → --code-lang c search → symbol + = function name (no nesting), lang = "c", chunker_version = "code-c-ast-v1". +- tier1_cpp_ingest_searchable: .cpp file → --code-lang cpp search → + symbol starts with namespace::Class prefix, lang = "cpp", + chunker_version = "code-cpp-ast-v1". + +Brings code_ingest_smoke to 18 tests (Rust 3 + Python 1 + TS 1 + JS 1 + +Go 1 + Java 1 + Kotlin 1 + yaml 1 + dockerfile 1 + manifest 1 + shell 1 + +yaml-fallback 1 + 2 reingest-unchanged regression + c 1 + cpp 1). + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task H: frozen design §10 activation log + +**Files:** +- Modify: `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` + +- [ ] **Step 1**: Find §10 activation log. 
Add p10-1D entry right after the p10-3 entry: + +``` +**p10-1D 활성화 (C + C++) (2026-05-21)**: Tier 1 chunker family 완료 — C (`code-c-ast-v1`, `.c`/`.h`) + C++ (`code-cpp-ast-v1`, `.cpp`/`.cc`/`.cxx`/`.hpp`/`.hh`/`.hxx`) AST chunker 활성화. C symbol = function name only; C++ symbol = `namespace::Class::method` (recursive namespace + class nesting). `.h` 가 C++ syntax 만나면 tree-sitter-c parse 실패 → p10-3 Tier 3 fallback 으로 자동 picked up. +``` + +- [ ] **Step 2**: Commit: + +```bash +git add docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md \ + docs/superpowers/specs/2026-04-27-kebab-final-form-design.md 2>/dev/null +git add docs/superpowers/specs/2026-04-27-kebab-final-form-design.md +git commit -m "$(cat <<'EOF' +docs(p10-1d): activate C + C++ in frozen design §10 + +P10 Tier 1 chunker family complete. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task I: README + HANDOFF + ARCHITECTURE + SMOKE + tasks/INDEX + tasks/p10/INDEX + +**Files:** +- Modify: `README.md` (Mermaid + ingest row), `HANDOFF.md`, `docs/ARCHITECTURE.md`, `docs/SMOKE.md`, `tasks/INDEX.md`, `tasks/p10/INDEX.md` + +- [ ] **Step 1 — README.md**: Update the `kebab ingest` row's supported-langs list to include `.c` / `.h` → `code-c-ast-v1` and `.cpp`/`.cc`/`.cxx`/`.hpp`/`.hh`/`.hxx` → `code-cpp-ast-v1`. Extend `--code-lang c` / `--code-lang cpp` in the enumeration. Update the Mermaid `chunker[...]` node to include `code-c-ast-v1, code-cpp-ast-v1` in the brace. + +- [ ] **Step 2 — HANDOFF.md**: P10 row append `, **1D ✅ (C + C++ AST chunkers, code-c-ast-v1 + code-cpp-ast-v1 — v0.16.0)**`. Update 한 줄 요약 to include C/C++. Update 다음 후보 (drop p10-1D; remaining: P9-5 desktop / P8 audio). + +- [ ] **Step 3 — docs/ARCHITECTURE.md**: code parser table row: append C + C++ row mention. Flowchart `pcode` node: append `+ P10-1D`. Directory tree chunkers list: add `code_c_ast_v1.rs` + `code_cpp_ast_v1.rs`. 
+ +- [ ] **Step 4 — docs/SMOKE.md**: Add a "## P10-1D C + C++ AST chunker" section after the P10-3 section. Walkthrough with sample.c + sample.cpp ingest + `--code-lang c` / `--code-lang cpp` search assertions. Append verification checklist entry. + +- [ ] **Step 5 — tasks/INDEX.md + tasks/p10/INDEX.md**: Flip p10-1D row ⏳ → ✅ (v0.16.0). + +- [ ] **Step 6**: Commit: + +```bash +git add README.md HANDOFF.md docs/ARCHITECTURE.md docs/SMOKE.md tasks/INDEX.md tasks/p10/INDEX.md +git commit -m "$(cat <<'EOF' +docs(p10-1d): README/HANDOFF/ARCHITECTURE/SMOKE/INDEX sync + +P10 Tier 1 chunker family complete (Rust + Python + TS + JS + Go + Java + +Kotlin + C + C++). Tier 2 (k8s + dockerfile + manifest) and Tier 3 +(paragraph fallback) already active. p10-1D 활성화 + ✅ flip. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task J: workspace test gate + clippy + +- [ ] **Step 1**: Disk check (`df -h /`) + optional `cargo clean`. + +- [ ] **Step 2**: `cargo test --workspace --no-fail-fast -j 1 2>&1 | tail -80`. Expected: all PASS. + +- [ ] **Step 3**: `cargo clippy --workspace --all-targets -- -D warnings 2>&1 | tail -30`. Expected: clean. + +--- + +## Task K: version bump + gitea PR + release + +**Files:** +- Modify: `Cargo.toml` + +- [ ] **Step 1**: Workspace `version = "0.15.0"` → `"0.16.0"`. + +- [ ] **Step 2**: `cargo build -p kebab-cli` to refresh Cargo.lock. + +- [ ] **Step 3**: Commit: + +```bash +git add Cargo.toml Cargo.lock +git commit -m "$(cat <<'EOF' +chore: bump version 0.15.0 → 0.16.0 (p10-1d C + C++ AST chunkers) + +Minor bump — additive new chunker_versions code-c-ast-v1 + code-cpp-ast-v1 ++ new routing langs c / cpp + new tree-sitter-c / tree-sitter-cpp workspace +deps. P10 Tier 1 chunker family complete. No DB migration, no wire schema +major bump. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +- [ ] **Step 4**: Push branch + open gitea PR via REST API. 
Title: `feat(p10-1d): C + C++ AST chunkers — P10 Tier 1 chunker family complete`. + +- [ ] **Step 5**: Wait for code-reviewer APPROVE → merge via gitea REST API → cut `gitea-release v0.16.0`. + +--- + +## Verification matrix + +| 검증 | 명령 | 기대 | +|------|------|------| +| C symbol | `kebab search --code-lang c --json` | `Citation::Code.symbol = "<func_name>"` | +| C++ symbol | `kebab search --code-lang cpp --json` | `Citation::Code.symbol = "namespace::Class::method"` | +| .h fallback | `.h` with C++ syntax → ingest | Tier 3 fallback: `chunker_version = "code-text-paragraph-v1"`, lang = c | +| code_lang_breakdown | `kebab schema --json` | `"c": N`, `"cpp": M` | + +--- + +## Risks reminder (구현 중 주의) + +- **tree-sitter grammar version resolution**: tree-sitter 0.26 호환 grammar. crates.io 최신 버전 default. +- **tree-sitter-cpp 의 node-kind 명**: spec 의 가정 (`namespace_definition`, `class_specifier`, `function_definition`, `template_declaration`, `concept_definition`, etc.) 이 실제 grammar 와 일치하는지 fixture parse 로 검증. +- **out-of-class method def 의 prefix 복원**: `void Foo::bar()` 의 declarator 가 `function_declarator > qualified_identifier > namespace_identifier "Foo" + identifier "bar"`. spec 의 `extract_fn_symbol` 이 이 chain 정확히 walk. +- **Operator overload**: tree-sitter-cpp 의 `operator_name` 또는 `field_identifier` "operator+" 형태. fixture 로 검증. +- **머지 후 deviation** 은 `tasks/HOTFIXES.md` dated 로그. diff --git a/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md b/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md index 72992fb..3e2c7a9 100644 --- a/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md +++ b/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md @@ -1553,6 +1553,8 @@ transitional 형태) 의 source of truth. **p10-3 활성화 (Tier 3 paragraph fallback) (2026-05-21)**: Tier 3 chunker `code-text-paragraph-v1` 활성화. shell script (`.sh`/`.bash`/`.zsh`) direct routing + Tier 1/2 가 0 chunk 또는 Err 시 자동 fallback 으로 retry.
비-k8s YAML / invalid YAML / AST 실패 케이스 모두 picked up. lang 은 입력 보존 (shell → "shell", yaml → "yaml" 등), symbol 은 항상 None. +**p10-1D 활성화 (C + C++) (2026-05-21)**: P10 Tier 1 chunker family 완료 — C (`code-c-ast-v1`, `.c`/`.h`) + C++ (`code-cpp-ast-v1`, `.cpp`/`.cc`/`.cxx`/`.hpp`/`.hh`/`.hxx`) AST chunker 활성화. C symbol = function name only (no nesting); C++ symbol = `namespace::Class::method` (recursive namespace + class nesting). `.h` 가 C++ syntax 만나면 tree-sitter-c parse 실패 → p10-3 Tier 3 fallback 으로 자동 picked up. + ### 10.2 MCP server transport (fb-30) `kebab mcp` 가 stdio JSON-RPC server. Rust SDK = `rmcp 1.6`. Tool surface diff --git a/tasks/INDEX.md b/tasks/INDEX.md index 7bf62ac..e02038e 100644 --- a/tasks/INDEX.md +++ b/tasks/INDEX.md @@ -144,7 +144,7 @@ P0~P5 는 직렬. P6~P9 는 P5 이후 병렬 가능. - [p10-1B Python + TS/JS AST chunkers](p10/p10-1b-py-ts-js-ast-chunkers.md) — 🟡 PR 오픈 (코드 완성, 머지 대기) - p10-1C-Go Go AST chunker — 🟡 PR 오픈 (v0.12.0, `code-go-ast-v1`) - p10-1C-JavaKotlin Java + Kotlin AST chunkers — 🟢 PR 오픈 (v0.13.0, `code-java-ast-v1` / `code-kotlin-ast-v1`) - - p10-1D C + C++ AST chunkers — ⏳ + - p10-1D C + C++ AST chunkers — ✅ 머지 (v0.16.0, `code-c-ast-v1` + `code-cpp-ast-v1`) - p10-2 Tier 2 resource-aware — ✅ 머지 (v0.14.0, `k8s-manifest-resource-v1` / `dockerfile-file-v1` / `manifest-file-v1`) - p10-3 Tier 3 paragraph + line-window fallback — ✅ 머지 (v0.15.0, `code-text-paragraph-v1`) diff --git a/tasks/p10/INDEX.md b/tasks/p10/INDEX.md index f2bb2c9..e2c762f 100644 --- a/tasks/p10/INDEX.md +++ b/tasks/p10/INDEX.md @@ -7,7 +7,7 @@ | 1B | Python + TS/JS AST chunkers | 🟡 PR 오픈 (코드 완성, 머지 대기) | | 1C-Go | Go AST chunker (`code-go-ast-v1`) | 🟡 PR 오픈 (v0.12.0) | | 1C-JavaKotlin | Java + Kotlin AST chunkers (`code-java-ast-v1` / `code-kotlin-ast-v1`) | 🟢 PR 오픈 (v0.13.0) | -| 1D | C + C++ AST chunkers | ⏳ | +| 1D | C + C++ AST chunkers | ✅ 머지 (v0.16.0) | | 2 | Tier 2 resource-aware (k8s / Dockerfile / manifest) | ✅ 머지 (v0.14.0) | | 3 | Tier 3 paragraph + line-window fallback 
| ✅ 머지 (v0.15.0) | diff --git a/tasks/p10/p10-1d-c-cpp-ast-chunker.md b/tasks/p10/p10-1d-c-cpp-ast-chunker.md new file mode 100644 index 0000000..e8b891d --- /dev/null +++ b/tasks/p10/p10-1d-c-cpp-ast-chunker.md @@ -0,0 +1,119 @@ +# p10-1D — C + C++ AST chunkers + +**Status:** 🟡 진행 중 +**Contract sections:** §3.3 (chunker_version `code-c-ast-v1` + `code-cpp-ast-v1`), §3.4 (symbol path — C `func_name`, C++ `namespace::Class::method`), §3.5 (code_lang `c` + `cpp`, ext `.c`/`.h` / `.cpp`/`.cc`/`.cxx`/`.hpp`/`.hh`/`.hxx`), §6.1 (`kebab-parse-code/src/{c,cpp}.rs`), §6.2 (`kebab-chunk/src/code_{c,cpp}_ast_v1.rs`), §9.1 (Tier 1 AST per-language + oversize fallback), §10 (activation log). +**Design:** [2026-05-15-kebab-code-ingest-design.md](../../docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md) §1D (C + C++ 부분). +**Plan:** [2026-05-21-p10-1d-c-cpp-ast-chunker.md](../../docs/superpowers/plans/2026-05-21-p10-1d-c-cpp-ast-chunker.md). + +## Goal + +p10-1A-2 / 1B / 1C / p10-2 / p10-3 인프라 위에 C + C++ AST chunker 2종을 단일 PR 로 활성화. P10 의 Tier 1 chunker family 마지막. 머지 시점부터 `.c` / `.h` / `.cpp` / `.cc` / `.cxx` / `.hpp` / `.hh` / `.hxx` 파일 dogfooding 가능. + +`.h` 가 design 명시대로 C 매핑 — C++ 프로젝트의 `.h` 는 tree-sitter-c 의 parse 가 namespace / template 같은 C++ syntax 에 실패할 가능성. 실패 시 p10-3 의 Tier 3 fallback 으로 자동 picked up (이미 wired). + +## 동결된 설계 결정 (이 task 로 확정) + +### C extractor (`code-c-ast-v1`) + +- **Symbol** = function name only. design §3.4 그대로 — no nesting, no namespace. 예: `parse_blocks`. 
+- **Top-level units**: + - `function_definition` (named) → 1 unit, symbol = function name + - `struct_specifier` (named, top-level) → 1 unit, symbol = struct name + - `enum_specifier` (named, top-level) → 1 unit, symbol = enum name + - `union_specifier` (named, top-level) → 1 unit, symbol = union name + - `declaration` (top-level — typedef / global var / fn prototype) → glue `` + - `preproc_include` / `preproc_def` / `preproc_function_def` / `preproc_ifdef` 등 preprocessor → glue `` +- **Static / extern / inline fn**: 일반 fn 과 동일 처리 (storage class qualifier 무시 — symbol 은 declarator 의 fn name 만). +- **Inner struct / enum 안의 nested declaration** (C 도 가능): 1B Python class-nesting 미적용 — C 의 inner type 은 흔치 않고 outer 가 typedef wrapper 인 패턴이라 top-level 만 emit. +- **Empty file 또는 unit 0개** → `` post-pass (1A-2 패턴). + +### C++ extractor (`code-cpp-ast-v1`) + +- **Symbol** = `namespace::Class::method` (design §3.4 그대로). namespace 가 없으면 `Class::method` 또는 `func_name`. 예: `kebab::chunk::MdHeadingV1Chunker::chunk_doc`. +- **Top-level units + recursion**: + - `namespace_definition` (named) → recurse with namespace name pushed (Python class-nesting + Java/Kotlin package-prefix hybrid). + - **Anonymous namespace** (`namespace { ... }`) → namespace name = `` push (Python `` 패턴 일관). + - `class_specifier` / `struct_specifier` (top-level or in namespace or nested in class, named) → recurse with class name pushed. + - `function_definition` (top-level or in namespace or in class) → 1 unit, symbol per nesting (`namespace::Class::method` / `namespace::func` / `Class::method` / `func_name`). + - `template_declaration` → 내부 declarator type 따라 recurse / emit (function template → method emit, class template → class recurse). template type params (``, ``) 는 symbol 미포함 (Go generic 처리와 동일). + - `enum_specifier` (named) → 1 unit, symbol per nesting. + - `concept_definition` (C++20) → 1 unit, symbol per nesting (treat as type-level definition). 
+ - `using_declaration` / `using_directive` / `preproc_include` / `preproc_def` 등 → glue ``. + - `extern "C"` 블록 안의 정의 → 일반 fn 처리 (block 자체는 glue). +- **Method out-of-class definition** (`Class::method` 형태로 namespace 밖에서 정의): tree-sitter-cpp 의 `function_declarator` 의 `qualified_identifier` 따라 prefix 복원 — declarator 의 `Class::method` 자체에서 추출. +- **Operator overload** (`operator+`, `operator()` 등): symbol = `Class::operator+` 그대로. +- **Constructor / destructor**: symbol = `Class::Class` / `Class::~Class` (convention). +- **Empty file 또는 unit 0개** → `` post-pass. + +### 공통 + +- **`` glue grouping**: preprocessor + global var + using 선언 등 의미 단위 외 → 1 glue chunk per file. +- **Oversize fallback**: 1A-2 의 `AST_CHUNK_MAX_LINES = 200` 동일. +- **`.h` 의 fallback 보장**: C parser 실패 시 p10-3 의 Tier 3 fallback wrapper (이미 wired) 가 picked up → `Citation::Code { symbol: None, lang: "c" }` + `code-text-paragraph-v1`. + +### Module layout + +``` +crates/kebab-parse-code/src/ +├── c.rs [신규] — C AST extractor (PARSER_VERSION `tree-sitter-c-<version>`) +├── cpp.rs [신규] — C++ AST extractor (PARSER_VERSION `tree-sitter-cpp-<version>`) +└── lib.rs [edit] — pub use + C_PARSER_VERSION / CPP_PARSER_VERSION 상수 노출 + +crates/kebab-chunk/src/ +├── code_c_ast_v1.rs [신규] — VERSION_LABEL `code-c-ast-v1`. 1A-2 패턴 (canonical Document → Vec<Chunk>). +├── code_cpp_ast_v1.rs [신규] — VERSION_LABEL `code-cpp-ast-v1`. 동일 패턴. +└── lib.rs [edit] — pub use 2개 + +crates/kebab-source-fs/src/media.rs [편집 불요] — code_lang_for_path 위임 패턴 그대로 (Task C of p10-2 이후 단일 source of truth). + +crates/kebab-parse-code/src/lang.rs [편집 불요] — `.c`/`.h`/`.cpp` 등 매핑은 1A-1 시점부터 이미 존재. + +crates/kebab-app/src/lib.rs [edit] — ingest_one_code_asset 의 allowlist + 4-arm match 에 "c" + "cpp" 추가. tier3 fallback list 에도 둘 추가. 
+ +crates/kebab-chunk/tests/ [신규] +├── fixtures/sample.c — C fixture (top-level fn + struct) +├── fixtures/sample.cpp — C++ fixture (namespace + class + method) +├── code_c_ast_snapshot.rs — C snapshot test +└── code_cpp_ast_snapshot.rs — C++ snapshot test + +crates/kebab-app/tests/code_ingest_smoke.rs [edit] — 2 신규 integration test (c + cpp). 16 + 2 = 18. + +Cargo.toml workspace.dependencies [edit] — tree-sitter-c + tree-sitter-cpp. +crates/kebab-parse-code/Cargo.toml [edit] — 위 2 dep 신규 entry. +``` + +## Acceptance criteria + +- `cargo test --workspace --no-fail-fast -j 1` PASS (memory-conscious `-j 1`). +- `cargo clippy --workspace --all-targets -- -D warnings` clean. +- C fixture (`tests/fixtures/sample.c`) + C++ fixture (`tests/fixtures/sample.cpp`) ingest → chunk snapshot 안정. C snapshot 의 chunks 가 모두 `Citation::Code { lang: "c", symbol: Some(), ... }`. C++ snapshot 의 chunks 가 namespace + class nesting 포함 (`kebab::chunk::Foo::bar`). +- 격리 TempDir KB 에 `.c` / `.cpp` 파일 두고 `kebab search --code-lang c --json` / `--code-lang cpp --json` 가 각각 `Citation::Code` 반환. integration test `tier1_c_ingest_searchable` + `tier1_cpp_ingest_searchable` (기존 16 + 2 = 18). +- `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"c"` + `"cpp"` 카운트 등장 (.c/.cpp 파일 ingest 후). +- README + HANDOFF + docs/ARCHITECTURE + docs/SMOKE + tasks/INDEX + tasks/p10/INDEX 갱신. +- frozen design 2026-04-27 §10 activation log 한 줄. +- workspace `Cargo.toml` minor bump (0.15.0 → 0.16.0), gitea-release v0.16.0. + +## Allowed dependencies + +- `kebab-parse-code` 에 `tree-sitter-c` + `tree-sitter-cpp` workspace deps 추가. 기존 deps 유지. +- `kebab-chunk` 의 새 모듈 2개 (`code_c_ast_v1.rs`, `code_cpp_ast_v1.rs`) — language-agnostic body, tree-sitter import 금지. 기존 `tier2_shared::build_chunk` (pub(crate)) 재사용. +- `kebab-app`, `kebab-source-fs` — 새 crate dep 없음. + +## Forbidden dependencies + +- `kebab-chunk` 가 tree-sitter-c / tree-sitter-cpp 직접 import 금지 (boundary §6.3). 
+- `kebab-parse-code` 가 store / embed / llm / rag 직접 import 금지. +- UI crate (`kebab-cli` / `kebab-mcp` / `kebab-tui`) 가 `kebab-parse-code` / `kebab-chunk` 직접 import 금지 — `kebab-app` facade 만. + +## Risks / notes + +- **tree-sitter-c / tree-sitter-cpp 호환성**: tree-sitter 0.26 (현재 workspace) 과 호환 필요. resolve 시 `tree-sitter-language` shim 사용 fork (1C-JK 의 tree-sitter-kotlin-ng 패턴) 가능성 — crates.io 의 가장 활발한 maintainer 우선. 실패 시 별도 fork 검토. +- **`.h` parse 실패**: C++ 헤더 (`namespace`, `template`, `class`) 를 C parser 가 만나면 partial parse + error nodes. 1A-2 의 extractor 패턴이 error node 무시 + recoverable parse 진행 — emit 결과가 *불완전* 할 가능성. 그럴 때 chunks 가 0 으로 떨어지면 p10-3 Tier 3 fallback 으로 자동 picked up (이미 wired). 부분 emit 시 일부만 색인 — Tier 3 fallback 안 함. dogfood 후 HOTFIXES 검토. +- **Method out-of-class definition** (`Class::method` 형식): tree-sitter-cpp 의 `function_definition` 의 declarator 가 `qualified_identifier` 일 때 prefix 복원. fixture 로 검증. +- **Template specialization** (`template<> class Foo<int>`): tree-sitter-cpp 의 `template_declaration` 안의 `class_specifier` name 만 추출 — `Foo` 만 symbol 에 들어가고 `<int>` 미포함. design 의 generic 무시 룰 일관. +- **`extern "C"` block 안의 fn**: 일반 fn 처리. 외부 wrapping block 은 glue. +- **Anonymous union / struct** (`struct { int x; }` 변수 안에): 흔치 않음 + named 만 unit. anonymous 는 glue. +- **Macro-heavy code** (Linux kernel 등): `#define FOO(x) ...` 매크로가 function-like 라도 parser 가 fn 으로 인식 안 함. preprocessor glue 로 처리 — symbol 안 잡힘. 의도된 동작 (parser 의 macro expansion 안 함). +- **`__attribute__((...))`** annotations: tree-sitter-c 의 attribute 노드는 declarator 옆 sibling. 무시 가능. function name 추출에 영향 없음. +- **fixture 크기**: sample.c 는 ~30 line (top-level fn + struct + enum + preprocessor), sample.cpp 는 ~50 line (nested namespace + class + method + template + free fn). oversize fallback 의 별도 검증은 1A-2 의 long_section_snapshot 패턴이 이미 cover (필요 시 별도 fixture). +- **머지 후 deviation** 은 `tasks/HOTFIXES.md` dated 로그 + 본 spec `Risks / notes` cross-link.