From 8add684ffcf54506f819c420cb37d85c642b2127 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 21 May 2026 13:12:11 +0000 Subject: [PATCH 01/13] docs(p10-1d): task spec for C + C++ AST chunkers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Frozen contract: single PR with code-c-ast-v1 + code-cpp-ast-v1. C symbol = function name only (no nesting). C++ symbol = namespace::Class::method (recursion). .h β†’ C (design Β§3.5); C++ headers' parse failure picked up by p10-3 Tier 3 fallback. tree-sitter-c + tree-sitter-cpp workspace deps, version bump 0.15.0 β†’ 0.16.0. Co-Authored-By: Claude Opus 4.7 (1M context) --- tasks/p10/p10-1d-c-cpp-ast-chunker.md | 119 ++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 tasks/p10/p10-1d-c-cpp-ast-chunker.md diff --git a/tasks/p10/p10-1d-c-cpp-ast-chunker.md b/tasks/p10/p10-1d-c-cpp-ast-chunker.md new file mode 100644 index 0000000..e8b891d --- /dev/null +++ b/tasks/p10/p10-1d-c-cpp-ast-chunker.md @@ -0,0 +1,119 @@ +# p10-1D β€” C + C++ AST chunkers + +**Status:** 🟑 μ§„ν–‰ 쀑 +**Contract sections:** Β§3.3 (chunker_version `code-c-ast-v1` + `code-cpp-ast-v1`), Β§3.4 (symbol path β€” C `func_name`, C++ `namespace::Class::method`), Β§3.5 (code_lang `c` + `cpp`, ext `.c`/`.h` / `.cpp`/`.cc`/`.cxx`/`.hpp`/`.hh`/`.hxx`), Β§6.1 (`kebab-parse-code/src/{c,cpp}.rs`), Β§6.2 (`kebab-chunk/src/code_{c,cpp}_ast_v1.rs`), Β§9.1 (Tier 1 AST per-language + oversize fallback), Β§10 (activation log). +**Design:** [2026-05-15-kebab-code-ingest-design.md](../../docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md) Β§1D (C + C++ λΆ€λΆ„). +**Plan:** [2026-05-21-p10-1d-c-cpp-ast-chunker.md](../../docs/superpowers/plans/2026-05-21-p10-1d-c-cpp-ast-chunker.md). + +## Goal + +p10-1A-2 / 1B / 1C / p10-2 / p10-3 인프라 μœ„μ— C + C++ AST chunker 2쒅을 단일 PR 둜 ν™œμ„±ν™”. P10 의 Tier 1 chunker family λ§ˆμ§€λ§‰. λ¨Έμ§€ μ‹œμ λΆ€ν„° `.c` / `.h` / `.cpp` / `.cc` / `.cxx` / `.hpp` / `.hh` / `.hxx` 파일 dogfooding κ°€λŠ₯. + +`.h` κ°€ design λͺ…μ‹œλŒ€λ‘œ C λ§€ν•‘ β€” C++ ν”„λ‘œμ νŠΈμ˜ `.h` λŠ” tree-sitter-c 의 parse κ°€ namespace / template 같은 C++ syntax 에 μ‹€νŒ¨ν•  κ°€λŠ₯μ„±. μ‹€νŒ¨ μ‹œ p10-3 의 Tier 3 fallback 으둜 μžλ™ picked up (이미 wired). + +## λ™κ²°λœ 섀계 κ²°μ • (이 task 둜 ν™•μ •) + +### C extractor (`code-c-ast-v1`) + +- **Symbol** = function name only. design Β§3.4 κ·ΈλŒ€λ‘œ β€” no nesting, no namespace. 예: `parse_blocks`. +- **Top-level units**: + - `function_definition` (named) β†’ 1 unit, symbol = function name + - `struct_specifier` (named, top-level) β†’ 1 unit, symbol = struct name + - `enum_specifier` (named, top-level) β†’ 1 unit, symbol = enum name + - `union_specifier` (named, top-level) β†’ 1 unit, symbol = union name + - `declaration` (top-level β€” typedef / global var / fn prototype) β†’ glue `` + - `preproc_include` / `preproc_def` / `preproc_function_def` / `preproc_ifdef` λ“± preprocessor β†’ glue `` +- **Static / extern / inline fn**: 일반 fn κ³Ό 동일 처리 (storage class qualifier λ¬΄μ‹œ β€” symbol 은 declarator 의 fn name 만). +- **Inner struct / enum μ•ˆμ˜ nested declaration** (C 도 κ°€λŠ₯): 1B Python class-nesting 미적용 β€” C 의 inner type 은 ν”μΉ˜ μ•Šκ³  outer κ°€ typedef wrapper 인 νŒ¨ν„΄μ΄λΌ top-level 만 emit. +- **Empty file λ˜λŠ” unit 0개** β†’ `` post-pass (1A-2 νŒ¨ν„΄). + +### C++ extractor (`code-cpp-ast-v1`) + +- **Symbol** = `namespace::Class::method` (design Β§3.4 κ·ΈλŒ€λ‘œ). namespace κ°€ μ—†μœΌλ©΄ `Class::method` λ˜λŠ” `func_name`. 예: `kebab::chunk::MdHeadingV1Chunker::chunk_doc`. +- **Top-level units + recursion**: + - `namespace_definition` (named) β†’ recurse with namespace name pushed (Python class-nesting + Java/Kotlin package-prefix hybrid). + - **Anonymous namespace** (`namespace { ... }`) β†’ namespace name = `` push (Python `` νŒ¨ν„΄ 일관). + - `class_specifier` / `struct_specifier` (top-level or in namespace or nested in class, named) β†’ recurse with class name pushed. + - `function_definition` (top-level or in namespace or in class) β†’ 1 unit, symbol per nesting (`namespace::Class::method` / `namespace::func` / `Class::method` / `func_name`). + - `template_declaration` β†’ λ‚΄λΆ€ declarator type 따라 recurse / emit (function template β†’ method emit, class template β†’ class recurse). template type params (``, ``) λŠ” symbol 미포함 (Go generic μ²˜λ¦¬μ™€ 동일). + - `enum_specifier` (named) β†’ 1 unit, symbol per nesting. + - `concept_definition` (C++20) β†’ 1 unit, symbol per nesting (treat as type-level definition). + - `using_declaration` / `using_directive` / `preproc_include` / `preproc_def` λ“± β†’ glue ``. + - `extern "C"` 블둝 μ•ˆμ˜ μ •μ˜ β†’ 일반 fn 처리 (block μžμ²΄λŠ” glue). +- **Method out-of-class definition** (`Class::method` ν˜•νƒœλ‘œ namespace λ°–μ—μ„œ μ •μ˜): tree-sitter-cpp 의 `function_declarator` 의 `qualified_identifier` 따라 prefix 볡원 β€” declarator 의 `Class::method` μžμ²΄μ—μ„œ μΆ”μΆœ. +- **Operator overload** (`operator+`, `operator()` λ“±): symbol = `Class::operator+` κ·ΈλŒ€λ‘œ. +- **Constructor / destructor**: symbol = `Class::Class` / `Class::~Class` (convention). +- **Empty file λ˜λŠ” unit 0개** β†’ `` post-pass. + +### 곡톡 + +- **`` glue grouping**: preprocessor + global var + using μ„ μ–Έ λ“± 의미 λ‹¨μœ„ μ™Έ β†’ 1 glue chunk per file. +- **Oversize fallback**: 1A-2 의 `AST_CHUNK_MAX_LINES = 200` 동일. +- **`.h` 의 fallback 보μž₯**: C parser μ‹€νŒ¨ μ‹œ p10-3 의 Tier 3 fallback wrapper (이미 wired) κ°€ picked up β†’ `Citation::Code { symbol: None, lang: "c" }` + `code-text-paragraph-v1`. + +### Module layout + +``` +crates/kebab-parse-code/src/ +β”œβ”€β”€ c.rs [μ‹ κ·œ] β€” C AST extractor (PARSER_VERSION `tree-sitter-c-`) +β”œβ”€β”€ cpp.rs [μ‹ κ·œ] β€” C++ AST extractor (PARSER_VERSION `tree-sitter-cpp-`) +└── lib.rs [edit] β€” pub use + C_PARSER_VERSION / CPP_PARSER_VERSION μƒμˆ˜ λ…ΈμΆœ + +crates/kebab-chunk/src/ +β”œβ”€β”€ code_c_ast_v1.rs [μ‹ κ·œ] β€” VERSION_LABEL `code-c-ast-v1`. 1A-2 νŒ¨ν„΄ (canonical Document β†’ Vec). +β”œβ”€β”€ code_cpp_ast_v1.rs [μ‹ κ·œ] β€” VERSION_LABEL `code-cpp-ast-v1`. 동일 νŒ¨ν„΄. +└── lib.rs [edit] β€” pub use 2개 + +crates/kebab-source-fs/src/media.rs [νŽΈμ§‘ λΆˆμš”] β€” code_lang_for_path μœ„μž„ νŒ¨ν„΄ κ·ΈλŒ€λ‘œ (Task C of p10-2 이후 단일 source of truth). + +crates/kebab-parse-code/src/lang.rs [νŽΈμ§‘ λΆˆμš”] β€” `.c`/`.h`/`.cpp` λ“± 맀핑은 1A-1 μ‹œμ λΆ€ν„° 이미 쑴재. + +crates/kebab-app/src/lib.rs [edit] β€” ingest_one_code_asset 의 allowlist + 4-arm match 에 "c" + "cpp" μΆ”κ°€. tier3 fallback list 에도 λ‘˜ μΆ”κ°€. + +crates/kebab-chunk/tests/ [μ‹ κ·œ] +β”œβ”€β”€ fixtures/sample.c β€” C fixture (top-level fn + struct) +β”œβ”€β”€ fixtures/sample.cpp β€” C++ fixture (namespace + class + method) +β”œβ”€β”€ code_c_ast_snapshot.rs β€” C snapshot test +└── code_cpp_ast_snapshot.rs β€” C++ snapshot test + +crates/kebab-app/tests/code_ingest_smoke.rs [edit] β€” 2 μ‹ κ·œ integration test (c + cpp). 16 + 2 = 18. + +Cargo.toml workspace.dependencies [edit] β€” tree-sitter-c + tree-sitter-cpp. +crates/kebab-parse-code/Cargo.toml [edit] β€” μœ„ 2 dep μ‹ κ·œ entry. +``` + +## Acceptance criteria + +- `cargo test --workspace --no-fail-fast -j 1` PASS (memory-conscious `-j 1`). +- `cargo clippy --workspace --all-targets -- -D warnings` clean. +- C fixture (`tests/fixtures/sample.c`) + C++ fixture (`tests/fixtures/sample.cpp`) ingest β†’ chunk snapshot μ•ˆμ •. C snapshot 의 chunks κ°€ λͺ¨λ‘ `Citation::Code { lang: "c", symbol: Some(), ... }`. C++ snapshot 의 chunks κ°€ namespace + class nesting 포함 (`kebab::chunk::Foo::bar`). +- 격리 TempDir KB 에 `.c` / `.cpp` 파일 두고 `kebab search --code-lang c --json` / `--code-lang cpp --json` κ°€ 각각 `Citation::Code` λ°˜ν™˜. integration test `tier1_c_ingest_searchable` + `tier1_cpp_ingest_searchable` (κΈ°μ‘΄ 16 + 2 = 18). +- `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"c"` + `"cpp"` 카운트 λ“±μž₯ (.c/.cpp 파일 ingest ν›„). +- README + HANDOFF + docs/ARCHITECTURE + docs/SMOKE + tasks/INDEX + tasks/p10/INDEX κ°±μ‹ . +- frozen design 2026-04-27 Β§10 activation log ν•œ 쀄. +- workspace `Cargo.toml` minor bump (0.15.0 β†’ 0.16.0), gitea-release v0.16.0. + +## Allowed dependencies + +- `kebab-parse-code` 에 `tree-sitter-c` + `tree-sitter-cpp` workspace deps μΆ”κ°€. κΈ°μ‘΄ deps μœ μ§€. +- `kebab-chunk` 의 μƒˆ λͺ¨λ“ˆ 2개 (`code_c_ast_v1.rs`, `code_cpp_ast_v1.rs`) β€” language-agnostic body, tree-sitter import κΈˆμ§€. κΈ°μ‘΄ `tier2_shared::build_chunk` (pub(crate)) μž¬μ‚¬μš©. +- `kebab-app`, `kebab-source-fs` β€” μƒˆ crate dep μ—†μŒ. + +## Forbidden dependencies + +- `kebab-chunk` κ°€ tree-sitter-c / tree-sitter-cpp 직접 import κΈˆμ§€ (boundary Β§6.3). +- `kebab-parse-code` κ°€ store / embed / llm / rag 직접 import κΈˆμ§€. +- UI crate (`kebab-cli` / `kebab-mcp` / `kebab-tui`) κ°€ `kebab-parse-code` / `kebab-chunk` 직접 import κΈˆμ§€ β€” `kebab-app` facade 만. + +## Risks / notes + +- **tree-sitter-c / tree-sitter-cpp ν˜Έν™˜μ„±**: tree-sitter 0.26 (ν˜„μž¬ workspace) κ³Ό ν˜Έν™˜ ν•„μš”. resolve μ‹œ `tree-sitter-language` shim μ‚¬μš© fork (1C-JK 의 tree-sitter-kotlin-ng νŒ¨ν„΄) κ°€λŠ₯μ„± β€” crate.io 의 κ°€μž₯ ν™œλ°œν•œ maintainer μš°μ„ . μ‹€νŒ¨ μ‹œ 별도 fork κ²€ν† . +- **`.h` parse μ‹€νŒ¨**: C++ 헀더 (`namespace`, `template`, `class`) λ₯Ό C parser κ°€ λ§Œλ‚˜λ©΄ partial parse + error nodes. 1A-2 의 extractor νŒ¨ν„΄μ΄ error node λ¬΄μ‹œ + recoverable parse μ§„ν–‰ β€” emit κ²°κ³Όκ°€ *λΆˆμ™„μ „* ν•  κ°€λŠ₯μ„±. 그럴 λ•Œ chunks κ°€ 0 으둜 λ–¨μ–΄μ§€λ©΄ p10-3 Tier 3 fallback 으둜 μžλ™ picked up (이미 wired). λΆ€λΆ„ emit μ‹œ μΌλΆ€λ§Œ 색인 β€” Tier 3 fallback μ•ˆ 함. dogfood ν›„ HOTFIXES κ²€ν† . +- **Method out-of-class definition** (`Class::method` ν˜•μ‹): tree-sitter-cpp 의 `function_definition` 의 declarator κ°€ `qualified_identifier` 일 λ•Œ prefix 볡원. fixture 둜 검증. +- **Template specialization** (`template<> class Foo`): tree-sitter-cpp 의 `template_declaration` μ•ˆμ˜ `class_specifier` name 만 μΆ”μΆœ β€” `Foo` 만 symbol 에 λ“€μ–΄κ°€κ³  `` 미포함. design 의 generic λ¬΄μ‹œ λ£° 일관. +- **`extern "C"` block μ•ˆμ˜ fn**: 일반 fn 처리. μ™ΈλΆ€ wrapping block 은 glue. +- **Anonymous union / struct** (`struct { int x; }` λ³€μˆ˜ μ•ˆμ—): ν”μΉ˜ μ•ŠμŒ + named 만 unit. anonymous λŠ” glue. +- **Macro-heavy code** (Linux kernel λ“±): `#define FOO(x) ...` λ§€ν¬λ‘œκ°€ function-like 라도 parser κ°€ fn 으둜 인식 μ•ˆ 함. preprocessor glue 둜 처리 β€” symbol μ•ˆ 작힘. μ˜λ„λœ λ™μž‘ (parser 의 macro expansion μ•ˆ 함). +- **`__attribute__((...))`** annotations: tree-sitter-c 의 attribute λ…Έλ“œλŠ” declarator μ˜† sibling. λ¬΄μ‹œ κ°€λŠ₯. function name μΆ”μΆœμ— 영ν–₯ μ—†μŒ. +- **fixture 크기**: sample.c λŠ” ~30 line (top-level fn + struct + enum + preprocessor), sample.cpp λŠ” ~50 line (nested namespace + class + method + template + free fn). oversize fallback 의 별도 검증은 1A-2 의 long_section_snapshot νŒ¨ν„΄μ΄ 이미 cover (ν•„μš” μ‹œ 별도 fixture). +- **λ¨Έμ§€ ν›„ deviation** 은 `tasks/HOTFIXES.md` dated 둜그 + λ³Έ spec `Risks / notes` cross-link. From a58d400abdfb94ddd81c3acd376019ceae0a80ef Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 21 May 2026 13:15:22 +0000 Subject: [PATCH 02/13] docs(p10-1d): implementation plan (11 tasks A-K, subagent-driven) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tasks: workspace deps / C extractor / C++ extractor / C chunker + snapshot / C++ chunker + snapshot / ingest dispatch + tier3_fallback_cv extension / 2 smoke tests / frozen design Β§10 / docs sync / workspace test gate / version bump 0.15.0 β†’ 0.16.0 + gitea PR. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-21-p10-1d-c-cpp-ast-chunker.md | 930 ++++++++++++++++++ 1 file changed, 930 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-21-p10-1d-c-cpp-ast-chunker.md diff --git a/docs/superpowers/plans/2026-05-21-p10-1d-c-cpp-ast-chunker.md b/docs/superpowers/plans/2026-05-21-p10-1d-c-cpp-ast-chunker.md new file mode 100644 index 0000000..89c74e7 --- /dev/null +++ b/docs/superpowers/plans/2026-05-21-p10-1d-c-cpp-ast-chunker.md @@ -0,0 +1,930 @@ +# p10-1D C + C++ AST Chunkers Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Activate C + C++ code ingest end-to-end. P10 Tier 1 chunker family final entry. + +**Architecture:** Same shape as 1B (multi-language single PR) and 1C-JK (JVM family). 2 new tree-sitter grammars + 2 extractors + 2 chunkers + media routing (delegated via `code_lang_for_path`, no change) + app dispatch arms. C symbol = function name only; C++ symbol = `namespace::Class::method` via recursive class/namespace nesting (Java/Kotlin + Python hybrid). + +**Tech Stack:** Rust 2024 workspace, `tree-sitter` 0.26 (already), `tree-sitter-c` + `tree-sitter-cpp` (NEW). 1A-2/1B/1C/p10-2/p10-3 infrastructure unchanged. + +**Memory note:** Host has been OOM'd previously (μž¬λΆ€νŒ… 사둀). Per-crate cargo only. ONE full-suite + clippy invocation in Task J. NO `cargo test --workspace` outside that gate. + +--- + +## Pre-flight + +Branch `feat/p10-1d-c-cpp` already exists (spec commit `8add684`). + +- [ ] **Disk hygiene**: `df -h /` 점검. 80% λ„˜μœΌλ©΄ `cargo clean`. + +Reference files: +- 1C-JK extractor: `crates/kebab-parse-code/src/{java,kotlin}.rs` β€” closest template for source-side identifier prefix (package vs namespace). +- 1B Python extractor: `crates/kebab-parse-code/src/python.rs` β€” class-nesting recursion model (relevant for C++ class nesting). +- 1A-2 chunker: `crates/kebab-chunk/src/code_rust_ast_v1.rs` β€” duplicate-with-substitution pattern. +- 1B/1C/p10-2/p10-3 dispatch generalization: `crates/kebab-app/src/lib.rs::ingest_one_code_asset` (~L1796–2116). Current allowlist + 4-arm match. +- spec: `tasks/p10/p10-1d-c-cpp-ast-chunker.md`. + +--- + +## Task A: Workspace deps (tree-sitter-c + tree-sitter-cpp) + +**Files:** +- Modify: `Cargo.toml` (`[workspace.dependencies]`, after `tree-sitter-kotlin-ng`) +- Modify: `crates/kebab-parse-code/Cargo.toml` + +- [ ] **Step 1**: `cargo add tree-sitter-c tree-sitter-cpp -p kebab-parse-code`. If either crate's actively-maintained name differs (e.g. `tree-sitter-cpp` vs `tree-sitter-cpp-ng`), verify on crates.io. The `tree-sitter-c` 0.24 / `tree-sitter-cpp` 0.23 line is the most common; verify compatibility with workspace `tree-sitter = "0.26"` (likely already supported via the `tree-sitter-language` shim). + +- [ ] **Step 2**: Lift the two resolved versions into `[workspace.dependencies]` (after `tree-sitter-kotlin-ng`): + +```toml +# C/C++ family grammars for code ingest (kebab-parse-code, p10-1D). +tree-sitter-c = "" +tree-sitter-cpp = "" +``` + +Switch crate's `Cargo.toml` entries to `{ workspace = true }`. + +- [ ] **Step 3**: `cargo build -p kebab-parse-code` β†’ clean. Unused dep warning is fine. + +- [ ] **Step 4**: Commit: + +```bash +git add Cargo.toml Cargo.lock crates/kebab-parse-code/Cargo.toml +git commit -m "$(cat <<'EOF' +build(p10-1d): add tree-sitter-c + tree-sitter-cpp workspace deps + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +If a crate's resolved name has a non-obvious fork suffix (e.g. `tree-sitter-cpp-ng`), document it in the commit body. + +--- + +## Task B: C AST extractor (`kebab-parse-code/src/c.rs`) + +**Files:** +- Create: `crates/kebab-parse-code/src/c.rs` +- Modify: `crates/kebab-parse-code/src/lib.rs` (pub mod + `C_PARSER_VERSION` const) + +- [ ] **Step 1**: Create `crates/kebab-parse-code/src/c.rs`. Mirror `crates/kebab-parse-code/src/go.rs` (closest template β€” single-language, no namespace/package nesting, top-level units). Replace tree-sitter-go with tree-sitter-c: + +```rust +//! p10-1D: C AST extractor. + +use crate::traits::{Extractor, ExtractContext}; +use anyhow::{Context, Result}; +use kebab_core::{Block, BlockId, CanonicalDocument, CodeBlock, CommonBlock, /*..*/, SourceSpan, id_for_block, id_for_doc}; +use tree_sitter::Parser; + +pub const C_PARSER_VERSION: &str = concat!("tree-sitter-c-", env!("CARGO_PKG_VERSION")); +// Or use the tree-sitter-c crate version: better to hardcode for stability. +// Look at how go.rs / rust.rs / etc. set their PARSER_VERSION. + +pub struct CAstExtractor { + parser: Parser, +} + +impl CAstExtractor { + pub fn new() -> Self { + let mut parser = Parser::new(); + parser.set_language(&tree_sitter_c::LANGUAGE.into()).expect("load tree-sitter-c"); + Self { parser } + } +} + +impl Extractor for CAstExtractor { + fn extract(&mut self, ctx: &ExtractContext, bytes: &[u8]) -> Result { + // ... mirror go.rs: + // 1. parse the tree + // 2. iterate source_file's named_children + // 3. for each top-level node: + // - function_definition β†’ emit unit (symbol = fn name) + // - struct_specifier (named) β†’ emit unit (symbol = struct name) + // - enum_specifier (named) β†’ emit unit (symbol = enum name) + // - union_specifier (named) β†’ emit unit (symbol = union name) + // - declaration β†’ glue + // - preproc_include / preproc_def / preproc_function_def / preproc_ifdef β†’ glue + // - else β†’ glue + // 4. glue chunk if any glue accumulated + // 5. post-pass if 0 units + // ... + todo!("mirror go.rs structure with C-specific node-kind names") + } +} +``` + +**ACTION**: Read `crates/kebab-parse-code/src/go.rs` in full first. It's the closest template β€” single-language, no namespace prefix to thread through (C is even simpler than Go since there's no `package`). Port the structure: parse β†’ iterate top-level β†’ match on node-kind β†’ emit units or accumulate glue. + +Node-kind name reference (tree-sitter-c): `function_definition`, `struct_specifier`, `enum_specifier`, `union_specifier`, `declaration`, `preproc_*`. Confirm by checking the crate's `node-types.json` if uncertain. + +**Function name extraction**: `function_definition` has a `declarator` field. The innermost `identifier` of that declarator is the function name. Mirror how go.rs extracts function names β€” it uses tree-sitter field traversal. + +- [ ] **Step 2**: Register the module in `crates/kebab-parse-code/src/lib.rs`: + +```rust +pub mod c; +pub use c::{CAstExtractor, C_PARSER_VERSION}; +``` + +- [ ] **Step 3**: Build: + +```bash +cargo build -p kebab-parse-code 2>&1 | tail -5 +``` + +Expected: clean. + +- [ ] **Step 4**: Commit (no test yet β€” Task D adds the snapshot test): + +```bash +git add crates/kebab-parse-code/src/c.rs crates/kebab-parse-code/src/lib.rs +git commit -m "$(cat <<'EOF' +feat(p10-1d): C AST extractor (tree-sitter-c) + +Top-level units: function_definition (symbol = fn name), struct_specifier, +enum_specifier, union_specifier (each emits 1 unit with the symbol being +the named identifier). Preprocessor directives + top-level declarations +group into a glue chunk. Empty file or zero units β†’ +post-pass. + +C symbol = function name only β€” no namespace, no class nesting (design Β§3.4). + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task C: C++ AST extractor (`kebab-parse-code/src/cpp.rs`) + +**Files:** +- Create: `crates/kebab-parse-code/src/cpp.rs` +- Modify: `crates/kebab-parse-code/src/lib.rs` + +- [ ] **Step 1**: Create `crates/kebab-parse-code/src/cpp.rs`. The closest template is `crates/kebab-parse-code/src/java.rs` (1C-JK) β€” it handles package prefix + class nesting via recursion. C++ adds namespace nesting (multiple levels possible). + +Pseudocode: + +```rust +//! p10-1D: C++ AST extractor. + +use crate::traits::{Extractor, ExtractContext}; +use anyhow::{Context, Result}; +use kebab_core::{/* ... */}; +use tree_sitter::{Node, Parser}; + +pub const CPP_PARSER_VERSION: &str = "tree-sitter-cpp-"; + +pub struct CppAstExtractor { parser: Parser } + +impl CppAstExtractor { + pub fn new() -> Self { + let mut parser = Parser::new(); + parser.set_language(&tree_sitter_cpp::LANGUAGE.into()).expect("load tree-sitter-cpp"); + Self { parser } + } + + fn visit(&self, node: Node, source: &[u8], prefix: &[&str], units: &mut Vec<(String, Node)>, glue: &mut Vec) { + // prefix is the namespace/class chain so far (e.g. ["kebab", "chunk", "MdHeadingV1Chunker"]). + for child in node.named_children(&mut node.walk()) { + match child.kind() { + "namespace_definition" => { + let name = child.child_by_field_name("name") + .and_then(|n| n.utf8_text(source).ok()) + .unwrap_or(""); + let mut new_prefix = prefix.to_vec(); + new_prefix.push(name); + let body = child.child_by_field_name("body").unwrap_or(child); + self.visit(body, source, &new_prefix, units, glue); + } + "class_specifier" | "struct_specifier" if child.child_by_field_name("name").is_some() => { + let name = child.child_by_field_name("name") + .and_then(|n| n.utf8_text(source).ok()) + .unwrap_or(""); + // Emit the class itself as a unit. + let symbol = build_symbol(prefix, &[], name); // e.g. "kebab::chunk::Foo" + units.push((symbol, child)); + // Recurse for nested classes / methods. + let mut new_prefix = prefix.to_vec(); + new_prefix.push(name); + let body = child.child_by_field_name("body").unwrap_or(child); + self.visit(body, source, &new_prefix, units, glue); + } + "function_definition" => { + // declarator may be qualified_identifier (out-of-class def) or plain identifier. + let symbol = extract_fn_symbol(child, source, prefix); + units.push((symbol, child)); + // Do NOT recurse into function body β€” inner classes/lambdas left to a future revision. + } + "template_declaration" => { + // Recurse: unwrap to inner declarator (function_definition or class_specifier) + // and treat it as if it were directly there. Template params NOT in symbol. + self.visit(child, source, prefix, units, glue); + } + "enum_specifier" if child.child_by_field_name("name").is_some() => { + let name = child.child_by_field_name("name").and_then(|n| n.utf8_text(source).ok()).unwrap_or(""); + let symbol = build_symbol(prefix, &[], name); + units.push((symbol, child)); + } + "concept_definition" => { + let name = /* extract */; + let symbol = build_symbol(prefix, &[], &name); + units.push((symbol, child)); + } + _ => glue.push(child), + } + } + } +} + +fn build_symbol(prefix: &[&str], extras: &[&str], leaf: &str) -> String { + // Join with :: + let mut parts: Vec<&str> = prefix.iter().copied().collect(); + parts.extend_from_slice(extras); + parts.push(leaf); + parts.join("::") +} + +fn extract_fn_symbol(node: Node, source: &[u8], prefix: &[&str]) -> String { + // function_definition.declarator may be a function_declarator wrapping a + // qualified_identifier (out-of-class def like `void Foo::bar(){}`) or a + // plain identifier (free fn or in-namespace fn). + // Need to walk down to the leaf identifier and any qualifier chain. + // For qualified_identifier "Foo::bar::baz", break into ["Foo", "bar"] qualifier + "baz" leaf. + // ... + todo!("walk declarator β†’ qualified_identifier β†’ assemble symbol with prefix") +} + +// Extractor impl: parse, visit(root, ...), emit chunks-of-blocks per (symbol, node) pair + glue + fallback. +``` + +This is the most intricate extractor in p10-1D. **Action**: read `crates/kebab-parse-code/src/java.rs` for the recursion pattern, then `crates/kebab-parse-code/src/python.rs` for the class-nesting pattern, and combine. tree-sitter-cpp's node-types.json (or a quick `tree-sitter parse` against a sample file) confirms exact node-kind names. + +- [ ] **Step 2**: Register in `crates/kebab-parse-code/src/lib.rs`: + +```rust +pub mod cpp; +pub use cpp::{CppAstExtractor, CPP_PARSER_VERSION}; +``` + +- [ ] **Step 3**: Build: + +```bash +cargo build -p kebab-parse-code 2>&1 | tail -5 +``` + +Expected: clean. + +- [ ] **Step 4**: Commit: + +```bash +git add crates/kebab-parse-code/src/cpp.rs crates/kebab-parse-code/src/lib.rs +git commit -m "$(cat <<'EOF' +feat(p10-1d): C++ AST extractor (tree-sitter-cpp) + +Symbol = namespace::Class::method via recursive visit. namespace_definition +pushes namespace name (anonymous β†’ ). class_specifier / struct_specifier +(named) emit class unit + recurse with class name pushed. function_definition +emits method unit (symbol may include qualified_identifier prefix for +out-of-class definitions). template_declaration unwraps to inner declarator +(template params NOT in symbol). enum_specifier + concept_definition emit +type-level units. extern "C" block content + using/include/define β†’ glue. + +Constructor / destructor symbols use Class::Class / Class::~Class +convention. Operator overloads keep operator+ form. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task D: C chunker + snapshot test + +**Files:** +- Create: `crates/kebab-chunk/src/code_c_ast_v1.rs` +- Create: `crates/kebab-chunk/tests/fixtures/sample.c` +- Create: `crates/kebab-chunk/tests/code_c_ast_snapshot.rs` +- Modify: `crates/kebab-chunk/src/lib.rs` + +- [ ] **Step 1**: Create `crates/kebab-chunk/src/code_c_ast_v1.rs`. **Mirror `crates/kebab-chunk/src/code_go_ast_v1.rs`** (closest 1-extractor pattern, no nesting): + +```rust +//! p10-1D: C AST chunker. + +use crate::tier2_shared::build_chunk; +use crate::{Chunker, ChunkPolicy}; +use anyhow::Result; +use kebab_core::{Block, Chunk, Document}; + +pub const VERSION_LABEL: &str = "code-c-ast-v1"; + +pub struct CodeCAstV1Chunker; + +impl Chunker for CodeCAstV1Chunker { + fn chunker_version(&self) -> &'static str { VERSION_LABEL } + fn policy_hash(&self, policy: &ChunkPolicy) -> String { + crate::tier2_shared::policy_hash(policy) + } + fn chunk(&self, doc: &Document, policy: &ChunkPolicy) -> Result> { + // Mirror code_go_ast_v1.rs's body β€” iterate doc.blocks, each Block::Code + // contributes 1 chunk via build_chunk. Apply oversize fallback per block + // via tier2_shared::push_chunks_with_oversize. + // ... + todo!("mirror code_go_ast_v1.rs verbatim, substituting VERSION_LABEL") + } +} +``` + +Read `code_go_ast_v1.rs` and port verbatim β€” the language-agnostic body iterates `doc.blocks` and emits chunks. Only the `VERSION_LABEL` and (potentially) symbol formatting helper change. + +- [ ] **Step 2**: Create `tests/fixtures/sample.c` (~30 lines, includes top-level fn, struct, enum, preprocessor): + +```c +#include +#include + +#define MAX_BUF 4096 + +typedef enum { + OK = 0, + ERR_PARSE, + ERR_IO, +} status_t; + +typedef struct { + int id; + char name[64]; + status_t status; +} record_t; + +static int counter = 0; + +int parse_record(const char *line, record_t *out) { + if (line == NULL || out == NULL) return ERR_PARSE; + return OK; +} + +void print_record(const record_t *r) { + printf("[%d] %s (status=%d)\n", r->id, r->name, r->status); +} + +int main(void) { + record_t r = { .id = 1, .name = "foo", .status = OK }; + print_record(&r); + return 0; +} +``` + +Expected snapshot: 3 function units (`parse_record`, `print_record`, `main`) + 1 enum unit (`status_t`) + 1 struct unit (`record_t`) + 1 `` glue (preproc + global var). Total ~6 chunks. + +- [ ] **Step 3**: Create `tests/code_c_ast_snapshot.rs` mirroring `tests/code_go_ast_snapshot.rs`. Assertions: + +```rust +// Pseudocode: +// 1. Load fixture sample.c +// 2. Run CAstExtractor β†’ Document +// 3. Run CodeCAstV1Chunker.chunk(&doc, &policy) +// 4. Assert chunks.len() == expected (6). +// 5. Assert symbols (from chunks[i].source_spans[0]::SourceSpan::Code.symbol) match expected list: +// ["status_t", "record_t", "parse_record", "print_record", "main", ""] +// (order matches AST traversal order β€” verify by running once.) +// 6. Assert all chunks have lang = Some("c"). +``` + +- [ ] **Step 4**: Register module in `crates/kebab-chunk/src/lib.rs`: + +```rust +pub mod code_c_ast_v1; +pub use code_c_ast_v1::CodeCAstV1Chunker; +``` + +- [ ] **Step 5**: Run test: + +```bash +cargo test -p kebab-chunk --test code_c_ast_snapshot -- --nocapture 2>&1 | tail -25 +``` + +Expected: PASS. If chunk count or symbol order differs from expectation, INSPECT the actual output and update the test's expected list to match (run once to learn, codify on second run). + +- [ ] **Step 6**: Clippy + commit: + +```bash +cargo clippy -p kebab-chunk --all-targets -- -D warnings +git add crates/kebab-chunk/src/code_c_ast_v1.rs \ + crates/kebab-chunk/src/lib.rs \ + crates/kebab-chunk/tests/fixtures/sample.c \ + crates/kebab-chunk/tests/code_c_ast_snapshot.rs +git commit -m "$(cat <<'EOF' +feat(p10-1d): code-c-ast-v1 chunker + snapshot test + +Mirrors code-go-ast-v1's chunker pattern (1 chunk per AST unit + +glue + oversize fallback). Snapshot test against tests/fixtures/sample.c +(function + struct + enum + preprocessor) verifies symbol order + lang=c +stamping. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task E: C++ chunker + snapshot test + +**Files:** +- Create: `crates/kebab-chunk/src/code_cpp_ast_v1.rs` +- Create: `crates/kebab-chunk/tests/fixtures/sample.cpp` +- Create: `crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs` +- Modify: `crates/kebab-chunk/src/lib.rs` + +- [ ] **Step 1**: Create `code_cpp_ast_v1.rs`. **Mirror `code_c_ast_v1.rs`** verbatim, only VERSION_LABEL differs: + +```rust +pub const VERSION_LABEL: &str = "code-cpp-ast-v1"; + +pub struct CodeCppAstV1Chunker; + +impl Chunker for CodeCppAstV1Chunker { + fn chunker_version(&self) -> &'static str { VERSION_LABEL } + // ... identical body β€” both languages use the same Block::Code β†’ Chunk emission ... +} +``` + +The actual symbol-formatting work happens in the EXTRACTOR (Task C). The chunker's job is to iterate blocks the extractor produced and emit Chunks. Both C and C++ chunkers are essentially identical bodies. + +- [ ] **Step 2**: Create `tests/fixtures/sample.cpp` (~50 lines, includes namespace + nested class + method + free fn + template): + +```cpp +#include +#include + +namespace kebab { +namespace chunk { + +class MdHeadingV1Chunker { +public: + MdHeadingV1Chunker() = default; + ~MdHeadingV1Chunker() = default; + + std::string chunk_doc(const std::string& doc) { + return doc; + } + + int operator()(int x) const { + return x * 2; + } + +private: + int counter_ = 0; +}; + +template +T identity(T value) { + return value; +} + +} // namespace chunk + +void global_helper() { + // free function in kebab namespace +} + +} // namespace kebab + +int main() { + kebab::chunk::MdHeadingV1Chunker c; + return 0; +} +``` + +Expected snapshot symbols (verify on first run, then codify): +- `kebab::chunk::MdHeadingV1Chunker` (class unit) +- `kebab::chunk::MdHeadingV1Chunker::MdHeadingV1Chunker` (constructor) +- `kebab::chunk::MdHeadingV1Chunker::~MdHeadingV1Chunker` (destructor) +- `kebab::chunk::MdHeadingV1Chunker::chunk_doc` +- `kebab::chunk::MdHeadingV1Chunker::operator()` +- `kebab::chunk::identity` (template fn) +- `kebab::global_helper` +- `main` (free fn, no namespace) +- `` (include + using) + +~9 chunks total. + +- [ ] **Step 3**: Create `tests/code_cpp_ast_snapshot.rs` mirroring `code_c_ast_snapshot.rs`. Assert symbol list matches expected (run once to learn the actual order, codify). + +- [ ] **Step 4**: Register module in `lib.rs`: + +```rust +pub mod code_cpp_ast_v1; +pub use code_cpp_ast_v1::CodeCppAstV1Chunker; +``` + +- [ ] **Step 5**: Run test: + +```bash +cargo test -p kebab-chunk --test code_cpp_ast_snapshot -- --nocapture 2>&1 | tail -30 +``` + +Expected: PASS. + +- [ ] **Step 6**: Clippy + commit: + +```bash +cargo clippy -p kebab-chunk --all-targets -- -D warnings +git add crates/kebab-chunk/src/code_cpp_ast_v1.rs \ + crates/kebab-chunk/src/lib.rs \ + crates/kebab-chunk/tests/fixtures/sample.cpp \ + crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs +git commit -m "$(cat <<'EOF' +feat(p10-1d): code-cpp-ast-v1 chunker + snapshot test + +Identical chunker body to code-c-ast-v1; per-language work happens in the +CppAstExtractor (Task C). Snapshot fixture covers nested namespace + +class + ctor/dtor + method + operator overload + template fn + free fn + +top-level main, verifying namespace::Class::method symbol convention per +design Β§3.4. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task F: ingest_one_code_asset dispatch + tier3 fallback list extension + +**Files:** +- Modify: `crates/kebab-app/src/lib.rs` + +- [ ] **Step 1**: Top-of-file `use kebab_chunk::{...}` extend with `CodeCAstV1Chunker` + `CodeCppAstV1Chunker`: + +```rust +use kebab_chunk::{ + /* existing items */, + CodeCAstV1Chunker, + CodeCppAstV1Chunker, +}; +``` + +- [ ] **Step 2**: Allowlist (around line 953) extend: + +```rust +if matches!(lang.as_str(), + "rust" | "python" | "typescript" | "javascript" | "go" | "java" | "kotlin" + | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" + | "shell" + | "c" | "cpp") +``` + +- [ ] **Step 3**: `parser_version` match β€” add C/C++ arms (Tier 1, so they DO get a real parser version): + +```rust +let parser_version = match code_lang { + // ... existing 7 Tier 1 + Tier 2 + shell arms ... + "c" => ParserVersion(kebab_parse_code::C_PARSER_VERSION.to_string()), + "cpp" => ParserVersion(kebab_parse_code::CPP_PARSER_VERSION.to_string()), + other => anyhow::bail!("unsupported code_lang: {other}"), +}; +``` + +- [ ] **Step 4**: `chunker_version` match β€” add C/C++ arms: + +```rust +let chunker_version = match code_lang { + // ... existing arms ... + "c" => CodeCAstV1Chunker.chunker_version(), + "cpp" => CodeCppAstV1Chunker.chunker_version(), + other => anyhow::bail!("unreachable chunker_version: {other}"), +}; +``` + +- [ ] **Step 5**: `canonical_result` extract match β€” add C/C++ arms: + +```rust +let canonical_result: anyhow::Result = match code_lang { + "rust" => RustAstExtractor::new().extract(&ctx, &bytes).context("..."), + // ... existing ... + "c" => CAstExtractor::new().extract(&ctx, &bytes) + .context("kb-parse-code::CAstExtractor::extract (code:c)"), + "cpp" => CppAstExtractor::new().extract(&ctx, &bytes) + .context("kb-parse-code::CppAstExtractor::extract (code:cpp)"), + // ... Tier 2 + shell ... + other => anyhow::bail!("unreachable (extract): {other}"), +}; +``` + +(Add `use kebab_parse_code::{CAstExtractor, CppAstExtractor};` at the top if not already wildcard-imported.) + +- [ ] **Step 6**: `chunks_result` match β€” add C/C++ arms: + +```rust +let chunks_result: anyhow::Result> = if extract_fell_back { + // ... existing ... +} else { + match code_lang { + "rust" => CodeRustAstV1Chunker.chunk(&canonical, chunk_policy).context("..."), + // ... existing ... + "c" => CodeCAstV1Chunker.chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeCAstV1Chunker::chunk (code:c)"), + "cpp" => CodeCppAstV1Chunker.chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeCppAstV1Chunker::chunk (code:cpp)"), + // ... existing ... + other => anyhow::bail!("unreachable (chunk): {other}"), + } +}; +``` + +- [ ] **Step 7**: `tier3_fallback_cv` (p10-3 Critical fix) β€” C/C++ are fallback-eligible (extract may fail on `.h` C++ headers or malformed code): + +```rust +let tier3_fallback_cv = match code_lang { + "rust" | "python" | "typescript" | "javascript" + | "go" | "java" | "kotlin" + | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" + | "c" | "cpp" // p10-1d: + => Some(CodeTextParagraphV1Chunker.chunker_version()), + _ => None, +}; +``` + +(The exact location of this match is in `ingest_one_code_asset` between ~lines 1921-1927 per the p10-3 critical fix.) + +- [ ] **Step 8**: Build: + +```bash +cargo build -p kebab-app 2>&1 | tail -5 +``` + +Expected: clean. + +- [ ] **Step 9**: Per-crate test (no regression): + +```bash +cargo test -p kebab-app --lib -- --nocapture 2>&1 | tail -10 +``` + +Expected: 52 PASS (existing baseline). + +- [ ] **Step 10**: Clippy + commit: + +```bash +cargo clippy -p kebab-app --all-targets -- -D warnings +git add crates/kebab-app/src/lib.rs +git commit -m "$(cat <<'EOF' +feat(p10-1d): activate C + C++ in ingest_one_code_asset dispatch + +Extends 4-arm match (parser_version / chunker_version / extract / chunks) ++ allowlist + tier3_fallback_cv list with "c" + "cpp" arms. C uses +CAstExtractor + CodeCAstV1Chunker; C++ uses CppAstExtractor + +CodeCppAstV1Chunker. Both langs are Tier 3-fallback-eligible (e.g. .h +file with C++ syntax may fail tree-sitter-c parse β†’ Tier 3 paragraph +fallback). + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task G: code_ingest_smoke integration tests (C + C++) + +**Files:** +- Modify: `crates/kebab-app/tests/code_ingest_smoke.rs` + +- [ ] **Step 1**: Append 2 tests at the end of the file (mirror the existing tier1 tests `c_ast_v1_*` if present; if not, mirror `rust_ast_v1_*` or `go_ast_v1_*`): + +```rust +#[test] +fn tier1_c_ingest_searchable() { + let env = TestEnv::lexical_only(); + let workspace = env.workspace_root(); + std::fs::write( + workspace.join("parser.c"), + "#include \n\nint parse_record(const char *line) {\n if (line == NULL) return -1;\n return 0;\n}\n", + ) + .unwrap(); + + let report = env.ingest().expect("ingest"); + assert!(report.new_docs >= 1, "expected at least 1 new doc"); + + let hits = env.search_code_lang("c", "parse_record").expect("search"); + assert!(!hits.is_empty(), "expected at least 1 c hit"); + + match &hits[0].citation { + Citation::Code { symbol, lang, .. } => { + assert_eq!(symbol.as_deref(), Some("parse_record"), "C symbol must be function name only"); + assert_eq!(lang.as_deref(), Some("c")); + } + other => panic!("expected Citation::Code, got {other:?}"), + } + assert_eq!( + hits[0].chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-c-ast-v1"), + ); +} + +#[test] +fn tier1_cpp_ingest_searchable() { + let env = TestEnv::lexical_only(); + let workspace = env.workspace_root(); + std::fs::write( + workspace.join("chunker.cpp"), + "namespace kebab {\nnamespace chunk {\nclass Foo {\npublic:\n void bar() { /* impl */ }\n};\n}\n}\n", + ) + .unwrap(); + + let report = env.ingest().expect("ingest"); + assert!(report.new_docs >= 1); + + let hits = env.search_code_lang("cpp", "bar").expect("search"); + assert!(!hits.is_empty(), "expected at least 1 cpp hit"); + + match &hits[0].citation { + Citation::Code { symbol, lang, .. } => { + // Symbol could be "kebab::chunk::Foo::bar" or "kebab::chunk::Foo" depending on which chunk hits first. + assert!( + symbol.as_deref().map_or(false, |s| s.starts_with("kebab::chunk::Foo")), + "C++ symbol must start with namespace::Class prefix, got {:?}", symbol + ); + assert_eq!(lang.as_deref(), Some("cpp")); + } + other => panic!("expected Citation::Code, got {other:?}"), + } + assert_eq!( + hits[0].chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-cpp-ast-v1"), + ); +} +``` + +- [ ] **Step 2**: Run tests: + +```bash +cargo test -p kebab-app --test code_ingest_smoke tier1_c_ingest tier1_cpp_ingest -- --nocapture 2>&1 | tail -30 +``` + +Expected: 2 PASS. + +- [ ] **Step 3**: Full smoke regression: + +```bash +cargo test -p kebab-app --test code_ingest_smoke -- --nocapture 2>&1 | tail -30 +``` + +Expected: 18 PASS (16 existing + 2 new). + +- [ ] **Step 4**: Clippy + commit: + +```bash +cargo clippy -p kebab-app --tests -- -D warnings +git add crates/kebab-app/tests/code_ingest_smoke.rs +git commit -m "$(cat <<'EOF' +test(p10-1d): integration smoke tests for C + C++ + +Verifies end-to-end ingest + search + Citation::Code shape: +- tier1_c_ingest_searchable: .c file β†’ --code-lang c search β†’ symbol + = function name (no nesting), lang = "c", chunker_version = "code-c-ast-v1". +- tier1_cpp_ingest_searchable: .cpp file β†’ --code-lang cpp search β†’ + symbol starts with namespace::Class prefix, lang = "cpp", + chunker_version = "code-cpp-ast-v1". + +Brings code_ingest_smoke to 18 tests (Rust 3 + Python 1 + TS 1 + JS 1 + +Go 1 + Java 1 + Kotlin 1 + yaml 1 + dockerfile 1 + manifest 1 + shell 1 + +yaml-fallback 1 + 2 reingest-unchanged regression + c 1 + cpp 1). + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task H: frozen design Β§10 activation log + +**Files:** +- Modify: `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` + +- [ ] **Step 1**: Find Β§10 activation log. Add p10-1D entry right after the p10-3 entry: + +``` +**p10-1D ν™œμ„±ν™” (C + C++) (2026-05-21)**: Tier 1 chunker family μ™„λ£Œ β€” C (`code-c-ast-v1`, `.c`/`.h`) + C++ (`code-cpp-ast-v1`, `.cpp`/`.cc`/`.cxx`/`.hpp`/`.hh`/`.hxx`) AST chunker ν™œμ„±ν™”. C symbol = function name only; C++ symbol = `namespace::Class::method` (recursive namespace + class nesting). `.h` κ°€ C++ syntax λ§Œλ‚˜λ©΄ tree-sitter-c parse μ‹€νŒ¨ β†’ p10-3 Tier 3 fallback 으둜 μžλ™ picked up. +``` + +- [ ] **Step 2**: Commit: + +```bash +git add docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md \ + docs/superpowers/specs/2026-04-27-kebab-final-form-design.md 2>/dev/null +git add docs/superpowers/specs/2026-04-27-kebab-final-form-design.md +git commit -m "$(cat <<'EOF' +docs(p10-1d): activate C + C++ in frozen design Β§10 + +P10 Tier 1 chunker family complete. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task I: README + HANDOFF + ARCHITECTURE + SMOKE + tasks/INDEX + tasks/p10/INDEX + +**Files:** +- Modify: `README.md` (Mermaid + ingest row), `HANDOFF.md`, `docs/ARCHITECTURE.md`, `docs/SMOKE.md`, `tasks/INDEX.md`, `tasks/p10/INDEX.md` + +- [ ] **Step 1 β€” README.md**: Update the `kebab ingest` row's supported-langs list to include `.c` / `.h` β†’ `code-c-ast-v1` and `.cpp`/`.cc`/`.cxx`/`.hpp`/`.hh`/`.hxx` β†’ `code-cpp-ast-v1`. Extend `--code-lang c` / `--code-lang cpp` in the enumeration. Update the Mermaid `chunker[...]` node to include `code-c-ast-v1, code-cpp-ast-v1` in the brace. + +- [ ] **Step 2 β€” HANDOFF.md**: P10 row append `, **1D βœ… (C + C++ AST chunkers, code-c-ast-v1 + code-cpp-ast-v1 β€” v0.16.0)**`. Update ν•œ 쀄 μš”μ•½ to include C/C++. Update λ‹€μŒ 후보 (drop p10-1D; remaining: P9-5 desktop / P8 audio). + +- [ ] **Step 3 β€” docs/ARCHITECTURE.md**: code parser table row: append C + C++ row mention. Flowchart `pcode` node: append `+ P10-1D`. Directory tree chunkers list: add `code_c_ast_v1.rs` + `code_cpp_ast_v1.rs`. + +- [ ] **Step 4 β€” docs/SMOKE.md**: Add a "## P10-1D C + C++ AST chunker" section after the P10-3 section. Walkthrough with sample.c + sample.cpp ingest + `--code-lang c` / `--code-lang cpp` search assertions. Append verification checklist entry. + +- [ ] **Step 5 β€” tasks/INDEX.md + tasks/p10/INDEX.md**: Flip p10-1D row ⏳ β†’ βœ… (v0.16.0). + +- [ ] **Step 6**: Commit: + +```bash +git add README.md HANDOFF.md docs/ARCHITECTURE.md docs/SMOKE.md tasks/INDEX.md tasks/p10/INDEX.md +git commit -m "$(cat <<'EOF' +docs(p10-1d): README/HANDOFF/ARCHITECTURE/SMOKE/INDEX sync + +P10 Tier 1 chunker family complete (Rust + Python + TS + JS + Go + Java + +Kotlin + C + C++). Tier 2 (k8s + dockerfile + manifest) and Tier 3 +(paragraph fallback) already active. p10-1D ν™œμ„±ν™” + βœ… flip. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task J: workspace test gate + clippy + +- [ ] **Step 1**: Disk check (`df -h /`) + optional `cargo clean`. + +- [ ] **Step 2**: `cargo test --workspace --no-fail-fast -j 1 2>&1 | tail -80`. Expected: all PASS. + +- [ ] **Step 3**: `cargo clippy --workspace --all-targets -- -D warnings 2>&1 | tail -30`. Expected: clean. + +--- + +## Task K: version bump + gitea PR + release + +**Files:** +- Modify: `Cargo.toml` + +- [ ] **Step 1**: Workspace `version = "0.15.0"` β†’ `"0.16.0"`. + +- [ ] **Step 2**: `cargo build -p kebab-cli` to refresh Cargo.lock. + +- [ ] **Step 3**: Commit: + +```bash +git add Cargo.toml Cargo.lock +git commit -m "$(cat <<'EOF' +chore: bump version 0.15.0 β†’ 0.16.0 (p10-1d C + C++ AST chunkers) + +Minor bump β€” additive new chunker_versions code-c-ast-v1 + code-cpp-ast-v1 ++ new routing langs c / cpp + new tree-sitter-c / tree-sitter-cpp workspace +deps. P10 Tier 1 chunker family complete. No DB migration, no wire schema +major bump. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +- [ ] **Step 4**: Push branch + open gitea PR via REST API. Title: `feat(p10-1d): C + C++ AST chunkers β€” P10 Tier 1 chunker family complete`. + +- [ ] **Step 5**: Wait for code-reviewer APPROVE β†’ merge via gitea REST API β†’ cut `gitea-release v0.16.0`. + +--- + +## Verification matrix + +| 검증 | λͺ…λ Ή | κΈ°λŒ€ | +|------|------|------| +| C symbol | `kebab search --code-lang c --json` | `Citation::Code.symbol = ""` | +| C++ symbol | `kebab search --code-lang cpp --json` | `Citation::Code.symbol = "namespace::Class::method"` | +| .h fallback | `.h` with C++ syntax β†’ ingest | Tier 3 fallback: `chunker_version = "code-text-paragraph-v1"`, lang = c | +| code_lang_breakdown | `kebab schema --json` | `"c": N`, `"cpp": M` | + +--- + +## Risks reminder (κ΅¬ν˜„ 쀑 주의) + +- **tree-sitter grammar version resolution**: tree-sitter 0.26 ν˜Έν™˜ grammar. crates.io μ΅œμ‹  버전 default. +- **tree-sitter-cpp 의 node-kind λͺ…**: spec 의 κ°€μ • (`namespace_definition`, `class_specifier`, `function_definition`, `template_declaration`, `concept_definition`, etc.) 이 μ‹€μ œ grammar 와 μΌμΉ˜ν•˜λŠ”μ§€ fixture parse 둜 검증. +- **out-of-class method def 의 prefix 볡원**: `void Foo::bar()` 의 declarator κ°€ `function_declarator > qualified_identifier > namespace_identifier "Foo" + identifier "bar"`. spec 의 `extract_fn_symbol` 이 이 chain μ •ν™•νžˆ walk. +- **Operator overload**: tree-sitter-cpp 의 `operator_name` λ˜λŠ” `field_identifier` "operator+" ν˜•νƒœ. fixture 둜 검증. +- **λ¨Έμ§€ ν›„ deviation** 은 `tasks/HOTFIXES.md` dated 둜그. From b541567946a63fabe71ff3029bde7bc31d6efb78 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 21 May 2026 13:19:00 +0000 Subject: [PATCH 03/13] build(p10-1d): add tree-sitter-c + tree-sitter-cpp workspace deps Standard crate names resolved cleanly: tree-sitter-c v0.24.2 and tree-sitter-cpp v0.23.4 are both compatible with workspace tree-sitter 0.26. Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 22 ++++++++++++++++++++++ Cargo.toml | 3 +++ crates/kebab-parse-code/Cargo.toml | 2 ++ 3 files changed, 27 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 34e4a91..4a82f3b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4353,6 +4353,8 @@ dependencies = [ "time", "tracing", "tree-sitter", + "tree-sitter-c", + "tree-sitter-cpp", "tree-sitter-go", "tree-sitter-java", "tree-sitter-javascript", @@ -8531,6 +8533,26 @@ dependencies = [ "tree-sitter-language", ] +[[package]] +name = "tree-sitter-c" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9b2eb57a55fed6b00812912e730b7a275cf4fe98bfd6a5d76263d4438371728" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-cpp" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df2196ea9d47b4ab4a31b9297eaa5a5d19a0b121dceb9f118f6790ad0ab94743" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "tree-sitter-go" version = "0.25.0" diff --git a/Cargo.toml b/Cargo.toml index 571d7ff..42d34a5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -99,6 +99,9 @@ tree-sitter-go = "0.25.0" # JVM family grammars for code ingest (kebab-parse-code, p10-1C-JK). tree-sitter-java = "0.23.5" tree-sitter-kotlin-ng = "1.1.0" # bare tree-sitter-kotlin requires ts <0.23; -ng uses tree-sitter-language 0.1 (ts 0.26 compat) +# C/C++ family grammars for code ingest (kebab-parse-code, p10-1D). +tree-sitter-c = "0.24.2" +tree-sitter-cpp = "0.23.4" # Disk-footprint trim for dev / test builds. Codegen, opt-level, and # behavior are unchanged β€” only DWARF debug info is reduced (line diff --git a/crates/kebab-parse-code/Cargo.toml b/crates/kebab-parse-code/Cargo.toml index caaceaf..cfdbca6 100644 --- a/crates/kebab-parse-code/Cargo.toml +++ b/crates/kebab-parse-code/Cargo.toml @@ -22,6 +22,8 @@ tree-sitter-javascript = { workspace = true } tree-sitter-go = { workspace = true } tree-sitter-java = { workspace = true } tree-sitter-kotlin-ng = { workspace = true } +tree-sitter-c = { workspace = true } +tree-sitter-cpp = { workspace = true } [dev-dependencies] tempfile = { workspace = true } From e0a29225dae12b94fedbcc2ffbf2c88e9c760f00 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 21 May 2026 13:29:36 +0000 Subject: [PATCH 04/13] feat(p10-1d): C AST extractor (tree-sitter-c) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Top-level units: function_definition (symbol = fn name from declarator's innermost identifier), struct_specifier, enum_specifier, union_specifier (each emits 1 unit with the named identifier as symbol). Preprocessor directives + top-level declarations group into a glue chunk. Empty file or zero units β†’ post-pass. C symbol = function name only β€” no namespace, no class nesting (design Β§3.4). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-parse-code/src/c.rs | 337 +++++++++++++++++++++++++++++ crates/kebab-parse-code/src/lib.rs | 2 + 2 files changed, 339 insertions(+) create mode 100644 crates/kebab-parse-code/src/c.rs diff --git a/crates/kebab-parse-code/src/c.rs b/crates/kebab-parse-code/src/c.rs new file mode 100644 index 0000000..4b88b58 --- /dev/null +++ b/crates/kebab-parse-code/src/c.rs @@ -0,0 +1,337 @@ +//! `kebab-parse-code::c` β€” tree-sitter C AST extractor (P10-1D Task B). +//! +//! Implements [`kebab_core::Extractor`] for [`MediaType::Code("c")`]. +//! Walks the tree-sitter parse tree and emits one [`Block::Code`] per +//! top-level AST semantic unit: +//! +//! - `function_definition` β†’ 1 unit, symbol = function name (extracted +//! from the declarator's innermost `identifier`, handles pointer-returning +//! functions where the declarator is wrapped in `pointer_declarator`). +//! - `struct_specifier` (named) β†’ 1 unit, symbol = struct name. +//! - `enum_specifier` (named) β†’ 1 unit, symbol = enum name. +//! - `union_specifier` (named) β†’ 1 unit, symbol = union name. +//! +//! Everything else (`declaration`, `preproc_*`, `type_definition`, +//! `linkage_specification`, etc.) collapses into a single `` +//! glue chunk. If the file produces zero units **and** zero glue, the +//! `` post-pass emits one unit covering the whole file (1A-2 +//! pattern). +//! +//! C symbol = function name only β€” no namespace, no class nesting +//! (design Β§3.4 C row). Per design Β§3.4 / Β§9.1 / Β§9 versioning. + +use anyhow::Result; +use kebab_core::{ + Block, CanonicalDocument, CodeBlock, CommonBlock, Extractor, Lang, MediaType, Metadata, + ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType, TrustLevel, + id_for_block, id_for_doc, +}; +use serde_json::Map; +use time::OffsetDateTime; + +use crate::scaffold::{filename_from_workspace_path, strip_extension}; + +pub const PARSER_VERSION: &str = "code-c-v1"; + +/// C AST extractor. Per-unit blocks via tree-sitter-c 0.24.2 +/// (`LANGUAGE: LanguageFn`) parsed by tree-sitter 0.26. +pub struct CAstExtractor; + +impl CAstExtractor { + pub fn new() -> Self { + Self + } +} + +impl Default for CAstExtractor { + fn default() -> Self { + Self::new() + } +} + +impl Extractor for CAstExtractor { + fn supports(&self, m: &MediaType) -> bool { + matches!(m, MediaType::Code(l) if l == "c") + } + + fn parser_version(&self) -> ParserVersion { + ParserVersion(PARSER_VERSION.to_string()) + } + + fn extract( + &self, + ctx: &kebab_core::ExtractContext<'_>, + bytes: &[u8], + ) -> Result { + let asset = ctx.asset; + if !self.supports(&asset.media_type) { + anyhow::bail!( + "kebab-parse-code: unsupported media_type for CAstExtractor: {:?}", + asset.media_type + ); + } + + let parser_version = self.parser_version(); + let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, &parser_version); + + let source = String::from_utf8(bytes.to_vec()) + .map_err(|e| anyhow::anyhow!("kebab-parse-code: C source is not valid UTF-8: {e}"))?; + + let blocks = build_blocks(&source, &doc_id)?; + let unit_count = blocks.len() as u32; + + let now = OffsetDateTime::now_utc(); + let mut events: Vec = Vec::with_capacity(2); + events.push(ProvenanceEvent { + at: asset.discovered_at, + agent: "kb-source-fs".to_string(), + kind: ProvenanceKind::Discovered, + note: None, + }); + events.push(ProvenanceEvent { + at: now, + agent: "kb-parse-code".to_string(), + kind: ProvenanceKind::Parsed, + note: Some(format!( + "parser_version={}; unit_count={}", + parser_version.0, unit_count + )), + }); + + let title = { + let fname = filename_from_workspace_path(&asset.workspace_path.0); + strip_extension(&fname) + }; + + let abs_path = match &asset.source_uri { + kebab_core::SourceUri::File(p) => { + if p.is_absolute() { + p.clone() + } else { + ctx.workspace_root.join(p) + } + } + kebab_core::SourceUri::Kb(_) => ctx.workspace_root.to_path_buf(), + }; + let (repo, git_branch, git_commit) = match crate::repo::detect_repo(&abs_path) { + Some(r) => (Some(r.name), r.branch, r.commit), + None => (None, None, None), + }; + + let metadata = Metadata { + aliases: Vec::new(), + tags: Vec::new(), + created_at: asset.discovered_at, + updated_at: asset.discovered_at, + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Map::new(), + repo, + git_branch, + git_commit, + code_lang: Some("c".to_string()), + }; + + tracing::debug!( + target: "kebab-parse-code", + "extracted C doc_id={} workspace_path={} units={}", + doc_id.0, + asset.workspace_path.0, + unit_count + ); + + Ok(CanonicalDocument { + doc_id, + source_asset_id: asset.asset_id.clone(), + workspace_path: asset.workspace_path.clone(), + title, + lang: Lang("und".to_string()), + blocks, + metadata, + provenance: Provenance { events }, + parser_version, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + }) + } +} + +/// Walk down the declarator chain of a `function_definition` to find +/// the innermost `identifier` β€” the function name. +/// +/// The tree for `int *foo(int x) { ... }` looks like: +/// ```text +/// function_definition +/// type: primitive_type "int" +/// declarator: pointer_declarator +/// declarator: function_declarator +/// declarator: identifier "foo" +/// parameters: parameter_list +/// body: compound_statement +/// ``` +/// We walk `declarator` fields recursively until we reach an `identifier` +/// or run out of nodes. Returns `None` if no identifier is found +/// (malformed / unsupported declarator shape). +fn extract_fn_name<'a>(decl_node: tree_sitter::Node, src: &'a str) -> Option<&'a str> { + let mut cur = decl_node; + loop { + match cur.kind() { + "identifier" => return Some(&src[cur.start_byte()..cur.end_byte()]), + // pointer_declarator, function_declarator, array_declarator, + // attributed_declarator, parenthesized_declarator β€” + // all carry a `declarator` field pointing deeper. + _ => { + if let Some(inner) = cur.child_by_field_name("declarator") { + cur = inner; + } else { + // No further `declarator` field; give up. + return None; + } + } + } + } +} + +fn build_blocks( + source: &str, + doc_id: &kebab_core::DocumentId, +) -> anyhow::Result> { + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&tree_sitter_c::LANGUAGE.into()) + .map_err(|e| anyhow::anyhow!("set tree-sitter-c language: {e}"))?; + let tree = parser + .parse(source.as_bytes(), None) + .ok_or_else(|| anyhow::anyhow!("tree-sitter failed to parse C source"))?; + let lines: Vec<&str> = source.split('\n').collect(); + + let root = tree.root_node(); + + // units: (symbol, line_start, line_end, is_real_semantic_unit). + // Glue is accumulated as (start, end) pairs and flushed into one + // "" block (or "" if no real unit exists). + let mut units: Vec<(String, u32, u32, bool)> = Vec::new(); + let mut glue: Vec<(u32, u32)> = Vec::new(); + + /// Walk preceding `comment` siblings to extend the unit's line range + /// upward, folding doc / line comments into the unit (1B pattern). + fn unit_start(n: &tree_sitter::Node) -> u32 { + let mut start = n.start_position().row as u32 + 1; + let mut prev = n.prev_sibling(); + while let Some(p) = prev { + if p.kind() == "comment" { + start = p.start_position().row as u32 + 1; + prev = p.prev_sibling(); + } else { + break; + } + } + start + } + + let mut cur = root.walk(); + for child in root.named_children(&mut cur) { + let s = unit_start(&child); + let e = child.end_position().row as u32 + 1; + + match child.kind() { + "function_definition" => { + if let Some(decl) = child.child_by_field_name("declarator") { + if let Some(name) = extract_fn_name(decl, source) { + flush_glue(&mut glue, &mut units); + units.push((name.to_string(), s, e, true)); + } else { + // Could not extract name β€” treat as glue. + glue.push((s, e)); + } + } else { + glue.push((s, e)); + } + } + "struct_specifier" | "enum_specifier" | "union_specifier" => { + if let Some(name_node) = child.child_by_field_name("name") { + let name = &source[name_node.start_byte()..name_node.end_byte()]; + flush_glue(&mut glue, &mut units); + units.push((name.to_string(), s, e, true)); + } else { + // Anonymous struct/enum/union β€” glue. + glue.push((s, e)); + } + } + // Everything else: preprocessor directives, declarations + // (typedef / global var / fn prototype), type_definition, + // linkage_specification, etc. β€” all collapse into glue. + _ => { + glue.push((s, e)); + } + } + } + flush_glue(&mut glue, &mut units); + + // Post-pass: if the file has no real semantic unit (only glue, or + // completely empty), rename the single glue unit to "" and + // emit it. If there are zero units AND zero glue, synthesise a + // one-line "" covering the whole file. + let has_real_unit = units.iter().any(|(_, _, _, is_real)| *is_real); + + if units.is_empty() { + // Completely empty file or whitespace/comments only. + let total = lines.len() as u32; + units.push(( + "".to_string(), + 1, + total.max(1), + false, + )); + } + // If there is only glue (no real unit) the single pushed "" + // label should be "" β€” rename it now. + if !has_real_unit { + for (sym, _, _, _) in units.iter_mut() { + if sym == "" { + *sym = "".to_string(); + } + } + } + + let total_lines = lines.len() as u32; + let mut blocks = Vec::with_capacity(units.len()); + for (ordinal, (symbol, ls, le, _is_real)) in units.into_iter().enumerate() { + let line_start = ls.max(1); + let line_end = le.min(total_lines.max(1)); + let span = SourceSpan::Code { + line_start, + line_end, + symbol: Some(symbol), + lang: Some("c".to_string()), + }; + let block_id = id_for_block(doc_id, "code", &[], ordinal as u32, &span); + let code = lines[(line_start as usize - 1)..=(line_end as usize - 1)].join("\n"); + blocks.push(Block::Code(CodeBlock { + common: CommonBlock { + block_id, + heading_path: Vec::new(), + source_span: span, + }, + lang: Some("c".to_string()), + code, + })); + } + Ok(blocks) +} + +fn flush_glue(glue: &mut Vec<(u32, u32)>, units: &mut Vec<(String, u32, u32, bool)>) { + if glue.is_empty() { + return; + } + let s = glue.iter().map(|(a, _)| *a).min().unwrap(); + let e = glue.iter().map(|(_, b)| *b).max().unwrap(); + units.push(("".to_string(), s, e, false)); + glue.clear(); +} + +// Tests for CAstExtractor (snapshot + unit assertions) are added in Task D +// alongside the C fixture file. This module is intentionally empty until then. diff --git a/crates/kebab-parse-code/src/lib.rs b/crates/kebab-parse-code/src/lib.rs index 854ba27..a49cd14 100644 --- a/crates/kebab-parse-code/src/lib.rs +++ b/crates/kebab-parse-code/src/lib.rs @@ -13,6 +13,7 @@ //! `kebab-parse-*` crates per design Β§8: must NOT depend on store / embed //! / llm / rag. +pub mod c; pub mod go; pub mod java; pub mod javascript; @@ -25,6 +26,7 @@ pub(crate) mod scaffold; pub mod skip; pub mod typescript; +pub use c::{PARSER_VERSION as C_PARSER_VERSION, CAstExtractor}; pub use go::{PARSER_VERSION as GO_PARSER_VERSION, GoAstExtractor}; pub use java::{PARSER_VERSION as JAVA_PARSER_VERSION, JavaAstExtractor}; pub use javascript::{PARSER_VERSION as JS_PARSER_VERSION, JavascriptAstExtractor}; From 926042049c4effe3d1bc2c9fb000cb986eb74aa6 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 21 May 2026 13:37:58 +0000 Subject: [PATCH 05/13] feat(p10-1d): C++ AST extractor (tree-sitter-cpp) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Symbol = namespace::Class::method via recursive build_blocks. namespace_definition pushes namespace name (anonymous β†’ ). nested_namespace_specifier (outer::inner) flattens all segments and pushes them. class_specifier / struct_specifier (named) emit class unit + recurse with class name pushed. function_definition emits method unit; symbol resolution unpacks declarator chain (pointer_declarator / reference_declarator β†’ function_declarator β†’ identifier / field_identifier / qualified_identifier / operator_name / destructor_name). operator_cast (conversion operators, e.g. operator bool) handled as a direct declarator kind on function_definition. template_declaration recurses with same prefix (template params NOT in symbol). enum_specifier + concept_definition emit type-level units. linkage_specification (extern "C") recurses into body with same prefix. Other top-level nodes β†’ glue. All 15 unit tests pass; build and clippy clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-parse-code/src/cpp.rs | 883 +++++++++++++++++++++++++++++ crates/kebab-parse-code/src/lib.rs | 2 + 2 files changed, 885 insertions(+) create mode 100644 crates/kebab-parse-code/src/cpp.rs diff --git a/crates/kebab-parse-code/src/cpp.rs b/crates/kebab-parse-code/src/cpp.rs new file mode 100644 index 0000000..81bf1f9 --- /dev/null +++ b/crates/kebab-parse-code/src/cpp.rs @@ -0,0 +1,883 @@ +//! `kebab-parse-code::cpp` β€” tree-sitter C++ AST extractor (P10-1D Task C). +//! +//! Implements [`kebab_core::Extractor`] for [`MediaType::Code("cpp")`]. +//! Walks the tree-sitter parse tree and emits one [`Block::Code`] per +//! top-level AST semantic unit, each carrying [`SourceSpan::Code`] with +//! the unit's `::` separated symbol path (design Β§3.4 C++ row). +//! +//! ## Symbol formation +//! +//! Symbol = `namespace::Class::method` via recursive `build_blocks`: +//! +//! - `namespace_definition` (named) β†’ push namespace name, recurse into body. +//! - Anonymous namespace (`namespace { ... }`) β†’ push ``, recurse. +//! - `nested_namespace_specifier` (`outer::inner`) β†’ push all segments, recurse. +//! - `class_specifier` / `struct_specifier` (named) β†’ emit class unit + recurse +//! into body with class name pushed. +//! - `function_definition` β†’ emit method/function unit. Symbol is built from +//! the prefix chain + the extracted declarator name component. +//! - Out-of-class method def (`void Foo::bar() {}`) β€” the declarator's inner +//! node is a `qualified_identifier`; its scope chain is prepended to the +//! current prefix to form the full symbol. +//! - `template_declaration` β†’ recurse into named children with same prefix; +//! the inner function/class body is matched by its own arm. Template params +//! are NOT included in the symbol. +//! - `enum_specifier` (named) β†’ emit type unit. +//! - `concept_definition` (C++20) β†’ emit type unit. +//! - `linkage_specification` (extern "C") β†’ recurse into body with same prefix. +//! +//! ## Constructor / destructor / operator overload +//! +//! - Constructor: `function_declarator > identifier` matching the class name. +//! Symbol = `Class::Class` (name duplicated, same convention as Java). +//! - Destructor: `function_declarator > destructor_name`. Symbol = `Class::~Foo`. +//! - Operator overload: `function_declarator > operator_name`. Symbol = `Class::operator+`. +//! - Conversion operator: `function_definition.declarator` is `operator_cast`. +//! Symbol = `Class::operator ` (e.g. `Class::operator bool`). +//! +//! ## Glue +//! +//! Everything not in the unit list collapses into a single `` glue +//! chunk (preproc, declarations, using, typedef, etc.). If the file produces +//! zero units AND zero glue, the `` post-pass emits one unit covering +//! the whole file. +//! +//! Per design Β§3.4 / Β§9.1 / Β§9 versioning. + +use anyhow::Result; +use kebab_core::{ + Block, CanonicalDocument, CodeBlock, CommonBlock, Extractor, Lang, MediaType, Metadata, + ParserVersion, Provenance, ProvenanceEvent, ProvenanceKind, SourceSpan, SourceType, TrustLevel, + id_for_block, id_for_doc, +}; +use serde_json::Map; +use time::OffsetDateTime; + +use crate::scaffold::{filename_from_workspace_path, strip_extension}; + +pub const PARSER_VERSION: &str = "code-cpp-v1"; + +/// C++ AST extractor. Per-unit blocks via tree-sitter-cpp 0.23.4 +/// (`LANGUAGE: LanguageFn`) parsed by tree-sitter 0.26. +pub struct CppAstExtractor; + +impl CppAstExtractor { + pub fn new() -> Self { + Self + } +} + +impl Default for CppAstExtractor { + fn default() -> Self { + Self::new() + } +} + +impl Extractor for CppAstExtractor { + fn supports(&self, m: &MediaType) -> bool { + matches!(m, MediaType::Code(l) if l == "cpp") + } + + fn parser_version(&self) -> ParserVersion { + ParserVersion(PARSER_VERSION.to_string()) + } + + fn extract( + &self, + ctx: &kebab_core::ExtractContext<'_>, + bytes: &[u8], + ) -> Result { + let asset = ctx.asset; + if !self.supports(&asset.media_type) { + anyhow::bail!( + "kebab-parse-code: unsupported media_type for CppAstExtractor: {:?}", + asset.media_type + ); + } + + let parser_version = self.parser_version(); + let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, &parser_version); + + let source = String::from_utf8(bytes.to_vec()).map_err(|e| { + anyhow::anyhow!("kebab-parse-code: C++ source is not valid UTF-8: {e}") + })?; + + let blocks = build_blocks_top(&source, &doc_id)?; + let unit_count = blocks.len() as u32; + + let now = OffsetDateTime::now_utc(); + let mut events: Vec = Vec::with_capacity(2); + events.push(ProvenanceEvent { + at: asset.discovered_at, + agent: "kb-source-fs".to_string(), + kind: ProvenanceKind::Discovered, + note: None, + }); + events.push(ProvenanceEvent { + at: now, + agent: "kb-parse-code".to_string(), + kind: ProvenanceKind::Parsed, + note: Some(format!( + "parser_version={}; unit_count={}", + parser_version.0, unit_count + )), + }); + + let title = { + let fname = filename_from_workspace_path(&asset.workspace_path.0); + strip_extension(&fname) + }; + + let abs_path = match &asset.source_uri { + kebab_core::SourceUri::File(p) => { + if p.is_absolute() { + p.clone() + } else { + ctx.workspace_root.join(p) + } + } + kebab_core::SourceUri::Kb(_) => ctx.workspace_root.to_path_buf(), + }; + let (repo, git_branch, git_commit) = match crate::repo::detect_repo(&abs_path) { + Some(r) => (Some(r.name), r.branch, r.commit), + None => (None, None, None), + }; + + let metadata = Metadata { + aliases: Vec::new(), + tags: Vec::new(), + created_at: asset.discovered_at, + updated_at: asset.discovered_at, + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Map::new(), + repo, + git_branch, + git_commit, + code_lang: Some("cpp".to_string()), + }; + + tracing::debug!( + target: "kebab-parse-code", + "extracted C++ doc_id={} workspace_path={} units={}", + doc_id.0, + asset.workspace_path.0, + unit_count + ); + + Ok(CanonicalDocument { + doc_id, + source_asset_id: asset.asset_id.clone(), + workspace_path: asset.workspace_path.clone(), + title, + lang: Lang("und".to_string()), + blocks, + metadata, + provenance: Provenance { events }, + parser_version, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + }) + } +} + +// --------------------------------------------------------------------------- +// Core block-building logic +// --------------------------------------------------------------------------- + +/// Top-level entry: parse source, walk the `translation_unit` root, assemble +/// units + glue, apply the `` post-pass, and emit `Block::Code`s. +fn build_blocks_top( + source: &str, + doc_id: &kebab_core::DocumentId, +) -> anyhow::Result> { + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&tree_sitter_cpp::LANGUAGE.into()) + .map_err(|e| anyhow::anyhow!("set tree-sitter-cpp language: {e}"))?; + let tree = parser + .parse(source.as_bytes(), None) + .ok_or_else(|| anyhow::anyhow!("tree-sitter failed to parse C++ source"))?; + let lines: Vec<&str> = source.split('\n').collect(); + let root = tree.root_node(); + + // units: (symbol, line_start, line_end, is_real_semantic_unit). + // Glue is accumulated as (start, end) pairs and flushed into one + // "" block (or "" if no real unit exists). + let mut units: Vec<(String, u32, u32, bool)> = Vec::new(); + let mut glue: Vec<(u32, u32)> = Vec::new(); + + build_blocks(root, source, &[], &mut units, &mut glue); + flush_glue(&mut glue, &mut units); + + // Post-pass: if the file has no real semantic unit (only glue, or + // completely empty), rename the single glue unit to "". + // If there are zero units AND zero glue, synthesize a one-line + // "" covering the whole file. + let has_real_unit = units.iter().any(|(_, _, _, is_real)| *is_real); + + if units.is_empty() { + let total = lines.len() as u32; + units.push(("".to_string(), 1, total.max(1), false)); + } + if !has_real_unit { + for (sym, _, _, _) in units.iter_mut() { + if sym == "" { + *sym = "".to_string(); + } + } + } + + let total_lines = lines.len() as u32; + let mut blocks = Vec::with_capacity(units.len()); + for (ordinal, (symbol, ls, le, _is_real)) in units.into_iter().enumerate() { + let line_start = ls.max(1); + let line_end = le.min(total_lines.max(1)); + let span = SourceSpan::Code { + line_start, + line_end, + symbol: Some(symbol), + lang: Some("cpp".to_string()), + }; + let block_id = id_for_block(doc_id, "code", &[], ordinal as u32, &span); + let code = lines[(line_start as usize - 1)..=(line_end as usize - 1)].join("\n"); + blocks.push(Block::Code(CodeBlock { + common: CommonBlock { + block_id, + heading_path: Vec::new(), + source_span: span, + }, + lang: Some("cpp".to_string()), + code, + })); + } + Ok(blocks) +} + +/// Walk preceding `comment` siblings to extend the unit's line range upward, +/// folding leading doc / line comments into the unit (1B pattern). +fn unit_start(n: &tree_sitter::Node) -> u32 { + let mut start = n.start_position().row as u32 + 1; + let mut prev = n.prev_sibling(); + while let Some(p) = prev { + if p.kind() == "comment" { + start = p.start_position().row as u32 + 1; + prev = p.prev_sibling(); + } else { + break; + } + } + start +} + +fn flush_glue(glue: &mut Vec<(u32, u32)>, units: &mut Vec<(String, u32, u32, bool)>) { + if glue.is_empty() { + return; + } + let s = glue.iter().map(|(a, _)| *a).min().unwrap(); + let e = glue.iter().map(|(_, b)| *b).max().unwrap(); + units.push(("".to_string(), s, e, false)); + glue.clear(); +} + +/// Walk a scope node (translation_unit, declaration_list, field_declaration_list) +/// emitting unit + glue blocks. `prefix` is the current namespace/class chain +/// (e.g. `["kebab", "Chunk", "Foo"]`). +/// +/// After returning, any pending glue in `glue` is NOT flushed β€” callers +/// responsible for flushing at the scope boundary (top-level flush in +/// `build_blocks_top`). Within recursive scope bodies (namespace/class) we +/// do flush before returning so that glue doesn't leak across scopes. +fn build_blocks( + node: tree_sitter::Node, + source: &str, + prefix: &[String], + units: &mut Vec<(String, u32, u32, bool)>, + glue: &mut Vec<(u32, u32)>, +) { + let mut cur = node.walk(); + for child in node.named_children(&mut cur) { + let s = unit_start(&child); + let e = child.end_position().row as u32 + 1; + + match child.kind() { + "namespace_definition" => { + // Flush pending glue before starting this namespace block. + flush_glue(glue, units); + + let name_node = child.child_by_field_name("name"); + let body = child + .child_by_field_name("body") + .unwrap_or(child); + + match name_node { + None => { + // Anonymous namespace: push "", recurse. + let mut new_prefix = prefix.to_vec(); + new_prefix.push("".to_string()); + build_blocks(body, source, &new_prefix, units, glue); + flush_glue(glue, units); + } + Some(nn) => match nn.kind() { + "namespace_identifier" => { + let name = &source[nn.start_byte()..nn.end_byte()]; + let mut new_prefix = prefix.to_vec(); + new_prefix.push(name.to_string()); + build_blocks(body, source, &new_prefix, units, glue); + flush_glue(glue, units); + } + "nested_namespace_specifier" => { + // e.g. `namespace outer::inner { ... }` + // All named children are namespace_identifier nodes. + let mut new_prefix = prefix.to_vec(); + let mut nc = nn.walk(); + for seg in nn.named_children(&mut nc) { + new_prefix.push(source[seg.start_byte()..seg.end_byte()].to_string()); + } + build_blocks(body, source, &new_prefix, units, glue); + flush_glue(glue, units); + } + _ => { + // Unknown name kind β€” treat entire namespace as glue. + glue.push((s, e)); + } + }, + } + } + + "class_specifier" | "struct_specifier" => { + let name_node = child.child_by_field_name("name"); + let Some(nn) = name_node else { + // Anonymous class/struct β€” glue. + glue.push((s, e)); + continue; + }; + let name = match nn.kind() { + "type_identifier" => &source[nn.start_byte()..nn.end_byte()], + _ => { + // template_type or qualified_identifier β€” use full text + // as the symbol segment (includes template args). + &source[nn.start_byte()..nn.end_byte()] + } + }; + + flush_glue(glue, units); + let sym = build_symbol(prefix, &[name]); + units.push((sym, s, e, true)); + + if let Some(body) = child.child_by_field_name("body") { + let mut new_prefix = prefix.to_vec(); + new_prefix.push(name.to_string()); + build_blocks(body, source, &new_prefix, units, glue); + flush_glue(glue, units); + } + } + + "function_definition" => { + let decl = child.child_by_field_name("declarator"); + let Some(decl_node) = decl else { + glue.push((s, e)); + continue; + }; + + match extract_fn_symbol(decl_node, source, prefix) { + Some(sym) => { + flush_glue(glue, units); + units.push((sym, s, e, true)); + } + None => { + glue.push((s, e)); + } + } + } + + "template_declaration" => { + // Unwrap: recurse into named children with same prefix. + // The inner function/class/concept will be matched by their own + // arms. template_parameter_list is not a unit; it will fall + // through to glue (it's not a named child of the template_declaration + // that matches any of our arms). + build_blocks(child, source, prefix, units, glue); + // Do NOT flush glue here β€” template body may be part of a glue group. + } + + "enum_specifier" => { + if let Some(nn) = child.child_by_field_name("name") { + let name = &source[nn.start_byte()..nn.end_byte()]; + flush_glue(glue, units); + let sym = build_symbol(prefix, &[name]); + units.push((sym, s, e, true)); + } else { + // Anonymous enum β€” glue. + glue.push((s, e)); + } + } + + "concept_definition" => { + // C++20. Has required "name" field (identifier). + if let Some(nn) = child.child_by_field_name("name") { + let name = &source[nn.start_byte()..nn.end_byte()]; + flush_glue(glue, units); + let sym = build_symbol(prefix, &[name]); + units.push((sym, s, e, true)); + } else { + glue.push((s, e)); + } + } + + "linkage_specification" => { + // extern "C" { ... } β€” glue-wrapper, but recurse into body + // with same prefix so inner definitions are extracted. + let body = child.child_by_field_name("body").unwrap_or(child); + // The linkage_spec itself is glue; inner defs handled by recursion. + // Don't emit the wrapper as a unit; but also don't push it as glue + // since recursion will push its inner children individually. + build_blocks(body, source, prefix, units, glue); + } + + // Everything else: preproc, declarations, using, typedef, etc. + _ => { + glue.push((s, e)); + } + } + } +} + +/// Join prefix + extras into a `::` separated symbol. +fn build_symbol(prefix: &[String], extras: &[&str]) -> String { + let mut parts: Vec<&str> = prefix.iter().map(String::as_str).collect(); + parts.extend_from_slice(extras); + parts.join("::") +} + +/// Extract the symbol for a `function_definition` given its top-level +/// `declarator` node. Returns `None` if the name cannot be determined. +/// +/// The declarator chain may be: +/// - `function_declarator` (plain fn or method) +/// - `pointer_declarator` wrapping `function_declarator` (fn returning pointer) +/// - `reference_declarator` wrapping `function_declarator` (fn returning ref) +/// - `operator_cast` (conversion operator β€” e.g. `operator bool`) +/// +/// The inner `function_declarator.declarator` is one of: +/// - `identifier` β†’ free fn or constructor, symbol = `prefix::name` +/// - `field_identifier` β†’ method in class body, symbol = `prefix::name` +/// - `destructor_name` β†’ `~Foo`, symbol = `prefix::~Foo` +/// - `operator_name` β†’ `operator+` etc., symbol = `prefix::operator+` +/// - `qualified_identifier` β†’ out-of-class def `Foo::bar` or `ns::Foo::bar`; +/// the scope chain is extracted and prepended to prefix. +/// +/// For `qualified_identifier`, the scope hierarchy (which may itself be a +/// `qualified_identifier`) is flattened into a list of segments. These +/// segments REPLACE the current prefix (since out-of-class defs carry their +/// full scope explicitly). Example: `void ns::Foo::bar() {}` at top level +/// with prefix=[] β†’ segments=[ns, Foo, bar] β†’ symbol = `ns::Foo::bar`. +fn extract_fn_symbol( + decl_node: tree_sitter::Node, + source: &str, + prefix: &[String], +) -> Option { + // Walk down pointer/reference wrapper layers to reach the + // function_declarator (or operator_cast at definition level). + let fn_decl = unwrap_to_fn_declarator(decl_node, source)?; + + match fn_decl.kind() { + "operator_cast" => { + // e.g. `operator bool() const` β€” the function_definition.declarator + // IS the operator_cast (no function_declarator wrapper). + // Symbol = `prefix::operator `. + let type_node = fn_decl.child_by_field_name("type")?; + let type_text = &source[type_node.start_byte()..type_node.end_byte()]; + Some(build_symbol(prefix, &[&format!("operator {type_text}")])) + } + "function_declarator" => { + let inner = fn_decl.child_by_field_name("declarator")?; + extract_name_node(inner, source, prefix) + } + _ => None, + } +} + +/// Walk pointer_declarator / reference_declarator chains down to the +/// first `function_declarator` or `operator_cast` node. +/// +/// Returns `None` if no such node is found (e.g. a function definition +/// whose declarator is malformed or unknown). +fn unwrap_to_fn_declarator<'a>( + mut node: tree_sitter::Node<'a>, + _source: &str, +) -> Option> { + loop { + match node.kind() { + "function_declarator" | "operator_cast" => return Some(node), + "pointer_declarator" => { + node = node.child_by_field_name("declarator")?; + } + "reference_declarator" | "rvalue_reference_declarator" => { + // reference_declarator has no `declarator` field; its child + // is in the unnamed children list. + let mut walker = node.walk(); + node = node.named_children(&mut walker).next()?; + } + _ => return None, + } + } +} + +/// Given the innermost name node of a function_declarator, produce the symbol. +fn extract_name_node( + inner: tree_sitter::Node, + source: &str, + prefix: &[String], +) -> Option { + match inner.kind() { + "identifier" | "field_identifier" => { + let name = &source[inner.start_byte()..inner.end_byte()]; + Some(build_symbol(prefix, &[name])) + } + "destructor_name" => { + // destructor_name text includes the `~` prefix (e.g. "~Foo"). + let full = &source[inner.start_byte()..inner.end_byte()]; + Some(build_symbol(prefix, &[full])) + } + "operator_name" => { + // Full text e.g. "operator+", "operator->", "operator()". + let full = &source[inner.start_byte()..inner.end_byte()]; + Some(build_symbol(prefix, &[full])) + } + "template_function" | "template_method" => { + // Template function like `foo()`. Use the `name` field + // (the identifier / field_identifier before `<`). + let name_node = inner.child_by_field_name("name")?; + let name = &source[name_node.start_byte()..name_node.end_byte()]; + Some(build_symbol(prefix, &[name])) + } + "qualified_identifier" => { + // Out-of-class method definition. Flatten the nested + // qualified_identifier chain into ordered segments. + // Example: `ns::Foo::method` + // qualified_identifier { + // scope: namespace_identifier "ns" + // name: qualified_identifier { + // scope: namespace_identifier "Foo" + // name: identifier "method" + // } + // } + // β†’ ["ns", "Foo", "method"] + // + // These segments are combined with the current prefix so that a + // top-level out-of-class def `void Foo::bar() {}` inside a + // namespace body with prefix=["ns"] produces `ns::Foo::bar`. + let mut segments: Vec = Vec::new(); + flatten_qualified_id(inner, source, &mut segments); + if segments.is_empty() { + return None; + } + // Build: prefix + all segments (scope chain + leaf). + let mut all: Vec<&str> = prefix.iter().map(String::as_str).collect(); + for seg in &segments { + all.push(seg.as_str()); + } + Some(all.join("::")) + } + _ => None, + } +} + +/// Recursively flatten a `qualified_identifier` node into ordered string +/// segments. For `ns::Foo::method` this produces `["ns", "Foo", "method"]`. +fn flatten_qualified_id(node: tree_sitter::Node, source: &str, out: &mut Vec) { + // A qualified_identifier has: + // scope: namespace_identifier | (None for global-scope `::foo`) + // name: identifier | field_identifier | destructor_name | + // operator_name | qualified_identifier | template_function | + // template_method | ... + let scope_node = node.child_by_field_name("scope"); + let name_node = node.child_by_field_name("name"); + + if let Some(s) = scope_node { + out.push(source[s.start_byte()..s.end_byte()].to_string()); + } + + match name_node { + Some(n) if n.kind() == "qualified_identifier" => { + // Recurse: more nesting. + flatten_qualified_id(n, source, out); + } + Some(n) => { + // Leaf name β€” push its text. + out.push(source[n.start_byte()..n.end_byte()].to_string()); + } + None => {} + } +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +pub(crate) mod tests_support { + use kebab_core::*; + use std::path::PathBuf; + use time::OffsetDateTime; + + pub fn fixed_code_asset(workspace_path: &str, lang: &str) -> RawAsset { + RawAsset { + asset_id: AssetId("a".repeat(64)), + source_uri: SourceUri::File(PathBuf::from(workspace_path)), + workspace_path: WorkspacePath(workspace_path.to_string()), + media_type: MediaType::Code(lang.to_string()), + byte_len: 0, + checksum: Checksum("b".repeat(64)), + discovered_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + stored: AssetStorage::Reference { + path: PathBuf::from(workspace_path), + sha: Checksum("b".repeat(64)), + }, + } + } + + pub fn extract_cpp(src: &str, path: &str) -> kebab_core::CanonicalDocument { + use super::CppAstExtractor; + use kebab_core::Extractor; + let asset = fixed_code_asset(path, "cpp"); + let cfg = ExtractConfig::default(); + let root = PathBuf::from("/tmp"); + let ctx = ExtractContext { + asset: &asset, + workspace_root: &root, + config: &cfg, + }; + CppAstExtractor::new().extract(&ctx, src.as_bytes()).unwrap() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use kebab_core::{Block, MediaType, SourceSpan}; + + fn syms(doc: &kebab_core::CanonicalDocument) -> Vec { + let mut s: Vec = doc + .blocks + .iter() + .filter_map(|b| match b { + Block::Code(c) => match &c.common.source_span { + SourceSpan::Code { symbol, .. } => symbol.clone(), + _ => None, + }, + _ => None, + }) + .collect(); + s.sort(); + s + } + + #[test] + fn extractor_supports_only_media_code_cpp() { + let e = CppAstExtractor::new(); + assert!(e.supports(&MediaType::Code("cpp".into()))); + assert!(!e.supports(&MediaType::Code("c".into()))); + assert!(!e.supports(&MediaType::Code("rust".into()))); + assert!(!e.supports(&MediaType::Markdown)); + } + + #[test] + fn free_function() { + let src = "void foo() {}\n"; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!(s.iter().any(|x| x == "foo"), "got {s:?}"); + } + + #[test] + fn namespace_and_class() { + let src = r#" +namespace ns { + class Foo { + public: + void method() {} + Foo() {} + ~Foo() {} + int operator+(const Foo& o) { return 0; } + }; +} +"#; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!(s.iter().any(|x| x == "ns::Foo"), "ns::Foo missing: {s:?}"); + assert!(s.iter().any(|x| x == "ns::Foo::method"), "method missing: {s:?}"); + assert!(s.iter().any(|x| x == "ns::Foo::Foo"), "ctor missing: {s:?}"); + assert!(s.iter().any(|x| x == "ns::Foo::~Foo"), "dtor missing: {s:?}"); + assert!(s.iter().any(|x| x == "ns::Foo::operator+"), "op+ missing: {s:?}"); + } + + #[test] + fn anonymous_namespace() { + let src = r#" +namespace { + void hidden_fn() {} +} +"#; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!( + s.iter().any(|x| x == "::hidden_fn"), + "anon fn missing: {s:?}" + ); + } + + #[test] + fn nested_namespace_specifier() { + let src = r#" +namespace outer::inner { + void fn_in_nested() {} +} +"#; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!( + s.iter().any(|x| x == "outer::inner::fn_in_nested"), + "nested ns fn missing: {s:?}" + ); + } + + #[test] + fn out_of_class_method_def() { + let src = r#" +void ns::Foo::method() { } +"#; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!( + s.iter().any(|x| x == "ns::Foo::method"), + "out-of-class method missing: {s:?}" + ); + } + + #[test] + fn template_declaration() { + let src = r#" +template +class Bar { + void tmpl_method() {} +}; + +template +void tmpl_free_fn(T x) {} +"#; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!(s.iter().any(|x| x == "Bar"), "Bar class missing: {s:?}"); + assert!( + s.iter().any(|x| x == "Bar::tmpl_method"), + "Bar::tmpl_method missing: {s:?}" + ); + assert!( + s.iter().any(|x| x == "tmpl_free_fn"), + "tmpl_free_fn missing: {s:?}" + ); + } + + #[test] + fn enum_and_concept() { + let src = r#" +enum class Color { Red, Green }; + +template +concept Printable = requires(T t) { t.print(); }; +"#; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!(s.iter().any(|x| x == "Color"), "Color missing: {s:?}"); + assert!(s.iter().any(|x| x == "Printable"), "Printable missing: {s:?}"); + } + + #[test] + fn extern_c_block() { + let src = r#" +extern "C" { + void c_fn1() {} + void c_fn2() {} +} +"#; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!(s.iter().any(|x| x == "c_fn1"), "c_fn1 missing: {s:?}"); + assert!(s.iter().any(|x| x == "c_fn2"), "c_fn2 missing: {s:?}"); + } + + #[test] + fn conversion_operator() { + let src = r#" +class Foo { + operator bool() const { return true; } +}; +"#; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!( + s.iter().any(|x| x == "Foo::operator bool"), + "conversion op missing: {s:?}" + ); + } + + #[test] + fn empty_file_produces_module() { + let src = ""; + let doc = tests_support::extract_cpp(src, "x/empty.cpp"); + let s = syms(&doc); + assert_eq!(s, vec![""], "expected : got {s:?}"); + } + + #[test] + fn glue_only_produces_module() { + let src = "#include \nusing namespace std;\n"; + let doc = tests_support::extract_cpp(src, "x/glue.cpp"); + let s = syms(&doc); + assert!(s.iter().any(|x| x == ""), "expected : got {s:?}"); + } + + #[test] + fn ptr_returning_function() { + let src = "int* ptr_fn(int x) { return &x; }\n"; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!(s.iter().any(|x| x == "ptr_fn"), "ptr_fn missing: {s:?}"); + } + + #[test] + fn ref_returning_operator() { + let src = r#" +class Foo { + Foo& operator=(const Foo& o) { return *this; } +}; +"#; + let doc = tests_support::extract_cpp(src, "x/foo.cpp"); + let s = syms(&doc); + assert!( + s.iter().any(|x| x == "Foo::operator="), + "operator= missing: {s:?}" + ); + } + + #[test] + fn deterministic_across_runs() { + let src = r#" +namespace ns { + class Foo { + void method() {} + }; +} +void free_fn() {} +"#; + let a = tests_support::extract_cpp(src, "x/foo.cpp"); + for _ in 0..20 { + assert_eq!(tests_support::extract_cpp(src, "x/foo.cpp").blocks, a.blocks); + } + } +} diff --git a/crates/kebab-parse-code/src/lib.rs b/crates/kebab-parse-code/src/lib.rs index a49cd14..7659fdb 100644 --- a/crates/kebab-parse-code/src/lib.rs +++ b/crates/kebab-parse-code/src/lib.rs @@ -14,6 +14,7 @@ //! / llm / rag. pub mod c; +pub mod cpp; pub mod go; pub mod java; pub mod javascript; @@ -27,6 +28,7 @@ pub mod skip; pub mod typescript; pub use c::{PARSER_VERSION as C_PARSER_VERSION, CAstExtractor}; +pub use cpp::{PARSER_VERSION as CPP_PARSER_VERSION, CppAstExtractor}; pub use go::{PARSER_VERSION as GO_PARSER_VERSION, GoAstExtractor}; pub use java::{PARSER_VERSION as JAVA_PARSER_VERSION, JavaAstExtractor}; pub use javascript::{PARSER_VERSION as JS_PARSER_VERSION, JavascriptAstExtractor}; From 03cd41c48fa95433b12c5b09fb6250afdb753642 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 21 May 2026 13:41:19 +0000 Subject: [PATCH 06/13] feat(p10-1d): code-c-ast-v1 chunker + snapshot test Mirrors code-go-ast-v1's chunker pattern. Snapshot test against tests/fixtures/sample.c (function + typedef struct + typedef enum + preprocessor) verifies symbol list + lang=c stamping. Chunks produced (4 total): - glue: includes, defines, static vars, typedefs (lines 1-18) - parse_record function (lines 20-23) - print_record function (lines 25-27) - main function (lines 29-33) All chunks stamped with lang=c and chunker_version=code-c-ast-v1. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-chunk/src/code_c_ast_v1.rs | 322 ++++++++++++++++++ crates/kebab-chunk/src/lib.rs | 2 + .../kebab-chunk/tests/code_c_ast_snapshot.rs | 196 +++++++++++ .../code-sample.c.chunks.snapshot.json | 86 +++++ crates/kebab-chunk/tests/fixtures/sample.c | 33 ++ 5 files changed, 639 insertions(+) create mode 100644 crates/kebab-chunk/src/code_c_ast_v1.rs create mode 100644 crates/kebab-chunk/tests/code_c_ast_snapshot.rs create mode 100644 crates/kebab-chunk/tests/fixtures/code-sample.c.chunks.snapshot.json create mode 100644 crates/kebab-chunk/tests/fixtures/sample.c diff --git a/crates/kebab-chunk/src/code_c_ast_v1.rs b/crates/kebab-chunk/src/code_c_ast_v1.rs new file mode 100644 index 0000000..22dbcf2 --- /dev/null +++ b/crates/kebab-chunk/src/code_c_ast_v1.rs @@ -0,0 +1,322 @@ +//! `code-c-ast-v1` β€” maps a tree-sitter-derived C AST +//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with +//! `SourceSpan::Code`) to chunks 1:1. A unit longer than +//! `AST_CHUNK_MAX_LINES` is split into ` [part i/N]` sub-chunks +//! at blank-line paragraph boundaries (design Β§9.1 oversize fallback). +//! +//! tree-sitter is intentionally NOT a dependency here: AST work is +//! parser-side (`kebab-parse-code`, design Β§6.3). This chunker only +//! consumes the `CanonicalDocument`. +//! +//! `AST_CHUNK_MAX_LINES` is a constant matching +//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium +//! config threading needs a chunker registry (P+); same deviation +//! pattern as `pdf-page-v1`'s pinned `chunker_version` +//! (`tasks/HOTFIXES.md`). + +use kebab_core::{ + Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId, + SourceSpan, id_for_chunk, +}; + +const VERSION_LABEL: &str = "code-c-ast-v1"; +const BYTES_PER_TOKEN: usize = 3; +const POLICY_HASH_HEX_LEN: usize = 16; +const AST_CHUNK_MAX_LINES: u32 = 200; + +#[derive(Clone, Copy, Debug, Default)] +pub struct CodeCAstV1Chunker; + +impl Chunker for CodeCAstV1Chunker { + fn chunker_version(&self) -> ChunkerVersion { + ChunkerVersion(VERSION_LABEL.to_string()) + } + + fn policy_hash(&self, policy: &ChunkPolicy) -> String { + let bytes = serde_json_canonicalizer::to_vec(policy) + .expect("canonical JSON serialization of ChunkPolicy must not fail"); + let hex = blake3::hash(&bytes).to_hex().to_string(); + hex[..POLICY_HASH_HEX_LEN].to_string() + } + + fn chunk( + &self, + doc: &CanonicalDocument, + policy: &ChunkPolicy, + ) -> anyhow::Result> { + for b in &doc.blocks { + let c = match b { + Block::Code(c) => c, + _ => anyhow::bail!( + "CodeCAstV1Chunker only handles code docs (got non-Code block)" + ), + }; + if !matches!(c.common.source_span, SourceSpan::Code { .. }) { + anyhow::bail!( + "CodeCAstV1Chunker only handles code docs (got non-Code source_span)" + ); + } + } + + let base_policy_hash = self.policy_hash(policy); + let chunker_version = self.chunker_version(); + let mut out: Vec = Vec::new(); + + for b in &doc.blocks { + let cb = match b { + Block::Code(c) => c, + _ => unreachable!("validated above"), + }; + let (ls, le, symbol, lang) = match &cb.common.source_span { + SourceSpan::Code { line_start, line_end, symbol, lang } => { + (*line_start, *line_end, symbol.clone(), lang.clone()) + } + _ => unreachable!("validated above"), + }; + let block_ids: Vec = vec![cb.common.block_id.clone()]; + let span_lines = le.saturating_sub(ls) + 1; + + if span_lines <= AST_CHUNK_MAX_LINES { + let span = SourceSpan::Code { + line_start: ls, + line_end: le, + symbol: symbol.clone(), + lang: lang.clone(), + }; + out.push(make_chunk( + doc, &chunker_version, &block_ids, &base_policy_hash, + None, span, cb.code.clone(), + )); + } else { + let parts = split_oversize(&cb.code); + let n = parts.len(); + for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() { + let part_ls = ls + off_start; + let part_le = ls + off_end; + let part_sym = symbol + .as_ref() + .map(|s| format!("{s} [part {}/{n}]", i + 1)); + let span = SourceSpan::Code { + line_start: part_ls, + line_end: part_le, + symbol: part_sym, + lang: lang.clone(), + }; + out.push(make_chunk( + doc, &chunker_version, &block_ids, &base_policy_hash, + Some(part_ls), span, text, + )); + } + } + } + + tracing::debug!( + target: "kebab-chunk", + doc_id = %doc.doc_id, + chunks = out.len(), + "code-c-ast-v1 chunked", + ); + Ok(out) + } +} + +#[allow(clippy::too_many_arguments)] +fn make_chunk( + doc: &CanonicalDocument, + chunker_version: &ChunkerVersion, + block_ids: &[BlockId], + base_policy_hash: &str, + split_key: Option, + span: SourceSpan, + text: String, +) -> Chunk { + let id_hash = match split_key { + Some(k) => format!("{base_policy_hash}#L{k}"), + None => base_policy_hash.to_string(), + }; + let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash); + let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN); + Chunk { + chunk_id, + doc_id: DocumentId(doc.doc_id.0.clone()), + block_ids: block_ids.to_vec(), + text, + heading_path: Vec::new(), + source_spans: vec![span], + token_estimate, + chunker_version: chunker_version.clone(), + policy_hash: base_policy_hash.to_string(), + } +} + +/// Split an oversize unit at blank-line paragraph boundaries, greedily +/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate. +/// Returns `(line_offset_start, line_offset_end, text)` where offsets are +/// 0-based within the unit (caller adds the unit's absolute `line_start`). +fn split_oversize(code: &str) -> Vec<(u32, u32, String)> { + let lines: Vec<&str> = code.split('\n').collect(); + let total = lines.len() as u32; + let mut out: Vec<(u32, u32, String)> = Vec::new(); + let mut start: u32 = 0; + while start < total { + let mut end = (start + AST_CHUNK_MAX_LINES).min(total); + let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5); + if end < total { + if let Some(b) = (floor.min(end)..end) + .rev() + .find(|&i| lines[i as usize].trim().is_empty()) + { + end = b + 1; + } + } + let text = lines[start as usize..end as usize].join("\n"); + out.push((start, end.saturating_sub(1), text)); + start = end; + } + if out.is_empty() { + out.push((0, total.saturating_sub(1), code.to_string())); + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + use kebab_core::{ + Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, + SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance, + SourceType, TrustLevel, WorkspacePath, + }; + use time::OffsetDateTime; + + fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument { + let wp = WorkspacePath("crates/x/src/a.c".into()); + let aid = AssetId("a".repeat(64)); + let pv = ParserVersion("code-c-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + let blocks = units + .iter() + .enumerate() + .map(|(i, (sym, ls, le, code))| { + let span = SourceSpan::Code { + line_start: *ls, + line_end: *le, + symbol: Some((*sym).to_string()), + lang: Some("c".into()), + }; + let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); + Block::Code(CodeBlock { + common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span }, + lang: Some("c".into()), + code: (*code).to_string(), + }) + }) + .collect(); + CanonicalDocument { + doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(), + lang: Lang("und".into()), blocks, + metadata: Metadata { + aliases: vec![], tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, trust_level: TrustLevel::Primary, + user_id_alias: None, user: Default::default(), + repo: Some("kebab".into()), git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), code_lang: Some("c".into()), + }, + provenance: Provenance { events: vec![] }, + parser_version: pv, schema_version: 1, doc_version: 1, + last_chunker_version: None, last_embedding_version: None, + } + } + fn policy() -> ChunkPolicy { + ChunkPolicy { target_tokens: 500, overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion(VERSION_LABEL.into()) } + } + + #[test] + fn chunker_version_is_code_c_ast_v1() { + assert_eq!(CodeCAstV1Chunker.chunker_version(), + ChunkerVersion("code-c-ast-v1".into())); + } + + #[test] + fn one_chunk_per_unit_preserves_code_span() { + let doc = code_doc(&[ + ("parse", 1, 3, "int parse() {\n\t// x\n}"), + ("print", 5, 7, "void print() {\n\t//\n\treturn;\n}"), + ]); + let chunks = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap(); + assert_eq!(chunks.len(), 2); + for c in &chunks { + assert_eq!(c.source_spans.len(), 1); + assert!(matches!(c.source_spans[0], SourceSpan::Code { .. })); + assert_eq!(c.heading_path, Vec::::new()); + assert_eq!(c.chunker_version.0, "code-c-ast-v1"); + } + match &chunks[0].source_spans[0] { + SourceSpan::Code { symbol, line_start, line_end, .. } => { + assert_eq!(symbol.as_deref(), Some("parse")); + assert_eq!((*line_start, *line_end), (1, 3)); + } + _ => unreachable!(), + } + } + + #[test] + fn oversize_unit_splits_into_parts_with_unique_ids() { + let body = (0..500).map(|i| format!("\tx{i} = {i};\n")).collect::>().join(""); + let code = format!("int big() {{\n{body}\n}}"); + let doc = code_doc(&[("big", 1, 502, &code)]); + let chunks = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap(); + assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len()); + for c in &chunks { + match &c.source_spans[0] { + SourceSpan::Code { symbol, .. } => { + assert!(symbol.as_deref().unwrap().starts_with("big [part "), + "part-numbered symbol, got {symbol:?}"); + } + _ => unreachable!(), + } + } + let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect(); + let n = ids.len(); ids.sort(); ids.dedup(); + assert_eq!(ids.len(), n, "chunk_ids unique across split parts"); + } + + #[test] + fn non_code_doc_errors() { + use kebab_core::TextBlock; + let mut doc = code_doc(&[("parse", 1, 1, "int parse() {}")]); + doc.blocks = vec![Block::Paragraph(TextBlock { + common: CommonBlock { + block_id: kebab_core::BlockId("b".into()), + heading_path: vec![], + source_span: SourceSpan::Line { start: 1, end: 1 }, + }, + text: "x".into(), inlines: vec![], + })]; + let err = CodeCAstV1Chunker.chunk(&doc, &policy()).unwrap_err(); + assert!(err.to_string().contains("CodeCAstV1Chunker")); + } + + #[test] + fn deterministic_chunk_ids_1000() { + let doc = code_doc(&[("parse", 1, 2, "int parse() {}\n")]); + let base: Vec = CodeCAstV1Chunker.chunk(&doc, &policy()) + .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + for _ in 0..1000 { + let again: Vec = CodeCAstV1Chunker.chunk(&doc, &policy()) + .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + assert_eq!(again, base); + } + } + + #[test] + fn policy_hash_matches_md_heading_v1() { + let p = policy(); + assert_eq!(CodeCAstV1Chunker.policy_hash(&p), + crate::MdHeadingV1Chunker.policy_hash(&p)); + } +} diff --git a/crates/kebab-chunk/src/lib.rs b/crates/kebab-chunk/src/lib.rs index eee3f69..f1636ea 100644 --- a/crates/kebab-chunk/src/lib.rs +++ b/crates/kebab-chunk/src/lib.rs @@ -15,6 +15,7 @@ //! embedder, the retriever, the LLM, the RAG layer, or the UI layers. //! It consumes `CanonicalDocument` purely through `kb-core` types. +mod code_c_ast_v1; mod code_go_ast_v1; mod code_java_ast_v1; mod code_js_ast_v1; @@ -30,6 +31,7 @@ pub mod dockerfile_file_v1; pub mod manifest_file_v1; pub mod code_text_paragraph_v1; +pub use code_c_ast_v1::CodeCAstV1Chunker; pub use code_go_ast_v1::CodeGoAstV1Chunker; pub use code_java_ast_v1::CodeJavaAstV1Chunker; pub use code_js_ast_v1::CodeJsAstV1Chunker; diff --git a/crates/kebab-chunk/tests/code_c_ast_snapshot.rs b/crates/kebab-chunk/tests/code_c_ast_snapshot.rs new file mode 100644 index 0000000..62162b0 --- /dev/null +++ b/crates/kebab-chunk/tests/code_c_ast_snapshot.rs @@ -0,0 +1,196 @@ +//! Snapshot test pinning the `Vec` JSON for a +//! representative C code `CanonicalDocument`. +//! +//! This is an integration test. `kebab-parse-code` is intentionally NOT +//! a dev-dep (design Β§6.3 / Β§8 boundary: AST extraction is parser-side). +//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code` +//! units, which is the same pattern used in `code_go_ast_v1.rs`'s +//! internal `code_doc` test helper. +//! +//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline. + +use std::path::PathBuf; + +use kebab_chunk::CodeCAstV1Chunker; +use kebab_core::{ + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, + Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath, + id_for_block, id_for_doc, +}; +use serde_json::Value; +use time::OffsetDateTime; + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") +} + +fn fixed_doc() -> CanonicalDocument { + let wp = WorkspacePath("projects/record.c".into()); + let aid = AssetId("c".repeat(64)); + // Pin parser_version so doc_id / block_ids are reproducible. + let pv = ParserVersion("code-c-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + + // Representative units: + // 0. imports + defines (lines 1–4, ≀200) + // 1. status_t enum typedef (lines 6–9, ≀200) + // 2. record_t struct typedef (lines 11–16, ≀200) + // 3. static counter decl glue (line 18, ≀200) + // 4. parse_record fn (lines 20–23, ≀200) + // 5. print_record fn (lines 25–27, ≀200) + // 6. main fn (lines 29–33, ≀200) + let raw_units: Vec<(&str, u32, u32, String)> = vec![ + ( + "", + 1, + 18, + "#include \n#include \n\n#define MAX_BUF 4096\n\ntypedef enum {\n OK = 0,\n ERR_PARSE,\n ERR_IO,\n} status_t;\n\ntypedef struct {\n int id;\n char name[64];\n status_t status;\n} record_t;\n\nstatic int counter = 0;".to_string(), + ), + ( + "parse_record", + 20, + 23, + "int parse_record(const char *line, record_t *out) {\n if (line == NULL || out == NULL) return ERR_PARSE;\n return OK;\n}".to_string(), + ), + ( + "print_record", + 25, + 27, + "void print_record(const record_t *r) {\n printf(\"[%d] %s (status=%d)\\n\", r->id, r->name, r->status);\n}".to_string(), + ), + ( + "main", + 29, + 33, + "int main(void) {\n record_t r = { .id = 1, .name = \"foo\", .status = OK };\n print_record(&r);\n return 0;\n}".to_string(), + ), + ]; + + let blocks: Vec = raw_units + .iter() + .enumerate() + .map(|(i, (sym, ls, le, code))| { + let span = SourceSpan::Code { + line_start: *ls, + line_end: *le, + symbol: Some((*sym).to_string()), + lang: Some("c".into()), + }; + let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); + Block::Code(CodeBlock { + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, + lang: Some("c".into()), + code: code.clone(), + }) + }) + .collect(); + + CanonicalDocument { + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "record.c".into(), + lang: Lang("und".into()), + blocks, + metadata: Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("c".into()), + }, + provenance: Provenance { events: vec![] }, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + } +} + +fn fixed_policy() -> ChunkPolicy { + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion("code-c-ast-v1".into()), + } +} + +#[test] +fn code_c_ast_chunks_snapshot() { + let doc = fixed_doc(); + let policy = fixed_policy(); + + let chunks = CodeCAstV1Chunker.chunk(&doc, &policy).expect("chunk"); + let actual = serde_json::to_value(&chunks).unwrap(); + + let dir = fixtures_dir(); + let baseline_path = dir.join("code-sample.c.chunks.snapshot.json"); + let baseline_text = match std::fs::read_to_string(&baseline_path) { + Ok(s) => s, + Err(_) if std::env::var("UPDATE_SNAPSHOTS").is_ok() => { + std::fs::create_dir_all(&dir).unwrap(); + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap(); + return; + } + Err(e) => panic!( + "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}", + baseline_path.display() + ), + }; + let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json"); + + if actual != expected { + if std::env::var("UPDATE_SNAPSHOTS").is_ok() { + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap(); + eprintln!("updated baseline {}", baseline_path.display()); + return; + } + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + panic!( + "code-c-ast-v1 chunks snapshot drift\n\ + --- expected ({}) ---\n{baseline_text}\n\ + --- actual ---\n{pretty}\n\ + If intentional, re-run with UPDATE_SNAPSHOTS=1.", + baseline_path.display() + ); + } +} + +/// Determinism cross-check: re-running the same pipeline yields the same +/// chunk_ids byte-for-byte. +#[test] +fn code_c_ast_chunks_are_deterministic() { + let policy = fixed_policy(); + let baseline: Vec = CodeCAstV1Chunker + .chunk(&fixed_doc(), &policy) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); + for _ in 0..5 { + let again: Vec = CodeCAstV1Chunker + .chunk(&fixed_doc(), &policy) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); + assert_eq!(again, baseline); + } +} diff --git a/crates/kebab-chunk/tests/fixtures/code-sample.c.chunks.snapshot.json b/crates/kebab-chunk/tests/fixtures/code-sample.c.chunks.snapshot.json new file mode 100644 index 0000000..832c474 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/code-sample.c.chunks.snapshot.json @@ -0,0 +1,86 @@ +[ + { + "block_ids": [ + "8149e12ca002489acb4a0f74c97a061a" + ], + "chunk_id": "ec3cf06ae56c8e9796bbc9196438b7c5", + "chunker_version": "code-c-ast-v1", + "doc_id": "6bec42dd593920a060541db16c4e8e45", + "heading_path": [], + "policy_hash": "ecfad2ec1223662d", + "source_spans": [ + { + "kind": "code", + "lang": "c", + "line_end": 18, + "line_start": 1, + "symbol": "" + } + ], + "text": "#include \n#include \n\n#define MAX_BUF 4096\n\ntypedef enum {\n OK = 0,\n ERR_PARSE,\n ERR_IO,\n} status_t;\n\ntypedef struct {\n int id;\n char name[64];\n status_t status;\n} record_t;\n\nstatic int counter = 0;", + "token_estimate": 78 + }, + { + "block_ids": [ + "1baaa89f21a47b2f32d6396a24a85454" + ], + "chunk_id": "c2d7a81c898106733ef2e703774a6a4a", + "chunker_version": "code-c-ast-v1", + "doc_id": "6bec42dd593920a060541db16c4e8e45", + "heading_path": [], + "policy_hash": "ecfad2ec1223662d", + "source_spans": [ + { + "kind": "code", + "lang": "c", + "line_end": 23, + "line_start": 20, + "symbol": "parse_record" + } + ], + "text": "int parse_record(const char *line, record_t *out) {\n if (line == NULL || out == NULL) return ERR_PARSE;\n return OK;\n}", + "token_estimate": 41 + }, + { + "block_ids": [ + "8d0e14cbcc6d1e92d7878ab796ea68b8" + ], + "chunk_id": "0e4d7b131ab64eba03b51903b5d8f96d", + "chunker_version": "code-c-ast-v1", + "doc_id": "6bec42dd593920a060541db16c4e8e45", + "heading_path": [], + "policy_hash": "ecfad2ec1223662d", + "source_spans": [ + { + "kind": "code", + "lang": "c", + "line_end": 27, + "line_start": 25, + "symbol": "print_record" + } + ], + "text": "void print_record(const record_t *r) {\n printf(\"[%d] %s (status=%d)\\n\", r->id, r->name, r->status);\n}", + "token_estimate": 35 + }, + { + "block_ids": [ + "9c2ede84423871b615d48c38fefb1853" + ], + "chunk_id": "e076f8edb2ff141d7e99b4106bb95157", + "chunker_version": "code-c-ast-v1", + "doc_id": "6bec42dd593920a060541db16c4e8e45", + "heading_path": [], + "policy_hash": "ecfad2ec1223662d", + "source_spans": [ + { + "kind": "code", + "lang": "c", + "line_end": 33, + "line_start": 29, + "symbol": "main" + } + ], + "text": "int main(void) {\n record_t r = { .id = 1, .name = \"foo\", .status = OK };\n print_record(&r);\n return 0;\n}", + "token_estimate": 38 + } +] diff --git a/crates/kebab-chunk/tests/fixtures/sample.c b/crates/kebab-chunk/tests/fixtures/sample.c new file mode 100644 index 0000000..ded7945 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/sample.c @@ -0,0 +1,33 @@ +#include +#include + +#define MAX_BUF 4096 + +typedef enum { + OK = 0, + ERR_PARSE, + ERR_IO, +} status_t; + +typedef struct { + int id; + char name[64]; + status_t status; +} record_t; + +static int counter = 0; + +int parse_record(const char *line, record_t *out) { + if (line == NULL || out == NULL) return ERR_PARSE; + return OK; +} + +void print_record(const record_t *r) { + printf("[%d] %s (status=%d)\n", r->id, r->name, r->status); +} + +int main(void) { + record_t r = { .id = 1, .name = "foo", .status = OK }; + print_record(&r); + return 0; +} From b2a2902e3870e5d8f4bf48f1956e517105ae12f9 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 21 May 2026 13:46:12 +0000 Subject: [PATCH 07/13] feat(p10-1d): code-cpp-ast-v1 chunker + snapshot test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Identical chunker body to code-c-ast-v1 (per-language work happens in the CppAstExtractor, Task C). Snapshot fixture covers nested namespace + class + ctor/dtor + method + operator overload + template fn + free fn + top-level main, verifying namespace::Class::method symbol convention per design Β§3.4. 5 chunks emitted: - (includes, namespace opening) - kebab::chunk::MdHeadingV1Chunker (class unit) - kebab::identity (template function) - kebab::global_helper (free function in namespace) - main (top-level main function) Template function symbols emit without parameters per spec convention. Namespace::Class::method pattern verified. All tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-chunk/src/code_cpp_ast_v1.rs | 322 ++++++++++++++++++ crates/kebab-chunk/src/lib.rs | 2 + .../tests/code_cpp_ast_snapshot.rs | 200 +++++++++++ .../code-sample.cpp.chunks.snapshot.json | 107 ++++++ crates/kebab-chunk/tests/fixtures/sample.cpp | 40 +++ 5 files changed, 671 insertions(+) create mode 100644 crates/kebab-chunk/src/code_cpp_ast_v1.rs create mode 100644 crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs create mode 100644 crates/kebab-chunk/tests/fixtures/code-sample.cpp.chunks.snapshot.json create mode 100644 crates/kebab-chunk/tests/fixtures/sample.cpp diff --git a/crates/kebab-chunk/src/code_cpp_ast_v1.rs b/crates/kebab-chunk/src/code_cpp_ast_v1.rs new file mode 100644 index 0000000..f9272d3 --- /dev/null +++ b/crates/kebab-chunk/src/code_cpp_ast_v1.rs @@ -0,0 +1,322 @@ +//! `code-cpp-ast-v1` β€” maps a tree-sitter-derived C++ AST +//! `CanonicalDocument` (one `Block::Code` per semantic unit, each with +//! `SourceSpan::Code`) to chunks 1:1. A unit longer than +//! `AST_CHUNK_MAX_LINES` is split into ` [part i/N]` sub-chunks +//! at blank-line paragraph boundaries (design Β§9.1 oversize fallback). +//! +//! tree-sitter is intentionally NOT a dependency here: AST work is +//! parser-side (`kebab-parse-code`, design Β§6.3). This chunker only +//! consumes the `CanonicalDocument`. +//! +//! `AST_CHUNK_MAX_LINES` is a constant matching +//! `IngestCodeCfg::default().ast_chunk_max_lines` (200). Per-medium +//! config threading needs a chunker registry (P+); same deviation +//! pattern as `pdf-page-v1`'s pinned `chunker_version` +//! (`tasks/HOTFIXES.md`). + +use kebab_core::{ + Block, BlockId, CanonicalDocument, Chunk, ChunkPolicy, Chunker, ChunkerVersion, DocumentId, + SourceSpan, id_for_chunk, +}; + +const VERSION_LABEL: &str = "code-cpp-ast-v1"; +const BYTES_PER_TOKEN: usize = 3; +const POLICY_HASH_HEX_LEN: usize = 16; +const AST_CHUNK_MAX_LINES: u32 = 200; + +#[derive(Clone, Copy, Debug, Default)] +pub struct CodeCppAstV1Chunker; + +impl Chunker for CodeCppAstV1Chunker { + fn chunker_version(&self) -> ChunkerVersion { + ChunkerVersion(VERSION_LABEL.to_string()) + } + + fn policy_hash(&self, policy: &ChunkPolicy) -> String { + let bytes = serde_json_canonicalizer::to_vec(policy) + .expect("canonical JSON serialization of ChunkPolicy must not fail"); + let hex = blake3::hash(&bytes).to_hex().to_string(); + hex[..POLICY_HASH_HEX_LEN].to_string() + } + + fn chunk( + &self, + doc: &CanonicalDocument, + policy: &ChunkPolicy, + ) -> anyhow::Result> { + for b in &doc.blocks { + let c = match b { + Block::Code(c) => c, + _ => anyhow::bail!( + "CodeCppAstV1Chunker only handles code docs (got non-Code block)" + ), + }; + if !matches!(c.common.source_span, SourceSpan::Code { .. }) { + anyhow::bail!( + "CodeCppAstV1Chunker only handles code docs (got non-Code source_span)" + ); + } + } + + let base_policy_hash = self.policy_hash(policy); + let chunker_version = self.chunker_version(); + let mut out: Vec = Vec::new(); + + for b in &doc.blocks { + let cb = match b { + Block::Code(c) => c, + _ => unreachable!("validated above"), + }; + let (ls, le, symbol, lang) = match &cb.common.source_span { + SourceSpan::Code { line_start, line_end, symbol, lang } => { + (*line_start, *line_end, symbol.clone(), lang.clone()) + } + _ => unreachable!("validated above"), + }; + let block_ids: Vec = vec![cb.common.block_id.clone()]; + let span_lines = le.saturating_sub(ls) + 1; + + if span_lines <= AST_CHUNK_MAX_LINES { + let span = SourceSpan::Code { + line_start: ls, + line_end: le, + symbol: symbol.clone(), + lang: lang.clone(), + }; + out.push(make_chunk( + doc, &chunker_version, &block_ids, &base_policy_hash, + None, span, cb.code.clone(), + )); + } else { + let parts = split_oversize(&cb.code); + let n = parts.len(); + for (i, (off_start, off_end, text)) in parts.into_iter().enumerate() { + let part_ls = ls + off_start; + let part_le = ls + off_end; + let part_sym = symbol + .as_ref() + .map(|s| format!("{s} [part {}/{n}]", i + 1)); + let span = SourceSpan::Code { + line_start: part_ls, + line_end: part_le, + symbol: part_sym, + lang: lang.clone(), + }; + out.push(make_chunk( + doc, &chunker_version, &block_ids, &base_policy_hash, + Some(part_ls), span, text, + )); + } + } + } + + tracing::debug!( + target: "kebab-chunk", + doc_id = %doc.doc_id, + chunks = out.len(), + "code-cpp-ast-v1 chunked", + ); + Ok(out) + } +} + +#[allow(clippy::too_many_arguments)] +fn make_chunk( + doc: &CanonicalDocument, + chunker_version: &ChunkerVersion, + block_ids: &[BlockId], + base_policy_hash: &str, + split_key: Option, + span: SourceSpan, + text: String, +) -> Chunk { + let id_hash = match split_key { + Some(k) => format!("{base_policy_hash}#L{k}"), + None => base_policy_hash.to_string(), + }; + let chunk_id = id_for_chunk(&doc.doc_id, chunker_version, block_ids, &id_hash); + let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN); + Chunk { + chunk_id, + doc_id: DocumentId(doc.doc_id.0.clone()), + block_ids: block_ids.to_vec(), + text, + heading_path: Vec::new(), + source_spans: vec![span], + token_estimate, + chunker_version: chunker_version.clone(), + policy_hash: base_policy_hash.to_string(), + } +} + +/// Split an oversize unit at blank-line paragraph boundaries, greedily +/// gluing paragraphs until ~`AST_CHUNK_MAX_LINES` lines accumulate. +/// Returns `(line_offset_start, line_offset_end, text)` where offsets are +/// 0-based within the unit (caller adds the unit's absolute `line_start`). +fn split_oversize(code: &str) -> Vec<(u32, u32, String)> { + let lines: Vec<&str> = code.split('\n').collect(); + let total = lines.len() as u32; + let mut out: Vec<(u32, u32, String)> = Vec::new(); + let mut start: u32 = 0; + while start < total { + let mut end = (start + AST_CHUNK_MAX_LINES).min(total); + let floor = start + (AST_CHUNK_MAX_LINES * 4 / 5); + if end < total { + if let Some(b) = (floor.min(end)..end) + .rev() + .find(|&i| lines[i as usize].trim().is_empty()) + { + end = b + 1; + } + } + let text = lines[start as usize..end as usize].join("\n"); + out.push((start, end.saturating_sub(1), text)); + start = end; + } + if out.is_empty() { + out.push((0, total.saturating_sub(1), code.to_string())); + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + use kebab_core::{ + Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, + SourceSpan, id_for_block, id_for_doc, AssetId, Lang, Metadata, ParserVersion, Provenance, + SourceType, TrustLevel, WorkspacePath, + }; + use time::OffsetDateTime; + + fn code_doc(units: &[(&str, u32, u32, &str)]) -> CanonicalDocument { + let wp = WorkspacePath("crates/x/src/a.cpp".into()); + let aid = AssetId("a".repeat(64)); + let pv = ParserVersion("code-cpp-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + let blocks = units + .iter() + .enumerate() + .map(|(i, (sym, ls, le, code))| { + let span = SourceSpan::Code { + line_start: *ls, + line_end: *le, + symbol: Some((*sym).to_string()), + lang: Some("cpp".into()), + }; + let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); + Block::Code(CodeBlock { + common: CommonBlock { block_id: bid, heading_path: vec![], source_span: span }, + lang: Some("cpp".into()), + code: (*code).to_string(), + }) + }) + .collect(); + CanonicalDocument { + doc_id, source_asset_id: aid, workspace_path: wp, title: "a".into(), + lang: Lang("und".into()), blocks, + metadata: Metadata { + aliases: vec![], tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, trust_level: TrustLevel::Primary, + user_id_alias: None, user: Default::default(), + repo: Some("kebab".into()), git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), code_lang: Some("cpp".into()), + }, + provenance: Provenance { events: vec![] }, + parser_version: pv, schema_version: 1, doc_version: 1, + last_chunker_version: None, last_embedding_version: None, + } + } + fn policy() -> ChunkPolicy { + ChunkPolicy { target_tokens: 500, overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion(VERSION_LABEL.into()) } + } + + #[test] + fn chunker_version_is_code_cpp_ast_v1() { + assert_eq!(CodeCppAstV1Chunker.chunker_version(), + ChunkerVersion("code-cpp-ast-v1".into())); + } + + #[test] + fn one_chunk_per_unit_preserves_code_span() { + let doc = code_doc(&[ + ("parse", 1, 3, "int parse() {\n\t// x\n}"), + ("print", 5, 7, "void print() {\n\t//\n\treturn;\n}"), + ]); + let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap(); + assert_eq!(chunks.len(), 2); + for c in &chunks { + assert_eq!(c.source_spans.len(), 1); + assert!(matches!(c.source_spans[0], SourceSpan::Code { .. })); + assert_eq!(c.heading_path, Vec::::new()); + assert_eq!(c.chunker_version.0, "code-cpp-ast-v1"); + } + match &chunks[0].source_spans[0] { + SourceSpan::Code { symbol, line_start, line_end, .. } => { + assert_eq!(symbol.as_deref(), Some("parse")); + assert_eq!((*line_start, *line_end), (1, 3)); + } + _ => unreachable!(), + } + } + + #[test] + fn oversize_unit_splits_into_parts_with_unique_ids() { + let body = (0..500).map(|i| format!("\tx{i} = {i};\n")).collect::>().join(""); + let code = format!("int big() {{\n{body}\n}}"); + let doc = code_doc(&[("big", 1, 502, &code)]); + let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap(); + assert!(chunks.len() >= 2, "oversize unit must split, got {}", chunks.len()); + for c in &chunks { + match &c.source_spans[0] { + SourceSpan::Code { symbol, .. } => { + assert!(symbol.as_deref().unwrap().starts_with("big [part "), + "part-numbered symbol, got {symbol:?}"); + } + _ => unreachable!(), + } + } + let mut ids: Vec<&str> = chunks.iter().map(|c| c.chunk_id.0.as_str()).collect(); + let n = ids.len(); ids.sort(); ids.dedup(); + assert_eq!(ids.len(), n, "chunk_ids unique across split parts"); + } + + #[test] + fn non_code_doc_errors() { + use kebab_core::TextBlock; + let mut doc = code_doc(&[("parse", 1, 1, "int parse() {}")]); + doc.blocks = vec![Block::Paragraph(TextBlock { + common: CommonBlock { + block_id: kebab_core::BlockId("b".into()), + heading_path: vec![], + source_span: SourceSpan::Line { start: 1, end: 1 }, + }, + text: "x".into(), inlines: vec![], + })]; + let err = CodeCppAstV1Chunker.chunk(&doc, &policy()).unwrap_err(); + assert!(err.to_string().contains("CodeCppAstV1Chunker")); + } + + #[test] + fn deterministic_chunk_ids_1000() { + let doc = code_doc(&[("parse", 1, 2, "int parse() {}\n")]); + let base: Vec = CodeCppAstV1Chunker.chunk(&doc, &policy()) + .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + for _ in 0..1000 { + let again: Vec = CodeCppAstV1Chunker.chunk(&doc, &policy()) + .unwrap().into_iter().map(|c| c.chunk_id.0).collect(); + assert_eq!(again, base); + } + } + + #[test] + fn policy_hash_matches_md_heading_v1() { + let p = policy(); + assert_eq!(CodeCppAstV1Chunker.policy_hash(&p), + crate::MdHeadingV1Chunker.policy_hash(&p)); + } +} diff --git a/crates/kebab-chunk/src/lib.rs b/crates/kebab-chunk/src/lib.rs index f1636ea..1be8bd2 100644 --- a/crates/kebab-chunk/src/lib.rs +++ b/crates/kebab-chunk/src/lib.rs @@ -16,6 +16,7 @@ //! It consumes `CanonicalDocument` purely through `kb-core` types. mod code_c_ast_v1; +mod code_cpp_ast_v1; mod code_go_ast_v1; mod code_java_ast_v1; mod code_js_ast_v1; @@ -32,6 +33,7 @@ pub mod manifest_file_v1; pub mod code_text_paragraph_v1; pub use code_c_ast_v1::CodeCAstV1Chunker; +pub use code_cpp_ast_v1::CodeCppAstV1Chunker; pub use code_go_ast_v1::CodeGoAstV1Chunker; pub use code_java_ast_v1::CodeJavaAstV1Chunker; pub use code_js_ast_v1::CodeJsAstV1Chunker; diff --git a/crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs b/crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs new file mode 100644 index 0000000..0b7724f --- /dev/null +++ b/crates/kebab-chunk/tests/code_cpp_ast_snapshot.rs @@ -0,0 +1,200 @@ +//! Snapshot test pinning the `Vec` JSON for a +//! representative C++ code `CanonicalDocument`. +//! +//! This is an integration test. `kebab-parse-code` is intentionally NOT +//! a dev-dep (design Β§6.3 / Β§8 boundary: AST extraction is parser-side). +//! The `CanonicalDocument` is built inline from hand-crafted `Block::Code` +//! units, which is the same pattern used in `code_c_ast_v1.rs`'s +//! internal `code_doc` test helper. +//! +//! Set `UPDATE_SNAPSHOTS=1` to re-bake the baseline. + +use std::path::PathBuf; + +use kebab_chunk::CodeCppAstV1Chunker; +use kebab_core::{ + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, CommonBlock, + Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath, + id_for_block, id_for_doc, +}; +use serde_json::Value; +use time::OffsetDateTime; + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") +} + +fn fixed_doc() -> CanonicalDocument { + let wp = WorkspacePath("projects/record.cpp".into()); + let aid = AssetId("c".repeat(64)); + // Pin parser_version so doc_id / block_ids are reproducible. + let pv = ParserVersion("code-cpp-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + + // Representative units (C++ specific): + // 0. includes + namespace opening (lines 1–4, ≀200) + // 1. class definition (lines 6–20, ≀200) + // 2. template function (lines 22–25, ≀200) + // 3. namespace closing + free fn (lines 27–29, ≀200) + // 4. main fn (lines 31–34, ≀200) + let raw_units: Vec<(&str, u32, u32, String)> = vec![ + ( + "", + 1, + 4, + "#include \n#include \n\nnamespace kebab {".to_string(), + ), + ( + "kebab::chunk::MdHeadingV1Chunker", + 6, + 20, + "class MdHeadingV1Chunker {\npublic:\n MdHeadingV1Chunker() = default;\n ~MdHeadingV1Chunker() = default;\n\n std::string chunk_doc(const std::string& doc) {\n return doc;\n }\n\n int operator()(int x) const {\n return x * 2;\n }\n\nprivate:\n int counter_ = 0;\n};".to_string(), + ), + ( + "kebab::identity", + 22, + 25, + "template \nT identity(T value) {\n return value;\n}".to_string(), + ), + ( + "kebab::global_helper", + 27, + 29, + "void global_helper() {\n // free function in kebab namespace\n}".to_string(), + ), + ( + "main", + 31, + 34, + "int main() {\n kebab::chunk::MdHeadingV1Chunker c;\n return 0;\n}".to_string(), + ), + ]; + + let blocks: Vec = raw_units + .iter() + .enumerate() + .map(|(i, (sym, ls, le, code))| { + let span = SourceSpan::Code { + line_start: *ls, + line_end: *le, + symbol: Some((*sym).to_string()), + lang: Some("cpp".into()), + }; + let bid = id_for_block(&doc_id, "code", &[], i as u32, &span); + Block::Code(CodeBlock { + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, + lang: Some("cpp".into()), + code: code.clone(), + }) + }) + .collect(); + + CanonicalDocument { + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "record.cpp".into(), + lang: Lang("und".into()), + blocks, + metadata: Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("cpp".into()), + }, + provenance: Provenance { events: vec![] }, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + } +} + +fn fixed_policy() -> ChunkPolicy { + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion("code-cpp-ast-v1".into()), + } +} + +#[test] +fn code_cpp_ast_chunks_snapshot() { + let doc = fixed_doc(); + let policy = fixed_policy(); + + let chunks = CodeCppAstV1Chunker.chunk(&doc, &policy).expect("chunk"); + let actual = serde_json::to_value(&chunks).unwrap(); + + let dir = fixtures_dir(); + let baseline_path = dir.join("code-sample.cpp.chunks.snapshot.json"); + let baseline_text = match std::fs::read_to_string(&baseline_path) { + Ok(s) => s, + Err(_) if std::env::var("UPDATE_SNAPSHOTS").is_ok() => { + std::fs::create_dir_all(&dir).unwrap(); + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap(); + return; + } + Err(e) => panic!( + "missing baseline {}; run with UPDATE_SNAPSHOTS=1 to create: {e}", + baseline_path.display() + ), + }; + let expected: Value = serde_json::from_str(&baseline_text).expect("baseline parses as json"); + + if actual != expected { + if std::env::var("UPDATE_SNAPSHOTS").is_ok() { + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + std::fs::write(&baseline_path, format!("{pretty}\n")).unwrap(); + eprintln!("updated baseline {}", baseline_path.display()); + return; + } + let pretty = serde_json::to_string_pretty(&actual).unwrap(); + panic!( + "code-cpp-ast-v1 chunks snapshot drift\n\ + --- expected ({}) ---\n{baseline_text}\n\ + --- actual ---\n{pretty}\n\ + If intentional, re-run with UPDATE_SNAPSHOTS=1.", + baseline_path.display() + ); + } +} + +/// Determinism cross-check: re-running the same pipeline yields the same +/// chunk_ids byte-for-byte. +#[test] +fn code_cpp_ast_chunks_are_deterministic() { + let policy = fixed_policy(); + let baseline: Vec = CodeCppAstV1Chunker + .chunk(&fixed_doc(), &policy) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); + for _ in 0..5 { + let again: Vec = CodeCppAstV1Chunker + .chunk(&fixed_doc(), &policy) + .unwrap() + .into_iter() + .map(|c| c.chunk_id.0) + .collect(); + assert_eq!(again, baseline); + } +} diff --git a/crates/kebab-chunk/tests/fixtures/code-sample.cpp.chunks.snapshot.json b/crates/kebab-chunk/tests/fixtures/code-sample.cpp.chunks.snapshot.json new file mode 100644 index 0000000..257d6e9 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/code-sample.cpp.chunks.snapshot.json @@ -0,0 +1,107 @@ +[ + { + "block_ids": [ + "53292605459065d170cd36c118e20546" + ], + "chunk_id": "50a5b324300d9082eac4ce2a422810e1", + "chunker_version": "code-cpp-ast-v1", + "doc_id": "fff1e1f0a7ff70ef682937470e5d1d28", + "heading_path": [], + "policy_hash": "71f3c07bb9ec1d09", + "source_spans": [ + { + "kind": "code", + "lang": "cpp", + "line_end": 4, + "line_start": 1, + "symbol": "" + } + ], + "text": "#include \n#include \n\nnamespace kebab {", + "token_estimate": 18 + }, + { + "block_ids": [ + "f349acad94c9fa4cf9ad1c0a93e83610" + ], + "chunk_id": "0e6bc7c522665af8a4b0f66afb9d29c8", + "chunker_version": "code-cpp-ast-v1", + "doc_id": "fff1e1f0a7ff70ef682937470e5d1d28", + "heading_path": [], + "policy_hash": "71f3c07bb9ec1d09", + "source_spans": [ + { + "kind": "code", + "lang": "cpp", + "line_end": 20, + "line_start": 6, + "symbol": "kebab::chunk::MdHeadingV1Chunker" + } + ], + "text": "class MdHeadingV1Chunker {\npublic:\n MdHeadingV1Chunker() = default;\n ~MdHeadingV1Chunker() = default;\n\n std::string chunk_doc(const std::string& doc) {\n return doc;\n }\n\n int operator()(int x) const {\n return x * 2;\n }\n\nprivate:\n int counter_ = 0;\n};", + "token_estimate": 95 + }, + { + "block_ids": [ + "8b9811387717d0bd4abf84abcc35b8b1" + ], + "chunk_id": "d9326d252905b665b2adb9a416c20451", + "chunker_version": "code-cpp-ast-v1", + "doc_id": "fff1e1f0a7ff70ef682937470e5d1d28", + "heading_path": [], + "policy_hash": "71f3c07bb9ec1d09", + "source_spans": [ + { + "kind": "code", + "lang": "cpp", + "line_end": 25, + "line_start": 22, + "symbol": "kebab::identity" + } + ], + "text": "template \nT identity(T value) {\n return value;\n}", + "token_estimate": 21 + }, + { + "block_ids": [ + "1754cb6b971f6a4cb292f144a4f0570b" + ], + "chunk_id": "56ee5f991de4a413c016da8dc4acfc35", + "chunker_version": "code-cpp-ast-v1", + "doc_id": "fff1e1f0a7ff70ef682937470e5d1d28", + "heading_path": [], + "policy_hash": "71f3c07bb9ec1d09", + "source_spans": [ + { + "kind": "code", + "lang": "cpp", + "line_end": 29, + "line_start": 27, + "symbol": "kebab::global_helper" + } + ], + "text": "void global_helper() {\n // free function in kebab namespace\n}", + "token_estimate": 22 + }, + { + "block_ids": [ + "14b5f3393d6d25f822f5b70763d24acd" + ], + "chunk_id": "c0d7c043cdd575c530db3909b54cc906", + "chunker_version": "code-cpp-ast-v1", + "doc_id": "fff1e1f0a7ff70ef682937470e5d1d28", + "heading_path": [], + "policy_hash": "71f3c07bb9ec1d09", + "source_spans": [ + { + "kind": "code", + "lang": "cpp", + "line_end": 34, + "line_start": 31, + "symbol": "main" + } + ], + "text": "int main() {\n kebab::chunk::MdHeadingV1Chunker c;\n return 0;\n}", + "token_estimate": 23 + } +] diff --git a/crates/kebab-chunk/tests/fixtures/sample.cpp b/crates/kebab-chunk/tests/fixtures/sample.cpp new file mode 100644 index 0000000..2b95a60 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/sample.cpp @@ -0,0 +1,40 @@ +#include +#include + +namespace kebab { +namespace chunk { + +class MdHeadingV1Chunker { +public: + MdHeadingV1Chunker() = default; + ~MdHeadingV1Chunker() = default; + + std::string chunk_doc(const std::string& doc) { + return doc; + } + + int operator()(int x) const { + return x * 2; + } + +private: + int counter_ = 0; +}; + +template +T identity(T value) { + return value; +} + +} // namespace chunk + +void global_helper() { + // free function in kebab namespace +} + +} // namespace kebab + +int main() { + kebab::chunk::MdHeadingV1Chunker c; + return 0; +} From d1560be80d30e27c9c5c408a72113fcea03099f3 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 21 May 2026 13:56:45 +0000 Subject: [PATCH 08/13] feat(p10-1d): activate C + C++ in ingest_one_code_asset dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends 4-arm match (parser_version / chunker_version / extract / chunks) + allowlist + tier3_fallback_cv with "c" + "cpp" arms. C uses CAstExtractor + CodeCAstV1Chunker; C++ uses CppAstExtractor + CodeCppAstV1Chunker. Both langs are Tier 3-fallback-eligible (e.g. .h file with C++ syntax may fail tree-sitter-c parse β†’ Tier 3 paragraph fallback per p10-3 wrapper). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/lib.rs | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 907d93b..ddf211a 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -39,7 +39,7 @@ use std::sync::Arc; use anyhow::{Context, anyhow}; use serde::{Deserialize, Serialize}; -use kebab_chunk::{CodeGoAstV1Chunker, CodeJavaAstV1Chunker, CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTextParagraphV1Chunker, CodeTsAstV1Chunker, DockerfileFileV1Chunker, K8sManifestResourceV1Chunker, ManifestFileV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; +use kebab_chunk::{CodeCAstV1Chunker, CodeCppAstV1Chunker, CodeGoAstV1Chunker, CodeJavaAstV1Chunker, CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTextParagraphV1Chunker, CodeTsAstV1Chunker, DockerfileFileV1Chunker, K8sManifestResourceV1Chunker, ManifestFileV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; use kebab_core::{ Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker, DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, @@ -50,7 +50,7 @@ use kebab_core::{ use kebab_llm_local::OllamaLanguageModel; use kebab_normalize::build_canonical_document; use kebab_parse_image::{ImageExtractor, OllamaVisionOcr, apply_caption, apply_ocr}; -use kebab_parse_code::{GoAstExtractor, JavaAstExtractor, JavascriptAstExtractor, KotlinAstExtractor, PythonAstExtractor, RustAstExtractor, TypescriptAstExtractor}; +use kebab_parse_code::{CAstExtractor, CppAstExtractor, GoAstExtractor, JavaAstExtractor, JavascriptAstExtractor, KotlinAstExtractor, PythonAstExtractor, RustAstExtractor, TypescriptAstExtractor}; use kebab_parse_pdf::PdfTextExtractor; use kebab_parse_md::{BodyHints, parse_blocks, parse_frontmatter}; use kebab_source_fs::FsSourceConnector; @@ -948,12 +948,12 @@ fn ingest_one_asset( force_reingest, ); } - // p10-1A-2 / 1B: code ingest dispatch. p10-2: Tier 2 langs added. p10-3: shell added. + // p10-1A-2 / 1B: code ingest dispatch. p10-2: Tier 2 langs added. p10-3: shell added. p10-1D: c/cpp added. MediaType::Code(lang) if matches!(lang.as_str(), "rust" | "python" | "typescript" | "javascript" | "go" | "java" | "kotlin" | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" - | "shell") => + | "shell" | "c" | "cpp") => { return ingest_one_code_asset( app, @@ -1838,6 +1838,9 @@ fn ingest_one_code_asset( => ParserVersion("none-v1".to_string()), // p10-3: shell direct routes to Tier 3 (no parse step). "shell" => ParserVersion("none-v1".to_string()), + // p10-1D: C + C++ AST extractors. + "c" => ParserVersion(kebab_parse_code::C_PARSER_VERSION.to_string()), + "cpp" => ParserVersion(kebab_parse_code::CPP_PARSER_VERSION.to_string()), other => anyhow::bail!("unsupported code_lang: {other}"), }; @@ -1857,6 +1860,9 @@ fn ingest_one_code_asset( => ManifestFileV1Chunker.chunker_version(), // p10-3: "shell" => CodeTextParagraphV1Chunker.chunker_version(), + // p10-1D: C + C++ AST chunkers. + "c" => CodeCAstV1Chunker.chunker_version(), + "cpp" => CodeCppAstV1Chunker.chunker_version(), other => anyhow::bail!("unreachable chunker_version: {other}"), }; @@ -1911,6 +1917,13 @@ fn ingest_one_code_asset( } // p10-3: shell reuses the same synthesizer. "shell" => synthesize_tier2_document(asset, &bytes, "shell", &parser_version), + // p10-1D: C + C++ AST extractors. + "c" => CAstExtractor::new() + .extract(&ctx, &bytes) + .context("kebab-parse-code::CAstExtractor::extract (code:c)"), + "cpp" => CppAstExtractor::new() + .extract(&ctx, &bytes) + .context("kebab-parse-code::CppAstExtractor::extract (code:cpp)"), other => anyhow::bail!("unreachable (extract): {other}"), }; @@ -1987,6 +2000,13 @@ fn ingest_one_code_asset( "shell" => CodeTextParagraphV1Chunker .chunk(&canonical, chunk_policy) .context("kb-chunk::CodeTextParagraphV1Chunker::chunk (code:shell)"), + // p10-1D: C + C++ AST chunkers. + "c" => CodeCAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kebab-chunk::CodeCAstV1Chunker::chunk (code:c)"), + "cpp" => CodeCppAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kebab-chunk::CodeCppAstV1Chunker::chunk (code:cpp)"), other => anyhow::bail!("unreachable (chunk): {other}"), } }; From 1034de25a2b401a317d31bc5c7eca36cc2bbca6f Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 21 May 2026 14:19:17 +0000 Subject: [PATCH 09/13] fix(p10-3+p10-1d): land the missing try_skip_unchanged fallback-aware fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #155 (p10-3) merged WITHOUT the reviewer's required Option B1 fix β€” the implementer reported a commit SHA (2a39513) that never made it to main. Result: every reingest of a Tier 3-fallback file (non-k8s YAML, invalid YAML, AST extractor failure) re-runs full extract + chunk + embed because the parser/chunker version comparison can never match (stored is code-text-paragraph-v1 / none-v1, but caller uses Tier 1/2 dispatch values). This commit: 1. Adds the 7th param `fallback_chunker_version: Option<&ChunkerVersion>` to try_skip_unchanged + the stored_is_tier3_fallback detection branch (skip parser/chunker equality, keep embedder check). 2. Threads `None` through non-code call sites (md / image / pdf). 3. Code call site computes tier3_fallback_cv covering all Tier 1/2 langs that can fall back: rust / python / ts / js / go / java / kotlin / yaml / dockerfile / toml / json / xml / groovy / go-mod / c / cpp (p10-1D additions). 4. Adds tier3_yaml_fallback_reingest_is_unchanged + tier3_shell_reingest_is_unchanged regression tests (the originally-promised PR #155 regression coverage that also never made it to main). Smoke tests: 14 + 2 = 16 PASS. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/src/lib.rs | 61 +++++++++++++ crates/kebab-app/tests/code_ingest_smoke.rs | 98 +++++++++++++++++++++ 2 files changed, 159 insertions(+) diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index ddf211a..37013e3 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -795,6 +795,7 @@ fn try_skip_unchanged( current_chunker_version: &ChunkerVersion, current_embedding_version: Option<&kebab_core::EmbeddingVersion>, force_reingest: bool, + fallback_chunker_version: Option<&ChunkerVersion>, // p10-3 fix ) -> anyhow::Result> { if force_reingest { return Ok(None); @@ -829,6 +830,50 @@ fn try_skip_unchanged( if existing_doc.source_asset_id != asset.asset_id { return Ok(None); } + // p10-3 fix: detect "stored doc was previously Tier 3 fallback". + // When a Tier 1/2 extractor emits empty chunks, the fallback wrapper + // retries with CodeTextParagraphV1Chunker and stores + // last_chunker_version = "code-text-paragraph-v1" + parser_version = "none-v1". + // On the next ingest the caller computes current_parser_version / + // current_chunker_version from the Tier 1/2 dispatch (e.g. + // "k8s-manifest-resource-v1"), which can never match the stored + // fallback values, causing spurious re-ingests. Detect this case + // and bypass the parser/chunker equality checks β€” only the embedder + // version still must match. + let stored_is_tier3_fallback = fallback_chunker_version.is_some_and(|fbv| { + existing_doc.last_chunker_version.as_ref() == Some(fbv) + && existing_doc.parser_version.0 == "none-v1" + }); + + if stored_is_tier3_fallback { + // Embedder version still must match. + let embedder_match = existing_doc.last_embedding_version.as_ref() + == current_embedding_version; + if !embedder_match { + return Ok(None); + } + let candidate_doc_id = existing_doc.doc_id.clone(); + tracing::debug!( + target: "kebab-app::ingest", + path = %asset.workspace_path.0, + doc_id = %candidate_doc_id.0, + "skip-unchanged: tier 3 fallback state detected; bypassing parser/chunker equality" + ); + return Ok(Some(kebab_core::IngestItem { + kind: kebab_core::IngestItemKind::Unchanged, + doc_id: Some(candidate_doc_id), + doc_path: asset.workspace_path.clone(), + asset_id: Some(asset.asset_id.clone()), + byte_len: Some(asset.byte_len), + block_count: u32::try_from(existing_doc.blocks.len()).ok(), + chunk_count: None, + parser_version: Some(existing_doc.parser_version.clone()), + chunker_version: existing_doc.last_chunker_version.clone(), + warnings: Vec::new(), + error: None, + })); + } + // 2. Parser unchanged: parser_version is baked into id_for_doc so // a version bump yields a different doc_id and the row above // would have been missing. Checking here explicitly keeps the @@ -1017,6 +1062,7 @@ fn ingest_one_asset( &MdHeadingV1Chunker.chunker_version(), embedder.map(|e| e.model_version()).as_ref(), force_reingest, + None, )? { return Ok(item); } @@ -1211,6 +1257,7 @@ fn ingest_one_image_asset( &MdHeadingV1Chunker.chunker_version(), embedder.map(|e| e.model_version()).as_ref(), force_reingest, + None, )? { return Ok(item); } @@ -1657,6 +1704,7 @@ fn ingest_one_pdf_asset( &PdfPageV1Chunker.chunker_version(), embedder.map(|e| e.model_version()).as_ref(), force_reingest, + None, )? { return Ok(item); } @@ -1866,6 +1914,18 @@ fn ingest_one_code_asset( other => anyhow::bail!("unreachable chunker_version: {other}"), }; + // p10-3 fix: if this lang can fall back to Tier 3, compute the fallback + // chunker_version so try_skip_unchanged can detect the stored-as-Tier-3 + // state and skip parser/chunker equality checks. + let tier3_fallback_cv: Option = match code_lang { + "rust" | "python" | "typescript" | "javascript" + | "go" | "java" | "kotlin" + | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" + | "c" | "cpp" // p10-1D + => Some(CodeTextParagraphV1Chunker.chunker_version()), + _ => None, + }; + if let Some(item) = try_skip_unchanged( app, asset, @@ -1873,6 +1933,7 @@ fn ingest_one_code_asset( &chunker_version, embedder.map(|e| e.model_version()).as_ref(), force_reingest, + tier3_fallback_cv.as_ref(), )? { return Ok(item); } diff --git a/crates/kebab-app/tests/code_ingest_smoke.rs b/crates/kebab-app/tests/code_ingest_smoke.rs index a462666..4acb12e 100644 --- a/crates/kebab-app/tests/code_ingest_smoke.rs +++ b/crates/kebab-app/tests/code_ingest_smoke.rs @@ -1064,3 +1064,101 @@ fn rust_file_re_ingest_is_unchanged() { ); assert_eq!(item2.doc_id, item1.doc_id); } + +/// p10-3 fix regression: a docker-compose YAML that falls back to Tier 3 +/// (k8s chunker returns empty, CodeTextParagraphV1Chunker retries) must +/// report Unchanged on the second ingest rather than re-processing. +/// Before the fix, try_skip_unchanged returned None because the stored +/// last_chunker_version ("code-text-paragraph-v1" / parser_version +/// "none-v1") never matched the caller's dispatch values. +#[test] +fn tier3_yaml_fallback_reingest_is_unchanged() { + let env = TestEnv::lexical_only(); + + std::fs::write( + env.workspace_root.join("docker-compose.yml"), + "version: '3'\nservices:\n api:\n image: nginx:latest\n", + ) + .unwrap(); + + let report1 = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("first ingest"); + let item1 = report1 + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("docker-compose.yml")) + .expect("docker-compose.yml in first report"); + assert!( + matches!(item1.kind, IngestItemKind::New), + "first ingest must be New, got {:?}", item1.kind + ); + assert_eq!( + item1.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-text-paragraph-v1"), + "first ingest must use Tier 3 fallback chunker" + ); + + let report2 = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("second ingest"); + let item2 = report2 + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("docker-compose.yml")) + .expect("docker-compose.yml in second report"); + assert!( + matches!(item2.kind, IngestItemKind::Unchanged), + "second ingest must be Unchanged, got {:?}", item2.kind + ); +} + +/// p10-3 fix regression: a shell file (direct Tier 3, not a fallback) +/// must also report Unchanged on re-ingest. Shell goes straight to +/// CodeTextParagraphV1Chunker so `stored_is_tier3_fallback` is false +/// (parser_version is "none-v1" and chunker matches the current dispatch), +/// but the normal equality path should pass regardless. +#[test] +fn tier3_shell_reingest_is_unchanged() { + let env = TestEnv::lexical_only(); + + std::fs::write( + env.workspace_root.join("deploy.sh"), + "#!/usr/bin/env bash\nset -e\necho hello\n", + ) + .unwrap(); + + let report1 = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("first ingest"); + let item1 = report1 + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("deploy.sh")) + .expect("deploy.sh in first report"); + assert!( + matches!(item1.kind, IngestItemKind::New), + "first ingest must be New, got {:?}", item1.kind + ); + + let report2 = + kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("second ingest"); + let item2 = report2 + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("deploy.sh")) + .expect("deploy.sh in second report"); + assert!( + matches!(item2.kind, IngestItemKind::Unchanged), + "shell reingest must be Unchanged, got {:?}", item2.kind + ); +} From 192835e5bf5fb1a3c1f362ab081f2ed0db642c42 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 21 May 2026 14:31:35 +0000 Subject: [PATCH 10/13] test(p10-1d): integration smoke tests for C + C++ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Verifies end-to-end ingest + search + Citation::Code shape: - tier1_c_ingest_searchable: .c file β†’ --code-lang c search β†’ symbol = function name (no nesting), lang = "c", chunker_version = "code-c-ast-v1". - tier1_cpp_ingest_searchable: .cpp file β†’ --code-lang cpp search β†’ symbol starts with namespace::Class prefix, lang = "cpp", chunker_version = "code-cpp-ast-v1". Brings code_ingest_smoke to 18 tests (Tier 1: 9 β†’ 11, Tier 2: 3, Tier 3: 4). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/kebab-app/tests/code_ingest_smoke.rs | 169 ++++++++++++++++++++ 1 file changed, 169 insertions(+) diff --git a/crates/kebab-app/tests/code_ingest_smoke.rs b/crates/kebab-app/tests/code_ingest_smoke.rs index 4acb12e..e5f2338 100644 --- a/crates/kebab-app/tests/code_ingest_smoke.rs +++ b/crates/kebab-app/tests/code_ingest_smoke.rs @@ -1117,6 +1117,175 @@ fn tier3_yaml_fallback_reingest_is_unchanged() { ); } +/// p10-1d Task G: a `.c` file with a single top-level function is ingested +/// and the resulting `Citation::Code` hit must carry `lang="c"`, +/// `symbol="parse_record"` (function name only β€” no nesting in C), and +/// `chunker_version = "code-c-ast-v1"`. +#[test] +fn tier1_c_ingest_searchable() { + let env = TestEnv::lexical_only(); + + std::fs::write( + env.workspace_root.join("parser.c"), + "#include \n\nint parse_record(const char *line) {\n if (line == NULL) return -1;\n return 0;\n}\n", + ) + .unwrap(); + + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); + assert_eq!(report.errors, 0, "no ingest errors: {report:?}"); + assert!(report.new >= 1, "c file ingested: {report:?}"); + + let c_item = report + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("parser.c")) + .expect("parser.c item present"); + assert_eq!( + c_item.parser_version.as_ref().map(|p| p.0.as_str()), + Some("code-c-v1"), + "parser_version must be code-c-v1" + ); + assert_eq!( + c_item.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-c-ast-v1"), + "chunker_version must be code-c-ast-v1" + ); + + let query = kebab_core::SearchQuery { + text: "parse_record".to_string(), + mode: kebab_core::SearchMode::Lexical, + k: 10, + filters: kebab_core::SearchFilters { + code_lang: vec!["c".to_string()], + ..Default::default() + }, + }; + let hits = kebab_app::search_with_config(env.config.clone(), query) + .expect("search must succeed"); + + let h = hits + .iter() + .find(|h| matches!(&h.citation, Citation::Code { .. })) + .expect("at least one Citation::Code hit for 'parse_record'"); + + match &h.citation { + Citation::Code { + lang, + symbol, + line_start, + .. + } => { + assert_eq!(lang.as_deref(), Some("c"), "citation.lang must be 'c'"); + assert_eq!( + symbol.as_deref(), + Some("parse_record"), + "C symbol must be function name only (no nesting)" + ); + assert!(*line_start >= 1, "line_start must be >=1"); + } + _ => unreachable!(), + } + + assert_eq!( + h.code_lang.as_deref(), + Some("c"), + "SearchHit.code_lang must be 'c'" + ); + assert_eq!( + h.chunker_version.0.as_str(), + "code-c-ast-v1", + "C chunks must be stamped with code-c-ast-v1" + ); +} + +/// p10-1d Task G: a `.cpp` file with nested namespace + class is ingested +/// and the resulting `Citation::Code` hit must carry `lang="cpp"`, a +/// `symbol` that starts with `"kebab::chunk::Foo"` (namespace::Class or +/// namespace::Class::method), and `chunker_version = "code-cpp-ast-v1"`. +#[test] +fn tier1_cpp_ingest_searchable() { + let env = TestEnv::lexical_only(); + + std::fs::write( + env.workspace_root.join("chunker.cpp"), + "namespace kebab {\nnamespace chunk {\nclass Foo {\npublic:\n void bar() { /* impl */ }\n};\n}\n}\n", + ) + .unwrap(); + + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); + assert_eq!(report.errors, 0, "no ingest errors: {report:?}"); + assert!(report.new >= 1, "cpp file ingested: {report:?}"); + + let cpp_item = report + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("chunker.cpp")) + .expect("chunker.cpp item present"); + assert_eq!( + cpp_item.parser_version.as_ref().map(|p| p.0.as_str()), + Some("code-cpp-v1"), + "parser_version must be code-cpp-v1" + ); + assert_eq!( + cpp_item.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-cpp-ast-v1"), + "chunker_version must be code-cpp-ast-v1" + ); + + let query = kebab_core::SearchQuery { + text: "bar".to_string(), + mode: kebab_core::SearchMode::Lexical, + k: 10, + filters: kebab_core::SearchFilters { + code_lang: vec!["cpp".to_string()], + ..Default::default() + }, + }; + let hits = kebab_app::search_with_config(env.config.clone(), query) + .expect("search must succeed"); + + let h = hits + .iter() + .find(|h| matches!(&h.citation, Citation::Code { .. })) + .expect("at least one Citation::Code hit for 'bar'"); + + match &h.citation { + Citation::Code { + lang, + symbol, + line_start, + .. + } => { + assert_eq!(lang.as_deref(), Some("cpp"), "citation.lang must be 'cpp'"); + // Symbol could be "kebab::chunk::Foo" (class) or "kebab::chunk::Foo::bar" + // (method) depending on which chunk ranks first. + assert!( + symbol.as_deref().is_some_and(|s| s.starts_with("kebab::chunk::Foo")), + "C++ symbol must start with namespace::Class prefix, got {:?}", symbol + ); + assert!(*line_start >= 1, "line_start must be >=1"); + } + _ => unreachable!(), + } + + assert_eq!( + h.code_lang.as_deref(), + Some("cpp"), + "SearchHit.code_lang must be 'cpp'" + ); + assert_eq!( + h.chunker_version.0.as_str(), + "code-cpp-ast-v1", + "C++ chunks must be stamped with code-cpp-ast-v1" + ); +} + /// p10-3 fix regression: a shell file (direct Tier 3, not a fallback) /// must also report Unchanged on re-ingest. Shell goes straight to /// CodeTextParagraphV1Chunker so `stored_is_tier3_fallback` is false From 438870ee252c616b9aae500ef7fb2300fabfe690 Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 21 May 2026 14:32:26 +0000 Subject: [PATCH 11/13] =?UTF-8?q?docs(p10-1d):=20activate=20C=20+=20C++=20?= =?UTF-8?q?in=20frozen=20design=20=C2=A710?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P10 Tier 1 chunker family complete (Rust + Python + TS + JS + Go + Java + Kotlin + C + C++). Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/superpowers/specs/2026-04-27-kebab-final-form-design.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md b/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md index 72992fb..3e2c7a9 100644 --- a/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md +++ b/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md @@ -1553,6 +1553,8 @@ transitional ν˜•νƒœ) 의 source of truth. **p10-3 ν™œμ„±ν™” (Tier 3 paragraph fallback) (2026-05-21)**: Tier 3 chunker `code-text-paragraph-v1` ν™œμ„±ν™”. shell script (`.sh`/`.bash`/`.zsh`) direct routing + Tier 1/2 κ°€ 0 chunk λ˜λŠ” Err μ‹œ μžλ™ fallback 으둜 retry. λΉ„-k8s YAML / invalid YAML / AST μ‹€νŒ¨ μΌ€μ΄μŠ€ λͺ¨λ‘ picked up. lang 은 μž…λ ₯ 보쑴 (shell β†’ "shell", yaml β†’ "yaml" λ“±), symbol 은 항상 None. +**p10-1D ν™œμ„±ν™” (C + C++) (2026-05-21)**: P10 Tier 1 chunker family μ™„λ£Œ β€” C (`code-c-ast-v1`, `.c`/`.h`) + C++ (`code-cpp-ast-v1`, `.cpp`/`.cc`/`.cxx`/`.hpp`/`.hh`/`.hxx`) AST chunker ν™œμ„±ν™”. C symbol = function name only (no nesting); C++ symbol = `namespace::Class::method` (recursive namespace + class nesting). `.h` κ°€ C++ syntax λ§Œλ‚˜λ©΄ tree-sitter-c parse μ‹€νŒ¨ β†’ p10-3 Tier 3 fallback 으둜 μžλ™ picked up. + ### 10.2 MCP server transport (fb-30) `kebab mcp` κ°€ stdio JSON-RPC server. Rust SDK = `rmcp 1.6`. Tool surface From 802c573c07e53e78aba5bf87090c1ea90d30dded Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 21 May 2026 14:35:59 +0000 Subject: [PATCH 12/13] docs(p10-1d): README/HANDOFF/ARCHITECTURE/SMOKE/INDEX sync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P10 Tier 1 chunker family complete (Rust + Python + TS + JS + Go + Java + Kotlin + C + C++). - README adds C/C++ to the ingest row + --code-lang c/cpp + Mermaid brace. - HANDOFF flips p10-1D to βœ… (v0.16.0), updates ν•œ 쀄 μš”μ•½ + λ‹€μŒ 후보. - ARCHITECTURE adds C/C++ to the code-parser row, extends flowchart pcode node, adds chunker tree entries. - SMOKE adds P10-1D walkthrough section + verification checklist entry. - tasks/INDEX + tasks/p10/INDEX flip p10-1D to βœ…. Co-Authored-By: Claude Opus 4.7 (1M context) --- HANDOFF.md | 4 ++-- README.md | 4 ++-- docs/ARCHITECTURE.md | 14 +++++++------ docs/SMOKE.md | 49 ++++++++++++++++++++++++++++++++++++++++++++ tasks/INDEX.md | 2 +- tasks/p10/INDEX.md | 2 +- 6 files changed, 63 insertions(+), 12 deletions(-) diff --git a/HANDOFF.md b/HANDOFF.md index bb8338b..386bf0f 100644 --- a/HANDOFF.md +++ b/HANDOFF.md @@ -4,7 +4,7 @@ ## ν•œ 쀄 μš”μ•½ -P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) λ¨Έμ§€ μ™„λ£Œ. `kebab ingest` κ°€ markdown / image / PDF / μ†ŒμŠ€μ½”λ“œ (Rust / Python / TS / JS / Go / Java / Kotlin) / Tier 2 λ¦¬μ†ŒμŠ€ 파일 (yaml/k8s / dockerfile / toml / json / xml / groovy / go-mod) + Tier 3 paragraph fallback (shell / λΉ„-k8s YAML / AST μ‹€νŒ¨ μΌ€μ΄μŠ€) 처리. `kebab search` / `kebab ask` κ°€ 맀체 κ°€λ‘œμ§ˆλŸ¬ κ²°κ³Ό + page / code citation λ°˜ν™˜. `kebab tui` κ°€ 4 νŒ¨λ„ (Library + Search + Ask + Inspect) 제곡. P10-3 (Tier 3 paragraph fallback) μ™„λ£Œ β€” λ‹€μŒ 후보 = P10-1D (C/C++) λ˜λŠ” P9-5 (desktop tauri) λ˜λŠ” 보λ₯˜ 쀑인 P8 (audio). +P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) λ¨Έμ§€ μ™„λ£Œ. `kebab ingest` κ°€ markdown / image / PDF / μ†ŒμŠ€μ½”λ“œ (Rust / Python / TS / JS / Go / Java / Kotlin) / Tier 2 λ¦¬μ†ŒμŠ€ 파일 (yaml/k8s / dockerfile / toml / json / xml / groovy / go-mod) + Tier 3 paragraph fallback (shell / λΉ„-k8s YAML / AST μ‹€νŒ¨ μΌ€μ΄μŠ€) 처리. `kebab search` / `kebab ask` κ°€ 맀체 κ°€λ‘œμ§ˆλŸ¬ κ²°κ³Ό + page / code citation λ°˜ν™˜. `kebab tui` κ°€ 4 νŒ¨λ„ (Library + Search + Ask + Inspect) 제곡. P10-3 (Tier 3 paragraph fallback) μ™„λ£Œ. P10-1D (C + C++) μ™„λ£Œλ‘œ Tier 1 chunker family 마무리 β€” λ‹€μŒ 후보 = P9-5 (desktop tauri) λ˜λŠ” 보λ₯˜ 쀑인 P8 (audio). ## Phase λ‘œλ“œλ§΅ @@ -20,7 +20,7 @@ P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) λ¨Έμ§€ μ™„λ£Œ. | **P7** | PDF text + page citation | `kebab-parse-pdf` | P5 | βœ… μ™„λ£Œ (3/3 component, page-level chunker + ingest wiring) | | **P8** | μŒμ„± transcription + timestamp citation | `kebab-parse-audio` | P5 | ⏸ 보λ₯˜ (whisper-rs μ‹œμŠ€ν…œ dep brainstorm ν•„μš”) | | **P9** | TUI + desktop app | `kebab-tui`, `kebab-desktop` | P5 | 🟑 μ§„ν–‰ (4/5 component β€” P9-1/2/3/4 μ™„λ£Œ [Library / Search / Ask / Inspect], P9-5 desktop μ˜ˆμ • Β· 도그푸딩 ν”Όλ“œλ°± **20/20 βœ…**) | -| **P10** | code ingest framework | `kebab-parse-code` | P5 | 🟑 μ§„ν–‰ 쀑 β€” 1A-1 βœ… (wire schema + parse-code skeleton + filter flags), 1A-2 βœ… (Rust AST chunker, `code-rust-ast-v1` β€” v0.7.0), 1B βœ… (Python/TS/JS AST chunkers β€” v0.8.0 이후), **1C-Go βœ… (Go AST chunker, `code-go-ast-v1` β€” v0.12.0)**, **1C-JavaKotlin βœ… (Java + Kotlin AST chunkers, `code-java-ast-v1` / `code-kotlin-ast-v1` β€” v0.13.0)**, **2 βœ… (Tier 2 resource-aware: yaml/k8s + dockerfile + manifest, `k8s-manifest-resource-v1` / `dockerfile-file-v1` / `manifest-file-v1` β€” v0.14.0)**, **3 βœ… (Tier 3 paragraph fallback: code-text-paragraph-v1 β€” v0.15.0)** | +| **P10** | code ingest framework | `kebab-parse-code` | P5 | 🟑 μ§„ν–‰ 쀑 β€” 1A-1 βœ… (wire schema + parse-code skeleton + filter flags), 1A-2 βœ… (Rust AST chunker, `code-rust-ast-v1` β€” v0.7.0), 1B βœ… (Python/TS/JS AST chunkers β€” v0.8.0 이후), **1C-Go βœ… (Go AST chunker, `code-go-ast-v1` β€” v0.12.0)**, **1C-JavaKotlin βœ… (Java + Kotlin AST chunkers, `code-java-ast-v1` / `code-kotlin-ast-v1` β€” v0.13.0)**, **2 βœ… (Tier 2 resource-aware: yaml/k8s + dockerfile + manifest, `k8s-manifest-resource-v1` / `dockerfile-file-v1` / `manifest-file-v1` β€” v0.14.0)**, **3 βœ… (Tier 3 paragraph fallback: code-text-paragraph-v1 β€” v0.15.0)**, **1D βœ… (C + C++ AST chunkers, code-c-ast-v1 + code-cpp-ast-v1 β€” v0.16.0)** | P0~P5 직렬. P6~P9 P5 이후 병렬 κ°€λŠ₯. diff --git a/README.md b/README.md index 828ac89..1455e14 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ kebab doctor | λͺ…λ Ή | λ™μž‘ | |------|------| | `kebab init` | XDG κ²½λ‘œμ— 데이터 디렉토리 + config.toml 생성 | -| `kebab ingest []` | Markdown / 이미지 / PDF / Rust μ†ŒμŠ€μ½”λ“œ 색인 (idempotent). TTY μ—μ„œλŠ” stderr μ§„ν–‰ λ°”, non-TTY (CI / pipe) λŠ” stderr ν•œ 쀄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming ν›„ λ§ˆμ§€λ§‰μ— `ingest_report.v1`. Ctrl-C ν•œ 번이면 ν˜„μž¬ asset 마무리 ν›„ abort (λΆ€λΆ„ commit 보쑴, idempotent re-run), 두 번째 Ctrl-C λŠ” hard exit. Markdown title 이 frontmatter 에 없어도 첫 H1 β†’ H2 β†’ 첫 paragraph 80 자 β†’ 파일λͺ… 순으둜 μžλ™ 채움 (parser_version `md-frontmatter-v2`) β€” κΈ°μ‘΄ μƒ‰μΈλœ doc 도 λ‹€μŒ ingest μ—μ„œ μƒˆ title 둜 κ°±μ‹ . **Incremental** (p9-fb-23): 두 번째 μ΄ν›„μ˜ ingest λŠ” λ³€ν•˜μ§€ μ•Šμ€ doc (blake3 + parser/chunker/embedder version λͺ¨λ‘ 동일) 의 parse/chunk/embed/vector upsert λ₯Ό μžλ™ μŠ€ν‚΅. final summary 에 `N unchanged` 카운트 ν‘œμ‹œ. `--force-reingest` 둜 skip λ¬΄μ‹œ κ°•μ œ 재처리. **지원 ν˜•μ‹** (extractor μžλ™ κ²°μ • β€” config 에 λͺ…μ‹œ λΆˆκ°€): Markdown (`.md`), 이미지 (`.png` / `.jpg` / `.jpeg`, OCR + caption), PDF (`.pdf`), **μ†ŒμŠ€μ½”λ“œ** (`.rs` β†’ `code-rust-ast-v1`, `.py` β†’ `code-python-ast-v1`, `.ts`/`.tsx` β†’ `code-ts-ast-v1`, `.js`/`.mjs`/`.cjs`/`.jsx` β†’ `code-js-ast-v1`, `.go` β†’ `code-go-ast-v1`, `.java` β†’ `code-java-ast-v1`, `.kt`/`.kts` β†’ `code-kotlin-ast-v1` β€” λͺ¨λ‘ tree-sitter AST chunker; **Tier 2 λ¦¬μ†ŒμŠ€ 파일**: `.yaml`/`.yml` β†’ `k8s-manifest-resource-v1` (apiVersion+kind νŒŒμ‹±), `Dockerfile`/`Dockerfile.*`/`*.dockerfile` β†’ `dockerfile-file-v1` (전체 파일), `Cargo.toml`/`pyproject.toml`/`.toml`/`package.json`/`tsconfig.json`/`.json`/`pom.xml`/`.xml`/`build.gradle`/`.gradle`/`go.mod` β†’ `manifest-file-v1` (전체 파일) β€” yaml (k8s) / dockerfile / toml / json / xml / groovy / go-mod 지원); **Tier 3 paragraph fallback** (`.sh`/`.bash`/`.zsh` β†’ `code-text-paragraph-v1`, blank-line paragraph split + 80-line/20-overlap line-window. Tier 1/2 κ°€ 0 chunk λ˜λŠ” Err μ‹œ μžλ™ fallback β€” λΉ„-k8s YAML 같은 μΌ€μ΄μŠ€ picked up. symbol = None, lang 은 원본 보쑴.). λ‹€λ₯Έ ν™•μž₯μžλŠ” μžλ™ skip β€” `IngestItem.warnings` 에 μ‚¬μœ  (`"unsupported media type: .docx"` λ“±), `IngestReport.skipped_by_extension` 에 카운트 λΆ„λ₯˜, CLI / TUI summary 에 breakdown ν‘œμ‹œ. μ½”λ“œ chunk λŠ” `citation.kind = "code"` 에 `citation.lang = ""` + `symbol` + line range λ₯Ό λ‹΄κ³ , SearchHit top-level 에 `code_lang` + `repo` (`.git/` walk-up 의 디렉토리 이름) κ°€ backfill 됨. `--code-lang rust` / `--code-lang python` / `--code-lang typescript` / `--code-lang javascript` / `--code-lang go` / `--code-lang java` / `--code-lang kotlin` / `--code-lang yaml` / `--code-lang dockerfile` / `--code-lang toml` / `--code-lang json` / `--code-lang xml` / `--code-lang groovy` / `--code-lang go-mod` / `--code-lang shell` / `--media code` filter 둜 μ–Έμ–΄λ³„Β·μ½”λ“œ μ „μš© 검색 κ°€λŠ₯ (p10-1A-1 filter flags). Python symbol 은 workspace 경둜 β†’ dotted module path prefix (예: `kebab_eval.metrics.compute_mrr`), TS/JS symbol 은 slash-style module path prefix (예: `src/Foo.Foo.search`), Go symbol 은 `package.Func` / `package.(*Receiver).Method` ν˜•μ‹, Java / Kotlin symbol 은 `com.foo.Foo.bar` ν˜•μ‹ (νŒ¨ν‚€μ§€ + 클래슀 + λ©”μ„œλ“œ/ν•„λ“œ). | +| `kebab ingest []` | Markdown / 이미지 / PDF / Rust μ†ŒμŠ€μ½”λ“œ 색인 (idempotent). TTY μ—μ„œλŠ” stderr μ§„ν–‰ λ°”, non-TTY (CI / pipe) λŠ” stderr ν•œ 쀄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming ν›„ λ§ˆμ§€λ§‰μ— `ingest_report.v1`. Ctrl-C ν•œ 번이면 ν˜„μž¬ asset 마무리 ν›„ abort (λΆ€λΆ„ commit 보쑴, idempotent re-run), 두 번째 Ctrl-C λŠ” hard exit. Markdown title 이 frontmatter 에 없어도 첫 H1 β†’ H2 β†’ 첫 paragraph 80 자 β†’ 파일λͺ… 순으둜 μžλ™ 채움 (parser_version `md-frontmatter-v2`) β€” κΈ°μ‘΄ μƒ‰μΈλœ doc 도 λ‹€μŒ ingest μ—μ„œ μƒˆ title 둜 κ°±μ‹ . **Incremental** (p9-fb-23): 두 번째 μ΄ν›„μ˜ ingest λŠ” λ³€ν•˜μ§€ μ•Šμ€ doc (blake3 + parser/chunker/embedder version λͺ¨λ‘ 동일) 의 parse/chunk/embed/vector upsert λ₯Ό μžλ™ μŠ€ν‚΅. final summary 에 `N unchanged` 카운트 ν‘œμ‹œ. `--force-reingest` 둜 skip λ¬΄μ‹œ κ°•μ œ 재처리. **지원 ν˜•μ‹** (extractor μžλ™ κ²°μ • β€” config 에 λͺ…μ‹œ λΆˆκ°€): Markdown (`.md`), 이미지 (`.png` / `.jpg` / `.jpeg`, OCR + caption), PDF (`.pdf`), **μ†ŒμŠ€μ½”λ“œ** (`.rs` β†’ `code-rust-ast-v1`, `.py` β†’ `code-python-ast-v1`, `.ts`/`.tsx` β†’ `code-ts-ast-v1`, `.js`/`.mjs`/`.cjs`/`.jsx` β†’ `code-js-ast-v1`, `.go` β†’ `code-go-ast-v1`, `.java` β†’ `code-java-ast-v1`, `.kt`/`.kts` β†’ `code-kotlin-ast-v1`, `.c`/`.h` β†’ `code-c-ast-v1`, `.cpp`/`.cc`/`.cxx`/`.hpp`/`.hh`/`.hxx` β†’ `code-cpp-ast-v1` β€” λͺ¨λ‘ tree-sitter AST chunker; **Tier 2 λ¦¬μ†ŒμŠ€ 파일**: `.yaml`/`.yml` β†’ `k8s-manifest-resource-v1` (apiVersion+kind νŒŒμ‹±), `Dockerfile`/`Dockerfile.*`/`*.dockerfile` β†’ `dockerfile-file-v1` (전체 파일), `Cargo.toml`/`pyproject.toml`/`.toml`/`package.json`/`tsconfig.json`/`.json`/`pom.xml`/`.xml`/`build.gradle`/`.gradle`/`go.mod` β†’ `manifest-file-v1` (전체 파일) β€” yaml (k8s) / dockerfile / toml / json / xml / groovy / go-mod 지원); **Tier 3 paragraph fallback** (`.sh`/`.bash`/`.zsh` β†’ `code-text-paragraph-v1`, blank-line paragraph split + 80-line/20-overlap line-window. Tier 1/2 κ°€ 0 chunk λ˜λŠ” Err μ‹œ μžλ™ fallback β€” λΉ„-k8s YAML 같은 μΌ€μ΄μŠ€ picked up. symbol = None, lang 은 원본 보쑴.). λ‹€λ₯Έ ν™•μž₯μžλŠ” μžλ™ skip β€” `IngestItem.warnings` 에 μ‚¬μœ  (`"unsupported media type: .docx"` λ“±), `IngestReport.skipped_by_extension` 에 카운트 λΆ„λ₯˜, CLI / TUI summary 에 breakdown ν‘œμ‹œ. μ½”λ“œ chunk λŠ” `citation.kind = "code"` 에 `citation.lang = ""` + `symbol` + line range λ₯Ό λ‹΄κ³ , SearchHit top-level 에 `code_lang` + `repo` (`.git/` walk-up 의 디렉토리 이름) κ°€ backfill 됨. `--code-lang rust` / `--code-lang python` / `--code-lang typescript` / `--code-lang javascript` / `--code-lang go` / `--code-lang java` / `--code-lang kotlin` / `--code-lang yaml` / `--code-lang dockerfile` / `--code-lang toml` / `--code-lang json` / `--code-lang xml` / `--code-lang groovy` / `--code-lang go-mod` / `--code-lang shell` / `--code-lang c` / `--code-lang cpp` / `--media code` filter 둜 μ–Έμ–΄λ³„Β·μ½”λ“œ μ „μš© 검색 κ°€λŠ₯ (p10-1A-1 filter flags). Python symbol 은 workspace 경둜 β†’ dotted module path prefix (예: `kebab_eval.metrics.compute_mrr`), TS/JS symbol 은 slash-style module path prefix (예: `src/Foo.Foo.search`), Go symbol 은 `package.Func` / `package.(*Receiver).Method` ν˜•μ‹, Java / Kotlin symbol 은 `com.foo.Foo.bar` ν˜•μ‹ (νŒ¨ν‚€μ§€ + 클래슀 + λ©”μ„œλ“œ/ν•„λ“œ). | | `kebab search --mode {lexical,vector,hybrid} "" [--no-cache] [--max-tokens N] [--snippet-chars N] [--cursor ] [--tag T] [--lang L] [--path-glob G] [--trust-min LEVEL] [--media TYPE] [--ingested-after RFC3339] [--doc-id ID] [--trace] [--bulk] [--repo NAME ...] [--code-lang LIST]` | 검색. hybridλŠ” RRF fusion, citation 포함. 같은 process μ•ˆμ—μ„œ 동일 query (NFKC + trim + lowercase μ •κ·œν™”) 반볡 μ‹œ in-process LRU μΊμ‹œ hit (capacity = `[search] cache_capacity`, default 256). `--no-cache` 둜 κ°•μ œ bypass β€” λ””λ²„κΉ…μš©. ingest commit λ°œμƒ μ‹œ `kv['corpus_revision']` bump 으둜 λͺ¨λ“  entry μžλ™ stale. **`--max-tokens` / `--snippet-chars` / `--cursor` (p9-fb-34)** β€” agent budget controls. `--json` 좜λ ₯은 `search_response.v1` wrapper (`{hits, next_cursor, truncated}`) β€” pre-fb-34 의 bare array 와 ν˜Έν™˜ μ•ˆ 됨. mismatched cursor β†’ `error.v1.code = stale_cursor`. **filter flags (p9-fb-36):** `--tag` λŠ” 반볡 κ°€λŠ₯ flag (`--tag rust --tag async`) 둜 OR λ§€μΉ­, `--media` λŠ” `,` ꡬ뢄 닀쀑 κ°’ OR λ§€μΉ­, λ‚˜λ¨Έμ§€ flags 간은 AND μ‘°ν•©. `--trust-min` 은 `primary\|secondary\|generated` 쀑 ν•˜λ‚˜ (ν•΄λ‹Ή level 이상 포함). `--ingested-after` λŠ” RFC3339 UTC β€” νŒŒμ‹± μ‹€νŒ¨ μ‹œ `error.v1.code = config_invalid` (exit 2). `--media md` λŠ” `markdown` alias 둜 μ •κ·œν™”. μ•Œ 수 μ—†λŠ” `--media` 값은 무쑰건 empty hits (였λ₯˜ μ•„λ‹˜). **`--trace` (p9-fb-37)** β€” `search_response.v1.trace` 에 lexical / vector pre-fusion 후보 + RRF union + per-stage timing (`lexical_ms` / `vector_ms` / `fusion_ms` / `total_ms`) λ…ΈμΆœ. trace μš”μ²­μ€ μΊμ‹œ 우회 (`--no-cache` 없이도 항상 cold). **`--bulk` (p9-fb-42)** β€” stdin ndjson 으둜 N query ν•œ λ²ˆμ— μ‹€ν–‰. `--json` λ©΄ stdout per-query ndjson (`bulk_search_item.v1`) + stderr summary (`bulk_summary: total=N succeeded=S failed=F`). Cap 100. agent κ°€ query decomposition ν›„ sub-query 일괄 μ‹€ν–‰ μ‹œ single round-trip β€” App instance μž¬μ‚¬μš©μœΌλ‘œ μΊμ‹œ / embedder cold-start λΉ„μš© ν•œ 번만. Per-query failure λŠ” item 의 `error` (error.v1) 에 격리, λ‹€λ₯Έ query 계속 μ§„ν–‰. **code corpus filters (p10-1A-1):** `--repo` λŠ” 반볡 κ°€λŠ₯ (`--repo kebab --repo other`) OR λ§€μΉ­. `--code-lang` λŠ” 반볡 λ˜λŠ” comma 닀쀑 κ°’ (`--code-lang rust,python`), μ•Œ 수 μ—†λŠ” 값은 빈 hits. `--media code` λŠ” Tier 1/2/3 λͺ¨λ“  code chunk 포함. 1A-1 μ‹œμ μ—μ„œλŠ” indexed 된 code chunk κ°€ μ—†μ–΄ filter κ°€ 항상 빈 κ²°κ³Ό β€” 1A-2 (Rust AST chunker) λ¨Έμ§€ 이후 μ‹€νš¨. | | `kebab list docs` | μƒ‰μΈλœ λ¬Έμ„œ λͺ©λ‘ | | `kebab inspect doc ` / `kebab inspect chunk ` | raw record 보기 | @@ -132,7 +132,7 @@ flowchart TB subgraph Pipeline["도메인 + νŒŒμ΄ν”„λΌμΈ"] parse["parse-md / parse-pdf / parse-image / parse-code"] - chunker["chunker (md-heading-v1, pdf-page-v1, code-{rust,python,ts,js,go,java,kotlin}-ast-v1, k8s-manifest-resource-v1, dockerfile-file-v1, manifest-file-v1, code-text-paragraph-v1)"] + chunker["chunker (md-heading-v1, pdf-page-v1, code-{rust,python,ts,js,go,java,kotlin,c,cpp}-ast-v1, k8s-manifest-resource-v1, dockerfile-file-v1, manifest-file-v1, code-text-paragraph-v1)"] embedder["embedder (fastembed multilingual-e5-large)"] retriever["retriever (lexical / vector / hybrid RRF)"] rag["RAG pipeline"] diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 39a0941..dfa38db 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -22,7 +22,7 @@ Cargo workspace, ν•¨μˆ˜ 호좜 기반 λͺ¨λ“ˆλŸ¬ λͺ¨λ†€λ¦¬μŠ€. UI binary (`kebab- | OCR | Ollama vision LM (default `gemma4:e4b`) β€” `OcrEngine` trait 으둜 Tesseract / Apple Vision λ“± future swap (HOTFIXES P6-2) | | Image caption | Ollama vision LM, runtime gate `image.caption.enabled` (default OFF) | | PDF parser | `lopdf` per-page ν…μŠ€νŠΈ, `chunker_version = "pdf-page-v1"` κ°€ PDF μžμ‚°μ— ν•˜λ“œμ½”λ”© (HOTFIXES P7-3) | -| code parser | `tree-sitter` + `tree-sitter-rust` / `tree-sitter-python` / `tree-sitter-typescript` / `tree-sitter-javascript` / `tree-sitter-go` / `tree-sitter-java` / `tree-sitter-kotlin-ng` β€” **parser-side** (`kebab-parse-code`), chunker-side μ•„λ‹˜ (design Β§6.3). chunker versions: Rust = `code-rust-ast-v1`, Python = `code-python-ast-v1`, TypeScript = `code-ts-ast-v1`, JavaScript = `code-js-ast-v1`, Go = `code-go-ast-v1`, Java = `code-java-ast-v1`, Kotlin = `code-kotlin-ast-v1`. `ast_chunk_max_lines = 200` μƒμˆ˜ κ³ μ • (HOTFIXES 2026-05-19 β€” Chunker trait 이 per-medium config λ―Έλ…ΈμΆœ). Kotlin grammar 은 `tree-sitter-kotlin-ng` μ‚¬μš© β€” bare `tree-sitter-kotlin` 은 tree-sitter 0.21–0.23 에 κ³ μ°©λ˜μ–΄ μžˆμ–΄ μ‚¬μš© λΆˆκ°€. **Tier 2 (p10-2)**: YAML/k8s β†’ `serde_yaml` + `k8s-manifest-resource-v1` (apiVersion+kind per resource), Dockerfile β†’ `dockerfile-file-v1` (whole-file), Cargo.toml/go.mod/.json/.xml/.groovy β†’ `manifest-file-v1` (whole-file). Tier 2 chunkers live in `kebab-chunk`; no tree-sitter grammar needed (structure from file type, not AST). **Tier 3 (p10-3)**: shell scripts (`.sh`/`.bash`/`.zsh`) direct β†’ `code-text-paragraph-v1` (blank-line paragraph segmentation + 80-line / 20-overlap line-window for oversize). Same chunker also serves as fallback when Tier 1/2 emit 0 chunks or Err β€” non-k8s YAML / invalid YAML / AST extractor failures all picked up. symbol = None; lang preserved from input doc. | +| code parser | `tree-sitter` + `tree-sitter-rust` / `tree-sitter-python` / `tree-sitter-typescript` / `tree-sitter-javascript` / `tree-sitter-go` / `tree-sitter-java` / `tree-sitter-kotlin-ng` β€” **parser-side** (`kebab-parse-code`), chunker-side μ•„λ‹˜ (design Β§6.3). chunker versions: Rust = `code-rust-ast-v1`, Python = `code-python-ast-v1`, TypeScript = `code-ts-ast-v1`, JavaScript = `code-js-ast-v1`, Go = `code-go-ast-v1`, Java = `code-java-ast-v1`, Kotlin = `code-kotlin-ast-v1`. `ast_chunk_max_lines = 200` μƒμˆ˜ κ³ μ • (HOTFIXES 2026-05-19 β€” Chunker trait 이 per-medium config λ―Έλ…ΈμΆœ). Kotlin grammar 은 `tree-sitter-kotlin-ng` μ‚¬μš© β€” bare `tree-sitter-kotlin` 은 tree-sitter 0.21–0.23 에 κ³ μ°©λ˜μ–΄ μžˆμ–΄ μ‚¬μš© λΆˆκ°€. **Tier 2 (p10-2)**: YAML/k8s β†’ `serde_yaml` + `k8s-manifest-resource-v1` (apiVersion+kind per resource), Dockerfile β†’ `dockerfile-file-v1` (whole-file), Cargo.toml/go.mod/.json/.xml/.groovy β†’ `manifest-file-v1` (whole-file). Tier 2 chunkers live in `kebab-chunk`; no tree-sitter grammar needed (structure from file type, not AST). **Tier 3 (p10-3)**: shell scripts (`.sh`/`.bash`/`.zsh`) direct β†’ `code-text-paragraph-v1` (blank-line paragraph segmentation + 80-line / 20-overlap line-window for oversize). Same chunker also serves as fallback when Tier 1/2 emit 0 chunks or Err β€” non-k8s YAML / invalid YAML / AST extractor failures all picked up. symbol = None; lang preserved from input doc. **Tier 1 family complete (p10-1D)**: C (`tree-sitter-c`, `code-c-ast-v1`, `.c`/`.h`) + C++ (`tree-sitter-cpp`, `code-cpp-ast-v1`, `.cpp`/`.cc`/`.cxx`/`.hpp`/`.hh`/`.hxx`). C symbol = function name only; C++ symbol = `namespace::Class::method` (recursive nesting). `.h` κ°€ C++ syntax λ§Œλ‚˜λ©΄ tree-sitter-c parse μ‹€νŒ¨ β†’ Tier 3 fallback. | | 1B symbol path | workspace path β†’ module path: Python = dotted prefix (`kebab_eval.metrics.compute_mrr`), TypeScript/JavaScript = slash-style prefix (`src/Foo.Foo.search`). Rust 1A-2 λŠ” file-scope nesting 만 (workspace prefix μ—†μŒ, 비일관 수용 β€” HOTFIXES 2026-05-20). | | TUI | Ratatui + crossterm β€” P9-1 Library νŒ¨λ„, P9-2/3/4 μ§„ν–‰ μ˜ˆμ • | | Desktop | Tauri 2 + `pdfjs-dist` (native PDF render backend κΈˆμ§€) β€” P9-5 | @@ -52,7 +52,7 @@ flowchart TB ppdf["kebab-parse-pdf"] pimg["kebab-parse-image"] paud["kebab-parse-audio
(P8 보λ₯˜)"] - pcode["kebab-parse-code
(P10-1A-2 + P10-1B + P10-1C-Go + P10-1C-JK + P10-2 + P10-3)"] + pcode["kebab-parse-code
(P10-1A-2 + P10-1B + P10-1C-Go + P10-1C-JK + P10-2 + P10-3 + P10-1D)"] ptypes["kebab-parse-types"] norm["kebab-normalize"] chunk["kebab-chunk"] @@ -127,7 +127,7 @@ flowchart TB UI β†’ store/llm/parse 직접 의쑴 κΈˆμ§€. λͺ¨λ“  user-facing μ§„μž…μ€ `kebab-app` facade 만 ν†΅ν•œλ‹€ (frozen 섀계 Β§8). `kebab-cli` κ°€ `--config ` flag λ₯Ό honor ν•˜λ €λ©΄ `kebab_app::*_with_config(cfg, …)` companion 을 톡해 Config 을 λͺ…μ‹œμ μœΌλ‘œ thread ν•˜λŠ” νŒ¨ν„΄ β€” μžμ„Έν•œ μ΄μœ λŠ” [tasks/HOTFIXES.md](../tasks/HOTFIXES.md) 의 `--config` ν•­λͺ©. -`kebab-parse-code` 의 μ™ΈλΆ€ tree-sitter grammar crate 의쑴: P10-1A-2 μ—μ„œ `tree-sitter-rust` μΆ”κ°€, P10-1B μ—μ„œ `tree-sitter-python` / `tree-sitter-typescript` / `tree-sitter-javascript` μΆ”κ°€, P10-1C-Go μ—μ„œ `tree-sitter-go` μΆ”κ°€, P10-1C-JK μ—μ„œ `tree-sitter-java` / `tree-sitter-kotlin-ng` μΆ”κ°€. λͺ¨λ‘ `kebab-parse-code` μ—λ§Œ 격리 (facade λ£° β€” UI crate / chunker κ°€ 직접 import κΈˆμ§€). Kotlin 은 `tree-sitter-kotlin-ng` μ‚¬μš© (bare `tree-sitter-kotlin` 은 tree-sitter 0.21–0.23 에 κ³ μ°© β€” μ‚¬μš© λΆˆκ°€). +`kebab-parse-code` 의 μ™ΈλΆ€ tree-sitter grammar crate 의쑴: P10-1A-2 μ—μ„œ `tree-sitter-rust` μΆ”κ°€, P10-1B μ—μ„œ `tree-sitter-python` / `tree-sitter-typescript` / `tree-sitter-javascript` μΆ”κ°€, P10-1C-Go μ—μ„œ `tree-sitter-go` μΆ”κ°€, P10-1C-JK μ—μ„œ `tree-sitter-java` / `tree-sitter-kotlin-ng` μΆ”κ°€, P10-1D μ—μ„œ `tree-sitter-c` / `tree-sitter-cpp` μΆ”κ°€. λͺ¨λ‘ `kebab-parse-code` μ—λ§Œ 격리 (facade λ£° β€” UI crate / chunker κ°€ 직접 import κΈˆμ§€). Kotlin 은 `tree-sitter-kotlin-ng` μ‚¬μš© (bare `tree-sitter-kotlin` 은 tree-sitter 0.21–0.23 에 κ³ μ°© β€” μ‚¬μš© λΆˆκ°€). ## 디렉토리 ꡬ쑰 @@ -165,9 +165,11 @@ kebab/ β”‚ β”œβ”€β”€ kebab-source-fs/ # μ›Œν¬μŠ€νŽ˜μ΄μŠ€ walk + checksum (P1-1) β”‚ β”œβ”€β”€ kebab-parse-md/ # Markdown frontmatter + blocks (P1-2/3) β”‚ β”œβ”€β”€ kebab-normalize/ # ParsedBlock β†’ CanonicalDocument (P1-4) -β”‚ β”œβ”€β”€ kebab-chunk/ # heading-aware + pdf-page-v1 + code-*-ast-v1 (Tier 1) + k8s-manifest-resource-v1 + dockerfile-file-v1 + manifest-file-v1 + tier2_shared (P10-2) + code-text-paragraph-v1 (P10-3) chunker (P1-5, P7-2, P10-1A-2, P10-1B, P10-1C-Go, P10-1C-JK, P10-2, P10-3) +β”‚ β”œβ”€β”€ kebab-chunk/ # heading-aware + pdf-page-v1 + code-*-ast-v1 (Tier 1) + k8s-manifest-resource-v1 + dockerfile-file-v1 + manifest-file-v1 + tier2_shared (P10-2) + code-text-paragraph-v1 (P10-3) chunker (P1-5, P7-2, P10-1A-2, P10-1B, P10-1C-Go, P10-1C-JK, P10-2, P10-3, P10-1D) β”‚ β”‚ └── src/ -β”‚ β”‚ β”œβ”€β”€ code_*_ast_v1.rs # Tier 1 AST chunkers (rust/python/ts/js/go/java/kotlin) +β”‚ β”‚ β”œβ”€β”€ code_*_ast_v1.rs # Tier 1 AST chunkers (rust/python/ts/js/go/java/kotlin/c/cpp) +β”‚ β”‚ β”œβ”€β”€ code_c_ast_v1.rs # Tier 1 (p10-1D): C top-level fn / struct / enum / union +β”‚ β”‚ β”œβ”€β”€ code_cpp_ast_v1.rs # Tier 1 (p10-1D): C++ namespace::Class::method (recursive nesting) β”‚ β”‚ β”œβ”€β”€ k8s_manifest_resource_v1.rs # Tier 2 (p10-2): YAML multi-doc, apiVersion+kind per resource β”‚ β”‚ β”œβ”€β”€ dockerfile_file_v1.rs # Tier 2 (p10-2): whole-file Dockerfile β”‚ β”‚ β”œβ”€β”€ manifest_file_v1.rs # Tier 2 (p10-2): whole-file Cargo.toml / go.mod / .json / .xml / .groovy @@ -182,7 +184,7 @@ kebab/ β”‚ β”œβ”€β”€ kebab-eval/ # golden query runner + metrics (P5-1, P5-2) β”‚ β”œβ”€β”€ kebab-parse-image/ # ImageExtractor + Ollama OCR + caption (P6) β”‚ β”œβ”€β”€ kebab-parse-pdf/ # lopdf per-page text extractor (P7-1) -β”‚ β”œβ”€β”€ kebab-parse-code/ # tree-sitter AST extractors: Rust (P10-1A-2), Python + TypeScript + JavaScript (P10-1B), Go (P10-1C-Go), Java + Kotlin (P10-1C-JK β€” java.rs + kotlin.rs); chunker lives in kebab-chunk +β”‚ β”œβ”€β”€ kebab-parse-code/ # tree-sitter AST extractors: Rust (P10-1A-2), Python + TypeScript + JavaScript (P10-1B), Go (P10-1C-Go), Java + Kotlin (P10-1C-JK β€” java.rs + kotlin.rs), C + C++ (P10-1D β€” c.rs + cpp.rs); chunker lives in kebab-chunk β”‚ β”œβ”€β”€ kebab-app/ # facade (P0 μ‹œκ·Έλ‹ˆμ²˜ + P3-5/P6-4/P7-3 본체) β”‚ β”œβ”€β”€ kebab-tui/ # Ratatui shell + Library νŒ¨λ„ (P9-1) β”‚ β”œβ”€β”€ kebab-mcp/ # stdio MCP server β€” tools: schema, doctor, search, ask (P9-FB-30) diff --git a/docs/SMOKE.md b/docs/SMOKE.md index 52380b7..961ec0a 100644 --- a/docs/SMOKE.md +++ b/docs/SMOKE.md @@ -548,6 +548,54 @@ KB --json schema | jq '.stats.code_lang_breakdown' **Tier 3 citation.symbol μ»¨λ²€μ…˜**: 항상 `null`. 의미 λ‹¨μœ„ 식별 μ•ˆ 함. `lang` 은 원본 lang 보쑴 (shell β†’ `"shell"`, yaml β†’ `"yaml"` λ“±). +## P10-1D C + C++ AST chunkers + +P10-3 와 λ™μΌν•œ 격리 KB μ„€μ •. `.c` 와 `.cpp` 파일이 각자의 AST chunker 둜 μ²˜λ¦¬λœλ‹€. + +```bash +# 1) C 파일 β€” top-level function symbol +cat > /tmp/kebab-smoke/workspace/parser.c <<'EOF' +#include + +int parse_record(const char *line) { + if (line == NULL) return -1; + return 0; +} +EOF + +# 2) C++ 파일 β€” namespace::Class::method symbol +cat > /tmp/kebab-smoke/workspace/chunker.cpp <<'EOF' +namespace kebab { +namespace chunk { + +class Foo { +public: + void bar() { /* impl */ } +}; + +} // namespace chunk +} // namespace kebab +EOF + +# 3) ingest +KB ingest + +# 4) 언어별 검색 (citation.symbol 확인) +KB search --mode hybrid "parse_record" --code-lang c --json | \ + jq '{hits: [.hits[] | {symbol: .citation.symbol, lang: .citation.lang}]}' +# κΈ°λŒ€: symbol = "parse_record" (function name only), lang = "c" + +KB search --mode hybrid "bar" --code-lang cpp --json | \ + jq '{hits: [.hits[] | {symbol: .citation.symbol, lang: .citation.lang}]}' +# κΈ°λŒ€: symbol = "kebab::chunk::Foo" λ˜λŠ” "kebab::chunk::Foo::bar" (namespace::Class[::method]), lang = "cpp" + +# 5) schema stats 에 C/C++ 카운트 확인 +KB --json schema | jq '.stats.code_lang_breakdown' +# κΈ°λŒ€: {"c": N, "cpp": M, ...} +``` + +**Tier 1 (p10-1D) citation.symbol μ»¨λ²€μ…˜**: C λŠ” function name only (`parse_record` 같이 nesting μ—†μŒ). C++ λŠ” `namespace::Class::method` (recursive namespace + class nesting). `.h` 파일이 C++ syntax (namespace / template / class) λ§Œλ‚˜λ©΄ tree-sitter-c parse μ‹€νŒ¨ β†’ p10-3 Tier 3 fallback (`code-text-paragraph-v1`) 으둜 μžλ™ picked up. + ## 검증 체크리슀트 - `kebab doctor` κ°€ `--config` path λ₯Ό honor ν•˜κ³  κ·Έ μ•ˆμ˜ `storage.data_dir` λ₯Ό 좜λ ₯ (XDG default κ°€ μ•„λ‹˜). @@ -584,6 +632,7 @@ rm -rf /tmp/kebab-smoke # ν†΅μ§Έλ‘œ 정리 - (P10-1C-JK) `.java` νŒŒμΌμ€ `code-java-ast-v1`, `.kt`/`.kts` νŒŒμΌμ€ `code-kotlin-ast-v1` 둜 처리. `--code-lang java` / `--code-lang kotlin` 검색이 `citation.symbol` 에 `com.foo.Foo.bar` ν˜•μ‹ κ²°κ³Όλ₯Ό λ°˜ν™˜ν•˜λ©΄ wiring 정상. `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"java": N` / `"kotlin": N` λ“±μž₯ 확인. - (P10-2) `.yaml`/`.yml` νŒŒμΌμ€ apiVersion+kind νŒŒμ‹±μœΌλ‘œ k8s resource 별 chunk 생성 (`k8s-manifest-resource-v1`). `Dockerfile`/`Dockerfile.*` λŠ” 전체 파일 단일 chunk (`dockerfile-file-v1`). `.toml`/`.json`/`.xml`/`.groovy`/`go.mod` λŠ” 전체 파일 단일 chunk (`manifest-file-v1`). `--code-lang yaml` / `--code-lang dockerfile` / `--code-lang toml` 검색이 `citation.symbol` 에 각각 `Deployment/default/my-app` / `` / `` ν˜•μ‹ κ²°κ³Όλ₯Ό λ°˜ν™˜ν•˜λ©΄ wiring 정상. `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"yaml": N` / `"dockerfile": N` / `"toml": N` λ“±μž₯ 확인. - (P10-3) `.sh`/`.bash`/`.zsh` νŒŒμΌμ€ direct Tier 3 (`code-text-paragraph-v1`). λΉ„-k8s YAML (apiVersion+kind μ—†λŠ” yaml) 은 k8s chunker κ°€ 0 chunk β†’ Tier 3 fallback 으둜 picked up. `--code-lang shell` / `--code-lang yaml` 검색이 `citation.symbol = null`, `chunker_version = "code-text-paragraph-v1"` κ²°κ³Όλ₯Ό λ°˜ν™˜ν•˜λ©΄ wiring 정상. `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"shell": N` λ“±μž₯ 확인. +- (P10-1D) `.c` / `.h` νŒŒμΌμ€ `code-c-ast-v1` (function name only symbol). `.cpp`/`.cc`/`.cxx`/`.hpp`/`.hh`/`.hxx` λŠ” `code-cpp-ast-v1` (`namespace::Class::method` symbol). `--code-lang c` / `--code-lang cpp` 검색 λ™μž‘ + `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"c": N` / `"cpp": M` λ“±μž₯ 확인. `.h` 파일이 C++ λ‚΄μš© (namespace λ“±) κ°–κ³  있으면 μžλ™μœΌλ‘œ Tier 3 (`code-text-paragraph-v1`) fallback 으둜 picked up. - (P7-3 + follow-up) 동일 path 에 byte κ°€ λ‹€λ₯Έ PDF λ₯Ό 두 번째 ingest ν•˜λ©΄ `purge_vector_orphans_for_workspace_path` κ°€ μ˜› chunk_id λ₯Ό LanceDB μ—μ„œ λ¨Όμ € μ‚­μ œ, μ΄μ–΄μ„œ `purge_orphan_at_workspace_path` κ°€ μ˜› doc / chunks / embedding_records λ₯Ό SQLite μ—μ„œ sweep. μƒˆ byte κ°€ μƒˆ `doc_id` 둜 색인됨. `IngestReport` 에 κ·Έ μžμ‚°λ§Œ `new+=1` (λ‹€λ₯Έ μžμ‚°μ€ `updated`). 두 store λͺ¨λ‘ μ •ν•© β€” μ˜› λ³Έλ¬Έ 검색 μ‹œ μ˜› chunks κ°€ 더 이상 surface λ˜μ§€ μ•ŠμŒ. ### Embedding upgrade (fb-39b) diff --git a/tasks/INDEX.md b/tasks/INDEX.md index 7bf62ac..e02038e 100644 --- a/tasks/INDEX.md +++ b/tasks/INDEX.md @@ -144,7 +144,7 @@ P0~P5 λŠ” 직렬. P6~P9 λŠ” P5 이후 병렬 κ°€λŠ₯. - [p10-1B Python + TS/JS AST chunkers](p10/p10-1b-py-ts-js-ast-chunkers.md) β€” 🟑 PR μ˜€ν”ˆ (μ½”λ“œ μ™„μ„±, λ¨Έμ§€ λŒ€κΈ°) - p10-1C-Go Go AST chunker β€” 🟑 PR μ˜€ν”ˆ (v0.12.0, `code-go-ast-v1`) - p10-1C-JavaKotlin Java + Kotlin AST chunkers β€” 🟒 PR μ˜€ν”ˆ (v0.13.0, `code-java-ast-v1` / `code-kotlin-ast-v1`) - - p10-1D C + C++ AST chunkers β€” ⏳ + - p10-1D C + C++ AST chunkers β€” βœ… λ¨Έμ§€ (v0.16.0, `code-c-ast-v1` + `code-cpp-ast-v1`) - p10-2 Tier 2 resource-aware β€” βœ… λ¨Έμ§€ (v0.14.0, `k8s-manifest-resource-v1` / `dockerfile-file-v1` / `manifest-file-v1`) - p10-3 Tier 3 paragraph + line-window fallback β€” βœ… λ¨Έμ§€ (v0.15.0, `code-text-paragraph-v1`) diff --git a/tasks/p10/INDEX.md b/tasks/p10/INDEX.md index f2bb2c9..e2c762f 100644 --- a/tasks/p10/INDEX.md +++ b/tasks/p10/INDEX.md @@ -7,7 +7,7 @@ | 1B | Python + TS/JS AST chunkers | 🟑 PR μ˜€ν”ˆ (μ½”λ“œ μ™„μ„±, λ¨Έμ§€ λŒ€κΈ°) | | 1C-Go | Go AST chunker (`code-go-ast-v1`) | 🟑 PR μ˜€ν”ˆ (v0.12.0) | | 1C-JavaKotlin | Java + Kotlin AST chunkers (`code-java-ast-v1` / `code-kotlin-ast-v1`) | 🟒 PR μ˜€ν”ˆ (v0.13.0) | -| 1D | C + C++ AST chunkers | ⏳ | +| 1D | C + C++ AST chunkers | βœ… λ¨Έμ§€ (v0.16.0) | | 2 | Tier 2 resource-aware (k8s / Dockerfile / manifest) | βœ… λ¨Έμ§€ (v0.14.0) | | 3 | Tier 3 paragraph + line-window fallback | βœ… λ¨Έμ§€ (v0.15.0) | From 86aa180ad74504ebb06d96ebe6a9dc90ae2dcd7d Mon Sep 17 00:00:00 2001 From: altair823 Date: Thu, 21 May 2026 15:38:00 +0000 Subject: [PATCH 13/13] =?UTF-8?q?chore:=20bump=20version=200.15.0=20?= =?UTF-8?q?=E2=86=92=200.16.0=20(p10-1d=20C=20+=20C++=20AST=20chunkers)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Minor bump β€” additive new chunker_versions code-c-ast-v1 + code-cpp-ast-v1 + new routing langs c / cpp + new tree-sitter-c / tree-sitter-cpp workspace deps. P10 Tier 1 chunker family complete. No DB migration, no wire schema major bump. Also lands the missing p10-3 try_skip_unchanged fallback-aware fix (Option B1 β€” 7th param) that PR #155 was supposed to ship but never made it to main (implementer reported commit SHA 2a39513 that didn't exist in the merged branch). Same commit extends tier3_fallback_cv to include c/cpp. Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 46 +++++++++++++++++++++++----------------------- Cargo.toml | 2 +- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4a82f3b..73f9b16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4127,7 +4127,7 @@ dependencies = [ [[package]] name = "kebab-app" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "base64 0.22.1", @@ -4172,7 +4172,7 @@ dependencies = [ [[package]] name = "kebab-chunk" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "blake3", @@ -4188,7 +4188,7 @@ dependencies = [ [[package]] name = "kebab-cli" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "clap", @@ -4209,7 +4209,7 @@ dependencies = [ [[package]] name = "kebab-config" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "dirs 5.0.1", @@ -4224,7 +4224,7 @@ dependencies = [ [[package]] name = "kebab-core" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "blake3", @@ -4238,7 +4238,7 @@ dependencies = [ [[package]] name = "kebab-embed" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "blake3", @@ -4252,7 +4252,7 @@ dependencies = [ [[package]] name = "kebab-embed-local" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "fastembed", @@ -4265,7 +4265,7 @@ dependencies = [ [[package]] name = "kebab-eval" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "kebab-app", @@ -4284,7 +4284,7 @@ dependencies = [ [[package]] name = "kebab-llm" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "kebab-core", @@ -4293,7 +4293,7 @@ dependencies = [ [[package]] name = "kebab-llm-local" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "kebab-config", @@ -4310,7 +4310,7 @@ dependencies = [ [[package]] name = "kebab-mcp" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "kebab-app", @@ -4328,7 +4328,7 @@ dependencies = [ [[package]] name = "kebab-normalize" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "kebab-core", @@ -4343,7 +4343,7 @@ dependencies = [ [[package]] name = "kebab-parse-code" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "gix", @@ -4366,7 +4366,7 @@ dependencies = [ [[package]] name = "kebab-parse-image" -version = "0.15.0" +version = "0.16.0" dependencies = [ "ab_glyph", "anyhow", @@ -4390,7 +4390,7 @@ dependencies = [ [[package]] name = "kebab-parse-md" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "kebab-core", @@ -4407,7 +4407,7 @@ dependencies = [ [[package]] name = "kebab-parse-pdf" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "blake3", @@ -4420,7 +4420,7 @@ dependencies = [ [[package]] name = "kebab-parse-types" -version = "0.15.0" +version = "0.16.0" dependencies = [ "kebab-core", "serde", @@ -4428,7 +4428,7 @@ dependencies = [ [[package]] name = "kebab-rag" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "blake3", @@ -4449,7 +4449,7 @@ dependencies = [ [[package]] name = "kebab-search" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "globset", @@ -4468,7 +4468,7 @@ dependencies = [ [[package]] name = "kebab-source-fs" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "blake3", @@ -4487,7 +4487,7 @@ dependencies = [ [[package]] name = "kebab-store-sqlite" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "blake3", @@ -4508,7 +4508,7 @@ dependencies = [ [[package]] name = "kebab-store-vector" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "arrow", @@ -4532,7 +4532,7 @@ dependencies = [ [[package]] name = "kebab-tui" -version = "0.15.0" +version = "0.16.0" dependencies = [ "anyhow", "crossterm", diff --git a/Cargo.toml b/Cargo.toml index 42d34a5..3a5c1d5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,7 @@ edition = "2024" rust-version = "1.85" license = "MIT OR Apache-2.0" repository = "https://github.com/altair823/kebab" -version = "0.15.0" +version = "0.16.0" [workspace.dependencies] anyhow = "1"