diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 19b77e5..e4ad89c 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -880,6 +880,22 @@ fn try_skip_unchanged( // logic self-documenting and guards against future id_for_doc // changes. if existing_doc.parser_version != *current_parser_version { + // v0.17.0 PR-B: parser_version bump cascade. Same bytes (same + // asset_id) → asset-keyed `stale_chunk_ids_at` is a no-op, but + // the stale `documents` row at this workspace_path still + // collides with `idx_docs_workspace_path` on the next INSERT + // and the LanceDB rows under the old chunk_ids orphan. Sweep + // both stores here, before returning Ok(None), so the caller's + // full-ingest path lands a clean slate. The `keep_doc_id = ""` + // sentinel removes every doc at this path (the new doc_id is + // not yet known here — it's computed downstream from the new + // PARSER_VERSION). + purge_workspace_path_for_parser_bump(app, asset).with_context(|| { + format!( + "parser-bump orphan purge at {}", + asset.workspace_path.0 + ) + })?; return Ok(None); } // 3. Chunker unchanged. @@ -1486,6 +1502,53 @@ fn record_image_analysis_failure( warning_notes.push(note); } +/// v0.17.0 PR-B: parser-bump cascade. When a code extractor ships a +/// new `PARSER_VERSION` (e.g. `code-c-v1` → `code-c-v2`), the same +/// (workspace_path, asset_id) pair re-emerges with a fresh `doc_id`. +/// The existing asset-keyed [`purge_vector_orphans_for_workspace_path`] +/// only fires on asset_id changes (file bytes edited) and is a no-op +/// here. Without an explicit doc-keyed sweep the next INSERT raises +/// `idx_docs_workspace_path` UNIQUE and the LanceDB rows under the +/// stale chunk_ids orphan. This helper: +/// +/// 1. Fetches every stale chunk_id at `workspace_path` from SQLite +/// (`keep_doc_id = ""` means "all existing docs are stale" — +/// `try_skip_unchanged` calls this before the new doc_id is +/// computed). +/// 2. 
Deletes the matching vectors from every Lance table (no-op if +/// embeddings are disabled). +/// 3. Sweeps the SQLite `documents` row (CASCADE drops `blocks` / +/// `chunks` / `embedding_records`). The `assets` row stays — same +/// bytes, same asset_id, only the derived `doc_id` changed. +fn purge_workspace_path_for_parser_bump( + app: &App, + asset: &RawAsset, +) -> anyhow::Result<()> { + let path = &asset.workspace_path.0; + let stale = app + .sqlite + .stale_chunk_ids_for_workspace_path_except_doc_id(path, "") + .context("SqliteStore::stale_chunk_ids_for_workspace_path_except_doc_id")?; + if !stale.is_empty() { + if let Some(vec_store) = app.vector().context("App::vector")? { + use kebab_core::VectorStore as _; + vec_store + .delete_by_chunk_ids(&stale) + .context("VectorStore::delete_by_chunk_ids (parser-bump orphans)")?; + } + } + app.sqlite + .purge_document_at_workspace_path_except_doc_id(path, "") + .context("SqliteStore::purge_document_at_workspace_path_except_doc_id")?; + tracing::debug!( + target: "kebab-app", + path = %path, + count = stale.len(), + "purged orphan vectors + document for parser_version bump" + ); + Ok(()) +} + /// HOTFIXES 2026-05-02 P7-3 follow-up: when a tracked file's bytes /// change, `purge_orphan_at_workspace_path` (in `kebab-store-sqlite`) /// sweeps the SQLite chain (documents → blocks / chunks / embedding_records) diff --git a/crates/kebab-app/tests/code_ingest_smoke.rs b/crates/kebab-app/tests/code_ingest_smoke.rs index 793ece1..28d1d05 100644 --- a/crates/kebab-app/tests/code_ingest_smoke.rs +++ b/crates/kebab-app/tests/code_ingest_smoke.rs @@ -1145,8 +1145,8 @@ fn tier1_c_ingest_searchable() { .expect("parser.c item present"); assert_eq!( c_item.parser_version.as_ref().map(|p| p.0.as_str()), - Some("code-c-v1"), - "parser_version must be code-c-v1" + Some("code-c-v2"), + "parser_version must be code-c-v2 (v0.17.0 PR-B: typedef-wrapped struct/enum/union 이 typedef alias unit 으로 방출)" ); assert_eq!( 
c_item.chunker_version.as_ref().map(|c| c.0.as_str()), diff --git a/crates/kebab-parse-code/src/c.rs b/crates/kebab-parse-code/src/c.rs index 057ed50..ce7e1a3 100644 --- a/crates/kebab-parse-code/src/c.rs +++ b/crates/kebab-parse-code/src/c.rs @@ -31,7 +31,7 @@ use time::OffsetDateTime; use crate::scaffold::{filename_from_workspace_path, strip_extension}; -pub const PARSER_VERSION: &str = "code-c-v1"; +pub const PARSER_VERSION: &str = "code-c-v2"; /// C AST extractor. Per-unit blocks via tree-sitter-c 0.24.2 /// (`LANGUAGE: LanguageFn`) parsed by tree-sitter 0.26. @@ -257,13 +257,33 @@ fn build_blocks( flush_glue(&mut glue, &mut units); units.push((name.to_string(), s, e, true)); } else { - // Anonymous struct/enum/union — glue. + // Anonymous struct/enum/union at the top level (not + // wrapped in typedef) — glue. typedef-wrapped case + // is recovered in the `type_definition` arm below. glue.push((s, e)); } } - // Everything else: preprocessor directives, declarations - // (typedef / global var / fn prototype), type_definition, - // linkage_specification, etc. — all collapse into glue. + "type_definition" => { + // v0.17.0 PR-B: typedef-wrapped anonymous aggregate + // recovery. `typedef struct { ... } Foo;` exposes only + // the alias `Foo` as a useful symbol — the inner + // struct_specifier has no `name` field. Pre-v0.17.0 + // this whole construct collapsed into glue and hid the + // alias from search (HOTFIXES 2026-05-21). v2 recovers + // the alias from the `declarator` field and emits a + // synthetic unit so `Citation::Code.symbol = "Foo"`. + // Plain `typedef int MyInt;` (no inner aggregate) stays + // glue — there's no struct body to name. + if let Some(name) = recover_typedef_alias(child, source) { + flush_glue(&mut glue, &mut units); + units.push((name, s, e, true)); + } else { + glue.push((s, e)); + } + } + // Everything else: preprocessor directives, plain declarations + // (global var / fn prototype), linkage_specification, etc. 
+ // — all collapse into glue. _ => { glue.push((s, e)); } @@ -323,6 +343,62 @@ fn build_blocks( Ok(blocks) } +/// v0.17.0 PR-B: try to recover the typedef alias name from a +/// `type_definition` node *iff* the inner type-specifier is an +/// anonymous struct/enum/union. Returns `None` for any other shape +/// (typedef over a *named* aggregate stays glue; a plain type alias has +/// no body worth naming). +fn recover_typedef_alias(node: tree_sitter::Node, source: &str) -> Option { + let mut has_anon_aggregate = false; + let mut cursor = node.walk(); + for sub in node.children(&mut cursor) { + match sub.kind() { + "struct_specifier" | "enum_specifier" | "union_specifier" => { + if sub.child_by_field_name("name").is_none() { + has_anon_aggregate = true; + } else { + // Named inner aggregate (e.g. `typedef struct Pt {...} P;`) + // — the named struct itself is the primary symbol and + // is *not* extracted at the top level today (it lives + // inside `type_definition`, not as a sibling + // `struct_specifier`). For v2 we keep behavior conservative: + // return None so the type_definition stays glue, matching + // pre-v2 behavior for this minor case. Real-world C tends + // to use one of: bare named struct, typedef alias only, + // or typedef on anonymous body — the latter is what we fix. + return None; + } + } + _ => {} + } + } + if !has_anon_aggregate { + return None; + } + let decl = node.child_by_field_name("declarator")?; + extract_typedef_alias_name(decl, source).map(str::to_string) +} + +/// Extract the typedef alias identifier from a declarator subtree. +/// Handles the common shapes: direct `type_identifier`, or one wrapped +/// in pointer / function declarator nodes (the alias is taken to be the +/// first `type_identifier` met in a depth-first, left-to-right walk).
+fn extract_typedef_alias_name<'a>( + decl: tree_sitter::Node, + source: &'a str, +) -> Option<&'a str> { + if decl.kind() == "type_identifier" { + return Some(&source[decl.start_byte()..decl.end_byte()]); + } + let mut cursor = decl.walk(); + for sub in decl.children(&mut cursor) { + if let Some(found) = extract_typedef_alias_name(sub, source) { + return Some(found); + } + } + None +} + fn flush_glue(glue: &mut Vec<(u32, u32)>, units: &mut Vec<(String, u32, u32, bool)>) { if glue.is_empty() { return; @@ -489,20 +565,72 @@ mod tests { } #[test] - fn c_extractor_typedef_struct_falls_into_glue() { - // typedef struct { ... } Foo; — inner struct_specifier is anonymous, - // outer node is type_definition → glue. See HOTFIXES.md 2026-05-21. + fn c_extractor_typedef_struct_emits_unit() { + // v0.17.0 PR-B: `typedef struct { ... } Foo;` was previously a + // hotfix-tracked deviation (HOTFIXES.md 2026-05-21) — the inner + // struct_specifier is anonymous so the named-struct arm didn't + // fire, dropping the whole construct into glue and hiding the + // `Foo` alias from symbol search. The v2 extractor recovers the + // typedef alias from the `declarator` field on the + // `type_definition` node and emits a synthetic unit with that + // name. parser_version bumped `code-c-v1` → `code-c-v2`. let src = "typedef struct { int x; int y; } Point;\n"; let doc = tests_support::extract_c(src, "x/typedef.c"); let s = syms(&doc); + // The typedef alias surfaces as a Code symbol. + assert!( + s.iter().any(|x| x == "Point"), + "expected 'Point' unit from typedef alias: {s:?}" + ); + // No `` (the file has exactly one semantic unit now, + // the typedef alias — no glue-only fallback needed). + assert!( + !s.iter().any(|x| x == ""), + "no fallback expected when typedef emits a unit: {s:?}" + ); + } + + #[test] + fn c_extractor_typedef_enum_emits_unit() { + // Parallel coverage for enum_specifier — same typedef-alias + // synthesis path. `typedef enum { A, B } Color;` → unit `Color`. 
+ let src = "typedef enum { A, B } Color;\n"; + let doc = tests_support::extract_c(src, "x/typedef_enum.c"); + let s = syms(&doc); + assert!( + s.iter().any(|x| x == "Color"), + "expected 'Color' unit from typedef enum alias: {s:?}" + ); + } + + #[test] + fn c_extractor_typedef_union_emits_unit() { + // Parallel coverage for union_specifier. + let src = "typedef union { int i; float f; } IntOrFloat;\n"; + let doc = tests_support::extract_c(src, "x/typedef_union.c"); + let s = syms(&doc); + assert!( + s.iter().any(|x| x == "IntOrFloat"), + "expected 'IntOrFloat' unit from typedef union alias: {s:?}" + ); + } + + #[test] + fn c_extractor_typedef_to_existing_type_stays_glue() { + // Negative case: `typedef int MyInt;` has no inner struct/enum/ + // union — there's no struct body to attach the alias to, so the + // construct falls into glue (becomes `` when alone). + // Confirms the new arm only fires for anonymous-struct typedef. + let src = "typedef int MyInt;\n"; + let doc = tests_support::extract_c(src, "x/typedef_alias.c"); + let s = syms(&doc); assert!( s.iter().any(|x| x == ""), - "expected for typedef struct: {s:?}" + "expected for plain typedef alias: {s:?}" ); - // The typedef alias should NOT surface as a Code symbol assert!( - !s.iter().any(|x| x == "Point"), - "unexpected 'Point' unit for typedef struct: {s:?}" + !s.iter().any(|x| x == "MyInt"), + "plain typedef alias must not emit a unit: {s:?}" ); } diff --git a/crates/kebab-store-sqlite/src/store.rs b/crates/kebab-store-sqlite/src/store.rs index aa1ff19..c1e3a9b 100644 --- a/crates/kebab-store-sqlite/src/store.rs +++ b/crates/kebab-store-sqlite/src/store.rs @@ -464,6 +464,74 @@ impl SqliteStore { } Ok(out) } + + /// v0.17.0 PR-B: sister of [`Self::stale_chunk_ids_at`] for the + /// `parser_version` bump cascade. 
When `doc_id` depends on + /// `parser_version` (design §9) and an extractor ships a new + /// `PARSER_VERSION`, the next ingest computes a fresh `doc_id` for + /// the *same* `(workspace_path, asset_id)` pair. The existing + /// asset_id-keyed [`Self::stale_chunk_ids_at`] does NOT fire (same + /// asset), so the legacy `chunks` rows and their LanceDB shadows + /// would orphan. This helper queries by `workspace_path` instead, + /// excluding the freshly-computed `keep_doc_id` so a re-entry + /// during the same ingest doesn't re-sweep the new row. + /// + /// Caller usage: pass the *new* `doc_id` if known; pass an empty + /// string when called before the new INSERT (the case in + /// `try_skip_unchanged`) — all existing docs at `workspace_path` + /// are then collected as stale. + pub fn stale_chunk_ids_for_workspace_path_except_doc_id( + &self, + workspace_path: &str, + keep_doc_id: &str, + ) -> Result> { + let conn = self.lock_conn(); + let mut stmt = conn + .prepare( + "SELECT c.chunk_id + FROM chunks c + INNER JOIN documents d ON c.doc_id = d.doc_id + WHERE d.workspace_path = ?1 AND d.doc_id != ?2", + ) + .map_err(StoreError::from)?; + let rows = stmt + .query_map(params![workspace_path, keep_doc_id], |row| { + row.get::<_, String>(0) + }) + .map_err(StoreError::from)?; + let mut out: Vec = Vec::new(); + for row in rows { + let id = row.map_err(StoreError::from)?; + out.push(kebab_core::ChunkId(id)); + } + Ok(out) + } + + /// v0.17.0 PR-B: sweep the SQLite document chain (`documents` → + /// `blocks` / `chunks` / `embedding_records` via CASCADE) for every + /// row at `workspace_path` whose `doc_id` differs from `keep_doc_id`. + /// Pair with [`Self::stale_chunk_ids_for_workspace_path_except_doc_id`] + /// — caller fetches the chunk_ids first, hands them to + /// `VectorStore::delete_by_chunk_ids`, then calls this sweep. + /// `assets` row is preserved (same bytes, same asset_id — only the + /// derived `doc_id` changed). 
+ /// + /// `keep_doc_id = ""` deletes every doc at `workspace_path` + /// (semantics mirror the sister helper above — used by + /// `try_skip_unchanged` before the new INSERT exists). + pub fn purge_document_at_workspace_path_except_doc_id( + &self, + workspace_path: &str, + keep_doc_id: &str, + ) -> Result<()> { + let conn = self.lock_conn(); + conn.execute( + "DELETE FROM documents WHERE workspace_path = ?1 AND doc_id != ?2", + params![workspace_path, keep_doc_id], + ) + .map_err(StoreError::from)?; + Ok(()) + } } /// Sweep stale `assets` + `documents` + downstream rows when the file diff --git a/tasks/HOTFIXES.md b/tasks/HOTFIXES.md index 023126c..598d0c7 100644 --- a/tasks/HOTFIXES.md +++ b/tasks/HOTFIXES.md @@ -64,6 +64,20 @@ multi-root 도그푸딩(2026-05-20)에서 관찰한 본문 vs 테스트 / glue c Cross-link: `tasks/p10/INDEX.md`, `migrations/V002__fts.sql`, design §5.5 / §3.5. +## 2026-05-24 — v0.17.0 PR-B: C typedef-wrapped struct/enum/union 이 typedef alias unit 으로 방출 (closure of 2026-05-21) + +`crates/kebab-parse-code/src/c.rs::build_blocks` 에 `type_definition` 분기 추가. 내부 anonymous `struct_specifier` / `enum_specifier` / `union_specifier` (name field 없음) 인 typedef 일 때 declarator 의 typedef alias identifier 를 추출해 synthetic unit 방출. named inner aggregate (`typedef struct Pt { ... } P;`) 와 plain alias (`typedef int MyInt;`) 는 기존대로 glue (top-level typedef-wrapped anonymous aggregate 만 v2 의 1차 범위). + +**parser_version cascade**: `PARSER_VERSION` `code-c-v1` → `code-c-v2` bump. design §9 — `doc_id = (workspace_path, asset_id, parser_version)`. 같은 file (asset_id 불변) + 새 parser_version → 새 doc_id. 즉 같은 workspace_path 에 옛 doc_id 와 새 doc_id 가 동시 INSERT 시도 → `idx_docs_workspace_path` UNIQUE 충돌.
+ +**Same-workspace_path orphan purge (B1 Step 5b)**: `crates/kebab-store-sqlite/src/store.rs` 에 두 helper 신규 — `stale_chunk_ids_for_workspace_path_except_doc_id(workspace_path, keep_doc_id)` (chunk_ids 수집) + `purge_document_at_workspace_path_except_doc_id(workspace_path, keep_doc_id)` (CASCADE document/chunks 제거). `crates/kebab-app/src/lib.rs::try_skip_unchanged` 의 parser_mismatch 분기에서 `purge_workspace_path_for_parser_bump` wrapper 호출 → 옛 chunk_ids 의 LanceDB orphan 도 `delete_by_chunk_ids` 로 정리 후 SQLite document row 제거 → 이후 `Ok(None)` 반환 → caller 가 새 doc_id 로 INSERT. 기존 `purge_orphan_at_workspace_path` (asset_id 변경 케이스) 는 그대로 — bytes 변경 경로 회귀 없음. + +**사용자 영향**: 기존 v0.16.x KB 의 C 파일은 v0.17.0 binary 로 다음 ingest 시 자동 재처리 (parser_version mismatch → cleanup → 새 doc). 명시적 re-ingest 명령 불필요 (다음 `kebab ingest` 가 자연스럽게 처리). `typedef struct {...} Foo;` 가 `Citation::Code.symbol = "Foo"` 로 search 에 노출. + +**미해결 (Risks)**: nested typedef (`typedef struct { struct {...} inner; } Outer;`) 의 inner 익명 struct 는 여전히 glue — v2 의 1차 범위는 top-level typedef alias 만. + +Cross-link: `crates/kebab-parse-code/src/c.rs::recover_typedef_alias`, `tasks/p10/p10-1d-c-cpp-ast-chunker.md` Risks/notes section. + ## 2026-05-21 — p10-2: k8s multi-resource YAML chunk_id collision **Origin**: P10 종합 도그푸딩 (`/tmp/kebab-p10-dogfood/`, 16 파일). 한 파일에 2+ k8s document (Deployment + Service, `---` 구분) 인 YAML 이 ingest 실패. @@ -84,11 +98,11 @@ Cross-link: `tasks/p10/p10-2-tier2-resource-aware.md` Risks/notes section. **Symptom**: `typedef struct { ... } Foo;` in a `.c` file does NOT emit a struct-level unit. tree-sitter-c classifies the construct as a top-level `type_definition` with an *anonymous* inner `struct_specifier` (no `name` field), so the extractor's `struct_specifier` arm doesn't fire — the whole declaration falls into `` glue. The named typedef alias `Foo` is therefore not searchable as a symbol. 
-**Status**: Consistent with spec p10-1d-c-cpp-ast-chunker.md's Risks/notes ("Anonymous union / struct … anonymous → glue"), but the spec's main body line 22 ("struct_specifier (named, top-level) → 1 unit") suggests this idiom WOULD emit. Tension noted, not yet fixed. +**Status**: ✅ closed — v0.17.0 (2026-05-24) PR-B 에서 extractor 의 `type_definition` 분기 추가로 해소. 영향은 위 2026-05-24 PR-B 절 참조. 이하는 closure 전 round-2 dogfood 관찰 기록 (frozen). -**Workaround**: search the struct by its field/function names, or use `--code-lang c` to broaden scope. Typedef-aliased struct names won't surface as `Citation::Code.symbol`. +**Workaround (pre-v0.17.0)**: search the struct by its field/function names, or use `--code-lang c` to broaden scope. Typedef-aliased struct names won't surface as `Citation::Code.symbol`. -**Next step**: dogfood real C code for a week+; if this turns out to be a frequent pain point (kernel-style code, libuv, etc.), revisit the extractor to detect `type_definition` → inner `struct_specifier` and emit a synthetic unit named after the typedef alias. +**Resolution (v0.17.0)**: extractor 가 top-level `type_definition` 노드를 만나 내부 anonymous `struct_specifier` / `enum_specifier` / `union_specifier` 가 있으면 `declarator` field 의 typedef alias 이름으로 synthetic unit 방출. `PARSER_VERSION` `code-c-v1` → `code-c-v2` bump. design §9 cascade 동작 — 같은 `(workspace_path, asset_id)` 의 `doc_id` 가 새 parser_version 으로 다르게 계산됨. 옛 doc/chunks row + LanceDB orphan 회피용 same-workspace_path orphan purge helper 동반 (`stale_chunk_ids_for_workspace_path_except_doc_id` + `purge_document_at_workspace_path_except_doc_id`). Cross-link: `tasks/p10/p10-1d-c-cpp-ast-chunker.md` Risks/notes section. diff --git a/tasks/p10/p10-1d-c-cpp-ast-chunker.md b/tasks/p10/p10-1d-c-cpp-ast-chunker.md index cad3208..f60dfb6 100644 --- a/tasks/p10/p10-1d-c-cpp-ast-chunker.md +++ b/tasks/p10/p10-1d-c-cpp-ast-chunker.md @@ -113,7 +113,7 @@ crates/kebab-parse-code/Cargo.toml [edit] — 위 2 dep 신규 entry. 
- **Template specialization** (`template<> class Foo`): tree-sitter-cpp 의 `template_declaration` 안의 `class_specifier` name 만 추출 — `Foo` 만 symbol 에 들어가고 `` 미포함. design 의 generic 무시 룰 일관. - **`extern "C"` block 안의 fn**: 일반 fn 처리. 외부 wrapping block 은 glue. - **Anonymous union / struct** (`struct { int x; }` 변수 안에): 흔치 않음 + named 만 unit. anonymous 는 glue. -- **typedef-wrapped struct/enum idiom** (`typedef struct { ... } Foo;`) — anonymous inner struct → glue. Named typedef alias 미캡처. dogfood 후 HOTFIXES 검토. See [HOTFIXES.md 2026-05-21 entry](../HOTFIXES.md). +- **typedef-wrapped struct/enum idiom** (`typedef struct { ... } Foo;`) — ✅ v0.17.0 (2026-05-24) PR-B 에서 해소. extractor 의 `type_definition` 분기가 inner anonymous `struct_specifier` / `enum_specifier` / `union_specifier` 를 탐지해 declarator 의 typedef alias 이름으로 synthetic unit 방출. `PARSER_VERSION` `code-c-v1` → `code-c-v2` bump + same-workspace_path orphan purge cascade 동반. **잔여 미해결**: nested typedef (`typedef struct { struct {...} inner; } Outer;`) 의 inner 익명 struct 는 여전히 glue — v2 의 1차 범위는 top-level typedef alias 만. See [HOTFIXES.md 2026-05-21 entry](../HOTFIXES.md) (frozen 관찰) + 2026-05-24 closure entry. - **Macro-heavy code** (Linux kernel 등): `#define FOO(x) ...` 매크로가 function-like 라도 parser 가 fn 으로 인식 안 함. preprocessor glue 로 처리 — symbol 안 잡힘. 의도된 동작 (parser 의 macro expansion 안 함). - **`__attribute__((...))`** annotations: tree-sitter-c 의 attribute 노드는 declarator 옆 sibling. 무시 가능. function name 추출에 영향 없음. - **fixture 크기**: sample.c 는 ~30 line (top-level fn + struct + enum + preprocessor), sample.cpp 는 ~50 line (nested namespace + class + method + template + free fn). oversize fallback 의 별도 검증은 1A-2 의 long_section_snapshot 패턴이 이미 cover (필요 시 별도 fixture).