diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs
index 19b77e5..e4ad89c 100644
--- a/crates/kebab-app/src/lib.rs
+++ b/crates/kebab-app/src/lib.rs
@@ -880,6 +880,22 @@ fn try_skip_unchanged(
     // logic self-documenting and guards against future id_for_doc
     // changes.
     if existing_doc.parser_version != *current_parser_version {
+        // v0.17.0 PR-B: parser_version bump cascade. Same bytes (same
+        // asset_id) → asset-keyed `stale_chunk_ids_at` is a no-op, but
+        // the stale `documents` row at this workspace_path still
+        // collides with `idx_docs_workspace_path` on the next INSERT
+        // and the LanceDB rows under the old chunk_ids orphan. Sweep
+        // both stores here, before returning Ok(None), so the caller's
+        // full-ingest path lands a clean slate. The `keep_doc_id = ""`
+        // sentinel removes every doc at this path (the new doc_id is
+        // not yet known here — it's computed downstream from the new
+        // PARSER_VERSION).
+        purge_workspace_path_for_parser_bump(app, asset).with_context(|| {
+            format!(
+                "parser-bump orphan purge at {}",
+                asset.workspace_path.0
+            )
+        })?;
         return Ok(None);
     }
     // 3. Chunker unchanged.
@@ -1486,6 +1502,53 @@ fn record_image_analysis_failure(
     warning_notes.push(note);
 }
 
+/// v0.17.0 PR-B: parser-bump cascade. When a code extractor ships a
+/// new `PARSER_VERSION` (e.g. `code-c-v1` → `code-c-v2`), the same
+/// (workspace_path, asset_id) pair re-emerges with a fresh `doc_id`.
+/// The existing asset-keyed [`purge_vector_orphans_for_workspace_path`]
+/// only fires on asset_id changes (file bytes edited) and is a no-op
+/// here. Without an explicit doc-keyed sweep the next INSERT raises
+/// `idx_docs_workspace_path` UNIQUE and the LanceDB rows under the
+/// stale chunk_ids orphan. This helper:
+///
+/// 1. Fetches every stale chunk_id at `workspace_path` from SQLite
+///    (`keep_doc_id = ""` means "all existing docs are stale" —
+///    `try_skip_unchanged` calls this before the new doc_id is
+///    computed).
+/// 2. Deletes the matching vectors from every Lance table (no-op if
+///    embeddings are disabled).
+/// 3. Sweeps the SQLite `documents` row (CASCADE drops `blocks` /
+///    `chunks` / `embedding_records`). The `assets` row stays — same
+///    bytes, same asset_id, only the derived `doc_id` changed.
+fn purge_workspace_path_for_parser_bump(
+    app: &App,
+    asset: &RawAsset,
+) -> anyhow::Result<()> {
+    let path = &asset.workspace_path.0;
+    let stale = app
+        .sqlite
+        .stale_chunk_ids_for_workspace_path_except_doc_id(path, "")
+        .context("SqliteStore::stale_chunk_ids_for_workspace_path_except_doc_id")?;
+    if !stale.is_empty() {
+        if let Some(vec_store) = app.vector().context("App::vector")? {
+            use kebab_core::VectorStore as _;
+            vec_store
+                .delete_by_chunk_ids(&stale)
+                .context("VectorStore::delete_by_chunk_ids (parser-bump orphans)")?;
+        }
+    }
+    app.sqlite
+        .purge_document_at_workspace_path_except_doc_id(path, "")
+        .context("SqliteStore::purge_document_at_workspace_path_except_doc_id")?;
+    tracing::debug!(
+        target: "kebab-app",
+        path = %path,
+        count = stale.len(),
+        "purged orphan vectors + document for parser_version bump"
+    );
+    Ok(())
+}
+
 /// HOTFIXES 2026-05-02 P7-3 follow-up: when a tracked file's bytes
 /// change, `purge_orphan_at_workspace_path` (in `kebab-store-sqlite`)
 /// sweeps the SQLite chain (documents → blocks / chunks / embedding_records)
diff --git a/crates/kebab-app/tests/code_ingest_smoke.rs b/crates/kebab-app/tests/code_ingest_smoke.rs
index 793ece1..28d1d05 100644
--- a/crates/kebab-app/tests/code_ingest_smoke.rs
+++ b/crates/kebab-app/tests/code_ingest_smoke.rs
@@ -1145,8 +1145,8 @@ fn tier1_c_ingest_searchable() {
         .expect("parser.c item present");
     assert_eq!(
         c_item.parser_version.as_ref().map(|p| p.0.as_str()),
-        Some("code-c-v1"),
-        "parser_version must be code-c-v1"
+        Some("code-c-v2"),
+        "parser_version must be code-c-v2 (v0.17.0 PR-B: typedef-wrapped struct/enum/union 이 typedef alias unit 으로 방출)"
     );
     assert_eq!(
         c_item.chunker_version.as_ref().map(|c| c.0.as_str()),
diff --git a/crates/kebab-parse-code/src/c.rs b/crates/kebab-parse-code/src/c.rs
index 057ed50..ce7e1a3 100644
--- a/crates/kebab-parse-code/src/c.rs
+++ b/crates/kebab-parse-code/src/c.rs
@@ -31,7 +31,7 @@ use time::OffsetDateTime;
 
 use crate::scaffold::{filename_from_workspace_path, strip_extension};
 
-pub const PARSER_VERSION: &str = "code-c-v1";
+pub const PARSER_VERSION: &str = "code-c-v2";
 
 /// C AST extractor. Per-unit blocks via tree-sitter-c 0.24.2
 /// (`LANGUAGE: LanguageFn`) parsed by tree-sitter 0.26.
@@ -257,13 +257,33 @@ fn build_blocks(
                     flush_glue(&mut glue, &mut units);
                     units.push((name.to_string(), s, e, true));
                 } else {
-                    // Anonymous struct/enum/union — glue.
+                    // Anonymous struct/enum/union at the top level (not
+                    // wrapped in typedef) — glue. typedef-wrapped case
+                    // is recovered in the `type_definition` arm below.
                     glue.push((s, e));
                 }
             }
-            // Everything else: preprocessor directives, declarations
-            // (typedef / global var / fn prototype), type_definition,
-            // linkage_specification, etc. — all collapse into glue.
+            "type_definition" => {
+                // v0.17.0 PR-B: typedef-wrapped anonymous aggregate
+                // recovery. `typedef struct { ... } Foo;` exposes only
+                // the alias `Foo` as a useful symbol — the inner
+                // struct_specifier has no `name` field. Pre-v0.17.0
+                // this whole construct collapsed into glue and hid the
+                // alias from search (HOTFIXES 2026-05-21). v2 recovers
+                // the alias from the `declarator` field and emits a
+                // synthetic unit so `Citation::Code.symbol = "Foo"`.
+                // Plain `typedef int MyInt;` (no inner aggregate) stays
+                // glue — there's no struct body to name.
+                if let Some(name) = recover_typedef_alias(child, source) {
+                    flush_glue(&mut glue, &mut units);
+                    units.push((name, s, e, true));
+                } else {
+                    glue.push((s, e));
+                }
+            }
+            // Everything else: preprocessor directives, plain declarations
+            // (global var / fn prototype), linkage_specification, etc.
+            // — all collapse into glue.
             _ => {
                 glue.push((s, e));
             }
@@ -323,6 +343,62 @@ fn build_blocks(
     Ok(blocks)
 }
 
+/// v0.17.0 PR-B: try to recover the typedef alias name from a
+/// `type_definition` node *iff* the inner type-specifier is an
+/// anonymous struct/enum/union. Returns `None` for any other shape
+/// (named aggregate handled elsewhere, plain type alias has no body
+/// worth naming).
+fn recover_typedef_alias(node: tree_sitter::Node, source: &str) -> Option<String> {
+    let mut has_anon_aggregate = false;
+    let mut cursor = node.walk();
+    for sub in node.children(&mut cursor) {
+        match sub.kind() {
+            "struct_specifier" | "enum_specifier" | "union_specifier" => {
+                if sub.child_by_field_name("name").is_none() {
+                    has_anon_aggregate = true;
+                } else {
+                    // Named inner aggregate (e.g. `typedef struct Pt {...} P;`)
+                    // — the named struct itself is the primary symbol and
+                    // is *not* extracted at the top level today (it lives
+                    // inside `type_definition`, not as a sibling
+                    // `struct_specifier`). For v2 we keep behavior conservative:
+                    // return None so the type_definition stays glue, matching
+                    // pre-v2 behavior for this minor case. Real-world C tends
+                    // to use one of: bare named struct, typedef alias only,
+                    // or typedef on anonymous body — the latter is what we fix.
+                    return None;
+                }
+            }
+            _ => {}
+        }
+    }
+    if !has_anon_aggregate {
+        return None;
+    }
+    let decl = node.child_by_field_name("declarator")?;
+    extract_typedef_alias_name(decl, source).map(str::to_string)
+}
+
+/// Extract the typedef alias identifier from a declarator subtree.
+/// Handles the common shapes: direct `type_identifier`, or one wrapped
+/// in pointer / function declarator nodes (the alias is the first
+/// `type_identifier` met in a pre-order, left-to-right walk).
+fn extract_typedef_alias_name<'a>(
+    decl: tree_sitter::Node,
+    source: &'a str,
+) -> Option<&'a str> {
+    if decl.kind() == "type_identifier" {
+        return source.get(decl.start_byte()..decl.end_byte());
+    }
+    let mut cursor = decl.walk();
+    for sub in decl.children(&mut cursor) {
+        if let Some(found) = extract_typedef_alias_name(sub, source) {
+            return Some(found);
+        }
+    }
+    None
+}
+
 fn flush_glue(glue: &mut Vec<(u32, u32)>, units: &mut Vec<(String, u32, u32, bool)>) {
     if glue.is_empty() {
         return;
@@ -489,20 +565,72 @@ mod tests {
     }
 
     #[test]
-    fn c_extractor_typedef_struct_falls_into_glue() {
-        // typedef struct { ... } Foo; — inner struct_specifier is anonymous,
-        // outer node is type_definition → glue. See HOTFIXES.md 2026-05-21.
+    fn c_extractor_typedef_struct_emits_unit() {
+        // v0.17.0 PR-B: `typedef struct { ... } Foo;` was previously a
+        // hotfix-tracked deviation (HOTFIXES.md 2026-05-21) — the inner
+        // struct_specifier is anonymous so the named-struct arm didn't
+        // fire, dropping the whole construct into glue and hiding the
+        // `Foo` alias from symbol search. The v2 extractor recovers the
+        // typedef alias from the `declarator` field on the
+        // `type_definition` node and emits a synthetic unit with that
+        // name. parser_version bumped `code-c-v1` → `code-c-v2`.
         let src = "typedef struct { int x; int y; } Point;\n";
         let doc = tests_support::extract_c(src, "x/typedef.c");
         let s = syms(&doc);
+        // The typedef alias surfaces as a Code symbol.
+        assert!(
+            s.iter().any(|x| x == "Point"),
+            "expected 'Point' unit from typedef alias: {s:?}"
+        );
+        // No glue-only fallback unit is expected: the typedef alias is the
+        // only unit. NOTE(review): the `""` literal below looks truncated — confirm.
+        assert!(
+            !s.iter().any(|x| x == ""),
+            "no fallback expected when typedef emits a unit: {s:?}"
+        );
+    }
+
+    #[test]
+    fn c_extractor_typedef_enum_emits_unit() {
+        // Parallel coverage for enum_specifier — same typedef-alias
+        // synthesis path. `typedef enum { A, B } Color;` → unit `Color`.
+        let src = "typedef enum { A, B } Color;\n";
+        let doc = tests_support::extract_c(src, "x/typedef_enum.c");
+        let s = syms(&doc);
+        assert!(
+            s.iter().any(|x| x == "Color"),
+            "expected 'Color' unit from typedef enum alias: {s:?}"
+        );
+    }
+
+    #[test]
+    fn c_extractor_typedef_union_emits_unit() {
+        // Parallel coverage for union_specifier.
+        let src = "typedef union { int i; float f; } IntOrFloat;\n";
+        let doc = tests_support::extract_c(src, "x/typedef_union.c");
+        let s = syms(&doc);
+        assert!(
+            s.iter().any(|x| x == "IntOrFloat"),
+            "expected 'IntOrFloat' unit from typedef union alias: {s:?}"
+        );
+    }
+
+    #[test]
+    fn c_extractor_typedef_to_existing_type_stays_glue() {
+        // Negative case: `typedef int MyInt;` has no inner struct/enum/
+        // union — there's no struct body to attach the alias to, so the
+        // construct falls into glue (the glue-only fallback unit when alone).
+        // Confirms the new arm only fires for anonymous-struct typedef.
+        let src = "typedef int MyInt;\n";
+        let doc = tests_support::extract_c(src, "x/typedef_alias.c");
+        let s = syms(&doc);
         assert!(
             s.iter().any(|x| x == ""),
-            "expected for typedef struct: {s:?}"
+            "expected for plain typedef alias: {s:?}"
         );
-        // The typedef alias should NOT surface as a Code symbol
         assert!(
-            !s.iter().any(|x| x == "Point"),
-            "unexpected 'Point' unit for typedef struct: {s:?}"
+            !s.iter().any(|x| x == "MyInt"),
+            "plain typedef alias must not emit a unit: {s:?}"
         );
     }
 
diff --git a/crates/kebab-store-sqlite/src/store.rs b/crates/kebab-store-sqlite/src/store.rs
index aa1ff19..c1e3a9b 100644
--- a/crates/kebab-store-sqlite/src/store.rs
+++ b/crates/kebab-store-sqlite/src/store.rs
@@ -464,6 +464,74 @@ impl SqliteStore {
         }
         Ok(out)
     }
+
+    /// v0.17.0 PR-B: sister of [`Self::stale_chunk_ids_at`] for the
+    /// `parser_version` bump cascade. When `doc_id` depends on
+    /// `parser_version` (design §9) and an extractor ships a new
+    /// `PARSER_VERSION`, the next ingest computes a fresh `doc_id` for
+    /// the *same* `(workspace_path, asset_id)` pair. The existing
+    /// asset_id-keyed [`Self::stale_chunk_ids_at`] does NOT fire (same
+    /// asset), so the legacy `chunks` rows and their LanceDB shadows
+    /// would orphan. This helper queries by `workspace_path` instead,
+    /// excluding the freshly-computed `keep_doc_id` so a re-entry
+    /// during the same ingest doesn't re-sweep the new row.
+    ///
+    /// Caller usage: pass the *new* `doc_id` if known; pass an empty
+    /// string when called before the new INSERT (the case in
+    /// `try_skip_unchanged`) — all existing docs at `workspace_path`
+    /// are then collected as stale.
+    pub fn stale_chunk_ids_for_workspace_path_except_doc_id(
+        &self,
+        workspace_path: &str,
+        keep_doc_id: &str,
+    ) -> Result<Vec<kebab_core::ChunkId>> {
+        let conn = self.lock_conn();
+        let mut stmt = conn
+            .prepare(
+                "SELECT c.chunk_id
+                 FROM chunks c
+                 INNER JOIN documents d ON c.doc_id = d.doc_id
+                 WHERE d.workspace_path = ?1 AND d.doc_id != ?2",
+            )
+            .map_err(StoreError::from)?;
+        let rows = stmt
+            .query_map(params![workspace_path, keep_doc_id], |row| {
+                row.get::<_, String>(0)
+            })
+            .map_err(StoreError::from)?;
+        let mut out: Vec<kebab_core::ChunkId> = Vec::new();
+        for row in rows {
+            let id = row.map_err(StoreError::from)?;
+            out.push(kebab_core::ChunkId(id));
+        }
+        Ok(out)
+    }
+
+    /// v0.17.0 PR-B: sweep the SQLite document chain (`documents` →
+    /// `blocks` / `chunks` / `embedding_records` via CASCADE) for every
+    /// row at `workspace_path` whose `doc_id` differs from `keep_doc_id`.
+    /// Pair with [`Self::stale_chunk_ids_for_workspace_path_except_doc_id`]
+    /// — caller fetches the chunk_ids first, hands them to
+    /// `VectorStore::delete_by_chunk_ids`, then calls this sweep.
+    /// `assets` row is preserved (same bytes, same asset_id — only the
+    /// derived `doc_id` changed).
+    ///
+    /// `keep_doc_id = ""` deletes every doc at `workspace_path`
+    /// (semantics mirror the sister helper above — used by
+    /// `try_skip_unchanged` before the new INSERT exists).
+    pub fn purge_document_at_workspace_path_except_doc_id(
+        &self,
+        workspace_path: &str,
+        keep_doc_id: &str,
+    ) -> Result<()> {
+        let conn = self.lock_conn();
+        conn.execute(
+            "DELETE FROM documents WHERE workspace_path = ?1 AND doc_id != ?2",
+            params![workspace_path, keep_doc_id],
+        )
+        .map_err(StoreError::from)?;
+        Ok(())
+    }
 }
 
 /// Sweep stale `assets` + `documents` + downstream rows when the file