feat(v0.17.0/PR-B/B1): C typedef extractor + parser_version bump + orphan purge cascade

closure of HOTFIXES 2026-05-21. C typedef-wrapped anonymous
struct/enum/union 이 typedef alias 이름으로 symbol unit 방출.

- crates/kebab-parse-code/src/c.rs: type_definition 분기 추가.
  inner anonymous struct_specifier / enum_specifier / union_specifier
  탐지 → declarator field 의 type_identifier 재귀 추출 → synthetic
  unit (typedef alias). named inner aggregate / plain alias 는
  기존대로 glue. PARSER_VERSION code-c-v1 → code-c-v2.
  recover_typedef_alias + extract_typedef_alias_name helper 추가.

- crates/kebab-store-sqlite/src/store.rs: 두 helper 신규
  (parser_version bump cascade 용 doc-id 기반 orphan purge).
  - stale_chunk_ids_for_workspace_path_except_doc_id(workspace_path,
    keep_doc_id) — sister of stale_chunk_ids_at, doc_id 기반.
  - purge_document_at_workspace_path_except_doc_id(workspace_path,
    keep_doc_id) — CASCADE document/chunks 제거, assets 보존.
  keep_doc_id="" 는 "모든 doc 제거" 를 뜻하는 sentinel 로 사용.

- crates/kebab-app/src/lib.rs: try_skip_unchanged 의 parser_mismatch
  분기에서 purge_workspace_path_for_parser_bump 호출. helper 가
  app.vector() 로 lazy 접근 + delete_by_chunk_ids + SQLite document
  row 제거. Ok(None) 반환 전 cleanup 끝나서 caller 의 새 INSERT 시
  idx_docs_workspace_path UNIQUE 충돌 회피.

- tests:
  - c.rs unit tests 4 신규 — typedef_struct_emits_unit /
    typedef_enum_emits_unit / typedef_union_emits_unit /
    typedef_to_existing_type_stays_glue (negative).
  - tier1_c_ingest_searchable: parser_version assertion code-c-v1 →
    code-c-v2.
- 회귀: bytes-edit 경로 (asset_id 변경) 의 기존 purge_orphan_at_workspace_path
  + purge_vector_orphans_for_workspace_path 는 그대로 — 신규 분기와
  공존, 기존 test 모두 PASS.

미해결 (Risks): nested typedef (typedef struct { struct {...} inner; }
Outer;) 의 inner 익명 struct 는 여전히 glue — v2 의 1차 범위는
top-level typedef alias 만.

cargo test --workspace --no-fail-fast -j 1 + clippy 통과.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-24 14:15:32 +00:00
parent 67559fb3ce
commit 93ddece111
4 changed files with 273 additions and 14 deletions

View File

@@ -880,6 +880,22 @@ fn try_skip_unchanged(
// logic self-documenting and guards against future id_for_doc
// changes.
if existing_doc.parser_version != *current_parser_version {
// v0.17.0 PR-B: parser_version bump cascade. Same bytes (same
// asset_id) → asset-keyed `stale_chunk_ids_at` is a no-op, but
// the stale `documents` row at this workspace_path still
// collides with `idx_docs_workspace_path` on the next INSERT
// and the LanceDB rows under the old chunk_ids orphan. Sweep
// both stores here, before returning Ok(None), so the caller's
// full-ingest path lands a clean slate. The `keep_doc_id = ""`
// sentinel removes every doc at this path (the new doc_id is
// not yet known here — it's computed downstream from the new
// PARSER_VERSION).
purge_workspace_path_for_parser_bump(app, asset).with_context(|| {
format!(
"parser-bump orphan purge at {}",
asset.workspace_path.0
)
})?;
return Ok(None);
}
// 3. Chunker unchanged.
@@ -1486,6 +1502,53 @@ fn record_image_analysis_failure(
warning_notes.push(note);
}
/// Parser-bump cascade cleanup (v0.17.0 PR-B).
///
/// Shipping a new `PARSER_VERSION` for a code extractor (e.g.
/// `code-c-v1` → `code-c-v2`) makes an unchanged file — same
/// `(workspace_path, asset_id)` — come back with a different `doc_id`.
/// [`purge_vector_orphans_for_workspace_path`] is keyed on asset_id
/// changes (edited bytes), so it does nothing for this case. Left
/// alone, the follow-up INSERT would trip the
/// `idx_docs_workspace_path` UNIQUE index and the vectors stored under
/// the old chunk_ids would be orphaned.
///
/// Steps, in order:
///
/// 1. Collect the chunk_ids of every document stored at the asset's
///    `workspace_path`. The empty `keep_doc_id` means "spare nothing":
///    the caller (`try_skip_unchanged`) runs before the new doc_id has
///    been computed.
/// 2. If any chunk_ids were found, resolve the vector store and delete
///    those ids from it. `app.vector()` is only called when there is
///    something to delete; a `None` store (presumably embeddings
///    disabled) skips the step.
/// 3. Delete the SQLite `documents` rows at that path. Per the store
///    helper's documentation the delete cascades to `blocks` /
///    `chunks` / `embedding_records` and leaves the `assets` row alone.
///
/// # Errors
///
/// Propagates the first failure from the SQLite lookup, the vector
/// store lookup or deletion, or the SQLite delete, each wrapped with a
/// context string naming the failing call.
fn purge_workspace_path_for_parser_bump(
    app: &App,
    asset: &RawAsset,
) -> anyhow::Result<()> {
    // Sentinel understood by both store helpers: no doc_id is exempt.
    const KEEP_NO_DOC: &str = "";

    let workspace_path = &asset.workspace_path.0;
    let orphan_chunk_ids = app
        .sqlite
        .stale_chunk_ids_for_workspace_path_except_doc_id(workspace_path, KEEP_NO_DOC)
        .context("SqliteStore::stale_chunk_ids_for_workspace_path_except_doc_id")?;

    // Only touch the (lazily initialised) vector store when there are
    // chunk_ids to remove from it.
    let vector_store = if orphan_chunk_ids.is_empty() {
        None
    } else {
        app.vector().context("App::vector")?
    };
    if let Some(vectors) = vector_store {
        use kebab_core::VectorStore as _;
        vectors
            .delete_by_chunk_ids(&orphan_chunk_ids)
            .context("VectorStore::delete_by_chunk_ids (parser-bump orphans)")?;
    }

    // Vectors first, SQLite rows second: the chunk_ids above are read
    // from the rows this statement removes.
    app.sqlite
        .purge_document_at_workspace_path_except_doc_id(workspace_path, KEEP_NO_DOC)
        .context("SqliteStore::purge_document_at_workspace_path_except_doc_id")?;

    tracing::debug!(
        target: "kebab-app",
        path = %workspace_path,
        count = orphan_chunk_ids.len(),
        "purged orphan vectors + document for parser_version bump"
    );
    Ok(())
}
/// HOTFIXES 2026-05-02 P7-3 follow-up: when a tracked file's bytes
/// change, `purge_orphan_at_workspace_path` (in `kebab-store-sqlite`)
/// sweeps the SQLite chain (documents → blocks / chunks / embedding_records)

View File

@@ -1145,8 +1145,8 @@ fn tier1_c_ingest_searchable() {
.expect("parser.c item present");
assert_eq!(
c_item.parser_version.as_ref().map(|p| p.0.as_str()),
Some("code-c-v1"),
"parser_version must be code-c-v1"
Some("code-c-v2"),
"parser_version must be code-c-v2 (v0.17.0 PR-B: typedef-wrapped struct/enum/union 이 typedef alias unit 으로 방출)"
);
assert_eq!(
c_item.chunker_version.as_ref().map(|c| c.0.as_str()),

View File

@@ -31,7 +31,7 @@ use time::OffsetDateTime;
use crate::scaffold::{filename_from_workspace_path, strip_extension};
pub const PARSER_VERSION: &str = "code-c-v1";
pub const PARSER_VERSION: &str = "code-c-v2";
/// C AST extractor. Per-unit blocks via tree-sitter-c 0.24.2
/// (`LANGUAGE: LanguageFn`) parsed by tree-sitter 0.26.
@@ -257,13 +257,33 @@ fn build_blocks(
flush_glue(&mut glue, &mut units);
units.push((name.to_string(), s, e, true));
} else {
// Anonymous struct/enum/union — glue.
// Anonymous struct/enum/union at the top level (not
// wrapped in typedef) — glue. typedef-wrapped case
// is recovered in the `type_definition` arm below.
glue.push((s, e));
}
}
// Everything else: preprocessor directives, declarations
// (typedef / global var / fn prototype), type_definition,
// linkage_specification, etc. — all collapse into glue.
"type_definition" => {
// v0.17.0 PR-B: typedef-wrapped anonymous aggregate
// recovery. `typedef struct { ... } Foo;` exposes only
// the alias `Foo` as a useful symbol — the inner
// struct_specifier has no `name` field. Pre-v0.17.0
// this whole construct collapsed into glue and hid the
// alias from search (HOTFIXES 2026-05-21). v2 recovers
// the alias from the `declarator` field and emits a
// synthetic unit so `Citation::Code.symbol = "Foo"`.
// Plain `typedef int MyInt;` (no inner aggregate) stays
// glue — there's no struct body to name.
if let Some(name) = recover_typedef_alias(child, source) {
flush_glue(&mut glue, &mut units);
units.push((name, s, e, true));
} else {
glue.push((s, e));
}
}
// Everything else: preprocessor directives, plain declarations
// (global var / fn prototype), linkage_specification, etc.
// — all collapse into glue.
_ => {
glue.push((s, e));
}
@@ -323,6 +343,62 @@ fn build_blocks(
Ok(blocks)
}
/// Name a `type_definition` node after its typedef alias, but only
/// when the typedef wraps an *anonymous* struct/enum/union
/// (v0.17.0 PR-B).
///
/// Returns `None` when:
/// - a direct child is a *named* aggregate specifier
///   (e.g. `typedef struct Pt {...} P;`);
/// - no direct child is an aggregate specifier at all (a plain alias
///   such as `typedef int MyInt;` has no body worth naming);
/// - the node has no `declarator` field, or no alias identifier can be
///   found beneath it.
fn recover_typedef_alias(node: tree_sitter::Node, source: &str) -> Option<String> {
    let mut wraps_anonymous_aggregate = false;
    let mut walker = node.walk();
    for child in node.children(&mut walker) {
        let is_aggregate = matches!(
            child.kind(),
            "struct_specifier" | "enum_specifier" | "union_specifier"
        );
        if !is_aggregate {
            continue;
        }
        if child.child_by_field_name("name").is_some() {
            // Named inner aggregate: the struct's own name is the
            // primary symbol, not the alias. v2 stays conservative and
            // leaves the whole `type_definition` as glue, matching
            // pre-v2 behavior for this shape.
            return None;
        }
        wraps_anonymous_aggregate = true;
    }
    if !wraps_anonymous_aggregate {
        return None;
    }

    let declarator = node.child_by_field_name("declarator")?;
    let alias = extract_typedef_alias_name(declarator, source)?;
    Some(alias.to_string())
}
/// Find the typedef alias identifier inside a declarator subtree.
///
/// Walks `decl` depth-first in document order and returns the text of
/// the **first** `type_identifier` node encountered (`decl` itself
/// included). This covers a bare `type_identifier` as well as one
/// nested inside wrapper declarators such as pointer or function
/// declarators. Taking the first match matters for function-pointer
/// typedefs: any `type_identifier` inside the parameter list comes
/// after the alias in document order.
///
/// Returns `None` when the subtree contains no `type_identifier`. A
/// matched node whose byte range is not a valid slice of `source` (out
/// of bounds or off a UTF-8 boundary) is treated as a non-match rather
/// than panicking.
fn extract_typedef_alias_name<'a>(
    decl: tree_sitter::Node,
    source: &'a str,
) -> Option<&'a str> {
    if decl.kind() == "type_identifier" {
        // Checked slicing: a node range that does not line up with
        // `source` yields `None` rather than a panic.
        return source.get(decl.start_byte()..decl.end_byte());
    }
    let mut cursor = decl.walk();
    for sub in decl.children(&mut cursor) {
        if let Some(found) = extract_typedef_alias_name(sub, source) {
            return Some(found);
        }
    }
    None
}
fn flush_glue(glue: &mut Vec<(u32, u32)>, units: &mut Vec<(String, u32, u32, bool)>) {
if glue.is_empty() {
return;
@@ -489,20 +565,72 @@ mod tests {
}
#[test]
fn c_extractor_typedef_struct_falls_into_glue() {
// typedef struct { ... } Foo; — inner struct_specifier is anonymous,
// outer node is type_definition → glue. See HOTFIXES.md 2026-05-21.
fn c_extractor_typedef_struct_emits_unit() {
    // v0.17.0 PR-B. `typedef struct { ... } Foo;` used to be a tracked
    // deviation (HOTFIXES.md 2026-05-21): the inner struct_specifier is
    // anonymous, so the named-struct arm never fired and the whole
    // construct fell into glue, hiding the alias from symbol search.
    // The v2 extractor (`code-c-v1` → `code-c-v2`) reads the alias from
    // the `declarator` field of the `type_definition` node and emits a
    // synthetic unit carrying that name.
    let doc = tests_support::extract_c(
        "typedef struct { int x; int y; } Point;\n",
        "x/typedef.c",
    );
    let s = syms(&doc);
    let has_symbol = |wanted: &str| s.iter().any(|x| x == wanted);

    // The alias must surface as a Code symbol.
    assert!(
        has_symbol("Point"),
        "expected 'Point' unit from typedef alias: {s:?}"
    );
    // The typedef is the file's only unit, so the glue-only `<module>`
    // fallback must not appear.
    assert!(
        !has_symbol("<module>"),
        "no <module> fallback expected when typedef emits a unit: {s:?}"
    );
}
#[test]
fn c_extractor_typedef_enum_emits_unit() {
    // enum_specifier goes through the same alias-synthesis path as the
    // struct case: `typedef enum { A, B } Color;` must yield unit `Color`.
    let doc = tests_support::extract_c("typedef enum { A, B } Color;\n", "x/typedef_enum.c");
    let s = syms(&doc);
    let alias_emitted = s.iter().any(|x| x == "Color");
    assert!(
        alias_emitted,
        "expected 'Color' unit from typedef enum alias: {s:?}"
    );
}
#[test]
fn c_extractor_typedef_union_emits_unit() {
    // union_specifier counterpart of the struct/enum alias tests.
    let doc = tests_support::extract_c(
        "typedef union { int i; float f; } IntOrFloat;\n",
        "x/typedef_union.c",
    );
    let s = syms(&doc);
    let alias_emitted = s.iter().any(|x| x == "IntOrFloat");
    assert!(
        alias_emitted,
        "expected 'IntOrFloat' unit from typedef union alias: {s:?}"
    );
}
#[test]
fn c_extractor_typedef_to_existing_type_stays_glue() {
// Negative case: `typedef int MyInt;` has no inner struct/enum/
// union — there's no struct body to attach the alias to, so the
// construct falls into glue (becomes `<module>` when alone).
// Confirms the new arm only fires for anonymous-struct typedef.
let src = "typedef int MyInt;\n";
let doc = tests_support::extract_c(src, "x/typedef_alias.c");
let s = syms(&doc);
assert!(
s.iter().any(|x| x == "<module>"),
"expected <module> for typedef struct: {s:?}"
"expected <module> for plain typedef alias: {s:?}"
);
// The typedef alias should NOT surface as a Code symbol
assert!(
!s.iter().any(|x| x == "Point"),
"unexpected 'Point' unit for typedef struct: {s:?}"
!s.iter().any(|x| x == "MyInt"),
"plain typedef alias must not emit a unit: {s:?}"
);
}

View File

@@ -464,6 +464,74 @@ impl SqliteStore {
}
Ok(out)
}
/// v0.17.0 PR-B: sister of [`Self::stale_chunk_ids_at`] for the
/// `parser_version` bump cascade. When `doc_id` depends on
/// `parser_version` (design §9) and an extractor ships a new
/// `PARSER_VERSION`, the next ingest computes a fresh `doc_id` for
/// the *same* `(workspace_path, asset_id)` pair. The existing
/// asset_id-keyed [`Self::stale_chunk_ids_at`] does NOT fire (same
/// asset), so the legacy `chunks` rows and their LanceDB shadows
/// would orphan. This helper queries by `workspace_path` instead,
/// excluding the freshly-computed `keep_doc_id` so a re-entry
/// during the same ingest doesn't re-sweep the new row.
///
/// Caller usage: pass the *new* `doc_id` if known; pass an empty
/// string when called before the new INSERT (the case in
/// `try_skip_unchanged`) — all existing docs at `workspace_path`
/// are then collected as stale.
pub fn stale_chunk_ids_for_workspace_path_except_doc_id(
&self,
workspace_path: &str,
keep_doc_id: &str,
) -> Result<Vec<kebab_core::ChunkId>> {
let conn = self.lock_conn();
let mut stmt = conn
.prepare(
"SELECT c.chunk_id
FROM chunks c
INNER JOIN documents d ON c.doc_id = d.doc_id
WHERE d.workspace_path = ?1 AND d.doc_id != ?2",
)
.map_err(StoreError::from)?;
let rows = stmt
.query_map(params![workspace_path, keep_doc_id], |row| {
row.get::<_, String>(0)
})
.map_err(StoreError::from)?;
let mut out: Vec<kebab_core::ChunkId> = Vec::new();
for row in rows {
let id = row.map_err(StoreError::from)?;
out.push(kebab_core::ChunkId(id));
}
Ok(out)
}
/// Delete every `documents` row at `workspace_path` whose `doc_id` is
/// not `keep_doc_id` (v0.17.0 PR-B).
///
/// Only `documents` is deleted here; removal of the dependent
/// `blocks` / `chunks` / `embedding_records` rows relies on the
/// schema's CASCADE rules (not visible in this function). The `assets`
/// row is not touched — same bytes, same asset_id, only the derived
/// `doc_id` changed.
///
/// Intended call order: fetch the chunk_ids with
/// [`Self::stale_chunk_ids_for_workspace_path_except_doc_id`], pass
/// them to `VectorStore::delete_by_chunk_ids`, then run this sweep.
///
/// `keep_doc_id = ""` removes every document at `workspace_path`, the
/// same sentinel the sister helper uses; `try_skip_unchanged` uses it
/// before the new INSERT exists.
///
/// # Errors
///
/// A failing DELETE is converted through `StoreError`.
pub fn purge_document_at_workspace_path_except_doc_id(
    &self,
    workspace_path: &str,
    keep_doc_id: &str,
) -> Result<()> {
    const DELETE_SQL: &str =
        "DELETE FROM documents WHERE workspace_path = ?1 AND doc_id != ?2";
    let db = self.lock_conn();
    // The affected-row count is not needed by any caller.
    let _deleted_rows = db
        .execute(DELETE_SQL, params![workspace_path, keep_doc_id])
        .map_err(StoreError::from)?;
    Ok(())
}
}
/// Sweep stale `assets` + `documents` + downstream rows when the file