diff --git a/Cargo.lock b/Cargo.lock index 311a7be..0839e76 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4127,7 +4127,7 @@ dependencies = [ [[package]] name = "kebab-app" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "base64 0.22.1", @@ -4172,7 +4172,7 @@ dependencies = [ [[package]] name = "kebab-chunk" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "blake3", @@ -4181,13 +4181,14 @@ dependencies = [ "kebab-parse-md", "serde_json", "serde_json_canonicalizer", + "serde_yaml", "time", "tracing", ] [[package]] name = "kebab-cli" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "clap", @@ -4208,7 +4209,7 @@ dependencies = [ [[package]] name = "kebab-config" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "dirs 5.0.1", @@ -4223,7 +4224,7 @@ dependencies = [ [[package]] name = "kebab-core" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "blake3", @@ -4237,7 +4238,7 @@ dependencies = [ [[package]] name = "kebab-embed" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "blake3", @@ -4251,7 +4252,7 @@ dependencies = [ [[package]] name = "kebab-embed-local" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "fastembed", @@ -4264,7 +4265,7 @@ dependencies = [ [[package]] name = "kebab-eval" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "kebab-app", @@ -4283,7 +4284,7 @@ dependencies = [ [[package]] name = "kebab-llm" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "kebab-core", @@ -4292,7 +4293,7 @@ dependencies = [ [[package]] name = "kebab-llm-local" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "kebab-config", @@ -4309,7 +4310,7 @@ dependencies = [ [[package]] name = "kebab-mcp" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "kebab-app", @@ -4327,7 +4328,7 @@ dependencies = [ [[package]] name = "kebab-normalize" -version = "0.13.0" +version = "0.14.0" dependencies = 
[ "anyhow", "kebab-core", @@ -4342,7 +4343,7 @@ dependencies = [ [[package]] name = "kebab-parse-code" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "gix", @@ -4363,7 +4364,7 @@ dependencies = [ [[package]] name = "kebab-parse-image" -version = "0.13.0" +version = "0.14.0" dependencies = [ "ab_glyph", "anyhow", @@ -4387,7 +4388,7 @@ dependencies = [ [[package]] name = "kebab-parse-md" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "kebab-core", @@ -4404,7 +4405,7 @@ dependencies = [ [[package]] name = "kebab-parse-pdf" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "blake3", @@ -4417,7 +4418,7 @@ dependencies = [ [[package]] name = "kebab-parse-types" -version = "0.13.0" +version = "0.14.0" dependencies = [ "kebab-core", "serde", @@ -4425,7 +4426,7 @@ dependencies = [ [[package]] name = "kebab-rag" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "blake3", @@ -4446,7 +4447,7 @@ dependencies = [ [[package]] name = "kebab-search" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "globset", @@ -4465,7 +4466,7 @@ dependencies = [ [[package]] name = "kebab-source-fs" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "blake3", @@ -4484,7 +4485,7 @@ dependencies = [ [[package]] name = "kebab-store-sqlite" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "blake3", @@ -4505,7 +4506,7 @@ dependencies = [ [[package]] name = "kebab-store-vector" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "arrow", @@ -4529,7 +4530,7 @@ dependencies = [ [[package]] name = "kebab-tui" -version = "0.13.0" +version = "0.14.0" dependencies = [ "anyhow", "crossterm", diff --git a/Cargo.toml b/Cargo.toml index 6d3c9f9..cf98073 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,7 @@ edition = "2024" rust-version = "1.85" license = "MIT OR Apache-2.0" repository = "https://github.com/altair823/kebab" -version = "0.13.0" +version = "0.14.0" 
[workspace.dependencies] anyhow = "1" diff --git a/HANDOFF.md b/HANDOFF.md index 3554fac..184b1e8 100644 --- a/HANDOFF.md +++ b/HANDOFF.md @@ -4,7 +4,7 @@ ## 한 줄 요약 -P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) 머지 완료. `kebab ingest` 가 markdown / image / PDF / 소스코드 (Rust / Python / TS / JS / Go / Java / Kotlin) 처리. `kebab search` / `kebab ask` 가 매체 가로질러 결과 + page / code citation 반환. `kebab tui` 가 4 패널 (Library + Search + Ask + Inspect) 제공. P10-1C (Go + Java + Kotlin) 완료 — 다음 후보 = P10-1D (C/C++) 또는 P9-5 (desktop tauri) 또는 보류 중인 P8 (audio). +P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) 머지 완료. `kebab ingest` 가 markdown / image / PDF / 소스코드 (Rust / Python / TS / JS / Go / Java / Kotlin) / Tier 2 리소스 파일 (yaml/k8s / dockerfile / toml / json / xml / groovy / go-mod) 처리. `kebab search` / `kebab ask` 가 매체 가로질러 결과 + page / code citation 반환. `kebab tui` 가 4 패널 (Library + Search + Ask + Inspect) 제공. P10-2 (Tier 2 resource-aware) 완료 — 다음 후보 = P10-1D (C/C++) 또는 P10-3 (Tier 3 fallback) 또는 P9-5 (desktop tauri) 또는 보류 중인 P8 (audio). ## Phase 로드맵 @@ -20,7 +20,7 @@ P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) 머지 완료. 
| **P7** | PDF text + page citation | `kebab-parse-pdf` | P5 | ✅ 완료 (3/3 component, page-level chunker + ingest wiring) | | **P8** | 음성 transcription + timestamp citation | `kebab-parse-audio` | P5 | ⏸ 보류 (whisper-rs 시스템 dep brainstorm 필요) | | **P9** | TUI + desktop app | `kebab-tui`, `kebab-desktop` | P5 | 🟡 진행 (4/5 component — P9-1/2/3/4 완료 [Library / Search / Ask / Inspect], P9-5 desktop 예정 · 도그푸딩 피드백 **20/20 ✅**) | -| **P10** | code ingest framework | `kebab-parse-code` | P5 | 🟡 진행 중 — 1A-1 ✅ (wire schema + parse-code skeleton + filter flags), 1A-2 ✅ (Rust AST chunker, `code-rust-ast-v1` — v0.7.0), 1B ✅ (Python/TS/JS AST chunkers — v0.8.0 이후), **1C-Go ✅ (Go AST chunker, `code-go-ast-v1` — v0.12.0)**, **1C-JavaKotlin ✅ (Java + Kotlin AST chunkers, `code-java-ast-v1` / `code-kotlin-ast-v1` — v0.13.0)** | +| **P10** | code ingest framework | `kebab-parse-code` | P5 | 🟡 진행 중 — 1A-1 ✅ (wire schema + parse-code skeleton + filter flags), 1A-2 ✅ (Rust AST chunker, `code-rust-ast-v1` — v0.7.0), 1B ✅ (Python/TS/JS AST chunkers — v0.8.0 이후), **1C-Go ✅ (Go AST chunker, `code-go-ast-v1` — v0.12.0)**, **1C-JavaKotlin ✅ (Java + Kotlin AST chunkers, `code-java-ast-v1` / `code-kotlin-ast-v1` — v0.13.0)**, **2 ✅ (Tier 2 resource-aware: yaml/k8s + dockerfile + manifest, `k8s-manifest-resource-v1` / `dockerfile-file-v1` / `manifest-file-v1` — v0.14.0)** | P0~P5 직렬. P6~P9 P5 이후 병렬 가능. diff --git a/README.md b/README.md index fddb742..833432b 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ kebab doctor | 명령 | 동작 | |------|------| | `kebab init` | XDG 경로에 데이터 디렉토리 + config.toml 생성 | -| `kebab ingest []` | Markdown / 이미지 / PDF / Rust 소스코드 색인 (idempotent). TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit. 
Markdown title 이 frontmatter 에 없어도 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (parser_version `md-frontmatter-v2`) — 기존 색인된 doc 도 다음 ingest 에서 새 title 로 갱신. **Incremental** (p9-fb-23): 두 번째 이후의 ingest 는 변하지 않은 doc (blake3 + parser/chunker/embedder version 모두 동일) 의 parse/chunk/embed/vector upsert 를 자동 스킵. final summary 에 `N unchanged` 카운트 표시. `--force-reingest` 로 skip 무시 강제 재처리. **지원 형식** (extractor 자동 결정 — config 에 명시 불가): Markdown (`.md`), 이미지 (`.png` / `.jpg` / `.jpeg`, OCR + caption), PDF (`.pdf`), **소스코드** (`.rs` → `code-rust-ast-v1`, `.py` → `code-python-ast-v1`, `.ts`/`.tsx` → `code-ts-ast-v1`, `.js`/`.mjs`/`.cjs`/`.jsx` → `code-js-ast-v1`, `.go` → `code-go-ast-v1`, `.java` → `code-java-ast-v1`, `.kt`/`.kts` → `code-kotlin-ast-v1` — 모두 tree-sitter AST chunker). 다른 확장자는 자동 skip — `IngestItem.warnings` 에 사유 (`"unsupported media type: .docx"` 등), `IngestReport.skipped_by_extension` 에 카운트 분류, CLI / TUI summary 에 breakdown 표시. 코드 chunk 는 `citation.kind = "code"` 에 `citation.lang = ""` + `symbol` + line range 를 담고, SearchHit top-level 에 `code_lang` + `repo` (`.git/` walk-up 의 디렉토리 이름) 가 backfill 됨. `--code-lang rust` / `--code-lang python` / `--code-lang typescript` / `--code-lang javascript` / `--code-lang go` / `--code-lang java` / `--code-lang kotlin` / `--media code` filter 로 언어별·코드 전용 검색 가능 (p10-1A-1 filter flags). Python symbol 은 workspace 경로 → dotted module path prefix (예: `kebab_eval.metrics.compute_mrr`), TS/JS symbol 은 slash-style module path prefix (예: `src/Foo.Foo.search`), Go symbol 은 `package.Func` / `package.(*Receiver).Method` 형식, Java / Kotlin symbol 은 `com.foo.Foo.bar` 형식 (패키지 + 클래스 + 메서드/필드). | +| `kebab ingest []` | Markdown / 이미지 / PDF / Rust 소스코드 색인 (idempotent). TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit. 
Markdown title 이 frontmatter 에 없어도 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (parser_version `md-frontmatter-v2`) — 기존 색인된 doc 도 다음 ingest 에서 새 title 로 갱신. **Incremental** (p9-fb-23): 두 번째 이후의 ingest 는 변하지 않은 doc (blake3 + parser/chunker/embedder version 모두 동일) 의 parse/chunk/embed/vector upsert 를 자동 스킵. final summary 에 `N unchanged` 카운트 표시. `--force-reingest` 로 skip 무시 강제 재처리. **지원 형식** (extractor 자동 결정 — config 에 명시 불가): Markdown (`.md`), 이미지 (`.png` / `.jpg` / `.jpeg`, OCR + caption), PDF (`.pdf`), **소스코드** (`.rs` → `code-rust-ast-v1`, `.py` → `code-python-ast-v1`, `.ts`/`.tsx` → `code-ts-ast-v1`, `.js`/`.mjs`/`.cjs`/`.jsx` → `code-js-ast-v1`, `.go` → `code-go-ast-v1`, `.java` → `code-java-ast-v1`, `.kt`/`.kts` → `code-kotlin-ast-v1` — 모두 tree-sitter AST chunker; **Tier 2 리소스 파일**: `.yaml`/`.yml` → `k8s-manifest-resource-v1` (apiVersion+kind 파싱), `Dockerfile`/`Dockerfile.*`/`*.dockerfile` → `dockerfile-file-v1` (전체 파일), `Cargo.toml`/`pyproject.toml`/`.toml`/`package.json`/`tsconfig.json`/`.json`/`pom.xml`/`.xml`/`build.gradle`/`.gradle`/`go.mod` → `manifest-file-v1` (전체 파일) — yaml (k8s) / dockerfile / toml / json / xml / groovy / go-mod 지원). 다른 확장자는 자동 skip — `IngestItem.warnings` 에 사유 (`"unsupported media type: .docx"` 등), `IngestReport.skipped_by_extension` 에 카운트 분류, CLI / TUI summary 에 breakdown 표시. 코드 chunk 는 `citation.kind = "code"` 에 `citation.lang = ""` + `symbol` + line range 를 담고, SearchHit top-level 에 `code_lang` + `repo` (`.git/` walk-up 의 디렉토리 이름) 가 backfill 됨. `--code-lang rust` / `--code-lang python` / `--code-lang typescript` / `--code-lang javascript` / `--code-lang go` / `--code-lang java` / `--code-lang kotlin` / `--code-lang yaml` / `--code-lang dockerfile` / `--code-lang toml` / `--code-lang json` / `--code-lang xml` / `--code-lang groovy` / `--code-lang go-mod` / `--media code` filter 로 언어별·코드 전용 검색 가능 (p10-1A-1 filter flags). 
Python symbol 은 workspace 경로 → dotted module path prefix (예: `kebab_eval.metrics.compute_mrr`), TS/JS symbol 은 slash-style module path prefix (예: `src/Foo.Foo.search`), Go symbol 은 `package.Func` / `package.(*Receiver).Method` 형식, Java / Kotlin symbol 은 `com.foo.Foo.bar` 형식 (패키지 + 클래스 + 메서드/필드). | | `kebab search --mode {lexical,vector,hybrid} "" [--no-cache] [--max-tokens N] [--snippet-chars N] [--cursor ] [--tag T] [--lang L] [--path-glob G] [--trust-min LEVEL] [--media TYPE] [--ingested-after RFC3339] [--doc-id ID] [--trace] [--bulk] [--repo NAME ...] [--code-lang LIST]` | 검색. hybrid는 RRF fusion, citation 포함. 같은 process 안에서 동일 query (NFKC + trim + lowercase 정규화) 반복 시 in-process LRU 캐시 hit (capacity = `[search] cache_capacity`, default 256). `--no-cache` 로 강제 bypass — 디버깅용. ingest commit 발생 시 `kv['corpus_revision']` bump 으로 모든 entry 자동 stale. **`--max-tokens` / `--snippet-chars` / `--cursor` (p9-fb-34)** — agent budget controls. `--json` 출력은 `search_response.v1` wrapper (`{hits, next_cursor, truncated}`) — pre-fb-34 의 bare array 와 호환 안 됨. mismatched cursor → `error.v1.code = stale_cursor`. **filter flags (p9-fb-36):** `--tag` 는 반복 가능 flag (`--tag rust --tag async`) 로 OR 매칭, `--media` 는 `,` 구분 다중 값 OR 매칭, 나머지 flags 간은 AND 조합. `--trust-min` 은 `primary\|secondary\|generated` 중 하나 (해당 level 이상 포함). `--ingested-after` 는 RFC3339 UTC — 파싱 실패 시 `error.v1.code = config_invalid` (exit 2). `--media md` 는 `markdown` alias 로 정규화. 알 수 없는 `--media` 값은 무조건 empty hits (오류 아님). **`--trace` (p9-fb-37)** — `search_response.v1.trace` 에 lexical / vector pre-fusion 후보 + RRF union + per-stage timing (`lexical_ms` / `vector_ms` / `fusion_ms` / `total_ms`) 노출. trace 요청은 캐시 우회 (`--no-cache` 없이도 항상 cold). **`--bulk` (p9-fb-42)** — stdin ndjson 으로 N query 한 번에 실행. `--json` 면 stdout per-query ndjson (`bulk_search_item.v1`) + stderr summary (`bulk_summary: total=N succeeded=S failed=F`). Cap 100. 
agent 가 query decomposition 후 sub-query 일괄 실행 시 single round-trip — App instance 재사용으로 캐시 / embedder cold-start 비용 한 번만. Per-query failure 는 item 의 `error` (error.v1) 에 격리, 다른 query 계속 진행. **code corpus filters (p10-1A-1):** `--repo` 는 반복 가능 (`--repo kebab --repo other`) OR 매칭. `--code-lang` 는 반복 또는 comma 다중 값 (`--code-lang rust,python`), 알 수 없는 값은 빈 hits. `--media code` 는 Tier 1/2/3 모든 code chunk 포함. 1A-1 시점에서는 indexed 된 code chunk 가 없어 filter 가 항상 빈 결과 — 1A-2 (Rust AST chunker) 머지 이후 실효. | | `kebab list docs` | 색인된 문서 목록 | | `kebab inspect doc ` / `kebab inspect chunk ` | raw record 보기 | diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 08c89cc..149b687 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -39,7 +39,7 @@ use std::sync::Arc; use anyhow::{Context, anyhow}; use serde::{Deserialize, Serialize}; -use kebab_chunk::{CodeGoAstV1Chunker, CodeJavaAstV1Chunker, CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTsAstV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; +use kebab_chunk::{CodeGoAstV1Chunker, CodeJavaAstV1Chunker, CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTsAstV1Chunker, DockerfileFileV1Chunker, K8sManifestResourceV1Chunker, ManifestFileV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; use kebab_core::{ Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker, DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, @@ -948,10 +948,11 @@ fn ingest_one_asset( force_reingest, ); } - // p10-1A-2 / 1B: code ingest dispatch. + // p10-1A-2 / 1B: code ingest dispatch. p10-2: Tier 2 langs added. 
MediaType::Code(lang) if matches!(lang.as_str(), - "rust" | "python" | "typescript" | "javascript" | "go" | "java" | "kotlin") => + "rust" | "python" | "typescript" | "javascript" | "go" | "java" | "kotlin" + | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod") => { return ingest_one_code_asset( app, @@ -1831,6 +1832,9 @@ fn ingest_one_code_asset( "go" => ParserVersion(kebab_parse_code::GO_PARSER_VERSION.to_string()), "java" => ParserVersion(kebab_parse_code::JAVA_PARSER_VERSION.to_string()), "kotlin" => ParserVersion(kebab_parse_code::KOTLIN_PARSER_VERSION.to_string()), + // p10-2: Tier 2 has no parse step — sentinel "none-v1". + "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" + => ParserVersion("none-v1".to_string()), other => anyhow::bail!("unsupported code_lang: {other}"), }; @@ -1842,7 +1846,12 @@ fn ingest_one_code_asset( "javascript" => CodeJsAstV1Chunker.chunker_version(), "go" => CodeGoAstV1Chunker.chunker_version(), "java" => CodeJavaAstV1Chunker.chunker_version(), - "kotlin" => CodeKotlinAstV1Chunker.chunker_version(), + "kotlin" => CodeKotlinAstV1Chunker.chunker_version(), + // p10-2 Tier 2: + "yaml" => K8sManifestResourceV1Chunker.chunker_version(), + "dockerfile" => DockerfileFileV1Chunker.chunker_version(), + "toml" | "json" | "xml" | "groovy" | "go-mod" + => ManifestFileV1Chunker.chunker_version(), other => anyhow::bail!("unreachable chunker_version: {other}"), }; @@ -1890,6 +1899,10 @@ fn ingest_one_code_asset( "kotlin" => KotlinAstExtractor::new() .extract(&ctx, &bytes) .context("kb-parse-code::KotlinAstExtractor::extract (code:kotlin)")?, + // p10-2 Tier 2: no extractor — synthesize Document directly from raw bytes. + "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" => { + synthesize_tier2_document(asset, &bytes, code_lang, &parser_version)? 
+ } other => anyhow::bail!("unreachable (extract): {other}"), }; @@ -1913,9 +1926,20 @@ fn ingest_one_code_asset( "java" => CodeJavaAstV1Chunker .chunk(&canonical, chunk_policy) .context("kb-chunk::CodeJavaAstV1Chunker::chunk (code:java)")?, - "kotlin" => CodeKotlinAstV1Chunker + "kotlin" => CodeKotlinAstV1Chunker .chunk(&canonical, chunk_policy) .context("kb-chunk::CodeKotlinAstV1Chunker::chunk (code:kotlin)")?, + // p10-2 Tier 2: + "yaml" => K8sManifestResourceV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::K8sManifestResourceV1Chunker::chunk")?, + "dockerfile" => DockerfileFileV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::DockerfileFileV1Chunker::chunk")?, + "toml" | "json" | "xml" | "groovy" | "go-mod" + => ManifestFileV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::ManifestFileV1Chunker::chunk")?, other => anyhow::bail!("unreachable (chunk): {other}"), }; @@ -2011,6 +2035,135 @@ fn ingest_one_code_asset( }) } +/// p10-2: Build a minimal [`CanonicalDocument`] for Tier 2 code assets +/// (yaml / dockerfile / toml / json / xml / groovy / go-mod) that have +/// no AST extractor. Produces a single `Block::Code` whose source span +/// covers the entire file, mirroring the shape the Tier 1 extractors +/// produce for glue / top-level regions. +fn synthesize_tier2_document( + asset: &RawAsset, + bytes: &[u8], + code_lang: &str, + parser_version: &ParserVersion, +) -> anyhow::Result { + use anyhow::Context as _; + use kebab_core::{ + BlockId, CodeBlock, CommonBlock, Lang, Metadata, Provenance, ProvenanceEvent, + ProvenanceKind, SourceSpan, SourceType, TrustLevel, id_for_block, id_for_doc, + }; + + let text = std::str::from_utf8(bytes) + .with_context(|| format!("tier2 doc not utf-8: {}", asset.workspace_path.0))? 
+ .to_string(); + + let doc_id = id_for_doc(&asset.workspace_path, &asset.asset_id, parser_version); + + let n_lines = text.lines().count().max(1) as u32; + let span = SourceSpan::Code { + line_start: 1, + line_end: n_lines, + symbol: Some("".to_string()), + lang: Some(code_lang.to_string()), + }; + let block_id: BlockId = id_for_block( + &doc_id, + "code", + &[], + 0, + &span, + ); + let block = kebab_core::Block::Code(CodeBlock { + common: CommonBlock { + block_id, + heading_path: vec![], + source_span: span, + }, + lang: Some(code_lang.to_string()), + code: text, + }); + + let now = time::OffsetDateTime::now_utc(); + let events = vec![ + ProvenanceEvent { + at: asset.discovered_at, + agent: "kb-source-fs".to_string(), + kind: ProvenanceKind::Discovered, + note: None, + }, + ProvenanceEvent { + at: now, + agent: "kb-app".to_string(), + kind: ProvenanceKind::Parsed, + note: Some(format!( + "parser_version={}; tier2_synthesized; lang={}", + parser_version.0, code_lang + )), + }, + ]; + + // Resolve abs path for repo detection (mirrors RustAstExtractor pattern). 
+ let workspace_root = std::path::PathBuf::new(); // not needed for detect_repo walk + let abs_path = match &asset.source_uri { + kebab_core::SourceUri::File(p) => p.clone(), + kebab_core::SourceUri::Kb(_) => workspace_root, + }; + let (repo, git_branch, git_commit) = match kebab_parse_code::detect_repo(&abs_path) { + Some(r) => (Some(r.name), r.branch, r.commit), + None => (None, None, None), + }; + + let title = { + let fname = asset.workspace_path.0 + .rsplit('/') + .next() + .unwrap_or(&asset.workspace_path.0); + // strip extension + match fname.rfind('.') { + Some(i) => fname[..i].to_string(), + None => fname.to_string(), + } + }; + + let metadata = Metadata { + aliases: vec![], + tags: vec![], + created_at: asset.discovered_at, + updated_at: asset.discovered_at, + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: serde_json::Map::new(), + repo, + git_branch, + git_commit, + code_lang: Some(code_lang.to_string()), + }; + + tracing::debug!( + target: "kebab-app", + "synthesized tier2 doc_id={} workspace_path={} lang={}", + doc_id.0, + asset.workspace_path.0, + code_lang, + ); + + Ok(kebab_core::CanonicalDocument { + doc_id, + source_asset_id: asset.asset_id.clone(), + workspace_path: asset.workspace_path.clone(), + title, + lang: Lang("und".to_string()), + blocks: vec![block], + metadata, + provenance: Provenance { events }, + parser_version: parser_version.clone(), + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + }) +} + /// Pull the BCP-47 language hint from the canonical document. 
P6-1 /// stamps `Lang("und")` by default; image-pipeline OCR / caption /// adapters special-case "und" so the hint is intentionally dropped diff --git a/crates/kebab-app/tests/code_ingest_smoke.rs b/crates/kebab-app/tests/code_ingest_smoke.rs index 6cffb66..69ac528 100644 --- a/crates/kebab-app/tests/code_ingest_smoke.rs +++ b/crates/kebab-app/tests/code_ingest_smoke.rs @@ -603,6 +603,253 @@ fn kotlin_file_ingests_and_searches_as_code_citation() { ); } +/// p10-2 Task H: a `k8s/deploy.yaml` file with a Deployment resource is +/// ingested and the resulting `Citation::Code` hit must carry +/// `lang="yaml"`, `symbol="Deployment/prod/api"`, and `line_start >= 1`. +/// Exercises the k8s-manifest-resource-v1 chunker end-to-end. +#[test] +fn tier2_k8s_yaml_ingest_searchable() { + let env = TestEnv::lexical_only(); + + let k8s_dir = env.workspace_root.join("k8s"); + std::fs::create_dir_all(&k8s_dir).unwrap(); + std::fs::write( + k8s_dir.join("deploy.yaml"), + "apiVersion: apps/v1\nkind: Deployment\nmetadata:\n name: api\n namespace: prod\nspec:\n replicas: 1\n", + ) + .unwrap(); + + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); + assert_eq!(report.errors, 0, "no ingest errors: {report:?}"); + assert!(report.new >= 1, "yaml file ingested: {report:?}"); + + let yaml_item = report + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("deploy.yaml")) + .expect("deploy.yaml item present"); + assert_eq!( + yaml_item.parser_version.as_ref().map(|p| p.0.as_str()), + Some("none-v1"), + "parser_version must be none-v1" + ); + assert_eq!( + yaml_item.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("k8s-manifest-resource-v1"), + "chunker_version must be k8s-manifest-resource-v1" + ); + + let query = kebab_core::SearchQuery { + text: "api".to_string(), + mode: kebab_core::SearchMode::Lexical, + k: 10, + filters: kebab_core::SearchFilters { + code_lang: 
vec!["yaml".to_string()], + ..Default::default() + }, + }; + let hits = kebab_app::search_with_config(env.config.clone(), query) + .expect("search must succeed"); + + let h = hits + .iter() + .find(|h| matches!(&h.citation, Citation::Code { .. })) + .expect("at least one Citation::Code hit for 'api'"); + + match &h.citation { + Citation::Code { + lang, + symbol, + line_start, + .. + } => { + assert_eq!(lang.as_deref(), Some("yaml"), "citation.lang must be 'yaml'"); + assert_eq!( + symbol.as_deref(), + Some("Deployment/prod/api"), + "citation.symbol must be 'Deployment/prod/api'" + ); + assert!(*line_start >= 1, "line_start must be >=1"); + } + _ => unreachable!(), + } + + assert_eq!( + h.code_lang.as_deref(), + Some("yaml"), + "SearchHit.code_lang must be 'yaml'" + ); +} + +/// p10-2 Task H: a `Dockerfile` is ingested and the resulting +/// `Citation::Code` hit must carry `lang="dockerfile"`, +/// `symbol=""`, and `line_start >= 1`. +/// Exercises the dockerfile-file-v1 chunker end-to-end. 
+#[test] +fn tier2_dockerfile_ingest_searchable() { + let env = TestEnv::lexical_only(); + + std::fs::write( + env.workspace_root.join("Dockerfile"), + "FROM rust:1.94\nRUN cargo install foo\n", + ) + .unwrap(); + + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); + assert_eq!(report.errors, 0, "no ingest errors: {report:?}"); + assert!(report.new >= 1, "Dockerfile ingested: {report:?}"); + + let df_item = report + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("Dockerfile")) + .expect("Dockerfile item present"); + assert_eq!( + df_item.parser_version.as_ref().map(|p| p.0.as_str()), + Some("none-v1"), + "parser_version must be none-v1" + ); + assert_eq!( + df_item.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("dockerfile-file-v1"), + "chunker_version must be dockerfile-file-v1" + ); + + let query = kebab_core::SearchQuery { + text: "cargo".to_string(), + mode: kebab_core::SearchMode::Lexical, + k: 10, + filters: kebab_core::SearchFilters { + code_lang: vec!["dockerfile".to_string()], + ..Default::default() + }, + }; + let hits = kebab_app::search_with_config(env.config.clone(), query) + .expect("search must succeed"); + + let h = hits + .iter() + .find(|h| matches!(&h.citation, Citation::Code { .. })) + .expect("at least one Citation::Code hit for 'cargo'"); + + match &h.citation { + Citation::Code { + lang, + symbol, + line_start, + .. 
+ } => { + assert_eq!( + lang.as_deref(), + Some("dockerfile"), + "citation.lang must be 'dockerfile'" + ); + assert_eq!( + symbol.as_deref(), + Some(""), + "citation.symbol must be ''" + ); + assert!(*line_start >= 1, "line_start must be >=1"); + } + _ => unreachable!(), + } + + assert_eq!( + h.code_lang.as_deref(), + Some("dockerfile"), + "SearchHit.code_lang must be 'dockerfile'" + ); +} + +/// p10-2 Task H: a `Cargo.toml` manifest is ingested and the resulting +/// `Citation::Code` hit must carry `lang="toml"`, `symbol=""`, +/// and `line_start >= 1`. +/// Exercises the manifest-file-v1 chunker end-to-end. +#[test] +fn tier2_cargo_toml_ingest_searchable() { + let env = TestEnv::lexical_only(); + + std::fs::write( + env.workspace_root.join("Cargo.toml"), + "[package]\nname = \"demo\"\nversion = \"0.1.0\"\n", + ) + .unwrap(); + + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); + assert_eq!(report.errors, 0, "no ingest errors: {report:?}"); + assert!(report.new >= 1, "Cargo.toml ingested: {report:?}"); + + let toml_item = report + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("Cargo.toml")) + .expect("Cargo.toml item present"); + assert_eq!( + toml_item.parser_version.as_ref().map(|p| p.0.as_str()), + Some("none-v1"), + "parser_version must be none-v1" + ); + assert_eq!( + toml_item.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("manifest-file-v1"), + "chunker_version must be manifest-file-v1" + ); + + let query = kebab_core::SearchQuery { + text: "demo".to_string(), + mode: kebab_core::SearchMode::Lexical, + k: 10, + filters: kebab_core::SearchFilters { + code_lang: vec!["toml".to_string()], + ..Default::default() + }, + }; + let hits = kebab_app::search_with_config(env.config.clone(), query) + .expect("search must succeed"); + + let h = hits + .iter() + .find(|h| matches!(&h.citation, Citation::Code { .. 
})) + .expect("at least one Citation::Code hit for 'demo'"); + + match &h.citation { + Citation::Code { + lang, + symbol, + line_start, + .. + } => { + assert_eq!( + lang.as_deref(), + Some("toml"), + "citation.lang must be 'toml'" + ); + assert_eq!( + symbol.as_deref(), + Some(""), + "citation.symbol must be ''" + ); + assert!(*line_start >= 1, "line_start must be >=1"); + } + _ => unreachable!(), + } + + assert_eq!( + h.code_lang.as_deref(), + Some("toml"), + "SearchHit.code_lang must be 'toml'" + ); +} + /// Re-ingesting the same `.rs` file without changes must report /// `Unchanged` (incremental-skip path exercised). #[test] diff --git a/crates/kebab-chunk/Cargo.toml b/crates/kebab-chunk/Cargo.toml index be91c3c..a7f9534 100644 --- a/crates/kebab-chunk/Cargo.toml +++ b/crates/kebab-chunk/Cargo.toml @@ -13,6 +13,7 @@ serde_json_canonicalizer = "0.3" blake3 = { workspace = true } anyhow = { workspace = true } tracing = { workspace = true } +serde_yaml = { workspace = true } [dev-dependencies] # kb-parse-md / kb-normalize are dev-only — used by the snapshot integration diff --git a/crates/kebab-chunk/src/dockerfile_file_v1.rs b/crates/kebab-chunk/src/dockerfile_file_v1.rs new file mode 100644 index 0000000..519d1ae --- /dev/null +++ b/crates/kebab-chunk/src/dockerfile_file_v1.rs @@ -0,0 +1,57 @@ +//! p10-2: dockerfile whole-file chunker (Tier 2). +//! +//! Reads entire Dockerfile content and emits a single Chunk with symbol +//! "", code_lang "dockerfile", line range 1..EOF. +//! Oversize >200 lines splits into line-windows sharing the symbol via +//! tier2_shared::push_chunks_with_oversize. 
+ +use crate::tier2_shared::{policy_hash, push_chunks_with_oversize}; +use anyhow::Result; +use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker}; + +pub const VERSION_LABEL: &str = "dockerfile-file-v1"; + +#[derive(Clone, Copy, Debug, Default)] +pub struct DockerfileFileV1Chunker; + +impl Chunker for DockerfileFileV1Chunker { + fn chunker_version(&self) -> ChunkerVersion { + ChunkerVersion(VERSION_LABEL.to_string()) + } + + fn policy_hash(&self, policy: &ChunkPolicy) -> String { + policy_hash(policy) + } + + fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result> { + // Expect a single Block::Code carrying the full Dockerfile text. + let text = match doc.blocks.first() { + Some(Block::Code(cb)) => cb.code.as_str(), + _ => return Ok(vec![]), + }; + + let total_lines = text.lines().count().max(1) as u32; + let mut chunks = Vec::new(); + + push_chunks_with_oversize( + &mut chunks, + doc, + policy, + text, + 1, + total_lines, + "", + "dockerfile", + VERSION_LABEL, + )?; + + tracing::debug!( + target: "kebab-chunk", + doc_id = %doc.doc_id, + chunks = chunks.len(), + "dockerfile-file-v1 chunked", + ); + + Ok(chunks) + } +} diff --git a/crates/kebab-chunk/src/k8s_manifest_resource_v1.rs b/crates/kebab-chunk/src/k8s_manifest_resource_v1.rs new file mode 100644 index 0000000..71a4104 --- /dev/null +++ b/crates/kebab-chunk/src/k8s_manifest_resource_v1.rs @@ -0,0 +1,169 @@ +//! p10-2: k8s manifest resource-aware chunker. +//! +//! Splits a multi-document YAML file on `^---\s*$` boundaries, recognises +//! documents that have both `apiVersion` and `kind` string fields as k8s +//! resources, and emits one `Chunk` per resource (with oversize >200-line +//! fallback). Non-k8s documents are skipped; invalid YAML yields 0 chunks +//! for the entire file. 
+ +use crate::tier2_shared::{policy_hash, push_chunks_with_oversize}; +use anyhow::Result; +use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker}; + +pub const VERSION_LABEL: &str = "k8s-manifest-resource-v1"; + +#[derive(Clone, Copy, Debug, Default)] +pub struct K8sManifestResourceV1Chunker; + +impl Chunker for K8sManifestResourceV1Chunker { + fn chunker_version(&self) -> ChunkerVersion { + ChunkerVersion(VERSION_LABEL.to_string()) + } + + fn policy_hash(&self, policy: &ChunkPolicy) -> String { + policy_hash(policy) + } + + fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result> { + // Expect a single Block::Code carrying the full YAML text. + let text = match doc.blocks.first() { + Some(Block::Code(cb)) => cb.code.as_str(), + _ => return Ok(vec![]), + }; + + let slices = split_yaml_documents(text); + let mut chunks: Vec = Vec::new(); + + for slice in slices { + // Invalid YAML in any document → return 0 chunks for the file. + let value: serde_yaml::Value = match serde_yaml::from_str(slice.text) { + Ok(v) => v, + Err(_) => return Ok(vec![]), + }; + + let Some(mapping) = value.as_mapping() else { + continue; + }; + + let api = mapping + .get("apiVersion") + .and_then(|v| v.as_str()) + .unwrap_or(""); + let kind = mapping + .get("kind") + .and_then(|v| v.as_str()) + .unwrap_or(""); + + // Skip non-k8s documents. 
+ if api.is_empty() || kind.is_empty() { + continue; + } + + let metadata = mapping + .get("metadata") + .and_then(|v| v.as_mapping()); + let name = metadata + .and_then(|m| m.get("name")) + .and_then(|v| v.as_str()) + .unwrap_or(""); + let namespace = metadata + .and_then(|m| m.get("namespace")) + .and_then(|v| v.as_str()); + + let symbol = match namespace { + Some(ns) if !ns.is_empty() => format!("{kind}/{ns}/{name}"), + _ => format!("{kind}/{name}"), + }; + + push_chunks_with_oversize( + &mut chunks, + doc, + policy, + slice.text, + slice.line_start, + slice.line_end, + &symbol, + "yaml", + VERSION_LABEL, + )?; + } + + tracing::debug!( + target: "kebab-chunk", + doc_id = %doc.doc_id, + chunks = chunks.len(), + "k8s-manifest-resource-v1 chunked", + ); + + Ok(chunks) + } +} + +struct YamlSlice<'a> { + text: &'a str, + line_start: u32, + line_end: u32, +} + +/// Split raw YAML text into per-document slices on `---` separator lines. +/// Line numbers are 1-indexed. +fn split_yaml_documents(text: &str) -> Vec> { + let lines: Vec<&str> = text.lines().collect(); + + // Collect indices of separator lines (0-based), then append a sentinel at + // the end so the last slice is always terminated. 
+ let mut separators: Vec = lines + .iter() + .enumerate() + .filter_map(|(i, l)| { + let trimmed = l.trim_end(); + if trimmed == "---" + || trimmed.starts_with("--- ") + || trimmed.starts_with("---\t") + { + Some(i) + } else { + None + } + }) + .collect(); + separators.push(lines.len()); + + let mut slices: Vec> = Vec::new(); + let mut doc_start_line: usize = 0; // 0-based index of current doc start + + for sep_line in separators { + if sep_line > doc_start_line { + let start_byte = byte_offset_of_line(text, doc_start_line); + let end_byte = byte_offset_of_line(text, sep_line); + let slice_text = &text[start_byte..end_byte]; + if !slice_text.trim().is_empty() { + slices.push(YamlSlice { + text: slice_text, + line_start: (doc_start_line + 1) as u32, + line_end: sep_line as u32, + }); + } + } + doc_start_line = sep_line + 1; + } + + slices +} + +/// Return the byte offset of the start of `line_idx` (0-based line index). +fn byte_offset_of_line(text: &str, line_idx: usize) -> usize { + if line_idx == 0 { + return 0; + } + let mut count = 0usize; + for (i, c) in text.char_indices() { + if c == '\n' { + count += 1; + if count == line_idx { + return i + 1; + } + } + } + text.len() +} diff --git a/crates/kebab-chunk/src/lib.rs b/crates/kebab-chunk/src/lib.rs index 750d18e..9b65e05 100644 --- a/crates/kebab-chunk/src/lib.rs +++ b/crates/kebab-chunk/src/lib.rs @@ -24,6 +24,10 @@ mod code_rust_ast_v1; mod code_ts_ast_v1; mod md_heading_v1; mod pdf_page_v1; +mod tier2_shared; +pub mod k8s_manifest_resource_v1; +pub mod dockerfile_file_v1; +pub mod manifest_file_v1; pub use code_go_ast_v1::CodeGoAstV1Chunker; pub use code_java_ast_v1::CodeJavaAstV1Chunker; @@ -34,3 +38,6 @@ pub use code_rust_ast_v1::CodeRustAstV1Chunker; pub use code_ts_ast_v1::CodeTsAstV1Chunker; pub use md_heading_v1::MdHeadingV1Chunker; pub use pdf_page_v1::PdfPageV1Chunker; +pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker; +pub use dockerfile_file_v1::DockerfileFileV1Chunker; +pub use 
manifest_file_v1::ManifestFileV1Chunker; diff --git a/crates/kebab-chunk/src/manifest_file_v1.rs b/crates/kebab-chunk/src/manifest_file_v1.rs new file mode 100644 index 0000000..1e859e0 --- /dev/null +++ b/crates/kebab-chunk/src/manifest_file_v1.rs @@ -0,0 +1,58 @@ +//! p10-2: manifest whole-file chunker (Tier 2). +//! +//! Reads entire manifest file (Cargo.toml / package.json / pom.xml / go.mod / +//! build.gradle / pyproject.toml / tsconfig.json) and emits a single Chunk +//! with symbol "", code_lang read from Block::Code.lang, line range +//! 1..EOF. Oversize >200 lines splits into line-windows sharing the symbol via +//! tier2_shared::push_chunks_with_oversize. + +use crate::tier2_shared::{policy_hash, push_chunks_with_oversize}; +use anyhow::Result; +use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker}; + +pub const VERSION_LABEL: &str = "manifest-file-v1"; + +#[derive(Clone, Copy, Debug, Default)] +pub struct ManifestFileV1Chunker; + +impl Chunker for ManifestFileV1Chunker { + fn chunker_version(&self) -> ChunkerVersion { + ChunkerVersion(VERSION_LABEL.to_string()) + } + + fn policy_hash(&self, policy: &ChunkPolicy) -> String { + policy_hash(policy) + } + + fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result> { + // Expect a single Block::Code carrying the full manifest text. 
+ let (text, lang) = match doc.blocks.first() { + Some(Block::Code(cb)) => (cb.code.as_str(), cb.lang.as_deref().unwrap_or("")), + _ => return Ok(vec![]), + }; + + let total_lines = text.lines().count().max(1) as u32; + let mut chunks = Vec::new(); + + push_chunks_with_oversize( + &mut chunks, + doc, + policy, + text, + 1, + total_lines, + "", + lang, + VERSION_LABEL, + )?; + + tracing::debug!( + target: "kebab-chunk", + doc_id = %doc.doc_id, + chunks = chunks.len(), + "manifest-file-v1 chunked", + ); + + Ok(chunks) + } +} diff --git a/crates/kebab-chunk/src/tier2_shared.rs b/crates/kebab-chunk/src/tier2_shared.rs new file mode 100644 index 0000000..f52173c --- /dev/null +++ b/crates/kebab-chunk/src/tier2_shared.rs @@ -0,0 +1,142 @@ +//! p10-2: Tier 2 chunker shared helpers (oversize fallback + Chunk build). +//! +//! Mirrors `code_rust_ast_v1`'s Chunk-construction pattern exactly so that +//! id / hashes / token-count / ChunkPolicy semantics stay identical across +//! Tier 1 (AST) and Tier 2 (resource-aware) chunkers. + +use anyhow::Result; +use kebab_core::{ + BlockId, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, DocumentId, SourceSpan, + id_for_chunk, +}; + +pub(crate) const AST_CHUNK_MAX_LINES: u32 = 200; +const BYTES_PER_TOKEN: usize = 3; +const POLICY_HASH_HEX_LEN: usize = 16; + +/// Compute the policy hash the same way `code_rust_ast_v1` does. +pub(crate) fn policy_hash(policy: &ChunkPolicy) -> String { + let bytes = serde_json_canonicalizer::to_vec(policy) + .expect("canonical JSON serialization of ChunkPolicy must not fail"); + let hex = blake3::hash(&bytes).to_hex().to_string(); + hex[..POLICY_HASH_HEX_LEN].to_string() +} + +/// Emit one chunk for `(text, line_start..=line_end, symbol, lang)`, splitting +/// into line-windows of at most `AST_CHUNK_MAX_LINES` if the slice is oversize. +/// Mirrors the oversize path in `code_rust_ast_v1`'s `chunk` impl. 
+#[allow(clippy::too_many_arguments)] +pub(crate) fn push_chunks_with_oversize( + out: &mut Vec, + doc: &CanonicalDocument, + policy: &ChunkPolicy, + text: &str, + line_start: u32, + line_end: u32, + symbol: &str, + lang: &str, + chunker_version: &str, +) -> Result<()> { + let n_lines = (line_end - line_start + 1).max(1); + let cv = ChunkerVersion(chunker_version.to_string()); + let base_policy_hash = policy_hash(policy); + + if n_lines <= AST_CHUNK_MAX_LINES { + out.push(build_chunk( + doc, + &cv, + &base_policy_hash, + text, + line_start, + line_end, + symbol, + lang, + None, + )); + return Ok(()); + } + + let lines: Vec<&str> = text.lines().collect(); + let total = lines.len(); + let mut window_start = line_start; + let mut i = 0usize; + while i < total { + let take = (AST_CHUNK_MAX_LINES as usize).min(total - i); + let window_text = lines[i..i + take].join("\n"); + let window_end = window_start + take as u32 - 1; + out.push(build_chunk( + doc, + &cv, + &base_policy_hash, + &window_text, + window_start, + window_end, + symbol, + lang, + Some(window_start), + )); + i += take; + window_start = window_end + 1; + } + Ok(()) +} + +/// Build a single `Chunk`, mirroring `make_chunk` in `code_rust_ast_v1.rs` +/// exactly (same id recipe, same token estimate, same field set). +/// +/// `split_key` is `Some(line_start_of_window)` for oversize splits, `None` +/// for normal single-chunk emission. Mirrors the `Some(part_ls)` / `None` +/// split_key pattern in 1A-2. 
+#[allow(clippy::too_many_arguments)] +fn build_chunk( + doc: &CanonicalDocument, + chunker_version: &ChunkerVersion, + base_policy_hash: &str, + text: &str, + line_start: u32, + line_end: u32, + symbol: &str, + lang: &str, + split_key: Option, +) -> Chunk { + let span = SourceSpan::Code { + line_start, + line_end, + symbol: Some(symbol.to_string()), + lang: Some(lang.to_string()), + }; + + // id_hash mirrors code_rust_ast_v1's make_chunk logic: + // split_key Some(k) => "{base_policy_hash}#L{k}" + // split_key None => base_policy_hash + let id_hash = match split_key { + Some(k) => format!("{base_policy_hash}#L{k}"), + None => base_policy_hash.to_string(), + }; + + // block_ids: Tier 2 chunkers have no per-block structure (the whole file + // is one Block::Code), so we pass an empty slice — same as using the doc- + // level slice without explicit block granularity. + let block_ids: Vec = vec![]; + + let chunk_id = id_for_chunk( + &DocumentId(doc.doc_id.0.clone()), + chunker_version, + &block_ids, + &id_hash, + ); + + let token_estimate = text.len().div_ceil(BYTES_PER_TOKEN); + + Chunk { + chunk_id, + doc_id: DocumentId(doc.doc_id.0.clone()), + block_ids, + text: text.to_string(), + heading_path: Vec::new(), + source_spans: vec![span], + token_estimate, + chunker_version: chunker_version.clone(), + policy_hash: base_policy_hash.to_string(), + } +} diff --git a/crates/kebab-chunk/tests/dockerfile_file_v1.rs b/crates/kebab-chunk/tests/dockerfile_file_v1.rs new file mode 100644 index 0000000..44dd94a --- /dev/null +++ b/crates/kebab-chunk/tests/dockerfile_file_v1.rs @@ -0,0 +1,134 @@ +//! Behavioural tests for `DockerfileFileV1Chunker`. +//! +//! Documents are constructed manually (no kebab-parse-code dependency) by +//! placing the raw Dockerfile text into a single `Block::Code`, mirroring the +//! pattern used in `k8s_manifest_resource_v1.rs`. 
+ +use std::path::PathBuf; + +use kebab_chunk::DockerfileFileV1Chunker; +use kebab_core::{ + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, +}; +use time::OffsetDateTime; + +// ── helpers ────────────────────────────────────────────────────────────────── + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") +} + +/// Build a `CanonicalDocument` with a single `Block::Code` containing `dockerfile_text`. +fn dockerfile_doc(dockerfile_text: &str) -> CanonicalDocument { + let wp = WorkspacePath("build/Dockerfile".into()); + let aid = AssetId("d".repeat(64)); + let pv = ParserVersion("code-dockerfile-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + + let line_count = dockerfile_text.lines().count() as u32; + let span = SourceSpan::Code { + line_start: 1, + line_end: line_count.max(1), + symbol: None, + lang: Some("dockerfile".into()), + }; + let bid = id_for_block(&doc_id, "code", &[], 0, &span); + let block = Block::Code(CodeBlock { + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, + lang: Some("dockerfile".into()), + code: dockerfile_text.to_string(), + }); + + CanonicalDocument { + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "Dockerfile".into(), + lang: Lang("und".into()), + blocks: vec![block], + metadata: Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("dockerfile".into()), + }, + 
provenance: Provenance { events: vec![] }, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + } +} + +fn policy() -> ChunkPolicy { + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion("dockerfile-file-v1".into()), + } +} + +// ── tests ───────────────────────────────────────────────────────────────────── + +/// A simple 5-line Dockerfile fixture must emit exactly 1 chunk with the +/// correct symbol, lang, and line range. +#[test] +fn dockerfile_emits_single_chunk() { + let fixture_path = fixtures_dir().join("sample.dockerfile"); + let text = std::fs::read_to_string(&fixture_path) + .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display())); + + let doc = dockerfile_doc(&text); + let chunks = DockerfileFileV1Chunker + .chunk(&doc, &policy()) + .expect("chunk"); + + assert_eq!( + chunks.len(), + 1, + "expected 1 chunk, got {}: {chunks:#?}", + chunks.len() + ); + + // Inspect the Chunk's source_spans for symbol / lang / line range. + let span = chunks[0].source_spans.first().expect("at least one span"); + match span { + SourceSpan::Code { + line_start, + line_end, + symbol, + lang, + } => { + assert_eq!(*line_start, 1, "line_start must be 1"); + assert_eq!(*line_end, 5, "line_end must be 5 (5-line fixture)"); + assert_eq!( + symbol.as_deref(), + Some(""), + "symbol must be ''" + ); + assert_eq!(lang.as_deref(), Some("dockerfile"), "lang must be 'dockerfile'"); + } + other => panic!("expected SourceSpan::Code, got {other:?}"), + } + + // Verify chunker_version label. 
+ assert_eq!(chunks[0].chunker_version.0, "dockerfile-file-v1"); +} diff --git a/crates/kebab-chunk/tests/fixtures/sample.dockerfile b/crates/kebab-chunk/tests/fixtures/sample.dockerfile new file mode 100644 index 0000000..94352b8 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/sample.dockerfile @@ -0,0 +1,5 @@ +FROM rust:1.94-slim AS builder +WORKDIR /app +COPY . . +RUN cargo build --release +CMD ["/app/target/release/kebab"] diff --git a/crates/kebab-chunk/tests/fixtures/sample_cargo.toml b/crates/kebab-chunk/tests/fixtures/sample_cargo.toml new file mode 100644 index 0000000..cae4a85 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/sample_cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "demo" +version = "0.1.0" +edition = "2021" + +[dependencies] +serde = "1" diff --git a/crates/kebab-chunk/tests/fixtures/sample_go.mod b/crates/kebab-chunk/tests/fixtures/sample_go.mod new file mode 100644 index 0000000..b2af9d4 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/sample_go.mod @@ -0,0 +1,5 @@ +module example.com/demo + +go 1.22 + +require github.com/spf13/cobra v1.8.0 diff --git a/crates/kebab-chunk/tests/fixtures/sample_k8s.yaml b/crates/kebab-chunk/tests/fixtures/sample_k8s.yaml new file mode 100644 index 0000000..b7f61f0 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/sample_k8s.yaml @@ -0,0 +1,34 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api-server + namespace: prod +spec: + replicas: 3 + selector: + matchLabels: + app: api-server + template: + metadata: + labels: + app: api-server + spec: + containers: + - name: api + image: example/api:1.2.3 +--- +apiVersion: v1 +kind: Service +metadata: + name: api-server + namespace: prod +spec: + selector: + app: api-server + ports: + - port: 80 + targetPort: 8080 +--- +# Non-k8s document — apiVersion missing +kind: ClusterIP +foo: bar diff --git a/crates/kebab-chunk/tests/fixtures/sample_package.json b/crates/kebab-chunk/tests/fixtures/sample_package.json new file mode 100644 index 
0000000..a84f652 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/sample_package.json @@ -0,0 +1,7 @@ +{ + "name": "demo", + "version": "0.1.0", + "dependencies": { + "react": "^18.0.0" + } +} diff --git a/crates/kebab-chunk/tests/fixtures/sample_pom.xml b/crates/kebab-chunk/tests/fixtures/sample_pom.xml new file mode 100644 index 0000000..c6dd05d --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/sample_pom.xml @@ -0,0 +1,7 @@ + + + 4.0.0 + com.demo + demo + 0.1.0 + diff --git a/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs b/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs new file mode 100644 index 0000000..42625a0 --- /dev/null +++ b/crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs @@ -0,0 +1,205 @@ +//! Behavioural tests for `K8sManifestResourceV1Chunker`. +//! +//! Documents are constructed manually (no kebab-parse-code dependency) by +//! placing the raw YAML text into a single `Block::Code`, mirroring the +//! pattern used in `code_rust_ast_snapshot.rs`. + +use std::path::PathBuf; + +use kebab_chunk::K8sManifestResourceV1Chunker; +use kebab_core::{ + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, +}; +use time::OffsetDateTime; + +// ── helpers ────────────────────────────────────────────────────────────────── + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") +} + +/// Build a `CanonicalDocument` with a single `Block::Code` containing `yaml_text`. 
+fn yaml_doc(yaml_text: &str) -> CanonicalDocument { + let wp = WorkspacePath("manifests/deploy.yaml".into()); + let aid = AssetId("c".repeat(64)); + let pv = ParserVersion("code-yaml-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + + let line_count = yaml_text.lines().count() as u32; + let span = SourceSpan::Code { + line_start: 1, + line_end: line_count.max(1), + symbol: None, + lang: Some("yaml".into()), + }; + let bid = id_for_block(&doc_id, "code", &[], 0, &span); + let block = Block::Code(CodeBlock { + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, + lang: Some("yaml".into()), + code: yaml_text.to_string(), + }); + + CanonicalDocument { + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "deploy.yaml".into(), + lang: Lang("und".into()), + blocks: vec![block], + metadata: Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some("yaml".into()), + }, + provenance: Provenance { events: vec![] }, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + } +} + +fn policy() -> ChunkPolicy { + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion("k8s-manifest-resource-v1".into()), + } +} + +// ── tests ───────────────────────────────────────────────────────────────────── + +/// Three YAML documents: 2 valid k8s resources + 1 non-k8s (no apiVersion). +/// The chunker must emit exactly 2 chunks with the correct symbols and lang. 
+#[test] +fn k8s_multi_doc_emits_one_chunk_per_resource() { + let fixture_path = fixtures_dir().join("sample_k8s.yaml"); + let text = std::fs::read_to_string(&fixture_path) + .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display())); + + let doc = yaml_doc(&text); + let chunks = K8sManifestResourceV1Chunker + .chunk(&doc, &policy()) + .expect("chunk"); + + assert_eq!( + chunks.len(), + 2, + "expected 2 k8s chunks, got {}: {chunks:#?}", + chunks.len() + ); + + let symbols: Vec<&str> = chunks + .iter() + .map(|c| { + match &c.source_spans[0] { + SourceSpan::Code { symbol, .. } => { + symbol.as_deref().expect("symbol must be Some for k8s chunks") + } + other => panic!("expected Code span, got {other:?}"), + } + }) + .collect(); + + assert_eq!( + symbols, + vec!["Deployment/prod/api-server", "Service/prod/api-server"], + "symbols mismatch: {symbols:?}" + ); + + // Verify lang = "yaml" on every chunk. + for chunk in &chunks { + match &chunk.source_spans[0] { + SourceSpan::Code { lang, .. } => { + assert_eq!(lang.as_deref(), Some("yaml"), "lang must be 'yaml'"); + } + other => panic!("expected Code span, got {other:?}"), + } + } + + // Verify chunker_version label. + for chunk in &chunks { + assert_eq!(chunk.chunker_version.0, "k8s-manifest-resource-v1"); + } +} + +/// A YAML document with an indentation error (tab in a space-indented context) +/// must cause the chunker to return 0 chunks for the entire file. +#[test] +fn k8s_invalid_yaml_emits_zero_chunks() { + // serde_yaml 0.9 is lenient about duplicate keys (last wins), so use a + // genuine YAML structural error (unclosed flow sequence) to force a parse + // failure. 
+ let actually_bad = "apiVersion: v1\nkind: Service\nfoo: [\nbar\n"; + + let doc = yaml_doc(actually_bad); + let chunks = K8sManifestResourceV1Chunker + .chunk(&doc, &policy()) + .expect("chunk should not error — return Ok(vec![]) for invalid yaml"); + + assert_eq!( + chunks.len(), + 0, + "invalid YAML must yield 0 chunks, got {}: {chunks:#?}", + chunks.len() + ); +} + +/// A cluster-scoped resource (no `metadata.namespace`) must produce a symbol +/// of the form `/` (two components, no namespace segment). +#[test] +fn k8s_cluster_scoped_resource_symbol() { + let yaml = "\ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: cluster-admin +rules: +- apiGroups: [\"*\"] + resources: [\"*\"] + verbs: [\"*\"] +"; + + let doc = yaml_doc(yaml); + let chunks = K8sManifestResourceV1Chunker + .chunk(&doc, &policy()) + .expect("chunk"); + + assert_eq!( + chunks.len(), + 1, + "expected 1 chunk for cluster-scoped resource, got {}: {chunks:#?}", + chunks.len() + ); + + match &chunks[0].source_spans[0] { + SourceSpan::Code { symbol, lang, .. } => { + assert_eq!( + symbol.as_deref(), + Some("ClusterRole/cluster-admin"), + "cluster-scoped symbol must be /" + ); + assert_eq!(lang.as_deref(), Some("yaml")); + } + other => panic!("expected Code span, got {other:?}"), + } +} diff --git a/crates/kebab-chunk/tests/manifest_file_v1.rs b/crates/kebab-chunk/tests/manifest_file_v1.rs new file mode 100644 index 0000000..297c563 --- /dev/null +++ b/crates/kebab-chunk/tests/manifest_file_v1.rs @@ -0,0 +1,267 @@ +//! Behavioural tests for `ManifestFileV1Chunker`. +//! +//! Documents are constructed manually (no kebab-parse-code dependency) by +//! placing the raw manifest text into a single `Block::Code`, mirroring the +//! pattern used in `dockerfile_file_v1.rs`. 
+ +use std::path::PathBuf; + +use kebab_chunk::ManifestFileV1Chunker; +use kebab_core::{ + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, +}; +use time::OffsetDateTime; + +// ── helpers ────────────────────────────────────────────────────────────────── + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") +} + +/// Build a `CanonicalDocument` with a single `Block::Code` containing manifest text. +fn manifest_doc(lang: &str, manifest_text: &str) -> CanonicalDocument { + let wp = WorkspacePath(format!("build/{}", manifest_filename(lang))); + let aid = AssetId("m".repeat(64)); + let pv = ParserVersion("code-manifest-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + + let line_count = manifest_text.lines().count() as u32; + let span = SourceSpan::Code { + line_start: 1, + line_end: line_count.max(1), + symbol: None, + lang: Some(lang.into()), + }; + let bid = id_for_block(&doc_id, "code", &[], 0, &span); + let block = Block::Code(CodeBlock { + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, + lang: Some(lang.into()), + code: manifest_text.to_string(), + }); + + CanonicalDocument { + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: format!("Manifest ({})", lang), + lang: Lang("und".into()), + blocks: vec![block], + metadata: Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some(lang.into()), + }, + 
provenance: Provenance { events: vec![] }, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + } +} + +fn manifest_filename(lang: &str) -> &'static str { + match lang { + "toml" => "Cargo.toml", + "json" => "package.json", + "xml" => "pom.xml", + "go-mod" => "go.mod", + _ => "manifest", + } +} + +fn policy() -> ChunkPolicy { + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion("manifest-file-v1".into()), + } +} + +// ── tests ───────────────────────────────────────────────────────────────────── + +/// A Cargo.toml fixture must emit exactly 1 chunk with the correct symbol, +/// lang, and line range. +#[test] +fn cargo_toml_single_chunk_with_toml_lang() { + let fixture_path = fixtures_dir().join("sample_cargo.toml"); + let text = std::fs::read_to_string(&fixture_path) + .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display())); + + let doc = manifest_doc("toml", &text); + let chunks = ManifestFileV1Chunker + .chunk(&doc, &policy()) + .expect("chunk"); + + assert_eq!( + chunks.len(), + 1, + "expected 1 chunk, got {}: {chunks:#?}", + chunks.len() + ); + + let span = chunks[0].source_spans.first().expect("at least one span"); + match span { + SourceSpan::Code { + line_start, + line_end: _, + symbol, + lang, + } => { + assert_eq!(*line_start, 1, "line_start must be 1"); + assert_eq!( + symbol.as_deref(), + Some(""), + "symbol must be ''" + ); + assert_eq!(lang.as_deref(), Some("toml"), "lang must be 'toml'"); + } + other => panic!("expected SourceSpan::Code, got {other:?}"), + } + + assert_eq!(chunks[0].chunker_version.0, "manifest-file-v1"); +} + +/// A package.json fixture must emit exactly 1 chunk with the correct symbol, +/// lang, and line range. 
+#[test] +fn package_json_single_chunk_with_json_lang() { + let fixture_path = fixtures_dir().join("sample_package.json"); + let text = std::fs::read_to_string(&fixture_path) + .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display())); + + let doc = manifest_doc("json", &text); + let chunks = ManifestFileV1Chunker + .chunk(&doc, &policy()) + .expect("chunk"); + + assert_eq!( + chunks.len(), + 1, + "expected 1 chunk, got {}: {chunks:#?}", + chunks.len() + ); + + let span = chunks[0].source_spans.first().expect("at least one span"); + match span { + SourceSpan::Code { + line_start, + line_end: _, + symbol, + lang, + } => { + assert_eq!(*line_start, 1, "line_start must be 1"); + assert_eq!( + symbol.as_deref(), + Some(""), + "symbol must be ''" + ); + assert_eq!(lang.as_deref(), Some("json"), "lang must be 'json'"); + } + other => panic!("expected SourceSpan::Code, got {other:?}"), + } + + assert_eq!(chunks[0].chunker_version.0, "manifest-file-v1"); +} + +/// A pom.xml fixture must emit exactly 1 chunk with the correct symbol, +/// lang, and line range. 
+#[test] +fn pom_xml_single_chunk_with_xml_lang() { + let fixture_path = fixtures_dir().join("sample_pom.xml"); + let text = std::fs::read_to_string(&fixture_path) + .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display())); + + let doc = manifest_doc("xml", &text); + let chunks = ManifestFileV1Chunker + .chunk(&doc, &policy()) + .expect("chunk"); + + assert_eq!( + chunks.len(), + 1, + "expected 1 chunk, got {}: {chunks:#?}", + chunks.len() + ); + + let span = chunks[0].source_spans.first().expect("at least one span"); + match span { + SourceSpan::Code { + line_start, + line_end: _, + symbol, + lang, + } => { + assert_eq!(*line_start, 1, "line_start must be 1"); + assert_eq!( + symbol.as_deref(), + Some(""), + "symbol must be ''" + ); + assert_eq!(lang.as_deref(), Some("xml"), "lang must be 'xml'"); + } + other => panic!("expected SourceSpan::Code, got {other:?}"), + } + + assert_eq!(chunks[0].chunker_version.0, "manifest-file-v1"); +} + +/// A go.mod fixture must emit exactly 1 chunk with the correct symbol, +/// lang, and line range. 
+#[test] +fn go_mod_single_chunk_with_go_mod_lang() { + let fixture_path = fixtures_dir().join("sample_go.mod"); + let text = std::fs::read_to_string(&fixture_path) + .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display())); + + let doc = manifest_doc("go-mod", &text); + let chunks = ManifestFileV1Chunker + .chunk(&doc, &policy()) + .expect("chunk"); + + assert_eq!( + chunks.len(), + 1, + "expected 1 chunk, got {}: {chunks:#?}", + chunks.len() + ); + + let span = chunks[0].source_spans.first().expect("at least one span"); + match span { + SourceSpan::Code { + line_start, + line_end: _, + symbol, + lang, + } => { + assert_eq!(*line_start, 1, "line_start must be 1"); + assert_eq!( + symbol.as_deref(), + Some(""), + "symbol must be ''" + ); + assert_eq!(lang.as_deref(), Some("go-mod"), "lang must be 'go-mod'"); + } + other => panic!("expected SourceSpan::Code, got {other:?}"), + } + + assert_eq!(chunks[0].chunker_version.0, "manifest-file-v1"); +} diff --git a/crates/kebab-parse-code/src/lang.rs b/crates/kebab-parse-code/src/lang.rs index 19fbb38..9f974a2 100644 --- a/crates/kebab-parse-code/src/lang.rs +++ b/crates/kebab-parse-code/src/lang.rs @@ -10,18 +10,39 @@ use std::path::Path; /// `None` if the extension / filename is not recognized. /// /// Matching priority: -/// 1. exact filename match (e.g. `Dockerfile`, `Makefile`) -/// 2. lowercase extension match +/// 1. Tier 1 basename exact match (e.g. `Dockerfile`, `Makefile`) +/// 2. Tier 2 basename match (e.g. `Cargo.toml`, `package.json`, `build.gradle`) +/// 3. Tier 2 `Dockerfile.*` prefix variant +/// 4. 
Tier 1 + Tier 2 extension fallback (lowercase) pub fn code_lang_for_path(path: &Path) -> Option<&'static str> { if let Some(name) = path.file_name().and_then(|n| n.to_str()) { + // Tier 1 basename exact match match name { "Dockerfile" => return Some("dockerfile"), "Makefile" | "GNUmakefile" => return Some("make"), _ => {} } + + // Tier 2 basename match (configuration / manifest files) + match name { + "Cargo.toml" | "pyproject.toml" => return Some("toml"), + "package.json" | "tsconfig.json" => return Some("json"), + "go.mod" => return Some("go-mod"), + "pom.xml" => return Some("xml"), + "build.gradle" => return Some("groovy"), + _ => {} + } + + // Tier 2: `Dockerfile.*` prefix variant (e.g. `Dockerfile.dev`, `Dockerfile.prod`) + if name.starts_with("Dockerfile.") && name.len() > "Dockerfile.".len() { + return Some("dockerfile"); + } } + + // Extension fallback (Tier 1 + Tier 2) let ext = path.extension()?.to_str()?.to_ascii_lowercase(); match ext.as_str() { + // Tier 1 extensions "rs" => Some("rust"), "py" | "pyi" => Some("python"), "ts" | "tsx" | "mts" | "cts" => Some("typescript"), @@ -31,12 +52,15 @@ pub fn code_lang_for_path(path: &Path) -> Option<&'static str> { "kt" | "kts" => Some("kotlin"), "c" | "h" => Some("c"), "cpp" | "cc" | "cxx" | "hpp" | "hh" | "hxx" => Some("cpp"), + "sh" | "bash" | "zsh" => Some("shell"), + "mk" => Some("make"), + // Tier 2 extensions "yaml" | "yml" => Some("yaml"), "toml" => Some("toml"), "json" => Some("json"), - "sh" | "bash" | "zsh" => Some("shell"), - "mk" => Some("make"), + "xml" => Some("xml"), "dockerfile" => Some("dockerfile"), + "gradle" => Some("groovy"), _ => None, } } @@ -118,4 +142,28 @@ mod tests { assert_eq!(module_path_for_tsjs("a/b/c.ts"), "a/b/c"); assert_eq!(module_path_for_tsjs("packages/x/src/Foo.ts"), "packages/x/src/Foo"); } + + #[test] + fn tier2_basename_takes_precedence_over_extension() { + assert_eq!(code_lang_for_path(Path::new("Dockerfile")), Some("dockerfile")); + 
assert_eq!(code_lang_for_path(Path::new("foo/Dockerfile.dev")), Some("dockerfile")); + assert_eq!(code_lang_for_path(Path::new("myapp.dockerfile")), Some("dockerfile")); + assert_eq!(code_lang_for_path(Path::new("repo/Cargo.toml")), Some("toml")); + assert_eq!(code_lang_for_path(Path::new("pyproject.toml")), Some("toml")); + assert_eq!(code_lang_for_path(Path::new("repo/package.json")), Some("json")); + assert_eq!(code_lang_for_path(Path::new("tsconfig.json")), Some("json")); + assert_eq!(code_lang_for_path(Path::new("go.mod")), Some("go-mod")); + assert_eq!(code_lang_for_path(Path::new("pom.xml")), Some("xml")); + assert_eq!(code_lang_for_path(Path::new("build.gradle")), Some("groovy")); + } + + #[test] + fn tier2_extension_fallback() { + assert_eq!(code_lang_for_path(Path::new("k8s/deploy.yaml")), Some("yaml")); + assert_eq!(code_lang_for_path(Path::new("k8s/deploy.yml")), Some("yaml")); + assert_eq!(code_lang_for_path(Path::new("foo/bar.toml")), Some("toml")); + assert_eq!(code_lang_for_path(Path::new("foo/bar.json")), Some("json")); + assert_eq!(code_lang_for_path(Path::new("foo/bar.xml")), Some("xml")); + assert_eq!(code_lang_for_path(Path::new("foo/bar.gradle")), Some("groovy")); + } } diff --git a/crates/kebab-source-fs/src/media.rs b/crates/kebab-source-fs/src/media.rs index 3f9e0c6..8dfc2df 100644 --- a/crates/kebab-source-fs/src/media.rs +++ b/crates/kebab-source-fs/src/media.rs @@ -12,6 +12,12 @@ use kebab_core::{AudioType, ImageType, MediaType}; /// `MediaType::Image(_)` / `MediaType::Audio(_)`. Anything else (including /// missing extension) → `MediaType::Other(ext)`. pub(crate) fn media_type_for(path: &Path) -> MediaType { + // p10-2: code_lang_for_path is the single source of truth for code lang + // (design §3.5). Delegate before falling back to extension branches. 
+ if let Some(lang) = kebab_parse_code::code_lang_for_path(path) { + return MediaType::Code(lang.to_string()); + } + let ext = path .extension() .and_then(|s| s.to_str()) @@ -36,23 +42,6 @@ pub(crate) fn media_type_for(path: &Path) -> MediaType { "flac" => MediaType::Audio(AudioType::Flac), "ogg" => MediaType::Audio(AudioType::Ogg), - // p10-1A-2: Rust is the only code lang activated in 1A. Other - // recognized code langs stay Other until their phase (1B+). - "rs" => MediaType::Code("rust".to_string()), - - // p10-1B: Python / TS / JS AST chunkers active. - "py" | "pyi" => MediaType::Code("python".into()), - // .mts / .cts are TypeScript ESM / CommonJS variants — same grammar. - "ts" | "tsx" | "mts" | "cts" => MediaType::Code("typescript".into()), - "js" | "mjs" | "cjs" | "jsx" => MediaType::Code("javascript".into()), - - // p10-1C-Go: Go ingest activated. - "go" => MediaType::Code("go".into()), - - // p10-1C-JK: JVM family (Java + Kotlin) ingest activated. - "java" => MediaType::Code("java".into()), - "kt" | "kts" => MediaType::Code("kotlin".into()), - // Empty string (no extension) and any other extension: bucket as // Other and let downstream extractors decide if they support it. 
_ => MediaType::Other(ext), @@ -96,7 +85,8 @@ mod tests { media_type_for(Path::new("crates/kebab-core/src/lib.rs")), MediaType::Code("rust".to_string()) ); - assert_eq!(media_type_for(Path::new("Cargo.toml")), MediaType::Other("toml".to_string())); + // Cargo.toml is a Tier 2 code manifest (p10-2), handled by code_lang_for_path + assert_eq!(media_type_for(Path::new("Cargo.toml")), MediaType::Code("toml".to_string())); } #[test] @@ -149,4 +139,14 @@ mod tests { MediaType::Other(String::new()) ); } + + #[test] + fn tier2_files_map_to_media_code() { + assert_eq!(media_type_for(Path::new("a/deploy.yaml")), MediaType::Code("yaml".into())); + assert_eq!(media_type_for(Path::new("a/Dockerfile")), MediaType::Code("dockerfile".into())); + assert_eq!(media_type_for(Path::new("a/Cargo.toml")), MediaType::Code("toml".into())); + assert_eq!(media_type_for(Path::new("a/pom.xml")), MediaType::Code("xml".into())); + assert_eq!(media_type_for(Path::new("a/build.gradle")), MediaType::Code("groovy".into())); + assert_eq!(media_type_for(Path::new("a/go.mod")), MediaType::Code("go-mod".into())); + } } diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index beafdec..cc1c391 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -22,7 +22,7 @@ Cargo workspace, 함수 호출 기반 모듈러 모놀리스. UI binary (`kebab- | OCR | Ollama vision LM (default `gemma4:e4b`) — `OcrEngine` trait 으로 Tesseract / Apple Vision 등 future swap (HOTFIXES P6-2) | | Image caption | Ollama vision LM, runtime gate `image.caption.enabled` (default OFF) | | PDF parser | `lopdf` per-page 텍스트, `chunker_version = "pdf-page-v1"` 가 PDF 자산에 하드코딩 (HOTFIXES P7-3) | -| code parser | `tree-sitter` + `tree-sitter-rust` / `tree-sitter-python` / `tree-sitter-typescript` / `tree-sitter-javascript` / `tree-sitter-go` / `tree-sitter-java` / `tree-sitter-kotlin-ng` — **parser-side** (`kebab-parse-code`), chunker-side 아님 (design §6.3). 
chunker versions: Rust = `code-rust-ast-v1`, Python = `code-python-ast-v1`, TypeScript = `code-ts-ast-v1`, JavaScript = `code-js-ast-v1`, Go = `code-go-ast-v1`, Java = `code-java-ast-v1`, Kotlin = `code-kotlin-ast-v1`. `ast_chunk_max_lines = 200` 상수 고정 (HOTFIXES 2026-05-19 — Chunker trait 이 per-medium config 미노출). Kotlin grammar 은 `tree-sitter-kotlin-ng` 사용 — bare `tree-sitter-kotlin` 은 tree-sitter 0.21–0.23 에 고착되어 있어 사용 불가. | +| code parser | `tree-sitter` + `tree-sitter-rust` / `tree-sitter-python` / `tree-sitter-typescript` / `tree-sitter-javascript` / `tree-sitter-go` / `tree-sitter-java` / `tree-sitter-kotlin-ng` — **parser-side** (`kebab-parse-code`), chunker-side 아님 (design §6.3). chunker versions: Rust = `code-rust-ast-v1`, Python = `code-python-ast-v1`, TypeScript = `code-ts-ast-v1`, JavaScript = `code-js-ast-v1`, Go = `code-go-ast-v1`, Java = `code-java-ast-v1`, Kotlin = `code-kotlin-ast-v1`. `ast_chunk_max_lines = 200` 상수 고정 (HOTFIXES 2026-05-19 — Chunker trait 이 per-medium config 미노출). Kotlin grammar 은 `tree-sitter-kotlin-ng` 사용 — bare `tree-sitter-kotlin` 은 tree-sitter 0.21–0.23 에 고착되어 있어 사용 불가. **Tier 2 (p10-2)**: YAML/k8s → `serde_yaml` + `k8s-manifest-resource-v1` (apiVersion+kind per resource), Dockerfile → `dockerfile-file-v1` (whole-file), Cargo.toml/go.mod/.json/.xml/.groovy → `manifest-file-v1` (whole-file). Tier 2 chunkers live in `kebab-chunk`; no tree-sitter grammar needed (structure from file type, not AST). | | 1B symbol path | workspace path → module path: Python = dotted prefix (`kebab_eval.metrics.compute_mrr`), TypeScript/JavaScript = slash-style prefix (`src/Foo.Foo.search`). Rust 1A-2 는 file-scope nesting 만 (workspace prefix 없음, 비일관 수용 — HOTFIXES 2026-05-20). | | TUI | Ratatui + crossterm — P9-1 Library 패널, P9-2/3/4 진행 예정 | | Desktop | Tauri 2 + `pdfjs-dist` (native PDF render backend 금지) — P9-5 | @@ -52,7 +52,7 @@ flowchart TB ppdf["kebab-parse-pdf"] pimg["kebab-parse-image"] paud["kebab-parse-audio
(P8 보류)"] - pcode["kebab-parse-code
(P10-1A-2 + P10-1B + P10-1C-Go + P10-1C-JK)"] + pcode["kebab-parse-code
(P10-1A-2 + P10-1B + P10-1C-Go + P10-1C-JK + P10-2)"] ptypes["kebab-parse-types"] norm["kebab-normalize"] chunk["kebab-chunk"] @@ -165,7 +165,13 @@ kebab/ │ ├── kebab-source-fs/ # 워크스페이스 walk + checksum (P1-1) │ ├── kebab-parse-md/ # Markdown frontmatter + blocks (P1-2/3) │ ├── kebab-normalize/ # ParsedBlock → CanonicalDocument (P1-4) -│ ├── kebab-chunk/ # heading-aware + pdf-page-v1 + code-rust-ast-v1 + code-python-ast-v1 + code-ts-ast-v1 + code-js-ast-v1 + code-go-ast-v1 + code-java-ast-v1 + code-kotlin-ast-v1 chunker (P1-5, P7-2, P10-1A-2, P10-1B, P10-1C-Go, P10-1C-JK) +│ ├── kebab-chunk/ # heading-aware + pdf-page-v1 + code-*-ast-v1 (Tier 1) + k8s-manifest-resource-v1 + dockerfile-file-v1 + manifest-file-v1 + tier2_shared (P10-2) chunker (P1-5, P7-2, P10-1A-2, P10-1B, P10-1C-Go, P10-1C-JK, P10-2) +│ │ └── src/ +│ │ ├── code_*_ast_v1.rs # Tier 1 AST chunkers (rust/python/ts/js/go/java/kotlin) +│ │ ├── k8s_manifest_resource_v1.rs # Tier 2 (p10-2): YAML multi-doc, apiVersion+kind per resource +│ │ ├── dockerfile_file_v1.rs # Tier 2 (p10-2): whole-file Dockerfile +│ │ ├── manifest_file_v1.rs # Tier 2 (p10-2): whole-file Cargo.toml / go.mod / .json / .xml / .groovy +│ │ └── tier2_shared.rs # Tier 2 (p10-2): shared oversize fallback + Chunk builder helpers │ ├── kebab-store-sqlite/ # SQLite + FTS5 (V001/V002/V003) (P1-6, P2-1, P3-3) │ ├── kebab-search/ # Lexical + Vector + Hybrid retriever (P2-2, P3-4) │ ├── kebab-embed/ kebab-embed-local/ # Embedder trait + fastembed adapter (P3-1, P3-2) diff --git a/docs/SMOKE.md b/docs/SMOKE.md index b609a2f..97c9d59 100644 --- a/docs/SMOKE.md +++ b/docs/SMOKE.md @@ -422,6 +422,86 @@ KB search --mode hybrid "Hello" --code-lang go --json | \ # 기대: symbol = "main.Hello", lang = "go" ``` +## P10-2 Tier 2 리소스 파일 색인 + +P10-1C-Go 와 동일한 격리 KB 설정. `.yaml` / `Dockerfile` / `.toml` 등 Tier 2 리소스 파일을 워크스페이스에 두고 ingest 하면 각 확장자에 맞는 chunker 로 처리된다. 
+ +```bash +# 1) Kubernetes manifest (YAML multi-doc) +cat > /tmp/kebab-smoke/workspace/deploy.yaml <<'EOF' +apiVersion: apps/v1 +kind: Deployment +metadata: + name: my-app + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: my-app + template: + metadata: + labels: + app: my-app + spec: + containers: + - name: app + image: my-app:latest +--- +apiVersion: v1 +kind: Service +metadata: + name: my-app-svc + namespace: default +spec: + selector: + app: my-app + ports: + - port: 80 +EOF + +# 2) Dockerfile (전체 파일 단일 chunk) +cat > /tmp/kebab-smoke/workspace/Dockerfile <<'EOF' +FROM rust:1.85 AS builder +WORKDIR /app +COPY . . +RUN cargo build --release + +FROM debian:bookworm-slim +COPY --from=builder /app/target/release/kebab /usr/local/bin/kebab +ENTRYPOINT ["kebab"] +EOF + +# 3) Cargo.toml (manifest — 전체 파일 단일 chunk) +cp Cargo.toml /tmp/kebab-smoke/workspace/Cargo.toml + +# 4) ingest +KB ingest + +# 5) 언어별 검색 (citation.symbol 확인) +KB search --mode hybrid "Deployment" --code-lang yaml --json | \ + jq '{hits: [.hits[] | {symbol: .citation.symbol, lang: .citation.lang}]}' +# 기대: symbol = "Deployment/default/my-app" (kind/namespace/name), lang = "yaml" + +KB search --mode hybrid "rust:1.85" --code-lang dockerfile --json | \ + jq '{hits: [.hits[] | {symbol: .citation.symbol, lang: .citation.lang}]}' +# 기대: symbol = "", lang = "dockerfile" + +KB search --mode hybrid "kebab-cli" --code-lang toml --json | \ + jq '{hits: [.hits[] | {symbol: .citation.symbol, lang: .citation.lang}]}' +# 기대: symbol = "", lang = "toml" + +# 6) schema stats 에 Tier 2 언어 카운트 확인 +KB --json schema | jq '.stats.code_lang_breakdown' +# 기대: {"yaml": N, "dockerfile": N, "toml": N, ...} +``` + +**Tier 2 citation.symbol 컨벤션**: + +- **YAML k8s 리소스**: `<kind>/<namespace>/<name>` (예: `Deployment/default/my-app`). `namespace` 없으면 `<kind>/<name>`. multi-doc YAML 은 `---` 구분자 기준으로 resource 별 chunk. +- **Dockerfile**: `` (고정 심볼, 전체 파일이 단일 chunk). +- **TOML / JSON / XML / Groovy / go.mod**: `` (고정 심볼, 전체 파일이 단일 chunk).
단, 파일이 `tier2_shared` 의 oversize threshold 초과 시 줄 단위 fallback chunk. + ## 검증 체크리스트 - `kebab doctor` 가 `--config` path 를 honor 하고 그 안의 `storage.data_dir` 를 출력 (XDG default 가 아님). @@ -456,6 +536,7 @@ rm -rf /tmp/kebab-smoke # 통째로 정리 - (P10-1B) `.py` / `.ts` / `.tsx` / `.js` / `.mjs` / `.cjs` / `.jsx` 파일을 워크스페이스에 두면 `kebab ingest` 결과에 `new` 카운터에 포함. `--code-lang python` / `--code-lang typescript` / `--code-lang javascript` 검색이 `citation.symbol` 에 module path prefix 를 포함한 결과를 반환하면 wiring 정상. `kebab schema --json | jq .stats.code_lang_breakdown` 에 해당 언어 카운트 등장 확인. - (P10-1C-Go) `.go` 파일을 워크스페이스에 두면 `kebab ingest` 가 `code-go-ast-v1` 로 처리. `--code-lang go` 검색이 `citation.symbol` 에 `.` / `.(*Receiver).` 형식 결과를 반환하면 wiring 정상. `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"go": N` 등장 확인. - (P10-1C-JK) `.java` 파일은 `code-java-ast-v1`, `.kt`/`.kts` 파일은 `code-kotlin-ast-v1` 로 처리. `--code-lang java` / `--code-lang kotlin` 검색이 `citation.symbol` 에 `com.foo.Foo.bar` 형식 결과를 반환하면 wiring 정상. `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"java": N` / `"kotlin": N` 등장 확인. +- (P10-2) `.yaml`/`.yml` 파일은 apiVersion+kind 파싱으로 k8s resource 별 chunk 생성 (`k8s-manifest-resource-v1`). `Dockerfile`/`Dockerfile.*` 는 전체 파일 단일 chunk (`dockerfile-file-v1`). `.toml`/`.json`/`.xml`/`.groovy`/`go.mod` 는 전체 파일 단일 chunk (`manifest-file-v1`). `--code-lang yaml` / `--code-lang dockerfile` / `--code-lang toml` 검색이 `citation.symbol` 에 각각 `Deployment/default/my-app` / `` / `` 형식 결과를 반환하면 wiring 정상. `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"yaml": N` / `"dockerfile": N` / `"toml": N` 등장 확인. - (P7-3 + follow-up) 동일 path 에 byte 가 다른 PDF 를 두 번째 ingest 하면 `purge_vector_orphans_for_workspace_path` 가 옛 chunk_id 를 LanceDB 에서 먼저 삭제, 이어서 `purge_orphan_at_workspace_path` 가 옛 doc / chunks / embedding_records 를 SQLite 에서 sweep. 새 byte 가 새 `doc_id` 로 색인됨. `IngestReport` 에 그 자산만 `new+=1` (다른 자산은 `updated`). 두 store 모두 정합 — 옛 본문 검색 시 옛 chunks 가 더 이상 surface 되지 않음. 
### Embedding upgrade (fb-39b) diff --git a/docs/superpowers/plans/2026-05-20-p10-2-tier2-resource-aware.md b/docs/superpowers/plans/2026-05-20-p10-2-tier2-resource-aware.md new file mode 100644 index 0000000..05770ad --- /dev/null +++ b/docs/superpowers/plans/2026-05-20-p10-2-tier2-resource-aware.md @@ -0,0 +1,1343 @@ +# p10-2 Tier 2 Resource-Aware Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development to implement this plan task-by-task. + +**Goal:** Activate Tier 2 resource-aware chunkers (k8s manifest + Dockerfile + 7-file manifest set) in a single PR. AST 가 아닌 file/document-level chunking. 머지 시점부터 `.yaml` / `Dockerfile` / 매니페스트 7종 dogfooding 가능. + +**Architecture:** 3 self-contained chunker 모듈을 `kebab-chunk` 에 추가. `kebab-parse-code` 의 lang.rs 만 갱신 (Tier 2 = AST 없음). `kebab-source-fs/src/media.rs` 의 inline 확장자 match 를 `code_lang_for_path` 호출로 통일 (1A-1 부터 누적된 duplication 정리). `ingest_one_code_asset` 의 match 가 Tier 2 lang 7종 (`yaml` / `dockerfile` / `toml` / `json` / `xml` / `groovy` / `go-mod`) 을 새 chunker 로 라우팅. parser_version = `"none-v1"` 통일. + +**Tech Stack:** Rust 2024 workspace, `serde_yaml = "0.9"` (이미 workspace.dependencies). 1A-2 / 1B / 1C 인프라 변경 없음. + +**Memory note:** Host has been OOM'd previously (재부팅 사례 있음). Per-crate cargo only. ONE full-suite + clippy invocation in Task J. NO `cargo test --workspace` outside that gate. + +--- + +## Pre-flight + +Branch `feat/p10-2-tier2-resource` 이미 존재 (spec commit 47857b2 포함). + +- [ ] **Disk hygiene**: `df -h /` 점검. 90% 넘으면 `cargo clean` (last cleanup recovered 38.7 GB). + +Reference files: +- 1A-2 chunker: `crates/kebab-chunk/src/code_rust_ast_v1.rs` — `AST_CHUNK_MAX_LINES = 200` / `POLICY_HASH_HEX_LEN = 16` / `BYTES_PER_TOKEN = 3` 상수 + `Document → Vec` 패턴. +- 1C-JK dispatch generalization: `crates/kebab-app/src/lib.rs::ingest_one_code_asset` (~L1794). 현재 7-arm match (rust|python|typescript|javascript|go|java|kotlin). Tier 2 분기 추가 자리. 
+- 1A-1 code_lang_for_path: `crates/kebab-parse-code/src/lang.rs`. basename 우선 매칭 패턴 신설. +- 1A-1 media.rs: `crates/kebab-source-fs/src/media.rs`. inline `match extension` duplication. +- spec: `tasks/p10/p10-2-tier2-resource-aware.md`. + +--- + +## Task A: kebab-chunk 에 serde_yaml dep 추가 + +**Files:** +- Modify: `crates/kebab-chunk/Cargo.toml` (dependencies 절) + +- [ ] **Step 1**: `crates/kebab-chunk/Cargo.toml` 의 `[dependencies]` 절에 추가 (serde_json 다음 줄): + +```toml +serde_yaml = { workspace = true } +``` + +- [ ] **Step 2**: `cargo build -p kebab-chunk` → clean (unused dep warning 무시 — Task D 에서 사용). + +- [ ] **Step 3**: Commit: + +```bash +git add crates/kebab-chunk/Cargo.toml +git commit -m "$(cat <<'EOF' +build(p10-2): add serde_yaml dep to kebab-chunk for k8s-manifest-resource-v1 + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task B: lang.rs — basename + 확장자 추가 + Tier 2 매핑 + +**Files:** +- Modify: `crates/kebab-parse-code/src/lang.rs` +- Test: same file's test module + +- [ ] **Step 1 (failing test)**: `lang.rs` 의 `#[cfg(test)] mod tests` 에 추가 (기존 테스트 옆): + +```rust +#[test] +fn tier2_basename_takes_precedence_over_extension() { + assert_eq!(code_lang_for_path(Path::new("Dockerfile")), Some("dockerfile")); + assert_eq!(code_lang_for_path(Path::new("foo/Dockerfile.dev")), Some("dockerfile")); + assert_eq!(code_lang_for_path(Path::new("myapp.dockerfile")), Some("dockerfile")); + assert_eq!(code_lang_for_path(Path::new("repo/Cargo.toml")), Some("toml")); + assert_eq!(code_lang_for_path(Path::new("pyproject.toml")), Some("toml")); + assert_eq!(code_lang_for_path(Path::new("repo/package.json")), Some("json")); + assert_eq!(code_lang_for_path(Path::new("tsconfig.json")), Some("json")); + assert_eq!(code_lang_for_path(Path::new("go.mod")), Some("go-mod")); + assert_eq!(code_lang_for_path(Path::new("pom.xml")), Some("xml")); + assert_eq!(code_lang_for_path(Path::new("build.gradle")), Some("groovy")); +} + +#[test] +fn 
tier2_extension_fallback() { + assert_eq!(code_lang_for_path(Path::new("k8s/deploy.yaml")), Some("yaml")); + assert_eq!(code_lang_for_path(Path::new("k8s/deploy.yml")), Some("yaml")); + assert_eq!(code_lang_for_path(Path::new("foo/bar.toml")), Some("toml")); + assert_eq!(code_lang_for_path(Path::new("foo/bar.json")), Some("json")); + assert_eq!(code_lang_for_path(Path::new("foo/bar.xml")), Some("xml")); + assert_eq!(code_lang_for_path(Path::new("foo/bar.gradle")), Some("groovy")); +} +``` + +- [ ] **Step 2**: Run → FAIL. + +```bash +cargo test -p kebab-parse-code lang::tests::tier2 -- --nocapture +``` + +Expected: function returns `None` for all new inputs. + +- [ ] **Step 3 (impl)**: `code_lang_for_path` 본문을 다음 형태로 갱신 (기존 확장자 매칭은 유지하고 basename 분기를 *맨 앞* 으로): + +```rust +pub fn code_lang_for_path(path: &Path) -> Option<&'static str> { + // p10-2: basename takes precedence over extension (Dockerfile, Cargo.toml, …). + let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or(""); + match file_name { + "Dockerfile" | "Cargo.toml" | "pyproject.toml" | "package.json" + | "tsconfig.json" | "go.mod" | "pom.xml" | "build.gradle" => { + return Some(match file_name { + "Dockerfile" => "dockerfile", + "Cargo.toml" | "pyproject.toml" => "toml", + "package.json" | "tsconfig.json" => "json", + "go.mod" => "go-mod", + "pom.xml" => "xml", + "build.gradle" => "groovy", + _ => unreachable!(), + }); + } + _ => {} + } + // Dockerfile.* prefix variant (Dockerfile.dev, Dockerfile.prod, …). + if let Some(rest) = file_name.strip_prefix("Dockerfile.") { + if !rest.is_empty() { + return Some("dockerfile"); + } + } + + // Extension fallback. + let ext = path.extension().and_then(|e| e.to_str())?; + let lang = match ext { + "rs" => "rust", + "py" | "pyi" => "python", + "ts" | "tsx" | "mts" | "cts" => "typescript", + "js" | "mjs" | "cjs" | "jsx" => "javascript", + "go" => "go", + "java" => "java", + "kt" | "kts" => "kotlin", + // p10-2: Tier 2 extensions. 
+ "yaml" | "yml" => "yaml", + "dockerfile" => "dockerfile", + "toml" => "toml", + "json" => "json", + "xml" => "xml", + "gradle" => "groovy", + _ => return None, + }; + Some(lang) +} +``` + +(기존 함수의 확장자 절 그대로 보존하고 위 7줄만 추가. 기존 코드 형식이 다르면 그 형식 유지 + Tier 2 라인만 추가.) + +- [ ] **Step 4**: Run → PASS. + +```bash +cargo test -p kebab-parse-code lang::tests -- --nocapture +``` + +Expected: 모든 lang::tests 통과. + +- [ ] **Step 5**: Clippy + commit: + +```bash +cargo clippy -p kebab-parse-code --all-targets -- -D warnings +git add crates/kebab-parse-code/src/lang.rs +git commit -m "$(cat <<'EOF' +feat(p10-2): extend code_lang_for_path with Tier 2 basenames + extensions + +Adds basename-first matching for Dockerfile / Cargo.toml / pyproject.toml / +package.json / tsconfig.json / go.mod / pom.xml / build.gradle plus +Dockerfile.* prefix variant. Extension fallback adds .yaml/.yml/.dockerfile/ +.toml/.json/.xml/.gradle → yaml/dockerfile/toml/json/xml/groovy. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task C: media.rs — code_lang_for_path 호출로 inline match 통일 + +**Files:** +- Modify: `crates/kebab-source-fs/src/media.rs` + +design §3.5 의 "code_lang_for_path 가 *유일한 source of truth*" 룰 적용. 1A-1 부터 누적된 duplication 정리. + +- [ ] **Step 1**: 현재 `media_type_for` 함수의 `match extension` 의 code 매칭 절을 모두 한 줄로 교체: + +```rust +pub fn media_type_for(path: &Path) -> MediaType { + // p10-2: code_lang_for_path is the single source of truth for code lang. + if let Some(lang) = kebab_parse_code::code_lang_for_path(path) { + return MediaType::Code(lang.to_string()); + } + + // 기존 비-code 확장자 매칭 (markdown / pdf / images / etc.) 은 그대로 유지. + let ext = path.extension().and_then(|e| e.to_str()).unwrap_or(""); + match ext { + // ... 기존 비-code 절 그대로 ... + "md" | "markdown" => MediaType::Markdown, + // (기존 코드 그대로 — code lang 절 만 삭제) + _ => MediaType::Other(ext.to_string()), + } +} +``` + +(전체 함수 본문은 현재 파일을 Read 한 후 code 절만 삭제하고 위 if-let 을 함수 맨 앞에 추가.) 
+ +- [ ] **Step 2**: 기존 media.rs 의 test 모듈 보존 — `code_files_map_to_media_code`, `go_files_map_to_media_code_go`, `java_kotlin_files_map_to_media_code` 등 모두 통과해야 함. 추가 Tier 2 테스트: + +```rust +#[test] +fn tier2_files_map_to_media_code() { + assert_eq!(media_type_for(Path::new("a/deploy.yaml")), MediaType::Code("yaml".into())); + assert_eq!(media_type_for(Path::new("a/Dockerfile")), MediaType::Code("dockerfile".into())); + assert_eq!(media_type_for(Path::new("a/Cargo.toml")), MediaType::Code("toml".into())); + assert_eq!(media_type_for(Path::new("a/pom.xml")), MediaType::Code("xml".into())); + assert_eq!(media_type_for(Path::new("a/build.gradle")), MediaType::Code("groovy".into())); + assert_eq!(media_type_for(Path::new("a/go.mod")), MediaType::Code("go-mod".into())); +} +``` + +- [ ] **Step 3**: `cargo test -p kebab-source-fs` → 기존 + 신규 테스트 모두 PASS. 만약 비-code 확장자 (md/pdf/etc.) 매칭이 깨졌으면 Step 1 의 비-code 절 보존 누락 — 다시 확인. + +- [ ] **Step 4**: Clippy + commit: + +```bash +cargo clippy -p kebab-source-fs --all-targets -- -D warnings +git add crates/kebab-source-fs/src/media.rs +git commit -m "$(cat <<'EOF' +refactor(p10-2): media.rs delegates code lang to code_lang_for_path + +Replaces 1A-1 era inline match block with a single call to +kebab_parse_code::code_lang_for_path, per design §3.5 single-source-of-truth +rule. Adds Tier 2 routing test (yaml / dockerfile / toml / json / xml / +groovy / go-mod) and preserves all non-code extension branches. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task D: k8s-manifest-resource-v1 chunker + +**Files:** +- Create: `crates/kebab-chunk/src/k8s_manifest_resource_v1.rs` +- Create: `crates/kebab-chunk/tests/fixtures/sample_k8s.yaml` +- Create: `crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs` +- Modify: `crates/kebab-chunk/src/lib.rs` (pub use) + +가장 복잡한 chunker. pre-split + serde_yaml deserialize + identify + emit + oversize fallback. 
+ +- [ ] **Step 1 (fixture)**: `crates/kebab-chunk/tests/fixtures/sample_k8s.yaml` 생성. 3 document (2 k8s + 1 비-k8s): + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api-server + namespace: prod +spec: + replicas: 3 + selector: + matchLabels: + app: api-server + template: + metadata: + labels: + app: api-server + spec: + containers: + - name: api + image: example/api:1.2.3 +--- +apiVersion: v1 +kind: Service +metadata: + name: api-server + namespace: prod +spec: + selector: + app: api-server + ports: + - port: 80 + targetPort: 8080 +--- +# Non-k8s document — apiVersion missing +kind: ClusterIP +foo: bar +``` + +- [ ] **Step 2 (failing test)**: `crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs` 생성: + +```rust +use kebab_chunk::{ChunkPolicy, Chunker, K8sManifestResourceV1Chunker}; +use kebab_core::{Asset, AssetId, Document, MediaType, ParserVersion, SourceSpan}; +use std::path::PathBuf; + +fn read_fixture(name: &str) -> String { + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests/fixtures") + .join(name); + std::fs::read_to_string(path).expect("read fixture") +} + +fn make_doc(lang: &str, text: &str) -> Document { + // Tier 2 makes a Document directly (no extractor). Mirror what + // ingest_one_code_asset will do for Tier 2 in Task G. 
+ use kebab_core::{Block, Inline}; + Document { + doc_id: "test-doc".to_string(), + asset: Asset { + asset_id: AssetId("test".to_string()), + workspace_path: format!("test.{lang}"), + byte_len: text.len() as u64, + content_hash: "deadbeef".to_string(), + media_type: MediaType::Code(lang.to_string()), + }, + parser_version: ParserVersion("none-v1".to_string()), + metadata: Default::default(), + blocks: vec![Block::Code { + text: text.to_string(), + lang: Some(lang.to_string()), + span: SourceSpan::Line { start: 1, end: text.lines().count() as u32 }, + }], + } +} + +#[test] +fn k8s_multi_doc_emits_one_chunk_per_resource() { + let text = read_fixture("sample_k8s.yaml"); + let doc = make_doc("yaml", &text); + let policy = ChunkPolicy::default(); + let chunks = K8sManifestResourceV1Chunker.chunk(&doc, &policy).unwrap(); + + // 2 k8s resources accepted, 1 non-k8s skipped. + assert_eq!(chunks.len(), 2, "expected 2 k8s chunks, got {}", chunks.len()); + + let symbols: Vec<_> = chunks.iter() + .map(|c| c.source_span_symbol().unwrap_or_default().to_string()) + .collect(); + assert_eq!(symbols, vec![ + "Deployment/prod/api-server".to_string(), + "Service/prod/api-server".to_string(), + ]); + + // Each chunk's lang field is "yaml". 
+ for c in &chunks { + assert_eq!(c.source_span_lang().as_deref(), Some("yaml")); + } +} + +#[test] +fn k8s_invalid_yaml_emits_zero_chunks() { + let invalid = "apiVersion: v1\nkind: Service\n\tbadtab: x\n"; // invalid YAML + let doc = make_doc("yaml", invalid); + let policy = ChunkPolicy::default(); + let chunks = K8sManifestResourceV1Chunker.chunk(&doc, &policy).unwrap(); + assert!(chunks.is_empty(), "invalid yaml -> 0 chunks"); +} + +#[test] +fn k8s_cluster_scoped_resource_symbol() { + let cluster = r#"apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: cluster-admin +rules: [] +"#; + let doc = make_doc("yaml", cluster); + let policy = ChunkPolicy::default(); + let chunks = K8sManifestResourceV1Chunker.chunk(&doc, &policy).unwrap(); + assert_eq!(chunks.len(), 1); + assert_eq!( + chunks[0].source_span_symbol().unwrap_or_default(), + "ClusterRole/cluster-admin" + ); +} +``` + +(`Chunk` 의 `source_span_symbol()` / `source_span_lang()` 는 helper. 실제 API 가 다르면 — 예: `chunk.source_spans[0]` 의 `SourceSpan::Code { symbol, lang, .. }` 직접 접근 — 1A-2 의 snapshot test 형식 참고하여 동일 패턴으로 작성. Step 4 impl 작성 후 API 일치 확인.) + +- [ ] **Step 3**: `cargo test -p kebab-chunk k8s_manifest_resource_v1` → FAIL ("K8sManifestResourceV1Chunker not found"). + +- [ ] **Step 4a (shared helper)**: 먼저 `crates/kebab-chunk/src/tier2_shared.rs` 생성 — Task D/E/F 가 모두 사용할 oversize-aware chunk emit helper. impl 작성 전 `crates/kebab-chunk/src/code_rust_ast_v1.rs` 의 Chunk 생성 코드 (hash, token count, ChunkPolicy 적용 부분) Read 하고 동일 패턴 미러링: + +```rust +//! p10-2: Tier 2 chunker shared helpers (oversize fallback + Chunk build). + +use crate::ChunkPolicy; +use anyhow::Result; +use kebab_core::{Chunk, Document, SourceSpan}; + +pub(crate) const AST_CHUNK_MAX_LINES: u32 = 200; + +/// Push 1+ chunks for a region. ≤200 lines → 1 chunk. >200 → line-window +/// split with same symbol (only line range varies). Mirrors 1A-2's oversize +/// fallback. 
+#[allow(clippy::too_many_arguments)] +pub(crate) fn push_chunks_with_oversize( + out: &mut Vec<Chunk>, + doc: &Document, + policy: &ChunkPolicy, + text: &str, + line_start: u32, + line_end: u32, + symbol: &str, + lang: &str, + chunker_version: &str, +) -> Result<()> { + let n_lines = (line_end - line_start + 1).max(1); + if n_lines <= AST_CHUNK_MAX_LINES { + out.push(build_chunk(doc, policy, text, line_start, line_end, symbol, lang, chunker_version)?); + return Ok(()); + } + let lines: Vec<&str> = text.lines().collect(); + let mut window_start = line_start; + let mut i = 0usize; + while i < lines.len() { + let take = (AST_CHUNK_MAX_LINES as usize).min(lines.len() - i); + let window_text = lines[i..i + take].join("\n"); + let window_end = window_start + take as u32 - 1; + out.push(build_chunk(doc, policy, &window_text, window_start, window_end, symbol, lang, chunker_version)?); + i += take; + window_start = window_end + 1; + } + Ok(()) +} + +/// Build a single Chunk from a (text, line range, symbol, lang) tuple. +/// MUST mirror code_rust_ast_v1.rs's Chunk construction so hash / token / +/// ChunkPolicy semantics stay identical across Tier 1 and Tier 2. +fn build_chunk( + doc: &Document, + policy: &ChunkPolicy, + text: &str, + line_start: u32, + line_end: u32, + symbol: &str, + lang: &str, + chunker_version: &str, +) -> Result<Chunk> { + let span = SourceSpan::Code { + line_start, + line_end, + symbol: Some(symbol.to_string()), + lang: Some(lang.to_string()), + }; + // TODO at impl time: replicate 1A-2's exact Chunk { id, text, source_spans, + // chunker_version, parser_version, policy_hash, token_count, content_hash, ... } + // field-fill, computing id / content_hash / policy_hash via the same helpers + // (blake3 + serde_json_canonicalizer) used by code_rust_ast_v1. The exact + // function names are in code_rust_ast_v1.rs's `chunk` impl; mirror them + // here.
+ todo!("mirror code_rust_ast_v1's Chunk construction; see Task D Step 4a comment") +} +``` + +(`todo!()` 는 placeholder 표시 — impl 단계에서 `code_rust_ast_v1.rs` 의 실제 chunk 생성 부분을 그대로 옮김. 1A-2 의 Chunk 생성이 ~30 줄 정도면 그것을 build_chunk 안으로 옮기되 `span` / `chunker_version` 을 인자 형태로 받음.) + +- [ ] **Step 4b (k8s chunker impl)**: `crates/kebab-chunk/src/k8s_manifest_resource_v1.rs` 작성: + +```rust +//! p10-2: k8s manifest resource-aware chunker. +//! +//! YAML multi-document split with `apiVersion` + `kind` identification. +//! 1 chunk per recognized resource, symbol `<kind>/<namespace>/<name>`. +//! Invalid YAML or non-k8s document → 0 chunks (handled by p10-3 fallback). + +use crate::tier2_shared::push_chunks_with_oversize; +use crate::{Chunker, ChunkPolicy}; +use anyhow::Result; +use kebab_core::{Block, Chunk, Document}; + +pub const VERSION_LABEL: &str = "k8s-manifest-resource-v1"; + +pub struct K8sManifestResourceV1Chunker; + +impl Chunker for K8sManifestResourceV1Chunker { + fn chunker_version(&self) -> &'static str { VERSION_LABEL } + + fn chunk(&self, doc: &Document, policy: &ChunkPolicy) -> Result<Vec<Chunk>> { + let Some(Block::Code { text, .. }) = doc.blocks.first() else { + return Ok(vec![]); + }; + + // Pre-split on ^---\s*$ to track line numbers (serde_yaml's + // multi-doc iterator doesn't expose line offsets). + let slices = split_yaml_documents(text); + + let mut chunks = Vec::new(); + for slice in slices { + // Any parse error → skip whole file (return empty; p10-3 fallback later).
+ let value: serde_yaml::Value = match serde_yaml::from_str(slice.text) { + Ok(v) => v, + Err(_) => return Ok(vec![]), + }; + let Some(mapping) = value.as_mapping() else { continue }; + + let api = mapping.get("apiVersion").and_then(|v| v.as_str()).unwrap_or(""); + let kind = mapping.get("kind").and_then(|v| v.as_str()).unwrap_or(""); + if api.is_empty() || kind.is_empty() { + continue; + } + + let metadata = mapping.get("metadata").and_then(|v| v.as_mapping()); + let name = metadata + .and_then(|m| m.get("name")) + .and_then(|v| v.as_str()) + .unwrap_or(""); + let namespace = metadata + .and_then(|m| m.get("namespace")) + .and_then(|v| v.as_str()); + + let symbol = match namespace { + Some(ns) if !ns.is_empty() => format!("{kind}/{ns}/{name}"), + _ => format!("{kind}/{name}"), + }; + + push_chunks_with_oversize( + &mut chunks, doc, policy, + slice.text, slice.line_start, slice.line_end, + &symbol, "yaml", VERSION_LABEL, + )?; + } + Ok(chunks) + } +} + +struct YamlSlice<'a> { + text: &'a str, + line_start: u32, + line_end: u32, +} + +fn split_yaml_documents(text: &str) -> Vec<YamlSlice<'_>> { + let mut slices = Vec::new(); + let lines: Vec<&str> = text.lines().collect(); + + let mut separators: Vec<usize> = lines.iter().enumerate() + .filter_map(|(i, l)| { + let trimmed = l.trim_end(); + if trimmed == "---" || trimmed.starts_with("--- ") || trimmed.starts_with("---\t") { + Some(i) + } else { None } + }) + .collect(); + separators.push(lines.len()); // sentinel after last line + + let mut doc_start_line: usize = 0; // 0-indexed + for sep_line in separators { + if sep_line > doc_start_line { + let start_byte = byte_offset_of_line(text, doc_start_line); + let end_byte = byte_offset_of_line(text, sep_line); + let slice_text = &text[start_byte..end_byte]; + if !slice_text.trim().is_empty() { + slices.push(YamlSlice { + text: slice_text, + line_start: (doc_start_line + 1) as u32, + line_end: sep_line as u32, + }); + } + } + doc_start_line = sep_line + 1; + } + slices +} + +fn
byte_offset_of_line(text: &str, line_idx: usize) -> usize { + if line_idx == 0 { return 0; } + let mut count = 0usize; + for (i, c) in text.char_indices() { + if c == '\n' { + count += 1; + if count == line_idx { return i + 1; } + } + } + text.len() +} +``` + +- [ ] **Step 5**: `crates/kebab-chunk/src/lib.rs` 에 추가 (tier2_shared 은 pub 아님 — crate-internal): + +```rust +mod tier2_shared; +pub mod k8s_manifest_resource_v1; +pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker; +``` + +- [ ] **Step 6**: `cargo test -p kebab-chunk k8s_manifest_resource_v1 -- --nocapture` → PASS. fixture 의 2 k8s chunk + 비-k8s skip + invalid yaml 0 chunk + cluster-scoped symbol 검증. + +- [ ] **Step 7**: Clippy + commit: + +```bash +cargo clippy -p kebab-chunk --all-targets -- -D warnings +git add crates/kebab-chunk/src/k8s_manifest_resource_v1.rs \ + crates/kebab-chunk/src/lib.rs \ + crates/kebab-chunk/tests/fixtures/sample_k8s.yaml \ + crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs +git commit -m "$(cat <<'EOF' +feat(p10-2): k8s-manifest-resource-v1 chunker (YAML multi-doc + apiVersion+kind identification) + +Splits multi-document YAML by ^---\s*$, requires apiVersion + kind string +fields per document, emits 1 chunk per recognized k8s resource. Symbol = +// or / (cluster-scoped). Invalid YAML +returns 0 chunks (handled by p10-3 paragraph fallback). Oversize >200 lines +splits into line-windows sharing the same symbol. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task E: dockerfile-file-v1 chunker + +**Files:** +- Create: `crates/kebab-chunk/src/dockerfile_file_v1.rs` +- Create: `crates/kebab-chunk/tests/fixtures/sample.dockerfile` +- Create: `crates/kebab-chunk/tests/dockerfile_file_v1.rs` +- Modify: `crates/kebab-chunk/src/lib.rs` + +- [ ] **Step 1 (fixture)**: `crates/kebab-chunk/tests/fixtures/sample.dockerfile` (5 줄): + +```dockerfile +FROM rust:1.94-slim AS builder +WORKDIR /app +COPY . . 
+RUN cargo build --release +CMD ["/app/target/release/kebab"] +``` + +- [ ] **Step 2 (failing test)**: `crates/kebab-chunk/tests/dockerfile_file_v1.rs`: + +```rust +use kebab_chunk::{ChunkPolicy, Chunker, DockerfileFileV1Chunker}; +use kebab_core::{Asset, AssetId, Block, Document, MediaType, ParserVersion, SourceSpan}; +use std::path::PathBuf; + +fn read_fixture(name: &str) -> String { + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests/fixtures").join(name); + std::fs::read_to_string(path).unwrap() +} +fn make_doc(lang: &str, text: &str) -> Document { + Document { + doc_id: "test".into(), + asset: Asset { + asset_id: AssetId("a".into()), + workspace_path: "Dockerfile".into(), + byte_len: text.len() as u64, + content_hash: "deadbeef".into(), + media_type: MediaType::Code(lang.into()), + }, + parser_version: ParserVersion("none-v1".into()), + metadata: Default::default(), + blocks: vec![Block::Code { + text: text.into(), + lang: Some(lang.into()), + span: SourceSpan::Line { start: 1, end: text.lines().count() as u32 }, + }], + } +} + +#[test] +fn dockerfile_emits_single_chunk() { + let text = read_fixture("sample.dockerfile"); + let doc = make_doc("dockerfile", &text); + let chunks = DockerfileFileV1Chunker.chunk(&doc, &ChunkPolicy::default()).unwrap(); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].source_span_symbol().as_deref(), Some("")); + assert_eq!(chunks[0].source_span_lang().as_deref(), Some("dockerfile")); + // line_start = 1, line_end = 5 (5-line fixture). + let (ls, le) = chunks[0].source_span_lines(); + assert_eq!((ls, le), (1, 5)); +} +``` + +(`source_span_lines()` / `source_span_symbol()` API 가 1A-2 와 동일한지 확인 후 미러링.) + +- [ ] **Step 3**: `cargo test -p kebab-chunk dockerfile_file_v1` → FAIL. + +- [ ] **Step 4 (impl)**: `crates/kebab-chunk/src/dockerfile_file_v1.rs`. Task D Step 4a 의 `tier2_shared::push_chunks_with_oversize` 재사용: + +```rust +//! p10-2: dockerfile whole-file chunker (Tier 2). 
+ +use crate::tier2_shared::push_chunks_with_oversize; +use crate::{Chunker, ChunkPolicy}; +use anyhow::Result; +use kebab_core::{Block, Chunk, Document}; + +pub const VERSION_LABEL: &str = "dockerfile-file-v1"; + +pub struct DockerfileFileV1Chunker; + +impl Chunker for DockerfileFileV1Chunker { + fn chunker_version(&self) -> &'static str { VERSION_LABEL } + + fn chunk(&self, doc: &Document, policy: &ChunkPolicy) -> Result> { + let Some(Block::Code { text, .. }) = doc.blocks.first() else { + return Ok(vec![]); + }; + let total_lines = text.lines().count().max(1) as u32; + let mut chunks = Vec::new(); + push_chunks_with_oversize( + &mut chunks, doc, policy, + text, 1, total_lines, + "", "dockerfile", VERSION_LABEL, + )?; + Ok(chunks) + } +} +``` + +- [ ] **Step 5**: `crates/kebab-chunk/src/lib.rs` 갱신: + +```rust +pub mod dockerfile_file_v1; +pub use dockerfile_file_v1::DockerfileFileV1Chunker; +``` + +- [ ] **Step 6**: `cargo test -p kebab-chunk dockerfile_file_v1` → PASS. + +- [ ] **Step 7**: Clippy + commit: + +```bash +cargo clippy -p kebab-chunk --all-targets -- -D warnings +git add crates/kebab-chunk/src/dockerfile_file_v1.rs \ + crates/kebab-chunk/src/lib.rs \ + crates/kebab-chunk/tests/fixtures/sample.dockerfile \ + crates/kebab-chunk/tests/dockerfile_file_v1.rs +git commit -m "$(cat <<'EOF' +feat(p10-2): dockerfile-file-v1 chunker (whole-file 1 chunk, symbol ) + +Reads entire Dockerfile / Dockerfile.* / *.dockerfile content and emits a +single Chunk with symbol "", code_lang "dockerfile", line range +1..EOF. Oversize >200 lines splits into line-windows sharing the symbol. 
+ +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task F: manifest-file-v1 chunker + +**Files:** +- Create: `crates/kebab-chunk/src/manifest_file_v1.rs` +- Create: `crates/kebab-chunk/tests/fixtures/sample_cargo.toml` +- Create: `crates/kebab-chunk/tests/fixtures/sample_package.json` +- Create: `crates/kebab-chunk/tests/fixtures/sample_pom.xml` +- Create: `crates/kebab-chunk/tests/fixtures/sample_go.mod` +- Create: `crates/kebab-chunk/tests/manifest_file_v1.rs` +- Modify: `crates/kebab-chunk/src/lib.rs` + +- [ ] **Step 1 (fixtures)**: 4 작은 fixture (각 ~10 줄): + +`sample_cargo.toml`: + +```toml +[package] +name = "demo" +version = "0.1.0" +edition = "2021" + +[dependencies] +serde = "1" +``` + +`sample_package.json`: + +```json +{ + "name": "demo", + "version": "0.1.0", + "dependencies": { + "react": "^18.0.0" + } +} +``` + +`sample_pom.xml`: + +```xml + + + 4.0.0 + com.demo + demo + 0.1.0 + +``` + +`sample_go.mod`: + +``` +module example.com/demo + +go 1.22 + +require github.com/spf13/cobra v1.8.0 +``` + +- [ ] **Step 2 (failing test)**: `crates/kebab-chunk/tests/manifest_file_v1.rs` 에 4 테스트 (각 fixture 마다): + +```rust +use kebab_chunk::{ChunkPolicy, Chunker, ManifestFileV1Chunker}; +use kebab_core::{Asset, AssetId, Block, Document, MediaType, ParserVersion, SourceSpan}; +use std::path::PathBuf; + +fn read_fixture(name: &str) -> String { + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests/fixtures").join(name); + std::fs::read_to_string(path).unwrap() +} +fn make_doc(lang: &str, text: &str) -> Document { + // Same as Task E's helper. Copy verbatim. 
+ Document { + doc_id: "test".into(), + asset: Asset { + asset_id: AssetId("a".into()), + workspace_path: "test-manifest".into(), + byte_len: text.len() as u64, + content_hash: "deadbeef".into(), + media_type: MediaType::Code(lang.into()), + }, + parser_version: ParserVersion("none-v1".into()), + metadata: Default::default(), + blocks: vec![Block::Code { + text: text.into(), + lang: Some(lang.into()), + span: SourceSpan::Line { start: 1, end: text.lines().count() as u32 }, + }], + } +} + +#[test] +fn cargo_toml_single_chunk_with_toml_lang() { + let text = read_fixture("sample_cargo.toml"); + let doc = make_doc("toml", &text); + let chunks = ManifestFileV1Chunker.chunk(&doc, &ChunkPolicy::default()).unwrap(); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].source_span_symbol().as_deref(), Some("")); + assert_eq!(chunks[0].source_span_lang().as_deref(), Some("toml")); +} + +#[test] +fn package_json_single_chunk_with_json_lang() { + let text = read_fixture("sample_package.json"); + let doc = make_doc("json", &text); + let chunks = ManifestFileV1Chunker.chunk(&doc, &ChunkPolicy::default()).unwrap(); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].source_span_symbol().as_deref(), Some("")); + assert_eq!(chunks[0].source_span_lang().as_deref(), Some("json")); +} + +#[test] +fn pom_xml_single_chunk_with_xml_lang() { + let text = read_fixture("sample_pom.xml"); + let doc = make_doc("xml", &text); + let chunks = ManifestFileV1Chunker.chunk(&doc, &ChunkPolicy::default()).unwrap(); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].source_span_symbol().as_deref(), Some("")); + assert_eq!(chunks[0].source_span_lang().as_deref(), Some("xml")); +} + +#[test] +fn go_mod_single_chunk_with_go_mod_lang() { + let text = read_fixture("sample_go.mod"); + let doc = make_doc("go-mod", &text); + let chunks = ManifestFileV1Chunker.chunk(&doc, &ChunkPolicy::default()).unwrap(); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].source_span_symbol().as_deref(), Some("")); 
+ assert_eq!(chunks[0].source_span_lang().as_deref(), Some("go-mod")); +} +``` + +- [ ] **Step 3**: `cargo test -p kebab-chunk manifest_file_v1` → FAIL. + +- [ ] **Step 4 (impl)**: `crates/kebab-chunk/src/manifest_file_v1.rs`: + +```rust +//! p10-2: manifest whole-file chunker (Tier 2). Cargo.toml / package.json / etc. + +use crate::tier2_shared::push_chunks_with_oversize; +use crate::{Chunker, ChunkPolicy}; +use anyhow::Result; +use kebab_core::{Block, Chunk, Document}; + +pub const VERSION_LABEL: &str = "manifest-file-v1"; + +pub struct ManifestFileV1Chunker; + +impl Chunker for ManifestFileV1Chunker { + fn chunker_version(&self) -> &'static str { VERSION_LABEL } + + fn chunk(&self, doc: &Document, policy: &ChunkPolicy) -> Result> { + let Some(Block::Code { text, lang, .. }) = doc.blocks.first() else { + return Ok(vec![]); + }; + let lang_str = lang.as_deref().unwrap_or(""); + let total_lines = text.lines().count().max(1) as u32; + let mut chunks = Vec::new(); + push_chunks_with_oversize( + &mut chunks, doc, policy, + text, 1, total_lines, + "", lang_str, VERSION_LABEL, + )?; + Ok(chunks) + } +} +``` + +- [ ] **Step 5**: `crates/kebab-chunk/src/lib.rs` 갱신: + +```rust +pub mod manifest_file_v1; +pub use manifest_file_v1::ManifestFileV1Chunker; +``` + +- [ ] **Step 6**: `cargo test -p kebab-chunk manifest_file_v1` → 4 테스트 PASS. 
+ +- [ ] **Step 7**: Clippy + commit: + +```bash +cargo clippy -p kebab-chunk --all-targets -- -D warnings +git add crates/kebab-chunk/src/manifest_file_v1.rs \ + crates/kebab-chunk/src/lib.rs \ + crates/kebab-chunk/tests/fixtures/sample_cargo.toml \ + crates/kebab-chunk/tests/fixtures/sample_package.json \ + crates/kebab-chunk/tests/fixtures/sample_pom.xml \ + crates/kebab-chunk/tests/fixtures/sample_go.mod \ + crates/kebab-chunk/tests/manifest_file_v1.rs +git commit -m "$(cat <<'EOF' +feat(p10-2): manifest-file-v1 chunker (whole-file 1 chunk, symbol ) + +Emits 1 Chunk per manifest file (Cargo.toml / pyproject.toml / package.json / +tsconfig.json / pom.xml / build.gradle / go.mod). Symbol unified to +""; manifest type distinguished by code_lang (toml / json / xml / +groovy / go-mod). Oversize >200 lines splits into line-windows. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task G: ingest_one_code_asset Tier 2 routing + +**Files:** +- Modify: `crates/kebab-app/src/lib.rs` (`ingest_one_code_asset` 함수 + 호출 부 allowlist) + +현재 7-arm match (rust|python|typescript|javascript|go|java|kotlin) 옆에 Tier 2 분기. Tier 2 는 `extract` 단계 없음 — `RawAsset` bytes 로 직접 `Document` 생성. + +- [ ] **Step 1**: 함수 본문의 parser_version match 갱신: + +```rust +let parser_version = match code_lang { + "rust" => ParserVersion(kebab_parse_code::RUST_PARSER_VERSION.to_string()), + // ... 기존 7 줄 그대로 ... + "kotlin" => ParserVersion(kebab_parse_code::KOTLIN_PARSER_VERSION.to_string()), + // p10-2: Tier 2 has no parse step. + "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" + => ParserVersion("none-v1".to_string()), + other => anyhow::bail!("unsupported code_lang: {other}"), +}; +``` + +- [ ] **Step 2**: chunker_version match 갱신 (Tier 2 분기 추가): + +```rust +let chunker_version = match code_lang { + "rust" => CodeRustAstV1Chunker.chunker_version(), + // ... 기존 ... 
+ "kotlin" => CodeKotlinAstV1Chunker.chunker_version(), + "yaml" => K8sManifestResourceV1Chunker.chunker_version(), + "dockerfile" => DockerfileFileV1Chunker.chunker_version(), + "toml" | "json" | "xml" | "groovy" | "go-mod" + => ManifestFileV1Chunker.chunker_version(), + other => anyhow::bail!("unreachable chunker_version: {other}"), +}; +``` + +- [ ] **Step 3**: extract / chunk 단계 분리. 현재는 `let mut canonical = match code_lang { ... extract ... };` 후 `let chunks = match code_lang { ... chunk(&canonical) ... };`. Tier 2 는 extract 없이 직접 Document 생성: + +```rust +// p10-1B/1C: Tier 1 extractors return a canonical Document. +// p10-2: Tier 2 has no parser — synthesize a Document with a single +// Block::Code carrying the whole file text. The chunker does the work. +let mut canonical = match code_lang { + "rust" => RustAstExtractor::new().extract(&ctx, &bytes).context("...")?, + // ... 기존 7 줄 ... + "kotlin" => KotlinAstExtractor::new().extract(&ctx, &bytes).context("...")?, + "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" => { + // Tier 2: no extractor. Build a minimal Document. + synthesize_tier2_document(asset, &bytes, code_lang, &parser_version)? + } + other => anyhow::bail!("unreachable (extract): {other}"), +}; + +let chunks = match code_lang { + "rust" => CodeRustAstV1Chunker.chunk(&canonical, chunk_policy).context("...")?, + // ... 기존 ... 
+ "kotlin" => CodeKotlinAstV1Chunker.chunk(&canonical, chunk_policy).context("...")?, + "yaml" => K8sManifestResourceV1Chunker.chunk(&canonical, chunk_policy).context("kb-chunk::K8sManifestResourceV1Chunker::chunk")?, + "dockerfile" => DockerfileFileV1Chunker.chunk(&canonical, chunk_policy).context("kb-chunk::DockerfileFileV1Chunker::chunk")?, + "toml" | "json" | "xml" | "groovy" | "go-mod" + => ManifestFileV1Chunker.chunk(&canonical, chunk_policy).context("kb-chunk::ManifestFileV1Chunker::chunk")?, + other => anyhow::bail!("unreachable (chunk): {other}"), +}; +``` + +`synthesize_tier2_document` helper 를 같은 파일 (kebab-app/src/lib.rs) 안에 추가: + +```rust +fn synthesize_tier2_document( + asset: &RawAsset, + bytes: &[u8], + code_lang: &str, + parser_version: &ParserVersion, +) -> anyhow::Result { + use kebab_core::{Asset, AssetId, Block, Document, MediaType, SourceSpan}; + let text = std::str::from_utf8(bytes) + .with_context(|| format!("tier2 doc not utf-8: {}", asset.workspace_path))? + .to_string(); + let n_lines = text.lines().count().max(1) as u32; + Ok(Document { + doc_id: asset.asset_id.0.clone(), // tentative — will be overwritten downstream + asset: Asset { + asset_id: AssetId(asset.asset_id.0.clone()), + workspace_path: asset.workspace_path.clone(), + byte_len: asset.byte_len, + content_hash: asset.content_hash.clone(), + media_type: MediaType::Code(code_lang.to_string()), + }, + parser_version: parser_version.clone(), + metadata: Default::default(), + blocks: vec![Block::Code { + text, + lang: Some(code_lang.to_string()), + span: SourceSpan::Line { start: 1, end: n_lines }, + }], + }) +} +``` + +(`Document` / `Asset` 의 정확한 필드 — Read `crates/kebab-core/src/document.rs` 후 미러링. 위 코드의 필드명이 다르면 정정.) 
+ +- [ ] **Step 4**: 호출 부 allowlist 갱신 (현재 `matches!(lang.as_str(), "rust" | "python" | ...)`): + +```rust +if matches!(lang.as_str(), + "rust" | "python" | "typescript" | "javascript" + | "go" | "java" | "kotlin" + | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" +) { + return ingest_one_code_asset(...); +} +``` + +- [ ] **Step 5**: Build + per-crate test: + +```bash +cargo build -p kebab-app +cargo test -p kebab-app --lib -- --nocapture 2>&1 | tail -20 +``` + +Expected: build clean, 기존 unit test (있다면) 그대로 PASS. + +- [ ] **Step 6**: Clippy + commit: + +```bash +cargo clippy -p kebab-app --all-targets -- -D warnings +git add crates/kebab-app/src/lib.rs +git commit -m "$(cat <<'EOF' +feat(p10-2): activate Tier 2 chunkers in ingest_one_code_asset dispatch + +Adds yaml / dockerfile / toml / json / xml / groovy / go-mod arms to the +existing 7-arm AST match. parser_version unified to "none-v1" for Tier 2. +synthesize_tier2_document builds a minimal Document (single Block::Code +with raw file text) since Tier 2 has no parse step. allowlist in +ingest_one_asset extended to admit Tier 2 langs. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task H: code_ingest_smoke integration tests (Tier 2) + +**Files:** +- Modify: `crates/kebab-app/tests/code_ingest_smoke.rs` (3 새 test) + +기존 9 테스트 옆에 yaml / dockerfile / manifest 통합 ingest 검증 1개씩 추가. 
+ +- [ ] **Step 1 (failing test)** — 파일 끝에 추가: + +```rust +#[test] +fn tier2_k8s_yaml_ingest_searchable() { + let kb = isolated_kb(); // TempDir KB helper, existing in this file + let path = kb.workspace_root().join("k8s/deploy.yaml"); + std::fs::create_dir_all(path.parent().unwrap()).unwrap(); + std::fs::write(&path, "apiVersion: apps/v1\nkind: Deployment\nmetadata:\n name: api\n namespace: prod\nspec:\n replicas: 1\n").unwrap(); + kb.ingest_via_cli().expect("ingest"); + + let hits = kb.search_via_cli_json("--code-lang yaml api"); + assert!(!hits.is_empty(), "expected at least 1 yaml hit"); + assert_eq!(hits[0]["citation"]["lang"].as_str(), Some("yaml")); + assert_eq!(hits[0]["citation"]["symbol"].as_str(), Some("Deployment/prod/api")); +} + +#[test] +fn tier2_dockerfile_ingest_searchable() { + let kb = isolated_kb(); + let path = kb.workspace_root().join("Dockerfile"); + std::fs::write(&path, "FROM rust:1.94\nRUN cargo install foo\n").unwrap(); + kb.ingest_via_cli().expect("ingest"); + + let hits = kb.search_via_cli_json("--code-lang dockerfile cargo"); + assert!(!hits.is_empty()); + assert_eq!(hits[0]["citation"]["lang"].as_str(), Some("dockerfile")); + assert_eq!(hits[0]["citation"]["symbol"].as_str(), Some("")); +} + +#[test] +fn tier2_cargo_toml_ingest_searchable() { + let kb = isolated_kb(); + let path = kb.workspace_root().join("Cargo.toml"); + std::fs::write(&path, "[package]\nname = \"demo\"\nversion = \"0.1.0\"\n").unwrap(); + kb.ingest_via_cli().expect("ingest"); + + let hits = kb.search_via_cli_json("--code-lang toml demo"); + assert!(!hits.is_empty()); + assert_eq!(hits[0]["citation"]["lang"].as_str(), Some("toml")); + assert_eq!(hits[0]["citation"]["symbol"].as_str(), Some("")); +} +``` + +(helper API — `isolated_kb()`, `workspace_root()`, `ingest_via_cli()`, `search_via_cli_json()` — 는 기존 9 테스트가 쓰는 그대로. 명명이 다르면 그 파일의 패턴 미러링.) + +- [ ] **Step 2**: 실행 → FAIL ("yaml symbol not present"). 
+ +```bash +cargo test -p kebab-app --test code_ingest_smoke tier2 -- --nocapture +``` + +- [ ] **Step 3**: Task D-G 가 이미 완료된 상태에서 실행하면 Step 2 는 FAIL 없이 곧바로 PASS 해야 함 (코드 변경 불필요). FAIL 이면 디버그: +- citation_helper 의 `Citation::Code` mapping (1A-1) 이 `lang` / `symbol` 을 wire 에 채우는지 확인. +- `code_lang_for_path` 가 호출되는지 확인 (kebab-source-fs/media.rs). + +- [ ] **Step 4**: 9 + 3 = 12 테스트 통과 후 commit: + +```bash +git add crates/kebab-app/tests/code_ingest_smoke.rs +git commit -m "$(cat <<'EOF' +test(p10-2): integration smoke tests for Tier 2 (k8s yaml + Dockerfile + Cargo.toml) + +Three new tests in code_ingest_smoke.rs verifying isolated-TempDir ingest + +--code-lang filter + Citation::Code.lang / .symbol shape for each Tier 2 +chunker. Brings the suite to 12 tests (Rust 3 + Python 1 + TS 1 + JS 1 + +Go 1 + Java 1 + Kotlin 1 + yaml 1 + dockerfile 1 + manifest 1). + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task I: frozen design §3.5 + §10.1 갱신 + +**Files:** +- Modify: `docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md` + +- [ ] **Step 1**: §3.5 의 `code_lang` 매핑 표 끝부분에 3 줄 추가 (JSON 줄과 Shell 줄 사이, 아래 diff 위치 — `code_lang_for_path` 정의 직전): + +```diff + - YAML / k8s manifest (`.yaml`, `.yml`) → `yaml` + - Dockerfile (`Dockerfile`, `*.dockerfile`) → `dockerfile` + - TOML (`.toml`) → `toml` + - JSON (`.json`) → `json` ++- XML (`.xml`, `pom.xml`) → `xml` ++- Groovy (`build.gradle`, `.gradle`) → `groovy` ++- Go module (`go.mod`) → `go-mod` + - Shell (`.sh`, `.bash`, `.zsh`) → `shell` +``` + +- [ ] **Step 2**: §10.1 의 deactivation log 표 (또는 줄 목록) 끝에 추가 (1C-Go, 1C-JK 활성화 줄 다음): + +``` +| p10-2 | Tier 2 활성화 — k8s-manifest-resource-v1 + dockerfile-file-v1 + manifest-file-v1 chunker 3종. code_lang 추가 매핑 (xml / groovy / go-mod). | 2026-05-20 | +``` + +(§10.1 의 정확한 형식 — table vs bullet list — 은 현재 파일을 Read 한 후 그 형식에 맞게.) 
+ +- [ ] **Step 3**: commit: + +```bash +git add docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md +git commit -m "$(cat <<'EOF' +docs(p10-2): activate Tier 2 in code-ingest design §10.1 + §3.5 mappings + +§3.5: add code_lang_for_path mappings xml / groovy / go-mod. +§10.1: add deactivation log entry for p10-2 (3 Tier 2 chunkers active). + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task J: README + HANDOFF + ARCHITECTURE + SMOKE + tasks/INDEX + tasks/p10/INDEX + full-suite gate + +**Files:** +- Modify: `README.md` +- Modify: `HANDOFF.md` +- Modify: `docs/ARCHITECTURE.md` +- Modify: `docs/SMOKE.md` +- Modify: `tasks/INDEX.md` +- Modify: `tasks/p10/INDEX.md` + +- [ ] **Step 1 — README.md**: **명령** 표의 ingest 행에 Tier 2 7종 언급 추가. 예 (기존 행이 "지원 lang: rust / python / typescript / javascript / go / java / kotlin" 형식이면): + +```diff +-지원 lang: rust / python / typescript / javascript / go / java / kotlin ++지원 lang: rust / python / typescript / javascript / go / java / kotlin / yaml (k8s) / dockerfile / toml / json / xml / groovy / go-mod +``` + +Configuration 섹션 변경 없음 (gating flag 신설 없음). Mermaid 다이어그램은 변경 없음 (code 카테고리 이미 존재). + +- [ ] **Step 2 — HANDOFF.md**: phase 표의 p10-2 행 ⏳ → ✅. "머지 후 발견된 버그 / 결정 (요약)" 섹션 변경 불필요 (post-merge 발견 시 별도 PR). + +- [ ] **Step 3 — docs/ARCHITECTURE.md**: 디렉토리 트리의 `crates/kebab-chunk/src/` 트리에 3 줄 추가: + +``` +crates/kebab-chunk/src/ +├── code_*_ast_v1.rs (Tier 1, 7개) +├── k8s_manifest_resource_v1.rs (Tier 2, p10-2) +├── dockerfile_file_v1.rs (Tier 2, p10-2) +├── manifest_file_v1.rs (Tier 2, p10-2) +├── tier2_shared.rs (Tier 2 helper, p10-2) +└── ... +``` + +- [ ] **Step 4 — docs/SMOKE.md**: 한 줄 추가 — Tier 2 smoke 검증 (yaml + Dockerfile + Cargo.toml ingest → search --code-lang yaml/dockerfile/toml). + +- [ ] **Step 5 — tasks/INDEX.md** + **tasks/p10/INDEX.md**: p10-2 status ⏳ → ✅. 
+ +- [ ] **Step 6 — Full-suite gate** (memory-conscious): + +```bash +df -h / # 공간 확인 +cargo clean # heavy 면 +cargo test --workspace --no-fail-fast -j 1 2>&1 | tail -60 +cargo clippy --workspace --all-targets -- -D warnings 2>&1 | tail -30 +``` + +Expected: 모든 테스트 PASS, clippy clean. + +- [ ] **Step 7**: commit: + +```bash +git add README.md HANDOFF.md docs/ARCHITECTURE.md docs/SMOKE.md tasks/INDEX.md tasks/p10/INDEX.md +git commit -m "$(cat <<'EOF' +docs(p10-2): README/HANDOFF/ARCHITECTURE/SMOKE/INDEX + tasks/p10/INDEX + +User-visible surface sync per the docs-split rule: README adds Tier 2 langs +in the command table; HANDOFF flips p10-2 to ✅; ARCHITECTURE adds the new +chunker modules + tier2_shared.rs to the directory tree; SMOKE adds a +yaml/Dockerfile/Cargo.toml smoke step; both INDEX files flip p10-2 to ✅. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task K: version bump 0.13.0 → 0.14.0 + gitea PR + +**Files:** +- Modify: `Cargo.toml` (workspace `version`) +- Modify: `Cargo.lock` (자동 갱신) + +- [ ] **Step 1**: `Cargo.toml` 의 `[workspace.package] version = "0.13.0"` → `"0.14.0"`. + +- [ ] **Step 2**: `cargo build -p kebab` 한 번 — `Cargo.lock` 갱신. + +- [ ] **Step 3**: commit: + +```bash +git add Cargo.toml Cargo.lock +git commit -m "$(cat <<'EOF' +chore: bump version 0.13.0 → 0.14.0 (p10-2 Tier 2 resource-aware) + +Minor bump — additive code_lang values (xml / groovy / go-mod) + 3 new +chunker_version labels (k8s-manifest-resource-v1 / dockerfile-file-v1 / +manifest-file-v1) + frozen design §3.5 deltas. No DB migration, no wire +schema major bump. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +- [ ] **Step 4**: gitea PR open via gitea-ops skill. Branch `feat/p10-2-tier2-resource` → `main`. Title: `feat(p10-2): Tier 2 resource-aware chunkers (k8s + Dockerfile + manifest)`. + +- [ ] **Step 5**: 사용자가 APPROVE 하면 즉시 머지 (memory: feedback_pr_workflow). 머지 후 main pull + branch 정리 + `gitea-release v0.14.0` (gitea-ops skill). 
+ +--- + +## Verification matrix (final, after Task K merge) + +| 검증 | 명령 | 기대 | +|------|------|------| +| Tier 2 lang routing | `kebab schema --json \| jq '.stats.code_lang_breakdown'` (Tier 2 파일 ingest 후) | yaml / dockerfile / toml / json / xml / groovy / go-mod 카운트 등장 | +| k8s symbol shape | `kebab search --code-lang yaml --json` | citation.symbol = `//` | +| Dockerfile chunk | `kebab search --code-lang dockerfile --json` | citation.symbol = ``, line 1..EOF | +| manifest chunk | `kebab search --code-lang toml --json` | citation.symbol = ``, lang 매핑 | +| 비-k8s YAML skip | docker-compose.yml ingest | 0 chunk, IngestReport.skipped 카운트 +1 | +| Invalid YAML skip | 의도적 invalid yaml ingest | 0 chunk, IngestReport.skipped + warning | + +`docs/SMOKE.md` 의 Tier 2 절을 따라 수동 검증 가능. + +--- + +## Risks 재요약 (구현 중 주의) + +- `^---\s*$` regex 가 너무 좁음 — YAML 표준 상 `---` 뒤 공백 + comment 가능. fixture 로 검증 + 필요시 regex 완화. +- `serde_yaml::Value::as_str()` 가 boolean / number 에 None 반환 — apiVersion/kind 가 string 임을 강제. 이미 spec 명시. +- pre-1.0 의 Cargo.toml workspace version 위치 — `[workspace.package]` 가 맞는지 현재 파일 Read 후 확인. +- `synthesize_tier2_document` 의 `doc_id` 가 임시값 (`asset_id.0`) — downstream 의 진짜 doc_id 생성 로직과 충돌 가능. 1A-2 의 extractor return 형식과 같은 doc_id 정책 적용. Step 3 impl 작성 전 RustAstExtractor 의 Document 생성 코드 확인. +- pom.xml 거대 fixture 가 200 줄 넘어가면 oversize split 검증 좋음 — 필요시 Task F 의 sample_pom.xml 을 일부러 길게. diff --git a/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md b/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md index c216ed6..b946439 100644 --- a/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md +++ b/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md @@ -1549,6 +1549,8 @@ transitional 형태) 의 source of truth. **p10-1C-JavaKotlin 활성화 (Java + Kotlin) (2026-05-20)**: Java (`code-java-ast-v1`, `.java`) + Kotlin (`code-kotlin-ast-v1`, `.kt`/`.kts`) AST chunker 활성화. symbol = `com.foo.Foo.bar` 형식 (패키지 + 클래스 + 메서드/필드). 
Kotlin grammar 은 `tree-sitter-kotlin-ng` 사용 (bare `tree-sitter-kotlin` 은 tree-sitter 0.21–0.23 고착으로 사용 불가). +**p10-2 활성화 (Tier 2 chunker) (2026-05-20)**: Tier 2 resource-aware chunker 3종 활성화 — k8s-manifest-resource-v1 (`.yaml`/`.yml`), dockerfile-file-v1 (`Dockerfile`), manifest-file-v1 (`Cargo.toml` 등 설정 파일). 추가 code_lang 매핑: XML (`.xml`, `pom.xml`), Groovy (`build.gradle`, `.gradle`), Go module (`go.mod`). + ### 10.2 MCP server transport (fb-30) `kebab mcp` 가 stdio JSON-RPC server. Rust SDK = `rmcp 1.6`. Tool surface diff --git a/docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md b/docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md index 3a780eb..98f0b0a 100644 --- a/docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md +++ b/docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md @@ -237,6 +237,9 @@ pub struct Metadata { - Dockerfile (`Dockerfile`, `*.dockerfile`) → `dockerfile` - TOML (`.toml`) → `toml` - JSON (`.json`) → `json` +- XML (`.xml`, `pom.xml`) → `xml` +- Groovy (`build.gradle`, `.gradle`) → `groovy` +- Go module (`go.mod`) → `go-mod` - Shell (`.sh`, `.bash`, `.zsh`) → `shell` - Make (`Makefile`, `*.mk`) → `make` - 미지원 / Tier 3 fallback → null diff --git a/tasks/INDEX.md b/tasks/INDEX.md index c9496fe..47922d5 100644 --- a/tasks/INDEX.md +++ b/tasks/INDEX.md @@ -145,7 +145,7 @@ P0~P5 는 직렬. P6~P9 는 P5 이후 병렬 가능. 
- p10-1C-Go Go AST chunker — 🟡 PR 오픈 (v0.12.0, `code-go-ast-v1`) - p10-1C-JavaKotlin Java + Kotlin AST chunkers — 🟢 PR 오픈 (v0.13.0, `code-java-ast-v1` / `code-kotlin-ast-v1`) - p10-1D C + C++ AST chunkers — ⏳ - - p10-2 Tier 2 resource-aware — ⏳ + - p10-2 Tier 2 resource-aware — ✅ 머지 (v0.14.0, `k8s-manifest-resource-v1` / `dockerfile-file-v1` / `manifest-file-v1`) - p10-3 Tier 3 paragraph + line-window fallback — ⏳ ## Post-merge 핫픽스 diff --git a/tasks/p10/INDEX.md b/tasks/p10/INDEX.md index 2ec3bd9..c14a287 100644 --- a/tasks/p10/INDEX.md +++ b/tasks/p10/INDEX.md @@ -8,7 +8,7 @@ | 1C-Go | Go AST chunker (`code-go-ast-v1`) | 🟡 PR 오픈 (v0.12.0) | | 1C-JavaKotlin | Java + Kotlin AST chunkers (`code-java-ast-v1` / `code-kotlin-ast-v1`) | 🟢 PR 오픈 (v0.13.0) | | 1D | C + C++ AST chunkers | ⏳ | -| 2 | Tier 2 resource-aware (k8s / Dockerfile / manifest) | ⏳ | +| 2 | Tier 2 resource-aware (k8s / Dockerfile / manifest) | ✅ 머지 (v0.14.0) | | 3 | Tier 3 paragraph + line-window fallback | ⏳ | Design: [2026-05-15-kebab-code-ingest-design.md](../../docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md) diff --git a/tasks/p10/p10-2-tier2-resource-aware.md b/tasks/p10/p10-2-tier2-resource-aware.md new file mode 100644 index 0000000..8ae66ad --- /dev/null +++ b/tasks/p10/p10-2-tier2-resource-aware.md @@ -0,0 +1,120 @@ +# p10-2 — Tier 2 resource-aware chunkers (k8s + Dockerfile + manifest) + +**Status:** 🟡 진행 중 +**Contract sections:** §3.3 (chunker_version `k8s-manifest-resource-v1` + `dockerfile-file-v1` + `manifest-file-v1`), §3.4 (citation symbol — `//` / `` / ``), §3.5 (code_lang 추가 매핑 `xml` / `groovy` / `go-mod`), §6.1 (`kebab-parse-code/src/lang.rs` 갱신 + `kebab-source-fs/src/media.rs` 의 inline duplication 정리), §6.2 (`kebab-chunk/src/{k8s_manifest_resource_v1,dockerfile_file_v1,manifest_file_v1}.rs`), §9.2 (Tier 2 정의), §10.1 (deactivation log 한 줄). 
+**Design:** [2026-05-15-kebab-code-ingest-design.md](../../docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md) §1.2 (Phase 2) + §9.2. +**Plan:** [2026-05-20-p10-2-tier2-resource-aware.md](../../docs/superpowers/plans/2026-05-20-p10-2-tier2-resource-aware.md). + +## Goal + +p10-1A-2 / 1B / 1C 인프라 위에 Tier 2 resource-aware chunker 3종을 단일 PR 로 활성화. AST 가 아닌 file/document-level chunking — 1B (Python+TS+JS) 의 묶음 패턴 따름. 머지 시점부터 `.yaml` / `.yml` / `Dockerfile` / 매니페스트 7종 dogfooding 가능. + +비-k8s YAML (Helm values, CI yml, docker-compose 등) 및 invalid YAML 은 본 phase 에선 skip — p10-3 의 paragraph fallback 이 머지되면 자동으로 wire 됨. + +## 동결된 설계 결정 (이 task 로 확정) + +### 공통 + +- **3 chunker = self-contained**. `kebab-parse-code` 에 Tier 2 용 extractor 모듈 추가 없음. lang.rs 의 `code_lang_for_path` 갱신만. AST 가 아니라 추상화 비용이 코드 보상보다 큼. +- **`code_lang_for_path` = single source of truth** (design §3.5). `kebab-source-fs/src/media.rs` 의 inline 확장자 match 는 이 함수 호출로 통일 (1A-1 부터 누적된 duplication 정리, 작은 리팩토링). +- **parser_version** = `"none-v1"` 통일. Tier 2 는 parse 단계가 없음을 명시하는 sentinel. chunker_version cascade 만 의미 있음. +- **oversize fallback** = AST chunker 와 동일 정책 (`AST_CHUNK_MAX_LINES = 200` 초과 시 line-window split). 거대 ConfigMap / multi-stage Dockerfile / aggregate POM 대비. split chunk 는 같은 symbol 공유 (line range 만 다름). +- **frozen design 갱신** (본 PR 안에서): + - §3.5 `code_lang` 매핑 표에 3 줄 추가: + - XML (`.xml`, `pom.xml`) → `xml` + - Groovy (`build.gradle`, `.gradle`) → `groovy` + - Go module (`go.mod`) → `go-mod` + - §10.1 deactivation log 한 줄 추가: "p10-2 활성화 — Tier 2 chunker 3종 active." + +### k8s-manifest-resource-v1 + +- **Trigger**: `MediaType::Code("yaml")` (= `.yaml` / `.yml`). +- **k8s 식별**: YAML document 의 top-level mapping 에 `apiVersion: ` + `kind: ` 둘 다 있어야 인정. 하나라도 없거나 string 타입이 아니면 그 document skip (전체 파일 skip 아님 — 다른 document 는 정상 처리). 
+- **Multi-document split 구현**: `serde_yaml::Deserializer::from_str` 의 multi-document iterator 가 line offset 을 안 줘서, 원본 텍스트의 `^---\s*$` 줄 정규식 기준으로 pre-split 후 각 슬라이스를 deserialize. line_start/line_end 는 pre-split 단계에서 추적. trailing `---` 의 빈 슬라이스는 skip. +- **Symbol**: `//` (namespace 있으면) 또는 `/` (cluster-scoped) 또는 `/` (name 누락). 예: `Deployment/prod/api-server`, `ClusterRole/cluster-admin`, `ConfigMap/`. +- **Chunk text**: pre-split 슬라이스의 원본 텍스트 그대로 (deserialized form 아님 — 원본 보존). +- **Citation**: `Citation::Code { path, line_start, line_end, symbol: Some(<위>), lang: Some("yaml") }`. +- **Failure modes**: + - Invalid YAML (어떤 document 라도 deserialize 실패) → 파일 전체 emit 0 chunk + warning log `invalid yaml: {path}`. p10-3 의 paragraph fallback 이 picked up. + - 인정된 document 0개 (모두 비-k8s) → 파일 전체 emit 0 chunk. 동일 fallback. + +### dockerfile-file-v1 + +- **Trigger**: `MediaType::Code("dockerfile")` — 파일명이 정확히 `Dockerfile`, 또는 prefix `Dockerfile.` (e.g. `Dockerfile.dev`), 또는 확장자 `.dockerfile` (e.g. `myapp.dockerfile`). +- **Algorithm**: 파일 전체 텍스트 → 1 chunk emit. +- **Symbol**: 통일 ``. +- **Citation**: `Citation::Code { path, line_start: 1, line_end: , symbol: Some(""), lang: Some("dockerfile") }`. + +### manifest-file-v1 + +- **Trigger**: 파일명이 design §9.2 의 7종 중 하나: + | basename | code_lang | + |----------------|-----------| + | `Cargo.toml` | `toml` | + | `pyproject.toml` | `toml` | + | `package.json` | `json` | + | `tsconfig.json`| `json` | + | `go.mod` | `go-mod` | + | `pom.xml` | `xml` | + | `build.gradle` | `groovy` | +- **제외**: `build.gradle.kts` 는 1C-JK 의 Kotlin AST chunker (code-kotlin-ast-v1) 가 잡으므로 본 chunker 의 대상 아님. +- **Algorithm**: 파일 전체 텍스트 → 1 chunk emit. +- **Symbol**: 통일 `` (7종 모두). manifest 종류 구분은 `code_lang` 으로 — 예: `--code-lang go-mod` 는 go.mod 만, `--code-lang toml` 은 Cargo.toml + pyproject.toml. +- **Citation**: `Citation::Code { path, line_start: 1, line_end: , symbol: Some(""), lang: Some(<위 매핑>) }`. 
+ +### Routing (kebab-app::ingest_one_code_asset) + +기존 7-arm AST match 옆에 Tier 2 분기 추가: + +```text +"rust" | "python" | "typescript" | "javascript" + | "go" | "java" | "kotlin" → 기존 AST chunker (1A-2 / 1B / 1C) +"yaml" → k8s_manifest_resource_v1 +"dockerfile" → dockerfile_file_v1 +"toml" | "json" | "xml" + | "groovy" | "go-mod" → manifest_file_v1 +_ → skip (p10-3 fallback 의 자리) +``` + +`code_lang_for_path` 의 lookup 순서: basename 우선 매칭 (`Cargo.toml` / `Dockerfile.*` / etc.) → 확장자 fallback (`.yaml` / `.toml` / etc.). + +## Acceptance criteria + +- `cargo test --workspace --no-fail-fast -j 1` passes (memory-conscious: per-crate 위주, full-suite gate 는 docs task 직전 1회). +- `cargo clippy --workspace --all-targets -- -D warnings` passes. +- 각 chunker 의 snapshot test 안정: + - `crates/kebab-chunk/tests/fixtures/sample.yaml` — 2 k8s doc (Deployment + Service) + 1 비-k8s doc (apiVersion 빠짐) → 2 chunk emit, 비-k8s doc skip. + - `crates/kebab-chunk/tests/fixtures/sample.dockerfile` → 1 chunk, symbol ``. + - `crates/kebab-chunk/tests/fixtures/sample.Cargo.toml` + `sample.package.json` + `sample.pom.xml` + `sample.go.mod` (4종) → 각 1 chunk, symbol ``, 매핑된 code_lang. +- `code_lang_for_path` 의 basename 우선 매칭 + 확장자 fallback unit test. +- 격리 TempDir KB 에 yaml + Dockerfile + Cargo.toml 두고 `kebab search --code-lang yaml --json` / `--code-lang dockerfile --json` / `--code-lang toml --json` 각각 `Citation::Code` 반환 (기존 `code_ingest_smoke.rs` 에 3 테스트 추가, 총 12 테스트). +- `kebab schema --json | jq .stats.code_lang_breakdown` 에 `yaml` / `dockerfile` / `toml` / `json` / `xml` / `groovy` / `go-mod` 카운트 (사용된 것만 등장). +- README + HANDOFF + docs/ARCHITECTURE + docs/SMOKE + tasks/INDEX + tasks/p10/INDEX 갱신. +- frozen design §3.5 매핑 3 줄 + §10.1 활성화 한 줄. +- workspace `Cargo.toml` minor bump (0.13.0 → 0.14.0), gitea-release v0.14.0. 
+ +## Allowed dependencies + +- `kebab-chunk` 에 새 모듈 3개 (`k8s_manifest_resource_v1.rs` / `dockerfile_file_v1.rs` / `manifest_file_v1.rs`) 및 dep entry `serde_yaml = { workspace = true }` (workspace 에 이미 존재). 기존 deps (kebab-core / serde_json_canonicalizer / blake3 / anyhow / tracing) 유지. +- `kebab-parse-code` 의 `lang.rs` 갱신만. extractor 모듈 추가 없음, 새 crate dep 없음. +- `kebab-source-fs/src/media.rs` — `code_lang_for_path` 호출로 inline match 정리. 기존 dep 유지 (kebab-parse-code 는 이미 의존). +- `kebab-app::ingest_one_code_asset` — match 분기 확장. 새 crate dep 없음. + +## Forbidden dependencies + +- `kebab-chunk` 가 store / embed / llm / rag / tree-sitter 직접 import 금지 (boundary §6.3 유지). +- `kebab-parse-code` 가 store / embed / llm / rag 직접 import 금지. +- UI crate (`kebab-cli` / `kebab-mcp` / `kebab-tui` / `kebab-desktop`) 가 `kebab-parse-code` / `kebab-chunk` 직접 import 금지 — `kebab-app` facade 만. + +## Risks / notes + +- **serde_yaml line offset 없음** → 원본 텍스트의 `^---\s*$` 정규식 split 으로 line 추적. trailing `---` 의 빈 슬라이스 / 첫 슬라이스에 `---` prefix 없음 / 비-표준 separator (예: `--- # comment`) 모두 fixture 로 검증. +- **apiVersion / kind 가 string 이 아닌 경우** (예: `kind: 42`) — `serde_yaml::Value::as_str()` 으로 string 체크 후 인정. 비-string 이면 비-k8s 취급. +- **cluster-scoped resource** (Namespace, ClusterRole, ClusterRoleBinding, …) — metadata.namespace 없음이 정상. symbol = `<kind>/<name>` 형태. +- **metadata.name 누락** — 비정상이지만 panic 금지. `<kind>/` fallback + warning log. +- **거대 ConfigMap / Helm-rendered manifest** — `AST_CHUNK_MAX_LINES = 200` oversize fallback. split chunk 가 같은 symbol 공유 → search 시 dedupe 또는 user-visible 두 hit 으로 보임 (1A-2 의 oversize 와 동일 동작). +- **YAML anchor / merge keys (`&`, `<<`, `*`)** — serde_yaml 이 자동 resolve. 원본 텍스트 보존 정책상 chunk text 는 원본 (resolve 전) 유지, 파싱은 resolve 후 값으로. +- **`Dockerfile.example` 같은 doc-purpose 파일** — 확장자/접두사 매칭에 잡힘. user intent 와 어긋날 수 있으나 본 phase 의 scope 밖 (skip 정책은 1A-1 의 size/built-in/generated 정책으로 통제). dogfood 후 false positive 빈도 보고 HOTFIXES 결정. 
+- **`pom.xml` aggregate parent POM** — 매우 큼 (수백~수천 줄). oversize fallback 으로 split. 거대 fixture 로 한 번 검증. +- **`media.rs` 정리** — 1A-1 부터 누적된 inline `match extension` duplication 을 `code_lang_for_path` 호출로 교체. 기존 단위 테스트 동작 보존 (테스트는 결과 값만 보므로 통과해야 함). +- **머지 후 deviation** 은 `tasks/HOTFIXES.md` dated 로그 + 본 spec `Risks / notes` 에 one-line cross-link.