diff --git a/Cargo.lock b/Cargo.lock index 0839e76..34e4a91 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4127,7 +4127,7 @@ dependencies = [ [[package]] name = "kebab-app" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "base64 0.22.1", @@ -4172,7 +4172,7 @@ dependencies = [ [[package]] name = "kebab-chunk" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "blake3", @@ -4188,7 +4188,7 @@ dependencies = [ [[package]] name = "kebab-cli" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "clap", @@ -4209,7 +4209,7 @@ dependencies = [ [[package]] name = "kebab-config" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "dirs 5.0.1", @@ -4224,7 +4224,7 @@ dependencies = [ [[package]] name = "kebab-core" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "blake3", @@ -4238,7 +4238,7 @@ dependencies = [ [[package]] name = "kebab-embed" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "blake3", @@ -4252,7 +4252,7 @@ dependencies = [ [[package]] name = "kebab-embed-local" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "fastembed", @@ -4265,7 +4265,7 @@ dependencies = [ [[package]] name = "kebab-eval" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "kebab-app", @@ -4284,7 +4284,7 @@ dependencies = [ [[package]] name = "kebab-llm" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "kebab-core", @@ -4293,7 +4293,7 @@ dependencies = [ [[package]] name = "kebab-llm-local" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "kebab-config", @@ -4310,7 +4310,7 @@ dependencies = [ [[package]] name = "kebab-mcp" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "kebab-app", @@ -4328,7 +4328,7 @@ dependencies = [ [[package]] name = "kebab-normalize" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "kebab-core", @@ -4343,7 +4343,7 @@ dependencies = [ [[package]] name = 
"kebab-parse-code" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "gix", @@ -4364,7 +4364,7 @@ dependencies = [ [[package]] name = "kebab-parse-image" -version = "0.14.0" +version = "0.15.0" dependencies = [ "ab_glyph", "anyhow", @@ -4388,7 +4388,7 @@ dependencies = [ [[package]] name = "kebab-parse-md" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "kebab-core", @@ -4405,7 +4405,7 @@ dependencies = [ [[package]] name = "kebab-parse-pdf" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "blake3", @@ -4418,7 +4418,7 @@ dependencies = [ [[package]] name = "kebab-parse-types" -version = "0.14.0" +version = "0.15.0" dependencies = [ "kebab-core", "serde", @@ -4426,7 +4426,7 @@ dependencies = [ [[package]] name = "kebab-rag" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "blake3", @@ -4447,7 +4447,7 @@ dependencies = [ [[package]] name = "kebab-search" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "globset", @@ -4466,7 +4466,7 @@ dependencies = [ [[package]] name = "kebab-source-fs" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "blake3", @@ -4485,7 +4485,7 @@ dependencies = [ [[package]] name = "kebab-store-sqlite" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "blake3", @@ -4506,7 +4506,7 @@ dependencies = [ [[package]] name = "kebab-store-vector" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "arrow", @@ -4530,7 +4530,7 @@ dependencies = [ [[package]] name = "kebab-tui" -version = "0.14.0" +version = "0.15.0" dependencies = [ "anyhow", "crossterm", diff --git a/Cargo.toml b/Cargo.toml index cf98073..571d7ff 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,7 @@ edition = "2024" rust-version = "1.85" license = "MIT OR Apache-2.0" repository = "https://github.com/altair823/kebab" -version = "0.14.0" +version = "0.15.0" [workspace.dependencies] anyhow = "1" diff --git a/HANDOFF.md b/HANDOFF.md index 
184b1e8..bb8338b 100644 --- a/HANDOFF.md +++ b/HANDOFF.md @@ -4,7 +4,7 @@ ## 한 줄 요약 -P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) 머지 완료. `kebab ingest` 가 markdown / image / PDF / 소스코드 (Rust / Python / TS / JS / Go / Java / Kotlin) / Tier 2 리소스 파일 (yaml/k8s / dockerfile / toml / json / xml / groovy / go-mod) 처리. `kebab search` / `kebab ask` 가 매체 가로질러 결과 + page / code citation 반환. `kebab tui` 가 4 패널 (Library + Search + Ask + Inspect) 제공. P10-2 (Tier 2 resource-aware) 완료 — 다음 후보 = P10-1D (C/C++) 또는 P10-3 (Tier 3 fallback) 또는 P9-5 (desktop tauri) 또는 보류 중인 P8 (audio). +P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) 머지 완료. `kebab ingest` 가 markdown / image / PDF / 소스코드 (Rust / Python / TS / JS / Go / Java / Kotlin) / Tier 2 리소스 파일 (yaml/k8s / dockerfile / toml / json / xml / groovy / go-mod) + Tier 3 paragraph fallback (shell / 비-k8s YAML / AST 실패 케이스) 처리. `kebab search` / `kebab ask` 가 매체 가로질러 결과 + page / code citation 반환. `kebab tui` 가 4 패널 (Library + Search + Ask + Inspect) 제공. P10-3 (Tier 3 paragraph fallback) 완료 — 다음 후보 = P10-1D (C/C++) 또는 P9-5 (desktop tauri) 또는 보류 중인 P8 (audio). ## Phase 로드맵 @@ -20,7 +20,7 @@ P0–P5 + P6 + P7 + P9-1/2/3/4 (Library / Search / Ask / Inspect) 머지 완료. 
| **P7** | PDF text + page citation | `kebab-parse-pdf` | P5 | ✅ 완료 (3/3 component, page-level chunker + ingest wiring) | | **P8** | 음성 transcription + timestamp citation | `kebab-parse-audio` | P5 | ⏸ 보류 (whisper-rs 시스템 dep brainstorm 필요) | | **P9** | TUI + desktop app | `kebab-tui`, `kebab-desktop` | P5 | 🟡 진행 (4/5 component — P9-1/2/3/4 완료 [Library / Search / Ask / Inspect], P9-5 desktop 예정 · 도그푸딩 피드백 **20/20 ✅**) | -| **P10** | code ingest framework | `kebab-parse-code` | P5 | 🟡 진행 중 — 1A-1 ✅ (wire schema + parse-code skeleton + filter flags), 1A-2 ✅ (Rust AST chunker, `code-rust-ast-v1` — v0.7.0), 1B ✅ (Python/TS/JS AST chunkers — v0.8.0 이후), **1C-Go ✅ (Go AST chunker, `code-go-ast-v1` — v0.12.0)**, **1C-JavaKotlin ✅ (Java + Kotlin AST chunkers, `code-java-ast-v1` / `code-kotlin-ast-v1` — v0.13.0)**, **2 ✅ (Tier 2 resource-aware: yaml/k8s + dockerfile + manifest, `k8s-manifest-resource-v1` / `dockerfile-file-v1` / `manifest-file-v1` — v0.14.0)** | +| **P10** | code ingest framework | `kebab-parse-code` | P5 | 🟡 진행 중 — 1A-1 ✅ (wire schema + parse-code skeleton + filter flags), 1A-2 ✅ (Rust AST chunker, `code-rust-ast-v1` — v0.7.0), 1B ✅ (Python/TS/JS AST chunkers — v0.8.0 이후), **1C-Go ✅ (Go AST chunker, `code-go-ast-v1` — v0.12.0)**, **1C-JavaKotlin ✅ (Java + Kotlin AST chunkers, `code-java-ast-v1` / `code-kotlin-ast-v1` — v0.13.0)**, **2 ✅ (Tier 2 resource-aware: yaml/k8s + dockerfile + manifest, `k8s-manifest-resource-v1` / `dockerfile-file-v1` / `manifest-file-v1` — v0.14.0)**, **3 ✅ (Tier 3 paragraph fallback: code-text-paragraph-v1 — v0.15.0)** | P0~P5 직렬. P6~P9 P5 이후 병렬 가능. diff --git a/README.md b/README.md index a948a10..828ac89 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ kebab doctor | 명령 | 동작 | |------|------| | `kebab init` | XDG 경로에 데이터 디렉토리 + config.toml 생성 | -| `kebab ingest []` | Markdown / 이미지 / PDF / Rust 소스코드 색인 (idempotent). 
TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit. Markdown title 이 frontmatter 에 없어도 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (parser_version `md-frontmatter-v2`) — 기존 색인된 doc 도 다음 ingest 에서 새 title 로 갱신. **Incremental** (p9-fb-23): 두 번째 이후의 ingest 는 변하지 않은 doc (blake3 + parser/chunker/embedder version 모두 동일) 의 parse/chunk/embed/vector upsert 를 자동 스킵. final summary 에 `N unchanged` 카운트 표시. `--force-reingest` 로 skip 무시 강제 재처리. **지원 형식** (extractor 자동 결정 — config 에 명시 불가): Markdown (`.md`), 이미지 (`.png` / `.jpg` / `.jpeg`, OCR + caption), PDF (`.pdf`), **소스코드** (`.rs` → `code-rust-ast-v1`, `.py` → `code-python-ast-v1`, `.ts`/`.tsx` → `code-ts-ast-v1`, `.js`/`.mjs`/`.cjs`/`.jsx` → `code-js-ast-v1`, `.go` → `code-go-ast-v1`, `.java` → `code-java-ast-v1`, `.kt`/`.kts` → `code-kotlin-ast-v1` — 모두 tree-sitter AST chunker; **Tier 2 리소스 파일**: `.yaml`/`.yml` → `k8s-manifest-resource-v1` (apiVersion+kind 파싱), `Dockerfile`/`Dockerfile.*`/`*.dockerfile` → `dockerfile-file-v1` (전체 파일), `Cargo.toml`/`pyproject.toml`/`.toml`/`package.json`/`tsconfig.json`/`.json`/`pom.xml`/`.xml`/`build.gradle`/`.gradle`/`go.mod` → `manifest-file-v1` (전체 파일) — yaml (k8s) / dockerfile / toml / json / xml / groovy / go-mod 지원). 다른 확장자는 자동 skip — `IngestItem.warnings` 에 사유 (`"unsupported media type: .docx"` 등), `IngestReport.skipped_by_extension` 에 카운트 분류, CLI / TUI summary 에 breakdown 표시. 코드 chunk 는 `citation.kind = "code"` 에 `citation.lang = ""` + `symbol` + line range 를 담고, SearchHit top-level 에 `code_lang` + `repo` (`.git/` walk-up 의 디렉토리 이름) 가 backfill 됨. 
`--code-lang rust` / `--code-lang python` / `--code-lang typescript` / `--code-lang javascript` / `--code-lang go` / `--code-lang java` / `--code-lang kotlin` / `--code-lang yaml` / `--code-lang dockerfile` / `--code-lang toml` / `--code-lang json` / `--code-lang xml` / `--code-lang groovy` / `--code-lang go-mod` / `--media code` filter 로 언어별·코드 전용 검색 가능 (p10-1A-1 filter flags). Python symbol 은 workspace 경로 → dotted module path prefix (예: `kebab_eval.metrics.compute_mrr`), TS/JS symbol 은 slash-style module path prefix (예: `src/Foo.Foo.search`), Go symbol 은 `package.Func` / `package.(*Receiver).Method` 형식, Java / Kotlin symbol 은 `com.foo.Foo.bar` 형식 (패키지 + 클래스 + 메서드/필드). | +| `kebab ingest []` | Markdown / 이미지 / PDF / Rust 소스코드 색인 (idempotent). TTY 에서는 stderr 진행 바, non-TTY (CI / pipe) 는 stderr 한 줄씩, `--json` 은 stdout 에 `ingest_progress.v1` 라인 streaming 후 마지막에 `ingest_report.v1`. Ctrl-C 한 번이면 현재 asset 마무리 후 abort (부분 commit 보존, idempotent re-run), 두 번째 Ctrl-C 는 hard exit. Markdown title 이 frontmatter 에 없어도 첫 H1 → H2 → 첫 paragraph 80 자 → 파일명 순으로 자동 채움 (parser_version `md-frontmatter-v2`) — 기존 색인된 doc 도 다음 ingest 에서 새 title 로 갱신. **Incremental** (p9-fb-23): 두 번째 이후의 ingest 는 변하지 않은 doc (blake3 + parser/chunker/embedder version 모두 동일) 의 parse/chunk/embed/vector upsert 를 자동 스킵. final summary 에 `N unchanged` 카운트 표시. `--force-reingest` 로 skip 무시 강제 재처리. 
**지원 형식** (extractor 자동 결정 — config 에 명시 불가): Markdown (`.md`), 이미지 (`.png` / `.jpg` / `.jpeg`, OCR + caption), PDF (`.pdf`), **소스코드** (`.rs` → `code-rust-ast-v1`, `.py` → `code-python-ast-v1`, `.ts`/`.tsx` → `code-ts-ast-v1`, `.js`/`.mjs`/`.cjs`/`.jsx` → `code-js-ast-v1`, `.go` → `code-go-ast-v1`, `.java` → `code-java-ast-v1`, `.kt`/`.kts` → `code-kotlin-ast-v1` — 모두 tree-sitter AST chunker; **Tier 2 리소스 파일**: `.yaml`/`.yml` → `k8s-manifest-resource-v1` (apiVersion+kind 파싱), `Dockerfile`/`Dockerfile.*`/`*.dockerfile` → `dockerfile-file-v1` (전체 파일), `Cargo.toml`/`pyproject.toml`/`.toml`/`package.json`/`tsconfig.json`/`.json`/`pom.xml`/`.xml`/`build.gradle`/`.gradle`/`go.mod` → `manifest-file-v1` (전체 파일) — yaml (k8s) / dockerfile / toml / json / xml / groovy / go-mod 지원); **Tier 3 paragraph fallback** (`.sh`/`.bash`/`.zsh` → `code-text-paragraph-v1`, blank-line paragraph split + 80-line/20-overlap line-window. Tier 1/2 가 0 chunk 또는 Err 시 자동 fallback — 비-k8s YAML 같은 케이스 picked up. symbol = None, lang 은 원본 보존.). 다른 확장자는 자동 skip — `IngestItem.warnings` 에 사유 (`"unsupported media type: .docx"` 등), `IngestReport.skipped_by_extension` 에 카운트 분류, CLI / TUI summary 에 breakdown 표시. 코드 chunk 는 `citation.kind = "code"` 에 `citation.lang = ""` + `symbol` + line range 를 담고, SearchHit top-level 에 `code_lang` + `repo` (`.git/` walk-up 의 디렉토리 이름) 가 backfill 됨. `--code-lang rust` / `--code-lang python` / `--code-lang typescript` / `--code-lang javascript` / `--code-lang go` / `--code-lang java` / `--code-lang kotlin` / `--code-lang yaml` / `--code-lang dockerfile` / `--code-lang toml` / `--code-lang json` / `--code-lang xml` / `--code-lang groovy` / `--code-lang go-mod` / `--code-lang shell` / `--media code` filter 로 언어별·코드 전용 검색 가능 (p10-1A-1 filter flags). 
Python symbol 은 workspace 경로 → dotted module path prefix (예: `kebab_eval.metrics.compute_mrr`), TS/JS symbol 은 slash-style module path prefix (예: `src/Foo.Foo.search`), Go symbol 은 `package.Func` / `package.(*Receiver).Method` 형식, Java / Kotlin symbol 은 `com.foo.Foo.bar` 형식 (패키지 + 클래스 + 메서드/필드). | | `kebab search --mode {lexical,vector,hybrid} "" [--no-cache] [--max-tokens N] [--snippet-chars N] [--cursor ] [--tag T] [--lang L] [--path-glob G] [--trust-min LEVEL] [--media TYPE] [--ingested-after RFC3339] [--doc-id ID] [--trace] [--bulk] [--repo NAME ...] [--code-lang LIST]` | 검색. hybrid는 RRF fusion, citation 포함. 같은 process 안에서 동일 query (NFKC + trim + lowercase 정규화) 반복 시 in-process LRU 캐시 hit (capacity = `[search] cache_capacity`, default 256). `--no-cache` 로 강제 bypass — 디버깅용. ingest commit 발생 시 `kv['corpus_revision']` bump 으로 모든 entry 자동 stale. **`--max-tokens` / `--snippet-chars` / `--cursor` (p9-fb-34)** — agent budget controls. `--json` 출력은 `search_response.v1` wrapper (`{hits, next_cursor, truncated}`) — pre-fb-34 의 bare array 와 호환 안 됨. mismatched cursor → `error.v1.code = stale_cursor`. **filter flags (p9-fb-36):** `--tag` 는 반복 가능 flag (`--tag rust --tag async`) 로 OR 매칭, `--media` 는 `,` 구분 다중 값 OR 매칭, 나머지 flags 간은 AND 조합. `--trust-min` 은 `primary\|secondary\|generated` 중 하나 (해당 level 이상 포함). `--ingested-after` 는 RFC3339 UTC — 파싱 실패 시 `error.v1.code = config_invalid` (exit 2). `--media md` 는 `markdown` alias 로 정규화. 알 수 없는 `--media` 값은 무조건 empty hits (오류 아님). **`--trace` (p9-fb-37)** — `search_response.v1.trace` 에 lexical / vector pre-fusion 후보 + RRF union + per-stage timing (`lexical_ms` / `vector_ms` / `fusion_ms` / `total_ms`) 노출. trace 요청은 캐시 우회 (`--no-cache` 없이도 항상 cold). **`--bulk` (p9-fb-42)** — stdin ndjson 으로 N query 한 번에 실행. `--json` 면 stdout per-query ndjson (`bulk_search_item.v1`) + stderr summary (`bulk_summary: total=N succeeded=S failed=F`). Cap 100. 
agent 가 query decomposition 후 sub-query 일괄 실행 시 single round-trip — App instance 재사용으로 캐시 / embedder cold-start 비용 한 번만. Per-query failure 는 item 의 `error` (error.v1) 에 격리, 다른 query 계속 진행. **code corpus filters (p10-1A-1):** `--repo` 는 반복 가능 (`--repo kebab --repo other`) OR 매칭. `--code-lang` 는 반복 또는 comma 다중 값 (`--code-lang rust,python`), 알 수 없는 값은 빈 hits. `--media code` 는 Tier 1/2/3 모든 code chunk 포함. 1A-1 시점에서는 indexed 된 code chunk 가 없어 filter 가 항상 빈 결과 — 1A-2 (Rust AST chunker) 머지 이후 실효. | | `kebab list docs` | 색인된 문서 목록 | | `kebab inspect doc ` / `kebab inspect chunk ` | raw record 보기 | @@ -132,7 +132,7 @@ flowchart TB subgraph Pipeline["도메인 + 파이프라인"] parse["parse-md / parse-pdf / parse-image / parse-code"] - chunker["chunker (md-heading-v1, pdf-page-v1, code-{rust,python,ts,js,go,java,kotlin}-ast-v1, k8s-manifest-resource-v1, dockerfile-file-v1, manifest-file-v1)"] + chunker["chunker (md-heading-v1, pdf-page-v1, code-{rust,python,ts,js,go,java,kotlin}-ast-v1, k8s-manifest-resource-v1, dockerfile-file-v1, manifest-file-v1, code-text-paragraph-v1)"] embedder["embedder (fastembed multilingual-e5-large)"] retriever["retriever (lexical / vector / hybrid RRF)"] rag["RAG pipeline"] diff --git a/crates/kebab-app/src/lib.rs b/crates/kebab-app/src/lib.rs index 585b47f..907d93b 100644 --- a/crates/kebab-app/src/lib.rs +++ b/crates/kebab-app/src/lib.rs @@ -39,7 +39,7 @@ use std::sync::Arc; use anyhow::{Context, anyhow}; use serde::{Deserialize, Serialize}; -use kebab_chunk::{CodeGoAstV1Chunker, CodeJavaAstV1Chunker, CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTsAstV1Chunker, DockerfileFileV1Chunker, K8sManifestResourceV1Chunker, ManifestFileV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; +use kebab_chunk::{CodeGoAstV1Chunker, CodeJavaAstV1Chunker, CodeJsAstV1Chunker, CodeKotlinAstV1Chunker, CodePythonAstV1Chunker, CodeRustAstV1Chunker, CodeTextParagraphV1Chunker, CodeTsAstV1Chunker, DockerfileFileV1Chunker, 
K8sManifestResourceV1Chunker, ManifestFileV1Chunker, MdHeadingV1Chunker, PdfPageV1Chunker}; use kebab_core::{ Answer, Block, CanonicalDocument, Chunk, ChunkId, ChunkPolicy, ChunkerVersion, Chunker, DocFilter, DocSummary, DocumentId, DocumentStore, Embedder, EmbeddingInput, @@ -948,11 +948,12 @@ fn ingest_one_asset( force_reingest, ); } - // p10-1A-2 / 1B: code ingest dispatch. p10-2: Tier 2 langs added. + // p10-1A-2 / 1B: code ingest dispatch. p10-2: Tier 2 langs added. p10-3: shell added. MediaType::Code(lang) if matches!(lang.as_str(), "rust" | "python" | "typescript" | "javascript" | "go" | "java" | "kotlin" - | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod") => + | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" + | "shell") => { return ingest_one_code_asset( app, @@ -1835,11 +1836,13 @@ fn ingest_one_code_asset( // p10-2: Tier 2 has no parse step — sentinel "none-v1". "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" => ParserVersion("none-v1".to_string()), + // p10-3: shell direct routes to Tier 3 (no parse step). + "shell" => ParserVersion("none-v1".to_string()), other => anyhow::bail!("unsupported code_lang: {other}"), }; // p10-1b Task D/G/J/L: chunker_version per-lang. - let chunker_version = match code_lang { + let mut chunker_version = match code_lang { "rust" => CodeRustAstV1Chunker.chunker_version(), "python" => CodePythonAstV1Chunker.chunker_version(), "typescript" => CodeTsAstV1Chunker.chunker_version(), @@ -1852,6 +1855,8 @@ fn ingest_one_code_asset( "dockerfile" => DockerfileFileV1Chunker.chunker_version(), "toml" | "json" | "xml" | "groovy" | "go-mod" => ManifestFileV1Chunker.chunker_version(), + // p10-3: + "shell" => CodeTextParagraphV1Chunker.chunker_version(), other => anyhow::bail!("unreachable chunker_version: {other}"), }; @@ -1877,70 +1882,145 @@ fn ingest_one_code_asset( }; // p10-1b Task D/G/J/L: extractor per-lang. 
- let mut canonical = match code_lang { + // p10-3: capture Result so Tier 1 extractor errors can fall back to Tier 3. + let canonical_result: anyhow::Result = match code_lang { "rust" => RustAstExtractor::new() .extract(&ctx, &bytes) - .context("kb-parse-code::RustAstExtractor::extract (code:rust)")?, + .context("kb-parse-code::RustAstExtractor::extract (code:rust)"), "python" => PythonAstExtractor::new() .extract(&ctx, &bytes) - .context("kb-parse-code::PythonAstExtractor::extract (code:python)")?, + .context("kb-parse-code::PythonAstExtractor::extract (code:python)"), "typescript" => TypescriptAstExtractor::new() .extract(&ctx, &bytes) - .context("kb-parse-code::TypescriptAstExtractor::extract (code:typescript)")?, + .context("kb-parse-code::TypescriptAstExtractor::extract (code:typescript)"), "javascript" => JavascriptAstExtractor::new() .extract(&ctx, &bytes) - .context("kb-parse-code::JavascriptAstExtractor::extract (code:javascript)")?, + .context("kb-parse-code::JavascriptAstExtractor::extract (code:javascript)"), "go" => GoAstExtractor::new() .extract(&ctx, &bytes) - .context("kb-parse-code::GoAstExtractor::extract (code:go)")?, + .context("kb-parse-code::GoAstExtractor::extract (code:go)"), "java" => JavaAstExtractor::new() .extract(&ctx, &bytes) - .context("kb-parse-code::JavaAstExtractor::extract (code:java)")?, + .context("kb-parse-code::JavaAstExtractor::extract (code:java)"), "kotlin" => KotlinAstExtractor::new() .extract(&ctx, &bytes) - .context("kb-parse-code::KotlinAstExtractor::extract (code:kotlin)")?, + .context("kb-parse-code::KotlinAstExtractor::extract (code:kotlin)"), // p10-2 Tier 2: no extractor — synthesize Document directly from raw bytes. "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" => { - synthesize_tier2_document(asset, &bytes, code_lang, &parser_version)? + synthesize_tier2_document(asset, &bytes, code_lang, &parser_version) } + // p10-3: shell reuses the same synthesizer. 
+ "shell" => synthesize_tier2_document(asset, &bytes, "shell", &parser_version), other => anyhow::bail!("unreachable (extract): {other}"), }; + // p10-3: Tier 1 extractor failure → fall back to Tier 3 synthesized doc. + // Tier 2 (yaml/dockerfile/…) and shell errors are real (e.g. non-UTF-8) — propagate. + let mut canonical = match canonical_result { + Ok(d) => d, + Err(e) if code_lang == "shell" + || matches!(code_lang, "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod") => + { + return Err(e).context("synthesize_tier2_document failed for tier 2/3 lang"); + } + Err(e) => { + // Tier 1 extractor errored — fall back to Tier 3 synthesized doc. + tracing::warn!( + workspace_path = %asset.workspace_path.0, + code_lang = code_lang, + error = %e, + "tier1 extract errored; falling back to tier 3 synthesized doc" + ); + chunker_version = CodeTextParagraphV1Chunker.chunker_version(); + let tier3_parser_version = ParserVersion("none-v1".to_string()); + synthesize_tier2_document(asset, &bytes, code_lang, &tier3_parser_version) + .context("synthesize_tier2_document for tier 3 fallback after extract error")? + } + }; + // p10-1b Task D/G/J/L: chunker per-lang. - let chunks = match code_lang { - "rust" => CodeRustAstV1Chunker + // p10-3: track whether the extract stage already fell back to Tier 3. + // Tier 2 langs already have "none-v1" parser_version normally, so exclude them + // from the extract_fell_back guard with the !matches! exclusion. + let extract_fell_back = canonical.parser_version.0 == "none-v1" + && !matches!(code_lang, "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" | "shell"); + + let chunks_result: anyhow::Result> = if extract_fell_back { + // Tier 1 lang whose extractor errored — go straight to Tier 3 chunker. 
+ CodeTextParagraphV1Chunker .chunk(&canonical, chunk_policy) - .context("kb-chunk::CodeRustAstV1Chunker::chunk (code:rust)")?, - "python" => CodePythonAstV1Chunker - .chunk(&canonical, chunk_policy) - .context("kb-chunk::CodePythonAstV1Chunker::chunk (code:python)")?, - "typescript" => CodeTsAstV1Chunker - .chunk(&canonical, chunk_policy) - .context("kb-chunk::CodeTsAstV1Chunker::chunk (code:typescript)")?, - "javascript" => CodeJsAstV1Chunker - .chunk(&canonical, chunk_policy) - .context("kb-chunk::CodeJsAstV1Chunker::chunk (code:javascript)")?, - "go" => CodeGoAstV1Chunker - .chunk(&canonical, chunk_policy) - .context("kb-chunk::CodeGoAstV1Chunker::chunk (code:go)")?, - "java" => CodeJavaAstV1Chunker - .chunk(&canonical, chunk_policy) - .context("kb-chunk::CodeJavaAstV1Chunker::chunk (code:java)")?, - "kotlin" => CodeKotlinAstV1Chunker - .chunk(&canonical, chunk_policy) - .context("kb-chunk::CodeKotlinAstV1Chunker::chunk (code:kotlin)")?, - // p10-2 Tier 2: - "yaml" => K8sManifestResourceV1Chunker - .chunk(&canonical, chunk_policy) - .context("kb-chunk::K8sManifestResourceV1Chunker::chunk")?, - "dockerfile" => DockerfileFileV1Chunker - .chunk(&canonical, chunk_policy) - .context("kb-chunk::DockerfileFileV1Chunker::chunk")?, - "toml" | "json" | "xml" | "groovy" | "go-mod" - => ManifestFileV1Chunker - .chunk(&canonical, chunk_policy) - .context("kb-chunk::ManifestFileV1Chunker::chunk")?, - other => anyhow::bail!("unreachable (chunk): {other}"), + .context("kb-chunk::CodeTextParagraphV1Chunker::chunk (tier 3 after extract fallback)") + } else { + match code_lang { + "rust" => CodeRustAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeRustAstV1Chunker::chunk (code:rust)"), + "python" => CodePythonAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::CodePythonAstV1Chunker::chunk (code:python)"), + "typescript" => CodeTsAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeTsAstV1Chunker::chunk 
(code:typescript)"), + "javascript" => CodeJsAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeJsAstV1Chunker::chunk (code:javascript)"), + "go" => CodeGoAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeGoAstV1Chunker::chunk (code:go)"), + "java" => CodeJavaAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeJavaAstV1Chunker::chunk (code:java)"), + "kotlin" => CodeKotlinAstV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeKotlinAstV1Chunker::chunk (code:kotlin)"), + // p10-2 Tier 2: + "yaml" => K8sManifestResourceV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::K8sManifestResourceV1Chunker::chunk"), + "dockerfile" => DockerfileFileV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::DockerfileFileV1Chunker::chunk"), + "toml" | "json" | "xml" | "groovy" | "go-mod" => ManifestFileV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::ManifestFileV1Chunker::chunk"), + // p10-3: + "shell" => CodeTextParagraphV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeTextParagraphV1Chunker::chunk (code:shell)"), + other => anyhow::bail!("unreachable (chunk): {other}"), + } + }; + + // p10-3: Tier 1/2 0-chunk OR error → Tier 3 fallback retry. + // "shell" direct path is already Tier 3 — don't retry-double-up. + let chunks: Vec = match chunks_result { + Ok(v) if !v.is_empty() => v, + other if code_lang == "shell" => other?, // shell propagates directly + Ok(_empty) => { + tracing::warn!( + workspace_path = %asset.workspace_path.0, + code_lang = code_lang, + "tier1/2 emitted 0 chunks; falling back to tier 3 (code-text-paragraph-v1)" + ); + chunker_version = CodeTextParagraphV1Chunker.chunker_version(); + canonical.parser_version = ParserVersion("none-v1".to_string()); + CodeTextParagraphV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeTextParagraphV1Chunker::chunk (tier 3 fallback)")? 
+ } + Err(e) => { + tracing::warn!( + workspace_path = %asset.workspace_path.0, + code_lang = code_lang, + error = %e, + "tier1/2 chunker errored; falling back to tier 3 (code-text-paragraph-v1)" + ); + chunker_version = CodeTextParagraphV1Chunker.chunker_version(); + canonical.parser_version = ParserVersion("none-v1".to_string()); + CodeTextParagraphV1Chunker + .chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeTextParagraphV1Chunker::chunk (tier 3 fallback after error)")? + } }; // Stamp chunker + embedding versions so incremental skip detection has diff --git a/crates/kebab-app/tests/code_ingest_smoke.rs b/crates/kebab-app/tests/code_ingest_smoke.rs index 69ac528..a462666 100644 --- a/crates/kebab-app/tests/code_ingest_smoke.rs +++ b/crates/kebab-app/tests/code_ingest_smoke.rs @@ -850,6 +850,181 @@ fn tier2_cargo_toml_ingest_searchable() { ); } +/// p10-3 Task E: a `.sh` file is ingested via the shell direct-Tier-3 path +/// and the resulting `Citation::Code` hit must carry `lang="shell"`, +/// `symbol=None`, `line_start >= 1`, and +/// `chunker_version = "code-text-paragraph-v1"`. 
+#[test] +fn tier3_shell_ingest_searchable() { + let env = TestEnv::lexical_only(); + + std::fs::write( + env.workspace_root.join("deploy.sh"), + "#!/usr/bin/env bash\nset -e\necho hello\n\nkebab ingest --json\n", + ) + .unwrap(); + + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); + assert_eq!(report.errors, 0, "no ingest errors: {report:?}"); + assert!(report.new >= 1, "shell file ingested: {report:?}"); + + let sh_item = report + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("deploy.sh")) + .expect("deploy.sh item present"); + assert_eq!( + sh_item.parser_version.as_ref().map(|p| p.0.as_str()), + Some("none-v1"), + "parser_version must be none-v1 for shell (Tier 3 direct)" + ); + assert_eq!( + sh_item.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-text-paragraph-v1"), + "chunker_version must be code-text-paragraph-v1 for shell" + ); + + let query = kebab_core::SearchQuery { + text: "kebab".to_string(), + mode: kebab_core::SearchMode::Lexical, + k: 10, + filters: kebab_core::SearchFilters { + code_lang: vec!["shell".to_string()], + ..Default::default() + }, + }; + let hits = kebab_app::search_with_config(env.config.clone(), query) + .expect("search must succeed"); + + let h = hits + .iter() + .find(|h| matches!(&h.citation, Citation::Code { .. })) + .expect("at least one Citation::Code hit for 'kebab'"); + + match &h.citation { + Citation::Code { + lang, + symbol, + line_start, + .. 
+ } => { + assert_eq!( + lang.as_deref(), + Some("shell"), + "citation.lang must be 'shell'" + ); + assert_eq!(*symbol, None, "Tier 3 symbol must be None"); + assert!(*line_start >= 1, "line_start must be >=1"); + } + _ => unreachable!(), + } + + assert_eq!( + h.code_lang.as_deref(), + Some("shell"), + "SearchHit.code_lang must be 'shell'" + ); + assert_eq!( + h.chunker_version.0.as_str(), + "code-text-paragraph-v1", + "shell chunks must be stamped with the Tier 3 chunker_version" + ); +} + +/// p10-3 Task E: a docker-compose-shaped YAML file (no `apiVersion`/`kind`) +/// is ingested; the k8s chunker returns `Ok(vec![])` and the Tier 3 fallback +/// wrapper retries with `CodeTextParagraphV1Chunker`. The resulting +/// `Citation::Code` hit must carry `lang="yaml"`, `symbol=None`, +/// `line_start >= 1`, and `chunker_version = "code-text-paragraph-v1"`. +#[test] +fn tier3_yaml_fallback_picks_up_non_k8s_yaml() { + let env = TestEnv::lexical_only(); + + // docker-compose-shaped YAML — version + services but no apiVersion/kind. + // The k8s chunker returns Ok(vec![]); Tier 3 fallback should pick this up. 
+ std::fs::write( + env.workspace_root.join("docker-compose.yml"), + "version: '3'\nservices:\n api:\n image: nginx:latest\n ports:\n - 8080:80\n", + ) + .unwrap(); + + let report = kebab_app::ingest_with_config(env.config.clone(), env.scope(), false) + .expect("ingest must succeed"); + assert_eq!(report.errors, 0, "no ingest errors: {report:?}"); + assert!( + report.new >= 1, + "expected non-k8s yaml ingested via Tier 3, got {} new docs", + report.new + ); + + let yaml_item = report + .items + .as_ref() + .expect("items present") + .iter() + .find(|i| i.doc_path.0.ends_with("docker-compose.yml")) + .expect("docker-compose.yml item present"); + assert_eq!( + yaml_item.parser_version.as_ref().map(|p| p.0.as_str()), + Some("none-v1"), + "parser_version must be none-v1 after Tier 3 fallback" + ); + assert_eq!( + yaml_item.chunker_version.as_ref().map(|c| c.0.as_str()), + Some("code-text-paragraph-v1"), + "chunker_version must be code-text-paragraph-v1 after Tier 3 fallback" + ); + + let query = kebab_core::SearchQuery { + text: "nginx".to_string(), + mode: kebab_core::SearchMode::Lexical, + k: 10, + filters: kebab_core::SearchFilters { + code_lang: vec!["yaml".to_string()], + ..Default::default() + }, + }; + let hits = kebab_app::search_with_config(env.config.clone(), query) + .expect("search must succeed"); + + let h = hits + .iter() + .find(|h| matches!(&h.citation, Citation::Code { .. })) + .expect("at least one Citation::Code hit for 'nginx'"); + + match &h.citation { + Citation::Code { + lang, + symbol, + line_start, + .. 
+ } => { + assert_eq!( + lang.as_deref(), + Some("yaml"), + "citation.lang must be 'yaml'" + ); + assert_eq!(*symbol, None, "Tier 3 fallback symbol must be None"); + assert!(*line_start >= 1, "line_start must be >=1"); + } + _ => unreachable!(), + } + + assert_eq!( + h.code_lang.as_deref(), + Some("yaml"), + "SearchHit.code_lang must be 'yaml'" + ); + assert_eq!( + h.chunker_version.0.as_str(), + "code-text-paragraph-v1", + "non-k8s yaml fallback must be stamped code-text-paragraph-v1" + ); +} + /// Re-ingesting the same `.rs` file without changes must report /// `Unchanged` (incremental-skip path exercised). #[test] diff --git a/crates/kebab-chunk/src/code_text_paragraph_v1.rs b/crates/kebab-chunk/src/code_text_paragraph_v1.rs new file mode 100644 index 0000000..cda5b99 --- /dev/null +++ b/crates/kebab-chunk/src/code_text_paragraph_v1.rs @@ -0,0 +1,170 @@ +//! p10-3: Tier 3 paragraph + line-window fallback chunker. +//! +//! Splits code/text files on blank-line paragraph boundaries. Paragraphs +//! with more than 80 lines are further split into 80-line windows with a +//! 20-line overlap (stride 60) — the same oversize pattern used by Tier 1/2 +//! chunkers but without AST structure, hence no symbol. +//! +//! Per spec §9.3: all emitted chunks carry `symbol: None`. + +use crate::tier2_shared::{build_chunk_no_symbol, policy_hash}; +use anyhow::Result; +use kebab_core::{Block, CanonicalDocument, Chunk, ChunkPolicy, ChunkerVersion, Chunker}; + +pub const VERSION_LABEL: &str = "code-text-paragraph-v1"; + +/// Lines-per-window for the oversize fallback (Tier 3). +const FALLBACK_LINES_PER_CHUNK: usize = 80; +/// Overlap between consecutive windows. +const FALLBACK_LINES_OVERLAP: usize = 20; +// stride = FALLBACK_LINES_PER_CHUNK - FALLBACK_LINES_OVERLAP = 60. 
+ +#[derive(Clone, Copy, Debug, Default)] +pub struct CodeTextParagraphV1Chunker; + +impl Chunker for CodeTextParagraphV1Chunker { + fn chunker_version(&self) -> ChunkerVersion { + ChunkerVersion(VERSION_LABEL.to_string()) + } + + fn policy_hash(&self, policy: &ChunkPolicy) -> String { + policy_hash(policy) + } + + fn chunk(&self, doc: &CanonicalDocument, policy: &ChunkPolicy) -> Result<Vec<Chunk>> { + // Expect a single Block::Code carrying the full source text. + let (text, lang_str) = match doc.blocks.first() { + Some(Block::Code(cb)) => (cb.code.as_str(), cb.lang.as_deref().unwrap_or("")), + _ => return Ok(vec![]), + }; + + let mut chunks = Vec::new(); + for para in split_paragraphs(text) { + push_paragraph(&mut chunks, doc, policy, &para, lang_str)?; + } + + tracing::debug!( + target: "kebab-chunk", + doc_id = %doc.doc_id, + chunks = chunks.len(), + "code-text-paragraph-v1 chunked", + ); + + Ok(chunks) + } +} + +/// A contiguous run of non-blank lines from the source text. +struct Paragraph { + /// Lines joined with `\n` (no trailing newline). + text: String, + /// 1-indexed line number of the first line in the source file. + line_start: u32, + /// 1-indexed line number of the last line in the source file. + line_end: u32, +} + +/// Split `text` into `Paragraph`s separated by blank (all-whitespace) lines. +/// +/// Blank lines are treated as boundaries and are NOT included in any +/// paragraph's line range. Paragraphs that would consist entirely of blank +/// lines are skipped.
+fn split_paragraphs(text: &str) -> Vec<Paragraph> { + let mut paragraphs = Vec::new(); + let mut current: Vec<&str> = Vec::new(); + let mut current_start: Option<u32> = None; + + for (idx, line) in text.lines().enumerate() { + let line_no = (idx + 1) as u32; + let is_blank = line.trim().is_empty(); + if is_blank { + if let Some(start) = current_start.take() { + let end = start + current.len() as u32 - 1; + paragraphs.push(Paragraph { + text: current.join("\n"), + line_start: start, + line_end: end, + }); + current.clear(); + } + } else { + if current_start.is_none() { + current_start = Some(line_no); + } + current.push(line); + } + } + // Flush any trailing paragraph not terminated by a blank line. + if let Some(start) = current_start { + let end = start + current.len() as u32 - 1; + paragraphs.push(Paragraph { + text: current.join("\n"), + line_start: start, + line_end: end, + }); + } + paragraphs +} + +/// Emit one or more chunks for a single paragraph. +/// +/// Paragraphs with ≤ `FALLBACK_LINES_PER_CHUNK` lines become a single chunk. +/// Larger paragraphs are split into overlapping windows of +/// `FALLBACK_LINES_PER_CHUNK` lines with stride `FALLBACK_LINES_PER_CHUNK - +/// FALLBACK_LINES_OVERLAP`. The last window may be shorter. Window starts +/// are passed as `split_key` so `id_for_chunk` can produce distinct ids +/// across windows. +fn push_paragraph( + out: &mut Vec<Chunk>, + doc: &CanonicalDocument, + policy: &ChunkPolicy, + para: &Paragraph, + lang: &str, +) -> Result<()> { + let n_lines = (para.line_end - para.line_start + 1) as usize; + + if n_lines <= FALLBACK_LINES_PER_CHUNK { + // Use line_start as split_key so each paragraph gets a distinct + // chunk_id even when block_ids is empty (no symbol, no AST structure). + // Without this, all short paragraphs from the same doc share the same + // base_policy_hash and therefore the same id_for_chunk result.
+ out.push(build_chunk_no_symbol( + doc, + policy, + &para.text, + para.line_start, + para.line_end, + lang, + VERSION_LABEL, + Some(para.line_start), + )); + return Ok(()); + } + + // Oversize: line-window split with overlap. + let stride = FALLBACK_LINES_PER_CHUNK - FALLBACK_LINES_OVERLAP; + let lines: Vec<&str> = para.text.lines().collect(); + let mut i = 0usize; + loop { + let end = (i + FALLBACK_LINES_PER_CHUNK).min(lines.len()); + let window_text = lines[i..end].join("\n"); + let window_start = para.line_start + i as u32; + let window_end = para.line_start + (end as u32) - 1; + // Use window_start as split_key so chunk_ids are unique across windows. + out.push(build_chunk_no_symbol( + doc, + policy, + &window_text, + window_start, + window_end, + lang, + VERSION_LABEL, + Some(window_start), + )); + if end == lines.len() { + break; + } + i += stride; + } + Ok(()) +} diff --git a/crates/kebab-chunk/src/lib.rs b/crates/kebab-chunk/src/lib.rs index 9b65e05..eee3f69 100644 --- a/crates/kebab-chunk/src/lib.rs +++ b/crates/kebab-chunk/src/lib.rs @@ -28,6 +28,7 @@ mod tier2_shared; pub mod k8s_manifest_resource_v1; pub mod dockerfile_file_v1; pub mod manifest_file_v1; +pub mod code_text_paragraph_v1; pub use code_go_ast_v1::CodeGoAstV1Chunker; pub use code_java_ast_v1::CodeJavaAstV1Chunker; @@ -41,3 +42,4 @@ pub use pdf_page_v1::PdfPageV1Chunker; pub use k8s_manifest_resource_v1::K8sManifestResourceV1Chunker; pub use dockerfile_file_v1::DockerfileFileV1Chunker; pub use manifest_file_v1::ManifestFileV1Chunker; +pub use code_text_paragraph_v1::CodeTextParagraphV1Chunker; diff --git a/crates/kebab-chunk/src/tier2_shared.rs b/crates/kebab-chunk/src/tier2_shared.rs index f52173c..c80b863 100644 --- a/crates/kebab-chunk/src/tier2_shared.rs +++ b/crates/kebab-chunk/src/tier2_shared.rs @@ -88,7 +88,7 @@ pub(crate) fn push_chunks_with_oversize( /// for normal single-chunk emission. Mirrors the `Some(part_ls)` / `None` /// split_key pattern in 1A-2.
#[allow(clippy::too_many_arguments)] -fn build_chunk( +pub(crate) fn build_chunk( doc: &CanonicalDocument, chunker_version: &ChunkerVersion, base_policy_hash: &str, @@ -105,7 +105,49 @@ fn build_chunk( symbol: Some(symbol.to_string()), lang: Some(lang.to_string()), }; + build_chunk_from_span(doc, chunker_version, base_policy_hash, text, span, split_key) +} +/// Like `build_chunk` but emits `symbol: None`. Used by Tier 3 (per spec §9.3). +/// +/// Accepts `policy: &ChunkPolicy` and `chunker_version: &str` (string slice) +/// so callers don't need to pre-compute the hash and version wrapper. +/// `split_key` is `Some(window_start)` for oversize line-window splits. +#[allow(clippy::too_many_arguments)] +pub(crate) fn build_chunk_no_symbol( + doc: &CanonicalDocument, + policy: &ChunkPolicy, + text: &str, + line_start: u32, + line_end: u32, + lang: &str, + chunker_version: &str, + split_key: Option<u32>, +) -> Chunk { + let cv = ChunkerVersion(chunker_version.to_string()); + let base_policy_hash = policy_hash(policy); + let span = SourceSpan::Code { + line_start, + line_end, + symbol: None, + lang: Some(lang.to_string()), + }; + build_chunk_from_span(doc, &cv, &base_policy_hash, text, span, split_key) +} + +/// Core chunk-building logic shared by `build_chunk` and `build_chunk_no_symbol`. +/// +/// Takes a pre-built `SourceSpan` so the only difference between the two +/// public helpers is whether `symbol` is `Some` or `None`. All id/hash/ +/// token mechanics are identical.
+fn build_chunk_from_span( + doc: &CanonicalDocument, + chunker_version: &ChunkerVersion, + base_policy_hash: &str, + text: &str, + span: SourceSpan, + split_key: Option, +) -> Chunk { // id_hash mirrors code_rust_ast_v1's make_chunk logic: // split_key Some(k) => "{base_policy_hash}#L{k}" // split_key None => base_policy_hash @@ -114,7 +156,7 @@ fn build_chunk( None => base_policy_hash.to_string(), }; - // block_ids: Tier 2 chunkers have no per-block structure (the whole file + // block_ids: Tier 2/3 chunkers have no per-block structure (the whole file // is one Block::Code), so we pass an empty slice — same as using the doc- // level slice without explicit block granularity. let block_ids: Vec = vec![]; diff --git a/crates/kebab-chunk/tests/code_text_paragraph_v1.rs b/crates/kebab-chunk/tests/code_text_paragraph_v1.rs new file mode 100644 index 0000000..a3ef17a --- /dev/null +++ b/crates/kebab-chunk/tests/code_text_paragraph_v1.rs @@ -0,0 +1,270 @@ +//! Behavioural tests for `CodeTextParagraphV1Chunker`. +//! +//! Documents are constructed manually (no kebab-parse-code dependency) by +//! placing raw text into a single `Block::Code`, mirroring the pattern used +//! in `k8s_manifest_resource_v1.rs`. + +use std::path::PathBuf; + +use kebab_chunk::CodeTextParagraphV1Chunker; +use kebab_core::{ + AssetId, Block, CanonicalDocument, ChunkPolicy, Chunker, ChunkerVersion, CodeBlock, + CommonBlock, Lang, Metadata, ParserVersion, Provenance, SourceSpan, SourceType, TrustLevel, + WorkspacePath, id_for_block, id_for_doc, +}; +use time::OffsetDateTime; + +// ── helpers ────────────────────────────────────────────────────────────────── + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") +} + +/// Build a `CanonicalDocument` with a single `Block::Code` containing `text` +/// and the supplied `lang` label. 
+fn text_doc(lang: &str, text: &str) -> CanonicalDocument { + let wp = WorkspacePath("scripts/sample.sh".into()); + let aid = AssetId("d".repeat(64)); + let pv = ParserVersion("code-text-paragraph-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + + let line_count = text.lines().count() as u32; + let span = SourceSpan::Code { + line_start: 1, + line_end: line_count.max(1), + symbol: None, + lang: Some(lang.into()), + }; + let bid = id_for_block(&doc_id, "code", &[], 0, &span); + let block = Block::Code(CodeBlock { + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, + lang: Some(lang.into()), + code: text.to_string(), + }); + + CanonicalDocument { + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: "sample.sh".into(), + lang: Lang("und".into()), + blocks: vec![block], + metadata: Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some(lang.into()), + }, + provenance: Provenance { events: vec![] }, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + } +} + +fn policy() -> ChunkPolicy { + ChunkPolicy { + target_tokens: 500, + overlap_tokens: 80, + respect_markdown_headings: false, + chunker_version: ChunkerVersion("code-text-paragraph-v1".into()), + } +} + +// ── tests ───────────────────────────────────────────────────────────────────── + +/// `sample_shell.sh` has 4 paragraphs separated by 3 blank lines: +/// - paragraph 1: lines 1-2 (shebang + set -euo pipefail) +/// - paragraph 2: lines 4-7 (env setup block) +/// - paragraph 3: lines 9-11 (ingest block) +/// - 
paragraph 4: lines 13-15 (report block) +/// +/// We assert: +/// - exactly 4 chunks (one per paragraph) +/// - all symbols are None (Tier 3 spec §9.3) +/// - all langs are "shell" +/// - line ranges are strictly ascending and do NOT include the blank lines +/// (lines 3, 8, 12 must not appear in any range) +#[test] +fn shell_multi_paragraph_splits_on_blank_lines() { + let fixture_path = fixtures_dir().join("sample_shell.sh"); + let text = std::fs::read_to_string(&fixture_path) + .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display())); + + let doc = text_doc("shell", &text); + let chunks = CodeTextParagraphV1Chunker + .chunk(&doc, &policy()) + .expect("chunk"); + + assert_eq!( + chunks.len(), + 4, + "expected 4 chunks (one per paragraph), got {}: {chunks:#?}", + chunks.len() + ); + + // All symbols must be None (Tier 3 requirement). + for (i, chunk) in chunks.iter().enumerate() { + match &chunk.source_spans[0] { + SourceSpan::Code { symbol, .. } => { + assert!( + symbol.is_none(), + "chunk[{i}] symbol must be None for Tier 3 chunker, got {symbol:?}" + ); + } + other => panic!("chunk[{i}]: expected Code span, got {other:?}"), + } + } + + // All langs must be "shell". + for (i, chunk) in chunks.iter().enumerate() { + match &chunk.source_spans[0] { + SourceSpan::Code { lang, .. } => { + assert_eq!( + lang.as_deref(), + Some("shell"), + "chunk[{i}] lang must be 'shell', got {lang:?}" + ); + } + other => panic!("chunk[{i}]: expected Code span, got {other:?}"), + } + } + + // Line ranges must be strictly ascending with no overlap, + // and blank lines (3, 8, 12) must not be included in any range. + let expected_ranges: &[(u32, u32)] = &[(1, 2), (4, 7), (9, 11), (13, 15)]; + let actual_ranges: Vec<(u32, u32)> = chunks + .iter() + .map(|c| match &c.source_spans[0] { + SourceSpan::Code { + line_start, + line_end, + .. 
+ } => (*line_start, *line_end), + other => panic!("expected Code span, got {other:?}"), + }) + .collect(); + + assert_eq!( + actual_ranges, expected_ranges, + "line ranges mismatch: got {actual_ranges:?}, expected {expected_ranges:?}" + ); +} + +/// `sample_long_paragraph.txt` has exactly 200 non-blank lines and no blank +/// lines, so the entire file is one paragraph. 200 > 80 (FALLBACK_LINES_PER_CHUNK), +/// so the oversize window split fires with stride 60: +/// - window 1: lines 1-80 +/// - window 2: lines 61-140 +/// - window 3: lines 121-200 +/// +/// All chunk_ids must be distinct (the #L{window_start} split_key suffix). +#[test] +fn single_long_paragraph_line_window_split() { + let fixture_path = fixtures_dir().join("sample_long_paragraph.txt"); + let text = std::fs::read_to_string(&fixture_path) + .unwrap_or_else(|e| panic!("cannot read fixture {}: {e}", fixture_path.display())); + + assert_eq!( + text.lines().count(), + 200, + "fixture must have exactly 200 lines" + ); + + let doc = text_doc("shell", &text); + let chunks = CodeTextParagraphV1Chunker + .chunk(&doc, &policy()) + .expect("chunk"); + + assert_eq!( + chunks.len(), + 3, + "expected 3 window chunks for 200-line paragraph, got {}: {chunks:#?}", + chunks.len() + ); + + let expected_ranges: &[(u32, u32)] = &[(1, 80), (61, 140), (121, 200)]; + let actual_ranges: Vec<(u32, u32)> = chunks + .iter() + .map(|c| match &c.source_spans[0] { + SourceSpan::Code { + line_start, + line_end, + .. + } => (*line_start, *line_end), + other => panic!("expected Code span, got {other:?}"), + }) + .collect(); + + assert_eq!( + actual_ranges, expected_ranges, + "window ranges mismatch: got {actual_ranges:?}, expected {expected_ranges:?}" + ); + + // All chunk_ids must be distinct (#L{window_start} suffix differentiates them). 
+ let ids: std::collections::HashSet<_> = chunks.iter().map(|c| c.chunk_id.clone()).collect(); + assert_eq!( + ids.len(), + chunks.len(), + "oversize window chunks must have distinct chunk_ids" + ); +} + +/// An empty source file (no non-blank lines) must yield zero chunks. +#[test] +fn empty_file_emits_zero_chunks() { + let doc = text_doc("shell", ""); + let chunks = CodeTextParagraphV1Chunker + .chunk(&doc, &policy()) + .expect("chunk"); + + assert_eq!( + chunks.len(), + 0, + "empty file must yield 0 chunks, got {}: {chunks:#?}", + chunks.len() + ); +} + +/// The `lang` field on each emitted chunk must match the `lang` passed to +/// `text_doc`, regardless of content. `symbol` must be `None` (Tier 3 spec). +#[test] +fn lang_field_preserved_from_input_doc() { + let doc = text_doc("yaml", "key1: value1\nkey2: value2\n"); + let chunks = CodeTextParagraphV1Chunker + .chunk(&doc, &policy()) + .expect("chunk"); + + assert!(!chunks.is_empty(), "expected at least one chunk"); + + match &chunks[0].source_spans[0] { + SourceSpan::Code { lang, symbol, .. 
} => { + assert_eq!( + lang.as_deref(), + Some("yaml"), + "lang must be 'yaml', got {lang:?}" + ); + assert!( + symbol.is_none(), + "symbol must be None for Tier 3 chunker, got {symbol:?}" + ); + } + other => panic!("expected Code span, got {other:?}"), + } +} diff --git a/crates/kebab-chunk/tests/fixtures/sample_long_paragraph.txt b/crates/kebab-chunk/tests/fixtures/sample_long_paragraph.txt new file mode 100644 index 0000000..192ff33 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/sample_long_paragraph.txt @@ -0,0 +1,200 @@ +line 001 +line 002 +line 003 +line 004 +line 005 +line 006 +line 007 +line 008 +line 009 +line 010 +line 011 +line 012 +line 013 +line 014 +line 015 +line 016 +line 017 +line 018 +line 019 +line 020 +line 021 +line 022 +line 023 +line 024 +line 025 +line 026 +line 027 +line 028 +line 029 +line 030 +line 031 +line 032 +line 033 +line 034 +line 035 +line 036 +line 037 +line 038 +line 039 +line 040 +line 041 +line 042 +line 043 +line 044 +line 045 +line 046 +line 047 +line 048 +line 049 +line 050 +line 051 +line 052 +line 053 +line 054 +line 055 +line 056 +line 057 +line 058 +line 059 +line 060 +line 061 +line 062 +line 063 +line 064 +line 065 +line 066 +line 067 +line 068 +line 069 +line 070 +line 071 +line 072 +line 073 +line 074 +line 075 +line 076 +line 077 +line 078 +line 079 +line 080 +line 081 +line 082 +line 083 +line 084 +line 085 +line 086 +line 087 +line 088 +line 089 +line 090 +line 091 +line 092 +line 093 +line 094 +line 095 +line 096 +line 097 +line 098 +line 099 +line 100 +line 101 +line 102 +line 103 +line 104 +line 105 +line 106 +line 107 +line 108 +line 109 +line 110 +line 111 +line 112 +line 113 +line 114 +line 115 +line 116 +line 117 +line 118 +line 119 +line 120 +line 121 +line 122 +line 123 +line 124 +line 125 +line 126 +line 127 +line 128 +line 129 +line 130 +line 131 +line 132 +line 133 +line 134 +line 135 +line 136 +line 137 +line 138 +line 139 +line 140 +line 141 +line 142 +line 143 +line 144 +line 145 +line 146 
+line 147 +line 148 +line 149 +line 150 +line 151 +line 152 +line 153 +line 154 +line 155 +line 156 +line 157 +line 158 +line 159 +line 160 +line 161 +line 162 +line 163 +line 164 +line 165 +line 166 +line 167 +line 168 +line 169 +line 170 +line 171 +line 172 +line 173 +line 174 +line 175 +line 176 +line 177 +line 178 +line 179 +line 180 +line 181 +line 182 +line 183 +line 184 +line 185 +line 186 +line 187 +line 188 +line 189 +line 190 +line 191 +line 192 +line 193 +line 194 +line 195 +line 196 +line 197 +line 198 +line 199 +line 200 diff --git a/crates/kebab-chunk/tests/fixtures/sample_shell.sh b/crates/kebab-chunk/tests/fixtures/sample_shell.sh new file mode 100644 index 0000000..2fc2911 --- /dev/null +++ b/crates/kebab-chunk/tests/fixtures/sample_shell.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -euo pipefail + +# First paragraph: env setup +export KEBAB_HOME="${KEBAB_HOME:-$HOME/.local/share/kebab}" +mkdir -p "$KEBAB_HOME" +cd "$KEBAB_HOME" + +# Second paragraph: ingest +echo "ingesting workspace..." +kebab ingest --config /etc/kebab/config.toml + +# Third paragraph: report +echo "done" +kebab schema --json | jq '.stats' diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index cc1c391..39a0941 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -22,7 +22,7 @@ Cargo workspace, 함수 호출 기반 모듈러 모놀리스. UI binary (`kebab- | OCR | Ollama vision LM (default `gemma4:e4b`) — `OcrEngine` trait 으로 Tesseract / Apple Vision 등 future swap (HOTFIXES P6-2) | | Image caption | Ollama vision LM, runtime gate `image.caption.enabled` (default OFF) | | PDF parser | `lopdf` per-page 텍스트, `chunker_version = "pdf-page-v1"` 가 PDF 자산에 하드코딩 (HOTFIXES P7-3) | -| code parser | `tree-sitter` + `tree-sitter-rust` / `tree-sitter-python` / `tree-sitter-typescript` / `tree-sitter-javascript` / `tree-sitter-go` / `tree-sitter-java` / `tree-sitter-kotlin-ng` — **parser-side** (`kebab-parse-code`), chunker-side 아님 (design §6.3). 
chunker versions: Rust = `code-rust-ast-v1`, Python = `code-python-ast-v1`, TypeScript = `code-ts-ast-v1`, JavaScript = `code-js-ast-v1`, Go = `code-go-ast-v1`, Java = `code-java-ast-v1`, Kotlin = `code-kotlin-ast-v1`. `ast_chunk_max_lines = 200` 상수 고정 (HOTFIXES 2026-05-19 — Chunker trait 이 per-medium config 미노출). Kotlin grammar 은 `tree-sitter-kotlin-ng` 사용 — bare `tree-sitter-kotlin` 은 tree-sitter 0.21–0.23 에 고착되어 있어 사용 불가. **Tier 2 (p10-2)**: YAML/k8s → `serde_yaml` + `k8s-manifest-resource-v1` (apiVersion+kind per resource), Dockerfile → `dockerfile-file-v1` (whole-file), Cargo.toml/go.mod/.json/.xml/.groovy → `manifest-file-v1` (whole-file). Tier 2 chunkers live in `kebab-chunk`; no tree-sitter grammar needed (structure from file type, not AST). | +| code parser | `tree-sitter` + `tree-sitter-rust` / `tree-sitter-python` / `tree-sitter-typescript` / `tree-sitter-javascript` / `tree-sitter-go` / `tree-sitter-java` / `tree-sitter-kotlin-ng` — **parser-side** (`kebab-parse-code`), chunker-side 아님 (design §6.3). chunker versions: Rust = `code-rust-ast-v1`, Python = `code-python-ast-v1`, TypeScript = `code-ts-ast-v1`, JavaScript = `code-js-ast-v1`, Go = `code-go-ast-v1`, Java = `code-java-ast-v1`, Kotlin = `code-kotlin-ast-v1`. `ast_chunk_max_lines = 200` 상수 고정 (HOTFIXES 2026-05-19 — Chunker trait 이 per-medium config 미노출). Kotlin grammar 은 `tree-sitter-kotlin-ng` 사용 — bare `tree-sitter-kotlin` 은 tree-sitter 0.21–0.23 에 고착되어 있어 사용 불가. **Tier 2 (p10-2)**: YAML/k8s → `serde_yaml` + `k8s-manifest-resource-v1` (apiVersion+kind per resource), Dockerfile → `dockerfile-file-v1` (whole-file), Cargo.toml/go.mod/.json/.xml/.groovy → `manifest-file-v1` (whole-file). Tier 2 chunkers live in `kebab-chunk`; no tree-sitter grammar needed (structure from file type, not AST). **Tier 3 (p10-3)**: shell scripts (`.sh`/`.bash`/`.zsh`) direct → `code-text-paragraph-v1` (blank-line paragraph segmentation + 80-line / 20-overlap line-window for oversize). 
Same chunker also serves as fallback when Tier 1/2 emit 0 chunks or Err — non-k8s YAML / invalid YAML / AST extractor failures all picked up. symbol = None; lang preserved from input doc. | | 1B symbol path | workspace path → module path: Python = dotted prefix (`kebab_eval.metrics.compute_mrr`), TypeScript/JavaScript = slash-style prefix (`src/Foo.Foo.search`). Rust 1A-2 는 file-scope nesting 만 (workspace prefix 없음, 비일관 수용 — HOTFIXES 2026-05-20). | | TUI | Ratatui + crossterm — P9-1 Library 패널, P9-2/3/4 진행 예정 | | Desktop | Tauri 2 + `pdfjs-dist` (native PDF render backend 금지) — P9-5 | @@ -52,7 +52,7 @@ flowchart TB ppdf["kebab-parse-pdf"] pimg["kebab-parse-image"] paud["kebab-parse-audio
(P8 보류)"] - pcode["kebab-parse-code
(P10-1A-2 + P10-1B + P10-1C-Go + P10-1C-JK + P10-2)"] + pcode["kebab-parse-code
(P10-1A-2 + P10-1B + P10-1C-Go + P10-1C-JK + P10-2 + P10-3)"] ptypes["kebab-parse-types"] norm["kebab-normalize"] chunk["kebab-chunk"] @@ -165,12 +165,13 @@ kebab/ │ ├── kebab-source-fs/ # 워크스페이스 walk + checksum (P1-1) │ ├── kebab-parse-md/ # Markdown frontmatter + blocks (P1-2/3) │ ├── kebab-normalize/ # ParsedBlock → CanonicalDocument (P1-4) -│ ├── kebab-chunk/ # heading-aware + pdf-page-v1 + code-*-ast-v1 (Tier 1) + k8s-manifest-resource-v1 + dockerfile-file-v1 + manifest-file-v1 + tier2_shared (P10-2) chunker (P1-5, P7-2, P10-1A-2, P10-1B, P10-1C-Go, P10-1C-JK, P10-2) +│ ├── kebab-chunk/ # heading-aware + pdf-page-v1 + code-*-ast-v1 (Tier 1) + k8s-manifest-resource-v1 + dockerfile-file-v1 + manifest-file-v1 + tier2_shared (P10-2) + code-text-paragraph-v1 (P10-3) chunker (P1-5, P7-2, P10-1A-2, P10-1B, P10-1C-Go, P10-1C-JK, P10-2, P10-3) │ │ └── src/ │ │ ├── code_*_ast_v1.rs # Tier 1 AST chunkers (rust/python/ts/js/go/java/kotlin) │ │ ├── k8s_manifest_resource_v1.rs # Tier 2 (p10-2): YAML multi-doc, apiVersion+kind per resource │ │ ├── dockerfile_file_v1.rs # Tier 2 (p10-2): whole-file Dockerfile │ │ ├── manifest_file_v1.rs # Tier 2 (p10-2): whole-file Cargo.toml / go.mod / .json / .xml / .groovy +│ │ ├── code_text_paragraph_v1.rs # Tier 3 (p10-3): blank-line paragraph + 80/20 line-window fallback │ │ └── tier2_shared.rs # Tier 2 (p10-2): shared oversize fallback + Chunk builder helpers │ ├── kebab-store-sqlite/ # SQLite + FTS5 (V001/V002/V003) (P1-6, P2-1, P3-3) │ ├── kebab-search/ # Lexical + Vector + Hybrid retriever (P2-2, P3-4) diff --git a/docs/SMOKE.md b/docs/SMOKE.md index 97c9d59..52380b7 100644 --- a/docs/SMOKE.md +++ b/docs/SMOKE.md @@ -502,6 +502,52 @@ KB --json schema | jq '.stats.code_lang_breakdown' - **Dockerfile**: `` (고정 심볼, 전체 파일이 단일 chunk). - **TOML / JSON / XML / Groovy / go.mod**: `` (고정 심볼, 전체 파일이 단일 chunk). 단, 파일이 `tier2_shared` 의 oversize threshold 초과 시 줄 단위 fallback chunk. +## P10-3 Tier 3 paragraph fallback + +P10-2 와 동일한 격리 KB 설정. 
`.sh` 파일은 direct, 비-k8s YAML 은 fallback 으로 들어간다. + +```bash +# 1) shell script (direct Tier 3) +cat > /tmp/kebab-smoke/workspace/deploy.sh <<'EOF' +#!/usr/bin/env bash +set -e + +echo "ingesting..." +kebab ingest + +echo "done" +kebab schema --json | jq '.stats' +EOF + +# 2) 비-k8s YAML (Tier 2 가 0 chunk → Tier 3 fallback) +cat > /tmp/kebab-smoke/workspace/docker-compose.yml <<'EOF' +version: '3' +services: + api: + image: nginx:latest + ports: + - 8080:80 +EOF + +# 3) ingest +KB ingest + +# 4) 언어별 검색 (citation.symbol = None 확인) +KB search --mode hybrid "ingest" --code-lang shell --json | \ + jq '{hits: [.hits[] | {symbol: .citation.symbol, lang: .citation.lang, chunker: .chunker_version}]}' +# 기대: symbol = null, lang = "shell", chunker_version = "code-text-paragraph-v1" + +KB search --mode hybrid "nginx" --code-lang yaml --json | \ + jq '{hits: [.hits[] | {symbol: .citation.symbol, lang: .citation.lang, chunker: .chunker_version}]}' +# 기대: symbol = null, lang = "yaml", chunker_version = "code-text-paragraph-v1" + +# 5) schema stats 에 shell 카운트 확인 +KB --json schema | jq '.stats.code_lang_breakdown' +# 기대: {"shell": N, "yaml": M, ...} +``` + +**Tier 3 citation.symbol 컨벤션**: 항상 `null`. 의미 단위 식별 안 함. `lang` 은 원본 lang 보존 (shell → `"shell"`, yaml → `"yaml"` 등). + ## 검증 체크리스트 - `kebab doctor` 가 `--config` path 를 honor 하고 그 안의 `storage.data_dir` 를 출력 (XDG default 가 아님). @@ -537,6 +583,7 @@ rm -rf /tmp/kebab-smoke # 통째로 정리 - (P10-1C-Go) `.go` 파일을 워크스페이스에 두면 `kebab ingest` 가 `code-go-ast-v1` 로 처리. `--code-lang go` 검색이 `citation.symbol` 에 `.` / `.(*Receiver).` 형식 결과를 반환하면 wiring 정상. `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"go": N` 등장 확인. - (P10-1C-JK) `.java` 파일은 `code-java-ast-v1`, `.kt`/`.kts` 파일은 `code-kotlin-ast-v1` 로 처리. `--code-lang java` / `--code-lang kotlin` 검색이 `citation.symbol` 에 `com.foo.Foo.bar` 형식 결과를 반환하면 wiring 정상. `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"java": N` / `"kotlin": N` 등장 확인. 
- (P10-2) `.yaml`/`.yml` 파일은 apiVersion+kind 파싱으로 k8s resource 별 chunk 생성 (`k8s-manifest-resource-v1`). `Dockerfile`/`Dockerfile.*` 는 전체 파일 단일 chunk (`dockerfile-file-v1`). `.toml`/`.json`/`.xml`/`.groovy`/`go.mod` 는 전체 파일 단일 chunk (`manifest-file-v1`). `--code-lang yaml` / `--code-lang dockerfile` / `--code-lang toml` 검색이 `citation.symbol` 에 각각 `Deployment/default/my-app` / `` / `` 형식 결과를 반환하면 wiring 정상. `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"yaml": N` / `"dockerfile": N` / `"toml": N` 등장 확인. +- (P10-3) `.sh`/`.bash`/`.zsh` 파일은 direct Tier 3 (`code-text-paragraph-v1`). 비-k8s YAML (apiVersion+kind 없는 yaml) 은 k8s chunker 가 0 chunk → Tier 3 fallback 으로 picked up. `--code-lang shell` / `--code-lang yaml` 검색이 `citation.symbol = null`, `chunker_version = "code-text-paragraph-v1"` 결과를 반환하면 wiring 정상. `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"shell": N` 등장 확인. - (P7-3 + follow-up) 동일 path 에 byte 가 다른 PDF 를 두 번째 ingest 하면 `purge_vector_orphans_for_workspace_path` 가 옛 chunk_id 를 LanceDB 에서 먼저 삭제, 이어서 `purge_orphan_at_workspace_path` 가 옛 doc / chunks / embedding_records 를 SQLite 에서 sweep. 새 byte 가 새 `doc_id` 로 색인됨. `IngestReport` 에 그 자산만 `new+=1` (다른 자산은 `updated`). 두 store 모두 정합 — 옛 본문 검색 시 옛 chunks 가 더 이상 surface 되지 않음. ### Embedding upgrade (fb-39b) diff --git a/docs/superpowers/plans/2026-05-21-p10-3-tier3-paragraph-fallback.md b/docs/superpowers/plans/2026-05-21-p10-3-tier3-paragraph-fallback.md new file mode 100644 index 0000000..e3a15d7 --- /dev/null +++ b/docs/superpowers/plans/2026-05-21-p10-3-tier3-paragraph-fallback.md @@ -0,0 +1,1296 @@ +# p10-3 Tier 3 Paragraph Fallback Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Activate the `code-text-paragraph-v1` chunker — paragraph + line-window fallback for shell scripts and for Tier 1/2 0-chunk / Err results (non-k8s YAML, invalid YAML, AST extractor failures). + +**Architecture:** Single new chunker module `crates/kebab-chunk/src/code_text_paragraph_v1.rs` using blank-line paragraph segmentation and 80-line / 20-overlap line-window split for oversize paragraphs. `tier2_shared::build_chunk` is exposed as `pub(crate)` so Tier 3 shares the same Chunk-construction semantics as Tier 1/2. `ingest_one_code_asset` gains a `"shell"` arm in its 4-arm match plus a post-match fallback wrapper that retries Tier 1/2 results in `Ok(empty)` / `Err(_)` shape against Tier 3, swapping `chunker_version` + `parser_version` for downstream stamping. + +**Tech Stack:** Rust 2024 workspace. No new external deps (string operations only). Reuses `tier2_shared::build_chunk` (Task D of p10-2 / commit 8996e73). + +**Memory note:** Host has been OOM'd previously. Per-crate cargo only. ONE full-suite + clippy gate at Task I. NO `cargo test --workspace` outside that gate. + +--- + +## Pre-flight + +Branch `feat/p10-3-tier3-paragraph` already exists (spec commit `9d4a60a`). + +- [ ] **Disk hygiene**: `df -h /` 점검. 80% 넘으면 `cargo clean`. + +Reference files (read on-demand per task): +- `tasks/p10/p10-3-tier3-paragraph-fallback.md` — frozen contract. +- `crates/kebab-chunk/src/tier2_shared.rs` — `build_chunk` source; the visibility upgrade lives here. +- `crates/kebab-chunk/src/k8s_manifest_resource_v1.rs` — closest Tier 2 chunker template (uses `tier2_shared::push_chunks_with_oversize`). +- `crates/kebab-chunk/tests/k8s_manifest_resource_v1.rs` — `yaml_doc` helper pattern + `policy()` helper; Tier 3 tests mirror this shape. +- `crates/kebab-app/src/lib.rs` lines 950-970 (allowlist) + 1794-2040 (ingest_one_code_asset). +- `crates/kebab-app/tests/code_ingest_smoke.rs` — 12 existing tests; Tier 3 tests mirror the `TestEnv::lexical_only()` pattern. 
+ +--- + +## Task A: expose `tier2_shared::build_chunk` as `pub(crate)` + +**Files:** +- Modify: `crates/kebab-chunk/src/tier2_shared.rs` + +- [ ] **Step 1**: Read `crates/kebab-chunk/src/tier2_shared.rs` to confirm `build_chunk`'s current visibility (likely module-private `fn`). + +- [ ] **Step 2**: Change `fn build_chunk(...)` to `pub(crate) fn build_chunk(...)`. Signature unchanged: + +```rust +pub(crate) fn build_chunk( + doc: &Document, + policy: &ChunkPolicy, + text: &str, + line_start: u32, + line_end: u32, + symbol: &str, + lang: &str, + chunker_version: &str, +) -> Result { + // body unchanged +} +``` + +- [ ] **Step 3**: Per-crate build sanity: + +```bash +cargo build -p kebab-chunk 2>&1 | tail -3 +``` + +Expected: clean. + +- [ ] **Step 4**: Commit: + +```bash +git add crates/kebab-chunk/src/tier2_shared.rs +git commit -m "$(cat <<'EOF' +refactor(p10-3): expose tier2_shared::build_chunk as pub(crate) + +Tier 3 chunker (next task) needs to call the same Chunk-construction helper +to keep id / hash / token-count / policy_hash semantics identical with +Tier 2. Visibility-only change; signature and body unchanged. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task B: `code-text-paragraph-v1` chunker (TDD) + +**Files:** +- Create: `crates/kebab-chunk/src/code_text_paragraph_v1.rs` +- Create: `crates/kebab-chunk/tests/fixtures/sample_shell.sh` +- Create: `crates/kebab-chunk/tests/fixtures/sample_long_paragraph.txt` +- Create: `crates/kebab-chunk/tests/code_text_paragraph_v1.rs` +- Modify: `crates/kebab-chunk/src/lib.rs` (pub mod + pub use) + +### B.1 — fixtures + +- [ ] **Step 1**: Create `crates/kebab-chunk/tests/fixtures/sample_shell.sh` (3-paragraph shell, each < 80 lines): + +```sh +#!/usr/bin/env bash +set -euo pipefail + +# First paragraph: env setup +export KEBAB_HOME="${KEBAB_HOME:-$HOME/.local/share/kebab}" +mkdir -p "$KEBAB_HOME" +cd "$KEBAB_HOME" + +# Second paragraph: ingest + +echo "ingesting workspace..." 
+kebab ingest --config /etc/kebab/config.toml + +# Third paragraph: report + +echo "done" +kebab schema --json | jq '.stats' +``` + +Note: blank lines BETWEEN the three logical sections are the paragraph boundaries. Each section starts with a `#` comment and runs ~3-4 lines. Total file ~13 lines. + +- [ ] **Step 2**: Create `crates/kebab-chunk/tests/fixtures/sample_long_paragraph.txt` (single 200-line paragraph, no blank lines — exercises line-window split): + +```bash +# generate with a small loop — content is irrelevant, the line count is the test +python3 -c 'print("\n".join(f"line {i:03d}" for i in range(1, 201)))' \ + > crates/kebab-chunk/tests/fixtures/sample_long_paragraph.txt +``` + +Verify line count = 200: + +```bash +wc -l crates/kebab-chunk/tests/fixtures/sample_long_paragraph.txt +``` + +Expected: `200 crates/kebab-chunk/tests/fixtures/sample_long_paragraph.txt`. + +### B.2 — failing tests + +- [ ] **Step 3**: Create `crates/kebab-chunk/tests/code_text_paragraph_v1.rs` with the same helper structure as `tests/k8s_manifest_resource_v1.rs` (which has a `yaml_doc(text) -> CanonicalDocument` helper). Mirror it as `text_doc(lang, text) -> CanonicalDocument`. Then four tests: + +```rust +//! Behavioural tests for `CodeTextParagraphV1Chunker`. 
+ +use std::path::PathBuf; + +use kebab_chunk::{ChunkPolicy, Chunker, CodeTextParagraphV1Chunker}; +use kebab_core::{ + AssetId, Block, CanonicalDocument, CodeBlock, CommonBlock, Lang, Metadata, ParserVersion, + Provenance, SourceSpan, SourceType, TrustLevel, WorkspacePath, id_for_block, id_for_doc, +}; +use time::OffsetDateTime; + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") +} + +fn text_doc(lang: &str, text: &str) -> CanonicalDocument { + let wp = WorkspacePath(format!("script.{lang}")); + let aid = AssetId("a".repeat(64)); + let pv = ParserVersion("none-v1".into()); + let doc_id = id_for_doc(&wp, &aid, &pv); + + let line_count = text.lines().count() as u32; + let span = SourceSpan::Code { + line_start: 1, + line_end: line_count.max(1), + symbol: None, + lang: Some(lang.into()), + }; + let bid = id_for_block(&doc_id, "code", &[], 0, &span); + let block = Block::Code(CodeBlock { + common: CommonBlock { + block_id: bid, + heading_path: vec![], + source_span: span, + }, + lang: Some(lang.into()), + code: text.to_string(), + }); + + CanonicalDocument { + doc_id, + source_asset_id: aid, + workspace_path: wp, + title: format!("script.{lang}"), + lang: Lang("und".into()), + blocks: vec![block], + metadata: Metadata { + aliases: vec![], + tags: vec![], + created_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + updated_at: OffsetDateTime::from_unix_timestamp(1_700_000_000).unwrap(), + source_type: SourceType::Note, + trust_level: TrustLevel::Primary, + user_id_alias: None, + user: Default::default(), + repo: Some("kebab".into()), + git_branch: Some("main".into()), + git_commit: Some("0".repeat(40)), + code_lang: Some(lang.into()), + }, + provenance: Provenance { events: vec![] }, + parser_version: pv, + schema_version: 1, + doc_version: 1, + last_chunker_version: None, + last_embedding_version: None, + } +} + +fn policy() -> ChunkPolicy { + ChunkPolicy::default() +} + +#[test] +fn 
shell_multi_paragraph_splits_on_blank_lines() { + let text = std::fs::read_to_string(fixtures_dir().join("sample_shell.sh")) + .expect("read sample_shell.sh"); + let doc = text_doc("shell", &text); + let chunks = CodeTextParagraphV1Chunker.chunk(&doc, &policy()).expect("chunk"); + + // 3 paragraphs separated by blank lines. + assert_eq!(chunks.len(), 3, "expected 3 paragraph chunks, got {}", chunks.len()); + + for c in &chunks { + match &c.source_spans[0] { + SourceSpan::Code { symbol, lang, .. } => { + assert_eq!(symbol.as_deref(), None, "Tier 3 symbol must be None"); + assert_eq!(lang.as_deref(), Some("shell")); + } + other => panic!("expected Code span, got {other:?}"), + } + } + + // Line ranges must be ascending and not overlap (blank lines are NOT in any chunk). + let ranges: Vec<(u32, u32)> = chunks.iter().map(|c| match &c.source_spans[0] { + SourceSpan::Code { line_start, line_end, .. } => (*line_start, *line_end), + _ => unreachable!(), + }).collect(); + for w in ranges.windows(2) { + assert!(w[0].1 < w[1].0, "paragraph ranges must be strictly ascending; got {:?}", ranges); + } +} + +#[test] +fn single_long_paragraph_line_window_split() { + let text = std::fs::read_to_string(fixtures_dir().join("sample_long_paragraph.txt")) + .expect("read sample_long_paragraph.txt"); + let doc = text_doc("shell", &text); + let chunks = CodeTextParagraphV1Chunker.chunk(&doc, &policy()).expect("chunk"); + + // 200 lines / window 80 / overlap 20 / stride 60 + // chunk[0] = 1..80 (80 lines) + // chunk[1] = 61..140 (80 lines) + // chunk[2] = 121..200 (80 lines) + // → exactly 3 chunks. + assert_eq!(chunks.len(), 3, "expected 3 windows for 200-line paragraph, got {}", chunks.len()); + + let ranges: Vec<(u32, u32)> = chunks.iter().map(|c| match &c.source_spans[0] { + SourceSpan::Code { line_start, line_end, .. 
} => (*line_start, *line_end), + _ => unreachable!(), + }).collect(); + assert_eq!(ranges, vec![(1, 80), (61, 140), (121, 200)]); + + // chunk_ids must all differ (id_for_chunk's split_key suffix). + let ids: std::collections::HashSet<_> = chunks.iter().map(|c| c.chunk_id.clone()).collect(); + assert_eq!(ids.len(), 3, "line-window chunks must have distinct chunk_ids"); +} + +#[test] +fn empty_file_emits_zero_chunks() { + let doc = text_doc("shell", ""); + let chunks = CodeTextParagraphV1Chunker.chunk(&doc, &policy()).expect("chunk"); + assert!(chunks.is_empty(), "empty text → 0 chunks"); +} + +#[test] +fn lang_field_preserved_from_input_doc() { + let yaml = "key1: value1\nkey2: value2\n"; + let doc = text_doc("yaml", yaml); + let chunks = CodeTextParagraphV1Chunker.chunk(&doc, &policy()).expect("chunk"); + assert_eq!(chunks.len(), 1); + match &chunks[0].source_spans[0] { + SourceSpan::Code { lang, symbol, .. } => { + assert_eq!(symbol.as_deref(), None); + assert_eq!(lang.as_deref(), Some("yaml"), "Tier 3 must preserve input lang"); + } + other => panic!("expected Code span, got {other:?}"), + } +} +``` + +- [ ] **Step 4**: Run tests → FAIL (module/struct not yet defined): + +```bash +cargo test -p kebab-chunk --test code_text_paragraph_v1 -- --nocapture 2>&1 | tail -10 +``` + +Expected: compile error `CodeTextParagraphV1Chunker not found`. + +### B.3 — chunker implementation + +- [ ] **Step 5**: Create `crates/kebab-chunk/src/code_text_paragraph_v1.rs`: + +```rust +//! p10-3: Tier 3 paragraph + line-window fallback chunker. +//! +//! Triggered for shell scripts (`.sh`/`.bash`/`.zsh`) directly, and as a +//! fallback when Tier 1/2 chunkers return `Ok(empty)` or `Err`. Splits by +//! blank lines into paragraphs; paragraphs > 80 lines are further split +//! into 80-line windows with 20-line overlap. 
+ +use crate::tier2_shared::build_chunk; +use crate::{Chunker, ChunkPolicy}; +use anyhow::Result; +use kebab_core::{Block, Chunk, Document}; + +pub const VERSION_LABEL: &str = "code-text-paragraph-v1"; + +const FALLBACK_LINES_PER_CHUNK: usize = 80; +const FALLBACK_LINES_OVERLAP: usize = 20; +// stride = FALLBACK_LINES_PER_CHUNK - FALLBACK_LINES_OVERLAP = 60. + +pub struct CodeTextParagraphV1Chunker; + +impl Chunker for CodeTextParagraphV1Chunker { + fn chunker_version(&self) -> &'static str { VERSION_LABEL } + + fn chunk(&self, doc: &Document, policy: &ChunkPolicy) -> Result<Vec<Chunk>> { + let Some(Block::Code { text, lang, .. }) = doc.blocks.first() else { + return Ok(vec![]); + }; + let lang_str = lang.as_deref().unwrap_or(""); + + let mut chunks = Vec::new(); + for para in split_paragraphs(text) { + push_paragraph(&mut chunks, doc, policy, &para, lang_str)?; + } + Ok(chunks) + } +} + +/// Single paragraph + its 1-indexed line range. +struct Paragraph<'a> { + text: String, // joined lines (no trailing newline) + line_start: u32, + line_end: u32, + // unused but kept for future ergonomics: + _src: std::marker::PhantomData<&'a ()>, +} + +fn split_paragraphs(text: &str) -> Vec<Paragraph<'_>> { + let mut paragraphs = Vec::new(); + let mut current: Vec<&str> = Vec::new(); + let mut current_start: Option<u32> = None; // 1-indexed line number where current paragraph began + + for (idx, line) in text.lines().enumerate() { + let line_no = (idx + 1) as u32; // 1-indexed + let is_blank = line.trim().is_empty(); + if is_blank { + // Boundary: flush current paragraph. + if let Some(start) = current_start.take() { + let end = start + current.len() as u32 - 1; + paragraphs.push(Paragraph { + text: current.join("\n"), + line_start: start, + line_end: end, + _src: std::marker::PhantomData, + }); + current.clear(); + } + } else { + if current_start.is_none() { + current_start = Some(line_no); + } + current.push(line); + } + } + // Trailing paragraph at EOF (no boundary blank line). 
+ if let Some(start) = current_start.take() { + let end = start + current.len() as u32 - 1; + paragraphs.push(Paragraph { + text: current.join("\n"), + line_start: start, + line_end: end, + _src: std::marker::PhantomData, + }); + } + paragraphs +} + +fn push_paragraph( + out: &mut Vec<Chunk>, + doc: &Document, + policy: &ChunkPolicy, + para: &Paragraph<'_>, + lang: &str, +) -> Result<()> { + let n_lines = (para.line_end - para.line_start + 1) as usize; + if n_lines <= FALLBACK_LINES_PER_CHUNK { + out.push(build_chunk( + doc, policy, + &para.text, + para.line_start, para.line_end, + "", // empty symbol — build_chunk wraps as Some(""); see Step 7 note + lang, + VERSION_LABEL, + )?); + return Ok(()); + } + + // Line-window split. Stride = window - overlap = 60. + let stride = FALLBACK_LINES_PER_CHUNK - FALLBACK_LINES_OVERLAP; + let lines: Vec<&str> = para.text.lines().collect(); + let mut i = 0usize; + while i < lines.len() { + let end = (i + FALLBACK_LINES_PER_CHUNK).min(lines.len()); + let window_text = lines[i..end].join("\n"); + let window_start = para.line_start + i as u32; + let window_end = para.line_start + (end as u32) - 1; + out.push(build_chunk( + doc, policy, + &window_text, + window_start, window_end, + "", + lang, + VERSION_LABEL, + )?); + if end == lines.len() { + break; + } + i += stride; + } + Ok(()) +} +``` + +- [ ] **Step 6**: Register the module in `crates/kebab-chunk/src/lib.rs` (next to existing Tier 2 chunker exports): + +```rust +pub mod code_text_paragraph_v1; +pub use code_text_paragraph_v1::CodeTextParagraphV1Chunker; +``` + +- [ ] **Step 7**: **`symbol = ""` vs `symbol = None` correction**. The spec says Tier 3 chunks must have `Citation::Code.symbol = None`. But `tier2_shared::build_chunk` takes `symbol: &str` and likely wraps it as `Some(s.to_string())`. 
Two options: + - **(preferred)** Add a sibling helper `build_chunk_no_symbol(doc, policy, text, line_start, line_end, lang, chunker_version) -> Result<Chunk>` in `tier2_shared.rs` that constructs `SourceSpan::Code { ..., symbol: None, lang: Some(lang.to_string()) }`. The current `build_chunk` keeps wrapping `Some(symbol)`. + - (alternative) Change `build_chunk`'s symbol parameter to `Option<&str>`. More disruption (Tier 2 callers need an update). + +Take the preferred path. Edit `crates/kebab-chunk/src/tier2_shared.rs`: add (next to `build_chunk`): + +```rust +/// Like `build_chunk` but emits `symbol: None`. Used by Tier 3. +#[allow(clippy::too_many_arguments)] +pub(crate) fn build_chunk_no_symbol( + doc: &Document, + policy: &ChunkPolicy, + text: &str, + line_start: u32, + line_end: u32, + lang: &str, + chunker_version: &str, +) -> Result<Chunk> { + // Mirror build_chunk's body but with symbol: None in the span. + // The simplest implementation calls build_chunk's underlying machinery — + // pull the body into a helper if needed, or inline minimally: + let span = SourceSpan::Code { + line_start, + line_end, + symbol: None, + lang: Some(lang.to_string()), + }; + build_chunk_from_span(doc, policy, text, span, chunker_version) +} +``` + +If `build_chunk` has a `_from_span` substructure already (see `tier2_shared.rs`), call into it. If not, extract a `build_chunk_from_span(doc, policy, text, span, chunker_version) -> Result<Chunk>` private helper, then have both `build_chunk` (with `symbol: Some(...)`) and `build_chunk_no_symbol` call into it. The diff stays small. + +Update `code_text_paragraph_v1.rs::push_paragraph` to call `build_chunk_no_symbol` instead of `build_chunk(..., "", ...)`: + +```rust +use crate::tier2_shared::build_chunk_no_symbol; +// ... +out.push(build_chunk_no_symbol( + doc, policy, + &para.text, + para.line_start, para.line_end, + lang, + VERSION_LABEL, +)?); +``` + +And the same swap in the line-window branch. 
+ +- [ ] **Step 8**: Run tests: + +```bash +cargo test -p kebab-chunk --test code_text_paragraph_v1 -- --nocapture 2>&1 | tail -25 +``` + +Expected: 4 PASS. + +- [ ] **Step 9**: Run all kebab-chunk tests (no regression on Tier 1/2 from `tier2_shared` edit): + +```bash +cargo test -p kebab-chunk -- --nocapture 2>&1 | tail -20 +``` + +Expected: all PASS. + +- [ ] **Step 10**: Clippy + commit: + +```bash +cargo clippy -p kebab-chunk --all-targets -- -D warnings +git add crates/kebab-chunk/src/code_text_paragraph_v1.rs \ + crates/kebab-chunk/src/tier2_shared.rs \ + crates/kebab-chunk/src/lib.rs \ + crates/kebab-chunk/tests/fixtures/sample_shell.sh \ + crates/kebab-chunk/tests/fixtures/sample_long_paragraph.txt \ + crates/kebab-chunk/tests/code_text_paragraph_v1.rs +git commit -m "$(cat <<'EOF' +feat(p10-3): code-text-paragraph-v1 chunker — paragraph + line-window fallback + +Blank-line paragraph segmentation (whitespace-only lines as boundaries, +blank lines themselves never in any chunk's range). Paragraphs > 80 lines +split into 80-line windows with 20-line overlap (stride 60), sharing the +input lang and symbol=None per spec §9.3. tier2_shared exposes a new +build_chunk_no_symbol helper so Chunk id/hash/token semantics stay +identical with Tier 1/2. + +4 unit tests cover multi-paragraph shell, 200-line oversize line-window +split (chunks 1-80 / 61-140 / 121-200), empty file, and lang preservation +when input is yaml. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task C: shell direct routing in `ingest_one_code_asset` + +**Files:** +- Modify: `crates/kebab-app/src/lib.rs` + +### C.1 — allowlist + 4-arm match + +- [ ] **Step 1**: Open `crates/kebab-app/src/lib.rs` line 953. 
Current allowlist: + +```rust +if matches!(lang.as_str(), + "rust" | "python" | "typescript" | "javascript" | "go" | "java" | "kotlin" + | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod") +``` + +Add `"shell"`: + +```rust +if matches!(lang.as_str(), + "rust" | "python" | "typescript" | "javascript" | "go" | "java" | "kotlin" + | "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" + | "shell") +``` + +- [ ] **Step 2**: At the top-of-file `use kebab_chunk::{...}` line, add `CodeTextParagraphV1Chunker`: + +```rust +use kebab_chunk::{ + /* existing items */, + CodeTextParagraphV1Chunker, +}; +``` + +- [ ] **Step 3**: parser_version match (line ~1825): + +```rust +let parser_version = match code_lang { + // ... existing 7 Tier 1 arms ... + "kotlin" => ParserVersion(kebab_parse_code::KOTLIN_PARSER_VERSION.to_string()), + "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" + => ParserVersion("none-v1".to_string()), + // p10-3: shell also uses Tier 3 (no parse step). + "shell" => ParserVersion("none-v1".to_string()), + other => anyhow::bail!("unsupported code_lang: {other}"), +}; +``` + +- [ ] **Step 4**: chunker_version match: + +```rust +let chunker_version = match code_lang { + // ... existing arms ... + "toml" | "json" | "xml" | "groovy" | "go-mod" + => ManifestFileV1Chunker.chunker_version(), + // p10-3: + "shell" => CodeTextParagraphV1Chunker.chunker_version(), + other => anyhow::bail!("unreachable chunker_version: {other}"), +}; +``` + +- [ ] **Step 5**: extract match (canonical Document construction): + +```rust +let mut canonical = match code_lang { + // ... existing Tier 1 + Tier 2 arms ... + "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" => { + synthesize_tier2_document(asset, &bytes, code_lang, &parser_version)? + } + // p10-3: shell reuses the same synthesizer — single Block::Code with raw text. 
+ "shell" => synthesize_tier2_document(asset, &bytes, "shell", &parser_version)?, + other => anyhow::bail!("unreachable (extract): {other}"), +}; +``` + +- [ ] **Step 6**: chunks match: + +```rust +let chunks = match code_lang { + // ... existing Tier 1 + Tier 2 arms ... + "toml" | "json" | "xml" | "groovy" | "go-mod" + => ManifestFileV1Chunker.chunk(&canonical, chunk_policy) + .context("kb-chunk::ManifestFileV1Chunker::chunk")?, + // p10-3: + "shell" => CodeTextParagraphV1Chunker.chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeTextParagraphV1Chunker::chunk (code:shell)")?, + other => anyhow::bail!("unreachable (chunk): {other}"), +}; +``` + +- [ ] **Step 7**: Build: + +```bash +cargo build -p kebab-app 2>&1 | tail -5 +``` + +Expected: clean. + +- [ ] **Step 8**: Clippy + interim commit (allowlist + 4 arms only; fallback wrapper is the next task): + +```bash +cargo clippy -p kebab-app --all-targets -- -D warnings +git add crates/kebab-app/src/lib.rs +git commit -m "$(cat <<'EOF' +feat(p10-3): activate shell direct routing through Tier 3 chunker + +Extends ingest_one_code_asset's allowlist + 4-arm match (parser_version / +chunker_version / extract / chunks) to admit code_lang "shell" and route it +to CodeTextParagraphV1Chunker. parser_version "none-v1" + synthesize_tier2_document +reused. + +Tier 1/2 fallback wrapper lands in the next commit. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task D: Tier 1/2 → Tier 3 fallback wrapper + +**Files:** +- Modify: `crates/kebab-app/src/lib.rs` + +The post-chunk fallback wrapper. After the `chunks` match resolves, if the result is `Ok(empty)` (Tier 2 invalid YAML / non-k8s YAML) or `Err(_)` (Tier 1 extractor / chunker failure), retry with Tier 3. 
+ +- [ ] **Step 1**: Reshape the `chunks` match to NOT use `?`, instead bind a `Result<Vec<Chunk>>`: + +```rust +let chunks_result: anyhow::Result<Vec<Chunk>> = match code_lang { + "rust" => CodeRustAstV1Chunker.chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeRustAstV1Chunker::chunk (code:rust)"), + "python" => CodePythonAstV1Chunker.chunk(&canonical, chunk_policy) + .context("kb-chunk::CodePythonAstV1Chunker::chunk (code:python)"), + // ... existing arms similarly bind Result, no `?` ... + "shell" => CodeTextParagraphV1Chunker.chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeTextParagraphV1Chunker::chunk (code:shell)"), + other => anyhow::bail!("unreachable (chunk): {other}"), +}; +``` + +(Every existing arm: replace its `.context(...)?` with `.context(...)` — drop the trailing `?`. The result of the whole match is now `anyhow::Result<Vec<Chunk>>`.) + +- [ ] **Step 2**: Add the fallback wrapper directly after the match: + +```rust +// p10-3: Tier 1/2 0-chunk OR error → Tier 3 fallback retry. +// "shell" is direct Tier 3 already; don't retry-double-up. +let chunks = match chunks_result { + Ok(v) if !v.is_empty() => v, + other if code_lang == "shell" => { + // shell direct call already IS Tier 3 — don't retry. Propagate. + other? + } + Ok(_empty) => { + tracing::warn!( + workspace_path = %asset.workspace_path, + code_lang = code_lang, + "tier1/2 emitted 0 chunks; falling back to tier 3 (code-text-paragraph-v1)" + ); + chunker_version = CodeTextParagraphV1Chunker.chunker_version(); + canonical.parser_version = ParserVersion("none-v1".to_string()); + CodeTextParagraphV1Chunker.chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeTextParagraphV1Chunker::chunk (tier 3 fallback)")? 
+ } + Err(e) => { + tracing::warn!( + workspace_path = %asset.workspace_path, + code_lang = code_lang, + error = %e, + "tier1/2 errored; falling back to tier 3 (code-text-paragraph-v1)" + ); + chunker_version = CodeTextParagraphV1Chunker.chunker_version(); + canonical.parser_version = ParserVersion("none-v1".to_string()); + CodeTextParagraphV1Chunker.chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeTextParagraphV1Chunker::chunk (tier 3 fallback after error)")? + } +}; +``` + +Notes: +- `chunker_version` must already be a `let mut` binding (Task C step 4). If it's currently `let`, change to `let mut`. +- `canonical.parser_version` mutation requires `canonical` to be `let mut` (it already is per Task G of p10-2 — `let mut canonical = match ...`). +- The `workspace_path` field on `asset` is a `WorkspacePath(String)` newtype; the `%` formatter uses its `Display` impl. Verify by `git grep "fn fmt" crates/kebab-core/src/ids.rs` if needed — `WorkspacePath` derives `Display`. + +- [ ] **Step 3**: When fallback fires, the extract step for a Tier 1 lang (`"rust"` / `"python"` / ...) **didn't run** (because the Tier 1 `extract` call errored before reaching `chunks_result`). So `canonical` may already be set up correctly for the Tier 3 chunker — IF the extract step succeeded but chunking returned empty. But if extract itself errored (e.g. tree-sitter parse failure), `canonical` was never built and the chunks_result match arm never executed. + + Reshape the **extract** match similarly: + +```rust +let canonical_result: anyhow::Result = match code_lang { + "rust" => RustAstExtractor::new().extract(&ctx, &bytes) + .context("kb-parse-code::RustAstExtractor::extract (code:rust)"), + // ... existing arms ... → drop trailing `?` + "shell" => Ok(synthesize_tier2_document(asset, &bytes, "shell", &parser_version)?), + // (synthesize_tier2_document returns anyhow::Result; the `?` here + // is fine because the Tier 2 synthesizer call is itself the inner Result. 
If it + // failed, we want to propagate the synthesizer error — synthesize_tier2_document + // can fail on non-utf8 bytes; falling back from a non-utf8 file makes no sense.) + other => anyhow::bail!("unreachable (extract): {other}"), +}; + +// p10-3: extract failure (e.g. tree-sitter parse error) → Tier 3 fallback with +// a synthesized Document. +let mut canonical = match canonical_result { + Ok(d) => d, + Err(_) if code_lang == "shell" => { + // shell's extract goes through synthesize_tier2_document — if THAT fails (non-utf8), + // there's nothing to fall back to. Propagate. + canonical_result? + } + Err(e) => { + tracing::warn!( + workspace_path = %asset.workspace_path, + code_lang = code_lang, + error = %e, + "tier1/2 extract errored; falling back to tier 3 synthesized doc" + ); + chunker_version = CodeTextParagraphV1Chunker.chunker_version(); + // Build the Tier 3 doc from raw bytes. parser_version was originally Tier 1's + // (e.g. RUST_PARSER_VERSION); swap to "none-v1" so try_skip_unchanged keys correctly. + let tier3_parser_version = ParserVersion("none-v1".to_string()); + let mut tier3_doc = synthesize_tier2_document(asset, &bytes, code_lang, &tier3_parser_version)?; + tier3_doc.parser_version = tier3_parser_version; + tier3_doc + } +}; +``` + +If after extract fallback the original `chunks_result` match was going to run a Tier 1 chunker against a Tier 3 doc — that would crash because Tier 1 chunkers expect AST output. So when extract fell back, the *chunks* match must also use Tier 3 directly. Solution: drop the chunks match into an `if-else` flow: + +```rust +let extract_fell_back = matches!(canonical.parser_version.0.as_str(), "none-v1") + && !matches!(code_lang, "yaml" | "dockerfile" | "toml" | "json" | "xml" | "groovy" | "go-mod" | "shell"); + +let chunks = if extract_fell_back { + // Extract already fell back to Tier 3 doc shape; run Tier 3 chunker directly. 
+ CodeTextParagraphV1Chunker.chunk(&canonical, chunk_policy) + .context("kb-chunk::CodeTextParagraphV1Chunker::chunk (tier 3 after extract fallback)")? +} else { + // Normal path — Tier 1/2/3 chunker per code_lang. + let chunks_result: anyhow::Result<Vec<Chunk>> = match code_lang { + // ... arms above ... + }; + // ... fallback wrapper from Step 2 ... +}; +``` + +This is getting complex; consider a helper. **Refactor signal**: extract this logic into a single function `tier3_fallback_chunks(asset, bytes, code_lang, chunk_policy, original_parser_version, &mut canonical, &mut chunker_version) -> Result<Vec<Chunk>>` if the inline becomes hard to read. + +For the plan, keep it inline but readable. The reviewer will catch readability issues. If a subagent reports DONE_WITH_CONCERNS citing complexity, refactor in a follow-up step. + +- [ ] **Step 4**: Build: + +```bash +cargo build -p kebab-app 2>&1 | tail -5 +``` + +Expected: clean. + +- [ ] **Step 5**: Run existing kebab-app unit tests (no regression): + +```bash +cargo test -p kebab-app --lib -- --nocapture 2>&1 | tail -10 +``` + +Expected: 52 PASS (matching the count after Task G of p10-2). + +- [ ] **Step 6**: Clippy + commit: + +```bash +cargo clippy -p kebab-app --all-targets -- -D warnings +git add crates/kebab-app/src/lib.rs +git commit -m "$(cat <<'EOF' +feat(p10-3): Tier 1/2 → Tier 3 fallback wrapper in ingest_one_code_asset + +After the chunks match resolves, an Ok(empty) result (Tier 2 invalid YAML +/ non-k8s YAML / similar) or Err (Tier 1 extractor / chunker failure) is +retried against CodeTextParagraphV1Chunker. On retry, chunker_version is +swapped to "code-text-paragraph-v1" and canonical.parser_version to +"none-v1" so downstream stamping + try_skip_unchanged remain consistent. + +Extract failure is handled similarly — when a Tier 1 extractor errors +(e.g. tree-sitter parse failure), a synthesize_tier2_document-shaped +fallback doc is built from raw bytes and routed through Tier 3 chunker +directly. 
+ +shell direct path is exempted from the fallback chain (it IS Tier 3 +already). + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task E: integration smoke tests (Tier 3) + +**Files:** +- Modify: `crates/kebab-app/tests/code_ingest_smoke.rs` + +Two new tests. Mirror the pattern from p10-2's three Tier 2 tests (commit `166e1dd`). + +- [ ] **Step 1**: Read the existing `tests/code_ingest_smoke.rs` first — especially the three p10-2 Tier 2 tests near the end (`tier2_k8s_yaml_ingest_searchable`, `tier2_dockerfile_ingest_searchable`, `tier2_cargo_toml_ingest_searchable`). Replicate the `TestEnv::lexical_only()` + ingest_with_config + search_with_config pattern. + +- [ ] **Step 2**: Append two tests at the end of the file: + +```rust +#[test] +fn tier3_shell_ingest_searchable() { + let env = TestEnv::lexical_only(); + let workspace = env.workspace_root(); + std::fs::write( + workspace.join("deploy.sh"), + "#!/usr/bin/env bash\nset -e\necho hello\n\nkebab ingest --json\n", + ) + .unwrap(); + + let report = env.ingest().expect("ingest"); + assert!(report.new_docs >= 1, "expected at least 1 new doc, got {}", report.new_docs); + + let hits = env.search_code_lang("shell", "kebab").expect("search"); + assert!(!hits.is_empty(), "expected at least 1 shell hit"); + + let citation = match &hits[0].citation { + Citation::Code { symbol, lang, .. } => (symbol.clone(), lang.clone()), + other => panic!("expected Citation::Code, got {other:?}"), + }; + assert_eq!(citation.0, None, "Tier 3 symbol must be None"); + assert_eq!(citation.1.as_deref(), Some("shell")); + + // chunker_version should be code-text-paragraph-v1. 
+ assert_eq!( + hits[0].chunker_version.as_deref(), + Some("code-text-paragraph-v1"), + "shell chunks must be stamped with the Tier 3 chunker_version" + ); +} + +#[test] +fn tier3_yaml_fallback_picks_up_non_k8s_yaml() { + let env = TestEnv::lexical_only(); + let workspace = env.workspace_root(); + + // docker-compose-shaped YAML — has `version:` and `services:` but no apiVersion/kind. + // k8s chunker will return Ok(vec![]); the Tier 3 fallback should pick this up. + std::fs::write( + workspace.join("docker-compose.yml"), + "version: '3'\nservices:\n api:\n image: nginx:latest\n ports:\n - 8080:80\n", + ) + .unwrap(); + + let report = env.ingest().expect("ingest"); + assert!(report.new_docs >= 1, "expected the non-k8s yaml to be ingested via Tier 3, got {} new docs", report.new_docs); + + let hits = env.search_code_lang("yaml", "nginx").expect("search"); + assert!(!hits.is_empty(), "expected at least 1 yaml fallback hit"); + + let (symbol, lang) = match &hits[0].citation { + Citation::Code { symbol, lang, .. } => (symbol.clone(), lang.clone()), + other => panic!("expected Citation::Code, got {other:?}"), + }; + assert_eq!(symbol, None, "Tier 3 fallback symbol must be None"); + assert_eq!(lang.as_deref(), Some("yaml"), "lang preserved through fallback"); + + assert_eq!( + hits[0].chunker_version.as_deref(), + Some("code-text-paragraph-v1"), + "non-k8s yaml fallback must be stamped code-text-paragraph-v1" + ); +} +``` + +(The helpers `TestEnv::lexical_only()`, `workspace_root()`, `ingest()`, `search_code_lang(lang, query)` — verify their actual names by reading the file. The first test in `code_ingest_smoke.rs` uses whatever the established API is; mirror it precisely.) + +- [ ] **Step 3**: Run targeted tests: + +```bash +cargo test -p kebab-app --test code_ingest_smoke tier3 -- --nocapture 2>&1 | tail -30 +``` + +Expected: 2 PASS. 
+ +- [ ] **Step 4**: Run the entire smoke file: + +```bash +cargo test -p kebab-app --test code_ingest_smoke -- --nocapture 2>&1 | tail -30 +``` + +Expected: 14 PASS (12 existing + 2 new). + +- [ ] **Step 5**: Clippy + commit: + +```bash +cargo clippy -p kebab-app --tests -- -D warnings +git add crates/kebab-app/tests/code_ingest_smoke.rs +git commit -m "$(cat <<'EOF' +test(p10-3): integration smoke tests for Tier 3 (shell + yaml fallback) + +Two new tests verify end-to-end Tier 3 wiring: +- tier3_shell_ingest_searchable: .sh file → --code-lang shell search → + Citation::Code { symbol: None, lang: "shell" }, chunker_version + "code-text-paragraph-v1". +- tier3_yaml_fallback_picks_up_non_k8s_yaml: docker-compose-shaped yaml + (no apiVersion/kind) triggers k8s chunker's Ok(vec![]) result, fallback + retries with Tier 3 → Citation::Code { symbol: None, lang: "yaml" } and + chunker_version "code-text-paragraph-v1". + +Brings code_ingest_smoke to 14 tests (Tier 1: 9, Tier 2: 3, Tier 3: 2). + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task F: frozen design §10.1 + §10 activation log + +**Files:** +- Modify: `docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md` +- Modify: `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` + +- [ ] **Step 1**: Read §10.1 of `docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md` to find the existing activation log format (Task I of p10-2 added the p10-2 entry there). Add a sibling entry right after: + +``` +| p10-3 | Tier 3 활성화 — code-text-paragraph-v1 active. shell direct routing + Tier 1/2 fallback wrapper (0-chunk or Err → Tier 3 retry). 비-k8s YAML / invalid YAML 자동 picked up. | 2026-05-21 | +``` + +(Match the table style exactly. If the existing entries are bullets, use a bullet.) + +- [ ] **Step 2**: Read §10 of `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` (around line 1552 — the p10-2 entry is there). 
Add right after: + +``` +**p10-3 활성화 (Tier 3 paragraph fallback) (2026-05-21)**: Tier 3 chunker `code-text-paragraph-v1` 활성화. shell script (`.sh`/`.bash`/`.zsh`) direct routing + Tier 1/2 가 0 chunk 또는 Err 시 자동 fallback 으로 retry. 비-k8s YAML / invalid YAML / AST 실패 케이스 모두 picked up. lang 은 입력 보존 (shell → "shell", yaml → "yaml" 등), symbol 은 항상 None. +``` + +- [ ] **Step 3**: Commit: + +```bash +git add docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md \ + docs/superpowers/specs/2026-04-27-kebab-final-form-design.md +git commit -m "$(cat <<'EOF' +docs(p10-3): activate Tier 3 in frozen design §10.1 + §10 + +§10.1 (code-ingest design): add activation log entry for p10-3. +§10 (final-form design): mirror entry in the activation log. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task G: README + HANDOFF + ARCHITECTURE + SMOKE + tasks/INDEX + tasks/p10/INDEX + +**Files:** +- Modify: `README.md` +- Modify: `HANDOFF.md` +- Modify: `docs/ARCHITECTURE.md` +- Modify: `docs/SMOKE.md` +- Modify: `tasks/INDEX.md` +- Modify: `tasks/p10/INDEX.md` + +### G.1 — README + +- [ ] **Step 1**: Open `README.md`. Find the `kebab ingest` row in the 명령 table (line ~73). Extend the supported-langs list with shell + the fallback note. Sample patch (adjust to actual current wording): + +Find: + +``` +**소스코드** (`.rs` → `code-rust-ast-v1`, `.py` → ... , `.kt`/`.kts` → `code-kotlin-ast-v1` — 모두 tree-sitter AST chunker; **Tier 2 리소스 파일**: ...) +``` + +After the `Tier 2 리소스 파일: ...` clause, insert before the closing `)`: + +``` +; **Tier 3 paragraph fallback** (`.sh`/`.bash`/`.zsh` → `code-text-paragraph-v1`, blank-line paragraph split + 80-line/20-overlap line-window. Tier 1/2 가 0 chunk 또는 Err 시 자동 fallback — 비-k8s YAML 같은 케이스 picked up. symbol = None, lang 은 원본 보존.) +``` + +Also extend the `--code-lang` enumeration: + +``` +--code-lang ... / --code-lang shell / ... +``` + +- [ ] **Step 2**: Find the Mermaid diagram's chunker node (line ~135). 
Replace: + +``` +chunker["chunker (md-heading-v1, pdf-page-v1, code-{rust,python,ts,js,go,java,kotlin}-ast-v1, k8s-manifest-resource-v1, dockerfile-file-v1, manifest-file-v1)"] +``` + +with (add `code-text-paragraph-v1`): + +``` +chunker["chunker (md-heading-v1, pdf-page-v1, code-{rust,python,ts,js,go,java,kotlin}-ast-v1, k8s-manifest-resource-v1, dockerfile-file-v1, manifest-file-v1, code-text-paragraph-v1)"] +``` + +### G.2 — HANDOFF + +- [ ] **Step 3**: Open `HANDOFF.md`. Find the phase table row for P10. The Tier 2 row was `**2 ✅ (Tier 2 resource-aware: ... — v0.14.0)**`. Add a sibling for Tier 3: + +``` +, **3 ✅ (Tier 3 paragraph fallback: code-text-paragraph-v1 — v0.15.0)** +``` + +(Insert into the same P10 list cell — match the comma + bold styling used by neighbors.) + +Update the 한 줄 요약 at the top: replace `Tier 2 리소스 파일 (yaml/k8s / dockerfile / toml / json / xml / groovy / go-mod) 처리` with `... + Tier 3 paragraph fallback (shell / 비-k8s YAML / AST 실패) 처리`. And in 다음 후보: drop p10-3, leave `P10-1D (C/C++) 또는 P9-5 (desktop tauri) 또는 보류 중인 P8 (audio)`. + +### G.3 — ARCHITECTURE + +- [ ] **Step 4**: Open `docs/ARCHITECTURE.md`. Find the code parser table row (line ~25 per Task J of p10-2). After the Tier 2 sentence, add: + +``` +**Tier 3 (p10-3)**: shell scripts (`.sh`/`.bash`/`.zsh`) direct → `code-text-paragraph-v1` (blank-line paragraph segmentation + 80-line / 20-overlap line-window for oversize). Same chunker also serves as fallback when Tier 1/2 emit 0 chunks or Err — non-k8s YAML / invalid YAML / AST extractor failures all picked up. symbol = None; lang preserved from input doc. +``` + +- [ ] **Step 5**: Find the `flowchart TB` block (line ~52). The `pcode` node currently says `(P10-1A-2 + P10-1B + P10-1C-Go + P10-1C-JK + P10-2)`. Update to `(P10-1A-2 + P10-1B + P10-1C-Go + P10-1C-JK + P10-2 + P10-3)`. + +- [ ] **Step 6**: Find the `crates/kebab-chunk/src/` tree (line ~165). 
Add an entry for `code_text_paragraph_v1.rs`: + +``` +│ │ ├── code_*_ast_v1.rs # Tier 1 AST chunkers (rust/python/ts/js/go/java/kotlin) +│ │ ├── k8s_manifest_resource_v1.rs # Tier 2 (p10-2): YAML multi-doc, apiVersion+kind per resource +│ │ ├── dockerfile_file_v1.rs # Tier 2 (p10-2): whole-file Dockerfile +│ │ ├── manifest_file_v1.rs # Tier 2 (p10-2): whole-file Cargo.toml / go.mod / .json / .xml / .groovy +│ │ ├── code_text_paragraph_v1.rs # Tier 3 (p10-3): blank-line paragraph + 80/20 line-window fallback +│ │ └── tier2_shared.rs # Tier 2 (p10-2): shared oversize fallback + Chunk builder helpers +``` + +### G.4 — SMOKE + +- [ ] **Step 7**: Open `docs/SMOKE.md`. After the "P10-2 Tier 2 리소스 파일 색인" section, add a "P10-3 Tier 3 paragraph fallback" section: + +````markdown +## P10-3 Tier 3 paragraph fallback + +P10-2 와 동일한 격리 KB 설정. `.sh` 파일은 direct, 비-k8s YAML 은 fallback 으로 들어간다. + +```bash +# 1) shell script (direct Tier 3) +cat > /tmp/kebab-smoke/workspace/deploy.sh <<'EOF' +#!/usr/bin/env bash +set -e + +echo "ingesting..." 
+kebab ingest + +echo "done" +kebab schema --json | jq '.stats' +EOF + +# 2) 비-k8s YAML (Tier 2 가 0 chunk → Tier 3 fallback) +cat > /tmp/kebab-smoke/workspace/docker-compose.yml <<'EOF' +version: '3' +services: + api: + image: nginx:latest + ports: + - 8080:80 +EOF + +# 3) ingest +KB ingest + +# 4) 언어별 검색 (citation.symbol = None 확인) +KB search --mode hybrid "ingest" --code-lang shell --json | \ + jq '{hits: [.hits[] | {symbol: .citation.symbol, lang: .citation.lang, chunker: .chunker_version}]}' +# 기대: symbol = null, lang = "shell", chunker_version = "code-text-paragraph-v1" + +KB search --mode hybrid "nginx" --code-lang yaml --json | \ + jq '{hits: [.hits[] | {symbol: .citation.symbol, lang: .citation.lang, chunker: .chunker_version}]}' +# 기대: symbol = null, lang = "yaml", chunker_version = "code-text-paragraph-v1" + +# 5) schema stats 에 shell 카운트 확인 +KB --json schema | jq '.stats.code_lang_breakdown' +# 기대: {"shell": N, "yaml": M, ...} (M 은 k8s yaml + Tier 3 fallback yaml 합계) +``` + +**Tier 3 citation.symbol 컨벤션**: 항상 `null`. 의미 단위 식별 안 함. `lang` 은 원본 lang 보존 (shell → `"shell"`, yaml → `"yaml"` 등). +```` + +Append a P10-3 entry to the 검증 체크리스트 at the bottom: + +``` +- (P10-3) `.sh`/`.bash`/`.zsh` 파일은 direct Tier 3 (`code-text-paragraph-v1`). 비-k8s YAML (apiVersion+kind 없는 yaml) 은 k8s chunker 가 0 chunk → Tier 3 fallback 으로 picked up. `--code-lang shell` / `--code-lang yaml` 검색이 `citation.symbol = null`, `chunker_version = "code-text-paragraph-v1"` 결과를 반환하면 wiring 정상. `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"shell": N` 등장 확인. 
+``` + +### G.5 — INDEX files + +- [ ] **Step 8**: `tasks/INDEX.md` — flip the p10-3 row to ✅: + +Find: +``` + - p10-3 Tier 3 paragraph + line-window fallback — ⏳ +``` +Replace with: +``` + - p10-3 Tier 3 paragraph + line-window fallback — ✅ 머지 (v0.15.0, `code-text-paragraph-v1`) +``` + +- [ ] **Step 9**: `tasks/p10/INDEX.md` — same row, change to ✅: + +``` +| 3 | Tier 3 paragraph + line-window fallback | ✅ 머지 (v0.15.0) | +``` + +### G.6 — commit + +- [ ] **Step 10**: Single commit for all 6 docs: + +```bash +git add README.md HANDOFF.md docs/ARCHITECTURE.md docs/SMOKE.md tasks/INDEX.md tasks/p10/INDEX.md +git commit -m "$(cat <<'EOF' +docs(p10-3): README/HANDOFF/ARCHITECTURE/SMOKE/INDEX sync + +- README adds Tier 3 to the ingest row (shell + fallback) and the Mermaid + chunker enumeration; --code-lang shell admitted. +- HANDOFF flips p10-3 to ✅ (v0.15.0) and updates the 한 줄 요약 + next + candidates. +- ARCHITECTURE adds Tier 3 to the code-parser row, extends the flowchart + pcode node, and lists code_text_paragraph_v1.rs in the chunker tree. +- SMOKE adds a P10-3 walkthrough (shell + non-k8s YAML fallback) and a + verification checklist entry. +- tasks/INDEX + tasks/p10/INDEX flip p10-3 to ✅. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task H: workspace test gate + clippy + +**Files:** (none — gates only) + +- [ ] **Step 1**: Disk check: + +```bash +df -h / +``` + +If usage > 80%, run `cargo clean` first. + +- [ ] **Step 2**: Workspace test gate (memory-conscious `-j 1`): + +```bash +cargo test --workspace --no-fail-fast -j 1 2>&1 | tail -80 +``` + +Expected: ALL PASS. Especially: +- `kebab-chunk`: 4 new Tier 3 tests + existing. +- `kebab-app`: 14 tests in `code_ingest_smoke` (12 + 2 new). + +If FAIL: common modes: +- A Tier 1/2 test inadvertently relied on the chunks match's prior `?`-propagation behavior — Task D's restructuring shouldn't change observable behavior but check. +- A test that expected `Err` from Tier 1 (e.g. 
invalid input fixture) now gets `Ok(vec![chunk])` (Tier 3 fallback). Such tests would be tests-of-failure-mode rather than tests-of-success — likely intentional regression coverage. Review case-by-case. + +- [ ] **Step 3**: Workspace clippy: + +```bash +cargo clippy --workspace --all-targets -- -D warnings 2>&1 | tail -30 +``` + +Expected: clean. + +--- + +## Task I: workspace version bump + gitea PR + +**Files:** +- Modify: `Cargo.toml` +- Modify: `Cargo.lock` (auto) + +- [ ] **Step 1**: Edit `Cargo.toml` workspace `version = "0.14.0"` → `"0.15.0"`. + +- [ ] **Step 2**: Refresh Cargo.lock: + +```bash +cargo build -p kebab-cli 2>&1 | tail -5 +``` + +Expected: clean. `Cargo.lock` cascades all 23 `kebab-*` crates to 0.15.0. + +- [ ] **Step 3**: Commit: + +```bash +git add Cargo.toml Cargo.lock +git commit -m "$(cat <<'EOF' +chore: bump version 0.14.0 → 0.15.0 (p10-3 Tier 3 paragraph fallback) + +Minor bump — additive new chunker_version "code-text-paragraph-v1" + new +routing lang "shell" + new fallback wrapper behavior. No DB migration, no +wire schema major bump (Citation::Code.lang values were already a free +string field). + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +- [ ] **Step 4**: Push branch + open gitea PR via REST API (per CLAUDE.md). Title: + +``` +feat(p10-3): Tier 3 paragraph + line-window fallback chunker — shell + 비-k8s YAML / AST 실패 자동 picked up +``` + +Body summary: + +- `code-text-paragraph-v1` chunker activated (design §9.3). +- shell scripts (`.sh`/`.bash`/`.zsh`) ingest directly via Tier 3. +- Tier 1/2 0-chunk or Err results retry with Tier 3 — non-k8s YAML, invalid YAML, AST extractor failures all picked up. `chunker_version` + `parser_version` swap on fallback. +- 4 unit tests + 2 smoke tests = 6 new testing surfaces. +- frozen design §10.1 + §10 deltas. +- 0.14.0 → 0.15.0. 
+ +Test plan checkboxes: +- [x] `cargo test --workspace --no-fail-fast -j 1` PASS +- [x] `cargo clippy --workspace --all-targets -- -D warnings` clean +- [x] kebab-chunk 4 new Tier 3 unit tests PASS +- [x] kebab-app code_ingest_smoke 14 tests PASS (12 + 2 new) +- [ ] post-merge dogfood: multi-root KB ingest with mixed .sh + non-k8s yaml — verify --code-lang shell results and schema breakdown +- [ ] post-merge gitea-release v0.15.0 + +- [ ] **Step 5**: Wait for code-reviewer APPROVE, then merge via the gitea REST API (`POST /repos/altair823-org/kebab/pulls/{index}/merge`) and cut `gitea-release v0.15.0`. + +--- + +## Verification matrix (final, after Task I merge) + +| 검증 | 명령 | 기대 | +|------|------|------| +| shell direct | `kebab ingest /tmp/kebab-smoke/workspace/deploy.sh` + `kebab search --code-lang shell --json` | `Citation::Code { symbol: null, lang: "shell" }`, `chunker_version: "code-text-paragraph-v1"` | +| 비-k8s YAML fallback | `kebab ingest /tmp/kebab-smoke/workspace/docker-compose.yml` + `kebab search --code-lang yaml --json` | `Citation::Code { symbol: null, lang: "yaml" }`, `chunker_version: "code-text-paragraph-v1"` | +| invalid YAML fallback | malformed yaml ingest → search | Tier 3 chunks emitted (non-empty) | +| AST extractor 실패 fallback | (hard to trigger artificially — relies on tree-sitter parse failure on otherwise-valid Rust; this is dogfood territory) | `Citation::Code { symbol: null, lang: "rust" }`, `chunker_version: "code-text-paragraph-v1"` | +| `code_lang_breakdown` | `kebab schema --json \| jq .stats.code_lang_breakdown` | `"shell": N`, `"yaml": M+K` (k8s + fallback) | + +--- + +## Risks reminder (구현 중 주의) + +- **Fallback wrapper 의 복잡도**: Task D 가 가장 위험 — extract failure + chunks failure 두 path 가 얽힘. 한 helper 로 추출하는 게 깔끔할 수 있음. 가독성이 나빠지면 subagent 가 DONE_WITH_CONCERNS 로 보고 → 후속 cleanup commit. +- **`tier2_shared::build_chunk_no_symbol` 추가**: 기존 `build_chunk` 의 body 를 재사용하려면 `build_chunk_from_span` 내부 helper 분리 가능. 
분리하면 build_chunk + build_chunk_no_symbol 둘 다 한 곳에서 Chunk 구성. +- **shell fixture line count 정확성**: Task B Step 1 의 `sample_shell.sh` 가 정확히 3 paragraph 가 되도록 — `#` 줄과 명령 줄 사이 빈 줄이 없어야 같은 paragraph, paragraph 사이엔 정확히 1 빈 줄. fixture 생성 후 `cat -A` 로 확인 권장. +- **200-line fixture stride 계산**: Step 2 의 `sample_long_paragraph.txt` 가 정확히 200 lines. window 80 / stride 60 → 1-80, 61-140, 121-200. 마지막 window 의 시작이 121 인 이유 = 121 + 80 - 1 = 200 ≤ 200. 다음 stride (181) 의 시작은 181 인데 181 + 80 - 1 = 260 > 200 이라 그냥 EOF 까지 = 181-200 (20 lines). 즉 알고리즘에 따라 chunk 수가 3 또는 4 가 됨. Task B Step 5 의 impl 확인 — `while i < lines.len()` 에서 마지막 stride 가 EOF 를 넘어서면 짧은 마지막 window emit. 확인 결과 expectation 이 정확히 3 chunk 인지 4 chunk 인지 (3 이 맞으면 `break` 조건이 `if end == lines.len() { break }` 로 처리) 검증. + - **재계산**: 200 lines, window 80, stride 60. + - i=0, end=min(80, 200)=80 → chunk 1-80, break? end != len(200), 계속, i += 60 → i=60. + - i=60, end=min(140, 200)=140 → chunk 61-140, end != 200, i=120. + - i=120, end=min(200, 200)=200 → chunk 121-200, end == 200 → break. + - 정확히 3 chunk. Test expectation 맞음. + +- **`canonical.parser_version` mutation 의 영향 범위**: try_skip_unchanged 는 ingest_one_code_asset 시작부에서 `parser_version` (캡처된 값) 으로 체크. fallback 후 stored 가 `none-v1` 로 변경 — 다음 ingest 시 동일 lang 이면 동일 `parser_version` 으로 키 → skip 동작. Tier 1 chunker 가 미래에 정상 동작하기 시작하면 Tier 1 path 가 `RUST_PARSER_VERSION` 으로 새 키 → cache miss → reprocess. cascade rule 정상. + +- **머지 후 deviation** 은 `tasks/HOTFIXES.md` dated 로그 + 본 spec `Risks / notes` cross-link. diff --git a/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md b/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md index b946439..72992fb 100644 --- a/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md +++ b/docs/superpowers/specs/2026-04-27-kebab-final-form-design.md @@ -1551,6 +1551,8 @@ transitional 형태) 의 source of truth. 
**p10-2 활성화 (Tier 2 chunker) (2026-05-20)**: Tier 2 resource-aware chunker 3종 활성화 — k8s-manifest-resource-v1 (`.yaml`/`.yml`), dockerfile-file-v1 (`Dockerfile`), manifest-file-v1 (`Cargo.toml` 등 설정 파일). 추가 code_lang 매핑: XML (`.xml`, `pom.xml`), Groovy (`build.gradle`, `.gradle`), Go module (`go.mod`). +**p10-3 활성화 (Tier 3 paragraph fallback) (2026-05-21)**: Tier 3 chunker `code-text-paragraph-v1` 활성화. shell script (`.sh`/`.bash`/`.zsh`) direct routing + Tier 1/2 가 0 chunk 또는 Err 시 자동 fallback 으로 retry. 비-k8s YAML / invalid YAML / AST 실패 케이스 모두 picked up. lang 은 입력 보존 (shell → "shell", yaml → "yaml" 등), symbol 은 항상 None. + ### 10.2 MCP server transport (fb-30) `kebab mcp` 가 stdio JSON-RPC server. Rust SDK = `rmcp 1.6`. Tool surface diff --git a/tasks/INDEX.md b/tasks/INDEX.md index 47922d5..7bf62ac 100644 --- a/tasks/INDEX.md +++ b/tasks/INDEX.md @@ -146,7 +146,7 @@ P0~P5 는 직렬. P6~P9 는 P5 이후 병렬 가능. - p10-1C-JavaKotlin Java + Kotlin AST chunkers — 🟢 PR 오픈 (v0.13.0, `code-java-ast-v1` / `code-kotlin-ast-v1`) - p10-1D C + C++ AST chunkers — ⏳ - p10-2 Tier 2 resource-aware — ✅ 머지 (v0.14.0, `k8s-manifest-resource-v1` / `dockerfile-file-v1` / `manifest-file-v1`) - - p10-3 Tier 3 paragraph + line-window fallback — ⏳ + - p10-3 Tier 3 paragraph + line-window fallback — ✅ 머지 (v0.15.0, `code-text-paragraph-v1`) ## Post-merge 핫픽스 diff --git a/tasks/p10/INDEX.md b/tasks/p10/INDEX.md index c14a287..f2bb2c9 100644 --- a/tasks/p10/INDEX.md +++ b/tasks/p10/INDEX.md @@ -9,6 +9,6 @@ | 1C-JavaKotlin | Java + Kotlin AST chunkers (`code-java-ast-v1` / `code-kotlin-ast-v1`) | 🟢 PR 오픈 (v0.13.0) | | 1D | C + C++ AST chunkers | ⏳ | | 2 | Tier 2 resource-aware (k8s / Dockerfile / manifest) | ✅ 머지 (v0.14.0) | -| 3 | Tier 3 paragraph + line-window fallback | ⏳ | +| 3 | Tier 3 paragraph + line-window fallback | ✅ 머지 (v0.15.0) | Design: [2026-05-15-kebab-code-ingest-design.md](../../docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md) diff --git 
a/tasks/p10/p10-3-tier3-paragraph-fallback.md b/tasks/p10/p10-3-tier3-paragraph-fallback.md new file mode 100644 index 0000000..675d7fa --- /dev/null +++ b/tasks/p10/p10-3-tier3-paragraph-fallback.md @@ -0,0 +1,116 @@ +# p10-3 — Tier 3 paragraph + line-window fallback chunker + +**Status:** 🟡 진행 중 +**Contract sections:** §3.3 (chunker_version `code-text-paragraph-v1`), §3.5 (code_lang routing — `shell` 활성화 + "미지원 / Tier 3 fallback" 명확화), §6.2 (`kebab-chunk/src/code_text_paragraph_v1.rs`), §6.3 (`tier2_shared::build_chunk` 의 `pub(crate)` 노출), §9.3 (Tier 3 정의), §10.1 (deactivation log 한 줄). +**Design:** [2026-05-15-kebab-code-ingest-design.md](../../docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md) §1.3 (Phase 3) + §9.3. +**Plan:** [2026-05-20-p10-3-tier3-paragraph-fallback.md](../../docs/superpowers/plans/2026-05-20-p10-3-tier3-paragraph-fallback.md). + +## Goal + +p10-1A-2 / 1B / 1C / 1A-1 의 framework + p10-2 Tier 2 인프라 위에 Tier 3 paragraph fallback chunker 활성화. 단일 PR. 머지 시점부터: + +- `.sh` / `.bash` / `.zsh` 파일이 paragraph 단위로 색인. +- p10-2 의 비-k8s YAML / invalid YAML / Tier 1 AST extractor 실패 등 0-chunk 결과가 자동으로 Tier 3 로 fallback 되어 색인 — 이전에 skip 되던 파일이 search 가능. + +## 동결된 설계 결정 (이 task 로 확정) + +### chunker (`code-text-paragraph-v1`) + +- **Input**: `Document` with single `Block::Code { text, lang, ... }`. Tier 2 의 `synthesize_tier2_document` 와 동일한 모양 — fallback wrapper 가 같은 doc 재사용. +- **VERSION_LABEL**: `"code-text-paragraph-v1"`. +- **Paragraph 분할**: `text.lines()` 순회. 빈 줄 (정확히 빈 줄 또는 only-whitespace) 을 paragraph boundary 로. 빈 줄 자체는 어느 paragraph 에도 포함되지 않음 (chunk 의 line range 에 미포함). 빈 paragraph (전부 whitespace) skip. +- **Paragraph 크기 룰** (design §9.3 default 그대로, hardcoded): + - paragraph line count ≤ 80 → 1 chunk emit. + - paragraph line count > 80 → line-window split with window size 80 / overlap 20 (stride 60). 즉 line 1-80, 61-140, 121-200, … 마지막 window 는 EOF 까지 (≤ 80 lines). 
+ - `FALLBACK_LINES_PER_CHUNK = 80`, `FALLBACK_LINES_OVERLAP = 20` 둘 다 hardcoded constants (1A-2 의 `AST_CHUNK_MAX_LINES = 200` 패턴 그대로 — 사용자 config 노출 안 함, 미래 HOTFIXES 시 노출 검토). +- **Citation**: `SourceSpan::Code { line_start, line_end, symbol: None, lang: <input lang> }`. `symbol = None` 통일 (Tier 3 는 의미 단위 식별 안 함). `lang` 은 입력 Document 의 `Block::Code.lang` 그대로 보존 — shell → `"shell"`, k8s skip → `"yaml"`, Rust extractor 실패 → `"rust"` 등. +- **chunk_id 충돌 방지**: 동일 paragraph 의 line-window split 시 `id_for_chunk` 의 `split_key` 에 `window_start` 전달 (Tier 2 `#L{k}` 패턴 동일). +- **Edge cases**: + - 전체 파일이 빈 줄만 → 0 chunk emit (fallback 의 fallback 없음). `tracing::warn!`. + - 단일 paragraph + ≤ 80 lines → 1 chunk, line range 1..N. + - 빈 줄 없는 거대 파일 (한 paragraph 전체) → line-window split. + +### Routing / fallback wrapper + +- **`code_lang_for_path`** 변경 없음 (shell 매핑은 1A-1 시점부터 이미 존재). +- **`ingest_one_code_asset` allowlist** (`crates/kebab-app/src/lib.rs:953`) 에 `"shell"` 추가. +- **4-arm match (parser_version / chunker_version / extract / chunks)** 에 `"shell"` arm 추가: + - parser_version = `"none-v1"` (Tier 2 sentinel 재사용). + - chunker_version = `CodeTextParagraphV1Chunker.chunker_version()`. + - extract = `synthesize_tier2_document(asset, &bytes, "shell", &parser_version)?` (재사용). + - chunks = `CodeTextParagraphV1Chunker.chunk(&canonical, chunk_policy)?`. +- **Fallback wrapper** (핵심 신규 로직) — chunks match 직후 후처리: + - Tier 1/2 lang 의 결과가 `Err(_)` 또는 `Ok(empty_vec)` 이면 Tier 3 retry. + - retry 시: + - `chunker_version` 을 `code-text-paragraph-v1` 로 swap (downstream stamping 정확성). + - `canonical.parser_version` 도 `"none-v1"` 로 swap (Tier 1 의 `RUST_PARSER_VERSION` 등이 misleading 하므로). + - `CodeTextParagraphV1Chunker.chunk(&canonical, chunk_policy)` 실행. + - 실패 사유는 `tracing::warn!("tier1/2 emitted 0 chunks or errored for {workspace_path} ({code_lang}); falling back to tier 3")`. +- **Tier 3 자체가 0 chunk 또는 Err** 인 경우는 그대로 fail/skip (fallback 의 fallback 없음). 
+ +### `tier2_shared::build_chunk` 노출 + +- 현재 module-private `fn build_chunk`. Tier 3 가 동일 Chunk 생성 (hash / token / policy_hash 일관) 을 위해 호출 — `pub(crate) fn build_chunk(...)` 으로 visibility 만 변경. signature 동일. + +### Lang 보존 정책 + +- Tier 3 chunk 의 `Citation::Code.lang` = 입력 Document 의 `Block::Code.lang` 그대로. 명시적으로 표: + | Source | input lang | Tier 3 output lang | + |--------|-----------|----------| + | shell direct | `"shell"` | `"shell"` | + | k8s 0-chunk fallback | `"yaml"` | `"yaml"` | + | Rust AST 실패 fallback | `"rust"` | `"rust"` | + | manifest 0-chunk (이론상, 거의 발생 안 함) | `"toml"` 등 | 유지 | +- 검색 시 `--code-lang shell` / `--code-lang yaml` 등이 fallback chunk 도 매칭 — search filter 동작 자연. + +### Non-scope + +- **미지원 확장자 wiring**: `.txt` / `.log` / `.scala` / `.rb` 등은 본 PR scope 밖. `code_lang_for_path` 의 매핑은 unchanged. Tier 3 chunker 자체는 만들어두고, 미래에 `code_lang_for_path` 에 새 lang 추가 시 자동 picked up (1A-2 패턴). +- **config 노출**: `FALLBACK_LINES_PER_CHUNK` / `FALLBACK_LINES_OVERLAP` hardcoded. config.toml 노출 없음. + +### Frozen design 갱신 + +- `docs/superpowers/specs/2026-05-15-kebab-code-ingest-design.md` §10.1 활성화 로그 한 줄. +- `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` §10 activation log 한 줄. +- §3.5 의 "미지원 / Tier 3 fallback → null" 표현은 그대로 유지 (해당 표현이 본 phase 의 정확한 의미 — Tier 3 chunk 의 lang 은 입력 lang 보존이므로 "null" 은 미지원 확장자 wire 시 적용). + +## Acceptance criteria + +- `cargo test --workspace --no-fail-fast -j 1` PASS (memory-conscious `-j 1`). +- `cargo clippy --workspace --all-targets -- -D warnings` clean. +- 4 신규 unit test in `crates/kebab-chunk/tests/code_text_paragraph_v1.rs`: + - `shell_multi_paragraph_splits_on_blank_lines` — 3-paragraph fixture → 3 chunk, symbol=None, lang=shell, contiguous (exclusive of blank lines). + - `single_long_paragraph_line_window_split` — 200+ line single paragraph → window split, distinct chunk_ids, expected line ranges (1-80, 61-140, 121-200, …). + - `empty_file_emits_zero_chunks` — 빈 텍스트 → `Ok(vec![])`. 
+ - `lang_field_preserved_from_input_doc` — lang=yaml 입력 → emit chunk lang=yaml. +- 2 신규 integration test in `crates/kebab-app/tests/code_ingest_smoke.rs`: + - `tier3_shell_ingest_searchable` — `.sh` 파일 ingest → `--code-lang shell` 검색 → `Citation::Code { symbol: None, lang: "shell" }`, `chunker_version: "code-text-paragraph-v1"`. + - `tier3_yaml_fallback_picks_up_non_k8s_yaml` — apiVersion+kind 없는 yaml ingest → fallback 발동 → `Citation::Code { symbol: None, lang: "yaml" }`, chunker_version `code-text-paragraph-v1`. +- 기존 12 smoke test + 2 신규 = 14 testing surface. (Tier 1 9 + Tier 2 3 + Tier 3 2.) +- `kebab schema --json | jq .stats.code_lang_breakdown` 에 `"shell"` 카운트 등장 (.sh 파일 ingest 후). 비-k8s YAML 도 `"yaml"` 카운트에 누적 (Tier 2 와 Tier 3 가 같은 lang). +- README + HANDOFF + docs/ARCHITECTURE + docs/SMOKE + tasks/INDEX + tasks/p10/INDEX 갱신. +- frozen design §10.1 + §10 activation log 한 줄씩. +- workspace `Cargo.toml` minor bump (0.14.0 → 0.15.0), gitea-release v0.15.0. + +## Allowed dependencies + +- `kebab-chunk` 의 새 모듈 `code_text_paragraph_v1.rs` — kebab-core + anyhow + tracing. tier2_shared 의 `build_chunk` 호출 (visibility `pub(crate)` 로 노출). tree-sitter / serde_yaml 비사용. +- `kebab-app::ingest_one_code_asset` — 4-arm match + allowlist + fallback wrapper 확장. 새 crate dep 없음. +- `kebab-parse-code` — 변경 없음 (lang.rs 의 shell 매핑은 1A-1 부터 존재). +- `kebab-source-fs` — 변경 없음 (media.rs 이미 `code_lang_for_path` 위임). + +## Forbidden dependencies + +- `kebab-chunk` 가 store / embed / llm / rag / tree-sitter 직접 import 금지 (boundary §6.3 유지). +- UI crate (`kebab-cli` / `kebab-mcp` / `kebab-tui` / `kebab-desktop`) 가 `kebab-parse-code` / `kebab-chunk` 직접 import 금지 — `kebab-app` facade 만. + +## Risks / notes + +- **Fallback infinite loop 방지**: Tier 3 자체가 0 chunk 또는 Err 인 경우는 그대로 fail/skip — fallback 의 fallback 없음. 명시 spec. +- **chunker_version swap 시 `try_skip_unchanged` 일관성**: fallback 발동 후 stored chunker_version = `code-text-paragraph-v1`. 
다음 ingest 에 동일 파일 → 동일 chunker_version 으로 lookup 매칭 (skip 동작 OK). Tier 1 chunker 가 미래에 작동하기 시작하면 (예: tree-sitter grammar fix) cascade rule 로 incremental cache miss → 자동 reprocess 가 정상 동작. +- **lang 보존 vs fallback 의미**: fallback chunk 의 lang 이 원본 lang 유지라 search filter `--code-lang yaml` 가 Tier 2 와 Tier 3 chunk 둘 다 매칭. 의도된 동작 — 사용자가 "yaml 파일 검색" 했을 때 모든 yaml 결과 표시. +- **line-window overlap 의미**: 80/20 (stride 60) 은 design §9.3 default. 거대 paragraph (예: minified JSON 한 줄) 의 경우에도 동일 알고리즘 — 단 한 줄 = 한 line 이라 split 발생 안 함 (length 80 lines 기준). minified 의 경우 chunk 한 개에 매우 긴 텍스트가 들어가는데 이는 paragraph 분할 정책의 inherent limitation. 미래 HOTFIXES 검토. +- **빈 줄 처리**: `^\s*$` 매칭 (whitespace-only) 줄을 paragraph boundary 로. 탭만 있는 줄 / CR-only 줄 등 edge case fixture 로 검증. +- **shell line-comment 처리**: shell script 의 `# comment` 줄은 일반 line. paragraph 분할에 영향 없음 (빈 줄 아님). chunk 안에 그대로 보존. +- **fallback wrapper 의 `canonical.parser_version` mutation**: Document 의 parser_version 을 Tier 3 fallback 시 `"none-v1"` 로 swap. CanonicalDocument 가 `mut` 로 받아져야 함. 이미 `let mut canonical = match ...` 이라 mut 가능. plan 단계 검증. +- **머지 후 deviation** 은 `tasks/HOTFIXES.md` dated 로그 + 본 spec `Risks / notes` cross-link.