# Cargo workspace manifest: member crates, shared package metadata,
# workspace-wide Clippy lint policy, pinned dependency versions, and
# dev/test profile tweaks.

[workspace]
resolver = "3"
members = [
    "crates/kebab-core",
    "crates/kebab-config",
    "crates/kebab-source-fs",
    "crates/kebab-parse-md",
    "crates/kebab-chunk",
    "crates/kebab-store-sqlite",
    "crates/kebab-store-vector",
    "crates/kebab-search",
    "crates/kebab-embed",
    "crates/kebab-embed-local",
    "crates/kebab-embed-candle",
    "crates/kebab-embed-ollama",
    "crates/kebab-llm",
    "crates/kebab-llm-local",
    "crates/kebab-rag",
    "crates/kebab-app",
    "crates/kebab-cli",
    "crates/kebab-eval",
    "crates/kebab-parse-image",
    "crates/kebab-parse-pdf",
    "crates/kebab-tui",
    "crates/kebab-mcp",
    "crates/kebab-parse-code",
    "crates/kebab-nli",
]

[workspace.package]
edition = "2024"
rust-version = "1.85"
license = "MIT OR Apache-2.0"
repository = "https://github.com/altair823/kebab"
# v0.27.0 — PP-OCRv5 ONNX Rust-native OCR engine:
# `[image.ocr] engine = "paddle-onnx"` (the default is still "ollama-vision")
# runs detection + recognition in-process (`ort` =2.0.0-rc.9, no Python
# runtime). DBNet det + CTC rec; post-processing (min-area rect / unclip) is
# pure Rust. e2e CER 0.005 (synthetic Korean/English, better than the PoC's
# 0.024); large pages take <4 s on CPU (vs ~50 s for Ollama vision).
# New config keys `det_model`/`rec_model`/`dict`/`score_thresh`/
# `unclip_ratio`/`max_boxes` + `KEBAB_IMAGE_OCR_*` env vars. The ingest
# signature `|ocr:1:{engine}:{engine_version}` triggers automatic re-indexing
# when the engine or model changes. New interface (engine value / config
# keys) → minor bump. — CLAUDE.md §Release
version = "0.27.0"

# pre-v0.18 workspace-wide cleanup: enable clippy::pedantic group with
# intentional allow-list. The allowed lints are either cosmetic (doc style),
# informational (function size), or carry intentional truncation we accept
# (numeric casts in tokenizer/ONNX inputs, hash modular reduction, etc).
[workspace.lints.clippy]
pedantic = { level = "warn", priority = -1 }
# Intentional u32 ↔ i64 casts in kebab-nli (ONNX i64 inputs from tokenizer u32 ids).
# u64 ↔ usize across kebab-store-sqlite row counts. Wide truncation is auditable
# at use site, not lint-wide.
cast_possible_truncation = "allow"
cast_possible_wrap = "allow"
cast_sign_loss = "allow"
cast_precision_loss = "allow"
# Doc markdown style is cosmetic; we run rustdoc on demand.
doc_markdown = "allow"
missing_errors_doc = "allow"
missing_panics_doc = "allow"
# Informational only — splitting a long pipeline function isn't always cleaner.
too_many_lines = "allow"
# `Foo::default()` is concise and idiomatic here; `<Foo as Default>::default()`
# adds noise without surfacing intent.
default_trait_access = "allow"
# Module name prefix on public items keeps the wire/log surface readable
# (`refusal_reason::no_chunks` etc).
module_name_repetitions = "allow"
# We use `#[must_use]` deliberately on public results, not blanket.
must_use_candidate = "allow"
# `String` arg sometimes signals "I'll consume this" — let signature decide.
needless_pass_by_value = "allow"
# Idiomatic single-line bindings stay; let-else expansion isn't always clearer.
manual_let_else = "allow"
# `use` after `let` is a common kebab pattern (scoped imports next to use site).
items_after_statements = "allow"
# Naming pairs like `chunk_id` / `chunks_id` are intentional domain terms.
similar_names = "allow"
# `iter.map(format!).collect::<String>()` is idiomatic when the per-element
# string is genuinely independent — `fold` only wins on accumulation patterns.
format_collect = "allow"
# Exhaustive `match` with explicit variant arms (vs `_`) catches future
# variant additions at compile time (kebab core's `RefusalReason` pattern).
match_wildcard_for_single_variants = "allow"
# Copy types under `&self` keep call-site discipline; auto-deref noise > tiny perf gain.
trivially_copy_pass_by_ref = "allow"
# `unnecessary_wraps` flags helpers that could drop `Result`, but keeping the
# Result allows future error variants without churning callers.
unnecessary_wraps = "allow"
# NLI score / RRF fusion / similarity threshold comparisons are intentional —
# floats live in the `[0, 1]` band and are compared with explicit thresholds.
float_cmp = "allow"
# File-extension dispatch is keyed on ASCII conventions; case sensitivity
# is part of the spec for `.md`, `.pdf`, etc.
case_sensitive_file_extension_comparisons = "allow"
# Config / opts structs intentionally bundle boolean flags (ingest options,
# search modes, etc) — splitting them into enums would obscure the wire shape.
struct_excessive_bools = "allow"
# `bytecount` crate would be a new dep just for one-off ASCII counts.
naive_bytecount = "allow"
# `#[ignore]` annotations on tests document via the test name + nearby comment.
ignore_without_reason = "allow"
# `format!` push patterns are a hot path for kebab-tui's progressive rendering;
# `write!` rewrite needs a verified-equal benchmark before swapping.
format_push_string = "allow"
# Builder-style `with_*` methods return `Self`; the existing `#[must_use]`
# discipline lives on aggregate constructors, not every chainable setter.
return_self_not_must_use = "allow"
# Match arms grouped by side-effect over body equality (e.g. snake_case wire
# label tables) — fanning them out keeps adding a new variant trivial.
match_same_arms = "allow"
# Remaining style-only warnings: trailing `continue` is sometimes clearer than
# rewriting, `_x` underscored bindings document intent at the use site, and
# `!(a == b)` reads better than `a != b` when paired with a complementary check.
needless_continue = "allow"
used_underscore_binding = "allow"
nonminimal_bool = "allow"
# Other one-off cosmetic items: large literal formatting, doc link quoting,
# `Clone::clone_from` swap, `str::replace` chaining, `Iterator::any` ergonomics.
unreadable_literal = "allow"
many_single_char_names = "allow"
doc_link_with_quotes = "allow"
assigning_clones = "allow"
collapsible_str_replace = "allow"
trivial_regex = "allow"
elidable_lifetime_names = "allow"
range_plus_one = "allow"
explicit_iter_loop = "allow"
implicit_hasher = "allow"
ref_option = "allow"

[workspace.dependencies]
anyhow = "1"
thiserror = "2"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
# Golden-fixture loader (P5-1, kebab-eval) parses YAML; pinned in the
# workspace so future eval-adjacent crates share the same major.
serde_yaml = "0.9"
time = { version = "0.3", features = ["serde", "macros", "formatting", "parsing"] }
uuid = { version = "1", features = ["v7", "serde"] }
blake3 = "1"
tracing = "0.1"
# `bundled` ships SQLite source so the workspace doesn't depend on a
# system libsqlite3 (matches the kebab-store-sqlite feature set).
rusqlite = { version = "0.32", features = ["bundled"] }
globset = "0.4"
tempfile = "3"
proptest = "1"
# p9-fb-19: LRU cache for `App::search` results. Bounded capacity
# from `config.search.cache_capacity` (default 256, ~1.3 MB cap).
lru = "0.12"
lopdf = "0.32"
# fastembed-rs ships ONNX runtime via the `ort-download-binaries` feature
# in its default set (which also pulls `hf-hub` for first-run model
# downloads). Pinned to the 4.x line per task p3-2 (current 5.x release
# remains untested for this workspace).
fastembed = "4.9"
# LanceDB embedded vector store (P3-3). 0.23.x pulls arrow / arrow-array /
# arrow-schema 56.x transitively (via lance 1.0); the kebab-store-vector
# crate matches that major to share the same Arrow types without a
# re-export adapter.
lancedb = { version = "0.23", default-features = false }
arrow = "56"
arrow-array = "56"
arrow-schema = "56"
tokio = { version = "1", features = ["rt", "macros"] }
futures = "0.3"
# Strict citation-marker extraction in kebab-rag (P4-3) needs a single regex
# pass; pulled into the workspace deps so future crates can share the
# same major.
regex = "1"
# MCP (Model Context Protocol) SDK. server + macros + transport-io provide
# stdio JSON-RPC transport for `kebab-mcp` (p9-fb-30). schemars feature
# exposes the derive macro used by tool input schemas.
rmcp = { version = "1.6", default-features = false, features = ["server", "macros", "transport-io", "schemars"] }
# Dev-only HTTP mock server for kebab-llm-local Ollama adapter tests. Requires
# a tokio runtime to host its mock server (the runtime adapter crate stays
# sync via reqwest::blocking — wiremock is dev-only there).
wiremock = "0.6"
base64 = "0.22"
# Pure-Rust git library for repo metadata detection (kebab-parse-code).
# No `git` binary required. Default features are disabled here and only
# `revision` is enabled; it is used for HEAD name + commit SHA queries.
# NOTE(review): this comment previously said the *default* feature set
# (thread-safety + most object-reading capabilities) covers those queries,
# which contradicts `default-features = false` — confirm `revision` alone
# is sufficient.
gix = { version = "0.70", default-features = false, features = ["revision"] }
# Rust source parsing for code ingest (kebab-parse-code, p10-1A-2). The
# chunker stays tree-sitter-free — AST work is parser-side per design §6.3.
tree-sitter = "0.26"
tree-sitter-rust = "0.24"
# Python / TS / JS grammars for code ingest (kebab-parse-code, p10-1B).
tree-sitter-python = "0.25.0"
tree-sitter-typescript = "0.23.2"
tree-sitter-javascript = "0.25.0"
# Go grammar for code ingest (kebab-parse-code, p10-1C-Go).
tree-sitter-go = "0.25.0"
# JVM family grammars for code ingest (kebab-parse-code, p10-1C-JK).
tree-sitter-java = "0.23.5"
# bare tree-sitter-kotlin requires ts <0.23; -ng uses tree-sitter-language 0.1
# (ts 0.26 compat)
tree-sitter-kotlin-ng = "1.1.0"
# C/C++ family grammars for code ingest (kebab-parse-code, p10-1D).
tree-sitter-c = "0.24.2"
tree-sitter-cpp = "0.23.4"
# fb-41 PR-9 (kebab-nli): mDeBERTa-v3 XNLI verifier deps. Versions match
# the fastembed 4.9 transitive set so the ONNX Runtime + tokenizer stack
# stays single-versioned across the workspace. ort `default-features=false`
# drops the bundled binary downloader (fastembed already provides one);
# tokenizers `default-features=false, onig` swaps the default `esaxx` regex
# backend for `onig` so the build doesn't need libstdc++ headers (verified
# via PR-9a pre-flight: SentencePiece tokenizer.json loads + KR/EN encode).
# hf-hub uses `ureq + rustls-tls` to stay aligned with kebab-embed-local's
# pure-Rust TLS stack.
ort = { version = "=2.0.0-rc.9", default-features = false, features = ["ndarray"] }
tokenizers = { version = "0.21", default-features = false, features = ["onig"] }
hf-hub = { version = "0.4", default-features = false, features = ["ureq", "rustls-tls"] }
ndarray = "0.16"
# Korean morphological tokenizer (FTS v0.20.x, §6.1). lindera-ko-dic bundles
# the KO-DIC dictionary as an embedded blob via the embed-ko-dic feature.
lindera = "3"
lindera-ko-dic = "3"

# Disk-footprint trim for dev / test builds. Codegen, opt-level, and
# behavior are unchanged — only DWARF debug info is reduced (line
# numbers kept, column numbers dropped) and split into separate
# `.dwo` files. backtrace stays useful (function + line). release
# profile is untouched, so CI / `--release` runs are byte-identical
# to upstream defaults.
[profile.dev]
debug = "line-tables-only"
split-debuginfo = "unpacked"

[profile.test]
debug = "line-tables-only"
split-debuginfo = "unpacked"