Compare commits

...

6 Commits

Author SHA1 Message Date
th-kim0823
d5321701ea plan(fb-39b): embedding upgrade implementation plan
5 tasks: kebab-embed-local resolve_model arm + check_dim test,
kebab-config defaults + TOML template flip, cross-crate fixture
sweep (likely no-op since most tests use provider=none), docs
(design + HOTFIXES + new task spec + INDEX), README + SMOKE
walkthrough.

Post-merge: 0.6 → 0.7 binary bump per CLAUDE.md cascade rule.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-10 23:02:37 +09:00
th-kim0823
2c3461c465 spec(fb-39b): embedding model upgrade design
- multilingual-e5-small (384 dim) → multilingual-e5-large (1024 dim)
- Cascade: embedding_version bump → fb-23 incremental ingest
  re-embeds all chunks
- Migration policy: dim mismatch detection at LanceVectorStore::open
  → error.v1 (code = embedding_dim_mismatch) + hint
  "kebab reset --vector-only && kebab ingest"
- Config defaults flip (model + dimensions). User TOML pinning small
  preserves backwards-compat
- bge-m3 deferred (fastembed enum 미포함, UserDefinedEmbeddingModel
  ONNX path 별도)
- Release trigger: 0.6 → 0.7 minor bump per CLAUDE.md cascade rule

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-10 22:59:03 +09:00
240120ee80 Merge pull request 'feat(fb-39): eval foundation — precision_at_k_chunk metric' (#136) from feat/fb-39-eval-foundation into main
Reviewed-on: #136
2026-05-10 13:41:04 +00:00
th-kim0823
5870a1de15 fix(fb-39): address PR #136 round 1 review
kebab eval compare now surfaces precision_at_k_chunk delta in both
human-readable table + deltas JSON. Snapshot fixture regenerated
additively.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-10 22:39:11 +09:00
th-kim0823
f00fb376fe docs(fb-39): golden header + design §10.3 eval + spec status + INDEX
Strengthen fixtures/golden_queries.yaml header with precision_at_k_chunk
explanation + measurement guidance. Add §10.3 Eval metrics section to
frozen design documenting retrieval metrics (hit@k, MRR, recall@k_doc,
P@k_chunk) + groundedness metrics. Flip p9-fb-39 spec status from open
→ completed (eval foundation only, lever deferral noted). Update
tasks/INDEX.md fb-39 row mirror to fb-42 (merged, deferred note).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-10 22:35:15 +09:00
th-kim0823
bb0ec0469f feat(eval): precision_at_k_chunk metric (P@5, P@10) (fb-39) 2026-05-10 22:26:21 +09:00
10 changed files with 817 additions and 9 deletions

View File

@@ -184,6 +184,18 @@ pub fn render_report_md(report: &CompareReport) -> String {
),
);
}
for k in crate::metrics::TOP_K_VARIANTS {
let _ = writeln!(
out,
"| precision@{k}_chunk | {} | {} | {} |",
fmt(a.precision_at_k_chunk.get(k).copied().unwrap_or(f32::NAN)),
fmt(b.precision_at_k_chunk.get(k).copied().unwrap_or(f32::NAN)),
fmt_delta(
a.precision_at_k_chunk.get(k).copied().unwrap_or(f32::NAN),
b.precision_at_k_chunk.get(k).copied().unwrap_or(f32::NAN),
),
);
}
let _ = writeln!(
out,
"| citation_coverage | {} | {} | {} |",
@@ -419,6 +431,7 @@ fn build_deltas(
}
let mut hit = serde_json::Map::new();
let mut recall = serde_json::Map::new();
let mut precision = serde_json::Map::new();
for k in crate::metrics::TOP_K_VARIANTS {
hit.insert(
k.to_string(),
@@ -434,11 +447,19 @@ fn build_deltas(
b.recall_at_k_doc.get(k).copied().unwrap_or(f32::NAN),
),
);
precision.insert(
k.to_string(),
d(
a.precision_at_k_chunk.get(k).copied().unwrap_or(f32::NAN),
b.precision_at_k_chunk.get(k).copied().unwrap_or(f32::NAN),
),
);
}
serde_json::json!({
"hit_at_k": hit,
"mrr": d(a.mrr, b.mrr),
"recall_at_k_doc": recall,
"precision_at_k_chunk": precision,
"citation_coverage": d(a.citation_coverage, b.citation_coverage),
"groundedness": d(a.groundedness, b.groundedness),
"empty_result_rate": d(a.empty_result_rate, b.empty_result_rate),
@@ -484,6 +505,7 @@ mod tests {
hit_at_k: Default::default(),
mrr: 0.5,
recall_at_k_doc: Default::default(),
precision_at_k_chunk: Default::default(),
citation_coverage: f32::NAN,
groundedness: 0.0,
empty_result_rate: 0.0,

View File

@@ -58,6 +58,14 @@ pub struct AggregateMetrics {
pub hit_at_k: BTreeMap<u32, f32>,
pub mrr: f32,
pub recall_at_k_doc: BTreeMap<u32, f32>,
/// p9-fb-39: chunk-level precision at k. Binary relevance via
/// `expected_chunk_ids` (a hit is "relevant" if its chunk_id is
/// in the golden's `expected_chunk_ids`). Denominator is k (fixed)
/// — `hits.len() < k` still divides by k, treating shortfall as
/// precision loss (mirrors `hit_at_k`). Queries with empty
/// `expected_chunk_ids` are skipped (mirrors `hit_at_k_chunk`).
#[serde(default)]
pub precision_at_k_chunk: BTreeMap<u32, f32>,
#[serde(
serialize_with = "serialize_f32_nan_as_null",
deserialize_with = "deserialize_f32_or_nan"
@@ -187,6 +195,8 @@ pub(crate) fn aggregate_from_rows(
TOP_K_VARIANTS.iter().map(|k| (*k, (0_u32, 0_u32))).collect();
let mut recall_at_k_doc: BTreeMap<u32, (f64, u32)> =
TOP_K_VARIANTS.iter().map(|k| (*k, (0.0_f64, 0_u32))).collect();
let mut precision_at_k_chunk: BTreeMap<u32, (f64, u32)> =
TOP_K_VARIANTS.iter().map(|k| (*k, (0.0_f64, 0_u32))).collect();
let mut mrr_sum: f64 = 0.0;
let mut mrr_denom: u32 = 0;
@@ -243,6 +253,18 @@ pub(crate) fn aggregate_from_rows(
{
mrr_sum += 1.0 / f64::from(rank);
}
// p9-fb-39: precision@k_chunk — count of top-k hits whose
// chunk_id is in `expected`, divided by k (fixed denominator).
for k in TOP_K_VARIANTS {
let hits_in_topk_relevant = qr
.hits_top_k
.iter()
.filter(|h| h.rank <= *k && expected.contains(&h.chunk_id))
.count();
let entry = precision_at_k_chunk.get_mut(k).expect("init");
entry.0 += hits_in_topk_relevant as f64 / f64::from(*k);
entry.1 += 1;
}
}
// recall@k_doc (doc-level, requires non-empty expected_doc_ids
@@ -333,6 +355,7 @@ pub(crate) fn aggregate_from_rows(
mrr_sum / f64::from(mrr_denom)
}),
recall_at_k_doc: round_recall_map(&recall_at_k_doc),
precision_at_k_chunk: round_recall_map(&precision_at_k_chunk),
citation_coverage: ratio_or_nan(citation_num, citation_denom),
groundedness: ratio_or_zero(groundedness_num, groundedness_denom),
empty_result_rate: ratio_or_zero(empty_result_count, total_queries),
@@ -674,4 +697,114 @@ mod tests {
assert_eq!(agg.failed_queries, 1);
assert_eq!(agg.total_queries, 1);
}
#[test]
fn precision_at_k_chunk_field_default_empty_on_old_json() {
// Old eval_runs.metrics_json predates fb-39 — no precision_at_k_chunk field.
// serde(default) yields empty BTreeMap.
let old = serde_json::json!({
"hit_at_k": {"1": 0.5, "3": 0.5, "5": 0.5, "10": 0.5},
"mrr": 0.5,
"recall_at_k_doc": {"1": 0.0, "3": 0.0, "5": 0.0, "10": 0.0},
"citation_coverage": null,
"groundedness": 0.0,
"empty_result_rate": 0.0,
"refusal_correctness": null,
"total_queries": 1,
"failed_queries": 0
});
let parsed: AggregateMetrics =
serde_json::from_value(old).expect("backwards-compat deserialize");
assert!(parsed.precision_at_k_chunk.is_empty());
}
#[test]
fn precision_at_k_chunk_exact_match() {
// expected = [c1, c2, c3]. Top-5 hits: [c1@1, c2@2, c3@3, x@4, y@5].
// P@5 = 3/5 = 0.6. P@10 = 3/10 = 0.3.
let queries = vec![gq("q1", &["c1", "c2", "c3"], &["d1"])];
let rows = vec![record(
"q1",
vec![
hit(1, "c1", "d1"),
hit(2, "c2", "d1"),
hit(3, "c3", "d1"),
hit(4, "x", "d1"),
hit(5, "y", "d1"),
],
None,
None,
)];
let agg = aggregate_from_rows(&queries, &rows).unwrap();
assert_eq!(agg.precision_at_k_chunk[&5], 0.6);
assert_eq!(agg.precision_at_k_chunk[&10], 0.3);
}
#[test]
fn precision_at_k_chunk_partial_topk_divides_by_k() {
// expected = [c1, c2]. Hits: only [c1@1, c2@2, x@3] (3 results).
// P@5 = 2/5 = 0.4 (denominator is k, not hits.len()).
let queries = vec![gq("q1", &["c1", "c2"], &["d1"])];
let rows = vec![record(
"q1",
vec![hit(1, "c1", "d1"), hit(2, "c2", "d1"), hit(3, "x", "d1")],
None,
None,
)];
let agg = aggregate_from_rows(&queries, &rows).unwrap();
assert_eq!(agg.precision_at_k_chunk[&5], 0.4);
assert_eq!(agg.precision_at_k_chunk[&10], 0.2);
}
#[test]
fn precision_at_k_chunk_zero_relevant_in_topk() {
// expected = [c1]. Hits: [x@1, y@2, z@3] (none relevant).
// P@5 = 0/5 = 0.0.
let queries = vec![gq("q1", &["c1"], &["d1"])];
let rows = vec![record(
"q1",
vec![hit(1, "x", "d1"), hit(2, "y", "d1"), hit(3, "z", "d1")],
None,
None,
)];
let agg = aggregate_from_rows(&queries, &rows).unwrap();
assert_eq!(agg.precision_at_k_chunk[&5], 0.0);
}
#[test]
fn precision_at_k_chunk_empty_expected_skipped() {
// expected_chunk_ids = []. Skipped → final BTreeMap entry value = 0.0
// (zero-denom path in round_recall_map). Mirrors recall_at_k_doc behavior.
let queries = vec![gq("q1", &[], &["d1"])];
let rows = vec![record("q1", vec![hit(1, "c1", "d1")], None, None)];
let agg = aggregate_from_rows(&queries, &rows).unwrap();
assert_eq!(agg.precision_at_k_chunk[&5], 0.0);
}
#[test]
fn precision_at_k_chunk_two_queries_averaged() {
// q1: expected=[c1], hits=[c1@1, x@2, y@3] → P@5 = 1/5 = 0.2
// q2: expected=[c1, c2], hits=[c1@1, c2@2] → P@5 = 2/5 = 0.4
// Avg P@5 = 0.3.
let queries = vec![
gq("q1", &["c1"], &["d1"]),
gq("q2", &["c1", "c2"], &["d2"]),
];
let rows = vec![
record(
"q1",
vec![hit(1, "c1", "d1"), hit(2, "x", "d1"), hit(3, "y", "d1")],
None,
None,
),
record(
"q2",
vec![hit(1, "c1", "d2"), hit(2, "c2", "d2")],
None,
None,
),
];
let agg = aggregate_from_rows(&queries, &rows).unwrap();
assert_eq!(agg.precision_at_k_chunk[&5], 0.3);
}
}

View File

@@ -11,6 +11,12 @@
"5": 0.666700005531311
},
"mrr": 0.41670000553131104,
"precision_at_k_chunk": {
"1": 0.33329999446868896,
"10": 0.06669999659061432,
"3": 0.11110000312328339,
"5": 0.13330000638961792
},
"recall_at_k_doc": {
"1": 0.33329999446868896,
"10": 0.666700005531311,
@@ -32,6 +38,12 @@
"5": 1.0
},
"mrr": 0.833299994468689,
"precision_at_k_chunk": {
"1": 0.666700005531311,
"10": 0.10000000149011612,
"3": 0.33329999446868896,
"5": 0.20000000298023224
},
"recall_at_k_doc": {
"1": 0.666700005531311,
"10": 1.0,
@@ -53,6 +65,12 @@
"5": 0.33329999446868896
},
"mrr": 0.41659998893737793,
"precision_at_k_chunk": {
"1": 0.33340001106262207,
"10": 0.0333000048995018,
"3": 0.22219999134540558,
"5": 0.06669999659061432
},
"recall_at_k_doc": {
"1": 0.33340001106262207,
"10": 0.33329999446868896,

View File

@@ -203,6 +203,7 @@ fn store_aggregate_rejects_missing_run() {
hit_at_k: Default::default(),
mrr: 0.0,
recall_at_k_doc: Default::default(),
precision_at_k_chunk: Default::default(),
citation_coverage: f32::NAN,
groundedness: 0.0,
empty_result_rate: 0.0,

View File

@@ -0,0 +1,405 @@
# fb-39b Embedding Model Upgrade Implementation Plan
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
**Goal:** Upgrade default embedding model from `multilingual-e5-small` (384 dim) to `multilingual-e5-large` (1024 dim) so retrieval precision can improve on Korean dogfooding corpus. Existing user TOMLs pinning `multilingual-e5-small` keep working unchanged.
**Architecture:** Three-line code surface: a new arm in `kebab-embed-local::resolve_model`, defaults flipped in `kebab-config::Config::defaults` (and the TOML template), and the existing test asserting the 384 default updated. LanceDB tables are already namespaced by `(model, dim)` so an upgraded model writes to a fresh table; fb-23 incremental ingest detects the `embedding_version` mismatch and auto-re-embeds on next ingest. No migration tooling — orphan old-model tables cleaned via `kebab reset --vector-only`.
**Tech Stack:** Rust 2024, fastembed 4.9.1 (`MultilingualE5Large` enum already shipped), LanceDB.
**Spec:** `docs/superpowers/specs/2026-05-10-p9-fb-39b-embedding-upgrade-design.md`
---
## File map
**Modify:**
- `crates/kebab-embed-local/src/lib.rs` — add `multilingual-e5-large` arm in `resolve_model`. Update or add `check_dim` test for 1024.
- `crates/kebab-config/src/lib.rs` — flip `Config::defaults().models.embedding.{model, dimensions}` and the TOML template at line ~952. Update default test at line 767.
- `README.md``[models.embedding]` section: mention new default + small opt-out + dim mismatch hint.
- `docs/SMOKE.md` — append "Embedding upgrade (fb-39b)" walkthrough showing the `kebab reset --vector-only && kebab ingest` sequence + first-run ONNX download warning.
- `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` §5 storage / §9 versioning — update default model + dim references.
- `tasks/HOTFIXES.md` — entry for embedding upgrade UX (orphan tables on model swap, reset --vector-only flow).
- `tasks/p9/p9-fb-39-retrieval-precision-tuning.md` banner — append note "fb-39b lever 적용 (embedding upgrade) ✅".
- `tasks/INDEX.md` — fb-39b row ✅ (new row alongside fb-39).
**Create:**
- `tasks/p9/p9-fb-39b-embedding-upgrade.md` — new task spec mirroring fb-39 frontmatter (status: completed, design + plan links).
---
## Task 1: Add multilingual-e5-large to kebab-embed-local
**Files:**
- Modify: `crates/kebab-embed-local/src/lib.rs`
- [ ] **Step 1: Append failing tests**
Find the existing `mod tests` (~line 230). Append:
```rust
#[test]
fn resolve_model_supports_e5_large() {
let m = resolve_model("multilingual-e5-large").expect("e5-large should resolve");
// The fastembed enum is non-comparable in some versions; we only need
// to confirm Ok and that the underlying TextEmbedding could be built.
// Avoid actually constructing the model in tests (1.3 GB ONNX download).
let _ = m;
}
#[test]
fn check_dim_passes_for_1024() {
check_dim(1024, 1024).expect("matching dims must pass");
}
#[test]
fn check_dim_rejects_384_vs_1024() {
let err = check_dim(384, 1024).expect_err("dim mismatch must error");
let msg = format!("{err}");
assert!(msg.contains("384") && msg.contains("1024"),
"error must mention both dims, got: {msg}");
}
```
- [ ] **Step 2: Run tests to confirm failures**
```bash
cargo test -p kebab-embed-local resolve_model_supports_e5_large
cargo test -p kebab-embed-local check_dim_passes_for_1024
```
Expected: `resolve_model_supports_e5_large` fails (no arm); `check_dim_*` passes already (helper is generic).
- [ ] **Step 3: Add arm to resolve_model**
In `crates/kebab-embed-local/src/lib.rs`, find `fn resolve_model` (~line 199). Replace the match body:
```rust
fn resolve_model(name: &str) -> Result<EmbeddingModel> {
match name {
"multilingual-e5-small" => Ok(EmbeddingModel::MultilingualE5Small),
"multilingual-e5-large" => Ok(EmbeddingModel::MultilingualE5Large),
other => anyhow::bail!(
"kb-embed-local: unsupported embedding model {other:?}; \
this adapter currently ships `multilingual-e5-small` and \
`multilingual-e5-large`. Add a new arm to `resolve_model` \
(and a fastembed feature flag if needed) to support more."
),
}
}
```
- [ ] **Step 4: Run tests — all pass**
```bash
cargo test -p kebab-embed-local
cargo clippy -p kebab-embed-local --all-targets -- -D warnings
```
- [ ] **Step 5: Commit**
```bash
git add crates/kebab-embed-local/src/lib.rs
git commit -m "feat(embed): add multilingual-e5-large arm to resolve_model (fb-39b)"
```
---
## Task 2: Flip kebab-config default to e5-large + 1024 dim
**Files:**
- Modify: `crates/kebab-config/src/lib.rs`
- [ ] **Step 1: Read existing default test + value sites**
```bash
grep -n "multilingual-e5-small\|dimensions: 384\|dimensions = 384\|default.*embedding" crates/kebab-config/src/lib.rs
```
Three sites to update:
- `Config::defaults()` body (~line 307): `dimensions: 384` and `model: "multilingual-e5-small"`.
- Default-assert test (~line 767): `assert_eq!(c.models.embedding.dimensions, 384)` and likely a sibling assertion on model.
- TOML template at ~line 952: `dimensions = 384` (and likely `model = "multilingual-e5-small"`).
- [ ] **Step 2: Add failing assertion to existing default test**
Find the test at ~line 763-768 (likely `defaults_match_design_64_score_gate` or similar). Read it:
```bash
sed -n '760,780p' crates/kebab-config/src/lib.rs
```
If the test asserts `dimensions == 384`, change to `1024`. If it doesn't assert model name, add:
```rust
assert_eq!(c.models.embedding.model, "multilingual-e5-large");
assert_eq!(c.models.embedding.dimensions, 1024);
```
- [ ] **Step 3: Run tests — expect failure**
```bash
cargo test -p kebab-config defaults_match
```
Expected: assertion failure on dimensions == 1024 (still 384) and/or model name.
- [ ] **Step 4: Flip the defaults**
In `crates/kebab-config/src/lib.rs:307` (the `EmbeddingCfg` defaults block):
```rust
EmbeddingCfg {
provider: "fastembed".to_string(),
model: "multilingual-e5-large".to_string(),
version: "v1".to_string(),
dimensions: 1024,
// ... preserve other fields (batch_size etc.) ...
}
```
(Read the surrounding lines first to confirm field names — if `version` field doesn't exist or has a different shape, only update `model` + `dimensions`.)
- [ ] **Step 5: Flip the TOML template**
In `crates/kebab-config/src/lib.rs` near line 952, the multi-line raw string contains the example TOML config. Find:
```toml
[models.embedding]
provider = "fastembed"
model = "multilingual-e5-small"
...
dimensions = 384
```
Replace with `model = "multilingual-e5-large"` and `dimensions = 1024`.
- [ ] **Step 6: Run tests — pass**
```bash
cargo test -p kebab-config
cargo clippy -p kebab-config --all-targets -- -D warnings
```
- [ ] **Step 7: Commit**
```bash
git add crates/kebab-config/src/lib.rs
git commit -m "feat(config): default embedding model multilingual-e5-large + 1024 dim (fb-39b)"
```
---
## Task 3: Cross-crate test fixture sweep
**Files:**
- Modify: any test fixture broken by Task 2's default flip.
- [ ] **Step 1: Find broken sites**
```bash
cargo build --workspace 2>&1 | tail -10
cargo test --workspace --no-run 2>&1 | grep -E "error\[|FAILED" | head -20
```
Likely candidates:
- `crates/kebab-app/tests/` — anywhere a test asserted `embedding.dimensions == 384`.
- `crates/kebab-cli/tests/cli_schema.rs` — a capability/model assertion may include the embedding model name.
For each failure, decide:
- **Pin to small intentionally** (test exercises small-specific behavior): set `cfg.models.embedding.model = "multilingual-e5-small"; cfg.models.embedding.dimensions = 384;` explicitly.
- **Inherit new default** (test just snapshots defaults): update assertion to `multilingual-e5-large` / `1024`.
The vast majority of integration tests use `provider = "none"` (no embeddings) — those are unaffected.
- [ ] **Step 2: Verify workspace builds**
```bash
cargo build --workspace 2>&1 | tail -5
```
- [ ] **Step 3: Run workspace tests**
```bash
cargo test --workspace --no-fail-fast -j 1 2>&1 | tail -10
cargo clippy --workspace --all-targets -- -D warnings 2>&1 | tail -5
```
`-j 1` REQUIRED.
Expected: all green.
- [ ] **Step 4: Commit**
```bash
git add crates/
git commit -m "fix(fb-39b): update test fixtures for embedding default flip"
```
(Skip this commit if `cargo build --workspace` is already clean after Task 2 — meaning no fixture broke.)
---
## Task 4: Wire schema docs (design + HOTFIXES + new task spec)
**Files:**
- Modify: `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md`
- Modify: `tasks/HOTFIXES.md`
- Create: `tasks/p9/p9-fb-39b-embedding-upgrade.md`
- Modify: `tasks/p9/p9-fb-39-retrieval-precision-tuning.md`
- Modify: `tasks/INDEX.md`
- [ ] **Step 1: Update design §5 storage and §9 versioning**
```bash
grep -n "multilingual-e5-small\|^## §5\|^### §5\|^## §9\|384" docs/superpowers/specs/2026-04-27-kebab-final-form-design.md | head -10
```
Update any reference to `multilingual-e5-small` or `dim 384` in the design doc to read `multilingual-e5-large` and `dim 1024`. Keep historical version mentions intact (e.g. "0.6.0 shipped with multilingual-e5-small") if any — but the "current default" line must reflect the new model.
- [ ] **Step 2: Add HOTFIXES entry**
Append to `tasks/HOTFIXES.md` (under the dated log; place at top of the dated entries with today's date `2026-05-10`):
```markdown
- **2026-05-10 fb-39b — embedding upgrade UX**: default embedding flipped from `multilingual-e5-small` (384 dim) to `multilingual-e5-large` (1024 dim). LanceDB tables are namespaced by `(model, dim)` so the new model writes to a fresh table and the old `chunk_embeddings_multilingual-e5-small_384` table becomes orphan. fb-23 incremental ingest auto-re-embeds chunks (embedding_version mismatch) into the new table on next `kebab ingest`. To free disk before re-ingest, run `kebab reset --vector-only` first — this wipes both LanceDB and the SQLite `embedding_records` table. Search/ask against the new model returns empty hits until `kebab ingest` populates the new table.
```
- [ ] **Step 3: Create `tasks/p9/p9-fb-39b-embedding-upgrade.md`**
Mirror the fb-39 frontmatter shape:
```markdown
---
phase: P9
component: kebab-embed-local + kebab-config + kebab-store-vector + docs
task_id: p9-fb-39b
title: "Embedding model upgrade (multilingual-e5-large)"
status: completed
target_version: 0.7.0
depends_on: [p9-fb-39]
unblocks: []
contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
contract_sections: [§4 search, §5 storage, §9 versioning cascade]
source_feedback: 사용자 도그푸딩 2026-05-06 — Claude Code 가 kebab CLI 사용 후 "rank 5+ 노이즈 섞임" 지적 (fb-39 의 lever 적용 측면).
---
# p9-fb-39b — Embedding model upgrade
> ✅ **구현 완료.** fb-39 의 lever 후보 4개 중 embedding model 업그레이드 lever 적용. P@k metric (fb-39) 으로 small vs large 비교 가능.
>
> - Design: [`docs/superpowers/specs/2026-05-10-p9-fb-39b-embedding-upgrade-design.md`](../../docs/superpowers/specs/2026-05-10-p9-fb-39b-embedding-upgrade-design.md)
> - Plan: [`docs/superpowers/plans/2026-05-10-p9-fb-39b-embedding-upgrade.md`](../../docs/superpowers/plans/2026-05-10-p9-fb-39b-embedding-upgrade.md)
## 요약
- `multilingual-e5-small` (384 dim) → `multilingual-e5-large` (1024 dim) default flip.
- 기존 user TOML 이 small 명시 시 그대로 (backwards-compat).
- fb-23 incremental ingest 가 embedding_version mismatch 감지 → 자동 re-embed.
- 0.6 → 0.7 minor bump 트리거 (design §9 cascade rule).
```
- [ ] **Step 4: Append fb-39b note to fb-39 task spec banner**
In `tasks/p9/p9-fb-39-retrieval-precision-tuning.md`, find the existing `> ✅ **Eval foundation 부분 구현 완료.**` banner. Append a line:
```markdown
> - fb-39b (lever 적용 — embedding upgrade): [`tasks/p9/p9-fb-39b-embedding-upgrade.md`](./p9-fb-39b-embedding-upgrade.md) ✅
```
- [ ] **Step 5: Add fb-39b row to INDEX**
In `tasks/INDEX.md`, find the fb-39 row. Add a sibling row immediately below:
```markdown
- [p9-fb-39b embedding upgrade](p9/p9-fb-39b-embedding-upgrade.md) — ✅ 머지 (2026-05-10) — multilingual-e5-large default
```
(Adapt format to match neighbor rows.)
- [ ] **Step 6: Workspace test + clippy gate**
```bash
cargo test --workspace --no-fail-fast -j 1 2>&1 | tail -10
cargo clippy --workspace --all-targets -- -D warnings 2>&1 | tail -5
```
`-j 1` REQUIRED.
- [ ] **Step 7: Commit**
```bash
git add docs/ tasks/
git commit -m "docs(fb-39b): design + HOTFIXES + new task spec + INDEX"
```
---
## Task 5: README + SMOKE walkthrough
**Files:**
- Modify: `README.md`
- Modify: `docs/SMOKE.md`
- [ ] **Step 1: Update README `[models.embedding]` section**
```bash
grep -n "models.embedding\|multilingual-e5-small\|fastembed" README.md | head -5
```
Locate the `[models.embedding]` config block in README. Update default values mentioned + add new bullet:
```markdown
- `model` (default `"multilingual-e5-large"`, fb-39b) — 다국어 sentence embedding 모델. 1024-dim. ONNX (~1.3 GB) 첫 실행 시 fastembed cache (`config.storage.model_dir/fastembed/`) 에 자동 다운로드. `"multilingual-e5-small"` (384 dim) 는 backwards-compat 으로 사용 가능 — TOML 에 명시.
- `dimensions` (default `1024`) — 모델의 embedding 차원. config 와 LanceDB stored dim 불일치 시 검색 결과 0 건 (orphan table). 모델 변경 시 `kebab reset --vector-only && kebab ingest` 로 vector index 재구축 권장.
```
- [ ] **Step 2: Append SMOKE walkthrough**
Append to `docs/SMOKE.md` after fb-39 section (or at end if absent):
````markdown
### Embedding upgrade (fb-39b)
`multilingual-e5-small` 에서 `multilingual-e5-large` 로 업그레이드 시퀀스:
```bash
# 기존 vector index 정리 (orphan table 회피)
kebab --config /tmp/kebab-smoke/config.toml reset --vector-only
# config.toml 의 [models.embedding] 갱신:
# model = "multilingual-e5-large"
# dimensions = 1024
# 재-ingest — fastembed 가 첫 실행 시 e5-large ONNX (~1.3 GB) 자동 다운로드.
# 다운로드 시간 + 모든 chunk re-embed 시간 (e5-small 대비 ~3-4×).
kebab --config /tmp/kebab-smoke/config.toml ingest
# fb-39 의 P@k metric 으로 small vs large 비교:
kebab --config /tmp/kebab-smoke/config.toml eval run
```
````
- [ ] **Step 3: Workspace test + clippy gate (sanity)**
```bash
cargo test --workspace --no-fail-fast -j 1 2>&1 | tail -5
cargo clippy --workspace --all-targets -- -D warnings 2>&1 | tail -3
```
- [ ] **Step 4: Commit**
```bash
git add README.md docs/SMOKE.md
git commit -m "docs(fb-39b): README + SMOKE — embedding upgrade walkthrough"
```
---
## Final verification checklist
- [ ] `cargo test --workspace --no-fail-fast -j 1` green
- [ ] `cargo clippy --workspace --all-targets -- -D warnings` clean
- [ ] `kebab schema --json | jq .models.embedding_version` reflects new model name (after a fresh ingest with new defaults)
- [ ] Manual smoke: `kebab reset --vector-only && kebab ingest` against `/tmp/kebab-smoke` triggers ONNX download (first run) then completes ingest into the new `chunk_embeddings_multilingual-e5-large_1024` table
- [ ] README + SMOKE + design + HOTFIXES + fb-39b spec + INDEX all updated
- [ ] **Post-merge**: cut version bump 0.6 → 0.7 + tag (CLAUDE.md `Versioning cascade` release rule — embedding_version cascade triggers minor bump)

View File

@@ -1510,6 +1510,26 @@ agent 가 분기). HTTP-SSE transport 는 fb-29 deferral 따라 P+. classify
모듈은 `kebab-app::error_wire` 에 single source — kebab-cli + kebab-mcp
공유.
### 10.3 Eval metrics (fb-39)
#### Retrieval metrics (ground-truth curated)
`kebab eval run` 이 golden query suite (`fixtures/golden_queries.yaml`) 대해 메트릭 계산. Curator 가 `expected_chunk_ids``expected_doc_ids` 설정 시에만 측정 가능 (shipped template 은 empty — workspace 별 자체 채움).
| 메트릭 | 정의 | 조건 |
|--------|------|------|
| `hit_at_k` | top-k 안 expected chunk 존재 여부 (binary). P(hit@k=true) 평균 | `expected_chunk_ids` 채움 |
| `MRR` | Mean Reciprocal Rank (첫 관련 chunk rank 역수 평균) | `expected_chunk_ids` 채움 |
| `recall_at_k_doc` | top-k 안 expected doc 비율 (`|top-k_docs ∩ expected_doc_ids| / |expected_doc_ids|`) | `expected_doc_ids` 채움 |
| `precision_at_k_chunk` (fb-39) | top-k 안 chunk_id 가 `expected_chunk_ids` 에 포함된 비율. 분모 = k (fixed) — `top-k` 부족도 precision 손실로 간주. 빈 `expected_chunk_ids` query 는 skip. | `expected_chunk_ids` 채움 |
#### Groundedness metrics (rule-based)
| 메트릭 | 정의 |
|--------|------|
| `must_contain` pass | answer 문자열 이 `golden.must_contain` 의 모든 substring 포함 |
| `forbidden` pass | answer 문자열 이 `golden.forbidden` 의 substring 미포함 |
---
## 11. 동결 범위 / 변경 정책

View File

@@ -0,0 +1,198 @@
---
title: "p9-fb-39b — Embedding model upgrade design (multilingual-e5-large)"
phase: P9
component: kebab-embed-local + kebab-store-vector + kebab-config + kebab-app
task_id: p9-fb-39b
status: design
target_version: 0.7.0
contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
contract_sections: [§4 search, §5 storage, §9 versioning cascade]
date: 2026-05-10
---
# p9-fb-39b — Embedding model upgrade
## Goal
fb-39 의 lever 적용 — embedding model 을 `multilingual-e5-small` (384 dim) 에서 `multilingual-e5-large` (1024 dim) 로 업그레이드. 도그푸딩 한국어 corpus 의 retrieval precision 개선.
fb-39 가 측정 도구 (P@5 / P@10) 를 추가했으므로, 본 PR 머지 후 small vs large 비교 가능.
`bge-m3` 검토했으나 fastembed 4.9.1 의 `EmbeddingModel` enum 에 미포함 — `UserDefinedEmbeddingModel` ONNX 직접 로드 path 는 별도 작업 (fb-39c 후보). 본 PR scope = e5-large 만.
## Behavior contract
### Embedding model
- 신규 default: `multilingual-e5-large` (1024 dim).
- `kebab-embed-local::resolve_model` 에 신규 arm:
```rust
"multilingual-e5-large" => Ok(EmbeddingModel::MultilingualE5Large),
```
기존 `multilingual-e5-small` arm 그대로 (backwards-compat opt-out).
### Config defaults
- `Config::defaults().models.embedding.model`: `"multilingual-e5-small"``"multilingual-e5-large"`.
- `Config::defaults().models.embedding.dimensions`: `384``1024`.
- `kebab init` 가 생성하는 config.toml 템플릿 동일 갱신.
기존 user TOML 이 `model = "multilingual-e5-small"` 또는 `dimensions = 384` 명시한 경우 그대로 유지 — `serde` 가 user value 우선. opt-out 가능.
### Cascade
- `embedding_version`: 자동 변경 (config.models.embedding.model 값 그대로 wire 에 emit). `multilingual-e5-small``multilingual-e5-large`.
- fb-23 incremental ingest: 4-input match (blake3 + parser_version + chunker_version + embedding_version) 에서 embedding_version 깨짐 → 모든 chunk 재-embed. text/parse/chunk 비용 회피, embed 비용만 발생.
- `eval_runs.config_snapshot_json`: 새 version 자동 기록. 비교 시 동일 version 끼리.
- design §9 cascade rule 의 5 키 중 `embedding_version` 변경 — binary release 트리거 (CLAUDE.md `Versioning cascade` 룰).
### Migration policy
LanceDB stored vectors 의 dim 과 `config.models.embedding.dimensions` 가 mismatch 면:
- `LanceVectorStore::open` (또는 첫 호출) 가 비교 → mismatch 시 신규 `ErrorV1`:
- `code = "embedding_dim_mismatch"`
- `message`: `"vector index dim 384 vs config dim 1024"`
- `hint`: `"기존 vector index 가 4-dim, config 는 N-dim. 'kebab reset --vector-only && kebab ingest' 로 재구축."`
- CLI: exit 1 + error.v1 stderr (또는 비-`--json` 모드 plain stderr).
- silent migration / auto-wipe 안 함 — 사용자 명시 동의 필요.
remediation flow:
```
$ kebab search "..."
error: vector index dim 384 vs config dim 1024
Hint: 기존 vector index 가 384-dim, config 는 1024-dim.
'kebab reset --vector-only && kebab ingest' 로 재구축.
$ kebab reset --vector-only
[wipe LanceDB + SQLite embedding_records]
$ kebab ingest
[full re-embed with new model — fastembed downloads e5-large ONNX (~1.3 GB) on first run]
```
### Wire shape
신규 wire field 없음. `error.v1.code` 의 valid value namespace 에 `"embedding_dim_mismatch"` 추가 (string, enum 아님 — additive).
## Allowed / forbidden dependencies
- `kebab-embed-local`: 신규 dep 없음. fastembed enum variant 추가만.
- `kebab-store-vector`: 신규 dep 없음. LanceDB schema reader 사용.
- `kebab-config`: 신규 dep 없음. defaults 값 변경.
- `kebab-app`: 신규 dep 없음. error propagation.
`kebab-core` 의 다른 `kebab-*` 의존 금지 룰 그대로.
## Public surface delta
### kebab-embed-local (`lib.rs`)
```rust
fn resolve_model(name: &str) -> Result<EmbeddingModel> {
match name {
"multilingual-e5-small" => Ok(EmbeddingModel::MultilingualE5Small),
"multilingual-e5-large" => Ok(EmbeddingModel::MultilingualE5Large), // 신규
other => anyhow::bail!(/* ... */),
}
}
```
### kebab-config (defaults + TOML 템플릿)
```rust
EmbeddingCfg {
provider: "fastembed".to_string(),
model: "multilingual-e5-large".to_string(),
dimensions: 1024,
// ... 기타 ...
}
```
generated config.toml 템플릿 도 같이 갱신.
### kebab-store-vector (`lib.rs` 또는 신규 helper)
```rust
impl LanceVectorStore {
pub fn open(...) -> Result<Self> {
// 기존 open 로직 ...
let stored_dim = read_schema_vector_dim(&table)?;
if stored_dim != config_dim {
anyhow::bail!(StructuredError(ErrorV1 {
code: "embedding_dim_mismatch".to_string(),
message: format!("vector index dim {stored_dim} vs config dim {config_dim}"),
hint: Some(format!(
"기존 vector index 가 {stored_dim}-dim, config 는 {config_dim}-dim. \
'kebab reset --vector-only && kebab ingest' 로 재구축."
)),
// ...
}));
}
Ok(...)
}
}
```
(정확한 LanceDB schema reading API 는 구현 시 확인 — `Table::schema()` 또는 `arrow_schema::Schema` 직접 inspect.)
## Test plan
| kind | description |
|------|-------------|
| unit (kebab-embed-local) | `resolve_model("multilingual-e5-large")` returns Ok |
| unit (kebab-embed-local) | `check_dim(1024, 1024)` ok |
| unit (kebab-embed-local) | `check_dim(384, 1024)` Err — message mentions both dims |
| unit (kebab-config) | `Config::defaults().models.embedding.model == "multilingual-e5-large"` |
| unit (kebab-config) | `Config::defaults().models.embedding.dimensions == 1024` |
| unit (kebab-config) | TOML `model = "multilingual-e5-small"` deserialize 정상 (backwards-compat) |
| unit (kebab-config) | 생성된 config.toml 템플릿 안 `model = "multilingual-e5-large"`, `dimensions = 1024` |
| unit (kebab-store-vector) | mismatch fixture (384-dim stored + 1024 cfg) → `embedding_dim_mismatch` ErrorV1 |
| 통합 (kebab-cli) | mismatch scenario — pre-existing 384-dim DB + new config → exit 1 + error.v1 stderr (`code = embedding_dim_mismatch`) + hint mentions reset --vector-only |
| 통합 (kebab-cli) | small config 로 fresh ingest + search → 정상 (backwards-compat path 검증) |
`multilingual-e5-large` 모델 다운로드 회피 위해 unit/integration 테스트는 fixture 또는 mock — 실 모델 호출 안 함. 첫 도그푸딩 시 사용자가 fastembed cache 다운로드.
## Implementation steps (high-level)
1. `kebab-embed-local::resolve_model` arm + check_dim 단위 테스트.
2. `kebab-store-vector` dim mismatch detection + ErrorV1 + 단위 테스트.
3. `kebab-config` defaults flip + TOML 템플릿 + 단위 테스트.
4. `kebab-cli` integration: mismatch error.v1 wire + backwards-compat path 통합 테스트.
5. README + SMOKE + design + HOTFIXES + status flip.
5 task. 단일 PR, single 세션 가능.
## Risks / notes
- **첫 실행 모델 다운로드**: e5-large ONNX ~1.3 GB. fastembed cache (`config.storage.model_dir/fastembed/`) 에 자동 다운로드 (첫 호출 시). progress 표시 없음 — 사용자 침묵 latency. `kebab doctor` 또는 README 에 경고 안내.
- **Search/ingest latency**: e5-large 가 e5-small 대비 ~3-4× embedding 시간. ingest 비용 증가 (one-time + 신규 docs). search 시 query embed per-call 증가.
- **Disk usage**: vector dim 2.6× → LanceDB 약 2.7× 증가.
- **HOTFIXES entry**: dim mismatch UX (error.v1 + reset --vector-only flow) 가 frozen design 안 명시 안 된 신규 동작 — HOTFIXES 한 항목 추가.
- **eval comparison**: fb-39 P@k 가 측정 도구. 도그푸딩 corpus + golden 의 expected_chunk_ids 채워서 small vs large 정량 비교 별도 (PR 안 의무 아님).
- **fb-23 incremental ingest 와의 상호작용**: embedding_version 변경 → 모든 doc 재-embed. fb-23 의 unchanged path 는 한 번도 hit 안 함 (예상 동작).
- **release trigger**: design §9 cascade rule 의 `embedding_version` 변경 → CLAUDE.md `Versioning cascade` 룰에 따라 binary 0.6 → 0.7 minor bump 필요.
## Out of scope
- bge-m3 또는 user-defined ONNX path (fb-39c 후보).
- Other lever (RRF / cross-encoder / chunk policy).
- Auto-migration / background re-vector.
- LanceDB schema migration tooling (별도 wipe + re-ingest).
- multi-model coexistence (한 KB 안 small + large 동시).
- precision 정량 비교 의무 (별도 도그푸딩).
## Documentation updates (implementation PR 동시)
- `README.md` `[models.embedding]` config 섹션 — default 변경 + small opt-out 안내 + dim mismatch 시 reset 명령 안내.
- `docs/SMOKE.md` — upgrade walkthrough (`kebab reset --vector-only && kebab ingest` 시퀀스 + 첫 ONNX 다운로드 latency 경고).
- `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` §5 storage / §9 versioning 적절 절 — 새 default + dim 1024 명시.
- `tasks/HOTFIXES.md` — dim mismatch UX entry.
- `tasks/p9/p9-fb-39-retrieval-precision-tuning.md` banner — fb-39b lever 적용 (embedding upgrade) ✅ 추가 (단 spec status 는 fb-39 frozen).
- `tasks/p9/p9-fb-39b-embedding-upgrade.md` 신규 task spec (만들거나, fb-39 sub-task 로 frontmatter 처리).
- `tasks/INDEX.md` — fb-39b 행 추가 ✅.
- 본 PR 머지 후 `chore: bump version 0.6 → 0.7` + tag (CLAUDE.md release 절차).

View File

@@ -1,4 +1,4 @@
# Golden query suite for `kb eval run` (P5-1 / P5-2).
# Golden query suite for `kebab eval run` (P5-1 / P5-2 / fb-39).
#
# Top-level: list of queries. Required fields: `id`, `query`. All
# others are optional and default to empty / null.
@@ -7,8 +7,16 @@
# real rows in the active workspace's SQLite store at run time. Stale
# references make the runner bail at start. The shipped template
# leaves them empty so the file is loadable on any fresh workspace —
# fill them in after a `kb ingest` to enable hit@k / MRR metrics
# (P5-2).
# fill them in after a `kebab ingest` to enable the metrics that
# require ground truth (P5-2 + fb-39):
#
# - `expected_chunk_ids` → hit_at_k, MRR, precision_at_k_chunk (fb-39)
# - `expected_doc_ids` → recall_at_k_doc
#
# `precision_at_k_chunk` (fb-39): of the top-k retrieved hits, what
# fraction's `chunk_id` is in `expected_chunk_ids`. Denominator is k
# (fixed) — `top-k` shortfall is treated as precision loss. Queries
# with empty `expected_chunk_ids` are skipped from this metric.
#
# `must_contain` / `forbidden` drive the rule-based groundedness
# metric (P5-2).

View File

@@ -129,7 +129,7 @@ P0~P5 는 직렬. P6~P9 는 P5 이후 병렬 가능.
### 🎯 0.5.0 — RAG quality (cascade 동반: V00X + reindex)
- [p9-fb-38 score semantics](p9/p9-fb-38-score-semantics.md) — ✅ 머지 (2026-05-10)
- [p9-fb-39 retrieval precision 튜닝](p9/p9-fb-39-retrieval-precision-tuning.md) — ⏳ 미구현, brainstorm 필요 (embedding_version cascade)
- [p9-fb-39 retrieval precision 튜닝](p9/p9-fb-39-retrieval-precision-tuning.md) — ✅ 머지 (2026-05-10) — eval foundation only, lever 적용 deferred
- [p9-fb-40 fact-grounded answer](p9/p9-fb-40-fact-grounded-answer.md) — ✅ 머지 (2026-05-10)
### 🎯 0.6.0 또는 P+ — reasoning

View File

@@ -1,20 +1,23 @@
---
phase: P9
component: kebab-search + kebab-rag + kebab-chunk
component: kebab-eval + docs
task_id: p9-fb-39
title: "Retrieval precision 튜닝 (rank 5+ 노이즈 완화)"
status: open
target_version: 0.5.0
status: completed
target_version: 0.7.0
depends_on: []
unblocks: []
contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
contract_sections: [§3 chunking, §4 search, §7 RAG]
contract_sections: [§3 chunking, §4 search, §7 RAG, §10.3 eval metrics]
source_feedback: 사용자 도그푸딩 2026-05-06 — Claude Code 가 kebab CLI 사용 후 "rank 5+ 부터 노이즈 섞임" 지적. precision-at-k 가 k=5 이후 떨어짐.
---
# p9-fb-39 — Retrieval precision 튜닝
> **백로그 only — 미구현.** 본 spec 은 도그푸딩 피드백 skeleton. 구현 착수 전 [superpowers:brainstorming](../../docs/superpowers/) 으로 설계 단계 선행 필요. 어느 lever (chunk policy / RRF k / score gate / cross-encoder / embedding 업그레이드) 부터 손볼지, eval golden set 선행 여부 brainstorm 후 결정.
> **Eval foundation 부분 구현 완료.** P@k metric (P@5, P@10) 추가. 본 spec 의 lever 적용 (chunk policy / RRF / cross-encoder / embedding 업그레이드) 은 별도 task 로 분리 (fb-39b 이후).
>
> - Design: [`docs/superpowers/specs/2026-05-10-p9-fb-39-eval-foundation-design.md`](../../docs/superpowers/specs/2026-05-10-p9-fb-39-eval-foundation-design.md)
> - Plan: [`docs/superpowers/plans/2026-05-10-p9-fb-39-eval-foundation.md`](../../docs/superpowers/plans/2026-05-10-p9-fb-39-eval-foundation.md)
## 증상 / 동기