Compare commits
6 Commits
spec/fb-39
...
spec/fb-39
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d5321701ea | ||
|
|
2c3461c465 | ||
| 240120ee80 | |||
|
|
5870a1de15 | ||
|
|
f00fb376fe | ||
|
|
bb0ec0469f |
@@ -184,6 +184,18 @@ pub fn render_report_md(report: &CompareReport) -> String {
|
||||
),
|
||||
);
|
||||
}
|
||||
for k in crate::metrics::TOP_K_VARIANTS {
|
||||
let _ = writeln!(
|
||||
out,
|
||||
"| precision@{k}_chunk | {} | {} | {} |",
|
||||
fmt(a.precision_at_k_chunk.get(k).copied().unwrap_or(f32::NAN)),
|
||||
fmt(b.precision_at_k_chunk.get(k).copied().unwrap_or(f32::NAN)),
|
||||
fmt_delta(
|
||||
a.precision_at_k_chunk.get(k).copied().unwrap_or(f32::NAN),
|
||||
b.precision_at_k_chunk.get(k).copied().unwrap_or(f32::NAN),
|
||||
),
|
||||
);
|
||||
}
|
||||
let _ = writeln!(
|
||||
out,
|
||||
"| citation_coverage | {} | {} | {} |",
|
||||
@@ -419,6 +431,7 @@ fn build_deltas(
|
||||
}
|
||||
let mut hit = serde_json::Map::new();
|
||||
let mut recall = serde_json::Map::new();
|
||||
let mut precision = serde_json::Map::new();
|
||||
for k in crate::metrics::TOP_K_VARIANTS {
|
||||
hit.insert(
|
||||
k.to_string(),
|
||||
@@ -434,11 +447,19 @@ fn build_deltas(
|
||||
b.recall_at_k_doc.get(k).copied().unwrap_or(f32::NAN),
|
||||
),
|
||||
);
|
||||
precision.insert(
|
||||
k.to_string(),
|
||||
d(
|
||||
a.precision_at_k_chunk.get(k).copied().unwrap_or(f32::NAN),
|
||||
b.precision_at_k_chunk.get(k).copied().unwrap_or(f32::NAN),
|
||||
),
|
||||
);
|
||||
}
|
||||
serde_json::json!({
|
||||
"hit_at_k": hit,
|
||||
"mrr": d(a.mrr, b.mrr),
|
||||
"recall_at_k_doc": recall,
|
||||
"precision_at_k_chunk": precision,
|
||||
"citation_coverage": d(a.citation_coverage, b.citation_coverage),
|
||||
"groundedness": d(a.groundedness, b.groundedness),
|
||||
"empty_result_rate": d(a.empty_result_rate, b.empty_result_rate),
|
||||
@@ -484,6 +505,7 @@ mod tests {
|
||||
hit_at_k: Default::default(),
|
||||
mrr: 0.5,
|
||||
recall_at_k_doc: Default::default(),
|
||||
precision_at_k_chunk: Default::default(),
|
||||
citation_coverage: f32::NAN,
|
||||
groundedness: 0.0,
|
||||
empty_result_rate: 0.0,
|
||||
|
||||
@@ -58,6 +58,14 @@ pub struct AggregateMetrics {
|
||||
pub hit_at_k: BTreeMap<u32, f32>,
|
||||
pub mrr: f32,
|
||||
pub recall_at_k_doc: BTreeMap<u32, f32>,
|
||||
/// p9-fb-39: chunk-level precision at k. Binary relevance via
|
||||
/// `expected_chunk_ids` (a hit is "relevant" if its chunk_id is
|
||||
/// in the golden's `expected_chunk_ids`). Denominator is k (fixed)
|
||||
/// — `hits.len() < k` still divides by k, treating shortfall as
|
||||
/// precision loss (mirrors `hit_at_k`). Queries with empty
|
||||
/// `expected_chunk_ids` are skipped (mirrors `hit_at_k_chunk`).
|
||||
#[serde(default)]
|
||||
pub precision_at_k_chunk: BTreeMap<u32, f32>,
|
||||
#[serde(
|
||||
serialize_with = "serialize_f32_nan_as_null",
|
||||
deserialize_with = "deserialize_f32_or_nan"
|
||||
@@ -187,6 +195,8 @@ pub(crate) fn aggregate_from_rows(
|
||||
TOP_K_VARIANTS.iter().map(|k| (*k, (0_u32, 0_u32))).collect();
|
||||
let mut recall_at_k_doc: BTreeMap<u32, (f64, u32)> =
|
||||
TOP_K_VARIANTS.iter().map(|k| (*k, (0.0_f64, 0_u32))).collect();
|
||||
let mut precision_at_k_chunk: BTreeMap<u32, (f64, u32)> =
|
||||
TOP_K_VARIANTS.iter().map(|k| (*k, (0.0_f64, 0_u32))).collect();
|
||||
|
||||
let mut mrr_sum: f64 = 0.0;
|
||||
let mut mrr_denom: u32 = 0;
|
||||
@@ -243,6 +253,18 @@ pub(crate) fn aggregate_from_rows(
|
||||
{
|
||||
mrr_sum += 1.0 / f64::from(rank);
|
||||
}
|
||||
// p9-fb-39: precision@k_chunk — count of top-k hits whose
|
||||
// chunk_id is in `expected`, divided by k (fixed denominator).
|
||||
for k in TOP_K_VARIANTS {
|
||||
let hits_in_topk_relevant = qr
|
||||
.hits_top_k
|
||||
.iter()
|
||||
.filter(|h| h.rank <= *k && expected.contains(&h.chunk_id))
|
||||
.count();
|
||||
let entry = precision_at_k_chunk.get_mut(k).expect("init");
|
||||
entry.0 += hits_in_topk_relevant as f64 / f64::from(*k);
|
||||
entry.1 += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// recall@k_doc (doc-level, requires non-empty expected_doc_ids
|
||||
@@ -333,6 +355,7 @@ pub(crate) fn aggregate_from_rows(
|
||||
mrr_sum / f64::from(mrr_denom)
|
||||
}),
|
||||
recall_at_k_doc: round_recall_map(&recall_at_k_doc),
|
||||
precision_at_k_chunk: round_recall_map(&precision_at_k_chunk),
|
||||
citation_coverage: ratio_or_nan(citation_num, citation_denom),
|
||||
groundedness: ratio_or_zero(groundedness_num, groundedness_denom),
|
||||
empty_result_rate: ratio_or_zero(empty_result_count, total_queries),
|
||||
@@ -674,4 +697,114 @@ mod tests {
|
||||
assert_eq!(agg.failed_queries, 1);
|
||||
assert_eq!(agg.total_queries, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn precision_at_k_chunk_field_default_empty_on_old_json() {
|
||||
// Old eval_runs.metrics_json predates fb-39 — no precision_at_k_chunk field.
|
||||
// serde(default) yields empty BTreeMap.
|
||||
let old = serde_json::json!({
|
||||
"hit_at_k": {"1": 0.5, "3": 0.5, "5": 0.5, "10": 0.5},
|
||||
"mrr": 0.5,
|
||||
"recall_at_k_doc": {"1": 0.0, "3": 0.0, "5": 0.0, "10": 0.0},
|
||||
"citation_coverage": null,
|
||||
"groundedness": 0.0,
|
||||
"empty_result_rate": 0.0,
|
||||
"refusal_correctness": null,
|
||||
"total_queries": 1,
|
||||
"failed_queries": 0
|
||||
});
|
||||
let parsed: AggregateMetrics =
|
||||
serde_json::from_value(old).expect("backwards-compat deserialize");
|
||||
assert!(parsed.precision_at_k_chunk.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn precision_at_k_chunk_exact_match() {
|
||||
// expected = [c1, c2, c3]. Top-5 hits: [c1@1, c2@2, c3@3, x@4, y@5].
|
||||
// P@5 = 3/5 = 0.6. P@10 = 3/10 = 0.3.
|
||||
let queries = vec![gq("q1", &["c1", "c2", "c3"], &["d1"])];
|
||||
let rows = vec![record(
|
||||
"q1",
|
||||
vec![
|
||||
hit(1, "c1", "d1"),
|
||||
hit(2, "c2", "d1"),
|
||||
hit(3, "c3", "d1"),
|
||||
hit(4, "x", "d1"),
|
||||
hit(5, "y", "d1"),
|
||||
],
|
||||
None,
|
||||
None,
|
||||
)];
|
||||
let agg = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
assert_eq!(agg.precision_at_k_chunk[&5], 0.6);
|
||||
assert_eq!(agg.precision_at_k_chunk[&10], 0.3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn precision_at_k_chunk_partial_topk_divides_by_k() {
|
||||
// expected = [c1, c2]. Hits: only [c1@1, c2@2, x@3] (3 results).
|
||||
// P@5 = 2/5 = 0.4 (denominator is k, not hits.len()).
|
||||
let queries = vec![gq("q1", &["c1", "c2"], &["d1"])];
|
||||
let rows = vec![record(
|
||||
"q1",
|
||||
vec![hit(1, "c1", "d1"), hit(2, "c2", "d1"), hit(3, "x", "d1")],
|
||||
None,
|
||||
None,
|
||||
)];
|
||||
let agg = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
assert_eq!(agg.precision_at_k_chunk[&5], 0.4);
|
||||
assert_eq!(agg.precision_at_k_chunk[&10], 0.2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn precision_at_k_chunk_zero_relevant_in_topk() {
|
||||
// expected = [c1]. Hits: [x@1, y@2, z@3] (none relevant).
|
||||
// P@5 = 0/5 = 0.0.
|
||||
let queries = vec![gq("q1", &["c1"], &["d1"])];
|
||||
let rows = vec![record(
|
||||
"q1",
|
||||
vec![hit(1, "x", "d1"), hit(2, "y", "d1"), hit(3, "z", "d1")],
|
||||
None,
|
||||
None,
|
||||
)];
|
||||
let agg = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
assert_eq!(agg.precision_at_k_chunk[&5], 0.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn precision_at_k_chunk_empty_expected_skipped() {
|
||||
// expected_chunk_ids = []. Skipped → final BTreeMap entry value = 0.0
|
||||
// (zero-denom path in round_recall_map). Mirrors recall_at_k_doc behavior.
|
||||
let queries = vec![gq("q1", &[], &["d1"])];
|
||||
let rows = vec![record("q1", vec![hit(1, "c1", "d1")], None, None)];
|
||||
let agg = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
assert_eq!(agg.precision_at_k_chunk[&5], 0.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn precision_at_k_chunk_two_queries_averaged() {
|
||||
// q1: expected=[c1], hits=[c1@1, x@2, y@3] → P@5 = 1/5 = 0.2
|
||||
// q2: expected=[c1, c2], hits=[c1@1, c2@2] → P@5 = 2/5 = 0.4
|
||||
// Avg P@5 = 0.3.
|
||||
let queries = vec![
|
||||
gq("q1", &["c1"], &["d1"]),
|
||||
gq("q2", &["c1", "c2"], &["d2"]),
|
||||
];
|
||||
let rows = vec![
|
||||
record(
|
||||
"q1",
|
||||
vec![hit(1, "c1", "d1"), hit(2, "x", "d1"), hit(3, "y", "d1")],
|
||||
None,
|
||||
None,
|
||||
),
|
||||
record(
|
||||
"q2",
|
||||
vec![hit(1, "c1", "d2"), hit(2, "c2", "d2")],
|
||||
None,
|
||||
None,
|
||||
),
|
||||
];
|
||||
let agg = aggregate_from_rows(&queries, &rows).unwrap();
|
||||
assert_eq!(agg.precision_at_k_chunk[&5], 0.3);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,6 +11,12 @@
|
||||
"5": 0.666700005531311
|
||||
},
|
||||
"mrr": 0.41670000553131104,
|
||||
"precision_at_k_chunk": {
|
||||
"1": 0.33329999446868896,
|
||||
"10": 0.06669999659061432,
|
||||
"3": 0.11110000312328339,
|
||||
"5": 0.13330000638961792
|
||||
},
|
||||
"recall_at_k_doc": {
|
||||
"1": 0.33329999446868896,
|
||||
"10": 0.666700005531311,
|
||||
@@ -32,6 +38,12 @@
|
||||
"5": 1.0
|
||||
},
|
||||
"mrr": 0.833299994468689,
|
||||
"precision_at_k_chunk": {
|
||||
"1": 0.666700005531311,
|
||||
"10": 0.10000000149011612,
|
||||
"3": 0.33329999446868896,
|
||||
"5": 0.20000000298023224
|
||||
},
|
||||
"recall_at_k_doc": {
|
||||
"1": 0.666700005531311,
|
||||
"10": 1.0,
|
||||
@@ -53,6 +65,12 @@
|
||||
"5": 0.33329999446868896
|
||||
},
|
||||
"mrr": 0.41659998893737793,
|
||||
"precision_at_k_chunk": {
|
||||
"1": 0.33340001106262207,
|
||||
"10": 0.0333000048995018,
|
||||
"3": 0.22219999134540558,
|
||||
"5": 0.06669999659061432
|
||||
},
|
||||
"recall_at_k_doc": {
|
||||
"1": 0.33340001106262207,
|
||||
"10": 0.33329999446868896,
|
||||
|
||||
@@ -203,6 +203,7 @@ fn store_aggregate_rejects_missing_run() {
|
||||
hit_at_k: Default::default(),
|
||||
mrr: 0.0,
|
||||
recall_at_k_doc: Default::default(),
|
||||
precision_at_k_chunk: Default::default(),
|
||||
citation_coverage: f32::NAN,
|
||||
groundedness: 0.0,
|
||||
empty_result_rate: 0.0,
|
||||
|
||||
405
docs/superpowers/plans/2026-05-10-p9-fb-39b-embedding-upgrade.md
Normal file
405
docs/superpowers/plans/2026-05-10-p9-fb-39b-embedding-upgrade.md
Normal file
@@ -0,0 +1,405 @@
|
||||
# fb-39b Embedding Model Upgrade Implementation Plan
|
||||
|
||||
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||
|
||||
**Goal:** Upgrade default embedding model from `multilingual-e5-small` (384 dim) to `multilingual-e5-large` (1024 dim) so retrieval precision can improve on Korean dogfooding corpus. Existing user TOMLs pinning `multilingual-e5-small` keep working unchanged.
|
||||
|
||||
**Architecture:** Three-line code surface: a new arm in `kebab-embed-local::resolve_model`, defaults flipped in `kebab-config::Config::defaults` (and the TOML template), and the existing test asserting the 384 default updated. LanceDB tables are already namespaced by `(model, dim)` so an upgraded model writes to a fresh table; fb-23 incremental ingest detects the `embedding_version` mismatch and auto-re-embeds on next ingest. No migration tooling — orphan old-model tables cleaned via `kebab reset --vector-only`.
|
||||
|
||||
**Tech Stack:** Rust 2024, fastembed 4.9.1 (`MultilingualE5Large` enum already shipped), LanceDB.
|
||||
|
||||
**Spec:** `docs/superpowers/specs/2026-05-10-p9-fb-39b-embedding-upgrade-design.md`
|
||||
|
||||
---
|
||||
|
||||
## File map
|
||||
|
||||
**Modify:**
|
||||
- `crates/kebab-embed-local/src/lib.rs` — add `multilingual-e5-large` arm in `resolve_model`. Update or add `check_dim` test for 1024.
|
||||
- `crates/kebab-config/src/lib.rs` — flip `Config::defaults().models.embedding.{model, dimensions}` and the TOML template at line ~952. Update default test at line 767.
|
||||
- `README.md` — `[models.embedding]` section: mention new default + small opt-out + dim mismatch hint.
|
||||
- `docs/SMOKE.md` — append "Embedding upgrade (fb-39b)" walkthrough showing the `kebab reset --vector-only && kebab ingest` sequence + first-run ONNX download warning.
|
||||
- `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` §5 storage / §9 versioning — update default model + dim references.
|
||||
- `tasks/HOTFIXES.md` — entry for embedding upgrade UX (orphan tables on model swap, reset --vector-only flow).
|
||||
- `tasks/p9/p9-fb-39-retrieval-precision-tuning.md` banner — append note "fb-39b lever 적용 (embedding upgrade) ✅".
|
||||
- `tasks/INDEX.md` — fb-39b row ✅ (new row alongside fb-39).
|
||||
|
||||
**Create:**
|
||||
- `tasks/p9/p9-fb-39b-embedding-upgrade.md` — new task spec mirroring fb-39 frontmatter (status: completed, design + plan links).
|
||||
|
||||
---
|
||||
|
||||
## Task 1: Add multilingual-e5-large to kebab-embed-local
|
||||
|
||||
**Files:**
|
||||
- Modify: `crates/kebab-embed-local/src/lib.rs`
|
||||
|
||||
- [ ] **Step 1: Append failing tests**
|
||||
|
||||
Find the existing `mod tests` (~line 230). Append:
|
||||
|
||||
```rust
|
||||
#[test]
|
||||
fn resolve_model_supports_e5_large() {
|
||||
let m = resolve_model("multilingual-e5-large").expect("e5-large should resolve");
|
||||
// The fastembed enum is non-comparable in some versions; we only need
|
||||
// to confirm Ok and that the underlying TextEmbedding could be built.
|
||||
// Avoid actually constructing the model in tests (1.3 GB ONNX download).
|
||||
let _ = m;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn check_dim_passes_for_1024() {
|
||||
check_dim(1024, 1024).expect("matching dims must pass");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn check_dim_rejects_384_vs_1024() {
|
||||
let err = check_dim(384, 1024).expect_err("dim mismatch must error");
|
||||
let msg = format!("{err}");
|
||||
assert!(msg.contains("384") && msg.contains("1024"),
|
||||
"error must mention both dims, got: {msg}");
|
||||
}
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Run tests to confirm failures**
|
||||
|
||||
```bash
|
||||
cargo test -p kebab-embed-local resolve_model_supports_e5_large
|
||||
cargo test -p kebab-embed-local check_dim_passes_for_1024
|
||||
```
|
||||
Expected: `resolve_model_supports_e5_large` fails (no arm); `check_dim_*` passes already (helper is generic).
|
||||
|
||||
- [ ] **Step 3: Add arm to resolve_model**
|
||||
|
||||
In `crates/kebab-embed-local/src/lib.rs`, find `fn resolve_model` (~line 199). Replace the match body:
|
||||
|
||||
```rust
|
||||
fn resolve_model(name: &str) -> Result<EmbeddingModel> {
|
||||
match name {
|
||||
"multilingual-e5-small" => Ok(EmbeddingModel::MultilingualE5Small),
|
||||
"multilingual-e5-large" => Ok(EmbeddingModel::MultilingualE5Large),
|
||||
other => anyhow::bail!(
|
||||
"kb-embed-local: unsupported embedding model {other:?}; \
|
||||
this adapter currently ships `multilingual-e5-small` and \
|
||||
`multilingual-e5-large`. Add a new arm to `resolve_model` \
|
||||
(and a fastembed feature flag if needed) to support more."
|
||||
),
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Run tests — all pass**
|
||||
|
||||
```bash
|
||||
cargo test -p kebab-embed-local
|
||||
cargo clippy -p kebab-embed-local --all-targets -- -D warnings
|
||||
```
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add crates/kebab-embed-local/src/lib.rs
|
||||
git commit -m "feat(embed): add multilingual-e5-large arm to resolve_model (fb-39b)"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 2: Flip kebab-config default to e5-large + 1024 dim
|
||||
|
||||
**Files:**
|
||||
- Modify: `crates/kebab-config/src/lib.rs`
|
||||
|
||||
- [ ] **Step 1: Read existing default test + value sites**
|
||||
|
||||
```bash
|
||||
grep -n "multilingual-e5-small\|dimensions: 384\|dimensions = 384\|default.*embedding" crates/kebab-config/src/lib.rs
|
||||
```
|
||||
|
||||
Three sites to update:
|
||||
- `Config::defaults()` body (~line 307): `dimensions: 384` and `model: "multilingual-e5-small"`.
|
||||
- Default-assert test (~line 767): `assert_eq!(c.models.embedding.dimensions, 384)` and likely a sibling assertion on model.
|
||||
- TOML template at ~line 952: `dimensions = 384` (and likely `model = "multilingual-e5-small"`).
|
||||
|
||||
- [ ] **Step 2: Add failing assertion to existing default test**
|
||||
|
||||
Find the test at ~line 763-768 (likely `defaults_match_design_64_score_gate` or similar). Read it:
|
||||
|
||||
```bash
|
||||
sed -n '760,780p' crates/kebab-config/src/lib.rs
|
||||
```
|
||||
|
||||
If the test asserts `dimensions == 384`, change to `1024`. If it doesn't assert model name, add:
|
||||
|
||||
```rust
|
||||
assert_eq!(c.models.embedding.model, "multilingual-e5-large");
|
||||
assert_eq!(c.models.embedding.dimensions, 1024);
|
||||
```
|
||||
|
||||
- [ ] **Step 3: Run tests — expect failure**
|
||||
|
||||
```bash
|
||||
cargo test -p kebab-config defaults_match
|
||||
```
|
||||
Expected: assertion failure on dimensions == 1024 (still 384) and/or model name.
|
||||
|
||||
- [ ] **Step 4: Flip the defaults**
|
||||
|
||||
In `crates/kebab-config/src/lib.rs:307` (the `EmbeddingCfg` defaults block):
|
||||
|
||||
```rust
|
||||
EmbeddingCfg {
|
||||
provider: "fastembed".to_string(),
|
||||
model: "multilingual-e5-large".to_string(),
|
||||
version: "v1".to_string(),
|
||||
dimensions: 1024,
|
||||
// ... preserve other fields (batch_size etc.) ...
|
||||
}
|
||||
```
|
||||
|
||||
(Read the surrounding lines first to confirm field names — if `version` field doesn't exist or has a different shape, only update `model` + `dimensions`.)
|
||||
|
||||
- [ ] **Step 5: Flip the TOML template**
|
||||
|
||||
In `crates/kebab-config/src/lib.rs` near line 952, the multi-line raw string contains the example TOML config. Find:
|
||||
|
||||
```toml
|
||||
[models.embedding]
|
||||
provider = "fastembed"
|
||||
model = "multilingual-e5-small"
|
||||
...
|
||||
dimensions = 384
|
||||
```
|
||||
|
||||
Replace with `model = "multilingual-e5-large"` and `dimensions = 1024`.
|
||||
|
||||
- [ ] **Step 6: Run tests — pass**
|
||||
|
||||
```bash
|
||||
cargo test -p kebab-config
|
||||
cargo clippy -p kebab-config --all-targets -- -D warnings
|
||||
```
|
||||
|
||||
- [ ] **Step 7: Commit**
|
||||
|
||||
```bash
|
||||
git add crates/kebab-config/src/lib.rs
|
||||
git commit -m "feat(config): default embedding model multilingual-e5-large + 1024 dim (fb-39b)"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 3: Cross-crate test fixture sweep
|
||||
|
||||
**Files:**
|
||||
- Modify: any test fixture broken by Task 2's default flip.
|
||||
|
||||
- [ ] **Step 1: Find broken sites**
|
||||
|
||||
```bash
|
||||
cargo build --workspace 2>&1 | tail -10
|
||||
cargo test --workspace --no-run 2>&1 | grep -E "error\[|FAILED" | head -20
|
||||
```
|
||||
|
||||
Likely candidates:
|
||||
- `crates/kebab-app/tests/` — anywhere a test asserted `embedding.dimensions == 384`.
|
||||
- `crates/kebab-cli/tests/cli_schema.rs` — a capability/model assertion may include the embedding model name.
|
||||
|
||||
For each failure, decide:
|
||||
- **Pin to small intentionally** (test exercises small-specific behavior): set `cfg.models.embedding.model = "multilingual-e5-small"; cfg.models.embedding.dimensions = 384;` explicitly.
|
||||
- **Inherit new default** (test just snapshots defaults): update assertion to `multilingual-e5-large` / `1024`.
|
||||
|
||||
The vast majority of integration tests use `provider = "none"` (no embeddings) — those are unaffected.
|
||||
|
||||
- [ ] **Step 2: Verify workspace builds**
|
||||
|
||||
```bash
|
||||
cargo build --workspace 2>&1 | tail -5
|
||||
```
|
||||
|
||||
- [ ] **Step 3: Run workspace tests**
|
||||
|
||||
```bash
|
||||
cargo test --workspace --no-fail-fast -j 1 2>&1 | tail -10
|
||||
cargo clippy --workspace --all-targets -- -D warnings 2>&1 | tail -5
|
||||
```
|
||||
|
||||
`-j 1` REQUIRED.
|
||||
|
||||
Expected: all green.
|
||||
|
||||
- [ ] **Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add crates/
|
||||
git commit -m "fix(fb-39b): update test fixtures for embedding default flip"
|
||||
```
|
||||
|
||||
(Skip this commit if `cargo build --workspace` is already clean after Task 2 — meaning no fixture broke.)
|
||||
|
||||
---
|
||||
|
||||
## Task 4: Wire schema docs (design + HOTFIXES + new task spec)
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md`
|
||||
- Modify: `tasks/HOTFIXES.md`
|
||||
- Create: `tasks/p9/p9-fb-39b-embedding-upgrade.md`
|
||||
- Modify: `tasks/p9/p9-fb-39-retrieval-precision-tuning.md`
|
||||
- Modify: `tasks/INDEX.md`
|
||||
|
||||
- [ ] **Step 1: Update design §5 storage and §9 versioning**
|
||||
|
||||
```bash
|
||||
grep -n "multilingual-e5-small\|^## §5\|^### §5\|^## §9\|384" docs/superpowers/specs/2026-04-27-kebab-final-form-design.md | head -10
|
||||
```
|
||||
|
||||
Update any reference to `multilingual-e5-small` or `dim 384` in the design doc to read `multilingual-e5-large` and `dim 1024`. Keep historical version mentions intact (e.g. "0.6.0 shipped with multilingual-e5-small") if any — but the "current default" line must reflect the new model.
|
||||
|
||||
- [ ] **Step 2: Add HOTFIXES entry**
|
||||
|
||||
Append to `tasks/HOTFIXES.md` (under the dated log; place at top of the dated entries with today's date `2026-05-10`):
|
||||
|
||||
```markdown
|
||||
- **2026-05-10 fb-39b — embedding upgrade UX**: default embedding flipped from `multilingual-e5-small` (384 dim) to `multilingual-e5-large` (1024 dim). LanceDB tables are namespaced by `(model, dim)` so the new model writes to a fresh table and the old `chunk_embeddings_multilingual-e5-small_384` table becomes orphan. fb-23 incremental ingest auto-re-embeds chunks (embedding_version mismatch) into the new table on next `kebab ingest`. To free disk before re-ingest, run `kebab reset --vector-only` first — this wipes both LanceDB and the SQLite `embedding_records` table. Search/ask against the new model returns empty hits until `kebab ingest` populates the new table.
|
||||
```
|
||||
|
||||
- [ ] **Step 3: Create `tasks/p9/p9-fb-39b-embedding-upgrade.md`**
|
||||
|
||||
Mirror the fb-39 frontmatter shape:
|
||||
|
||||
```markdown
|
||||
---
|
||||
phase: P9
|
||||
component: kebab-embed-local + kebab-config + kebab-store-vector + docs
|
||||
task_id: p9-fb-39b
|
||||
title: "Embedding model upgrade (multilingual-e5-large)"
|
||||
status: completed
|
||||
target_version: 0.7.0
|
||||
depends_on: [p9-fb-39]
|
||||
unblocks: []
|
||||
contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
|
||||
contract_sections: [§4 search, §5 storage, §9 versioning cascade]
|
||||
source_feedback: 사용자 도그푸딩 2026-05-06 — Claude Code 가 kebab CLI 사용 후 "rank 5+ 노이즈 섞임" 지적 (fb-39 의 lever 적용 측면).
|
||||
---
|
||||
|
||||
# p9-fb-39b — Embedding model upgrade
|
||||
|
||||
> ✅ **구현 완료.** fb-39 의 lever 후보 4개 중 embedding model 업그레이드 lever 적용. P@k metric (fb-39) 으로 small vs large 비교 가능.
|
||||
>
|
||||
> - Design: [`docs/superpowers/specs/2026-05-10-p9-fb-39b-embedding-upgrade-design.md`](../../docs/superpowers/specs/2026-05-10-p9-fb-39b-embedding-upgrade-design.md)
|
||||
> - Plan: [`docs/superpowers/plans/2026-05-10-p9-fb-39b-embedding-upgrade.md`](../../docs/superpowers/plans/2026-05-10-p9-fb-39b-embedding-upgrade.md)
|
||||
|
||||
## 요약
|
||||
|
||||
- `multilingual-e5-small` (384 dim) → `multilingual-e5-large` (1024 dim) default flip.
|
||||
- 기존 user TOML 이 small 명시 시 그대로 (backwards-compat).
|
||||
- fb-23 incremental ingest 가 embedding_version mismatch 감지 → 자동 re-embed.
|
||||
- 0.6 → 0.7 minor bump 트리거 (design §9 cascade rule).
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Append fb-39b note to fb-39 task spec banner**
|
||||
|
||||
In `tasks/p9/p9-fb-39-retrieval-precision-tuning.md`, find the existing `> ✅ **Eval foundation 부분 구현 완료.**` banner. Append a line:
|
||||
|
||||
```markdown
|
||||
> - fb-39b (lever 적용 — embedding upgrade): [`tasks/p9/p9-fb-39b-embedding-upgrade.md`](./p9-fb-39b-embedding-upgrade.md) ✅
|
||||
```
|
||||
|
||||
- [ ] **Step 5: Add fb-39b row to INDEX**
|
||||
|
||||
In `tasks/INDEX.md`, find the fb-39 row. Add a sibling row immediately below:
|
||||
|
||||
```markdown
|
||||
- [p9-fb-39b embedding upgrade](p9/p9-fb-39b-embedding-upgrade.md) — ✅ 머지 (2026-05-10) — multilingual-e5-large default
|
||||
```
|
||||
|
||||
(Adapt format to match neighbor rows.)
|
||||
|
||||
- [ ] **Step 6: Workspace test + clippy gate**
|
||||
|
||||
```bash
|
||||
cargo test --workspace --no-fail-fast -j 1 2>&1 | tail -10
|
||||
cargo clippy --workspace --all-targets -- -D warnings 2>&1 | tail -5
|
||||
```
|
||||
|
||||
`-j 1` REQUIRED.
|
||||
|
||||
- [ ] **Step 7: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/ tasks/
|
||||
git commit -m "docs(fb-39b): design + HOTFIXES + new task spec + INDEX"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 5: README + SMOKE walkthrough
|
||||
|
||||
**Files:**
|
||||
- Modify: `README.md`
|
||||
- Modify: `docs/SMOKE.md`
|
||||
|
||||
- [ ] **Step 1: Update README `[models.embedding]` section**
|
||||
|
||||
```bash
|
||||
grep -n "models.embedding\|multilingual-e5-small\|fastembed" README.md | head -5
|
||||
```
|
||||
|
||||
Locate the `[models.embedding]` config block in README. Update default values mentioned + add new bullet:
|
||||
|
||||
```markdown
|
||||
- `model` (default `"multilingual-e5-large"`, fb-39b) — 다국어 sentence embedding 모델. 1024-dim. ONNX (~1.3 GB) 첫 실행 시 fastembed cache (`config.storage.model_dir/fastembed/`) 에 자동 다운로드. `"multilingual-e5-small"` (384 dim) 는 backwards-compat 으로 사용 가능 — TOML 에 명시.
|
||||
- `dimensions` (default `1024`) — 모델의 embedding 차원. config 와 LanceDB stored dim 불일치 시 검색 결과 0 건 (orphan table). 모델 변경 시 `kebab reset --vector-only && kebab ingest` 로 vector index 재구축 권장.
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Append SMOKE walkthrough**
|
||||
|
||||
Append to `docs/SMOKE.md` after fb-39 section (or at end if absent):
|
||||
|
||||
````markdown
|
||||
### Embedding upgrade (fb-39b)
|
||||
|
||||
`multilingual-e5-small` 에서 `multilingual-e5-large` 로 업그레이드 시퀀스:
|
||||
|
||||
```bash
|
||||
# 기존 vector index 정리 (orphan table 회피)
|
||||
kebab --config /tmp/kebab-smoke/config.toml reset --vector-only
|
||||
|
||||
# config.toml 의 [models.embedding] 갱신:
|
||||
# model = "multilingual-e5-large"
|
||||
# dimensions = 1024
|
||||
|
||||
# 재-ingest — fastembed 가 첫 실행 시 e5-large ONNX (~1.3 GB) 자동 다운로드.
|
||||
# 다운로드 시간 + 모든 chunk re-embed 시간 (e5-small 대비 ~3-4×).
|
||||
kebab --config /tmp/kebab-smoke/config.toml ingest
|
||||
|
||||
# fb-39 의 P@k metric 으로 small vs large 비교:
|
||||
kebab --config /tmp/kebab-smoke/config.toml eval run
|
||||
```
|
||||
````
|
||||
|
||||
- [ ] **Step 3: Workspace test + clippy gate (sanity)**
|
||||
|
||||
```bash
|
||||
cargo test --workspace --no-fail-fast -j 1 2>&1 | tail -5
|
||||
cargo clippy --workspace --all-targets -- -D warnings 2>&1 | tail -3
|
||||
```
|
||||
|
||||
- [ ] **Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add README.md docs/SMOKE.md
|
||||
git commit -m "docs(fb-39b): README + SMOKE — embedding upgrade walkthrough"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Final verification checklist
|
||||
|
||||
- [ ] `cargo test --workspace --no-fail-fast -j 1` green
|
||||
- [ ] `cargo clippy --workspace --all-targets -- -D warnings` clean
|
||||
- [ ] `kebab schema --json | jq .models.embedding_version` reflects new model name (after a fresh ingest with new defaults)
|
||||
- [ ] Manual smoke: `kebab reset --vector-only && kebab ingest` against `/tmp/kebab-smoke` triggers ONNX download (first run) then completes ingest into the new `chunk_embeddings_multilingual-e5-large_1024` table
|
||||
- [ ] README + SMOKE + design + HOTFIXES + fb-39b spec + INDEX all updated
|
||||
- [ ] **Post-merge**: cut version bump 0.6 → 0.7 + tag (CLAUDE.md `Versioning cascade` release rule — embedding_version cascade triggers minor bump)
|
||||
@@ -1510,6 +1510,26 @@ agent 가 분기). HTTP-SSE transport 는 fb-29 deferral 따라 P+. classify
|
||||
모듈은 `kebab-app::error_wire` 에 single source — kebab-cli + kebab-mcp
|
||||
공유.
|
||||
|
||||
### 10.3 Eval metrics (fb-39)
|
||||
|
||||
#### Retrieval metrics (ground-truth curated)
|
||||
|
||||
`kebab eval run` 이 golden query suite (`fixtures/golden_queries.yaml`) 대해 메트릭 계산. Curator 가 `expected_chunk_ids` 및 `expected_doc_ids` 설정 시에만 측정 가능 (shipped template 은 empty — workspace 별 자체 채움).
|
||||
|
||||
| 메트릭 | 정의 | 조건 |
|
||||
|--------|------|------|
|
||||
| `hit_at_k` | top-k 안 expected chunk 존재 여부 (binary). P(hit@k=true) 평균 | `expected_chunk_ids` 채움 |
|
||||
| `MRR` | Mean Reciprocal Rank (첫 관련 chunk rank 역수 평균) | `expected_chunk_ids` 채움 |
|
||||
| `recall_at_k_doc` | top-k 안 expected doc 비율 (`|top-k_docs ∩ expected_doc_ids| / |expected_doc_ids|`) | `expected_doc_ids` 채움 |
|
||||
| `precision_at_k_chunk` (fb-39) | top-k 안 chunk_id 가 `expected_chunk_ids` 에 포함된 비율. 분모 = k (fixed) — `top-k` 부족도 precision 손실로 간주. 빈 `expected_chunk_ids` query 는 skip. | `expected_chunk_ids` 채움 |
|
||||
|
||||
#### Groundedness metrics (rule-based)
|
||||
|
||||
| 메트릭 | 정의 |
|
||||
|--------|------|
|
||||
| `must_contain` pass | answer 문자열 이 `golden.must_contain` 의 모든 substring 포함 |
|
||||
| `forbidden` pass | answer 문자열 이 `golden.forbidden` 의 substring 미포함 |
|
||||
|
||||
---
|
||||
|
||||
## 11. 동결 범위 / 변경 정책
|
||||
|
||||
@@ -0,0 +1,198 @@
|
||||
---
|
||||
title: "p9-fb-39b — Embedding model upgrade design (multilingual-e5-large)"
|
||||
phase: P9
|
||||
component: kebab-embed-local + kebab-store-vector + kebab-config + kebab-app
|
||||
task_id: p9-fb-39b
|
||||
status: design
|
||||
target_version: 0.7.0
|
||||
contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
|
||||
contract_sections: [§4 search, §5 storage, §9 versioning cascade]
|
||||
date: 2026-05-10
|
||||
---
|
||||
|
||||
# p9-fb-39b — Embedding model upgrade
|
||||
|
||||
## Goal
|
||||
|
||||
fb-39 의 lever 적용 — embedding model 을 `multilingual-e5-small` (384 dim) 에서 `multilingual-e5-large` (1024 dim) 로 업그레이드. 도그푸딩 한국어 corpus 의 retrieval precision 개선.
|
||||
|
||||
fb-39 가 측정 도구 (P@5 / P@10) 를 추가했으므로, 본 PR 머지 후 small vs large 비교 가능.
|
||||
|
||||
`bge-m3` 검토했으나 fastembed 4.9.1 의 `EmbeddingModel` enum 에 미포함 — `UserDefinedEmbeddingModel` ONNX 직접 로드 path 는 별도 작업 (fb-39c 후보). 본 PR scope = e5-large 만.
|
||||
|
||||
## Behavior contract
|
||||
|
||||
### Embedding model
|
||||
|
||||
- 신규 default: `multilingual-e5-large` (1024 dim).
|
||||
- `kebab-embed-local::resolve_model` 에 신규 arm:
|
||||
|
||||
```rust
|
||||
"multilingual-e5-large" => Ok(EmbeddingModel::MultilingualE5Large),
|
||||
```
|
||||
|
||||
기존 `multilingual-e5-small` arm 그대로 (backwards-compat opt-out).
|
||||
|
||||
### Config defaults
|
||||
|
||||
- `Config::defaults().models.embedding.model`: `"multilingual-e5-small"` → `"multilingual-e5-large"`.
|
||||
- `Config::defaults().models.embedding.dimensions`: `384` → `1024`.
|
||||
- `kebab init` 가 생성하는 config.toml 템플릿 동일 갱신.
|
||||
|
||||
기존 user TOML 이 `model = "multilingual-e5-small"` 또는 `dimensions = 384` 명시한 경우 그대로 유지 — `serde` 가 user value 우선. opt-out 가능.
|
||||
|
||||
### Cascade
|
||||
|
||||
- `embedding_version`: 자동 변경 (config.models.embedding.model 값 그대로 wire 에 emit). `multilingual-e5-small` → `multilingual-e5-large`.
|
||||
- fb-23 incremental ingest: 4-input match (blake3 + parser_version + chunker_version + embedding_version) 에서 embedding_version 깨짐 → 모든 chunk 재-embed. text/parse/chunk 비용 회피, embed 비용만 발생.
|
||||
- `eval_runs.config_snapshot_json`: 새 version 자동 기록. 비교 시 동일 version 끼리.
|
||||
- design §9 cascade rule 의 5 키 중 `embedding_version` 변경 — binary release 트리거 (CLAUDE.md `Versioning cascade` 룰).
|
||||
|
||||
### Migration policy
|
||||
|
||||
LanceDB stored vectors 의 dim 과 `config.models.embedding.dimensions` 가 mismatch 면:
|
||||
|
||||
- `LanceVectorStore::open` (또는 첫 호출) 가 비교 → mismatch 시 신규 `ErrorV1`:
|
||||
- `code = "embedding_dim_mismatch"`
|
||||
- `message`: `"vector index dim 384 vs config dim 1024"`
|
||||
- `hint`: `"기존 vector index 가 4-dim, config 는 N-dim. 'kebab reset --vector-only && kebab ingest' 로 재구축."`
|
||||
- CLI: exit 1 + error.v1 stderr (또는 비-`--json` 모드 plain stderr).
|
||||
- silent migration / auto-wipe 안 함 — 사용자 명시 동의 필요.
|
||||
|
||||
remediation flow:
|
||||
|
||||
```
|
||||
$ kebab search "..."
|
||||
error: vector index dim 384 vs config dim 1024
|
||||
|
||||
Hint: 기존 vector index 가 384-dim, config 는 1024-dim.
|
||||
'kebab reset --vector-only && kebab ingest' 로 재구축.
|
||||
|
||||
$ kebab reset --vector-only
|
||||
[wipe LanceDB + SQLite embedding_records]
|
||||
|
||||
$ kebab ingest
|
||||
[full re-embed with new model — fastembed downloads e5-large ONNX (~1.3 GB) on first run]
|
||||
```
|
||||
|
||||
### Wire shape
|
||||
|
||||
신규 wire field 없음. `error.v1.code` 의 valid value namespace 에 `"embedding_dim_mismatch"` 추가 (string, enum 아님 — additive).
|
||||
|
||||
## Allowed / forbidden dependencies
|
||||
|
||||
- `kebab-embed-local`: 신규 dep 없음. fastembed enum variant 추가만.
|
||||
- `kebab-store-vector`: 신규 dep 없음. LanceDB schema reader 사용.
|
||||
- `kebab-config`: 신규 dep 없음. defaults 값 변경.
|
||||
- `kebab-app`: 신규 dep 없음. error propagation.
|
||||
|
||||
`kebab-core` 의 다른 `kebab-*` 의존 금지 룰 그대로.
|
||||
|
||||
## Public surface delta
|
||||
|
||||
### kebab-embed-local (`lib.rs`)
|
||||
|
||||
```rust
|
||||
fn resolve_model(name: &str) -> Result<EmbeddingModel> {
|
||||
match name {
|
||||
"multilingual-e5-small" => Ok(EmbeddingModel::MultilingualE5Small),
|
||||
"multilingual-e5-large" => Ok(EmbeddingModel::MultilingualE5Large), // 신규
|
||||
other => anyhow::bail!(/* ... */),
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### kebab-config (defaults + TOML 템플릿)
|
||||
|
||||
```rust
|
||||
EmbeddingCfg {
|
||||
provider: "fastembed".to_string(),
|
||||
model: "multilingual-e5-large".to_string(),
|
||||
dimensions: 1024,
|
||||
// ... 기타 ...
|
||||
}
|
||||
```
|
||||
|
||||
generated config.toml 템플릿 도 같이 갱신.
|
||||
|
||||
### kebab-store-vector (`lib.rs` 또는 신규 helper)
|
||||
|
||||
```rust
|
||||
impl LanceVectorStore {
|
||||
pub fn open(...) -> Result<Self> {
|
||||
// 기존 open 로직 ...
|
||||
let stored_dim = read_schema_vector_dim(&table)?;
|
||||
if stored_dim != config_dim {
|
||||
anyhow::bail!(StructuredError(ErrorV1 {
|
||||
code: "embedding_dim_mismatch".to_string(),
|
||||
message: format!("vector index dim {stored_dim} vs config dim {config_dim}"),
|
||||
hint: Some(format!(
|
||||
"기존 vector index 가 {stored_dim}-dim, config 는 {config_dim}-dim. \
|
||||
'kebab reset --vector-only && kebab ingest' 로 재구축."
|
||||
)),
|
||||
// ...
|
||||
}));
|
||||
}
|
||||
Ok(...)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
(정확한 LanceDB schema reading API 는 구현 시 확인 — `Table::schema()` 또는 `arrow_schema::Schema` 직접 inspect.)
|
||||
|
||||
## Test plan
|
||||
|
||||
| kind | description |
|
||||
|------|-------------|
|
||||
| unit (kebab-embed-local) | `resolve_model("multilingual-e5-large")` returns Ok |
|
||||
| unit (kebab-embed-local) | `check_dim(1024, 1024)` ok |
|
||||
| unit (kebab-embed-local) | `check_dim(384, 1024)` Err — message mentions both dims |
|
||||
| unit (kebab-config) | `Config::defaults().models.embedding.model == "multilingual-e5-large"` |
|
||||
| unit (kebab-config) | `Config::defaults().models.embedding.dimensions == 1024` |
|
||||
| unit (kebab-config) | TOML `model = "multilingual-e5-small"` deserialize 정상 (backwards-compat) |
|
||||
| unit (kebab-config) | 생성된 config.toml 템플릿 안 `model = "multilingual-e5-large"`, `dimensions = 1024` |
|
||||
| unit (kebab-store-vector) | mismatch fixture (384-dim stored + 1024 cfg) → `embedding_dim_mismatch` ErrorV1 |
|
||||
| 통합 (kebab-cli) | mismatch scenario — pre-existing 384-dim DB + new config → exit 1 + error.v1 stderr (`code = embedding_dim_mismatch`) + hint mentions reset --vector-only |
|
||||
| 통합 (kebab-cli) | small config 로 fresh ingest + search → 정상 (backwards-compat path 검증) |
|
||||
|
||||
`multilingual-e5-large` 모델 다운로드 회피 위해 unit/integration 테스트는 fixture 또는 mock — 실 모델 호출 안 함. 첫 도그푸딩 시 사용자가 fastembed cache 다운로드.
|
||||
|
||||
## Implementation steps (high-level)
|
||||
|
||||
1. `kebab-embed-local::resolve_model` arm + check_dim 단위 테스트.
|
||||
2. `kebab-store-vector` dim mismatch detection + ErrorV1 + 단위 테스트.
|
||||
3. `kebab-config` defaults flip + TOML 템플릿 + 단위 테스트.
|
||||
4. `kebab-cli` integration: mismatch error.v1 wire + backwards-compat path 통합 테스트.
|
||||
5. README + SMOKE + design + HOTFIXES + status flip.
|
||||
|
||||
5 task. 단일 PR, single 세션 가능.
|
||||
|
||||
## Risks / notes
|
||||
|
||||
- **첫 실행 모델 다운로드**: e5-large ONNX ~1.3 GB. fastembed cache (`config.storage.model_dir/fastembed/`) 에 자동 다운로드 (첫 호출 시). progress 표시 없음 — 사용자 침묵 latency. `kebab doctor` 또는 README 에 경고 안내.
|
||||
- **Search/ingest latency**: e5-large 가 e5-small 대비 ~3-4× embedding 시간. ingest 비용 증가 (one-time + 신규 docs). search 시 query embed per-call 증가.
|
||||
- **Disk usage**: vector dim 2.6× → LanceDB 약 2.7× 증가.
|
||||
- **HOTFIXES entry**: dim mismatch UX (error.v1 + reset --vector-only flow) 가 frozen design 안 명시 안 된 신규 동작 — HOTFIXES 한 항목 추가.
|
||||
- **eval comparison**: fb-39 P@k 가 측정 도구. 도그푸딩 corpus + golden 의 expected_chunk_ids 채워서 small vs large 정량 비교 별도 (PR 안 의무 아님).
|
||||
- **fb-23 incremental ingest 와의 상호작용**: embedding_version 변경 → 모든 doc 재-embed. fb-23 의 unchanged path 는 한 번도 hit 안 함 (예상 동작).
|
||||
- **release trigger**: design §9 cascade rule 의 `embedding_version` 변경 → CLAUDE.md `Versioning cascade` 룰에 따라 binary 0.6 → 0.7 minor bump 필요.
|
||||
|
||||
## Out of scope
|
||||
|
||||
- bge-m3 또는 user-defined ONNX path (fb-39c 후보).
|
||||
- Other lever (RRF / cross-encoder / chunk policy).
|
||||
- Auto-migration / background re-vector.
|
||||
- LanceDB schema migration tooling (별도 wipe + re-ingest).
|
||||
- multi-model coexistence (한 KB 안 small + large 동시).
|
||||
- precision 정량 비교 의무 (별도 도그푸딩).
|
||||
|
||||
## Documentation updates (implementation PR 동시)
|
||||
|
||||
- `README.md` `[models.embedding]` config 섹션 — default 변경 + small opt-out 안내 + dim mismatch 시 reset 명령 안내.
|
||||
- `docs/SMOKE.md` — upgrade walkthrough (`kebab reset --vector-only && kebab ingest` 시퀀스 + 첫 ONNX 다운로드 latency 경고).
|
||||
- `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md` §5 storage / §9 versioning 적절 절 — 새 default + dim 1024 명시.
|
||||
- `tasks/HOTFIXES.md` — dim mismatch UX entry.
|
||||
- `tasks/p9/p9-fb-39-retrieval-precision-tuning.md` banner — fb-39b lever 적용 (embedding upgrade) ✅ 추가 (단 spec status 는 fb-39 frozen).
|
||||
- `tasks/p9/p9-fb-39b-embedding-upgrade.md` 신규 task spec (만들거나, fb-39 sub-task 로 frontmatter 처리).
|
||||
- `tasks/INDEX.md` — fb-39b 행 추가 ✅.
|
||||
- 본 PR 머지 후 `chore: bump version 0.6 → 0.7` + tag (CLAUDE.md release 절차).
|
||||
@@ -1,4 +1,4 @@
|
||||
# Golden query suite for `kb eval run` (P5-1 / P5-2).
|
||||
# Golden query suite for `kebab eval run` (P5-1 / P5-2 / fb-39).
|
||||
#
|
||||
# Top-level: list of queries. Required fields: `id`, `query`. All
|
||||
# others are optional and default to empty / null.
|
||||
@@ -7,8 +7,16 @@
|
||||
# real rows in the active workspace's SQLite store at run time. Stale
|
||||
# references make the runner bail at start. The shipped template
|
||||
# leaves them empty so the file is loadable on any fresh workspace —
|
||||
# fill them in after a `kb ingest` to enable hit@k / MRR metrics
|
||||
# (P5-2).
|
||||
# fill them in after a `kebab ingest` to enable the metrics that
|
||||
# require ground truth (P5-2 + fb-39):
|
||||
#
|
||||
# - `expected_chunk_ids` → hit_at_k, MRR, precision_at_k_chunk (fb-39)
|
||||
# - `expected_doc_ids` → recall_at_k_doc
|
||||
#
|
||||
# `precision_at_k_chunk` (fb-39): of the top-k retrieved hits, what
|
||||
# fraction's `chunk_id` is in `expected_chunk_ids`. Denominator is k
|
||||
# (fixed) — `top-k` shortfall is treated as precision loss. Queries
|
||||
# with empty `expected_chunk_ids` are skipped from this metric.
|
||||
#
|
||||
# `must_contain` / `forbidden` drive the rule-based groundedness
|
||||
# metric (P5-2).
|
||||
|
||||
@@ -129,7 +129,7 @@ P0~P5 는 직렬. P6~P9 는 P5 이후 병렬 가능.
|
||||
|
||||
### 🎯 0.5.0 — RAG quality (cascade 동반: V00X + reindex)
|
||||
- [p9-fb-38 score semantics](p9/p9-fb-38-score-semantics.md) — ✅ 머지 (2026-05-10)
|
||||
- [p9-fb-39 retrieval precision 튜닝](p9/p9-fb-39-retrieval-precision-tuning.md) — ⏳ 미구현, brainstorm 필요 (embedding_version cascade)
|
||||
- [p9-fb-39 retrieval precision 튜닝](p9/p9-fb-39-retrieval-precision-tuning.md) — ✅ 머지 (2026-05-10) — eval foundation only, lever 적용 deferred
|
||||
- [p9-fb-40 fact-grounded answer](p9/p9-fb-40-fact-grounded-answer.md) — ✅ 머지 (2026-05-10)
|
||||
|
||||
### 🎯 0.6.0 또는 P+ — reasoning
|
||||
|
||||
@@ -1,20 +1,23 @@
|
||||
---
|
||||
phase: P9
|
||||
component: kebab-search + kebab-rag + kebab-chunk
|
||||
component: kebab-eval + docs
|
||||
task_id: p9-fb-39
|
||||
title: "Retrieval precision 튜닝 (rank 5+ 노이즈 완화)"
|
||||
status: open
|
||||
target_version: 0.5.0
|
||||
status: completed
|
||||
target_version: 0.7.0
|
||||
depends_on: []
|
||||
unblocks: []
|
||||
contract_source: ../../docs/superpowers/specs/2026-04-27-kebab-final-form-design.md
|
||||
contract_sections: [§3 chunking, §4 search, §7 RAG]
|
||||
contract_sections: [§3 chunking, §4 search, §7 RAG, §10.3 eval metrics]
|
||||
source_feedback: 사용자 도그푸딩 2026-05-06 — Claude Code 가 kebab CLI 사용 후 "rank 5+ 부터 노이즈 섞임" 지적. precision-at-k 가 k=5 이후 떨어짐.
|
||||
---
|
||||
|
||||
# p9-fb-39 — Retrieval precision 튜닝
|
||||
|
||||
> ⏳ **백로그 only — 미구현.** 본 spec 은 도그푸딩 피드백 skeleton. 구현 착수 전 [superpowers:brainstorming](../../docs/superpowers/) 으로 설계 단계 선행 필요. 어느 lever (chunk policy / RRF k / score gate / cross-encoder / embedding 업그레이드) 부터 손볼지, eval golden set 선행 여부 brainstorm 후 결정.
|
||||
> ✅ **Eval foundation 부분 구현 완료.** P@k metric (P@5, P@10) 추가. 본 spec 의 lever 적용 (chunk policy / RRF / cross-encoder / embedding 업그레이드) 은 별도 task 로 분리 (fb-39b 이후).
|
||||
>
|
||||
> - Design: [`docs/superpowers/specs/2026-05-10-p9-fb-39-eval-foundation-design.md`](../../docs/superpowers/specs/2026-05-10-p9-fb-39-eval-foundation-design.md)
|
||||
> - Plan: [`docs/superpowers/plans/2026-05-10-p9-fb-39-eval-foundation.md`](../../docs/superpowers/plans/2026-05-10-p9-fb-39-eval-foundation.md)
|
||||
|
||||
## 증상 / 동기
|
||||
|
||||
|
||||
Reference in New Issue
Block a user