feat(embed,config): add multilingual-e5-large + flip default config (fb-39b)

Task 1: Add multilingual-e5-large arm to kebab-embed-local::resolve_model with tests for 1024-dim variants and error cases.

Task 2: Flip kebab-config defaults from e5-small (384-dim) to e5-large (1024-dim) across defaults(), test assertions, and TOML template.

All tests pass; clippy clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
th-kim0823
2026-05-10 23:05:36 +09:00
parent d5321701ea
commit 69c94b6692
2 changed files with 34 additions and 11 deletions

View File

@@ -302,9 +302,9 @@ impl Config {
models: ModelsCfg {
embedding: EmbeddingModelCfg {
provider: "fastembed".to_string(),
model: "multilingual-e5-small".to_string(),
model: "multilingual-e5-large".to_string(),
version: "v1".to_string(),
dimensions: 384,
dimensions: 1024,
batch_size: 64,
},
llm: LlmCfg {
@@ -764,7 +764,8 @@ mod tests {
let c = Config::defaults();
assert_eq!(c.rag.score_gate, 0.30);
assert_eq!(c.chunking.target_tokens, 500);
assert_eq!(c.models.embedding.dimensions, 384);
assert_eq!(c.models.embedding.model, "multilingual-e5-large");
assert_eq!(c.models.embedding.dimensions, 1024);
assert_eq!(c.search.rrf_k, 60);
}
@@ -947,9 +948,9 @@ chunker_version = "md-heading-v1"
[models.embedding]
provider = "fastembed"
model = "multilingual-e5-small"
model = "multilingual-e5-large"
version = "v1"
dimensions = 384
dimensions = 1024
batch_size = 64
[models.llm]

View File

@@ -193,17 +193,18 @@ fn prefix_input(input: &EmbeddingInput<'_>) -> String {
}
/// Resolve a `config.models.embedding.model` string to a fastembed
/// `EmbeddingModel` enum variant. Only `multilingual-e5-small` is wired
/// for p3-2; additional model names should be added (and their dims
/// pinned in tests) as needed.
/// `EmbeddingModel` enum variant. Currently supports `multilingual-e5-small`
/// (384-dim) and `multilingual-e5-large` (1024-dim); additional model names
/// should be added (and their dims pinned in tests) as needed.
fn resolve_model(name: &str) -> Result<EmbeddingModel> {
match name {
"multilingual-e5-small" => Ok(EmbeddingModel::MultilingualE5Small),
"multilingual-e5-large" => Ok(EmbeddingModel::MultilingualE5Large),
other => anyhow::bail!(
"kb-embed-local: unsupported embedding model {other:?}; \
this adapter currently only ships `multilingual-e5-small`. \
Add a new arm to `resolve_model` (and a fastembed feature \
flag if needed) to support more."
this adapter currently ships `multilingual-e5-small` and \
`multilingual-e5-large`. Add a new arm to `resolve_model` \
(and a fastembed feature flag if needed) to support more."
),
}
}
@@ -294,6 +295,12 @@ mod tests {
resolve_model("multilingual-e5-small").expect("default model resolves");
}
#[test]
fn resolve_model_supports_e5_large() {
let m = resolve_model("multilingual-e5-large").expect("e5-large should resolve");
let _ = m;
}
#[test]
fn resolve_unknown_model_errors() {
let err = resolve_model("not-a-real-model").expect_err("unknown model errors");
@@ -301,6 +308,21 @@ mod tests {
assert!(msg.contains("unsupported embedding model"), "msg={msg}");
}
// ── check_dim ────────────────────────────────────────────────────
#[test]
fn check_dim_passes_for_1024() {
check_dim(1024, 1024).expect("matching dims must pass");
}
#[test]
fn check_dim_rejects_384_vs_1024() {
let err = check_dim(384, 1024).expect_err("dim mismatch must error");
let msg = format!("{err}");
assert!(msg.contains("384") && msg.contains("1024"),
"error must mention both dims, got: {msg}");
}
// expand_path tests live in `kb-config::paths`. The adapter imports
// it and trusts the upstream coverage rather than duplicating it.
}