Stand up the Cargo workspace (Rust 2024 / resolver=3) with the kb-core
crate per the frozen design (§3, §4, §7, §10). kb-core has no
dependencies on other kb-* crates and exposes:
- Newtype IDs (AssetId / DocumentId / BlockId / ChunkId / EmbeddingId /
IndexId) with Display + FromStr; FromStr rejects anything other than exactly 32 lowercase hex characters.
- id_from + id_for_{asset,doc,block,chunk,embedding,index} per §4.2;
pinned hex test values computed via an independent JCS+blake3 tool.
- CanonicalDocument, Block (8 variants), SourceSpan, Inline (§3.4).
- Citation (5 variants) with W3C Media Fragments to_uri / parse;
round-trip property holds for every variant.
- Metadata + Provenance (§3.6); SearchQuery / SearchHit / RetrievalDetail
(§3.7); DocFilter / DocSummary mirrors of wire §2.5.
- Answer / AnswerCitation / RefusalReason / ModelRef (§3.8).
- IngestReport, JobRepo support types, VectorRecord / VectorHit.
- Component traits (SourceConnector / Extractor / Chunker / Embedder /
Retriever / LanguageModel / DocumentStore / VectorStore / JobRepo)
plus their input helpers (SourceScope / ExtractContext / ChunkPolicy
/ EmbeddingInput / GenerateRequest / TokenChunk / FinishReason).
- CoreError (§10).
- nfc + to_posix helpers (§4.1, §6.6).
20 unit tests cover ID determinism (1000-run regression), key-order
invariance, two pinned hex values, newtype rejection of bad input,
Citation round-trip for all 5 variants, and to_posix collapsing +
Korean NFC.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
176 lines
5.4 KiB
Rust
176 lines
5.4 KiB
Rust
//! Component traits (§7) and their input helper types (§7.1).
|
|
|
|
use std::path::{Path, PathBuf};
|
|
|
|
use serde::{Deserialize, Serialize};
|
|
use serde_json::Value;
|
|
|
|
use crate::asset::RawAsset;
|
|
use crate::chunk::Chunk;
|
|
use crate::document::{Block, CanonicalDocument};
|
|
use crate::ids::{ChunkId, DocumentId};
|
|
use crate::jobs::{JobFilter, JobId, JobKind, JobRow, JobStatus};
|
|
use crate::media::MediaType;
|
|
use crate::search::{DocFilter, DocSummary, SearchFilters, SearchHit, SearchQuery};
|
|
use crate::vector::{VectorHit, VectorRecord};
|
|
use crate::versions::{
|
|
ChunkerVersion, EmbeddingModelId, EmbeddingVersion, IndexVersion, ParserVersion,
|
|
};
|
|
use crate::answer::{ModelRef, TokenUsage};
|
|
|
|
// ── Helper input types (§7.1) ─────────────────────────────────────────────
|
|
|
|
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
|
|
pub struct SourceScope {
|
|
pub root: PathBuf,
|
|
pub include: Vec<String>,
|
|
pub exclude: Vec<String>,
|
|
}
|
|
|
|
/// Forward-declared (§3.7a) — concrete shape decided by extractors. P0
|
|
/// keeps the option-of-config-file slot only.
|
|
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
|
|
pub struct ExtractConfig {
|
|
pub config_path: Option<PathBuf>,
|
|
}
|
|
|
|
/// Carries the raw asset bytes context to an `Extractor::extract` call.
|
|
pub struct ExtractContext<'a> {
|
|
pub asset: &'a RawAsset,
|
|
pub workspace_root: &'a Path,
|
|
pub config: &'a ExtractConfig,
|
|
}
|
|
|
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
|
pub struct ChunkPolicy {
|
|
pub target_tokens: usize,
|
|
pub overlap_tokens: usize,
|
|
pub respect_markdown_headings: bool,
|
|
pub chunker_version: ChunkerVersion,
|
|
}
|
|
|
|
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
|
|
#[serde(rename_all = "lowercase")]
|
|
pub enum EmbeddingKind {
|
|
Document,
|
|
Query,
|
|
}
|
|
|
|
pub struct EmbeddingInput<'a> {
|
|
pub text: &'a str,
|
|
pub kind: EmbeddingKind,
|
|
}
|
|
|
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
|
pub struct GenerateRequest {
|
|
pub system: String,
|
|
pub user: String,
|
|
pub stop: Vec<String>,
|
|
pub max_tokens: usize,
|
|
pub temperature: f32,
|
|
pub seed: Option<u64>,
|
|
}
|
|
|
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
|
#[serde(rename_all = "snake_case", tag = "kind")]
|
|
pub enum TokenChunk {
|
|
Token(String),
|
|
Done {
|
|
finish_reason: FinishReason,
|
|
usage: TokenUsage,
|
|
},
|
|
}
|
|
|
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
|
#[serde(rename_all = "snake_case")]
|
|
pub enum FinishReason {
|
|
Stop,
|
|
Length,
|
|
Aborted,
|
|
Error(String),
|
|
}
|
|
|
|
// ── Traits (§7.2) ─────────────────────────────────────────────────────────
|
|
|
|
pub trait SourceConnector {
|
|
fn scan(&self, scope: &SourceScope) -> anyhow::Result<Vec<RawAsset>>;
|
|
}
|
|
|
|
pub trait Extractor: Send + Sync {
|
|
fn supports(&self, media_type: &MediaType) -> bool;
|
|
fn parser_version(&self) -> ParserVersion;
|
|
fn extract(
|
|
&self,
|
|
ctx: &ExtractContext<'_>,
|
|
bytes: &[u8],
|
|
) -> anyhow::Result<CanonicalDocument>;
|
|
}
|
|
|
|
pub trait Chunker: Send + Sync {
|
|
fn chunker_version(&self) -> ChunkerVersion;
|
|
fn policy_hash(&self, policy: &ChunkPolicy) -> String;
|
|
fn chunk(
|
|
&self,
|
|
doc: &CanonicalDocument,
|
|
policy: &ChunkPolicy,
|
|
) -> anyhow::Result<Vec<Chunk>>;
|
|
}
|
|
|
|
pub trait Embedder: Send + Sync {
|
|
fn model_id(&self) -> EmbeddingModelId;
|
|
fn model_version(&self) -> EmbeddingVersion;
|
|
fn dimensions(&self) -> usize;
|
|
fn embed(&self, inputs: &[EmbeddingInput<'_>]) -> anyhow::Result<Vec<Vec<f32>>>;
|
|
}
|
|
|
|
pub trait Retriever: Send + Sync {
|
|
fn search(&self, query: &SearchQuery) -> anyhow::Result<Vec<SearchHit>>;
|
|
fn index_version(&self) -> IndexVersion;
|
|
}
|
|
|
|
pub trait LanguageModel: Send + Sync {
|
|
fn model_ref(&self) -> ModelRef;
|
|
fn context_tokens(&self) -> usize;
|
|
fn generate_stream(
|
|
&self,
|
|
req: GenerateRequest,
|
|
) -> anyhow::Result<Box<dyn Iterator<Item = anyhow::Result<TokenChunk>> + Send>>;
|
|
}
|
|
|
|
pub trait DocumentStore {
|
|
fn put_asset(&self, a: &RawAsset) -> anyhow::Result<()>;
|
|
fn put_document(&self, d: &CanonicalDocument) -> anyhow::Result<()>;
|
|
fn put_blocks(&self, doc: &DocumentId, blocks: &[Block]) -> anyhow::Result<()>;
|
|
fn put_chunks(&self, doc: &DocumentId, chunks: &[Chunk]) -> anyhow::Result<()>;
|
|
fn get_document(&self, id: &DocumentId) -> anyhow::Result<Option<CanonicalDocument>>;
|
|
fn get_chunk(&self, id: &ChunkId) -> anyhow::Result<Option<Chunk>>;
|
|
fn list_documents(&self, filter: &DocFilter) -> anyhow::Result<Vec<DocSummary>>;
|
|
}
|
|
|
|
pub trait VectorStore {
|
|
fn ensure_table(
|
|
&self,
|
|
model: &EmbeddingModelId,
|
|
dim: usize,
|
|
) -> anyhow::Result<crate::ids::IndexId>;
|
|
fn upsert(&self, recs: &[VectorRecord]) -> anyhow::Result<()>;
|
|
fn search(
|
|
&self,
|
|
query_vec: &[f32],
|
|
k: usize,
|
|
filters: &SearchFilters,
|
|
) -> anyhow::Result<Vec<VectorHit>>;
|
|
}
|
|
|
|
pub trait JobRepo {
|
|
fn create(&self, kind: JobKind, payload: Value) -> anyhow::Result<JobId>;
|
|
fn update_progress(&self, id: &JobId, progress: Value) -> anyhow::Result<()>;
|
|
fn finish(
|
|
&self,
|
|
id: &JobId,
|
|
status: JobStatus,
|
|
error: Option<&str>,
|
|
) -> anyhow::Result<()>;
|
|
fn list(&self, filter: &JobFilter) -> anyhow::Result<Vec<JobRow>>;
|
|
}
|