p1-6: scaffold kb-store-sqlite crate + V001 full §5 DDL

New workspace member crate `kb-store-sqlite` (allowed deps only:
kb-core, kb-config, rusqlite[bundled], refinery, serde, serde_json,
time, blake3, tracing, anyhow, thiserror; dev-deps add kb-parse-md /
kb-normalize / kb-chunk for the contract round-trip test).

Migration V001 replaces the P0-1 stub with the full §5 DDL (assets,
documents, document_tags, blocks, chunks with policy_hash,
embedding_records, jobs, ingest_runs, answers, eval_runs,
eval_query_results) plus the §5 indexes. FTS5 virtual table + triggers
remain deferred to V002 (P2-1).

Public surface per task spec:
  SqliteStore::open / run_migrations / put_asset_with_bytes
  impl DocumentStore for SqliteStore (7 trait methods)
  impl JobRepo for SqliteStore (4 trait methods)
  StoreError { Sqlx, Migration, Conflict }

Behavior:
- Pragmas at open: foreign_keys=ON, journal_mode=WAL,
  synchronous=NORMAL, temp_store=MEMORY.
- Asset writer: byte_len ≤ copy_threshold_mb * 1MiB → copy to
  data_dir/assets/<aa>/<asset_id> (mode 0o644 on Unix), else
  reference. blake3(bytes) verified against asset.checksum; mismatch →
  Conflict.
- Idempotency: put_document UPSERTs and bumps doc_version + 1 on
  conflict; put_blocks / put_chunks DELETE-then-INSERT; document_tags
  re-derived per put_document.
- get_document rehydrates blocks via payload_json ordered by stream
  ordinal.
- list_documents builds dynamic WHERE from DocFilter (lang / trust_min
  / path_glob via GLOB / tags_any via document_tags subquery).
- JobRepo: jobs.kind/status are stored as lowercase enum tags; create
  mints a 32-hex JobId via blake3(kind || payload || nanos).

Tests follow in subsequent commits.
This commit is contained in:
2026-04-30 17:08:36 +00:00
parent 207a0ff61e
commit a3390d5171
10 changed files with 1906 additions and 4 deletions

View File

@@ -1,7 +1,10 @@
-- V001__init.sql — schema bootstrap.
-- Per design §5.1 + §5.9. Only the meta + migrations tables land here;
-- data tables (assets, documents, blocks, chunks, fts5, …) ship in later
-- phase-specific migrations (P1-6 / P2-1 / P3-3).
-- V001__init.sql — full P1 schema bootstrap.
-- Per design §5.1 (meta), §5.2 (assets), §5.3 (documents/document_tags),
-- §5.4 (blocks), §5.5 (chunks — FTS5 virtual table + triggers DEFERRED to
-- V002 in P2-1), §5.6 (embedding_records), §5.7 (jobs / ingest_runs /
-- answers / eval_runs / eval_query_results).
-- §5.1 Migrations meta -------------------------------------------------------
CREATE TABLE schema_meta (
key TEXT PRIMARY KEY,
@@ -13,3 +16,167 @@ CREATE TABLE migrations (
applied_at TEXT NOT NULL,
description TEXT NOT NULL
);
-- §5.2 Assets ----------------------------------------------------------------
CREATE TABLE assets (
asset_id TEXT PRIMARY KEY,
source_uri TEXT NOT NULL,
workspace_path TEXT NOT NULL,
media_type TEXT NOT NULL,
byte_len INTEGER NOT NULL,
checksum TEXT NOT NULL,
storage_kind TEXT NOT NULL CHECK (storage_kind IN ('copied','reference')),
storage_path TEXT NOT NULL,
discovered_at TEXT NOT NULL
);
CREATE UNIQUE INDEX idx_assets_workspace_path ON assets(workspace_path);
CREATE INDEX idx_assets_media_type ON assets(media_type);
-- §5.3 Documents -------------------------------------------------------------
CREATE TABLE documents (
doc_id TEXT PRIMARY KEY,
asset_id TEXT NOT NULL REFERENCES assets(asset_id) ON DELETE RESTRICT,
workspace_path TEXT NOT NULL,
title TEXT,
lang TEXT,
source_type TEXT NOT NULL,
trust_level TEXT NOT NULL,
parser_version TEXT NOT NULL,
doc_version INTEGER NOT NULL,
schema_version INTEGER NOT NULL,
metadata_json TEXT NOT NULL,
provenance_json TEXT NOT NULL,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL
);
CREATE UNIQUE INDEX idx_docs_workspace_path ON documents(workspace_path);
CREATE INDEX idx_docs_lang ON documents(lang);
CREATE INDEX idx_docs_source_type ON documents(source_type);
CREATE TABLE document_tags (
doc_id TEXT NOT NULL REFERENCES documents(doc_id) ON DELETE CASCADE,
tag TEXT NOT NULL,
PRIMARY KEY (doc_id, tag)
);
CREATE INDEX idx_document_tags_tag ON document_tags(tag);
-- §5.4 Blocks ----------------------------------------------------------------
CREATE TABLE blocks (
block_id TEXT PRIMARY KEY,
doc_id TEXT NOT NULL REFERENCES documents(doc_id) ON DELETE CASCADE,
kind TEXT NOT NULL,
heading_path_json TEXT NOT NULL,
ordinal INTEGER NOT NULL,
source_span_json TEXT NOT NULL,
payload_json TEXT NOT NULL
);
CREATE INDEX idx_blocks_doc_id ON blocks(doc_id);
-- §5.5 Chunks (FTS5 virtual table + triggers deferred to V002 / P2-1) -------
CREATE TABLE chunks (
chunk_id TEXT PRIMARY KEY,
doc_id TEXT NOT NULL REFERENCES documents(doc_id) ON DELETE CASCADE,
text TEXT NOT NULL,
heading_path_json TEXT NOT NULL,
section_label TEXT,
source_spans_json TEXT NOT NULL,
token_estimate INTEGER NOT NULL,
chunker_version TEXT NOT NULL,
policy_hash TEXT NOT NULL,
block_ids_json TEXT NOT NULL,
created_at TEXT NOT NULL
);
CREATE INDEX idx_chunks_doc_id ON chunks(doc_id);
CREATE INDEX idx_chunks_chunker_version ON chunks(chunker_version);
-- §5.6 Embedding records (P3 — table empty in P1, present for forward compat) -
CREATE TABLE embedding_records (
embedding_id TEXT PRIMARY KEY,
chunk_id TEXT NOT NULL REFERENCES chunks(chunk_id) ON DELETE CASCADE,
model_id TEXT NOT NULL,
model_version TEXT NOT NULL,
dimensions INTEGER NOT NULL,
lance_table TEXT NOT NULL,
created_at TEXT NOT NULL,
UNIQUE(chunk_id, model_id, model_version, dimensions)
);
CREATE INDEX idx_embed_chunk ON embedding_records(chunk_id);
CREATE INDEX idx_embed_model ON embedding_records(model_id, model_version, dimensions);
-- §5.7 Jobs / IngestRuns / Answers / EvalRuns -------------------------------
CREATE TABLE jobs (
job_id TEXT PRIMARY KEY,
kind TEXT NOT NULL,
status TEXT NOT NULL CHECK (status IN ('pending','running','succeeded','failed','canceled')),
payload_json TEXT NOT NULL,
progress_json TEXT,
error_json TEXT,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL,
finished_at TEXT
);
CREATE INDEX idx_jobs_status ON jobs(status);
CREATE INDEX idx_jobs_kind ON jobs(kind);
CREATE TABLE ingest_runs (
run_id TEXT PRIMARY KEY,
scope_json TEXT NOT NULL,
scanned INTEGER NOT NULL,
new_count INTEGER NOT NULL,
updated_count INTEGER NOT NULL,
skipped_count INTEGER NOT NULL,
error_count INTEGER NOT NULL,
duration_ms INTEGER NOT NULL,
started_at TEXT NOT NULL,
finished_at TEXT NOT NULL,
items_json TEXT
);
CREATE TABLE answers (
trace_id TEXT PRIMARY KEY,
query TEXT NOT NULL,
answer TEXT NOT NULL,
grounded INTEGER NOT NULL,
refusal_reason TEXT,
model_id TEXT NOT NULL,
model_provider TEXT NOT NULL,
embedding_model_id TEXT,
embedding_dimensions INTEGER,
prompt_template_version TEXT NOT NULL,
retrieval_mode TEXT NOT NULL,
retrieval_k INTEGER NOT NULL,
score_gate REAL NOT NULL,
top_score REAL NOT NULL,
chunks_returned INTEGER NOT NULL,
chunks_used INTEGER NOT NULL,
citations_json TEXT NOT NULL,
packed_chunks_json TEXT,
prompt_tokens INTEGER,
completion_tokens INTEGER,
latency_ms INTEGER,
created_at TEXT NOT NULL
);
CREATE INDEX idx_answers_created_at ON answers(created_at);
CREATE INDEX idx_answers_grounded ON answers(grounded);
CREATE TABLE eval_runs (
run_id TEXT PRIMARY KEY,
suite TEXT NOT NULL,
config_snapshot_json TEXT NOT NULL,
aggregate_json TEXT NOT NULL,
commit_hash TEXT,
created_at TEXT NOT NULL
);
CREATE TABLE eval_query_results (
run_id TEXT NOT NULL REFERENCES eval_runs(run_id) ON DELETE CASCADE,
query_id TEXT NOT NULL,
result_json TEXT NOT NULL,
PRIMARY KEY (run_id, query_id)
);