kebab/migrations/V009__fts_korean_morphological.sql

-- V009__fts_korean_morphological.sql — Replace chunks_fts tokenizer: trigram → unicode61.
--
-- Per design §5.5 (chunks_fts virtual table + chunks_ai/ad/au triggers).
-- The CREATE VIRTUAL TABLE / CREATE TRIGGER block below is reproduced
-- VERBATIM from `docs/superpowers/specs/2026-04-27-kebab-final-form-design.md`
-- §5.5; CI diff-checks this against the design doc (test
-- `fts_v009_matches_design_section_5_5_verbatim` in
-- `crates/kebab-store-sqlite/tests/fts.rs`).
--
-- Tokenizer choice: unicode61 + pre-tokenized Korean column.
-- V007 trigram enabled substring matching for Korean ≥3 chars but
-- 2-char Korean queries (e.g. '한국', '서울') always returned 0 hits.
-- V009 adds `tokenized_korean_text TEXT` column to `chunks` — the ingest
-- path (S2+) runs lindera ko-dic morphological analysis and writes the
-- space-separated morpheme sequence to this column. The chunks_ai/chunks_au
-- triggers concatenate tokenized_korean_text with the raw text before
-- indexing into chunks_fts, so both Korean morphemes AND English tokens
-- are searchable via a single FTS query. English substring matching
-- (V007 ad-hoc feature) reverts to whole-token matching (V002 behavior).
-- corpus_revision is bumped so the in-process search cache is automatically
-- invalidated. See tasks/HOTFIXES.md (2026-05-28) for the deviation log.
--
-- chunks_fts is a shadow of chunks (NOT contentless — V002 DDL has no
-- `content=''`); this migration drops the old shadow, recreates it with
-- the new tokenizer, recreates the sync triggers (CASE expression for
-- tokenized_korean_text), and backfills from `chunks`. The `chunks` table
-- and embeddings are untouched, so users do NOT need to re-ingest after
-- upgrading — the migration is fully automatic. tokenized_korean_text
-- starts as NULL for all pre-V009 rows; a subsequent kebab ingest
-- (S2+ path) will fill it in via UPDATE, firing chunks_au to re-index.

-- ── Korean morphological tokenizer (V009) ─────────────────────────────

-- chunks 테이블에 한국어 형태소 분해된 text 를 저장할 열 추가.
ALTER TABLE chunks ADD COLUMN tokenized_korean_text TEXT;

-- 기존 chunks_fts 제거 (trigram tokenizer).
DROP TRIGGER IF EXISTS chunks_au;
DROP TRIGGER IF EXISTS chunks_ad;
DROP TRIGGER IF EXISTS chunks_ai;
DROP TABLE IF EXISTS chunks_fts;

-- ── §5.5 verbatim block ────────────────────────────────────────────────

CREATE VIRTUAL TABLE chunks_fts USING fts5(
  chunk_id     UNINDEXED,
  doc_id       UNINDEXED,
  heading_path,
  text,
  tokenize = 'unicode61'
);

CREATE TRIGGER chunks_ai AFTER INSERT ON chunks BEGIN
  INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
  VALUES (new.chunk_id, new.doc_id, new.heading_path_json,
          CASE WHEN new.tokenized_korean_text IS NOT NULL
               THEN new.tokenized_korean_text || ' ' || new.text
               ELSE new.text
          END);
END;
CREATE TRIGGER chunks_ad AFTER DELETE ON chunks BEGIN
  DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id;
END;
CREATE TRIGGER chunks_au AFTER UPDATE ON chunks BEGIN
  DELETE FROM chunks_fts WHERE chunk_id = old.chunk_id;
  INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
  VALUES (new.chunk_id, new.doc_id, new.heading_path_json,
          CASE WHEN new.tokenized_korean_text IS NOT NULL
               THEN new.tokenized_korean_text || ' ' || new.text
               ELSE new.text
          END);
END;

-- ── End §5.5 verbatim block ───────────────────────────────────────────

-- One-shot backfill from existing chunks. tokenized_korean_text is NULL
-- for all pre-V009 rows so the CASE expression falls to the ELSE branch
-- (raw text only). Subsequent re-ingest via S2+ will UPDATE
-- tokenized_korean_text and fire chunks_au to re-index with morphemes.
INSERT INTO chunks_fts(chunk_id, doc_id, heading_path, text)
  SELECT chunk_id, doc_id, heading_path_json,
         CASE WHEN tokenized_korean_text IS NOT NULL
              THEN tokenized_korean_text || ' ' || text
              ELSE text
         END
  FROM chunks;

-- Bump corpus_revision so the in-process LRU search cache is invalidated.
-- kv table columns are `key` TEXT + `value` TEXT (V004__kv.sql).
-- value is TEXT so CAST is required for integer arithmetic.
UPDATE kv SET value = CAST(CAST(value AS INTEGER) + 1 AS TEXT) WHERE key = 'corpus_revision';