# kebab/fixtures/golden_queries.yaml

# Golden query suite for `kebab eval run` (P5-1 / P5-2 / fb-39).
#
# Top-level: list of queries. Required fields: `id`, `query`. All
# others are optional and default to empty / null.
#
# Curators: `expected_doc_ids` and `expected_chunk_ids` MUST refer to
# real rows in the active workspace's SQLite store at run time. Stale
# references make the runner bail at start. The shipped template
# leaves them empty so the file is loadable on any fresh workspace —
# fill them in after a `kebab ingest` to enable the metrics that
# require ground truth (P5-2 + fb-39):
#
#   - `expected_chunk_ids` →  hit_at_k, MRR, precision_at_k_chunk (fb-39)
#   - `expected_doc_ids`   →  recall_at_k_doc
#
# `precision_at_k_chunk` (fb-39): the fraction of the top-k retrieved
# hits whose `chunk_id` appears in `expected_chunk_ids`. The denominator
# is always k, so returning fewer than k hits counts as precision loss.
# Queries with empty `expected_chunk_ids` are excluded from this metric.
#
# `must_contain` / `forbidden` drive the rule-based groundedness
# metric (P5-2).

# g001: Korean-language query about adding Cargo workspace members.
# Only groundedness terms are set; no expected doc/chunk IDs yet.
- id: g001
  query: 'Cargo workspace 멤버 추가하는 법'
  lang: ko
  must_contain:
    - '[workspace]'
    - 'members'
  difficulty: easy

# g002: English-language query about Rust ownership.
# Only groundedness terms are set; no expected doc/chunk IDs yet.
- id: g002
  query: 'What is Rust ownership?'
  lang: en
  must_contain:
    - 'borrow'
    - 'lifetime'
  difficulty: easy

# g003: Korean-language query about Markdown chunking rules.
# Lists both a required term and a forbidden term for the
# groundedness check; no expected doc/chunk IDs yet.
- id: g003
  query: 'Markdown chunking 규칙은?'
  lang: ko
  must_contain:
    - 'heading'
  forbidden:
    - 'embedding'
  difficulty: medium

# g004: English-language query about FTS5 tokenization of Korean text.
# Only groundedness terms are set; no expected doc/chunk IDs yet.
- id: g004
  query: 'How does FTS5 tokenization work for Korean text?'
  lang: en
  must_contain:
    - 'unicode61'
    - 'tokenizer'
  difficulty: medium

# g005: Korean-language query about how RAG citation verification works.
# Only groundedness terms are set; no expected doc/chunk IDs yet.
- id: g005
  query: 'RAG citation 검증은 어떻게 동작?'
  lang: ko
  must_contain:
    - 'citation'
    - 'marker'
  difficulty: hard