refactor(wire): ExpansionProgress 이벤트 + 렌더 제거

IngestEvent::ExpansionProgress variant + 직렬화 테스트 제거(AssetChunked/
AssetTimings 유지). CLI/TUI 의 expansion 렌더 제거, AssetTimings 한 줄에서
expand 세그먼트 제거. ingest_progress.v1 schema 의 expansion_progress kind
제거, expansion_ms 설명을 "값 0 유지"로 갱신.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-02 21:37:44 +00:00
parent 21e02d8a93
commit a48c405826
4 changed files with 18 additions and 67 deletions

View File

@@ -50,18 +50,16 @@ pub struct AggregateCounts {
/// < ( AssetStarted /// < ( AssetStarted
/// [< (PdfOcrStarted < PdfOcrFinished)*] /// [< (PdfOcrStarted < PdfOcrFinished)*]
/// [< AssetChunked] /// [< AssetChunked]
/// [< ExpansionProgress*]
/// [< AssetTimings] /// [< AssetTimings]
/// < AssetFinished )* /// < AssetFinished )*
/// < (Completed | Aborted) /// < (Completed | Aborted)
/// ``` /// ```
/// ///
/// `[]` = optional. `PdfOcr*` is per-PDF asset only (v0.20.0 sub-item 1). /// `[]` = optional. `PdfOcr*` is per-PDF asset only (v0.20.0 sub-item 1).
/// `AssetChunked` / `ExpansionProgress` / `AssetTimings` are the v0.24.0 /// `AssetChunked` / `AssetTimings` are the v0.24.0 asset-internal phase
/// asset-internal phase events: `AssetChunked` fires once right after /// events: `AssetChunked` fires once right after chunking (markdown /
/// chunking (markdown / image / PDF); `ExpansionProgress` is a throttled /// image / PDF); `AssetTimings` reports per-phase wall-clock once
/// counter through the alias-expansion loop (markdown, expansion enabled /// (markdown only).
/// only); `AssetTimings` reports per-phase wall-clock once (markdown only).
/// ///
/// Embed-batch events (`embed_batch_started` / `embed_batch_finished` /// Embed-batch events (`embed_batch_started` / `embed_batch_finished`
/// in §2.4a) are reserved for a future iteration and are not emitted /// in §2.4a) are reserved for a future iteration and are not emitted
@@ -98,26 +96,14 @@ pub enum IngestEvent {
/// `idx/total` while its per-chunk phases churn. `chunks` is the chunk /// `idx/total` while its per-chunk phases churn. `chunks` is the chunk
/// count for asset `idx`. /// count for asset `idx`.
AssetChunked { idx: u32, total: u32, chunks: u32 }, AssetChunked { idx: u32, total: u32, chunks: u32 },
/// v0.24.0 (additive): throttled progress through the per-chunk
/// expansion (alias-LLM) loop — the slowest inner phase for large
/// documents (~14s per chunk against a remote GPU Ollama). `done` is
/// the number of chunks processed so far (cache hits included, so the
/// counter still advances on a warm re-run); `chunks` is the asset's
/// total chunk count. Emitted at most every 25 chunks or once per
/// second (see the loop in `ingest_one_asset`), plus a final
/// `done == chunks` frame.
ExpansionProgress {
idx: u32,
total: u32,
done: u32,
chunks: u32,
},
/// v0.24.0 (additive): per-phase wall-clock (milliseconds) for asset /// v0.24.0 (additive): per-phase wall-clock (milliseconds) for asset
/// `idx`, emitted once the asset's markdown pipeline finishes. Lets a /// `idx`, emitted once the asset's markdown pipeline finishes. Lets a
/// user see *where* the time went (parse / chunk / expansion / embed / /// user see *where* the time went (parse / chunk / embed / store)
/// store) without parsing logs. Only the markdown path emits this; the /// without parsing logs. Only the markdown path emits this; the
/// image / PDF paths surface `AssetChunked` but skip phase timing (their /// image / PDF paths surface `AssetChunked` but skip phase timing (their
/// phase shapes differ — OCR / caption rather than expansion). /// phase shapes differ — OCR / caption). `expansion_ms` is retained for
/// wire compatibility but is always 0 since doc-side expansion was
/// removed (HOTFIXES 2026-06-03).
AssetTimings { AssetTimings {
idx: u32, idx: u32,
total: u32, total: u32,
@@ -265,26 +251,6 @@ mod tests {
); );
} }
#[test]
fn expansion_progress_serializes_with_discriminator() {
let ev = IngestEvent::ExpansionProgress {
idx: 1,
total: 5,
done: 25,
chunks: 200,
};
let v = serde_json::to_value(&ev).unwrap();
assert_eq!(
v.get("kind").and_then(|s| s.as_str()),
Some("expansion_progress")
);
assert_eq!(v.get("done").and_then(serde_json::Value::as_u64), Some(25));
assert_eq!(
v.get("chunks").and_then(serde_json::Value::as_u64),
Some(200)
);
}
#[test] #[test]
fn asset_timings_serializes_all_phase_fields() { fn asset_timings_serializes_all_phase_fields() {
let ev = IngestEvent::AssetTimings { let ev = IngestEvent::AssetTimings {

View File

@@ -157,11 +157,11 @@ impl ProgressDisplay {
// in Completed handles the final state. No per-asset bar update // in Completed handles the final state. No per-asset bar update
// here avoids the duplicate-frame artifact in TTY scrollback. // here avoids the duplicate-frame artifact in TTY scrollback.
} }
// v0.24.0: asset-internal phase visibility. AssetChunked / // v0.24.0: asset-internal phase visibility. AssetChunked uses the
// ExpansionProgress use the bar *message* (live sub-progress for // bar *message* (live sub-progress for the current asset) —
// the current asset) — distinct from the per-file position draw, // distinct from the per-file position draw, so a single large
// so a single large document no longer looks frozen. AssetTimings // document no longer looks frozen. AssetTimings prints a one-line
// prints a one-line breakdown when the asset finishes. // breakdown when the asset finishes.
IngestEvent::AssetChunked { idx, total, chunks } => { IngestEvent::AssetChunked { idx, total, chunks } => {
if let Some(bar) = self.bar.as_ref() { if let Some(bar) = self.bar.as_ref() {
bar.set_message(format!("{chunks} chunks")); bar.set_message(format!("{chunks} chunks"));
@@ -171,20 +171,9 @@ impl ProgressDisplay {
let _ = writeln!(err, "ingest: {idx}/{total} → {chunks} chunks"); let _ = writeln!(err, "ingest: {idx}/{total} → {chunks} chunks");
} }
} }
IngestEvent::ExpansionProgress {
done, chunks, ..
} => {
if let Some(bar) = self.bar.as_ref() {
bar.set_message(format!("별칭 확장 {done}/{chunks}"));
}
// Non-TTY: suppressed by default — throttled though it is, one
// line per emit would still spam CI logs. The bar message
// covers the interactive case; --json carries every frame.
}
IngestEvent::AssetTimings { IngestEvent::AssetTimings {
parse_ms, parse_ms,
chunk_ms, chunk_ms,
expansion_ms,
embed_ms, embed_ms,
store_ms, store_ms,
.. ..
@@ -196,10 +185,9 @@ impl ProgressDisplay {
let mut err = std::io::stderr().lock(); let mut err = std::io::stderr().lock();
let _ = writeln!( let _ = writeln!(
err, err,
" ⏱ parse {} · chunk {} · expand {} · embed {} · store {}", " ⏱ parse {} · chunk {} · embed {} · store {}",
fmt_ms(*parse_ms), fmt_ms(*parse_ms),
fmt_ms(*chunk_ms), fmt_ms(*chunk_ms),
fmt_ms(*expansion_ms),
fmt_ms(*embed_ms), fmt_ms(*embed_ms),
fmt_ms(*store_ms), fmt_ms(*store_ms),
); );
@@ -289,7 +277,7 @@ fn emit_json(event: &IngestEvent) -> anyhow::Result<()> {
/// Render a phase duration (milliseconds) compactly for the human-mode /// Render a phase duration (milliseconds) compactly for the human-mode
/// `AssetTimings` line: `< 1000ms` stays in `ms`, larger spans collapse to /// `AssetTimings` line: `< 1000ms` stays in `ms`, larger spans collapse to
/// one-decimal seconds so a 45-second expansion reads `45.0s`, not `45000ms`. /// one-decimal seconds so a 45-second embed reads `45.0s`, not `45000ms`.
fn fmt_ms(ms: u64) -> String { fn fmt_ms(ms: u64) -> String {
if ms >= 1000 { if ms >= 1000 {
format!("{:.1}s", ms as f64 / 1000.0) format!("{:.1}s", ms as f64 / 1000.0)

View File

@@ -160,7 +160,6 @@ fn apply_event(state: &mut IngestState, event: IngestEvent) {
// per-asset counters, not sub-asset phase progress, so these are // per-asset counters, not sub-asset phase progress, so these are
// no-ops here (the CLI / --json surfaces render them). // no-ops here (the CLI / --json surfaces render them).
| IngestEvent::AssetChunked { .. } | IngestEvent::AssetChunked { .. }
| IngestEvent::ExpansionProgress { .. }
| IngestEvent::AssetTimings { .. } => {} | IngestEvent::AssetTimings { .. } => {}
} }
} }

View File

@@ -15,7 +15,6 @@
"asset_started", "asset_started",
"asset_finished", "asset_finished",
"asset_chunked", "asset_chunked",
"expansion_progress",
"asset_timings", "asset_timings",
"embed_batch_started", "embed_batch_started",
"embed_batch_finished", "embed_batch_finished",
@@ -36,11 +35,10 @@
"enum": ["new", "updated", "skipped", "error"], "enum": ["new", "updated", "skipped", "error"],
"description": "asset_finished: per-asset outcome (mirrors `ingest_report.v1.items[].kind`)." "description": "asset_finished: per-asset outcome (mirrors `ingest_report.v1.items[].kind`)."
}, },
"chunks": { "type": "integer", "minimum": 0, "description": "asset_finished / asset_chunked / expansion_progress (v0.24.0): chunk count produced for this asset." }, "chunks": { "type": "integer", "minimum": 0, "description": "asset_finished / asset_chunked (v0.24.0): chunk count produced for this asset." },
"done": { "type": "integer", "minimum": 0, "description": "expansion_progress (v0.24.0, additive): chunks processed so far in the per-chunk alias-expansion loop (cache hits included). Throttled: emitted at most every 25 chunks or once per second, plus a final frame where done == chunks." },
"parse_ms": { "type": "integer", "minimum": 0, "description": "asset_timings (v0.24.0, additive): parse phase wall-clock (ms). Markdown path only." }, "parse_ms": { "type": "integer", "minimum": 0, "description": "asset_timings (v0.24.0, additive): parse phase wall-clock (ms). Markdown path only." },
"chunk_ms": { "type": "integer", "minimum": 0, "description": "asset_timings (v0.24.0, additive): chunk phase wall-clock (ms). Markdown path only." }, "chunk_ms": { "type": "integer", "minimum": 0, "description": "asset_timings (v0.24.0, additive): chunk phase wall-clock (ms). Markdown path only." },
"expansion_ms": { "type": "integer", "minimum": 0, "description": "asset_timings (v0.24.0, additive): alias-expansion phase wall-clock (ms). Markdown path only; 0 when expansion is disabled." }, "expansion_ms": { "type": "integer", "minimum": 0, "description": "asset_timings (v0.24.0, additive): retained for wire compatibility but always 0 — doc-side expansion was removed (HOTFIXES 2026-06-03)." },
"embed_ms": { "type": "integer", "minimum": 0, "description": "asset_timings (v0.24.0, additive): embed + vector phase wall-clock (ms) — embedding, vector upsert, and stale-vector purge. Markdown path only." }, "embed_ms": { "type": "integer", "minimum": 0, "description": "asset_timings (v0.24.0, additive): embed + vector phase wall-clock (ms) — embedding, vector upsert, and stale-vector purge. Markdown path only." },
"store_ms": { "type": "integer", "minimum": 0, "description": "asset_timings (v0.24.0, additive): SQLite persist phase wall-clock (ms) — put_asset/document/blocks/chunks only. Markdown path only." }, "store_ms": { "type": "integer", "minimum": 0, "description": "asset_timings (v0.24.0, additive): SQLite persist phase wall-clock (ms) — put_asset/document/blocks/chunks only. Markdown path only." },
"n_chunks": { "type": "integer", "minimum": 0, "description": "embed_batch_started / embed_batch_finished: chunks in this embedding batch." }, "n_chunks": { "type": "integer", "minimum": 0, "description": "embed_batch_started / embed_batch_finished: chunks in this embedding batch." },