# kebab/crates/kebab-parse-pdf/Cargo.toml

# Crate identity. Every key written as `<key>.workspace = true` takes its
# value from the workspace root manifest instead of being set here.
[package]
name = "kebab-parse-pdf"
version.workspace = true
edition.workspace = true
license.workspace = true
repository.workspace = true
rust-version.workspace = true
description = "Text PDF extractor + scanned-page image extract helpers for the kebab pipeline (P7-1 + v0.20.0 sub-item 1)"

# Runtime dependencies, sorted alphabetically. Entries marked
# `workspace = true` use the spec declared in the workspace root manifest.
[dependencies]
anyhow = { workspace = true }
kebab-core = { path = "../kebab-core" }
# lopdf provides per-page text extraction:
# `lopdf::Document::extract_text(&[page])` is the only stable per-page API
# across the pdf-extract / lopdf pair (pdf-extract 0.7 still exposes only
# whole-document calls).
# pdf-extract is deliberately NOT a dependency here: its ~150 transitive
# crates (pom, postscript, type1-encoding-parser, …) buy us nothing at v1
# because we never call its whole-doc API. The future scanned-PDF OCR
# fallback can re-add it once it actually needs it.
lopdf = { workspace = true }
serde_json = { workspace = true }
time = { workspace = true }
tracing = { workspace = true }

# Dependencies needed only by tests, examples and benchmarks.
# `anyhow` is intentionally not repeated here: it is already declared in
# `[dependencies]` with the same workspace spec, and normal dependencies
# are available to test targets as well.
[dev-dependencies]
blake3 = { workspace = true }
kebab-parse-image = { path = "../kebab-parse-image" }
strsim = "0.11"

# Inherit the lint configuration from the workspace root manifest's
# `[workspace.lints]` table rather than declaring lints per crate.
[lints]
workspace = true