Spaces:

build-small-hackathon
/

draftme

Running

draftme / core /pdf_reader.py

Upload 105 files

7d2fea2 verified 7 days ago

836 Bytes

	import re
	import tempfile
	import unicodedata
	from pathlib import Path

	import fitz


	def read_pdf(source: bytes | bytearray | memoryview | str | Path) -> tuple[str, int]:
	    """Extract the text of a PDF supplied as raw bytes or as a filesystem path.

	    Args:
	        source: The PDF content as a bytes-like object, or a path
	            (``str`` or ``Path``) to a PDF file.

	    Returns:
	        The result of ``_read_pdf_path`` for the file: a tuple of the
	        extracted text and the number of pages.

	    Raises:
	        ValueError: Propagated from ``_read_pdf_path`` when the PDF yields
	            no extractable text.
	    """
	    if isinstance(source, (bytes, bytearray, memoryview)):
	        # Write the data to a regular file inside a private temporary
	        # directory and close it before reading. A NamedTemporaryFile
	        # cannot be reopened by name while it is still open on Windows.
	        with tempfile.TemporaryDirectory() as tmp_dir:
	            tmp_path = Path(tmp_dir) / "source.pdf"
	            tmp_path.write_bytes(source)
	            return _read_pdf_path(tmp_path)
	    return _read_pdf_path(Path(source))


	def _read_pdf_path(path: Path) -> tuple[str, int]:
	    """Extract and normalise the text of the PDF located at *path*.

	    The text of every page is joined with a blank line between pages,
	    NFKC-normalised, and whitespace-compacted: runs of spaces/tabs become
	    a single space, and three or more consecutive newlines become two.

	    Args:
	        path: Location of the PDF file to open.

	    Returns:
	        A tuple of the cleaned text and the number of pages read.

	    Raises:
	        ValueError: If no text remains after normalisation.
	    """
	    with fitz.open(path) as pdf:
	        pages = [page.get_text("text") or "" for page in pdf]

	    text = unicodedata.normalize("NFKC", "\n\n".join(pages))
	    text = re.sub(r"[ \t]+", " ", text)
	    # After the collapse above, a whitespace-only line still holds a single
	    # space, which would stop the blank-line collapse below from matching.
	    # Drop the space that precedes each newline first.
	    text = re.sub(r" \n", "\n", text)
	    text = re.sub(r"\n{3,}", "\n\n", text).strip()
	    if not text:
	        raise ValueError("No extractable text found in PDF. The file may be scanned.")
	    return text, len(pages)