Spaces:

MCP-1st-Birthday
/

contract-guardian

Running

contract-guardian / agent /pdf_processor.py

sbcesar

Project Finished :D

80e1d6c 23 days ago

1.23 kB

	#!/usr/bin/env python3
	"""
	agent/pdf_processor.py
	Procesador robusto de PDF usando pypdf.
	"""
	import logging
	from pathlib import Path
	import pypdf

	logger = logging.getLogger(__name__)

	class PDFProcessor:
	    """Extract plain text from PDF files using pypdf."""

	    # Minimum number of characters (after trimming surrounding whitespace)
	    # the extracted text must have for the PDF to be accepted as text-based.
	    MIN_TEXT_CHARS = 50

	    def __init__(self):
	        """Create a processor; it holds no configuration or state."""
	        pass

	    def extract_text(self, pdf_path: str) -> str:
	        """Return the text of every page of a PDF, joined with newlines.

	        Pages for which pypdf yields no text are skipped.

	        Args:
	            pdf_path: Filesystem path of the PDF to read.

	        Returns:
	            The extracted page texts joined by ``"\\n"``.

	        Raises:
	            FileNotFoundError: If ``pdf_path`` does not exist.
	            ValueError: If the extracted text, ignoring surrounding
	                whitespace, is shorter than ``MIN_TEXT_CHARS`` characters
	                (the error message attributes this to an image-only or
	                empty PDF).
	            Exception: Any error raised while opening or parsing the file
	                is logged and re-raised unchanged.
	        """
	        path = Path(pdf_path)
	        if not path.exists():
	            raise FileNotFoundError(f"No se encuentra el PDF: {pdf_path}")

	        try:
	            logger.info("📄 Extrayendo texto de: %s", path.name)
	            with open(path, "rb") as f:
	                reader = pypdf.PdfReader(f)
	                # Consume the pages while the file is still open.
	                extracted = (page.extract_text() for page in reader.pages)
	                page_texts = [page_text for page_text in extracted if page_text]

	            full_text = "\n".join(page_texts)
	            # Strip before measuring: pages that yield only whitespace must
	            # not let an effectively empty document pass the check.
	            if len(full_text.strip()) < self.MIN_TEXT_CHARS:
	                raise ValueError("El PDF parece ser una imagen o estar vacío.")

	            return full_text

	        except Exception as e:
	            logger.error("Error leyendo PDF: %s", e)
	            # Bare raise re-raises the active exception as-is.
	            raise

	# Script entry point: running this file directly only prints a confirmation
	# that the module loaded; no PDF is processed.
	if __name__ == "__main__":
	    print("✅ PDF Processor cargado")