File size: 1,231 Bytes
80e1d6c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/bin/env python3
"""
agent/pdf_processor.py
PDF text extraction built on pypdf.
"""
import logging
from pathlib import Path
import pypdf

logger = logging.getLogger(__name__)

class PDFProcessor:
    """Extract plain text from PDF files using pypdf.

    Attributes:
        min_text_length: Minimum number of characters the extracted text
            must contain; shorter results are rejected as an empty or
            image-only document.
    """

    def __init__(self, min_text_length: int = 50):
        """Create a processor.

        Args:
            min_text_length: Smallest acceptable length of the extracted
                text. Defaults to 50, the threshold that was previously
                hard-coded in ``extract_text``.
        """
        self.min_text_length = min_text_length

    def extract_text(self, pdf_path: str) -> str:
        """Return the text of all pages of *pdf_path*, joined by newlines.

        Pages for which pypdf yields no text are skipped.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The concatenated text of the pages that produced any text.

        Raises:
            FileNotFoundError: If *pdf_path* does not exist.
            ValueError: If fewer than ``min_text_length`` characters were
                extracted.
            Exception: Any error raised while opening or parsing the file
                is logged and re-raised unchanged.
        """
        path = Path(pdf_path)
        if not path.exists():
            raise FileNotFoundError(f"No se encuentra el PDF: {pdf_path}")

        try:
            logger.info("📄 Extrayendo texto de: %s", path.name)
            with open(path, "rb") as f:
                reader = pypdf.PdfReader(f)
                # Extract while the file is still open: the reader pulls
                # page content from the underlying stream on demand.
                page_texts = [page.extract_text() for page in reader.pages]

            full_text = "\n".join(chunk for chunk in page_texts if chunk)
            if len(full_text) < self.min_text_length:
                raise ValueError("El PDF parece ser una imagen o estar vacío.")

            return full_text

        except Exception as e:
            logger.error("Error leyendo PDF: %s", e)
            # Bare raise re-raises the active exception without adding a
            # redundant frame to its traceback.
            raise

def _main() -> None:
    """Print the confirmation line shown when this file is run as a script."""
    print("✅ PDF Processor cargado")


# Running the file directly only confirms that the module loads; no PDF is
# processed here.
if __name__ == "__main__":
    _main()