""" PDF document parser implementation. [AC-AISVC-33] PDF parsing using PyMuPDF (fitz). Extracts text content from PDF files. """ import logging from pathlib import Path from typing import Any from app.services.document.base import ( DocumentParseException, DocumentParser, ParseResult, ) logger = logging.getLogger(__name__) class PDFParser(DocumentParser): """ Parser for PDF documents. [AC-AISVC-33] Uses PyMuPDF for text extraction. """ def __init__(self, extract_images: bool = False, **kwargs: Any): self._extract_images = extract_images self._extra_config = kwargs self._fitz = None def _get_fitz(self): """Lazy import of PyMuPDF.""" if self._fitz is None: try: import fitz self._fitz = fitz except ImportError: raise DocumentParseException( "PyMuPDF (fitz) not installed. Install with: pip install pymupdf", parser="pdf" ) return self._fitz def parse(self, file_path: str | Path) -> ParseResult: """ Parse a PDF document and extract text content. [AC-AISVC-33] Extracts text from all pages. """ path = Path(file_path) if not path.exists(): raise DocumentParseException( f"File not found: {path}", file_path=str(path), parser="pdf" ) if not self.supports_extension(path.suffix): raise DocumentParseException( f"Unsupported file extension: {path.suffix}", file_path=str(path), parser="pdf" ) fitz = self._get_fitz() try: doc = fitz.open(path) text_parts = [] page_count = len(doc) for page_num in range(page_count): page = doc[page_num] text = page.get_text() if text.strip(): text_parts.append(f"[Page {page_num + 1}]\n{text}") doc.close() full_text = "\n\n".join(text_parts) file_size = path.stat().st_size logger.info( f"Parsed PDF: {path.name}, pages={page_count}, " f"chars={len(full_text)}, size={file_size}" ) return ParseResult( text=full_text, source_path=str(path), file_size=file_size, page_count=page_count, metadata={ "format": "pdf", "page_count": page_count, } ) except DocumentParseException: raise except Exception as e: raise DocumentParseException( f"Failed to parse PDF: {e}", file_path=str(path), parser="pdf", details={"error": str(e)} ) def get_supported_extensions(self) -> list[str]: """Get supported file extensions.""" return [".pdf"] class PDFPlumberParser(DocumentParser): """ Alternative PDF parser using pdfplumber. [AC-AISVC-33] Uses pdfplumber for text extraction. pdfplumber is better for table extraction but slower than PyMuPDF. """ def __init__(self, extract_tables: bool = True, **kwargs: Any): self._extract_tables = extract_tables self._extra_config = kwargs self._pdfplumber = None def _get_pdfplumber(self): """Lazy import of pdfplumber.""" if self._pdfplumber is None: try: import pdfplumber self._pdfplumber = pdfplumber except ImportError: raise DocumentParseException( "pdfplumber not installed. Install with: pip install pdfplumber", parser="pdfplumber" ) return self._pdfplumber def parse(self, file_path: str | Path) -> ParseResult: """ Parse a PDF document and extract text content. [AC-AISVC-33] Extracts text and optionally tables. """ path = Path(file_path) if not path.exists(): raise DocumentParseException( f"File not found: {path}", file_path=str(path), parser="pdfplumber" ) pdfplumber = self._get_pdfplumber() try: text_parts = [] page_count = 0 with pdfplumber.open(path) as pdf: page_count = len(pdf.pages) for page_num, page in enumerate(pdf.pages): text = page.extract_text() or "" if self._extract_tables: tables = page.extract_tables() for table in tables: table_text = self._format_table(table) text += f"\n\n{table_text}" if text.strip(): text_parts.append(f"[Page {page_num + 1}]\n{text}") full_text = "\n\n".join(text_parts) file_size = path.stat().st_size logger.info( f"Parsed PDF (pdfplumber): {path.name}, pages={page_count}, " f"chars={len(full_text)}, size={file_size}" ) return ParseResult( text=full_text, source_path=str(path), file_size=file_size, page_count=page_count, metadata={ "format": "pdf", "parser": "pdfplumber", "page_count": page_count, } ) except DocumentParseException: raise except Exception as e: raise DocumentParseException( f"Failed to parse PDF: {e}", file_path=str(path), parser="pdfplumber", details={"error": str(e)} ) def _format_table(self, table: list[list[str | None]]) -> str: """Format a table as text.""" if not table: return "" lines = [] for row in table: cells = [str(cell) if cell else "" for cell in row] lines.append(" | ".join(cells)) return "\n".join(lines) def get_supported_extensions(self) -> list[str]: """Get supported file extensions.""" return [".pdf"]