""" Base document parser interface. [AC-AISVC-33] Abstract interface for document parsers. Design reference: progress.md Section 7.2 - DocumentParser interface - parse(file_path) -> str - get_supported_extensions() -> list[str] """ from abc import ABC, abstractmethod from dataclasses import dataclass, field from pathlib import Path from typing import Any @dataclass class PageText: """ Text content from a single page. """ page: int text: str @dataclass class ParseResult: """ Result from document parsing. [AC-AISVC-33] Contains parsed text and metadata. """ text: str source_path: str file_size: int page_count: int | None = None metadata: dict[str, Any] = field(default_factory=dict) pages: list[PageText] = field(default_factory=list) class DocumentParser(ABC): """ Abstract base class for document parsers. [AC-AISVC-33] Provides unified interface for different document formats. """ @abstractmethod def parse(self, file_path: str | Path) -> ParseResult: """ Parse a document and extract text content. [AC-AISVC-33] Returns parsed text content. Args: file_path: Path to the document file. Returns: ParseResult with extracted text and metadata. Raises: DocumentParseException: If parsing fails. """ pass @abstractmethod def get_supported_extensions(self) -> list[str]: """ Get list of supported file extensions. [AC-AISVC-37] Returns supported format list. Returns: List of file extensions (e.g., [".pdf", ".txt"]) """ pass def supports_extension(self, extension: str) -> bool: """ Check if this parser supports a given file extension. [AC-AISVC-37] Validates file format support. Args: extension: File extension to check. Returns: True if extension is supported. """ normalized = extension.lower() if not normalized.startswith("."): normalized = f".{normalized}" return normalized in self.get_supported_extensions() class DocumentParseException(Exception): """Exception raised when document parsing fails.""" def __init__( self, message: str, file_path: str = "", parser: str = "", details: dict[str, Any] | None = None ): self.file_path = file_path self.parser = parser self.details = details or {} super().__init__(f"[{parser}] {message}" if parser else message) class UnsupportedFormatError(DocumentParseException): """Exception raised when file format is not supported.""" def __init__(self, extension: str, supported: list[str]): super().__init__( f"Unsupported file format: {extension}. " f"Supported formats: {', '.join(supported)}", parser="format_checker" ) self.extension = extension self.supported_formats = supported