"""
Document parser factory.
[AC-AISVC-33, AC-AISVC-34, AC-AISVC-35] Factory for document parsers.

Design reference: progress.md Section 7.2 - DocumentParserFactory
"""

import logging
from pathlib import Path
from typing import Any, Type

from app.services.document.base import (
    DocumentParser,
    DocumentParseException,
    ParseResult,
    UnsupportedFormatError,
)
from app.services.document.excel_parser import CSVParser, ExcelParser
from app.services.document.pdf_parser import PDFParser, PDFPlumberParser
from app.services.document.text_parser import TextParser
from app.services.document.word_parser import WordParser

logger = logging.getLogger(__name__)


class DocumentParserFactory:
    """
    Factory for creating document parsers.
    [AC-AISVC-33, AC-AISVC-34, AC-AISVC-35] Auto-selects parser based on file extension.

    The registry is held in two class-level dicts and is populated lazily by
    ``_initialize()`` the first time any public classmethod is called.
    """

    # Parser name -> parser class.
    _parsers: dict[str, Type[DocumentParser]] = {}
    # Normalized extension (lowercase, leading dot) -> parser name.
    _extension_map: dict[str, str] = {}

    @classmethod
    def _initialize(cls) -> None:
        """Populate the default parsers and extension map (no-op once populated)."""
        if cls._parsers:
            return

        cls._parsers = {
            "pdf": PDFParser,
            "pdfplumber": PDFPlumberParser,
            "word": WordParser,
            "excel": ExcelParser,
            "csv": CSVParser,
            "text": TextParser,
        }

        cls._extension_map = {
            ".pdf": "pdf",
            ".docx": "word",
            ".xlsx": "excel",
            ".xls": "excel",
            ".csv": "csv",
            ".txt": "text",
            ".md": "text",
            ".markdown": "text",
            ".rst": "text",
            ".log": "text",
            ".json": "text",
            ".xml": "text",
            ".yaml": "text",
            ".yml": "text",
        }

    @staticmethod
    def _normalize_extension(extension: str) -> str:
        """Return *extension* lowercased and guaranteed to start with a dot."""
        normalized = extension.lower()
        if not normalized.startswith("."):
            normalized = f".{normalized}"
        return normalized

    @classmethod
    def _get_parser_class(cls, extension: str) -> Type[DocumentParser]:
        """
        Resolve the parser class registered for a file extension.

        Args:
            extension: File extension, with or without the leading dot,
                in any letter case.

        Returns:
            The parser class mapped to the normalized extension.

        Raises:
            UnsupportedFormatError: If no parser is mapped to the extension.
        """
        cls._initialize()

        normalized = cls._normalize_extension(extension)
        if normalized not in cls._extension_map:
            raise UnsupportedFormatError(normalized, cls.get_supported_extensions())

        return cls._parsers[cls._extension_map[normalized]]

    @classmethod
    def register_parser(
        cls,
        name: str,
        parser_class: Type[DocumentParser],
        extensions: list[str],
    ) -> None:
        """
        Register a new document parser.
        [AC-AISVC-33] Allows runtime registration of parsers.

        Args:
            name: Registry key for the parser; an existing entry with the
                same name is replaced.
            parser_class: Parser class to instantiate for the extensions.
            extensions: File extensions to route to this parser. Each may be
                given with or without the leading dot, in any letter case.
        """
        cls._initialize()
        cls._parsers[name] = parser_class
        for ext in extensions:
            # Store keys in the same normalized form that lookups use;
            # otherwise an extension registered as "epub" (no dot) would
            # never match the ".epub" key built at lookup time.
            cls._extension_map[cls._normalize_extension(ext)] = name
        logger.info(f"Registered document parser: {name} for extensions: {extensions}")

    @classmethod
    def get_supported_extensions(cls) -> list[str]:
        """
        Get all supported file extensions.
        [AC-AISVC-37] Returns list of supported formats.

        Returns:
            The keys of the extension map (lowercase, with leading dot).
        """
        cls._initialize()
        return list(cls._extension_map.keys())

    @classmethod
    def get_parser_for_extension(cls, extension: str) -> DocumentParser:
        """
        Get a parser instance for a file extension.
        [AC-AISVC-33] Creates appropriate parser based on extension.

        Args:
            extension: File extension, with or without the leading dot,
                in any letter case.

        Returns:
            A new parser instance constructed with no arguments.

        Raises:
            UnsupportedFormatError: If no parser is mapped to the extension.
        """
        return cls._get_parser_class(extension)()

    @classmethod
    def parse_file(
        cls,
        file_path: str | Path,
        parser_name: str | None = None,
        parser_config: dict[str, Any] | None = None,
    ) -> ParseResult:
        """
        Parse a document file.
        [AC-AISVC-33, AC-AISVC-34, AC-AISVC-35] Main entry point for parsing.

        Args:
            file_path: Path to the document file
            parser_name: Optional specific parser to use
            parser_config: Optional configuration for the parser

        Returns:
            ParseResult with extracted text and metadata

        Raises:
            UnsupportedFormatError: If file format is not supported
            DocumentParseException: If parsing fails
        """
        cls._initialize()

        path = Path(file_path)

        if parser_name:
            # An explicit parser name takes precedence over the file extension.
            if parser_name not in cls._parsers:
                raise DocumentParseException(
                    f"Unknown parser: {parser_name}",
                    file_path=str(path),
                    parser="factory",
                )
            parser_class = cls._parsers[parser_name]
        else:
            parser_class = cls._get_parser_class(path.suffix.lower())

        # Instantiate exactly once, passing the configuration (if any), rather
        # than building a default instance and then rebuilding it with config.
        parser = parser_class(**(parser_config or {}))

        return parser.parse(path)

    @classmethod
    def get_parser_info(cls) -> list[dict[str, Any]]:
        """
        Get information about available parsers.
        [AC-AISVC-37] Returns parser metadata.

        Returns:
            One dict per registered parser with the keys ``name``,
            ``display_name``, ``description`` and ``extensions``. Parsers
            without a built-in label fall back to their registry name and an
            empty description.
        """
        cls._initialize()

        # Loop-invariant lookup tables for the built-in parsers.
        display_names = {
            "pdf": "PDF 文档",
            "pdfplumber": "PDF 文档 (pdfplumber)",
            "word": "Word 文档",
            "excel": "Excel 电子表格",
            "csv": "CSV 文件",
            "text": "文本文件",
        }

        descriptions = {
            "pdf": "使用 PyMuPDF 解析 PDF 文档,速度快",
            "pdfplumber": "使用 pdfplumber 解析 PDF 文档,表格提取效果更好",
            "word": "解析 Word 文档 (.docx),保留段落结构",
            "excel": "解析 Excel 电子表格,支持多工作表",
            "csv": "解析 CSV 文件,自动检测编码",
            "text": "解析纯文本文件,支持多种编码",
        }

        info = []
        for name, parser_class in cls._parsers.items():
            # Allocate without running __init__ so no constructor arguments
            # are needed. NOTE(review): assumes get_supported_extensions()
            # does not rely on state set up in __init__ - confirm for
            # third-party parsers added via register_parser().
            temp_instance = parser_class.__new__(parser_class)
            extensions = temp_instance.get_supported_extensions()

            info.append({
                "name": name,
                "display_name": display_names.get(name, name),
                "description": descriptions.get(name, ""),
                "extensions": extensions,
            })

        return info


def parse_document(
    file_path: str | Path,
    parser_name: str | None = None,
    parser_config: dict[str, Any] | None = None,
) -> ParseResult:
    """
    Convenience function for parsing documents.
    [AC-AISVC-33] Simple entry point for document parsing.

    Delegates to ``DocumentParserFactory.parse_file`` with the same arguments.
    """
    return DocumentParserFactory.parse_file(file_path, parser_name, parser_config)


def get_supported_document_formats() -> list[str]:
    """
    Get list of supported document formats.
    [AC-AISVC-37] Returns supported format extensions.

    Delegates to ``DocumentParserFactory.get_supported_extensions``.
    """
    return DocumentParserFactory.get_supported_extensions()