107 lines
2.9 KiB
Python
107 lines
2.9 KiB
Python
"""
|
|
Base document parser interface.
|
|
[AC-AISVC-33] Abstract interface for document parsers.
|
|
|
|
Design reference: progress.md Section 7.2 - DocumentParser interface
|
|
- parse(file_path) -> str
|
|
- get_supported_extensions() -> list[str]
|
|
"""
|
|
|
|
from abc import ABC, abstractmethod
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
@dataclass
|
|
class ParseResult:
|
|
"""
|
|
Result from document parsing.
|
|
[AC-AISVC-33] Contains parsed text and metadata.
|
|
"""
|
|
text: str
|
|
source_path: str
|
|
file_size: int
|
|
page_count: int | None = None
|
|
metadata: dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
class DocumentParser(ABC):
|
|
"""
|
|
Abstract base class for document parsers.
|
|
[AC-AISVC-33] Provides unified interface for different document formats.
|
|
"""
|
|
|
|
@abstractmethod
|
|
def parse(self, file_path: str | Path) -> ParseResult:
|
|
"""
|
|
Parse a document and extract text content.
|
|
[AC-AISVC-33] Returns parsed text content.
|
|
|
|
Args:
|
|
file_path: Path to the document file.
|
|
|
|
Returns:
|
|
ParseResult with extracted text and metadata.
|
|
|
|
Raises:
|
|
DocumentParseException: If parsing fails.
|
|
"""
|
|
pass
|
|
|
|
@abstractmethod
|
|
def get_supported_extensions(self) -> list[str]:
|
|
"""
|
|
Get list of supported file extensions.
|
|
[AC-AISVC-37] Returns supported format list.
|
|
|
|
Returns:
|
|
List of file extensions (e.g., [".pdf", ".txt"])
|
|
"""
|
|
pass
|
|
|
|
def supports_extension(self, extension: str) -> bool:
|
|
"""
|
|
Check if this parser supports a given file extension.
|
|
[AC-AISVC-37] Validates file format support.
|
|
|
|
Args:
|
|
extension: File extension to check.
|
|
|
|
Returns:
|
|
True if extension is supported.
|
|
"""
|
|
normalized = extension.lower()
|
|
if not normalized.startswith("."):
|
|
normalized = f".{normalized}"
|
|
return normalized in self.get_supported_extensions()
|
|
|
|
|
|
class DocumentParseException(Exception):
|
|
"""Exception raised when document parsing fails."""
|
|
|
|
def __init__(
|
|
self,
|
|
message: str,
|
|
file_path: str = "",
|
|
parser: str = "",
|
|
details: dict[str, Any] | None = None
|
|
):
|
|
self.file_path = file_path
|
|
self.parser = parser
|
|
self.details = details or {}
|
|
super().__init__(f"[{parser}] {message}" if parser else message)
|
|
|
|
|
|
class UnsupportedFormatError(DocumentParseException):
    """Exception raised when file format is not supported.

    Attributes:
        extension: The extension that was rejected.
        supported_formats: The supported extensions, as a list owned by
            the exception.
    """

    def __init__(self, extension: str, supported: list[str]):
        """
        Args:
            extension: The unsupported file extension.
            supported: Extensions that are supported; listed in the
                error message.
        """
        # Materialize once: the stored list is independent of the caller's
        # and the message is built from exactly what is stored.
        supported_formats = list(supported)
        super().__init__(
            f"Unsupported file format: {extension}. "
            f"Supported formats: {', '.join(supported_formats)}",
            parser="format_checker",
        )
        self.extension = extension
        self.supported_formats = supported_formats

    def __reduce__(self) -> tuple[Any, ...]:
        """Support pickle/copy by rebuilding from the constructor arguments.

        The default exception reduction re-calls ``__init__`` with
        ``self.args`` (the single formatted message), which does not match
        this two-argument signature and fails on unpickling.
        """
        return (
            type(self),
            (self.extension, self.supported_formats),
            self.__dict__,
        )
|