# ai-robot-core/ai-service/app/services/document/base.py

# 107 lines
# 2.9 KiB
# Python

"""
Base document parser interface.
[AC-AISVC-33] Abstract interface for document parsers.
Design reference: progress.md Section 7.2 - DocumentParser interface
- parse(file_path) -> str
- get_supported_extensions() -> list[str]
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
@dataclass
class ParseResult:
"""
Result from document parsing.
[AC-AISVC-33] Contains parsed text and metadata.
"""
text: str
source_path: str
file_size: int
page_count: int | None = None
metadata: dict[str, Any] = field(default_factory=dict)
class DocumentParser(ABC):
    """
    Abstract base class for document parsers.
    [AC-AISVC-33] Provides unified interface for different document formats.

    Concrete parsers implement parse() and get_supported_extensions();
    supports_extension() is derived from the latter.
    """

    @abstractmethod
    def parse(self, file_path: str | Path) -> ParseResult:
        """
        Parse a document and extract text content.
        [AC-AISVC-33] Returns parsed text content.

        Args:
            file_path: Path to the document file.

        Returns:
            ParseResult with extracted text and metadata.

        Raises:
            DocumentParseException: If parsing fails.
        """
        pass

    @abstractmethod
    def get_supported_extensions(self) -> list[str]:
        """
        Get list of supported file extensions.
        [AC-AISVC-37] Returns supported format list.

        Returns:
            List of file extensions (e.g., [".pdf", ".txt"])
        """
        pass

    @staticmethod
    def _normalize_extension(extension: str) -> str:
        """
        Canonicalise an extension for comparison.

        Surrounding whitespace is removed, the text is lower-cased and a
        leading dot is added when missing. A blank input yields "".
        """
        cleaned = extension.strip().lower()
        if not cleaned:
            return ""
        return cleaned if cleaned.startswith(".") else f".{cleaned}"

    def supports_extension(self, extension: str) -> bool:
        """
        Check if this parser supports a given file extension.
        [AC-AISVC-37] Validates file format support.

        The comparison ignores case, surrounding whitespace and a missing
        leading dot on BOTH the queried extension and the extensions
        declared by get_supported_extensions().

        Args:
            extension: File extension to check, with or without the dot.

        Returns:
            True if extension is supported; False otherwise, including
            for a blank extension.
        """
        wanted = self._normalize_extension(extension)
        if not wanted:
            return False
        # Normalise the declared side too: a parser that declares ".PDF"
        # or "txt" would otherwise never match the normalised query.
        return any(
            self._normalize_extension(declared) == wanted
            for declared in self.get_supported_extensions()
        )
class DocumentParseException(Exception):
"""Exception raised when document parsing fails."""
def __init__(
self,
message: str,
file_path: str = "",
parser: str = "",
details: dict[str, Any] | None = None
):
self.file_path = file_path
self.parser = parser
self.details = details or {}
super().__init__(f"[{parser}] {message}" if parser else message)
class UnsupportedFormatError(DocumentParseException):
    """
    Exception raised when file format is not supported.

    Attributes:
        extension: The extension that was rejected.
        supported_formats: The extensions that would have been accepted.
    """

    def __init__(self, extension: str, supported: list[str]):
        # Snapshot the formats: this lets a one-shot iterable be both
        # joined into the message and stored, and keeps the exception
        # stable if the caller later mutates its own list.
        formats = list(supported)
        super().__init__(
            f"Unsupported file format: {extension}. "
            f"Supported formats: {', '.join(formats)}",
            parser="format_checker"
        )
        self.extension = extension
        self.supported_formats = formats

    def __reduce__(self) -> tuple[Any, ...]:
        """
        Support pickle and copy.

        The default exception reduction re-invokes the class with
        self.args, which holds the single formatted message and does not
        match this two-argument constructor. Rebuild from the real
        constructor arguments instead, then restore the instance
        attributes from the saved state.
        """
        return (
            type(self),
            (self.extension, self.supported_formats),
            self.__dict__.copy(),
        )