ai-robot-core/ai-service/app/services/document/text_parser.py

100 lines
2.9 KiB
Python

"""
Text file parser implementation.
[AC-AISVC-33] Text file parsing for plain text and markdown.
"""
import logging
from pathlib import Path
from typing import Any
from app.services.document.base import (
DocumentParseException,
DocumentParser,
ParseResult,
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Candidate encodings, iterated in list order by TextParser._try_encodings;
# the first one that decodes the whole file without error is used.
# NOTE: latin-1 maps every possible byte value to a character, so decoding
# with it never raises a decode error. It must stay last: any entry placed
# after it would never be reached.
ENCODINGS_TO_TRY = ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]
class TextParser(DocumentParser):
    """
    Parser for plain text files.

    [AC-AISVC-33] Direct text extraction with multiple encoding support.

    The file is decoded with the encoding given to the constructor first and
    then with each entry of ``ENCODINGS_TO_TRY`` in order; the first encoding
    that decodes the whole file without error wins.
    """

    def __init__(self, encoding: str = "utf-8", **kwargs: Any):
        """
        Create a text parser.

        Args:
            encoding: Preferred encoding, attempted before the fallbacks in
                ``ENCODINGS_TO_TRY``.
            **kwargs: Extra options, stored unchanged in ``_extra_config``.
        """
        self._encoding = encoding
        self._extra_config = kwargs

    def _candidate_encodings(self) -> list[str]:
        """Return the preferred encoding followed by the fallbacks, without duplicates."""
        preferred = [self._encoding] if self._encoding else []
        # dict.fromkeys keeps first-seen order, so the preferred encoding
        # stays in front and is not retried when it also appears in the
        # fallback list.
        return list(dict.fromkeys([*preferred, *ENCODINGS_TO_TRY]))

    def _try_encodings(self, path: Path) -> tuple[str, str]:
        """
        Try multiple encodings to read the file.

        Args:
            path: File to read in text mode.

        Returns:
            (text, encoding_used)

        Raises:
            DocumentParseException: If no candidate encoding decodes the file.
        """
        for enc in self._candidate_encodings():
            try:
                with open(path, "r", encoding=enc) as f:
                    text = f.read()
            except (UnicodeDecodeError, LookupError):
                # Wrong encoding for this file, or an encoding name unknown
                # to this interpreter: move on to the next candidate.
                continue
            logger.info("Successfully parsed with encoding: %s", enc)
            return text, enc
        raise DocumentParseException(
            "Failed to decode file with any known encoding",
            file_path=str(path),
            parser="text",
        )

    def parse(self, file_path: str | Path) -> ParseResult:
        """
        Parse a text file and extract content.

        [AC-AISVC-33] Direct file reading.

        Args:
            file_path: Location of the file to parse.

        Returns:
            ParseResult holding the decoded text, the source path, the file
            size in bytes and metadata (``format``, ``line_count``,
            ``encoding``). ``line_count`` is the number of newline characters
            plus one.

        Raises:
            DocumentParseException: If the file does not exist, cannot be
                decoded, or any other error occurs while reading it.
        """
        path = Path(file_path)
        if not path.exists():
            raise DocumentParseException(
                f"File not found: {path}",
                file_path=str(path),
                parser="text",
            )
        try:
            text, encoding_used = self._try_encodings(path)
            file_size = path.stat().st_size
            line_count = text.count("\n") + 1
            logger.info(
                "Parsed text: %s, lines=%s, chars=%s, size=%s, encoding=%s",
                path.name,
                line_count,
                len(text),
                file_size,
                encoding_used,
            )
            return ParseResult(
                text=text,
                source_path=str(path),
                file_size=file_size,
                metadata={
                    "format": "text",
                    "line_count": line_count,
                    "encoding": encoding_used,
                },
            )
        except DocumentParseException:
            # Already in the parser's own error type; pass through untouched.
            raise
        except Exception as e:
            # Wrap anything else (e.g. OS-level read errors) and keep the
            # original exception attached as the explicit cause.
            raise DocumentParseException(
                f"Failed to parse text file: {e}",
                file_path=str(path),
                parser="text",
                details={"error": str(e)},
            ) from e

    def get_supported_extensions(self) -> list[str]:
        """Get supported file extensions."""
        return [".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"]