102 lines
3.0 KiB
Python
102 lines
3.0 KiB
Python
|
|
"""
|
||
|
|
Text file parser implementation.
|
||
|
|
[AC-AISVC-33] Text file parsing for plain text and markdown.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import logging
|
||
|
|
from pathlib import Path
|
||
|
|
from typing import Any
|
||
|
|
|
||
|
|
from app.services.document.base import (
|
||
|
|
DocumentParseException,
|
||
|
|
DocumentParser,
|
||
|
|
ParseResult,
|
||
|
|
)
|
||
|
|
|
||
|
|
logger = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
|
||
|
|
class TextParser(DocumentParser):
    """
    Parser for plain text files.

    [AC-AISVC-33] Direct text extraction.

    The file is decoded with the configured encoding first. If that raises
    ``UnicodeDecodeError`` the read is retried once with the fallback
    encoding (GBK).
    """

    # Encoding tried when the configured encoding cannot decode the file.
    _FALLBACK_ENCODING = "gbk"

    def __init__(self, encoding: str = "utf-8", **kwargs: Any):
        """
        Create a text parser.

        Args:
            encoding: Encoding tried first when reading a file.
            **kwargs: Extra options; stored unchanged and not read by
                this class.
        """
        self._encoding = encoding
        self._extra_config = kwargs

    def parse(self, file_path: str | Path) -> ParseResult:
        """
        Parse a text file and extract content.

        [AC-AISVC-33] Direct file reading.

        Args:
            file_path: Location of the file to read.

        Returns:
            A ParseResult holding the decoded text, the source path, the
            file size in bytes and metadata with the keys ``format``,
            ``line_count`` and ``encoding`` (the encoding that actually
            decoded the file).

        Raises:
            DocumentParseException: If the path does not exist, or if
                reading/decoding fails with both the configured and the
                fallback encoding.
        """
        path = Path(file_path)

        if not path.exists():
            raise DocumentParseException(
                f"File not found: {path}",
                file_path=str(path),
                parser="text"
            )

        try:
            return self._read_with_encoding(path, self._encoding)
        except UnicodeDecodeError:
            # The configured encoding could not decode the bytes; retry once
            # with the fallback before giving up.
            logger.info(
                "Could not decode %s as %s, retrying with %s",
                path.name, self._encoding, self._FALLBACK_ENCODING,
            )
            try:
                return self._read_with_encoding(path, self._FALLBACK_ENCODING)
            except Exception as e:
                raise DocumentParseException(
                    f"Failed to parse text file with encoding fallback: {e}",
                    file_path=str(path),
                    parser="text",
                    details={"error": str(e)}
                ) from e
        except Exception as e:
            raise DocumentParseException(
                f"Failed to parse text file: {e}",
                file_path=str(path),
                parser="text",
                details={"error": str(e)}
            ) from e

    def _read_with_encoding(self, path: Path, encoding: str) -> ParseResult:
        """Read *path* decoded as *encoding* and wrap it in a ParseResult.

        Exceptions from reading or decoding propagate unchanged so that
        ``parse`` can decide whether to fall back or wrap them.
        """
        text = path.read_text(encoding=encoding)

        file_size = path.stat().st_size
        # Newline separators + 1: a trailing newline counts as a final empty
        # line, and an empty file reports 1.
        line_count = text.count("\n") + 1

        logger.info(
            "Parsed text: %s, lines=%d, chars=%d, size=%d",
            path.name, line_count, len(text), file_size,
        )

        return ParseResult(
            text=text,
            source_path=str(path),
            file_size=file_size,
            metadata={
                "format": "text",
                "line_count": line_count,
                "encoding": encoding,
            }
        )

    def get_supported_extensions(self) -> list[str]:
        """Get supported file extensions."""
        return [".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"]
|