ai-robot-core/ai-service/app/services/document/markdown_parser.py

179 lines
5.6 KiB
Python
Raw Normal View History

"""
Markdown parser with intelligent chunking.
[AC-AISVC-33] Markdown file parsing with structure-aware chunking.
"""
import logging
from collections import Counter
from pathlib import Path
from typing import Any

from app.services.document.base import (
    DocumentParseException,
    DocumentParser,
    ParseResult,
)
from app.services.document.markdown_chunker import (
    MarkdownChunker,
    MarkdownElementType,
)
logger = logging.getLogger(__name__)
ENCODINGS_TO_TRY = ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]
class MarkdownParser(DocumentParser):
    """
    Parser for Markdown files with intelligent chunking.

    [AC-AISVC-33] Structure-aware parsing for Markdown documents.

    Features:
    - Header hierarchy extraction
    - Code block preservation
    - Table structure preservation
    - List grouping
    - Context-aware chunking
    """

    def __init__(
        self,
        encoding: str = "utf-8",
        max_chunk_size: int = 1000,
        min_chunk_size: int = 100,
        preserve_code_blocks: bool = True,
        preserve_tables: bool = True,
        preserve_lists: bool = True,
        include_header_context: bool = True,
        **kwargs: Any,
    ):
        """
        Configure the parser and its underlying chunker.

        Args:
            encoding: Preferred encoding; stored but decoding always walks
                ENCODINGS_TO_TRY in order (see _try_encodings).
            max_chunk_size: Maximum chunk size forwarded to MarkdownChunker.
            min_chunk_size: Minimum chunk size forwarded to MarkdownChunker.
            preserve_code_blocks: Keep fenced code blocks intact.
            preserve_tables: Keep table structures intact.
            preserve_lists: Keep list items grouped.
            include_header_context: Carry header context into chunks.
            **kwargs: Extra configuration; stored but not interpreted here.
        """
        self._encoding = encoding
        self._max_chunk_size = max_chunk_size
        self._min_chunk_size = min_chunk_size
        self._preserve_code_blocks = preserve_code_blocks
        self._preserve_tables = preserve_tables
        self._preserve_lists = preserve_lists
        self._include_header_context = include_header_context
        self._extra_config = kwargs
        self._chunker = MarkdownChunker(
            max_chunk_size=max_chunk_size,
            min_chunk_size=min_chunk_size,
            preserve_code_blocks=preserve_code_blocks,
            preserve_tables=preserve_tables,
            preserve_lists=preserve_lists,
            include_header_context=include_header_context,
        )

    def _try_encodings(self, path: Path) -> tuple[str, str]:
        """
        Try multiple encodings to read the file.

        Returns:
            (text, encoding_used)

        Raises:
            DocumentParseException: If no listed encoding can decode the file.
                NOTE(review): "latin-1" accepts any byte sequence, so with the
                current ENCODINGS_TO_TRY this raise is effectively unreachable.
        """
        for enc in ENCODINGS_TO_TRY:
            try:
                with open(path, encoding=enc) as f:
                    text = f.read()
                logger.info(f"Successfully parsed Markdown with encoding: {enc}")
                return text, enc
            except (UnicodeDecodeError, LookupError):
                # LookupError covers codecs missing from this interpreter build.
                continue
        raise DocumentParseException(
            "Failed to decode Markdown file with any known encoding",
            file_path=str(path),
            parser="markdown"
        )

    def parse(self, file_path: str | Path) -> ParseResult:
        """
        Parse a Markdown file and extract structured content.

        [AC-AISVC-33] Structure-aware parsing.

        Args:
            file_path: Path to the Markdown file on disk.

        Returns:
            ParseResult containing the raw text plus metadata: encoding used,
            line/chunk counts, per-element-type structure counts, and the
            serialized chunks.

        Raises:
            DocumentParseException: If the file is missing, undecodable, or
                chunking fails for any other reason.
        """
        path = Path(file_path)
        if not path.exists():
            raise DocumentParseException(
                f"File not found: {path}",
                file_path=str(path),
                parser="markdown"
            )
        try:
            text, encoding_used = self._try_encodings(path)
            file_size = path.stat().st_size
            line_count = text.count("\n") + 1
            chunks = self._chunker.chunk(text, doc_id=path.stem)
            # One pass over chunks instead of a separate sum() per type.
            type_counts = Counter(c.element_type for c in chunks)
            header_count = type_counts[MarkdownElementType.HEADER]
            code_block_count = type_counts[MarkdownElementType.CODE_BLOCK]
            table_count = type_counts[MarkdownElementType.TABLE]
            list_count = type_counts[MarkdownElementType.LIST]
            logger.info(
                f"Parsed Markdown: {path.name}, lines={line_count}, "
                f"chars={len(text)}, chunks={len(chunks)}, "
                f"headers={header_count}, code_blocks={code_block_count}, "
                f"tables={table_count}, lists={list_count}"
            )
            return ParseResult(
                text=text,
                source_path=str(path),
                file_size=file_size,
                metadata={
                    "format": "markdown",
                    "line_count": line_count,
                    "encoding": encoding_used,
                    "chunk_count": len(chunks),
                    "structure": {
                        "headers": header_count,
                        "code_blocks": code_block_count,
                        "tables": table_count,
                        "lists": list_count,
                    },
                    "chunks": [chunk.to_dict() for chunk in chunks],
                }
            )
        except DocumentParseException:
            raise
        except Exception as e:
            # Chain the cause so tracebacks show the root failure.
            raise DocumentParseException(
                f"Failed to parse Markdown file: {e}",
                file_path=str(path),
                parser="markdown",
                details={"error": str(e)}
            ) from e

    def get_supported_extensions(self) -> list[str]:
        """Get supported file extensions."""
        return [".md", ".markdown"]

    def get_chunks(self, text: str, doc_id: str = "") -> list[dict[str, Any]]:
        """
        Get structured chunks from Markdown text.

        Args:
            text: Markdown text content
            doc_id: Optional document ID

        Returns:
            List of chunk dictionaries
        """
        chunks = self._chunker.chunk(text, doc_id)
        return [chunk.to_dict() for chunk in chunks]