"""
|
|
Markdown parser with intelligent chunking.
|
|
[AC-AISVC-33] Markdown file parsing with structure-aware chunking.
|
|
"""
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from app.services.document.base import (
|
|
DocumentParseException,
|
|
DocumentParser,
|
|
ParseResult,
|
|
)
|
|
from app.services.document.markdown_chunker import (
|
|
MarkdownChunker,
|
|
MarkdownElementType,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
ENCODINGS_TO_TRY = ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]
|
|
|
|
|
|
class MarkdownParser(DocumentParser):
    """
    Parser for Markdown files with intelligent chunking.

    [AC-AISVC-33] Structure-aware parsing for Markdown documents.

    Features:
    - Header hierarchy extraction
    - Code block preservation
    - Table structure preservation
    - List grouping
    - Context-aware chunking
    """

    def __init__(
        self,
        encoding: str = "utf-8",
        max_chunk_size: int = 1000,
        min_chunk_size: int = 100,
        preserve_code_blocks: bool = True,
        preserve_tables: bool = True,
        preserve_lists: bool = True,
        include_header_context: bool = True,
        **kwargs: Any,
    ):
        """
        Configure the parser and its underlying chunker.

        Args:
            encoding: Preferred text encoding. Tried first when reading a
                file, before the ``ENCODINGS_TO_TRY`` fallback list.
            max_chunk_size: Maximum chunk size (passed through to
                ``MarkdownChunker``; presumably in characters — see chunker).
            min_chunk_size: Minimum chunk size (passed through to the chunker).
            preserve_code_blocks: Keep code blocks intact when chunking.
            preserve_tables: Keep tables intact when chunking.
            preserve_lists: Keep lists grouped when chunking.
            include_header_context: Include header hierarchy context in chunks.
            **kwargs: Extra configuration; stored but not interpreted here.
        """
        self._encoding = encoding
        self._max_chunk_size = max_chunk_size
        self._min_chunk_size = min_chunk_size
        self._preserve_code_blocks = preserve_code_blocks
        self._preserve_tables = preserve_tables
        self._preserve_lists = preserve_lists
        self._include_header_context = include_header_context
        self._extra_config = kwargs

        self._chunker = MarkdownChunker(
            max_chunk_size=max_chunk_size,
            min_chunk_size=min_chunk_size,
            preserve_code_blocks=preserve_code_blocks,
            preserve_tables=preserve_tables,
            preserve_lists=preserve_lists,
            include_header_context=include_header_context,
        )

    def _try_encodings(self, path: Path) -> tuple[str, str]:
        """
        Read the file, trying the configured encoding first, then fallbacks.

        Args:
            path: Path to the Markdown file (must exist).

        Returns:
            Tuple of (text, encoding_used).

        Raises:
            DocumentParseException: If no candidate encoding can decode the file.
        """
        # Fix: honor the encoding passed to __init__ — it was previously
        # stored but never consulted, so a caller-specified encoding outside
        # ENCODINGS_TO_TRY was silently ignored. Try it first, then fall back.
        candidates = [self._encoding] + [
            enc for enc in ENCODINGS_TO_TRY if enc != self._encoding
        ]
        for enc in candidates:
            try:
                with open(path, encoding=enc) as f:
                    text = f.read()
                logger.info(f"Successfully parsed Markdown with encoding: {enc}")
                return text, enc
            except (UnicodeDecodeError, LookupError):
                # LookupError covers an unknown codec name in self._encoding.
                continue

        raise DocumentParseException(
            "Failed to decode Markdown file with any known encoding",
            file_path=str(path),
            parser="markdown"
        )

    def parse(self, file_path: str | Path) -> ParseResult:
        """
        Parse a Markdown file and extract structured content.

        [AC-AISVC-33] Structure-aware parsing.

        Args:
            file_path: Path to the Markdown file.

        Returns:
            ParseResult with the full text, file size, and metadata including
            per-element-type structure counts and serialized chunks.

        Raises:
            DocumentParseException: If the file is missing, cannot be decoded,
                or chunking fails for any other reason.
        """
        path = Path(file_path)

        if not path.exists():
            raise DocumentParseException(
                f"File not found: {path}",
                file_path=str(path),
                parser="markdown"
            )

        try:
            text, encoding_used = self._try_encodings(path)

            file_size = path.stat().st_size
            line_count = text.count("\n") + 1

            chunks = self._chunker.chunk(text, doc_id=path.stem)

            # Single pass over chunks instead of four separate sum() scans.
            type_counts: dict[Any, int] = {}
            for chunk in chunks:
                type_counts[chunk.element_type] = (
                    type_counts.get(chunk.element_type, 0) + 1
                )
            header_count = type_counts.get(MarkdownElementType.HEADER, 0)
            code_block_count = type_counts.get(MarkdownElementType.CODE_BLOCK, 0)
            table_count = type_counts.get(MarkdownElementType.TABLE, 0)
            list_count = type_counts.get(MarkdownElementType.LIST, 0)

            logger.info(
                f"Parsed Markdown: {path.name}, lines={line_count}, "
                f"chars={len(text)}, chunks={len(chunks)}, "
                f"headers={header_count}, code_blocks={code_block_count}, "
                f"tables={table_count}, lists={list_count}"
            )

            return ParseResult(
                text=text,
                source_path=str(path),
                file_size=file_size,
                metadata={
                    "format": "markdown",
                    "line_count": line_count,
                    "encoding": encoding_used,
                    "chunk_count": len(chunks),
                    "structure": {
                        "headers": header_count,
                        "code_blocks": code_block_count,
                        "tables": table_count,
                        "lists": list_count,
                    },
                    "chunks": [chunk.to_dict() for chunk in chunks],
                }
            )

        except DocumentParseException:
            # Already a domain error (e.g. from _try_encodings) — re-raise as-is.
            raise
        except Exception as e:
            # Boundary wrap: any unexpected failure becomes a domain exception
            # with the original message preserved in details.
            raise DocumentParseException(
                f"Failed to parse Markdown file: {e}",
                file_path=str(path),
                parser="markdown",
                details={"error": str(e)}
            )

    def get_supported_extensions(self) -> list[str]:
        """Get supported file extensions."""
        return [".md", ".markdown"]

    def get_chunks(self, text: str, doc_id: str = "") -> list[dict[str, Any]]:
        """
        Get structured chunks from Markdown text.

        Args:
            text: Markdown text content
            doc_id: Optional document ID

        Returns:
            List of chunk dictionaries
        """
        chunks = self._chunker.chunk(text, doc_id)
        return [chunk.to_dict() for chunk in chunks]