"""
Markdown parser with intelligent chunking.

[AC-AISVC-33] Markdown file parsing with structure-aware chunking.
"""

import logging
from pathlib import Path
from typing import Any

from app.services.document.base import (
    DocumentParseException,
    DocumentParser,
    ParseResult,
)
from app.services.document.markdown_chunker import (
    MarkdownChunker,
    MarkdownElementType,
)

logger = logging.getLogger(__name__)

# Fallback encodings, tried in order. "latin-1" maps every byte value, so it
# acts as the last-resort decoder and must stay at the end of the list.
ENCODINGS_TO_TRY = ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]


class MarkdownParser(DocumentParser):
    """
    Parser for Markdown files with intelligent chunking.

    [AC-AISVC-33] Structure-aware parsing for Markdown documents.

    Features:
    - Header hierarchy extraction
    - Code block preservation
    - Table structure preservation
    - List grouping
    - Context-aware chunking
    """

    def __init__(
        self,
        encoding: str = "utf-8",
        max_chunk_size: int = 1000,
        min_chunk_size: int = 100,
        preserve_code_blocks: bool = True,
        preserve_tables: bool = True,
        preserve_lists: bool = True,
        include_header_context: bool = True,
        **kwargs: Any,
    ):
        """
        Store the parser configuration and build the underlying chunker.

        Args:
            encoding: Preferred text encoding; it is tried before the
                fallbacks listed in ENCODINGS_TO_TRY.
            max_chunk_size: Forwarded to MarkdownChunker.
            min_chunk_size: Forwarded to MarkdownChunker.
            preserve_code_blocks: Forwarded to MarkdownChunker.
            preserve_tables: Forwarded to MarkdownChunker.
            preserve_lists: Forwarded to MarkdownChunker.
            include_header_context: Forwarded to MarkdownChunker.
            **kwargs: Extra options; kept on the instance but not
                interpreted by this class.
        """
        self._encoding = encoding
        self._max_chunk_size = max_chunk_size
        self._min_chunk_size = min_chunk_size
        self._preserve_code_blocks = preserve_code_blocks
        self._preserve_tables = preserve_tables
        self._preserve_lists = preserve_lists
        self._include_header_context = include_header_context
        self._extra_config = kwargs
        self._chunker = MarkdownChunker(
            max_chunk_size=max_chunk_size,
            min_chunk_size=min_chunk_size,
            preserve_code_blocks=preserve_code_blocks,
            preserve_tables=preserve_tables,
            preserve_lists=preserve_lists,
            include_header_context=include_header_context,
        )

    def _candidate_encodings(self) -> list[str]:
        """Return the configured encoding followed by the fallbacks, without repeats."""
        ordered: list[str] = []
        seen: set[str] = set()
        for name in (self._encoding, *ENCODINGS_TO_TRY):
            if not name:
                continue
            key = name.lower()
            if key not in seen:
                seen.add(key)
                ordered.append(name)
        return ordered

    def _try_encodings(self, path: Path) -> tuple[str, str]:
        """
        Try multiple encodings to read the file.

        The encoding passed to the constructor is attempted first, then each
        entry of ENCODINGS_TO_TRY in order.

        Args:
            path: File to read.

        Returns:
            (text, encoding_used)

        Raises:
            DocumentParseException: If no candidate encoding can decode the file.
        """
        for enc in self._candidate_encodings():
            try:
                with open(path, encoding=enc) as f:
                    text = f.read()
            except (UnicodeError, LookupError):
                # UnicodeError (not just UnicodeDecodeError): the text-mode
                # utf-16 decoder can raise the base class for BOM-less input,
                # which must not stop later fallbacks from being tried.
                # LookupError covers an unknown codec name.
                continue
            logger.info("Successfully parsed Markdown with encoding: %s", enc)
            return text, enc

        raise DocumentParseException(
            "Failed to decode Markdown file with any known encoding",
            file_path=str(path),
            parser="markdown",
        )

    @staticmethod
    def _count_elements(chunks: Any, element_type: Any) -> int:
        """Count the chunks whose element_type equals the given type."""
        return sum(1 for chunk in chunks if chunk.element_type == element_type)

    def parse(self, file_path: str | Path) -> ParseResult:
        """
        Parse a Markdown file and extract structured content.

        [AC-AISVC-33] Structure-aware parsing.

        Args:
            file_path: Location of the Markdown file.

        Returns:
            ParseResult holding the decoded text, the source path, the file
            size and a metadata dict (format, line count, encoding, chunk
            count, per-type structure counts and the serialized chunks).

        Raises:
            DocumentParseException: If the path does not exist, the file
                cannot be decoded, or any other error occurs while parsing.
        """
        path = Path(file_path)

        if not path.exists():
            raise DocumentParseException(
                f"File not found: {path}",
                file_path=str(path),
                parser="markdown",
            )

        try:
            text, encoding_used = self._try_encodings(path)
            file_size = path.stat().st_size
            line_count = text.count("\n") + 1

            # The file stem doubles as the document id handed to the chunker.
            chunks = self._chunker.chunk(text, doc_id=path.stem)

            structure = {
                "headers": self._count_elements(chunks, MarkdownElementType.HEADER),
                "code_blocks": self._count_elements(
                    chunks, MarkdownElementType.CODE_BLOCK
                ),
                "tables": self._count_elements(chunks, MarkdownElementType.TABLE),
                "lists": self._count_elements(chunks, MarkdownElementType.LIST),
            }

            logger.info(
                "Parsed Markdown: %s, lines=%s, chars=%s, chunks=%s, "
                "headers=%s, code_blocks=%s, tables=%s, lists=%s",
                path.name,
                line_count,
                len(text),
                len(chunks),
                structure["headers"],
                structure["code_blocks"],
                structure["tables"],
                structure["lists"],
            )

            return ParseResult(
                text=text,
                source_path=str(path),
                file_size=file_size,
                metadata={
                    "format": "markdown",
                    "line_count": line_count,
                    "encoding": encoding_used,
                    "chunk_count": len(chunks),
                    "structure": structure,
                    "chunks": [chunk.to_dict() for chunk in chunks],
                },
            )

        except DocumentParseException:
            # Already in the parser's own error type; pass through untouched.
            raise
        except Exception as e:
            raise DocumentParseException(
                f"Failed to parse Markdown file: {e}",
                file_path=str(path),
                parser="markdown",
                details={"error": str(e)},
            ) from e

    def get_supported_extensions(self) -> list[str]:
        """Get supported file extensions."""
        return [".md", ".markdown"]

    def get_chunks(self, text: str, doc_id: str = "") -> list[dict[str, Any]]:
        """
        Get structured chunks from Markdown text.

        Args:
            text: Markdown text content
            doc_id: Optional document ID

        Returns:
            List of chunk dictionaries
        """
        chunks = self._chunker.chunk(text, doc_id)
        return [chunk.to_dict() for chunk in chunks]