From 4de2a2aece0b57c003ae8232ca163c5184b271e0 Mon Sep 17 00:00:00 2001 From: MerCry Date: Wed, 11 Mar 2026 18:56:43 +0800 Subject: [PATCH] =?UTF-8?q?[AC-DOC-PARSER]=20feat(document):=20=E6=96=B0?= =?UTF-8?q?=E5=A2=9E=E5=9B=BE=E7=89=87=E5=92=8C=20Markdown=20=E8=A7=A3?= =?UTF-8?q?=E6=9E=90=E5=99=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 ImageParser 支持图片文件解析 - 新增 MarkdownParser 支持 Markdown 文件解析 - 新增 MarkdownChunker 实现 Markdown 智能分块 - 支持按标题、段落、代码块等元素类型分块 - 更新 document 模块导出和工厂方法 --- ai-service/app/services/document/__init__.py | 18 + ai-service/app/services/document/factory.py | 20 +- .../app/services/document/image_parser.py | 490 +++++++++++ .../app/services/document/markdown_chunker.py | 771 ++++++++++++++++++ .../app/services/document/markdown_parser.py | 178 ++++ 5 files changed, 1475 insertions(+), 2 deletions(-) create mode 100644 ai-service/app/services/document/image_parser.py create mode 100644 ai-service/app/services/document/markdown_chunker.py create mode 100644 ai-service/app/services/document/markdown_parser.py diff --git a/ai-service/app/services/document/__init__.py b/ai-service/app/services/document/__init__.py index 2a1aa36..f0dece3 100644 --- a/ai-service/app/services/document/__init__.py +++ b/ai-service/app/services/document/__init__.py @@ -16,6 +16,16 @@ from app.services.document.factory import ( get_supported_document_formats, parse_document, ) +from app.services.document.image_parser import ImageParser +from app.services.document.markdown_chunker import ( + MarkdownChunk, + MarkdownChunker, + MarkdownElement, + MarkdownElementType, + MarkdownParser as MarkdownStructureParser, + chunk_markdown, +) +from app.services.document.markdown_parser import MarkdownParser from app.services.document.pdf_parser import PDFParser, PDFPlumberParser from app.services.document.text_parser import TextParser from app.services.document.word_parser import WordParser @@ -35,4 +45,12 @@ __all__ = [ 
"ExcelParser", "CSVParser", "TextParser", + "MarkdownParser", + "MarkdownChunker", + "MarkdownChunk", + "MarkdownElement", + "MarkdownElementType", + "MarkdownStructureParser", + "chunk_markdown", + "ImageParser", ] diff --git a/ai-service/app/services/document/factory.py b/ai-service/app/services/document/factory.py index e19de83..08b42c3 100644 --- a/ai-service/app/services/document/factory.py +++ b/ai-service/app/services/document/factory.py @@ -16,6 +16,8 @@ from app.services.document.base import ( UnsupportedFormatError, ) from app.services.document.excel_parser import CSVParser, ExcelParser +from app.services.document.image_parser import ImageParser +from app.services.document.markdown_parser import MarkdownParser from app.services.document.pdf_parser import PDFParser, PDFPlumberParser from app.services.document.text_parser import TextParser from app.services.document.word_parser import WordParser @@ -45,6 +47,8 @@ class DocumentParserFactory: "excel": ExcelParser, "csv": CSVParser, "text": TextParser, + "markdown": MarkdownParser, + "image": ImageParser, } cls._extension_map = { @@ -54,14 +58,22 @@ class DocumentParserFactory: ".xls": "excel", ".csv": "csv", ".txt": "text", - ".md": "text", - ".markdown": "text", + ".md": "markdown", + ".markdown": "markdown", ".rst": "text", ".log": "text", ".json": "text", ".xml": "text", ".yaml": "text", ".yml": "text", + ".jpg": "image", + ".jpeg": "image", + ".png": "image", + ".gif": "image", + ".webp": "image", + ".bmp": "image", + ".tiff": "image", + ".tif": "image", } @classmethod @@ -174,6 +186,8 @@ class DocumentParserFactory: "excel": "Excel 电子表格", "csv": "CSV 文件", "text": "文本文件", + "markdown": "Markdown 文档", + "image": "图片文件", } descriptions = { @@ -183,6 +197,8 @@ class DocumentParserFactory: "excel": "解析 Excel 电子表格,支持多工作表", "csv": "解析 CSV 文件,自动检测编码", "text": "解析纯文本文件,支持多种编码", + "markdown": "智能解析 Markdown 文档,保留结构(标题、代码块、表格、列表)", + "image": "使用多模态 LLM 解析图片,提取文字和关键信息", } info.append({ diff --git 
a/ai-service/app/services/document/image_parser.py b/ai-service/app/services/document/image_parser.py new file mode 100644 index 0000000..af3839e --- /dev/null +++ b/ai-service/app/services/document/image_parser.py @@ -0,0 +1,490 @@ +""" +Image parser using multimodal LLM. +Supports parsing images into structured text content for knowledge base indexing. +""" + +import asyncio +import base64 +import json +import logging +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +from app.services.document.base import ( + DocumentParseException, + DocumentParser, + PageText, + ParseResult, +) +from app.services.llm.factory import LLMUsageType, get_llm_config_manager + +logger = logging.getLogger(__name__) + +IMAGE_SYSTEM_PROMPT = """你是一个专业的图像内容分析助手。你的任务是分析图片内容,并将其智能拆分为适合知识库检索的独立数据块。 + +## 分析要求 +1. 仔细分析图片内容,识别其中的文字、图表、数据等信息 +2. 根据内容的逻辑结构,智能判断如何拆分为独立的知识条目 +3. 每个条目应该是独立、完整、可检索的知识单元 + +## 输出格式 +请严格按照以下 JSON 格式输出,不要添加任何其他内容: + +```json +{ + "image_summary": "图片整体概述(一句话描述图片主题)", + "total_chunks": <分块总数>, + "chunks": [ + { + "chunk_index": 0, + "content": "该分块的完整内容文字", + "chunk_type": "text|table|list|diagram|chart|mixed", + "keywords": ["关键词1", "关键词2"] + } + ] +} +``` + +## 分块策略 +- **单一内容**: 如果图片只有一段完整的文字/信息,可以只输出1个分块 +- **多段落内容**: 按段落或逻辑单元拆分,每个段落作为独立分块 +- **表格数据**: 将表格内容转换为结构化文字,作为一个分块 +- **图表数据**: 描述图表内容和数据,作为一个分块 +- **列表内容**: 每个列表项可作为独立分块,或合并为相关的一组 +- **混合内容**: 根据内容类型分别处理,确保每个分块主题明确 + +## 注意事项 +1. 每个分块的 content 必须是完整、可独立理解的文字 +2. chunk_type 用于标识内容类型,便于后续处理 +3. keywords 提取该分块的核心关键词,便于检索 +4. 
确保输出的 JSON 格式正确,可以被解析""" + +IMAGE_USER_PROMPT = "请分析这张图片,按照要求的 JSON 格式输出分块结果。" + + +@dataclass +class ImageChunk: + """智能分块结果""" + chunk_index: int + content: str + chunk_type: str = "text" + keywords: list[str] = field(default_factory=list) + + +@dataclass +class ImageParseResult: + """图片解析结果(包含智能分块)""" + image_summary: str + chunks: list[ImageChunk] + raw_text: str + source_path: str + file_size: int + metadata: dict[str, Any] = field(default_factory=dict) + + +class ImageParser(DocumentParser): + """ + Image parser using multimodal LLM. + + Supports common image formats and extracts text content using + vision-capable LLM models (GPT-4V, GPT-4o, etc.). + + Features: + - Intelligent chunking based on content structure + - Structured output with keywords and chunk types + - Support for various content types (text, table, chart, etc.) + """ + + SUPPORTED_EXTENSIONS = [ + ".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif" + ] + + def __init__( + self, + model: str | None = None, + max_tokens: int = 4096, + timeout_seconds: int = 120, + ): + self._model = model + self._max_tokens = max_tokens + self._timeout_seconds = timeout_seconds + + def parse(self, file_path: str | Path) -> ParseResult: + """ + Parse an image file and extract text content using multimodal LLM. + + Note: This method is synchronous but internally uses async operations. + For async contexts, use parse_async() instead. + + Args: + file_path: Path to the image file. + + Returns: + ParseResult with extracted text content. + + Raises: + DocumentParseException: If parsing fails. 
+ """ + path = Path(file_path) + if not path.exists(): + raise DocumentParseException( + f"Image file not found: {file_path}", + file_path=str(path), + parser="image", + ) + + file_size = path.stat().st_size + extension = path.suffix.lower() + + if extension not in self.SUPPORTED_EXTENSIONS: + raise DocumentParseException( + f"Unsupported image format: {extension}", + file_path=str(path), + parser="image", + details={"supported_formats": self.SUPPORTED_EXTENSIONS}, + ) + + try: + with open(path, "rb") as f: + image_data = f.read() + + image_base64 = base64.b64encode(image_data).decode("utf-8") + mime_type = self._get_mime_type(extension) + + try: + loop = asyncio.get_running_loop() + import concurrent.futures + with concurrent.futures.ThreadPoolExecutor() as executor: + future = executor.submit( + asyncio.run, + self._analyze_image_async(image_base64, mime_type) + ) + result = future.result() + except RuntimeError: + result = asyncio.run(self._analyze_image_async(image_base64, mime_type)) + + logger.info( + f"[IMAGE-PARSER] Successfully parsed image: {path.name}, " + f"size={file_size}, chunks={len(result.chunks)}" + ) + + return ParseResult( + text=result.raw_text, + source_path=str(path), + file_size=file_size, + page_count=1, + metadata={ + "format": extension, + "parser": "image", + "mime_type": mime_type, + "image_summary": result.image_summary, + "chunk_count": len(result.chunks), + "chunks": [ + { + "chunk_index": c.chunk_index, + "content": c.content, + "chunk_type": c.chunk_type, + "keywords": c.keywords, + } + for c in result.chunks + ], + }, + pages=[PageText(page=1, text=result.raw_text)], + ) + + except Exception as e: + logger.error(f"[IMAGE-PARSER] Failed to parse image {path}: {e}") + raise DocumentParseException( + f"Failed to parse image: {str(e)}", + file_path=str(path), + parser="image", + details={"error": str(e)}, + ) + + async def parse_async(self, file_path: str | Path) -> ParseResult: + """ + Async version of parse method for use in async 
contexts. + + Args: + file_path: Path to the image file. + + Returns: + ParseResult with extracted text content. + """ + path = Path(file_path) + if not path.exists(): + raise DocumentParseException( + f"Image file not found: {file_path}", + file_path=str(path), + parser="image", + ) + + file_size = path.stat().st_size + extension = path.suffix.lower() + + if extension not in self.SUPPORTED_EXTENSIONS: + raise DocumentParseException( + f"Unsupported image format: {extension}", + file_path=str(path), + parser="image", + details={"supported_formats": self.SUPPORTED_EXTENSIONS}, + ) + + try: + with open(path, "rb") as f: + image_data = f.read() + + image_base64 = base64.b64encode(image_data).decode("utf-8") + mime_type = self._get_mime_type(extension) + + result = await self._analyze_image_async(image_base64, mime_type) + + logger.info( + f"[IMAGE-PARSER] Successfully parsed image (async): {path.name}, " + f"size={file_size}, chunks={len(result.chunks)}" + ) + + return ParseResult( + text=result.raw_text, + source_path=str(path), + file_size=file_size, + page_count=1, + metadata={ + "format": extension, + "parser": "image", + "mime_type": mime_type, + "image_summary": result.image_summary, + "chunk_count": len(result.chunks), + "chunks": [ + { + "chunk_index": c.chunk_index, + "content": c.content, + "chunk_type": c.chunk_type, + "keywords": c.keywords, + } + for c in result.chunks + ], + }, + pages=[PageText(page=1, text=result.raw_text)], + ) + + except Exception as e: + logger.error(f"[IMAGE-PARSER] Failed to parse image {path}: {e}") + raise DocumentParseException( + f"Failed to parse image: {str(e)}", + file_path=str(path), + parser="image", + details={"error": str(e)}, + ) + + async def parse_with_chunks(self, file_path: str | Path) -> ImageParseResult: + """ + Parse image and return structured result with intelligent chunks. + + Args: + file_path: Path to the image file. + + Returns: + ImageParseResult with intelligent chunks. 
+ """ + path = Path(file_path) + if not path.exists(): + raise DocumentParseException( + f"Image file not found: {file_path}", + file_path=str(path), + parser="image", + ) + + file_size = path.stat().st_size + extension = path.suffix.lower() + + if extension not in self.SUPPORTED_EXTENSIONS: + raise DocumentParseException( + f"Unsupported image format: {extension}", + file_path=str(path), + parser="image", + details={"supported_formats": self.SUPPORTED_EXTENSIONS}, + ) + + with open(path, "rb") as f: + image_data = f.read() + + image_base64 = base64.b64encode(image_data).decode("utf-8") + mime_type = self._get_mime_type(extension) + + result = await self._analyze_image_async(image_base64, mime_type) + result.source_path = str(path) + result.file_size = file_size + result.metadata = { + "format": extension, + "parser": "image", + "mime_type": mime_type, + } + + return result + + async def _analyze_image_async(self, image_base64: str, mime_type: str) -> ImageParseResult: + """ + Analyze image using multimodal LLM and return structured chunks. + + Args: + image_base64: Base64 encoded image data. + mime_type: MIME type of the image. + + Returns: + ImageParseResult with intelligent chunks. 
+ """ + try: + manager = get_llm_config_manager() + client = manager.get_kb_processing_client() + + config = manager.kb_processing_config + model = self._model or config.get("model", "gpt-4o-mini") + + messages = [ + { + "role": "system", + "content": IMAGE_SYSTEM_PROMPT, + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": IMAGE_USER_PROMPT, + }, + { + "type": "image_url", + "image_url": { + "url": f"data:{mime_type};base64,{image_base64}", + }, + }, + ], + }, + ] + + from app.services.llm.base import LLMConfig + + llm_config = LLMConfig( + model=model, + max_tokens=self._max_tokens, + temperature=0.3, + timeout_seconds=self._timeout_seconds, + ) + + response = await client.generate(messages=messages, config=llm_config) + + if not response.content: + raise DocumentParseException( + "LLM returned empty response for image analysis", + parser="image", + ) + + return self._parse_llm_response(response.content) + + except Exception as e: + logger.error(f"[IMAGE-PARSER] LLM analysis failed: {e}") + raise + + def _parse_llm_response(self, response_content: str) -> ImageParseResult: + """ + Parse LLM response into structured ImageParseResult. + + Args: + response_content: Raw LLM response content. + + Returns: + ImageParseResult with parsed chunks. 
+ """ + try: + json_str = self._extract_json(response_content) + data = json.loads(json_str) + + image_summary = data.get("image_summary", "") + chunks_data = data.get("chunks", []) + + chunks = [] + for chunk_data in chunks_data: + chunk = ImageChunk( + chunk_index=chunk_data.get("chunk_index", len(chunks)), + content=chunk_data.get("content", ""), + chunk_type=chunk_data.get("chunk_type", "text"), + keywords=chunk_data.get("keywords", []), + ) + if chunk.content.strip(): + chunks.append(chunk) + + if not chunks: + chunks.append(ImageChunk( + chunk_index=0, + content=response_content, + chunk_type="text", + keywords=[], + )) + + raw_text = "\n\n".join([c.content for c in chunks]) + + return ImageParseResult( + image_summary=image_summary, + chunks=chunks, + raw_text=raw_text, + source_path="", + file_size=0, + ) + + except json.JSONDecodeError as e: + logger.warning(f"[IMAGE-PARSER] Failed to parse JSON response: {e}, using fallback") + return ImageParseResult( + image_summary="图片内容", + chunks=[ImageChunk( + chunk_index=0, + content=response_content, + chunk_type="text", + keywords=[], + )], + raw_text=response_content, + source_path="", + file_size=0, + ) + + def _extract_json(self, content: str) -> str: + """ + Extract JSON from LLM response content. + + Args: + content: Raw response content that may contain JSON. + + Returns: + Extracted JSON string. 
+ """ + content = content.strip() + + if content.startswith("{") and content.endswith("}"): + return content + + json_start = content.find("{") + json_end = content.rfind("}") + + if json_start != -1 and json_end != -1 and json_end > json_start: + return content[json_start:json_end + 1] + + return content + + def _get_mime_type(self, extension: str) -> str: + """Get MIME type for image extension.""" + mime_types = { + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".png": "image/png", + ".gif": "image/gif", + ".webp": "image/webp", + ".bmp": "image/bmp", + ".tiff": "image/tiff", + ".tif": "image/tiff", + } + return mime_types.get(extension.lower(), "image/jpeg") + + def get_supported_extensions(self) -> list[str]: + """Get list of supported image extensions.""" + return ImageParser.SUPPORTED_EXTENSIONS diff --git a/ai-service/app/services/document/markdown_chunker.py b/ai-service/app/services/document/markdown_chunker.py new file mode 100644 index 0000000..0b6762d --- /dev/null +++ b/ai-service/app/services/document/markdown_chunker.py @@ -0,0 +1,771 @@ +""" +Markdown intelligent chunker with structure-aware splitting. +Supports headers, code blocks, tables, lists, and preserves context. 
+""" + +import logging +import re +from dataclasses import dataclass, field +from enum import Enum +from typing import Any + +logger = logging.getLogger(__name__) + + +class MarkdownElementType(Enum): + """Types of Markdown elements.""" + HEADER = "header" + PARAGRAPH = "paragraph" + CODE_BLOCK = "code_block" + INLINE_CODE = "inline_code" + TABLE = "table" + LIST = "list" + BLOCKQUOTE = "blockquote" + HORIZONTAL_RULE = "horizontal_rule" + IMAGE = "image" + LINK = "link" + TEXT = "text" + + +@dataclass +class MarkdownElement: + """Represents a parsed Markdown element.""" + type: MarkdownElementType + content: str + level: int = 0 + language: str = "" + metadata: dict[str, Any] = field(default_factory=dict) + line_start: int = 0 + line_end: int = 0 + + def to_dict(self) -> dict[str, Any]: + return { + "type": self.type.value, + "content": self.content, + "level": self.level, + "language": self.language, + "metadata": self.metadata, + "line_start": self.line_start, + "line_end": self.line_end, + } + + +@dataclass +class MarkdownChunk: + """Represents a chunk of Markdown content with context.""" + chunk_id: str + content: str + element_type: MarkdownElementType + header_context: list[str] + level: int = 0 + language: str = "" + metadata: dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + return { + "chunk_id": self.chunk_id, + "content": self.content, + "element_type": self.element_type.value, + "header_context": self.header_context, + "level": self.level, + "language": self.language, + "metadata": self.metadata, + } + + +class MarkdownParser: + """ + Parser for Markdown documents. + Extracts structured elements from Markdown text. 
+ """ + + HEADER_PATTERN = re.compile(r'^(#{1,6})\s+(.+?)(?:\s+#+)?$', re.MULTILINE) + CODE_BLOCK_PATTERN = re.compile(r'^```(\w*)\n(.*?)^```', re.MULTILINE | re.DOTALL) + TABLE_PATTERN = re.compile(r'^(\|.+\|)\n(\|[-:\s|]+\|)\n((?:\|.+\|\n?)+)', re.MULTILINE) + LIST_PATTERN = re.compile(r'^([ \t]*[-*+]|\d+\.)\s+(.+)$', re.MULTILINE) + BLOCKQUOTE_PATTERN = re.compile(r'^>\s*(.+)$', re.MULTILINE) + HR_PATTERN = re.compile(r'^[-*_]{3,}\s*$', re.MULTILINE) + IMAGE_PATTERN = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)') + LINK_PATTERN = re.compile(r'\[([^\]]+)\]\(([^)]+)\)') + INLINE_CODE_PATTERN = re.compile(r'`([^`]+)`') + + def parse(self, text: str) -> list[MarkdownElement]: + """ + Parse Markdown text into structured elements. + + Args: + text: Raw Markdown text + + Returns: + List of MarkdownElement objects + """ + elements = [] + lines = text.split('\n') + current_pos = 0 + + code_block_ranges = self._extract_code_blocks(text, lines, elements) + table_ranges = self._extract_tables(text, lines, elements) + protected_ranges = code_block_ranges + table_ranges + + self._extract_headers(lines, elements, protected_ranges) + self._extract_lists(lines, elements, protected_ranges) + self._extract_blockquotes(lines, elements, protected_ranges) + self._extract_horizontal_rules(lines, elements, protected_ranges) + + self._fill_paragraphs(lines, elements, protected_ranges) + + elements.sort(key=lambda e: e.line_start) + + return elements + + def _extract_code_blocks( + self, + text: str, + lines: list[str], + elements: list[MarkdownElement], + ) -> list[tuple[int, int]]: + """Extract code blocks with language info.""" + ranges = [] + in_code_block = False + code_start = 0 + language = "" + code_content = [] + + for i, line in enumerate(lines): + if line.strip().startswith('```'): + if not in_code_block: + in_code_block = True + code_start = i + language = line.strip()[3:].strip() + code_content = [] + else: + in_code_block = False + elements.append(MarkdownElement( + 
type=MarkdownElementType.CODE_BLOCK, + content='\n'.join(code_content), + language=language, + line_start=code_start, + line_end=i, + metadata={"language": language}, + )) + ranges.append((code_start, i)) + + elif in_code_block: + code_content.append(line) + + return ranges + + def _extract_tables( + self, + text: str, + lines: list[str], + elements: list[MarkdownElement], + ) -> list[tuple[int, int]]: + """Extract Markdown tables.""" + ranges = [] + i = 0 + + while i < len(lines): + line = lines[i] + + if '|' in line and i + 1 < len(lines): + next_line = lines[i + 1] + if '|' in next_line and re.match(r'^[\|\-\:\s]+$', next_line.strip()): + table_lines = [line, next_line] + j = i + 2 + + while j < len(lines) and '|' in lines[j]: + table_lines.append(lines[j]) + j += 1 + + table_content = '\n'.join(table_lines) + headers = [h.strip() for h in line.split('|') if h.strip()] + row_count = len(table_lines) - 2 + + elements.append(MarkdownElement( + type=MarkdownElementType.TABLE, + content=table_content, + line_start=i, + line_end=j - 1, + metadata={ + "headers": headers, + "row_count": row_count, + }, + )) + ranges.append((i, j - 1)) + i = j + continue + + i += 1 + + return ranges + + def _is_in_protected_range(self, line_num: int, ranges: list[tuple[int, int]]) -> bool: + """Check if a line is within a protected range.""" + for start, end in ranges: + if start <= line_num <= end: + return True + return False + + def _extract_headers( + self, + lines: list[str], + elements: list[MarkdownElement], + protected_ranges: list[tuple[int, int]], + ) -> None: + """Extract headers with level info.""" + for i, line in enumerate(lines): + if self._is_in_protected_range(i, protected_ranges): + continue + + match = self.HEADER_PATTERN.match(line) + if match: + level = len(match.group(1)) + title = match.group(2).strip() + + elements.append(MarkdownElement( + type=MarkdownElementType.HEADER, + content=title, + level=level, + line_start=i, + line_end=i, + metadata={"level": level}, 
+ )) + + def _extract_lists( + self, + lines: list[str], + elements: list[MarkdownElement], + protected_ranges: list[tuple[int, int]], + ) -> None: + """Extract list items.""" + in_list = False + list_start = 0 + list_items = [] + list_indent = 0 + + for i, line in enumerate(lines): + if self._is_in_protected_range(i, protected_ranges): + if in_list: + self._save_list(elements, list_start, i - 1, list_items) + in_list = False + list_items = [] + continue + + match = self.LIST_PATTERN.match(line) + if match: + indent = len(line) - len(line.lstrip()) + item_content = match.group(2) + + if not in_list: + in_list = True + list_start = i + list_indent = indent + list_items = [(indent, item_content)] + else: + list_items.append((indent, item_content)) + else: + if in_list: + if line.strip() == '': + continue + else: + self._save_list(elements, list_start, i - 1, list_items) + in_list = False + list_items = [] + + if in_list: + self._save_list(elements, list_start, len(lines) - 1, list_items) + + def _save_list( + self, + elements: list[MarkdownElement], + start: int, + end: int, + items: list[tuple[int, str]], + ) -> None: + """Save a list element.""" + if not items: + return + + content = '\n'.join([item[1] for item in items]) + elements.append(MarkdownElement( + type=MarkdownElementType.LIST, + content=content, + line_start=start, + line_end=end, + metadata={ + "item_count": len(items), + "is_ordered": False, + }, + )) + + def _extract_blockquotes( + self, + lines: list[str], + elements: list[MarkdownElement], + protected_ranges: list[tuple[int, int]], + ) -> None: + """Extract blockquotes.""" + in_quote = False + quote_start = 0 + quote_lines = [] + + for i, line in enumerate(lines): + if self._is_in_protected_range(i, protected_ranges): + if in_quote: + self._save_blockquote(elements, quote_start, i - 1, quote_lines) + in_quote = False + quote_lines = [] + continue + + match = self.BLOCKQUOTE_PATTERN.match(line) + if match: + if not in_quote: + in_quote = True + 
quote_start = i + quote_lines.append(match.group(1)) + else: + if in_quote: + self._save_blockquote(elements, quote_start, i - 1, quote_lines) + in_quote = False + quote_lines = [] + + if in_quote: + self._save_blockquote(elements, quote_start, len(lines) - 1, quote_lines) + + def _save_blockquote( + self, + elements: list[MarkdownElement], + start: int, + end: int, + lines: list[str], + ) -> None: + """Save a blockquote element.""" + if not lines: + return + + elements.append(MarkdownElement( + type=MarkdownElementType.BLOCKQUOTE, + content='\n'.join(lines), + line_start=start, + line_end=end, + )) + + def _extract_horizontal_rules( + self, + lines: list[str], + elements: list[MarkdownElement], + protected_ranges: list[tuple[int, int]], + ) -> None: + """Extract horizontal rules.""" + for i, line in enumerate(lines): + if self._is_in_protected_range(i, protected_ranges): + continue + + if self.HR_PATTERN.match(line): + elements.append(MarkdownElement( + type=MarkdownElementType.HORIZONTAL_RULE, + content=line, + line_start=i, + line_end=i, + )) + + def _fill_paragraphs( + self, + lines: list[str], + elements: list[MarkdownElement], + protected_ranges: list[tuple[int, int]], + ) -> None: + """Fill in paragraphs for remaining content.""" + occupied = set() + for start, end in protected_ranges: + for i in range(start, end + 1): + occupied.add(i) + + for elem in elements: + for i in range(elem.line_start, elem.line_end + 1): + occupied.add(i) + + i = 0 + while i < len(lines): + if i in occupied: + i += 1 + continue + + if lines[i].strip() == '': + i += 1 + continue + + para_start = i + para_lines = [] + + while i < len(lines) and i not in occupied and lines[i].strip() != '': + para_lines.append(lines[i]) + occupied.add(i) + i += 1 + + if para_lines: + elements.append(MarkdownElement( + type=MarkdownElementType.PARAGRAPH, + content='\n'.join(para_lines), + line_start=para_start, + line_end=i - 1, + )) + + +class MarkdownChunker: + """ + Intelligent chunker for Markdown 
documents. + + Features: + - Structure-aware splitting (headers, code blocks, tables, lists) + - Context preservation (header hierarchy) + - Configurable chunk size and overlap + - Metadata extraction + """ + + def __init__( + self, + max_chunk_size: int = 1000, + min_chunk_size: int = 100, + chunk_overlap: int = 50, + preserve_code_blocks: bool = True, + preserve_tables: bool = True, + preserve_lists: bool = True, + include_header_context: bool = True, + ): + self._max_chunk_size = max_chunk_size + self._min_chunk_size = min_chunk_size + self._chunk_overlap = chunk_overlap + self._preserve_code_blocks = preserve_code_blocks + self._preserve_tables = preserve_tables + self._preserve_lists = preserve_lists + self._include_header_context = include_header_context + self._parser = MarkdownParser() + + def chunk(self, text: str, doc_id: str = "") -> list[MarkdownChunk]: + """ + Chunk Markdown text into structured segments. + + Args: + text: Raw Markdown text + doc_id: Optional document ID for chunk IDs + + Returns: + List of MarkdownChunk objects + """ + elements = self._parser.parse(text) + chunks = [] + header_stack: list[str] = [] + chunk_index = 0 + + for elem in elements: + if elem.type == MarkdownElementType.HEADER: + level = elem.level + while len(header_stack) >= level: + if header_stack: + header_stack.pop() + header_stack.append(elem.content) + continue + + if elem.type == MarkdownElementType.HORIZONTAL_RULE: + continue + + chunk_content = self._format_element_content(elem) + if not chunk_content: + continue + + chunk_id = f"{doc_id}_chunk_{chunk_index}" if doc_id else f"chunk_{chunk_index}" + + header_context = [] + if self._include_header_context: + header_context = header_stack.copy() + + if len(chunk_content) > self._max_chunk_size: + sub_chunks = self._split_large_element( + elem, + chunk_id, + header_context, + chunk_index, + ) + chunks.extend(sub_chunks) + chunk_index += len(sub_chunks) + else: + chunks.append(MarkdownChunk( + chunk_id=chunk_id, + 
content=chunk_content, + element_type=elem.type, + header_context=header_context, + level=elem.level, + language=elem.language, + metadata=elem.metadata, + )) + chunk_index += 1 + + return chunks + + def _format_element_content(self, elem: MarkdownElement) -> str: + """Format element content based on type.""" + if elem.type == MarkdownElementType.CODE_BLOCK: + lang = elem.language or "" + return f"```{lang}\n{elem.content}\n```" + + elif elem.type == MarkdownElementType.TABLE: + return elem.content + + elif elem.type == MarkdownElementType.LIST: + return elem.content + + elif elem.type == MarkdownElementType.BLOCKQUOTE: + lines = elem.content.split('\n') + return '\n'.join([f"> {line}" for line in lines]) + + elif elem.type == MarkdownElementType.PARAGRAPH: + return elem.content + + return elem.content + + def _split_large_element( + self, + elem: MarkdownElement, + base_id: str, + header_context: list[str], + start_index: int, + ) -> list[MarkdownChunk]: + """Split a large element into smaller chunks.""" + chunks = [] + + if elem.type == MarkdownElementType.CODE_BLOCK: + chunks = self._split_code_block(elem, base_id, header_context, start_index) + elif elem.type == MarkdownElementType.TABLE: + chunks = self._split_table(elem, base_id, header_context, start_index) + elif elem.type == MarkdownElementType.LIST: + chunks = self._split_list(elem, base_id, header_context, start_index) + else: + chunks = self._split_text(elem, base_id, header_context, start_index) + + return chunks + + def _split_code_block( + self, + elem: MarkdownElement, + base_id: str, + header_context: list[str], + start_index: int, + ) -> list[MarkdownChunk]: + """Split code block while preserving language marker.""" + chunks = [] + lines = elem.content.split('\n') + current_lines = [] + current_size = 0 + sub_index = 0 + + for line in lines: + if current_size + len(line) + 1 > self._max_chunk_size and current_lines: + chunk_content = f"```{elem.language}\n" + '\n'.join(current_lines) + "\n```" + 
chunks.append(MarkdownChunk( + chunk_id=f"{base_id}_{sub_index}", + content=chunk_content, + element_type=MarkdownElementType.CODE_BLOCK, + header_context=header_context, + language=elem.language, + metadata={**elem.metadata, "is_partial": True, "part": sub_index + 1}, + )) + sub_index += 1 + current_lines = [] + current_size = 0 + + current_lines.append(line) + current_size += len(line) + 1 + + if current_lines: + chunk_content = f"```{elem.language}\n" + '\n'.join(current_lines) + "\n```" + chunks.append(MarkdownChunk( + chunk_id=f"{base_id}_{sub_index}", + content=chunk_content, + element_type=MarkdownElementType.CODE_BLOCK, + header_context=header_context, + language=elem.language, + metadata={**elem.metadata, "is_partial": sub_index > 0, "part": sub_index + 1}, + )) + + return chunks + + def _split_table( + self, + elem: MarkdownElement, + base_id: str, + header_context: list[str], + start_index: int, + ) -> list[MarkdownChunk]: + """Split table while preserving header row.""" + chunks = [] + lines = elem.content.split('\n') + + if len(lines) < 2: + return [MarkdownChunk( + chunk_id=f"{base_id}_0", + content=elem.content, + element_type=MarkdownElementType.TABLE, + header_context=header_context, + metadata=elem.metadata, + )] + + header_line = lines[0] + separator_line = lines[1] + data_lines = lines[2:] + + current_lines = [header_line, separator_line] + current_size = len(header_line) + len(separator_line) + 2 + sub_index = 0 + + for line in data_lines: + if current_size + len(line) + 1 > self._max_chunk_size and len(current_lines) > 2: + chunks.append(MarkdownChunk( + chunk_id=f"{base_id}_{sub_index}", + content='\n'.join(current_lines), + element_type=MarkdownElementType.TABLE, + header_context=header_context, + metadata={**elem.metadata, "is_partial": True, "part": sub_index + 1}, + )) + sub_index += 1 + current_lines = [header_line, separator_line] + current_size = len(header_line) + len(separator_line) + 2 + + current_lines.append(line) + current_size 
+= len(line) + 1 + + if len(current_lines) > 2: + chunks.append(MarkdownChunk( + chunk_id=f"{base_id}_{sub_index}", + content='\n'.join(current_lines), + element_type=MarkdownElementType.TABLE, + header_context=header_context, + metadata={**elem.metadata, "is_partial": sub_index > 0, "part": sub_index + 1}, + )) + + return chunks + + def _split_list( + self, + elem: MarkdownElement, + base_id: str, + header_context: list[str], + start_index: int, + ) -> list[MarkdownChunk]: + """Split list into smaller chunks.""" + chunks = [] + items = elem.content.split('\n') + current_items = [] + current_size = 0 + sub_index = 0 + + for item in items: + if current_size + len(item) + 1 > self._max_chunk_size and current_items: + chunks.append(MarkdownChunk( + chunk_id=f"{base_id}_{sub_index}", + content='\n'.join(current_items), + element_type=MarkdownElementType.LIST, + header_context=header_context, + metadata={**elem.metadata, "is_partial": True, "part": sub_index + 1}, + )) + sub_index += 1 + current_items = [] + current_size = 0 + + current_items.append(item) + current_size += len(item) + 1 + + if current_items: + chunks.append(MarkdownChunk( + chunk_id=f"{base_id}_{sub_index}", + content='\n'.join(current_items), + element_type=MarkdownElementType.LIST, + header_context=header_context, + metadata={**elem.metadata, "is_partial": sub_index > 0, "part": sub_index + 1}, + )) + + return chunks + + def _split_text( + self, + elem: MarkdownElement, + base_id: str, + header_context: list[str], + start_index: int, + ) -> list[MarkdownChunk]: + """Split text content by sentences or paragraphs.""" + chunks = [] + text = elem.content + sub_index = 0 + + paragraphs = text.split('\n\n') + + current_content = "" + current_size = 0 + + for para in paragraphs: + if current_size + len(para) + 2 > self._max_chunk_size and current_content: + chunks.append(MarkdownChunk( + chunk_id=f"{base_id}_{sub_index}", + content=current_content.strip(), + element_type=elem.type, + 
def chunk_markdown(
    text: str,
    doc_id: str = "",
    max_chunk_size: int = 1000,
    min_chunk_size: int = 100,
    preserve_code_blocks: bool = True,
    preserve_tables: bool = True,
    preserve_lists: bool = True,
    include_header_context: bool = True,
) -> list[dict[str, Any]]:
    """
    Chunk Markdown text in one call using a throwaway :class:`MarkdownChunker`.

    Args:
        text: Raw Markdown text.
        doc_id: Optional document ID used to derive chunk IDs.
        max_chunk_size: Maximum chunk size in characters.
        min_chunk_size: Minimum chunk size in characters.
        preserve_code_blocks: Whether to keep code blocks intact.
        preserve_tables: Whether to keep tables intact.
        preserve_lists: Whether to keep lists intact.
        include_header_context: Whether to attach header context to chunks.

    Returns:
        List of chunk dictionaries (one per produced chunk).
    """
    splitter = MarkdownChunker(
        max_chunk_size=max_chunk_size,
        min_chunk_size=min_chunk_size,
        preserve_code_blocks=preserve_code_blocks,
        preserve_tables=preserve_tables,
        preserve_lists=preserve_lists,
        include_header_context=include_header_context,
    )
    return [piece.to_dict() for piece in splitter.chunk(text, doc_id)]
"""
Markdown parser with intelligent chunking.

[AC-AISVC-33] Markdown file parsing with structure-aware chunking.
"""

import logging
from collections import Counter
from pathlib import Path
from typing import Any

from app.services.document.base import (
    DocumentParseException,
    DocumentParser,
    ParseResult,
)
from app.services.document.markdown_chunker import (
    MarkdownChunker,
    MarkdownElementType,
)

logger = logging.getLogger(__name__)

# Candidate encodings, tried in order. latin-1 can decode any byte sequence,
# so it acts as a last-resort fallback that effectively never fails.
ENCODINGS_TO_TRY = ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]


class MarkdownParser(DocumentParser):
    """
    Parser for Markdown files with intelligent chunking.
    [AC-AISVC-33] Structure-aware parsing for Markdown documents.

    Features:
    - Header hierarchy extraction
    - Code block preservation
    - Table structure preservation
    - List grouping
    - Context-aware chunking
    """

    def __init__(
        self,
        encoding: str = "utf-8",
        max_chunk_size: int = 1000,
        min_chunk_size: int = 100,
        preserve_code_blocks: bool = True,
        preserve_tables: bool = True,
        preserve_lists: bool = True,
        include_header_context: bool = True,
        **kwargs: Any,
    ):
        """
        Args:
            encoding: Preferred encoding hint (decoding still falls back
                through ENCODINGS_TO_TRY).
            max_chunk_size: Maximum chunk size in characters.
            min_chunk_size: Minimum chunk size in characters.
            preserve_code_blocks: Keep fenced code blocks intact.
            preserve_tables: Keep tables intact.
            preserve_lists: Keep lists intact.
            include_header_context: Attach header hierarchy to each chunk.
            **kwargs: Extra configuration, stored but not interpreted here.
        """
        self._encoding = encoding
        self._max_chunk_size = max_chunk_size
        self._min_chunk_size = min_chunk_size
        self._preserve_code_blocks = preserve_code_blocks
        self._preserve_tables = preserve_tables
        self._preserve_lists = preserve_lists
        self._include_header_context = include_header_context
        self._extra_config = kwargs

        self._chunker = MarkdownChunker(
            max_chunk_size=max_chunk_size,
            min_chunk_size=min_chunk_size,
            preserve_code_blocks=preserve_code_blocks,
            preserve_tables=preserve_tables,
            preserve_lists=preserve_lists,
            include_header_context=include_header_context,
        )

    def _try_encodings(self, path: Path) -> tuple[str, str]:
        """
        Try multiple encodings to read the file.

        Returns: (text, encoding_used)

        Raises:
            DocumentParseException: If no candidate encoding can decode the
                file (practically unreachable while latin-1 is in the list,
                but kept as a defensive guard).
        """
        for enc in ENCODINGS_TO_TRY:
            try:
                with open(path, encoding=enc) as f:
                    text = f.read()
                # Lazy %-style args: the message is only formatted if emitted.
                logger.info("Successfully parsed Markdown with encoding: %s", enc)
                return text, enc
            except (UnicodeDecodeError, LookupError):
                continue

        raise DocumentParseException(
            "Failed to decode Markdown file with any known encoding",
            file_path=str(path),
            parser="markdown"
        )

    def parse(self, file_path: str | Path) -> ParseResult:
        """
        Parse a Markdown file and extract structured content.
        [AC-AISVC-33] Structure-aware parsing.

        Raises:
            DocumentParseException: If the file is missing, undecodable, or
                chunking fails for any reason.
        """
        path = Path(file_path)

        if not path.exists():
            raise DocumentParseException(
                f"File not found: {path}",
                file_path=str(path),
                parser="markdown"
            )

        try:
            text, encoding_used = self._try_encodings(path)

            file_size = path.stat().st_size
            line_count = text.count("\n") + 1

            chunks = self._chunker.chunk(text, doc_id=path.stem)

            # Single pass over the chunks instead of one pass per element type.
            type_counts = Counter(c.element_type for c in chunks)
            header_count = type_counts[MarkdownElementType.HEADER]
            code_block_count = type_counts[MarkdownElementType.CODE_BLOCK]
            table_count = type_counts[MarkdownElementType.TABLE]
            list_count = type_counts[MarkdownElementType.LIST]

            logger.info(
                "Parsed Markdown: %s, lines=%d, chars=%d, chunks=%d, "
                "headers=%d, code_blocks=%d, tables=%d, lists=%d",
                path.name, line_count, len(text), len(chunks),
                header_count, code_block_count, table_count, list_count,
            )

            return ParseResult(
                text=text,
                source_path=str(path),
                file_size=file_size,
                metadata={
                    "format": "markdown",
                    "line_count": line_count,
                    "encoding": encoding_used,
                    "chunk_count": len(chunks),
                    "structure": {
                        "headers": header_count,
                        "code_blocks": code_block_count,
                        "tables": table_count,
                        "lists": list_count,
                    },
                    "chunks": [chunk.to_dict() for chunk in chunks],
                }
            )

        except DocumentParseException:
            raise
        except Exception as e:
            # Chain the original exception so the root-cause traceback survives.
            raise DocumentParseException(
                f"Failed to parse Markdown file: {e}",
                file_path=str(path),
                parser="markdown",
                details={"error": str(e)}
            ) from e

    def get_supported_extensions(self) -> list[str]:
        """Get supported file extensions."""
        return [".md", ".markdown"]

    def get_chunks(self, text: str, doc_id: str = "") -> list[dict[str, Any]]:
        """
        Get structured chunks from Markdown text.

        Args:
            text: Markdown text content
            doc_id: Optional document ID

        Returns:
            List of chunk dictionaries
        """
        chunks = self._chunker.chunk(text, doc_id)
        return [chunk.to_dict() for chunk in chunks]