[AC-DOC-PARSER] feat(document): 新增图片和 Markdown 解析器

- 新增 ImageParser 支持图片文件解析
- 新增 MarkdownParser 支持 Markdown 文件解析
- 新增 MarkdownChunker 实现 Markdown 智能分块
- 支持按标题、段落、代码块等元素类型分块
- 更新 document 模块导出和工厂方法
This commit is contained in:
MerCry 2026-03-11 18:56:43 +08:00
parent b3680bda8a
commit 4de2a2aece
5 changed files with 1475 additions and 2 deletions

View File

@ -16,6 +16,16 @@ from app.services.document.factory import (
get_supported_document_formats, get_supported_document_formats,
parse_document, parse_document,
) )
from app.services.document.image_parser import ImageParser
from app.services.document.markdown_chunker import (
MarkdownChunk,
MarkdownChunker,
MarkdownElement,
MarkdownElementType,
MarkdownParser as MarkdownStructureParser,
chunk_markdown,
)
from app.services.document.markdown_parser import MarkdownParser
from app.services.document.pdf_parser import PDFParser, PDFPlumberParser from app.services.document.pdf_parser import PDFParser, PDFPlumberParser
from app.services.document.text_parser import TextParser from app.services.document.text_parser import TextParser
from app.services.document.word_parser import WordParser from app.services.document.word_parser import WordParser
@ -35,4 +45,12 @@ __all__ = [
"ExcelParser", "ExcelParser",
"CSVParser", "CSVParser",
"TextParser", "TextParser",
"MarkdownParser",
"MarkdownChunker",
"MarkdownChunk",
"MarkdownElement",
"MarkdownElementType",
"MarkdownStructureParser",
"chunk_markdown",
"ImageParser",
] ]

View File

@ -16,6 +16,8 @@ from app.services.document.base import (
UnsupportedFormatError, UnsupportedFormatError,
) )
from app.services.document.excel_parser import CSVParser, ExcelParser from app.services.document.excel_parser import CSVParser, ExcelParser
from app.services.document.image_parser import ImageParser
from app.services.document.markdown_parser import MarkdownParser
from app.services.document.pdf_parser import PDFParser, PDFPlumberParser from app.services.document.pdf_parser import PDFParser, PDFPlumberParser
from app.services.document.text_parser import TextParser from app.services.document.text_parser import TextParser
from app.services.document.word_parser import WordParser from app.services.document.word_parser import WordParser
@ -45,6 +47,8 @@ class DocumentParserFactory:
"excel": ExcelParser, "excel": ExcelParser,
"csv": CSVParser, "csv": CSVParser,
"text": TextParser, "text": TextParser,
"markdown": MarkdownParser,
"image": ImageParser,
} }
cls._extension_map = { cls._extension_map = {
@ -54,14 +58,22 @@ class DocumentParserFactory:
".xls": "excel", ".xls": "excel",
".csv": "csv", ".csv": "csv",
".txt": "text", ".txt": "text",
".md": "text", ".md": "markdown",
".markdown": "text", ".markdown": "markdown",
".rst": "text", ".rst": "text",
".log": "text", ".log": "text",
".json": "text", ".json": "text",
".xml": "text", ".xml": "text",
".yaml": "text", ".yaml": "text",
".yml": "text", ".yml": "text",
".jpg": "image",
".jpeg": "image",
".png": "image",
".gif": "image",
".webp": "image",
".bmp": "image",
".tiff": "image",
".tif": "image",
} }
@classmethod @classmethod
@ -174,6 +186,8 @@ class DocumentParserFactory:
"excel": "Excel 电子表格", "excel": "Excel 电子表格",
"csv": "CSV 文件", "csv": "CSV 文件",
"text": "文本文件", "text": "文本文件",
"markdown": "Markdown 文档",
"image": "图片文件",
} }
descriptions = { descriptions = {
@ -183,6 +197,8 @@ class DocumentParserFactory:
"excel": "解析 Excel 电子表格,支持多工作表", "excel": "解析 Excel 电子表格,支持多工作表",
"csv": "解析 CSV 文件,自动检测编码", "csv": "解析 CSV 文件,自动检测编码",
"text": "解析纯文本文件,支持多种编码", "text": "解析纯文本文件,支持多种编码",
"markdown": "智能解析 Markdown 文档,保留结构(标题、代码块、表格、列表)",
"image": "使用多模态 LLM 解析图片,提取文字和关键信息",
} }
info.append({ info.append({

View File

@ -0,0 +1,490 @@
"""
Image parser using multimodal LLM.
Supports parsing images into structured text content for knowledge base indexing.
"""
import asyncio
import base64
import json
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from app.services.document.base import (
DocumentParseException,
DocumentParser,
PageText,
ParseResult,
)
from app.services.llm.factory import LLMUsageType, get_llm_config_manager
logger = logging.getLogger(__name__)
# System prompt for the multimodal LLM: instructs it to analyze the image and
# split its content into independent, retrieval-friendly chunks, and to emit a
# strict JSON payload (image_summary / total_chunks / chunks[]) that
# _parse_llm_response() expects. (Prompt text is runtime data; kept verbatim.)
IMAGE_SYSTEM_PROMPT = """你是一个专业的图像内容分析助手。你的任务是分析图片内容,并将其智能拆分为适合知识库检索的独立数据块。
## 分析要求
1. 仔细分析图片内容识别其中的文字图表数据等信息
2. 根据内容的逻辑结构智能判断如何拆分为独立的知识条目
3. 每个条目应该是独立完整可检索的知识单元
## 输出格式
请严格按照以下 JSON 格式输出不要添加任何其他内容
```json
{
"image_summary": "图片整体概述(一句话描述图片主题)",
"total_chunks": <分块总数>,
"chunks": [
{
"chunk_index": 0,
"content": "该分块的完整内容文字",
"chunk_type": "text|table|list|diagram|chart|mixed",
"keywords": ["关键词1", "关键词2"]
}
]
}
```
## 分块策略
- **单一内容**: 如果图片只有一段完整的文字/信息可以只输出1个分块
- **多段落内容**: 按段落或逻辑单元拆分每个段落作为独立分块
- **表格数据**: 将表格内容转换为结构化文字作为一个分块
- **图表数据**: 描述图表内容和数据作为一个分块
- **列表内容**: 每个列表项可作为独立分块或合并为相关的一组
- **混合内容**: 根据内容类型分别处理确保每个分块主题明确
## 注意事项
1. 每个分块的 content 必须是完整可独立理解的文字
2. chunk_type 用于标识内容类型便于后续处理
3. keywords 提取该分块的核心关键词便于检索
4. 确保输出的 JSON 格式正确可以被解析"""
# User-turn prompt that accompanies the base64 image payload.
IMAGE_USER_PROMPT = "请分析这张图片,按照要求的 JSON 格式输出分块结果。"
@dataclass
class ImageChunk:
    """A single retrieval-ready chunk extracted from an image by the LLM."""
    chunk_index: int          # position of the chunk within the image
    content: str              # full, self-contained text of the chunk
    chunk_type: str = "text"  # text | table | list | diagram | chart | mixed
    keywords: list[str] = field(default_factory=list)  # retrieval keywords
@dataclass
class ImageParseResult:
    """Image parse result, including the intelligent chunks."""
    image_summary: str        # one-sentence summary of the whole image
    chunks: list[ImageChunk]  # structured chunks produced by the LLM
    raw_text: str             # chunk contents joined with blank lines
    source_path: str          # filled in by the caller after analysis
    file_size: int            # filled in by the caller after analysis
    metadata: dict[str, Any] = field(default_factory=dict)
class ImageParser(DocumentParser):
    """
    Image parser using multimodal LLM.
    Supports common image formats and extracts text content using
    vision-capable LLM models (GPT-4V, GPT-4o, etc.).
    Features:
    - Intelligent chunking based on content structure
    - Structured output with keywords and chunk types
    - Support for various content types (text, table, chart, etc.)
    """

    # Lowercased file extensions this parser accepts; mirrors the factory's
    # extension map for image formats.
    SUPPORTED_EXTENSIONS = [
        ".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif"
    ]
def __init__(
    self,
    model: str | None = None,
    max_tokens: int = 4096,
    timeout_seconds: int = 120,
):
    """
    Args:
        model: Optional override of the LLM model name; when None the
            model from the KB-processing config is used (see
            _analyze_image_async, default "gpt-4o-mini").
        max_tokens: Maximum tokens for the LLM completion.
        timeout_seconds: Timeout for the LLM request.
    """
    self._model = model
    self._max_tokens = max_tokens
    self._timeout_seconds = timeout_seconds
def parse(self, file_path: str | Path) -> ParseResult:
    """
    Parse an image file and extract text content using a multimodal LLM.

    Note: This method is synchronous but internally drives async
    operations. For async contexts, use parse_async() instead.

    Args:
        file_path: Path to the image file.

    Returns:
        ParseResult with extracted text content; per-chunk details are
        placed in metadata["chunks"].

    Raises:
        DocumentParseException: If the file is missing, the format is
            unsupported, or analysis fails.
    """
    path = Path(file_path)
    if not path.exists():
        raise DocumentParseException(
            f"Image file not found: {file_path}",
            file_path=str(path),
            parser="image",
        )
    file_size = path.stat().st_size
    extension = path.suffix.lower()
    if extension not in self.SUPPORTED_EXTENSIONS:
        raise DocumentParseException(
            f"Unsupported image format: {extension}",
            file_path=str(path),
            parser="image",
            details={"supported_formats": self.SUPPORTED_EXTENSIONS},
        )
    try:
        with open(path, "rb") as f:
            image_data = f.read()
        image_base64 = base64.b64encode(image_data).decode("utf-8")
        mime_type = self._get_mime_type(extension)
        # Drive the coroutine from sync code. asyncio.run() fails when a
        # loop is already running in this thread, so detect that case
        # explicitly. (Previously the executor branch sat inside the same
        # try, so a RuntimeError raised by it was misrouted into the
        # "no running loop" path; the `loop` variable was also unused.)
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No running loop in this thread: run the coroutine directly.
            result = asyncio.run(self._analyze_image_async(image_base64, mime_type))
        else:
            # Already inside an event loop: run in a worker thread that
            # owns a fresh loop so we do not block or re-enter this one.
            import concurrent.futures
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future = executor.submit(
                    asyncio.run,
                    self._analyze_image_async(image_base64, mime_type),
                )
                result = future.result()
        logger.info(
            f"[IMAGE-PARSER] Successfully parsed image: {path.name}, "
            f"size={file_size}, chunks={len(result.chunks)}"
        )
        return ParseResult(
            text=result.raw_text,
            source_path=str(path),
            file_size=file_size,
            page_count=1,  # an image is modeled as a single "page"
            metadata={
                "format": extension,
                "parser": "image",
                "mime_type": mime_type,
                "image_summary": result.image_summary,
                "chunk_count": len(result.chunks),
                "chunks": [
                    {
                        "chunk_index": c.chunk_index,
                        "content": c.content,
                        "chunk_type": c.chunk_type,
                        "keywords": c.keywords,
                    }
                    for c in result.chunks
                ],
            },
            pages=[PageText(page=1, text=result.raw_text)],
        )
    except DocumentParseException:
        # Already a well-formed parse error (e.g. empty LLM response):
        # propagate without double-wrapping it in another exception.
        raise
    except Exception as e:
        logger.error(f"[IMAGE-PARSER] Failed to parse image {path}: {e}")
        raise DocumentParseException(
            f"Failed to parse image: {str(e)}",
            file_path=str(path),
            parser="image",
            details={"error": str(e)},
        ) from e
async def parse_async(self, file_path: str | Path) -> ParseResult:
    """
    Async version of parse() for use in async contexts.

    Args:
        file_path: Path to the image file.

    Returns:
        ParseResult with extracted text content; per-chunk details are
        placed in metadata["chunks"].

    Raises:
        DocumentParseException: If the file is missing, the format is
            unsupported, or analysis fails.
    """
    path = Path(file_path)
    if not path.exists():
        raise DocumentParseException(
            f"Image file not found: {file_path}",
            file_path=str(path),
            parser="image",
        )
    file_size = path.stat().st_size
    extension = path.suffix.lower()
    if extension not in self.SUPPORTED_EXTENSIONS:
        raise DocumentParseException(
            f"Unsupported image format: {extension}",
            file_path=str(path),
            parser="image",
            details={"supported_formats": self.SUPPORTED_EXTENSIONS},
        )
    try:
        with open(path, "rb") as f:
            image_data = f.read()
        image_base64 = base64.b64encode(image_data).decode("utf-8")
        mime_type = self._get_mime_type(extension)
        result = await self._analyze_image_async(image_base64, mime_type)
        logger.info(
            f"[IMAGE-PARSER] Successfully parsed image (async): {path.name}, "
            f"size={file_size}, chunks={len(result.chunks)}"
        )
        return ParseResult(
            text=result.raw_text,
            source_path=str(path),
            file_size=file_size,
            page_count=1,  # an image is modeled as a single "page"
            metadata={
                "format": extension,
                "parser": "image",
                "mime_type": mime_type,
                "image_summary": result.image_summary,
                "chunk_count": len(result.chunks),
                "chunks": [
                    {
                        "chunk_index": c.chunk_index,
                        "content": c.content,
                        "chunk_type": c.chunk_type,
                        "keywords": c.keywords,
                    }
                    for c in result.chunks
                ],
            },
            pages=[PageText(page=1, text=result.raw_text)],
        )
    except DocumentParseException:
        # Already a well-formed parse error (e.g. empty LLM response):
        # propagate without double-wrapping it.
        raise
    except Exception as e:
        logger.error(f"[IMAGE-PARSER] Failed to parse image {path}: {e}")
        raise DocumentParseException(
            f"Failed to parse image: {str(e)}",
            file_path=str(path),
            parser="image",
            details={"error": str(e)},
        ) from e
async def parse_with_chunks(self, file_path: str | Path) -> ImageParseResult:
    """
    Parse an image and return the structured result with intelligent chunks.

    Unlike parse()/parse_async(), this returns the ImageParseResult
    directly instead of flattening it into a ParseResult.

    Args:
        file_path: Path to the image file.

    Returns:
        ImageParseResult with intelligent chunks and source metadata.

    Raises:
        DocumentParseException: If the file is missing or unsupported.
    """
    path = Path(file_path)
    if not path.exists():
        raise DocumentParseException(
            f"Image file not found: {file_path}",
            file_path=str(path),
            parser="image",
        )
    extension = path.suffix.lower()
    size_bytes = path.stat().st_size
    if extension not in self.SUPPORTED_EXTENSIONS:
        raise DocumentParseException(
            f"Unsupported image format: {extension}",
            file_path=str(path),
            parser="image",
            details={"supported_formats": self.SUPPORTED_EXTENSIONS},
        )
    encoded = base64.b64encode(path.read_bytes()).decode("utf-8")
    mime = self._get_mime_type(extension)
    parsed = await self._analyze_image_async(encoded, mime)
    # Stamp source info onto the LLM result before handing it back.
    parsed.source_path = str(path)
    parsed.file_size = size_bytes
    parsed.metadata = {
        "format": extension,
        "parser": "image",
        "mime_type": mime,
    }
    return parsed
async def _analyze_image_async(self, image_base64: str, mime_type: str) -> ImageParseResult:
    """
    Analyze image using multimodal LLM and return structured chunks.

    Args:
        image_base64: Base64 encoded image data.
        mime_type: MIME type of the image.

    Returns:
        ImageParseResult with intelligent chunks.

    Raises:
        DocumentParseException: If the LLM returns an empty response.
    """
    try:
        manager = get_llm_config_manager()
        client = manager.get_kb_processing_client()
        config = manager.kb_processing_config
        # Explicit constructor override wins; otherwise fall back to the
        # KB-processing config, defaulting to "gpt-4o-mini".
        model = self._model or config.get("model", "gpt-4o-mini")
        # OpenAI-style multimodal message: text instruction plus the image
        # embedded as a base64 data URL.
        messages = [
            {
                "role": "system",
                "content": IMAGE_SYSTEM_PROMPT,
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": IMAGE_USER_PROMPT,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_base64}",
                        },
                    },
                ],
            },
        ]
        # NOTE(review): local import — presumably to avoid a circular
        # import with the llm package; confirm.
        from app.services.llm.base import LLMConfig
        llm_config = LLMConfig(
            model=model,
            max_tokens=self._max_tokens,
            temperature=0.3,  # low temperature for stable, parseable JSON
            timeout_seconds=self._timeout_seconds,
        )
        response = await client.generate(messages=messages, config=llm_config)
        if not response.content:
            raise DocumentParseException(
                "LLM returned empty response for image analysis",
                parser="image",
            )
        return self._parse_llm_response(response.content)
    except Exception as e:
        # Log here for context, then let the caller wrap/handle it.
        logger.error(f"[IMAGE-PARSER] LLM analysis failed: {e}")
        raise
def _parse_llm_response(self, response_content: str) -> ImageParseResult:
    """
    Convert the raw LLM reply into a structured ImageParseResult.

    Falls back to treating the whole reply as a single text chunk when
    the reply is not valid JSON or yields no non-empty chunks.

    Args:
        response_content: Raw LLM response content.

    Returns:
        ImageParseResult with parsed chunks (source_path/file_size are
        left empty for the caller to fill).
    """
    try:
        # json.loads is the only JSONDecodeError source, so the try
        # scope is limited to it.
        data = json.loads(self._extract_json(response_content))
    except json.JSONDecodeError as e:
        logger.warning(f"[IMAGE-PARSER] Failed to parse JSON response: {e}, using fallback")
        fallback = ImageChunk(
            chunk_index=0,
            content=response_content,
            chunk_type="text",
            keywords=[],
        )
        return ImageParseResult(
            image_summary="图片内容",
            chunks=[fallback],
            raw_text=response_content,
            source_path="",
            file_size=0,
        )
    chunks: list[ImageChunk] = []
    for item in data.get("chunks", []):
        candidate = ImageChunk(
            chunk_index=item.get("chunk_index", len(chunks)),
            content=item.get("content", ""),
            chunk_type=item.get("chunk_type", "text"),
            keywords=item.get("keywords", []),
        )
        # Skip chunks whose content is empty/whitespace.
        if candidate.content.strip():
            chunks.append(candidate)
    if not chunks:
        # Valid JSON but nothing usable: keep the raw reply as one chunk.
        chunks = [ImageChunk(
            chunk_index=0,
            content=response_content,
            chunk_type="text",
            keywords=[],
        )]
    return ImageParseResult(
        image_summary=data.get("image_summary", ""),
        chunks=chunks,
        raw_text="\n\n".join(c.content for c in chunks),
        source_path="",
        file_size=0,
    )
def _extract_json(self, content: str) -> str:
"""
Extract JSON from LLM response content.
Args:
content: Raw response content that may contain JSON.
Returns:
Extracted JSON string.
"""
content = content.strip()
if content.startswith("{") and content.endswith("}"):
return content
json_start = content.find("{")
json_end = content.rfind("}")
if json_start != -1 and json_end != -1 and json_end > json_start:
return content[json_start:json_end + 1]
return content
def _get_mime_type(self, extension: str) -> str:
"""Get MIME type for image extension."""
mime_types = {
".jpg": "image/jpeg",
".jpeg": "image/jpeg",
".png": "image/png",
".gif": "image/gif",
".webp": "image/webp",
".bmp": "image/bmp",
".tiff": "image/tiff",
".tif": "image/tiff",
}
return mime_types.get(extension.lower(), "image/jpeg")
def get_supported_extensions(self) -> list[str]:
"""Get list of supported image extensions."""
return ImageParser.SUPPORTED_EXTENSIONS

View File

@ -0,0 +1,771 @@
"""
Markdown intelligent chunker with structure-aware splitting.
Supports headers, code blocks, tables, lists, and preserves context.
"""
import logging
import re
from dataclasses import dataclass, field
from enum import Enum
from typing import Any
logger = logging.getLogger(__name__)
class MarkdownElementType(Enum):
    """Types of Markdown elements recognized by the parser.

    NOTE(review): INLINE_CODE, IMAGE, LINK and TEXT are not produced by
    any extraction pass in this file — confirm they are used externally.
    """
    HEADER = "header"
    PARAGRAPH = "paragraph"
    CODE_BLOCK = "code_block"
    INLINE_CODE = "inline_code"
    TABLE = "table"
    LIST = "list"
    BLOCKQUOTE = "blockquote"
    HORIZONTAL_RULE = "horizontal_rule"
    IMAGE = "image"
    LINK = "link"
    TEXT = "text"
@dataclass
class MarkdownElement:
    """Represents a parsed Markdown element."""
    type: MarkdownElementType  # element kind (header, code block, ...)
    content: str               # element text (markers stripped per type)
    level: int = 0             # header depth 1-6; 0 for non-headers
    language: str = ""         # code-fence language tag, if any
    metadata: dict[str, Any] = field(default_factory=dict)
    line_start: int = 0        # first source line (0-based, inclusive)
    line_end: int = 0          # last source line (0-based, inclusive)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict; the enum type becomes its string value."""
        return {
            "type": self.type.value,
            "content": self.content,
            "level": self.level,
            "language": self.language,
            "metadata": self.metadata,
            "line_start": self.line_start,
            "line_end": self.line_end,
        }
@dataclass
class MarkdownChunk:
    """Represents a chunk of Markdown content with context."""
    chunk_id: str                      # "<doc>_chunk_<n>" or "chunk_<n>" (+ "_<part>" for splits)
    content: str                       # chunk text, formatted as Markdown
    element_type: MarkdownElementType  # type of the source element
    header_context: list[str]          # active header hierarchy above this chunk
    level: int = 0                     # header depth of the source element, if any
    language: str = ""                 # code-fence language tag, if any
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict; element_type becomes its string value."""
        return {
            "chunk_id": self.chunk_id,
            "content": self.content,
            "element_type": self.element_type.value,
            "header_context": self.header_context,
            "level": self.level,
            "language": self.language,
            "metadata": self.metadata,
        }
class MarkdownParser:
    """
    Parser for Markdown documents.
    Extracts structured elements from Markdown text.
    """

    # ATX headers: 1-6 '#', a space, the title, optional trailing '#'s.
    HEADER_PATTERN = re.compile(r'^(#{1,6})\s+(.+?)(?:\s+#+)?$', re.MULTILINE)
    # Fenced code blocks with an optional language tag. NOTE(review): not
    # referenced by parse(); extraction walks lines in _extract_code_blocks.
    CODE_BLOCK_PATTERN = re.compile(r'^```(\w*)\n(.*?)^```', re.MULTILINE | re.DOTALL)
    # Header row + separator row + data rows. NOTE(review): not referenced
    # by parse(); extraction is done line-wise in _extract_tables.
    TABLE_PATTERN = re.compile(r'^(\|.+\|)\n(\|[-:\s|]+\|)\n((?:\|.+\|\n?)+)', re.MULTILINE)
    # Unordered (-, *, +) or ordered ("1.") list items.
    LIST_PATTERN = re.compile(r'^([ \t]*[-*+]|\d+\.)\s+(.+)$', re.MULTILINE)
    # "> quoted text" lines.
    BLOCKQUOTE_PATTERN = re.compile(r'^>\s*(.+)$', re.MULTILINE)
    # Horizontal rules: 3+ of '-', '*' or '_' on their own line.
    HR_PATTERN = re.compile(r'^[-*_]{3,}\s*$', re.MULTILINE)
    # ![alt](src) / [text](href) / `code` spans. NOTE(review): unused by the
    # extraction passes in this file — confirm external use.
    IMAGE_PATTERN = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
    LINK_PATTERN = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
    INLINE_CODE_PATTERN = re.compile(r'`([^`]+)`')
def parse(self, text: str) -> list[MarkdownElement]:
    """
    Parse Markdown text into structured elements.

    Args:
        text: Raw Markdown text

    Returns:
        List of MarkdownElement objects, sorted by source line order.
    """
    # (Removed an unused `current_pos` local from the original.)
    elements: list[MarkdownElement] = []
    lines = text.split('\n')
    # Code blocks and tables are extracted first; their line ranges are
    # "protected" so later passes do not re-parse their contents.
    code_block_ranges = self._extract_code_blocks(text, lines, elements)
    table_ranges = self._extract_tables(text, lines, elements)
    protected_ranges = code_block_ranges + table_ranges
    self._extract_headers(lines, elements, protected_ranges)
    self._extract_lists(lines, elements, protected_ranges)
    self._extract_blockquotes(lines, elements, protected_ranges)
    self._extract_horizontal_rules(lines, elements, protected_ranges)
    # Any lines still unclaimed become paragraph elements.
    self._fill_paragraphs(lines, elements, protected_ranges)
    # Restore document order across all passes.
    elements.sort(key=lambda e: e.line_start)
    return elements
def _extract_code_blocks(
    self,
    text: str,
    lines: list[str],
    elements: list[MarkdownElement],
) -> list[tuple[int, int]]:
    """Extract fenced code blocks (``` ... ```) with language info.

    Appends a CODE_BLOCK element per fence pair and returns the claimed
    (start, end) line ranges.

    Fix: an unterminated fence at end-of-file previously lost its
    content entirely; it is now flushed as a code block spanning to the
    last line.
    """
    ranges: list[tuple[int, int]] = []
    in_code_block = False
    code_start = 0
    language = ""
    code_content: list[str] = []

    def _flush(end_line: int) -> None:
        # Emit the accumulated block and claim its line range.
        elements.append(MarkdownElement(
            type=MarkdownElementType.CODE_BLOCK,
            content='\n'.join(code_content),
            language=language,
            line_start=code_start,
            line_end=end_line,
            metadata={"language": language},
        ))
        ranges.append((code_start, end_line))

    for i, line in enumerate(lines):
        if line.strip().startswith('```'):
            if not in_code_block:
                # Opening fence: remember position and language tag.
                in_code_block = True
                code_start = i
                language = line.strip()[3:].strip()
                code_content = []
            else:
                # Closing fence: emit the block.
                in_code_block = False
                _flush(i)
        elif in_code_block:
            code_content.append(line)
    if in_code_block:
        # Unterminated fence at EOF: keep the content instead of dropping it.
        _flush(len(lines) - 1)
    return ranges
def _extract_tables(
    self,
    text: str,
    lines: list[str],
    elements: list[MarkdownElement],
) -> list[tuple[int, int]]:
    """Extract Markdown tables.

    A table starts at a line containing '|' whose next line is a
    separator row (only '|', '-', ':' and whitespace); every following
    line containing '|' is consumed as a data row. Appends a TABLE
    element per table and returns the claimed (start, end) line ranges.
    """
    ranges = []
    i = 0
    while i < len(lines):
        line = lines[i]
        if '|' in line and i + 1 < len(lines):
            next_line = lines[i + 1]
            # Separator row check confirms this is a table header.
            if '|' in next_line and re.match(r'^[\|\-\:\s]+$', next_line.strip()):
                table_lines = [line, next_line]
                j = i + 2
                while j < len(lines) and '|' in lines[j]:
                    table_lines.append(lines[j])
                    j += 1
                table_content = '\n'.join(table_lines)
                headers = [h.strip() for h in line.split('|') if h.strip()]
                row_count = len(table_lines) - 2  # minus header + separator
                elements.append(MarkdownElement(
                    type=MarkdownElementType.TABLE,
                    content=table_content,
                    line_start=i,
                    line_end=j - 1,
                    metadata={
                        "headers": headers,
                        "row_count": row_count,
                    },
                ))
                ranges.append((i, j - 1))
                i = j  # resume scanning after the table
                continue
        i += 1
    return ranges
def _is_in_protected_range(self, line_num: int, ranges: list[tuple[int, int]]) -> bool:
"""Check if a line is within a protected range."""
for start, end in ranges:
if start <= line_num <= end:
return True
return False
def _extract_headers(
    self,
    lines: list[str],
    elements: list[MarkdownElement],
    protected_ranges: list[tuple[int, int]],
) -> None:
    """Append a HEADER element for every ATX heading outside protected ranges."""
    for idx, raw in enumerate(lines):
        if self._is_in_protected_range(idx, protected_ranges):
            continue
        m = self.HEADER_PATTERN.match(raw)
        if m is None:
            continue
        depth = len(m.group(1))  # number of '#' marks = heading level
        elements.append(MarkdownElement(
            type=MarkdownElementType.HEADER,
            content=m.group(2).strip(),
            level=depth,
            line_start=idx,
            line_end=idx,
            metadata={"level": depth},
        ))
def _extract_lists(
    self,
    lines: list[str],
    elements: list[MarkdownElement],
    protected_ranges: list[tuple[int, int]],
) -> None:
    """Extract runs of list items (-, *, + or "1.") as LIST elements.

    Blank lines inside a list do NOT terminate it; any other non-list
    line does. A protected range (code block / table) also ends an open
    list.
    """
    in_list = False
    list_start = 0
    list_items = []   # (indent, item_text) pairs of the current list
    list_indent = 0   # indent of the first item; recorded but not read below
    for i, line in enumerate(lines):
        if self._is_in_protected_range(i, protected_ranges):
            # A protected block interrupts any open list.
            if in_list:
                self._save_list(elements, list_start, i - 1, list_items)
                in_list = False
                list_items = []
            continue
        match = self.LIST_PATTERN.match(line)
        if match:
            indent = len(line) - len(line.lstrip())
            item_content = match.group(2)
            if not in_list:
                in_list = True
                list_start = i
                list_indent = indent
                list_items = [(indent, item_content)]
            else:
                list_items.append((indent, item_content))
        else:
            if in_list:
                if line.strip() == '':
                    # Blank line: tolerate it, the list may continue.
                    continue
                else:
                    self._save_list(elements, list_start, i - 1, list_items)
                    in_list = False
                    list_items = []
    # Flush a list that runs to the end of the document.
    if in_list:
        self._save_list(elements, list_start, len(lines) - 1, list_items)
def _save_list(
    self,
    elements: list[MarkdownElement],
    start: int,
    end: int,
    items: list[tuple[int, str]],
) -> None:
    """Append a LIST element built from (indent, text) items; no-op when empty.

    NOTE(review): content keeps only the item text (indents and bullet
    markers are dropped), and is_ordered is hardcoded False even for
    numbered lists — confirm both are intended.
    """
    if not items:
        return
    content = '\n'.join([item[1] for item in items])
    elements.append(MarkdownElement(
        type=MarkdownElementType.LIST,
        content=content,
        line_start=start,
        line_end=end,
        metadata={
            "item_count": len(items),
            "is_ordered": False,
        },
    ))
def _extract_blockquotes(
    self,
    lines: list[str],
    elements: list[MarkdownElement],
    protected_ranges: list[tuple[int, int]],
) -> None:
    """Extract consecutive "> " lines as BLOCKQUOTE elements.

    Unlike lists, ANY non-quote line (including a blank one) ends the
    quote. Protected ranges also end an open quote. The leading ">"
    marker is stripped from the stored content.
    """
    in_quote = False
    quote_start = 0
    quote_lines = []
    for i, line in enumerate(lines):
        if self._is_in_protected_range(i, protected_ranges):
            if in_quote:
                self._save_blockquote(elements, quote_start, i - 1, quote_lines)
                in_quote = False
                quote_lines = []
            continue
        match = self.BLOCKQUOTE_PATTERN.match(line)
        if match:
            if not in_quote:
                in_quote = True
                quote_start = i
            quote_lines.append(match.group(1))
        else:
            if in_quote:
                self._save_blockquote(elements, quote_start, i - 1, quote_lines)
                in_quote = False
                quote_lines = []
    # Flush a quote that runs to the end of the document.
    if in_quote:
        self._save_blockquote(elements, quote_start, len(lines) - 1, quote_lines)
def _save_blockquote(
    self,
    elements: list[MarkdownElement],
    start: int,
    end: int,
    lines: list[str],
) -> None:
    """Append a BLOCKQUOTE element covering lines start..end; no-op when empty."""
    if lines:
        elements.append(MarkdownElement(
            type=MarkdownElementType.BLOCKQUOTE,
            content='\n'.join(lines),
            line_start=start,
            line_end=end,
        ))
def _extract_horizontal_rules(
    self,
    lines: list[str],
    elements: list[MarkdownElement],
    protected_ranges: list[tuple[int, int]],
) -> None:
    """Append a HORIZONTAL_RULE element for every ---/***/___ line outside protected ranges."""
    hits = (
        (idx, raw)
        for idx, raw in enumerate(lines)
        if not self._is_in_protected_range(idx, protected_ranges)
        and self.HR_PATTERN.match(raw)
    )
    for idx, raw in hits:
        elements.append(MarkdownElement(
            type=MarkdownElementType.HORIZONTAL_RULE,
            content=raw,
            line_start=idx,
            line_end=idx,
        ))
def _fill_paragraphs(
    self,
    lines: list[str],
    elements: list[MarkdownElement],
    protected_ranges: list[tuple[int, int]],
) -> None:
    """Turn every line not yet claimed by any element into PARAGRAPH elements.

    Builds an "occupied" set from the protected ranges and all elements
    extracted so far, then groups the remaining contiguous non-blank
    lines into paragraphs.
    """
    occupied = set()
    for start, end in protected_ranges:
        for i in range(start, end + 1):
            occupied.add(i)
    for elem in elements:
        for i in range(elem.line_start, elem.line_end + 1):
            occupied.add(i)
    i = 0
    while i < len(lines):
        if i in occupied:
            i += 1
            continue
        if lines[i].strip() == '':
            i += 1
            continue
        para_start = i
        para_lines = []
        # Consume until a blank line or an already-claimed line.
        while i < len(lines) and i not in occupied and lines[i].strip() != '':
            para_lines.append(lines[i])
            occupied.add(i)
            i += 1
        if para_lines:
            elements.append(MarkdownElement(
                type=MarkdownElementType.PARAGRAPH,
                content='\n'.join(para_lines),
                line_start=para_start,
                line_end=i - 1,
            ))
class MarkdownChunker:
    """
    Intelligent chunker for Markdown documents.
    Features:
    - Structure-aware splitting (headers, code blocks, tables, lists)
    - Context preservation (header hierarchy)
    - Configurable chunk size and overlap
    - Metadata extraction
    """

    def __init__(
        self,
        max_chunk_size: int = 1000,
        min_chunk_size: int = 100,
        chunk_overlap: int = 50,
        preserve_code_blocks: bool = True,
        preserve_tables: bool = True,
        preserve_lists: bool = True,
        include_header_context: bool = True,
    ):
        """
        Args:
            max_chunk_size: Maximum chunk length in characters; larger
                elements are split by the type-specific splitters.
            min_chunk_size: NOTE(review): stored but never read by the
                chunking logic in this file — confirm intended use.
            chunk_overlap: NOTE(review): stored but never read here —
                confirm intended use.
            preserve_code_blocks: NOTE(review): stored but never read here.
            preserve_tables: NOTE(review): stored but never read here.
            preserve_lists: NOTE(review): stored but never read here.
            include_header_context: When True, each chunk carries the
                active header hierarchy in header_context.
        """
        self._max_chunk_size = max_chunk_size
        self._min_chunk_size = min_chunk_size
        self._chunk_overlap = chunk_overlap
        self._preserve_code_blocks = preserve_code_blocks
        self._preserve_tables = preserve_tables
        self._preserve_lists = preserve_lists
        self._include_header_context = include_header_context
        # Structure parser shared across chunk() calls.
        self._parser = MarkdownParser()
def chunk(self, text: str, doc_id: str = "") -> list[MarkdownChunk]:
    """
    Chunk Markdown text into structured segments.

    Headers are not emitted as chunks themselves; they maintain a running
    header stack that is attached to subsequent chunks as context.
    Horizontal rules are dropped. Elements longer than max_chunk_size are
    delegated to the type-specific splitters.

    Args:
        text: Raw Markdown text
        doc_id: Optional document ID used to prefix chunk IDs

    Returns:
        List of MarkdownChunk objects
    """
    elements = self._parser.parse(text)
    chunks: list[MarkdownChunk] = []
    header_stack: list[str] = []
    chunk_index = 0
    for elem in elements:
        if elem.type == MarkdownElementType.HEADER:
            # Pop headers at the same or deeper level, then push this one:
            # the stack always holds the active ancestor chain. (The
            # original guarded the pop with a redundant `if header_stack`;
            # header levels are >= 1, so the loop condition already
            # guarantees a non-empty stack.)
            while len(header_stack) >= elem.level:
                header_stack.pop()
            header_stack.append(elem.content)
            continue
        if elem.type == MarkdownElementType.HORIZONTAL_RULE:
            continue  # rules carry no indexable content
        chunk_content = self._format_element_content(elem)
        if not chunk_content:
            continue
        chunk_id = f"{doc_id}_chunk_{chunk_index}" if doc_id else f"chunk_{chunk_index}"
        header_context = header_stack.copy() if self._include_header_context else []
        if len(chunk_content) > self._max_chunk_size:
            sub_chunks = self._split_large_element(
                elem,
                chunk_id,
                header_context,
                chunk_index,
            )
            chunks.extend(sub_chunks)
            chunk_index += len(sub_chunks)
        else:
            chunks.append(MarkdownChunk(
                chunk_id=chunk_id,
                content=chunk_content,
                element_type=elem.type,
                header_context=header_context,
                level=elem.level,
                language=elem.language,
                metadata=elem.metadata,
            ))
            chunk_index += 1
    return chunks
def _format_element_content(self, elem: MarkdownElement) -> str:
    """Render an element back to Markdown text for chunking."""
    kind = elem.type
    if kind == MarkdownElementType.CODE_BLOCK:
        # Re-wrap in a fence so the chunk stays valid Markdown.
        return f"```{elem.language or ''}\n{elem.content}\n```"
    if kind == MarkdownElementType.BLOCKQUOTE:
        # Restore the "> " prefix stripped during parsing.
        return '\n'.join(f"> {line}" for line in elem.content.split('\n'))
    # TABLE / LIST / PARAGRAPH and everything else pass through unchanged.
    return elem.content
def _split_large_element(
    self,
    elem: MarkdownElement,
    base_id: str,
    header_context: list[str],
    start_index: int,
) -> list[MarkdownChunk]:
    """Dispatch an oversized element to the type-specific splitter."""
    dispatch = {
        MarkdownElementType.CODE_BLOCK: self._split_code_block,
        MarkdownElementType.TABLE: self._split_table,
        MarkdownElementType.LIST: self._split_list,
    }
    # Everything else (paragraphs included) falls back to text splitting.
    splitter = dispatch.get(elem.type, self._split_text)
    return splitter(elem, base_id, header_context, start_index)
def _split_code_block(
    self,
    elem: MarkdownElement,
    base_id: str,
    header_context: list[str],
    start_index: int,
) -> list[MarkdownChunk]:
    """Split an oversized code block by lines, re-fencing every part.

    Each emitted chunk is wrapped in ```<language> ... ``` so it remains
    valid Markdown on its own; parts are flagged is_partial in metadata.
    """
    chunks = []
    lines = elem.content.split('\n')
    current_lines = []
    current_size = 0
    sub_index = 0
    for line in lines:
        # +1 accounts for the newline re-added when joining.
        if current_size + len(line) + 1 > self._max_chunk_size and current_lines:
            chunk_content = f"```{elem.language}\n" + '\n'.join(current_lines) + "\n```"
            chunks.append(MarkdownChunk(
                chunk_id=f"{base_id}_{sub_index}",
                content=chunk_content,
                element_type=MarkdownElementType.CODE_BLOCK,
                header_context=header_context,
                language=elem.language,
                metadata={**elem.metadata, "is_partial": True, "part": sub_index + 1},
            ))
            sub_index += 1
            current_lines = []
            current_size = 0
        current_lines.append(line)
        current_size += len(line) + 1
    # Flush the final (or only) part; it is partial only if others preceded it.
    if current_lines:
        chunk_content = f"```{elem.language}\n" + '\n'.join(current_lines) + "\n```"
        chunks.append(MarkdownChunk(
            chunk_id=f"{base_id}_{sub_index}",
            content=chunk_content,
            element_type=MarkdownElementType.CODE_BLOCK,
            header_context=header_context,
            language=elem.language,
            metadata={**elem.metadata, "is_partial": sub_index > 0, "part": sub_index + 1},
        ))
    return chunks
def _split_table(
    self,
    elem: MarkdownElement,
    base_id: str,
    header_context: list[str],
    start_index: int,
) -> list[MarkdownChunk]:
    """Split an oversized table by data rows, repeating the header and
    separator rows at the top of every part so each chunk is a valid table."""
    chunks = []
    lines = elem.content.split('\n')
    if len(lines) < 2:
        # Degenerate "table" without a separator row: emit as-is.
        return [MarkdownChunk(
            chunk_id=f"{base_id}_0",
            content=elem.content,
            element_type=MarkdownElementType.TABLE,
            header_context=header_context,
            metadata=elem.metadata,
        )]
    header_line = lines[0]
    separator_line = lines[1]
    data_lines = lines[2:]
    current_lines = [header_line, separator_line]
    current_size = len(header_line) + len(separator_line) + 2  # +2 newlines
    sub_index = 0
    for line in data_lines:
        # Flush when the next row would overflow, but only if the current
        # part already holds at least one data row (len > 2).
        if current_size + len(line) + 1 > self._max_chunk_size and len(current_lines) > 2:
            chunks.append(MarkdownChunk(
                chunk_id=f"{base_id}_{sub_index}",
                content='\n'.join(current_lines),
                element_type=MarkdownElementType.TABLE,
                header_context=header_context,
                metadata={**elem.metadata, "is_partial": True, "part": sub_index + 1},
            ))
            sub_index += 1
            current_lines = [header_line, separator_line]
            current_size = len(header_line) + len(separator_line) + 2
        current_lines.append(line)
        current_size += len(line) + 1
    # Flush the final part if it contains any data rows.
    if len(current_lines) > 2:
        chunks.append(MarkdownChunk(
            chunk_id=f"{base_id}_{sub_index}",
            content='\n'.join(current_lines),
            element_type=MarkdownElementType.TABLE,
            header_context=header_context,
            metadata={**elem.metadata, "is_partial": sub_index > 0, "part": sub_index + 1},
        ))
    return chunks
def _split_list(
    self,
    elem: MarkdownElement,
    base_id: str,
    header_context: list[str],
    start_index: int,
) -> list[MarkdownChunk]:
    """Split an oversized list into smaller chunks at item boundaries."""
    chunks = []
    items = elem.content.split('\n')  # one list item per line (see _save_list)
    current_items = []
    current_size = 0
    sub_index = 0
    for item in items:
        # +1 accounts for the newline re-added when joining.
        if current_size + len(item) + 1 > self._max_chunk_size and current_items:
            chunks.append(MarkdownChunk(
                chunk_id=f"{base_id}_{sub_index}",
                content='\n'.join(current_items),
                element_type=MarkdownElementType.LIST,
                header_context=header_context,
                metadata={**elem.metadata, "is_partial": True, "part": sub_index + 1},
            ))
            sub_index += 1
            current_items = []
            current_size = 0
        current_items.append(item)
        current_size += len(item) + 1
    # Flush the remaining items; partial only if other parts preceded it.
    if current_items:
        chunks.append(MarkdownChunk(
            chunk_id=f"{base_id}_{sub_index}",
            content='\n'.join(current_items),
            element_type=MarkdownElementType.LIST,
            header_context=header_context,
            metadata={**elem.metadata, "is_partial": sub_index > 0, "part": sub_index + 1},
        ))
    return chunks
def _split_text(
    self,
    elem: MarkdownElement,
    base_id: str,
    header_context: list[str],
    start_index: int,
) -> list[MarkdownChunk]:
    """Split text content on blank-line paragraph boundaries.

    NOTE(review): a single paragraph longer than max_chunk_size is not
    split further, so chunks can exceed the limit in that case — confirm
    this is acceptable.
    """
    chunks = []
    text = elem.content
    sub_index = 0
    paragraphs = text.split('\n\n')
    current_content = ""
    current_size = 0
    for para in paragraphs:
        # +2 accounts for the "\n\n" separator re-added below.
        if current_size + len(para) + 2 > self._max_chunk_size and current_content:
            chunks.append(MarkdownChunk(
                chunk_id=f"{base_id}_{sub_index}",
                content=current_content.strip(),
                element_type=elem.type,
                header_context=header_context,
                metadata={**elem.metadata, "is_partial": True, "part": sub_index + 1},
            ))
            sub_index += 1
            current_content = ""
            current_size = 0
        current_content += para + "\n\n"
        current_size += len(para) + 2
    # Flush the remainder; trailing separator is stripped.
    if current_content.strip():
        chunks.append(MarkdownChunk(
            chunk_id=f"{base_id}_{sub_index}",
            content=current_content.strip(),
            element_type=elem.type,
            header_context=header_context,
            metadata={**elem.metadata, "is_partial": sub_index > 0, "part": sub_index + 1},
        ))
    return chunks
def chunk_markdown(
    text: str,
    doc_id: str = "",
    max_chunk_size: int = 1000,
    min_chunk_size: int = 100,
    preserve_code_blocks: bool = True,
    preserve_tables: bool = True,
    preserve_lists: bool = True,
    include_header_context: bool = True,
) -> list[dict[str, Any]]:
    """
    Convenience wrapper: build a MarkdownChunker, chunk *text*, and return
    the chunks as plain dictionaries.

    Args:
        text: Raw Markdown text
        doc_id: Optional document ID
        max_chunk_size: Maximum chunk size in characters
        min_chunk_size: Minimum chunk size in characters
        preserve_code_blocks: Whether to preserve code blocks
        preserve_tables: Whether to preserve tables
        preserve_lists: Whether to preserve lists
        include_header_context: Whether to include header context

    Returns:
        List of chunk dictionaries (MarkdownChunk.to_dict output)
    """
    splitter = MarkdownChunker(
        max_chunk_size=max_chunk_size,
        min_chunk_size=min_chunk_size,
        preserve_code_blocks=preserve_code_blocks,
        preserve_tables=preserve_tables,
        preserve_lists=preserve_lists,
        include_header_context=include_header_context,
    )
    return [piece.to_dict() for piece in splitter.chunk(text, doc_id)]

View File

@ -0,0 +1,178 @@
"""
Markdown parser with intelligent chunking.
[AC-AISVC-33] Markdown file parsing with structure-aware chunking.
"""
import logging
from pathlib import Path
from typing import Any
from app.services.document.base import (
DocumentParseException,
DocumentParser,
ParseResult,
)
from app.services.document.markdown_chunker import (
MarkdownChunker,
MarkdownElementType,
)
logger = logging.getLogger(__name__)
# Fallback encodings tried in order when decoding a Markdown file. Order
# matters: latin-1 must stay last because it accepts any byte sequence and
# therefore always "succeeds", masking the real encoding.
ENCODINGS_TO_TRY = ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]
class MarkdownParser(DocumentParser):
    """
    Parser for Markdown files with intelligent chunking.

    [AC-AISVC-33] Structure-aware parsing for Markdown documents.

    Features:
        - Header hierarchy extraction
        - Code block preservation
        - Table structure preservation
        - List grouping
        - Context-aware chunking
    """

    def __init__(
        self,
        encoding: str = "utf-8",
        max_chunk_size: int = 1000,
        min_chunk_size: int = 100,
        preserve_code_blocks: bool = True,
        preserve_tables: bool = True,
        preserve_lists: bool = True,
        include_header_context: bool = True,
        **kwargs: Any,
    ):
        """
        Args:
            encoding: Preferred file encoding; tried first, before the
                fallbacks in ENCODINGS_TO_TRY.
            max_chunk_size: Maximum chunk size in characters.
            min_chunk_size: Minimum chunk size in characters.
            preserve_code_blocks: Keep each code block as a single chunk.
            preserve_tables: Keep each table as a single chunk.
            preserve_lists: Keep each list as a single chunk.
            include_header_context: Attach the header hierarchy to chunks.
            **kwargs: Extra configuration; stored but not interpreted here.
        """
        self._encoding = encoding
        self._max_chunk_size = max_chunk_size
        self._min_chunk_size = min_chunk_size
        self._preserve_code_blocks = preserve_code_blocks
        self._preserve_tables = preserve_tables
        self._preserve_lists = preserve_lists
        self._include_header_context = include_header_context
        self._extra_config = kwargs
        self._chunker = MarkdownChunker(
            max_chunk_size=max_chunk_size,
            min_chunk_size=min_chunk_size,
            preserve_code_blocks=preserve_code_blocks,
            preserve_tables=preserve_tables,
            preserve_lists=preserve_lists,
            include_header_context=include_header_context,
        )

    def _try_encodings(self, path: Path) -> tuple[str, str]:
        """
        Read the file, trying the configured encoding first, then fallbacks.

        Returns:
            Tuple of (decoded text, encoding that succeeded).

        Raises:
            DocumentParseException: If no candidate encoding can decode the file.
        """
        # Bug fix: the `encoding` constructor argument was stored in
        # self._encoding but never consulted, so an explicit caller choice
        # (e.g. encoding="gbk") was silently ignored. Honor it first, then
        # fall back to the generic list (de-duplicated).
        candidates = [self._encoding]
        candidates += [enc for enc in ENCODINGS_TO_TRY if enc != self._encoding]
        for enc in candidates:
            try:
                with open(path, encoding=enc) as f:
                    text = f.read()
                logger.info(f"Successfully parsed Markdown with encoding: {enc}")
                return text, enc
            except (UnicodeDecodeError, LookupError):
                # LookupError also covers an unknown user-supplied codec name.
                continue
        raise DocumentParseException(
            "Failed to decode Markdown file with any known encoding",
            file_path=str(path),
            parser="markdown"
        )

    def parse(self, file_path: str | Path) -> ParseResult:
        """
        Parse a Markdown file and extract structured content.

        [AC-AISVC-33] Structure-aware parsing.

        Args:
            file_path: Path to the Markdown file.

        Returns:
            ParseResult with the raw text plus structure metadata (element
            counts and serialized chunks).

        Raises:
            DocumentParseException: If the file is missing, cannot be
                decoded, or chunking fails.
        """
        path = Path(file_path)
        if not path.exists():
            raise DocumentParseException(
                f"File not found: {path}",
                file_path=str(path),
                parser="markdown"
            )
        try:
            text, encoding_used = self._try_encodings(path)
            file_size = path.stat().st_size
            line_count = text.count("\n") + 1
            chunks = self._chunker.chunk(text, doc_id=path.stem)
            # Per-element-type counts, used for both logging and metadata.
            header_count = sum(
                1 for c in chunks
                if c.element_type == MarkdownElementType.HEADER
            )
            code_block_count = sum(
                1 for c in chunks
                if c.element_type == MarkdownElementType.CODE_BLOCK
            )
            table_count = sum(
                1 for c in chunks
                if c.element_type == MarkdownElementType.TABLE
            )
            list_count = sum(
                1 for c in chunks
                if c.element_type == MarkdownElementType.LIST
            )
            logger.info(
                f"Parsed Markdown: {path.name}, lines={line_count}, "
                f"chars={len(text)}, chunks={len(chunks)}, "
                f"headers={header_count}, code_blocks={code_block_count}, "
                f"tables={table_count}, lists={list_count}"
            )
            return ParseResult(
                text=text,
                source_path=str(path),
                file_size=file_size,
                metadata={
                    "format": "markdown",
                    "line_count": line_count,
                    "encoding": encoding_used,
                    "chunk_count": len(chunks),
                    "structure": {
                        "headers": header_count,
                        "code_blocks": code_block_count,
                        "tables": table_count,
                        "lists": list_count,
                    },
                    "chunks": [chunk.to_dict() for chunk in chunks],
                }
            )
        except DocumentParseException:
            # Already wrapped with file context; propagate untouched.
            raise
        except Exception as e:
            # Chain the original exception so the root cause stays visible
            # in tracebacks.
            raise DocumentParseException(
                f"Failed to parse Markdown file: {e}",
                file_path=str(path),
                parser="markdown",
                details={"error": str(e)}
            ) from e

    def get_supported_extensions(self) -> list[str]:
        """Get supported file extensions."""
        return [".md", ".markdown"]

    def get_chunks(self, text: str, doc_id: str = "") -> list[dict[str, Any]]:
        """
        Get structured chunks from Markdown text.

        Args:
            text: Markdown text content
            doc_id: Optional document ID

        Returns:
            List of chunk dictionaries
        """
        chunks = self._chunker.chunk(text, doc_id)
        return [chunk.to_dict() for chunk in chunks]