[AC-DOC-PARSER] feat(document): 新增图片和 Markdown 解析器
- 新增 ImageParser 支持图片文件解析 - 新增 MarkdownParser 支持 Markdown 文件解析 - 新增 MarkdownChunker 实现 Markdown 智能分块 - 支持按标题、段落、代码块等元素类型分块 - 更新 document 模块导出和工厂方法
This commit is contained in:
parent
b3680bda8a
commit
4de2a2aece
|
|
@ -16,6 +16,16 @@ from app.services.document.factory import (
|
|||
get_supported_document_formats,
|
||||
parse_document,
|
||||
)
|
||||
from app.services.document.image_parser import ImageParser
|
||||
from app.services.document.markdown_chunker import (
|
||||
MarkdownChunk,
|
||||
MarkdownChunker,
|
||||
MarkdownElement,
|
||||
MarkdownElementType,
|
||||
MarkdownParser as MarkdownStructureParser,
|
||||
chunk_markdown,
|
||||
)
|
||||
from app.services.document.markdown_parser import MarkdownParser
|
||||
from app.services.document.pdf_parser import PDFParser, PDFPlumberParser
|
||||
from app.services.document.text_parser import TextParser
|
||||
from app.services.document.word_parser import WordParser
|
||||
|
|
@ -35,4 +45,12 @@ __all__ = [
|
|||
"ExcelParser",
|
||||
"CSVParser",
|
||||
"TextParser",
|
||||
"MarkdownParser",
|
||||
"MarkdownChunker",
|
||||
"MarkdownChunk",
|
||||
"MarkdownElement",
|
||||
"MarkdownElementType",
|
||||
"MarkdownStructureParser",
|
||||
"chunk_markdown",
|
||||
"ImageParser",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -16,6 +16,8 @@ from app.services.document.base import (
|
|||
UnsupportedFormatError,
|
||||
)
|
||||
from app.services.document.excel_parser import CSVParser, ExcelParser
|
||||
from app.services.document.image_parser import ImageParser
|
||||
from app.services.document.markdown_parser import MarkdownParser
|
||||
from app.services.document.pdf_parser import PDFParser, PDFPlumberParser
|
||||
from app.services.document.text_parser import TextParser
|
||||
from app.services.document.word_parser import WordParser
|
||||
|
|
@ -45,6 +47,8 @@ class DocumentParserFactory:
|
|||
"excel": ExcelParser,
|
||||
"csv": CSVParser,
|
||||
"text": TextParser,
|
||||
"markdown": MarkdownParser,
|
||||
"image": ImageParser,
|
||||
}
|
||||
|
||||
cls._extension_map = {
|
||||
|
|
@ -54,14 +58,22 @@ class DocumentParserFactory:
|
|||
".xls": "excel",
|
||||
".csv": "csv",
|
||||
".txt": "text",
|
||||
".md": "text",
|
||||
".markdown": "text",
|
||||
".md": "markdown",
|
||||
".markdown": "markdown",
|
||||
".rst": "text",
|
||||
".log": "text",
|
||||
".json": "text",
|
||||
".xml": "text",
|
||||
".yaml": "text",
|
||||
".yml": "text",
|
||||
".jpg": "image",
|
||||
".jpeg": "image",
|
||||
".png": "image",
|
||||
".gif": "image",
|
||||
".webp": "image",
|
||||
".bmp": "image",
|
||||
".tiff": "image",
|
||||
".tif": "image",
|
||||
}
|
||||
|
||||
@classmethod
|
||||
|
|
@ -174,6 +186,8 @@ class DocumentParserFactory:
|
|||
"excel": "Excel 电子表格",
|
||||
"csv": "CSV 文件",
|
||||
"text": "文本文件",
|
||||
"markdown": "Markdown 文档",
|
||||
"image": "图片文件",
|
||||
}
|
||||
|
||||
descriptions = {
|
||||
|
|
@ -183,6 +197,8 @@ class DocumentParserFactory:
|
|||
"excel": "解析 Excel 电子表格,支持多工作表",
|
||||
"csv": "解析 CSV 文件,自动检测编码",
|
||||
"text": "解析纯文本文件,支持多种编码",
|
||||
"markdown": "智能解析 Markdown 文档,保留结构(标题、代码块、表格、列表)",
|
||||
"image": "使用多模态 LLM 解析图片,提取文字和关键信息",
|
||||
}
|
||||
|
||||
info.append({
|
||||
|
|
|
|||
|
|
@ -0,0 +1,490 @@
|
|||
"""
|
||||
Image parser using multimodal LLM.
|
||||
Supports parsing images into structured text content for knowledge base indexing.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from app.services.document.base import (
|
||||
DocumentParseException,
|
||||
DocumentParser,
|
||||
PageText,
|
||||
ParseResult,
|
||||
)
|
||||
from app.services.llm.factory import LLMUsageType, get_llm_config_manager
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
IMAGE_SYSTEM_PROMPT = """你是一个专业的图像内容分析助手。你的任务是分析图片内容,并将其智能拆分为适合知识库检索的独立数据块。
|
||||
|
||||
## 分析要求
|
||||
1. 仔细分析图片内容,识别其中的文字、图表、数据等信息
|
||||
2. 根据内容的逻辑结构,智能判断如何拆分为独立的知识条目
|
||||
3. 每个条目应该是独立、完整、可检索的知识单元
|
||||
|
||||
## 输出格式
|
||||
请严格按照以下 JSON 格式输出,不要添加任何其他内容:
|
||||
|
||||
```json
|
||||
{
|
||||
"image_summary": "图片整体概述(一句话描述图片主题)",
|
||||
"total_chunks": <分块总数>,
|
||||
"chunks": [
|
||||
{
|
||||
"chunk_index": 0,
|
||||
"content": "该分块的完整内容文字",
|
||||
"chunk_type": "text|table|list|diagram|chart|mixed",
|
||||
"keywords": ["关键词1", "关键词2"]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## 分块策略
|
||||
- **单一内容**: 如果图片只有一段完整的文字/信息,可以只输出1个分块
|
||||
- **多段落内容**: 按段落或逻辑单元拆分,每个段落作为独立分块
|
||||
- **表格数据**: 将表格内容转换为结构化文字,作为一个分块
|
||||
- **图表数据**: 描述图表内容和数据,作为一个分块
|
||||
- **列表内容**: 每个列表项可作为独立分块,或合并为相关的一组
|
||||
- **混合内容**: 根据内容类型分别处理,确保每个分块主题明确
|
||||
|
||||
## 注意事项
|
||||
1. 每个分块的 content 必须是完整、可独立理解的文字
|
||||
2. chunk_type 用于标识内容类型,便于后续处理
|
||||
3. keywords 提取该分块的核心关键词,便于检索
|
||||
4. 确保输出的 JSON 格式正确,可以被解析"""
|
||||
|
||||
IMAGE_USER_PROMPT = "请分析这张图片,按照要求的 JSON 格式输出分块结果。"
|
||||
|
||||
|
||||
@dataclass
class ImageChunk:
    """A single intelligently-split chunk extracted from an image by the LLM."""
    chunk_index: int  # 0-based position of this chunk within the parse result
    content: str  # full, independently readable text content of the chunk
    chunk_type: str = "text"  # content kind reported by the LLM, e.g. text/table/list/diagram/chart/mixed
    keywords: list[str] = field(default_factory=list)  # retrieval keywords extracted by the LLM
|
||||
|
||||
|
||||
@dataclass
class ImageParseResult:
    """Structured result of parsing one image, including intelligent chunks."""
    image_summary: str  # one-sentence overall summary of the image, from the LLM
    chunks: list[ImageChunk]  # parsed chunks; never empty (falls back to the raw LLM text)
    raw_text: str  # all chunk contents joined with blank lines
    source_path: str  # set by the caller after analysis; "" straight out of the LLM step
    file_size: int  # bytes; set by the caller after analysis, 0 out of the LLM step
    metadata: dict[str, Any] = field(default_factory=dict)  # format / parser / mime_type info
|
||||
|
||||
|
||||
class ImageParser(DocumentParser):
    """
    Image parser using a multimodal LLM.

    Supports common image formats and extracts text content using
    vision-capable LLM models (GPT-4V, GPT-4o, etc.).

    Features:
    - Intelligent chunking based on content structure
    - Structured output with keywords and chunk types
    - Support for various content types (text, table, chart, etc.)
    """

    # Accepted extensions; kept in sync with _get_mime_type below.
    SUPPORTED_EXTENSIONS = [
        ".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif"
    ]

    def __init__(
        self,
        model: str | None = None,
        max_tokens: int = 4096,
        timeout_seconds: int = 120,
    ):
        """
        Args:
            model: Explicit LLM model name; when None the model from the
                KB-processing config is used (defaulting to "gpt-4o-mini").
            max_tokens: Upper bound for the LLM response length.
            timeout_seconds: Timeout passed to the LLM client per request.
        """
        self._model = model
        self._max_tokens = max_tokens
        self._timeout_seconds = timeout_seconds

    def _validate_file(self, file_path: str | Path) -> tuple[Path, int, str]:
        """
        Check that the file exists and has a supported extension.

        Shared by all three entry points so validation cannot drift.

        Args:
            file_path: Path to the image file.

        Returns:
            Tuple of (Path, file size in bytes, lowercase extension).

        Raises:
            DocumentParseException: If the file is missing or its extension
                is not in SUPPORTED_EXTENSIONS.
        """
        path = Path(file_path)
        if not path.exists():
            raise DocumentParseException(
                f"Image file not found: {file_path}",
                file_path=str(path),
                parser="image",
            )

        file_size = path.stat().st_size
        extension = path.suffix.lower()

        if extension not in self.SUPPORTED_EXTENSIONS:
            raise DocumentParseException(
                f"Unsupported image format: {extension}",
                file_path=str(path),
                parser="image",
                details={"supported_formats": self.SUPPORTED_EXTENSIONS},
            )

        return path, file_size, extension

    def _encode_image(self, path: Path, extension: str) -> tuple[str, str]:
        """Read the image file and return (base64 payload, MIME type)."""
        with open(path, "rb") as f:
            image_data = f.read()
        image_base64 = base64.b64encode(image_data).decode("utf-8")
        return image_base64, self._get_mime_type(extension)

    def _build_parse_result(
        self,
        path: Path,
        file_size: int,
        extension: str,
        mime_type: str,
        result: ImageParseResult,
    ) -> ParseResult:
        """
        Convert an ImageParseResult into the generic ParseResult contract.

        Shared by parse() and parse_async() so the metadata layout stays
        identical between the sync and async entry points.
        """
        return ParseResult(
            text=result.raw_text,
            source_path=str(path),
            file_size=file_size,
            page_count=1,  # an image is always modeled as a single page
            metadata={
                "format": extension,
                "parser": "image",
                "mime_type": mime_type,
                "image_summary": result.image_summary,
                "chunk_count": len(result.chunks),
                "chunks": [
                    {
                        "chunk_index": c.chunk_index,
                        "content": c.content,
                        "chunk_type": c.chunk_type,
                        "keywords": c.keywords,
                    }
                    for c in result.chunks
                ],
            },
            pages=[PageText(page=1, text=result.raw_text)],
        )

    def parse(self, file_path: str | Path) -> ParseResult:
        """
        Parse an image file and extract text content using multimodal LLM.

        Note: This method is synchronous but internally uses async operations.
        For async contexts, use parse_async() instead.

        Args:
            file_path: Path to the image file.

        Returns:
            ParseResult with extracted text content.

        Raises:
            DocumentParseException: If parsing fails.
        """
        path, file_size, extension = self._validate_file(file_path)

        try:
            image_base64, mime_type = self._encode_image(path, extension)

            # Bridge sync -> async. The loop probe is kept narrow so a
            # RuntimeError raised by the analysis itself cannot be mistaken
            # for "no running loop" and trigger a second execution attempt.
            try:
                asyncio.get_running_loop()
            except RuntimeError:
                # No event loop in this thread: drive the coroutine directly.
                result = asyncio.run(self._analyze_image_async(image_base64, mime_type))
            else:
                # A loop is already running here, so asyncio.run() would raise;
                # run the coroutine on a fresh loop in a worker thread instead.
                import concurrent.futures
                with concurrent.futures.ThreadPoolExecutor() as executor:
                    future = executor.submit(
                        asyncio.run,
                        self._analyze_image_async(image_base64, mime_type)
                    )
                    result = future.result()

            logger.info(
                f"[IMAGE-PARSER] Successfully parsed image: {path.name}, "
                f"size={file_size}, chunks={len(result.chunks)}"
            )

            return self._build_parse_result(path, file_size, extension, mime_type, result)

        except Exception as e:
            logger.error(f"[IMAGE-PARSER] Failed to parse image {path}: {e}")
            raise DocumentParseException(
                f"Failed to parse image: {str(e)}",
                file_path=str(path),
                parser="image",
                details={"error": str(e)},
            ) from e

    async def parse_async(self, file_path: str | Path) -> ParseResult:
        """
        Async version of parse method for use in async contexts.

        Args:
            file_path: Path to the image file.

        Returns:
            ParseResult with extracted text content.

        Raises:
            DocumentParseException: If parsing fails.
        """
        path, file_size, extension = self._validate_file(file_path)

        try:
            image_base64, mime_type = self._encode_image(path, extension)

            result = await self._analyze_image_async(image_base64, mime_type)

            logger.info(
                f"[IMAGE-PARSER] Successfully parsed image (async): {path.name}, "
                f"size={file_size}, chunks={len(result.chunks)}"
            )

            return self._build_parse_result(path, file_size, extension, mime_type, result)

        except Exception as e:
            logger.error(f"[IMAGE-PARSER] Failed to parse image {path}: {e}")
            raise DocumentParseException(
                f"Failed to parse image: {str(e)}",
                file_path=str(path),
                parser="image",
                details={"error": str(e)},
            ) from e

    async def parse_with_chunks(self, file_path: str | Path) -> ImageParseResult:
        """
        Parse image and return structured result with intelligent chunks.

        Unlike parse()/parse_async(), errors from the LLM call propagate
        unwrapped here (there is deliberately no try/except).

        Args:
            file_path: Path to the image file.

        Returns:
            ImageParseResult with intelligent chunks.
        """
        path, file_size, extension = self._validate_file(file_path)

        image_base64, mime_type = self._encode_image(path, extension)

        result = await self._analyze_image_async(image_base64, mime_type)
        result.source_path = str(path)
        result.file_size = file_size
        result.metadata = {
            "format": extension,
            "parser": "image",
            "mime_type": mime_type,
        }

        return result

    async def _analyze_image_async(self, image_base64: str, mime_type: str) -> ImageParseResult:
        """
        Analyze image using multimodal LLM and return structured chunks.

        Args:
            image_base64: Base64 encoded image data.
            mime_type: MIME type of the image.

        Returns:
            ImageParseResult with intelligent chunks.

        Raises:
            DocumentParseException: If the LLM returns an empty response.
        """
        try:
            manager = get_llm_config_manager()
            client = manager.get_kb_processing_client()

            config = manager.kb_processing_config
            model = self._model or config.get("model", "gpt-4o-mini")

            # OpenAI-style vision message: system prompt, then a user turn
            # carrying the instruction text plus the image as a data URL.
            messages = [
                {
                    "role": "system",
                    "content": IMAGE_SYSTEM_PROMPT,
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": IMAGE_USER_PROMPT,
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{image_base64}",
                            },
                        },
                    ],
                },
            ]

            # Local import mirrors the original; presumably avoids an import
            # cycle with the llm package -- confirm before hoisting.
            from app.services.llm.base import LLMConfig

            llm_config = LLMConfig(
                model=model,
                max_tokens=self._max_tokens,
                temperature=0.3,  # low temperature for stable, parseable JSON
                timeout_seconds=self._timeout_seconds,
            )

            response = await client.generate(messages=messages, config=llm_config)

            if not response.content:
                raise DocumentParseException(
                    "LLM returned empty response for image analysis",
                    parser="image",
                )

            return self._parse_llm_response(response.content)

        except Exception as e:
            logger.error(f"[IMAGE-PARSER] LLM analysis failed: {e}")
            raise

    def _parse_llm_response(self, response_content: str) -> ImageParseResult:
        """
        Parse LLM response into structured ImageParseResult.

        Falls back to a single chunk containing the raw response when the
        response is not valid JSON or yields no non-empty chunks.

        Args:
            response_content: Raw LLM response content.

        Returns:
            ImageParseResult with parsed chunks.
        """
        try:
            json_str = self._extract_json(response_content)
            data = json.loads(json_str)

            image_summary = data.get("image_summary", "")
            chunks_data = data.get("chunks", [])

            chunks = []
            for chunk_data in chunks_data:
                chunk = ImageChunk(
                    chunk_index=chunk_data.get("chunk_index", len(chunks)),
                    content=chunk_data.get("content", ""),
                    chunk_type=chunk_data.get("chunk_type", "text"),
                    keywords=chunk_data.get("keywords", []),
                )
                # Drop chunks whose content is empty or whitespace-only.
                if chunk.content.strip():
                    chunks.append(chunk)

            if not chunks:
                # Valid JSON but nothing usable: keep the raw response text.
                chunks.append(ImageChunk(
                    chunk_index=0,
                    content=response_content,
                    chunk_type="text",
                    keywords=[],
                ))

            raw_text = "\n\n".join([c.content for c in chunks])

            return ImageParseResult(
                image_summary=image_summary,
                chunks=chunks,
                raw_text=raw_text,
                source_path="",  # filled in later by the caller
                file_size=0,
            )

        except json.JSONDecodeError as e:
            logger.warning(f"[IMAGE-PARSER] Failed to parse JSON response: {e}, using fallback")
            return ImageParseResult(
                image_summary="图片内容",
                chunks=[ImageChunk(
                    chunk_index=0,
                    content=response_content,
                    chunk_type="text",
                    keywords=[],
                )],
                raw_text=response_content,
                source_path="",
                file_size=0,
            )

    def _extract_json(self, content: str) -> str:
        """
        Extract JSON from LLM response content.

        Handles responses that wrap the JSON object in prose or code fences
        by slicing from the first '{' to the last '}'.

        Args:
            content: Raw response content that may contain JSON.

        Returns:
            Extracted JSON string (or the stripped content unchanged when no
            brace pair is found -- the caller handles the decode error).
        """
        content = content.strip()

        if content.startswith("{") and content.endswith("}"):
            return content

        json_start = content.find("{")
        json_end = content.rfind("}")

        if json_start != -1 and json_end != -1 and json_end > json_start:
            return content[json_start:json_end + 1]

        return content

    def _get_mime_type(self, extension: str) -> str:
        """Get MIME type for an image extension (defaults to image/jpeg)."""
        mime_types = {
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".png": "image/png",
            ".gif": "image/gif",
            ".webp": "image/webp",
            ".bmp": "image/bmp",
            ".tiff": "image/tiff",
            ".tif": "image/tiff",
        }
        return mime_types.get(extension.lower(), "image/jpeg")

    def get_supported_extensions(self) -> list[str]:
        """Get list of supported image extensions."""
        return ImageParser.SUPPORTED_EXTENSIONS
|
||||
|
|
@ -0,0 +1,771 @@
|
|||
"""
|
||||
Markdown intelligent chunker with structure-aware splitting.
|
||||
Supports headers, code blocks, tables, lists, and preserves context.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MarkdownElementType(Enum):
    """Types of Markdown elements recognized by the parser and chunker."""
    HEADER = "header"            # '#'-style heading line (levels 1-6)
    PARAGRAPH = "paragraph"      # run of plain text lines
    CODE_BLOCK = "code_block"    # fenced ``` block
    INLINE_CODE = "inline_code"  # `code` span
    TABLE = "table"              # pipe-delimited table with separator row
    LIST = "list"                # bulleted (-, *, +) or numbered items
    BLOCKQUOTE = "blockquote"    # '>'-prefixed quote lines
    HORIZONTAL_RULE = "horizontal_rule"  # ---, ***, ___ divider
    IMAGE = "image"              # ![alt](url)
    LINK = "link"                # [text](url)
    TEXT = "text"                # generic text fallback
|
||||
|
||||
|
||||
@dataclass
class MarkdownElement:
    """One structural element parsed out of a Markdown document.

    ``line_start``/``line_end`` are 0-based, inclusive indices into the
    source's line list.
    """
    type: MarkdownElementType
    content: str
    level: int = 0
    language: str = ""
    metadata: dict[str, Any] = field(default_factory=dict)
    line_start: int = 0
    line_end: int = 0

    def to_dict(self) -> dict[str, Any]:
        """Serialize this element to a plain, JSON-compatible dict."""
        return dict(
            type=self.type.value,
            content=self.content,
            level=self.level,
            language=self.language,
            metadata=self.metadata,
            line_start=self.line_start,
            line_end=self.line_end,
        )
|
||||
|
||||
|
||||
@dataclass
class MarkdownChunk:
    """One retrieval-ready chunk of Markdown content with header context.

    ``header_context`` is the stack of ancestor header titles active where
    the chunk was produced.
    """
    chunk_id: str
    content: str
    element_type: MarkdownElementType
    header_context: list[str]
    level: int = 0
    language: str = ""
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialize this chunk to a plain, JSON-compatible dict."""
        return dict(
            chunk_id=self.chunk_id,
            content=self.content,
            element_type=self.element_type.value,
            header_context=self.header_context,
            level=self.level,
            language=self.language,
            metadata=self.metadata,
        )
|
||||
|
||||
|
||||
class MarkdownParser:
    """
    Parser for Markdown documents.

    Extracts structured elements (headers, code blocks, tables, lists,
    blockquotes, horizontal rules, paragraphs) from raw Markdown text.
    Code blocks and tables are extracted first; their line ranges are
    "protected" so the remaining extractors do not re-interpret lines
    inside them.
    """

    HEADER_PATTERN = re.compile(r'^(#{1,6})\s+(.+?)(?:\s+#+)?$', re.MULTILINE)
    CODE_BLOCK_PATTERN = re.compile(r'^```(\w*)\n(.*?)^```', re.MULTILINE | re.DOTALL)
    TABLE_PATTERN = re.compile(r'^(\|.+\|)\n(\|[-:\s|]+\|)\n((?:\|.+\|\n?)+)', re.MULTILINE)
    # NOTE(review): the ordered-list alternative (\d+\.) only matches at
    # column 0 -- indented ordered items are not recognized; confirm intent.
    LIST_PATTERN = re.compile(r'^([ \t]*[-*+]|\d+\.)\s+(.+)$', re.MULTILINE)
    BLOCKQUOTE_PATTERN = re.compile(r'^>\s*(.+)$', re.MULTILINE)
    HR_PATTERN = re.compile(r'^[-*_]{3,}\s*$', re.MULTILINE)
    # The three patterns below are not used by parse() itself but remain part
    # of the class's public pattern set.
    IMAGE_PATTERN = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
    LINK_PATTERN = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
    INLINE_CODE_PATTERN = re.compile(r'`([^`]+)`')

    def parse(self, text: str) -> list[MarkdownElement]:
        """
        Parse Markdown text into structured elements.

        Args:
            text: Raw Markdown text

        Returns:
            List of MarkdownElement objects, sorted by starting line
        """
        elements: list[MarkdownElement] = []
        lines = text.split('\n')

        # Code blocks and tables are located first; their line ranges are
        # protected so later extractors skip lines inside them.
        code_block_ranges = self._extract_code_blocks(lines, elements)
        table_ranges = self._extract_tables(lines, elements)
        protected_ranges = code_block_ranges + table_ranges

        self._extract_headers(lines, elements, protected_ranges)
        self._extract_lists(lines, elements, protected_ranges)
        self._extract_blockquotes(lines, elements, protected_ranges)
        self._extract_horizontal_rules(lines, elements, protected_ranges)

        # Anything not claimed above becomes paragraph elements.
        self._fill_paragraphs(lines, elements, protected_ranges)

        elements.sort(key=lambda e: e.line_start)

        return elements

    def _extract_code_blocks(
        self,
        lines: list[str],
        elements: list[MarkdownElement],
    ) -> list[tuple[int, int]]:
        """
        Extract fenced code blocks with language info.

        Appends CODE_BLOCK elements and returns their inclusive (start, end)
        line ranges, fence lines included.  An unterminated fence at EOF is
        silently dropped.
        """
        ranges = []
        in_code_block = False
        code_start = 0
        language = ""
        code_content = []

        for i, line in enumerate(lines):
            if line.strip().startswith('```'):
                if not in_code_block:
                    # Opening fence: remember position and language tag.
                    in_code_block = True
                    code_start = i
                    language = line.strip()[3:].strip()
                    code_content = []
                else:
                    # Closing fence: emit the collected block.
                    in_code_block = False
                    elements.append(MarkdownElement(
                        type=MarkdownElementType.CODE_BLOCK,
                        content='\n'.join(code_content),
                        language=language,
                        line_start=code_start,
                        line_end=i,
                        metadata={"language": language},
                    ))
                    ranges.append((code_start, i))

            elif in_code_block:
                code_content.append(line)

        return ranges

    def _extract_tables(
        self,
        lines: list[str],
        elements: list[MarkdownElement],
    ) -> list[tuple[int, int]]:
        """
        Extract Markdown tables (header row + separator row + body rows).

        NOTE(review): this scan does not consult the code-block ranges, so
        table-shaped text inside a code fence may be extracted twice --
        confirm whether that is acceptable.
        """
        ranges = []
        i = 0

        while i < len(lines):
            line = lines[i]

            if '|' in line and i + 1 < len(lines):
                next_line = lines[i + 1]
                # A table starts when a pipe row is followed by a separator
                # row made only of '|', '-', ':' and whitespace.
                if '|' in next_line and re.match(r'^[\|\-\:\s]+$', next_line.strip()):
                    table_lines = [line, next_line]
                    j = i + 2

                    while j < len(lines) and '|' in lines[j]:
                        table_lines.append(lines[j])
                        j += 1

                    table_content = '\n'.join(table_lines)
                    headers = [h.strip() for h in line.split('|') if h.strip()]
                    row_count = len(table_lines) - 2  # body rows only

                    elements.append(MarkdownElement(
                        type=MarkdownElementType.TABLE,
                        content=table_content,
                        line_start=i,
                        line_end=j - 1,
                        metadata={
                            "headers": headers,
                            "row_count": row_count,
                        },
                    ))
                    ranges.append((i, j - 1))
                    i = j
                    continue

            i += 1

        return ranges

    def _is_in_protected_range(self, line_num: int, ranges: list[tuple[int, int]]) -> bool:
        """Check if a line is within a protected (code/table) range."""
        for start, end in ranges:
            if start <= line_num <= end:
                return True
        return False

    def _extract_headers(
        self,
        lines: list[str],
        elements: list[MarkdownElement],
        protected_ranges: list[tuple[int, int]],
    ) -> None:
        """Extract '#'-style headers with their level (1-6)."""
        for i, line in enumerate(lines):
            if self._is_in_protected_range(i, protected_ranges):
                continue

            match = self.HEADER_PATTERN.match(line)
            if match:
                level = len(match.group(1))  # number of leading '#'
                title = match.group(2).strip()

                elements.append(MarkdownElement(
                    type=MarkdownElementType.HEADER,
                    content=title,
                    level=level,
                    line_start=i,
                    line_end=i,
                    metadata={"level": level},
                ))

    def _extract_lists(
        self,
        lines: list[str],
        elements: list[MarkdownElement],
        protected_ranges: list[tuple[int, int]],
    ) -> None:
        """Extract contiguous runs of list items into LIST elements."""
        in_list = False
        list_start = 0
        list_items = []

        for i, line in enumerate(lines):
            if self._is_in_protected_range(i, protected_ranges):
                if in_list:
                    # A protected block interrupts the list; flush it.
                    self._save_list(elements, list_start, i - 1, list_items)
                    in_list = False
                    list_items = []
                continue

            match = self.LIST_PATTERN.match(line)
            if match:
                indent = len(line) - len(line.lstrip())
                item_content = match.group(2)

                if not in_list:
                    in_list = True
                    list_start = i
                    list_items = [(indent, item_content)]
                else:
                    list_items.append((indent, item_content))
            else:
                if in_list:
                    if line.strip() == '':
                        # Blank lines inside a list do not terminate it.
                        continue
                    else:
                        self._save_list(elements, list_start, i - 1, list_items)
                        in_list = False
                        list_items = []

        if in_list:
            # List ran to end of document.
            self._save_list(elements, list_start, len(lines) - 1, list_items)

    def _save_list(
        self,
        elements: list[MarkdownElement],
        start: int,
        end: int,
        items: list[tuple[int, str]],
    ) -> None:
        """Append a LIST element built from collected (indent, text) items."""
        if not items:
            return

        # Indent levels are collected but the rendered content is flattened.
        content = '\n'.join([item[1] for item in items])
        elements.append(MarkdownElement(
            type=MarkdownElementType.LIST,
            content=content,
            line_start=start,
            line_end=end,
            metadata={
                "item_count": len(items),
                # NOTE(review): always False, even for "1." items -- the
                # marker type is never tracked; confirm whether this matters.
                "is_ordered": False,
            },
        ))

    def _extract_blockquotes(
        self,
        lines: list[str],
        elements: list[MarkdownElement],
        protected_ranges: list[tuple[int, int]],
    ) -> None:
        """Extract contiguous '>' quote lines into BLOCKQUOTE elements."""
        in_quote = False
        quote_start = 0
        quote_lines = []

        for i, line in enumerate(lines):
            if self._is_in_protected_range(i, protected_ranges):
                if in_quote:
                    self._save_blockquote(elements, quote_start, i - 1, quote_lines)
                    in_quote = False
                    quote_lines = []
                continue

            match = self.BLOCKQUOTE_PATTERN.match(line)
            if match:
                if not in_quote:
                    in_quote = True
                    quote_start = i
                # The '>' prefix is stripped by the capture group.
                quote_lines.append(match.group(1))
            else:
                if in_quote:
                    self._save_blockquote(elements, quote_start, i - 1, quote_lines)
                    in_quote = False
                    quote_lines = []

        if in_quote:
            self._save_blockquote(elements, quote_start, len(lines) - 1, quote_lines)

    def _save_blockquote(
        self,
        elements: list[MarkdownElement],
        start: int,
        end: int,
        lines: list[str],
    ) -> None:
        """Append a BLOCKQUOTE element from collected quote lines."""
        if not lines:
            return

        elements.append(MarkdownElement(
            type=MarkdownElementType.BLOCKQUOTE,
            content='\n'.join(lines),
            line_start=start,
            line_end=end,
        ))

    def _extract_horizontal_rules(
        self,
        lines: list[str],
        elements: list[MarkdownElement],
        protected_ranges: list[tuple[int, int]],
    ) -> None:
        """Extract ---/***/___ horizontal rules."""
        for i, line in enumerate(lines):
            if self._is_in_protected_range(i, protected_ranges):
                continue

            if self.HR_PATTERN.match(line):
                elements.append(MarkdownElement(
                    type=MarkdownElementType.HORIZONTAL_RULE,
                    content=line,
                    line_start=i,
                    line_end=i,
                ))

    def _fill_paragraphs(
        self,
        lines: list[str],
        elements: list[MarkdownElement],
        protected_ranges: list[tuple[int, int]],
    ) -> None:
        """Group every line not claimed by another element into paragraphs."""
        # Mark all lines already covered by protected ranges or elements.
        occupied = set()
        for start, end in protected_ranges:
            for i in range(start, end + 1):
                occupied.add(i)

        for elem in elements:
            for i in range(elem.line_start, elem.line_end + 1):
                occupied.add(i)

        i = 0
        while i < len(lines):
            if i in occupied:
                i += 1
                continue

            if lines[i].strip() == '':
                i += 1
                continue

            para_start = i
            para_lines = []

            # Consume consecutive unclaimed, non-blank lines as one paragraph.
            while i < len(lines) and i not in occupied and lines[i].strip() != '':
                para_lines.append(lines[i])
                occupied.add(i)
                i += 1

            if para_lines:
                elements.append(MarkdownElement(
                    type=MarkdownElementType.PARAGRAPH,
                    content='\n'.join(para_lines),
                    line_start=para_start,
                    line_end=i - 1,
                ))
|
||||
|
||||
|
||||
class MarkdownChunker:
|
||||
"""
|
||||
Intelligent chunker for Markdown documents.
|
||||
|
||||
Features:
|
||||
- Structure-aware splitting (headers, code blocks, tables, lists)
|
||||
- Context preservation (header hierarchy)
|
||||
- Configurable chunk size and overlap
|
||||
- Metadata extraction
|
||||
"""
|
||||
|
||||
    def __init__(
        self,
        max_chunk_size: int = 1000,
        min_chunk_size: int = 100,
        chunk_overlap: int = 50,
        preserve_code_blocks: bool = True,
        preserve_tables: bool = True,
        preserve_lists: bool = True,
        include_header_context: bool = True,
    ):
        """
        Args:
            max_chunk_size: Elements whose formatted content exceeds this
                length are routed to the type-specific splitters.
            min_chunk_size: Lower size bound (stored; not used in the code
                visible here -- presumably consumed by the splitters).
            chunk_overlap: Overlap budget (stored; presumably consumed by the
                splitters -- confirm).
            preserve_code_blocks: Stored flag; usage not visible here.
            preserve_tables: Stored flag; usage not visible here.
            preserve_lists: Stored flag; usage not visible here.
            include_header_context: When True, each chunk carries a copy of
                the active header stack as context.
        """
        self._max_chunk_size = max_chunk_size
        self._min_chunk_size = min_chunk_size
        self._chunk_overlap = chunk_overlap
        self._preserve_code_blocks = preserve_code_blocks
        self._preserve_tables = preserve_tables
        self._preserve_lists = preserve_lists
        self._include_header_context = include_header_context
        self._parser = MarkdownParser()  # structure-aware element parser
|
||||
|
||||
def chunk(self, text: str, doc_id: str = "") -> list[MarkdownChunk]:
|
||||
"""
|
||||
Chunk Markdown text into structured segments.
|
||||
|
||||
Args:
|
||||
text: Raw Markdown text
|
||||
doc_id: Optional document ID for chunk IDs
|
||||
|
||||
Returns:
|
||||
List of MarkdownChunk objects
|
||||
"""
|
||||
elements = self._parser.parse(text)
|
||||
chunks = []
|
||||
header_stack: list[str] = []
|
||||
chunk_index = 0
|
||||
|
||||
for elem in elements:
|
||||
if elem.type == MarkdownElementType.HEADER:
|
||||
level = elem.level
|
||||
while len(header_stack) >= level:
|
||||
if header_stack:
|
||||
header_stack.pop()
|
||||
header_stack.append(elem.content)
|
||||
continue
|
||||
|
||||
if elem.type == MarkdownElementType.HORIZONTAL_RULE:
|
||||
continue
|
||||
|
||||
chunk_content = self._format_element_content(elem)
|
||||
if not chunk_content:
|
||||
continue
|
||||
|
||||
chunk_id = f"{doc_id}_chunk_{chunk_index}" if doc_id else f"chunk_{chunk_index}"
|
||||
|
||||
header_context = []
|
||||
if self._include_header_context:
|
||||
header_context = header_stack.copy()
|
||||
|
||||
if len(chunk_content) > self._max_chunk_size:
|
||||
sub_chunks = self._split_large_element(
|
||||
elem,
|
||||
chunk_id,
|
||||
header_context,
|
||||
chunk_index,
|
||||
)
|
||||
chunks.extend(sub_chunks)
|
||||
chunk_index += len(sub_chunks)
|
||||
else:
|
||||
chunks.append(MarkdownChunk(
|
||||
chunk_id=chunk_id,
|
||||
content=chunk_content,
|
||||
element_type=elem.type,
|
||||
header_context=header_context,
|
||||
level=elem.level,
|
||||
language=elem.language,
|
||||
metadata=elem.metadata,
|
||||
))
|
||||
chunk_index += 1
|
||||
|
||||
return chunks
|
||||
|
||||
def _format_element_content(self, elem: MarkdownElement) -> str:
|
||||
"""Format element content based on type."""
|
||||
if elem.type == MarkdownElementType.CODE_BLOCK:
|
||||
lang = elem.language or ""
|
||||
return f"```{lang}\n{elem.content}\n```"
|
||||
|
||||
elif elem.type == MarkdownElementType.TABLE:
|
||||
return elem.content
|
||||
|
||||
elif elem.type == MarkdownElementType.LIST:
|
||||
return elem.content
|
||||
|
||||
elif elem.type == MarkdownElementType.BLOCKQUOTE:
|
||||
lines = elem.content.split('\n')
|
||||
return '\n'.join([f"> {line}" for line in lines])
|
||||
|
||||
elif elem.type == MarkdownElementType.PARAGRAPH:
|
||||
return elem.content
|
||||
|
||||
return elem.content
|
||||
|
||||
def _split_large_element(
|
||||
self,
|
||||
elem: MarkdownElement,
|
||||
base_id: str,
|
||||
header_context: list[str],
|
||||
start_index: int,
|
||||
) -> list[MarkdownChunk]:
|
||||
"""Split a large element into smaller chunks."""
|
||||
chunks = []
|
||||
|
||||
if elem.type == MarkdownElementType.CODE_BLOCK:
|
||||
chunks = self._split_code_block(elem, base_id, header_context, start_index)
|
||||
elif elem.type == MarkdownElementType.TABLE:
|
||||
chunks = self._split_table(elem, base_id, header_context, start_index)
|
||||
elif elem.type == MarkdownElementType.LIST:
|
||||
chunks = self._split_list(elem, base_id, header_context, start_index)
|
||||
else:
|
||||
chunks = self._split_text(elem, base_id, header_context, start_index)
|
||||
|
||||
return chunks
|
||||
|
||||
def _split_code_block(
|
||||
self,
|
||||
elem: MarkdownElement,
|
||||
base_id: str,
|
||||
header_context: list[str],
|
||||
start_index: int,
|
||||
) -> list[MarkdownChunk]:
|
||||
"""Split code block while preserving language marker."""
|
||||
chunks = []
|
||||
lines = elem.content.split('\n')
|
||||
current_lines = []
|
||||
current_size = 0
|
||||
sub_index = 0
|
||||
|
||||
for line in lines:
|
||||
if current_size + len(line) + 1 > self._max_chunk_size and current_lines:
|
||||
chunk_content = f"```{elem.language}\n" + '\n'.join(current_lines) + "\n```"
|
||||
chunks.append(MarkdownChunk(
|
||||
chunk_id=f"{base_id}_{sub_index}",
|
||||
content=chunk_content,
|
||||
element_type=MarkdownElementType.CODE_BLOCK,
|
||||
header_context=header_context,
|
||||
language=elem.language,
|
||||
metadata={**elem.metadata, "is_partial": True, "part": sub_index + 1},
|
||||
))
|
||||
sub_index += 1
|
||||
current_lines = []
|
||||
current_size = 0
|
||||
|
||||
current_lines.append(line)
|
||||
current_size += len(line) + 1
|
||||
|
||||
if current_lines:
|
||||
chunk_content = f"```{elem.language}\n" + '\n'.join(current_lines) + "\n```"
|
||||
chunks.append(MarkdownChunk(
|
||||
chunk_id=f"{base_id}_{sub_index}",
|
||||
content=chunk_content,
|
||||
element_type=MarkdownElementType.CODE_BLOCK,
|
||||
header_context=header_context,
|
||||
language=elem.language,
|
||||
metadata={**elem.metadata, "is_partial": sub_index > 0, "part": sub_index + 1},
|
||||
))
|
||||
|
||||
return chunks
|
||||
|
||||
def _split_table(
|
||||
self,
|
||||
elem: MarkdownElement,
|
||||
base_id: str,
|
||||
header_context: list[str],
|
||||
start_index: int,
|
||||
) -> list[MarkdownChunk]:
|
||||
"""Split table while preserving header row."""
|
||||
chunks = []
|
||||
lines = elem.content.split('\n')
|
||||
|
||||
if len(lines) < 2:
|
||||
return [MarkdownChunk(
|
||||
chunk_id=f"{base_id}_0",
|
||||
content=elem.content,
|
||||
element_type=MarkdownElementType.TABLE,
|
||||
header_context=header_context,
|
||||
metadata=elem.metadata,
|
||||
)]
|
||||
|
||||
header_line = lines[0]
|
||||
separator_line = lines[1]
|
||||
data_lines = lines[2:]
|
||||
|
||||
current_lines = [header_line, separator_line]
|
||||
current_size = len(header_line) + len(separator_line) + 2
|
||||
sub_index = 0
|
||||
|
||||
for line in data_lines:
|
||||
if current_size + len(line) + 1 > self._max_chunk_size and len(current_lines) > 2:
|
||||
chunks.append(MarkdownChunk(
|
||||
chunk_id=f"{base_id}_{sub_index}",
|
||||
content='\n'.join(current_lines),
|
||||
element_type=MarkdownElementType.TABLE,
|
||||
header_context=header_context,
|
||||
metadata={**elem.metadata, "is_partial": True, "part": sub_index + 1},
|
||||
))
|
||||
sub_index += 1
|
||||
current_lines = [header_line, separator_line]
|
||||
current_size = len(header_line) + len(separator_line) + 2
|
||||
|
||||
current_lines.append(line)
|
||||
current_size += len(line) + 1
|
||||
|
||||
if len(current_lines) > 2:
|
||||
chunks.append(MarkdownChunk(
|
||||
chunk_id=f"{base_id}_{sub_index}",
|
||||
content='\n'.join(current_lines),
|
||||
element_type=MarkdownElementType.TABLE,
|
||||
header_context=header_context,
|
||||
metadata={**elem.metadata, "is_partial": sub_index > 0, "part": sub_index + 1},
|
||||
))
|
||||
|
||||
return chunks
|
||||
|
||||
def _split_list(
|
||||
self,
|
||||
elem: MarkdownElement,
|
||||
base_id: str,
|
||||
header_context: list[str],
|
||||
start_index: int,
|
||||
) -> list[MarkdownChunk]:
|
||||
"""Split list into smaller chunks."""
|
||||
chunks = []
|
||||
items = elem.content.split('\n')
|
||||
current_items = []
|
||||
current_size = 0
|
||||
sub_index = 0
|
||||
|
||||
for item in items:
|
||||
if current_size + len(item) + 1 > self._max_chunk_size and current_items:
|
||||
chunks.append(MarkdownChunk(
|
||||
chunk_id=f"{base_id}_{sub_index}",
|
||||
content='\n'.join(current_items),
|
||||
element_type=MarkdownElementType.LIST,
|
||||
header_context=header_context,
|
||||
metadata={**elem.metadata, "is_partial": True, "part": sub_index + 1},
|
||||
))
|
||||
sub_index += 1
|
||||
current_items = []
|
||||
current_size = 0
|
||||
|
||||
current_items.append(item)
|
||||
current_size += len(item) + 1
|
||||
|
||||
if current_items:
|
||||
chunks.append(MarkdownChunk(
|
||||
chunk_id=f"{base_id}_{sub_index}",
|
||||
content='\n'.join(current_items),
|
||||
element_type=MarkdownElementType.LIST,
|
||||
header_context=header_context,
|
||||
metadata={**elem.metadata, "is_partial": sub_index > 0, "part": sub_index + 1},
|
||||
))
|
||||
|
||||
return chunks
|
||||
|
||||
def _split_text(
|
||||
self,
|
||||
elem: MarkdownElement,
|
||||
base_id: str,
|
||||
header_context: list[str],
|
||||
start_index: int,
|
||||
) -> list[MarkdownChunk]:
|
||||
"""Split text content by sentences or paragraphs."""
|
||||
chunks = []
|
||||
text = elem.content
|
||||
sub_index = 0
|
||||
|
||||
paragraphs = text.split('\n\n')
|
||||
|
||||
current_content = ""
|
||||
current_size = 0
|
||||
|
||||
for para in paragraphs:
|
||||
if current_size + len(para) + 2 > self._max_chunk_size and current_content:
|
||||
chunks.append(MarkdownChunk(
|
||||
chunk_id=f"{base_id}_{sub_index}",
|
||||
content=current_content.strip(),
|
||||
element_type=elem.type,
|
||||
header_context=header_context,
|
||||
metadata={**elem.metadata, "is_partial": True, "part": sub_index + 1},
|
||||
))
|
||||
sub_index += 1
|
||||
current_content = ""
|
||||
current_size = 0
|
||||
|
||||
current_content += para + "\n\n"
|
||||
current_size += len(para) + 2
|
||||
|
||||
if current_content.strip():
|
||||
chunks.append(MarkdownChunk(
|
||||
chunk_id=f"{base_id}_{sub_index}",
|
||||
content=current_content.strip(),
|
||||
element_type=elem.type,
|
||||
header_context=header_context,
|
||||
metadata={**elem.metadata, "is_partial": sub_index > 0, "part": sub_index + 1},
|
||||
))
|
||||
|
||||
return chunks
|
||||
|
||||
|
||||
def chunk_markdown(
    text: str,
    doc_id: str = "",
    max_chunk_size: int = 1000,
    min_chunk_size: int = 100,
    preserve_code_blocks: bool = True,
    preserve_tables: bool = True,
    preserve_lists: bool = True,
    include_header_context: bool = True,
) -> list[dict[str, Any]]:
    """
    Chunk Markdown text and return plain-dict chunks.

    Thin convenience wrapper: builds a :class:`MarkdownChunker` from the
    keyword options, chunks *text*, and serialises each chunk via its
    ``to_dict`` method.

    Args:
        text: Raw Markdown text
        doc_id: Optional document ID
        max_chunk_size: Maximum chunk size in characters
        min_chunk_size: Minimum chunk size in characters
        preserve_code_blocks: Whether to preserve code blocks
        preserve_tables: Whether to preserve tables
        preserve_lists: Whether to preserve lists
        include_header_context: Whether to include header context

    Returns:
        List of chunk dictionaries
    """
    return [
        piece.to_dict()
        for piece in MarkdownChunker(
            max_chunk_size=max_chunk_size,
            min_chunk_size=min_chunk_size,
            preserve_code_blocks=preserve_code_blocks,
            preserve_tables=preserve_tables,
            preserve_lists=preserve_lists,
            include_header_context=include_header_context,
        ).chunk(text, doc_id)
    ]
|
||||
|
|
@ -0,0 +1,178 @@
|
|||
"""
|
||||
Markdown parser with intelligent chunking.
|
||||
[AC-AISVC-33] Markdown file parsing with structure-aware chunking.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from app.services.document.base import (
|
||||
DocumentParseException,
|
||||
DocumentParser,
|
||||
ParseResult,
|
||||
)
|
||||
from app.services.document.markdown_chunker import (
|
||||
MarkdownChunker,
|
||||
MarkdownElementType,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)

# Candidate encodings, tried in order when reading a Markdown file: UTF-8
# first, then common CJK codecs, then UTF-16.  latin-1 accepts any byte
# sequence, so it acts as a last-resort fallback that always decodes.
ENCODINGS_TO_TRY = ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]
|
||||
|
||||
|
||||
class MarkdownParser(DocumentParser):
    """
    Parser for Markdown files with intelligent chunking.
    [AC-AISVC-33] Structure-aware parsing for Markdown documents.

    Features:
        - Header hierarchy extraction
        - Code block preservation
        - Table structure preservation
        - List grouping
        - Context-aware chunking
    """

    def __init__(
        self,
        encoding: str = "utf-8",
        max_chunk_size: int = 1000,
        min_chunk_size: int = 100,
        preserve_code_blocks: bool = True,
        preserve_tables: bool = True,
        preserve_lists: bool = True,
        include_header_context: bool = True,
        **kwargs: Any,
    ):
        """
        Args:
            encoding: Preferred encoding hint (decoding still falls back
                through ENCODINGS_TO_TRY).
            max_chunk_size: Maximum chunk size in characters.
            min_chunk_size: Minimum chunk size in characters.
            preserve_code_blocks: Keep code blocks intact when chunking.
            preserve_tables: Keep tables intact when chunking.
            preserve_lists: Keep lists intact when chunking.
            include_header_context: Attach header hierarchy to chunks.
            **kwargs: Extra configuration, stored but not interpreted here.
        """
        self._encoding = encoding
        self._max_chunk_size = max_chunk_size
        self._min_chunk_size = min_chunk_size
        self._preserve_code_blocks = preserve_code_blocks
        self._preserve_tables = preserve_tables
        self._preserve_lists = preserve_lists
        self._include_header_context = include_header_context
        self._extra_config = kwargs

        # Chunking is delegated entirely to MarkdownChunker.
        self._chunker = MarkdownChunker(
            max_chunk_size=max_chunk_size,
            min_chunk_size=min_chunk_size,
            preserve_code_blocks=preserve_code_blocks,
            preserve_tables=preserve_tables,
            preserve_lists=preserve_lists,
            include_header_context=include_header_context,
        )

    def _try_encodings(self, path: Path) -> tuple[str, str]:
        """
        Try each encoding in ENCODINGS_TO_TRY until one decodes the file.

        Returns: (text, encoding_used)

        Raises:
            DocumentParseException: If no candidate encoding succeeds.
        """
        for enc in ENCODINGS_TO_TRY:
            try:
                with open(path, encoding=enc) as f:
                    text = f.read()
                    logger.info(f"Successfully parsed Markdown with encoding: {enc}")
                    return text, enc
            except (UnicodeDecodeError, LookupError):
                continue

        raise DocumentParseException(
            "Failed to decode Markdown file with any known encoding",
            file_path=str(path),
            parser="markdown"
        )

    def parse(self, file_path: str | Path) -> ParseResult:
        """
        Parse a Markdown file and extract structured content.
        [AC-AISVC-33] Structure-aware parsing.

        Raises:
            DocumentParseException: If the file is missing, cannot be
                decoded, or chunking fails.
        """
        path = Path(file_path)

        if not path.exists():
            raise DocumentParseException(
                f"File not found: {path}",
                file_path=str(path),
                parser="markdown"
            )

        try:
            text, encoding_used = self._try_encodings(path)

            file_size = path.stat().st_size
            line_count = text.count("\n") + 1

            chunks = self._chunker.chunk(text, doc_id=path.stem)

            # One counting pass over the chunks instead of four separate
            # generator scans.
            type_counts: dict[Any, int] = {}
            for c in chunks:
                type_counts[c.element_type] = type_counts.get(c.element_type, 0) + 1

            # NOTE(review): MarkdownChunker.chunk consumes HEADER elements
            # as context and never emits HEADER chunks, so this count is
            # effectively always 0 — consider counting headers from the
            # parsed elements instead.
            header_count = type_counts.get(MarkdownElementType.HEADER, 0)
            code_block_count = type_counts.get(MarkdownElementType.CODE_BLOCK, 0)
            table_count = type_counts.get(MarkdownElementType.TABLE, 0)
            list_count = type_counts.get(MarkdownElementType.LIST, 0)

            logger.info(
                f"Parsed Markdown: {path.name}, lines={line_count}, "
                f"chars={len(text)}, chunks={len(chunks)}, "
                f"headers={header_count}, code_blocks={code_block_count}, "
                f"tables={table_count}, lists={list_count}"
            )

            return ParseResult(
                text=text,
                source_path=str(path),
                file_size=file_size,
                metadata={
                    "format": "markdown",
                    "line_count": line_count,
                    "encoding": encoding_used,
                    "chunk_count": len(chunks),
                    "structure": {
                        "headers": header_count,
                        "code_blocks": code_block_count,
                        "tables": table_count,
                        "lists": list_count,
                    },
                    "chunks": [chunk.to_dict() for chunk in chunks],
                }
            )

        except DocumentParseException:
            raise
        except Exception as e:
            # Chain the original exception for a complete traceback.
            raise DocumentParseException(
                f"Failed to parse Markdown file: {e}",
                file_path=str(path),
                parser="markdown",
                details={"error": str(e)}
            ) from e

    def get_supported_extensions(self) -> list[str]:
        """Get supported file extensions."""
        return [".md", ".markdown"]

    def get_chunks(self, text: str, doc_id: str = "") -> list[dict[str, Any]]:
        """
        Get structured chunks from Markdown text.

        Args:
            text: Markdown text content
            doc_id: Optional document ID

        Returns:
            List of chunk dictionaries
        """
        chunks = self._chunker.chunk(text, doc_id)
        return [chunk.to_dict() for chunk in chunks]
|
||||
Loading…
Reference in New Issue