[AC-DOC-PARSER] feat(document): 新增图片和 Markdown 解析器
- 新增 ImageParser 支持图片文件解析 - 新增 MarkdownParser 支持 Markdown 文件解析 - 新增 MarkdownChunker 实现 Markdown 智能分块 - 支持按标题、段落、代码块等元素类型分块 - 更新 document 模块导出和工厂方法
This commit is contained in:
parent
b3680bda8a
commit
4de2a2aece
|
|
@ -16,6 +16,16 @@ from app.services.document.factory import (
|
||||||
get_supported_document_formats,
|
get_supported_document_formats,
|
||||||
parse_document,
|
parse_document,
|
||||||
)
|
)
|
||||||
|
from app.services.document.image_parser import ImageParser
|
||||||
|
from app.services.document.markdown_chunker import (
|
||||||
|
MarkdownChunk,
|
||||||
|
MarkdownChunker,
|
||||||
|
MarkdownElement,
|
||||||
|
MarkdownElementType,
|
||||||
|
MarkdownParser as MarkdownStructureParser,
|
||||||
|
chunk_markdown,
|
||||||
|
)
|
||||||
|
from app.services.document.markdown_parser import MarkdownParser
|
||||||
from app.services.document.pdf_parser import PDFParser, PDFPlumberParser
|
from app.services.document.pdf_parser import PDFParser, PDFPlumberParser
|
||||||
from app.services.document.text_parser import TextParser
|
from app.services.document.text_parser import TextParser
|
||||||
from app.services.document.word_parser import WordParser
|
from app.services.document.word_parser import WordParser
|
||||||
|
|
@ -35,4 +45,12 @@ __all__ = [
|
||||||
"ExcelParser",
|
"ExcelParser",
|
||||||
"CSVParser",
|
"CSVParser",
|
||||||
"TextParser",
|
"TextParser",
|
||||||
|
"MarkdownParser",
|
||||||
|
"MarkdownChunker",
|
||||||
|
"MarkdownChunk",
|
||||||
|
"MarkdownElement",
|
||||||
|
"MarkdownElementType",
|
||||||
|
"MarkdownStructureParser",
|
||||||
|
"chunk_markdown",
|
||||||
|
"ImageParser",
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,8 @@ from app.services.document.base import (
|
||||||
UnsupportedFormatError,
|
UnsupportedFormatError,
|
||||||
)
|
)
|
||||||
from app.services.document.excel_parser import CSVParser, ExcelParser
|
from app.services.document.excel_parser import CSVParser, ExcelParser
|
||||||
|
from app.services.document.image_parser import ImageParser
|
||||||
|
from app.services.document.markdown_parser import MarkdownParser
|
||||||
from app.services.document.pdf_parser import PDFParser, PDFPlumberParser
|
from app.services.document.pdf_parser import PDFParser, PDFPlumberParser
|
||||||
from app.services.document.text_parser import TextParser
|
from app.services.document.text_parser import TextParser
|
||||||
from app.services.document.word_parser import WordParser
|
from app.services.document.word_parser import WordParser
|
||||||
|
|
@ -45,6 +47,8 @@ class DocumentParserFactory:
|
||||||
"excel": ExcelParser,
|
"excel": ExcelParser,
|
||||||
"csv": CSVParser,
|
"csv": CSVParser,
|
||||||
"text": TextParser,
|
"text": TextParser,
|
||||||
|
"markdown": MarkdownParser,
|
||||||
|
"image": ImageParser,
|
||||||
}
|
}
|
||||||
|
|
||||||
cls._extension_map = {
|
cls._extension_map = {
|
||||||
|
|
@ -54,14 +58,22 @@ class DocumentParserFactory:
|
||||||
".xls": "excel",
|
".xls": "excel",
|
||||||
".csv": "csv",
|
".csv": "csv",
|
||||||
".txt": "text",
|
".txt": "text",
|
||||||
".md": "text",
|
".md": "markdown",
|
||||||
".markdown": "text",
|
".markdown": "markdown",
|
||||||
".rst": "text",
|
".rst": "text",
|
||||||
".log": "text",
|
".log": "text",
|
||||||
".json": "text",
|
".json": "text",
|
||||||
".xml": "text",
|
".xml": "text",
|
||||||
".yaml": "text",
|
".yaml": "text",
|
||||||
".yml": "text",
|
".yml": "text",
|
||||||
|
".jpg": "image",
|
||||||
|
".jpeg": "image",
|
||||||
|
".png": "image",
|
||||||
|
".gif": "image",
|
||||||
|
".webp": "image",
|
||||||
|
".bmp": "image",
|
||||||
|
".tiff": "image",
|
||||||
|
".tif": "image",
|
||||||
}
|
}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|
@ -174,6 +186,8 @@ class DocumentParserFactory:
|
||||||
"excel": "Excel 电子表格",
|
"excel": "Excel 电子表格",
|
||||||
"csv": "CSV 文件",
|
"csv": "CSV 文件",
|
||||||
"text": "文本文件",
|
"text": "文本文件",
|
||||||
|
"markdown": "Markdown 文档",
|
||||||
|
"image": "图片文件",
|
||||||
}
|
}
|
||||||
|
|
||||||
descriptions = {
|
descriptions = {
|
||||||
|
|
@ -183,6 +197,8 @@ class DocumentParserFactory:
|
||||||
"excel": "解析 Excel 电子表格,支持多工作表",
|
"excel": "解析 Excel 电子表格,支持多工作表",
|
||||||
"csv": "解析 CSV 文件,自动检测编码",
|
"csv": "解析 CSV 文件,自动检测编码",
|
||||||
"text": "解析纯文本文件,支持多种编码",
|
"text": "解析纯文本文件,支持多种编码",
|
||||||
|
"markdown": "智能解析 Markdown 文档,保留结构(标题、代码块、表格、列表)",
|
||||||
|
"image": "使用多模态 LLM 解析图片,提取文字和关键信息",
|
||||||
}
|
}
|
||||||
|
|
||||||
info.append({
|
info.append({
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,490 @@
|
||||||
|
"""
|
||||||
|
Image parser using multimodal LLM.
|
||||||
|
Supports parsing images into structured text content for knowledge base indexing.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from app.services.document.base import (
|
||||||
|
DocumentParseException,
|
||||||
|
DocumentParser,
|
||||||
|
PageText,
|
||||||
|
ParseResult,
|
||||||
|
)
|
||||||
|
from app.services.llm.factory import LLMUsageType, get_llm_config_manager
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
IMAGE_SYSTEM_PROMPT = """你是一个专业的图像内容分析助手。你的任务是分析图片内容,并将其智能拆分为适合知识库检索的独立数据块。
|
||||||
|
|
||||||
|
## 分析要求
|
||||||
|
1. 仔细分析图片内容,识别其中的文字、图表、数据等信息
|
||||||
|
2. 根据内容的逻辑结构,智能判断如何拆分为独立的知识条目
|
||||||
|
3. 每个条目应该是独立、完整、可检索的知识单元
|
||||||
|
|
||||||
|
## 输出格式
|
||||||
|
请严格按照以下 JSON 格式输出,不要添加任何其他内容:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"image_summary": "图片整体概述(一句话描述图片主题)",
|
||||||
|
"total_chunks": <分块总数>,
|
||||||
|
"chunks": [
|
||||||
|
{
|
||||||
|
"chunk_index": 0,
|
||||||
|
"content": "该分块的完整内容文字",
|
||||||
|
"chunk_type": "text|table|list|diagram|chart|mixed",
|
||||||
|
"keywords": ["关键词1", "关键词2"]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## 分块策略
|
||||||
|
- **单一内容**: 如果图片只有一段完整的文字/信息,可以只输出1个分块
|
||||||
|
- **多段落内容**: 按段落或逻辑单元拆分,每个段落作为独立分块
|
||||||
|
- **表格数据**: 将表格内容转换为结构化文字,作为一个分块
|
||||||
|
- **图表数据**: 描述图表内容和数据,作为一个分块
|
||||||
|
- **列表内容**: 每个列表项可作为独立分块,或合并为相关的一组
|
||||||
|
- **混合内容**: 根据内容类型分别处理,确保每个分块主题明确
|
||||||
|
|
||||||
|
## 注意事项
|
||||||
|
1. 每个分块的 content 必须是完整、可独立理解的文字
|
||||||
|
2. chunk_type 用于标识内容类型,便于后续处理
|
||||||
|
3. keywords 提取该分块的核心关键词,便于检索
|
||||||
|
4. 确保输出的 JSON 格式正确,可以被解析"""
|
||||||
|
|
||||||
|
IMAGE_USER_PROMPT = "请分析这张图片,按照要求的 JSON 格式输出分块结果。"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ImageChunk:
|
||||||
|
"""智能分块结果"""
|
||||||
|
chunk_index: int
|
||||||
|
content: str
|
||||||
|
chunk_type: str = "text"
|
||||||
|
keywords: list[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ImageParseResult:
|
||||||
|
"""图片解析结果(包含智能分块)"""
|
||||||
|
image_summary: str
|
||||||
|
chunks: list[ImageChunk]
|
||||||
|
raw_text: str
|
||||||
|
source_path: str
|
||||||
|
file_size: int
|
||||||
|
metadata: dict[str, Any] = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
class ImageParser(DocumentParser):
|
||||||
|
"""
|
||||||
|
Image parser using multimodal LLM.
|
||||||
|
|
||||||
|
Supports common image formats and extracts text content using
|
||||||
|
vision-capable LLM models (GPT-4V, GPT-4o, etc.).
|
||||||
|
|
||||||
|
Features:
|
||||||
|
- Intelligent chunking based on content structure
|
||||||
|
- Structured output with keywords and chunk types
|
||||||
|
- Support for various content types (text, table, chart, etc.)
|
||||||
|
"""
|
||||||
|
|
||||||
|
SUPPORTED_EXTENSIONS = [
|
||||||
|
".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif"
|
||||||
|
]
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
model: str | None = None,
|
||||||
|
max_tokens: int = 4096,
|
||||||
|
timeout_seconds: int = 120,
|
||||||
|
):
|
||||||
|
self._model = model
|
||||||
|
self._max_tokens = max_tokens
|
||||||
|
self._timeout_seconds = timeout_seconds
|
||||||
|
|
||||||
|
def parse(self, file_path: str | Path) -> ParseResult:
|
||||||
|
"""
|
||||||
|
Parse an image file and extract text content using multimodal LLM.
|
||||||
|
|
||||||
|
Note: This method is synchronous but internally uses async operations.
|
||||||
|
For async contexts, use parse_async() instead.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: Path to the image file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ParseResult with extracted text content.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
DocumentParseException: If parsing fails.
|
||||||
|
"""
|
||||||
|
path = Path(file_path)
|
||||||
|
if not path.exists():
|
||||||
|
raise DocumentParseException(
|
||||||
|
f"Image file not found: {file_path}",
|
||||||
|
file_path=str(path),
|
||||||
|
parser="image",
|
||||||
|
)
|
||||||
|
|
||||||
|
file_size = path.stat().st_size
|
||||||
|
extension = path.suffix.lower()
|
||||||
|
|
||||||
|
if extension not in self.SUPPORTED_EXTENSIONS:
|
||||||
|
raise DocumentParseException(
|
||||||
|
f"Unsupported image format: {extension}",
|
||||||
|
file_path=str(path),
|
||||||
|
parser="image",
|
||||||
|
details={"supported_formats": self.SUPPORTED_EXTENSIONS},
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(path, "rb") as f:
|
||||||
|
image_data = f.read()
|
||||||
|
|
||||||
|
image_base64 = base64.b64encode(image_data).decode("utf-8")
|
||||||
|
mime_type = self._get_mime_type(extension)
|
||||||
|
|
||||||
|
try:
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
import concurrent.futures
|
||||||
|
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||||
|
future = executor.submit(
|
||||||
|
asyncio.run,
|
||||||
|
self._analyze_image_async(image_base64, mime_type)
|
||||||
|
)
|
||||||
|
result = future.result()
|
||||||
|
except RuntimeError:
|
||||||
|
result = asyncio.run(self._analyze_image_async(image_base64, mime_type))
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"[IMAGE-PARSER] Successfully parsed image: {path.name}, "
|
||||||
|
f"size={file_size}, chunks={len(result.chunks)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return ParseResult(
|
||||||
|
text=result.raw_text,
|
||||||
|
source_path=str(path),
|
||||||
|
file_size=file_size,
|
||||||
|
page_count=1,
|
||||||
|
metadata={
|
||||||
|
"format": extension,
|
||||||
|
"parser": "image",
|
||||||
|
"mime_type": mime_type,
|
||||||
|
"image_summary": result.image_summary,
|
||||||
|
"chunk_count": len(result.chunks),
|
||||||
|
"chunks": [
|
||||||
|
{
|
||||||
|
"chunk_index": c.chunk_index,
|
||||||
|
"content": c.content,
|
||||||
|
"chunk_type": c.chunk_type,
|
||||||
|
"keywords": c.keywords,
|
||||||
|
}
|
||||||
|
for c in result.chunks
|
||||||
|
],
|
||||||
|
},
|
||||||
|
pages=[PageText(page=1, text=result.raw_text)],
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"[IMAGE-PARSER] Failed to parse image {path}: {e}")
|
||||||
|
raise DocumentParseException(
|
||||||
|
f"Failed to parse image: {str(e)}",
|
||||||
|
file_path=str(path),
|
||||||
|
parser="image",
|
||||||
|
details={"error": str(e)},
|
||||||
|
)
|
||||||
|
|
||||||
|
async def parse_async(self, file_path: str | Path) -> ParseResult:
|
||||||
|
"""
|
||||||
|
Async version of parse method for use in async contexts.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: Path to the image file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ParseResult with extracted text content.
|
||||||
|
"""
|
||||||
|
path = Path(file_path)
|
||||||
|
if not path.exists():
|
||||||
|
raise DocumentParseException(
|
||||||
|
f"Image file not found: {file_path}",
|
||||||
|
file_path=str(path),
|
||||||
|
parser="image",
|
||||||
|
)
|
||||||
|
|
||||||
|
file_size = path.stat().st_size
|
||||||
|
extension = path.suffix.lower()
|
||||||
|
|
||||||
|
if extension not in self.SUPPORTED_EXTENSIONS:
|
||||||
|
raise DocumentParseException(
|
||||||
|
f"Unsupported image format: {extension}",
|
||||||
|
file_path=str(path),
|
||||||
|
parser="image",
|
||||||
|
details={"supported_formats": self.SUPPORTED_EXTENSIONS},
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(path, "rb") as f:
|
||||||
|
image_data = f.read()
|
||||||
|
|
||||||
|
image_base64 = base64.b64encode(image_data).decode("utf-8")
|
||||||
|
mime_type = self._get_mime_type(extension)
|
||||||
|
|
||||||
|
result = await self._analyze_image_async(image_base64, mime_type)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"[IMAGE-PARSER] Successfully parsed image (async): {path.name}, "
|
||||||
|
f"size={file_size}, chunks={len(result.chunks)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return ParseResult(
|
||||||
|
text=result.raw_text,
|
||||||
|
source_path=str(path),
|
||||||
|
file_size=file_size,
|
||||||
|
page_count=1,
|
||||||
|
metadata={
|
||||||
|
"format": extension,
|
||||||
|
"parser": "image",
|
||||||
|
"mime_type": mime_type,
|
||||||
|
"image_summary": result.image_summary,
|
||||||
|
"chunk_count": len(result.chunks),
|
||||||
|
"chunks": [
|
||||||
|
{
|
||||||
|
"chunk_index": c.chunk_index,
|
||||||
|
"content": c.content,
|
||||||
|
"chunk_type": c.chunk_type,
|
||||||
|
"keywords": c.keywords,
|
||||||
|
}
|
||||||
|
for c in result.chunks
|
||||||
|
],
|
||||||
|
},
|
||||||
|
pages=[PageText(page=1, text=result.raw_text)],
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"[IMAGE-PARSER] Failed to parse image {path}: {e}")
|
||||||
|
raise DocumentParseException(
|
||||||
|
f"Failed to parse image: {str(e)}",
|
||||||
|
file_path=str(path),
|
||||||
|
parser="image",
|
||||||
|
details={"error": str(e)},
|
||||||
|
)
|
||||||
|
|
||||||
|
async def parse_with_chunks(self, file_path: str | Path) -> ImageParseResult:
|
||||||
|
"""
|
||||||
|
Parse image and return structured result with intelligent chunks.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: Path to the image file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ImageParseResult with intelligent chunks.
|
||||||
|
"""
|
||||||
|
path = Path(file_path)
|
||||||
|
if not path.exists():
|
||||||
|
raise DocumentParseException(
|
||||||
|
f"Image file not found: {file_path}",
|
||||||
|
file_path=str(path),
|
||||||
|
parser="image",
|
||||||
|
)
|
||||||
|
|
||||||
|
file_size = path.stat().st_size
|
||||||
|
extension = path.suffix.lower()
|
||||||
|
|
||||||
|
if extension not in self.SUPPORTED_EXTENSIONS:
|
||||||
|
raise DocumentParseException(
|
||||||
|
f"Unsupported image format: {extension}",
|
||||||
|
file_path=str(path),
|
||||||
|
parser="image",
|
||||||
|
details={"supported_formats": self.SUPPORTED_EXTENSIONS},
|
||||||
|
)
|
||||||
|
|
||||||
|
with open(path, "rb") as f:
|
||||||
|
image_data = f.read()
|
||||||
|
|
||||||
|
image_base64 = base64.b64encode(image_data).decode("utf-8")
|
||||||
|
mime_type = self._get_mime_type(extension)
|
||||||
|
|
||||||
|
result = await self._analyze_image_async(image_base64, mime_type)
|
||||||
|
result.source_path = str(path)
|
||||||
|
result.file_size = file_size
|
||||||
|
result.metadata = {
|
||||||
|
"format": extension,
|
||||||
|
"parser": "image",
|
||||||
|
"mime_type": mime_type,
|
||||||
|
}
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
async def _analyze_image_async(self, image_base64: str, mime_type: str) -> ImageParseResult:
|
||||||
|
"""
|
||||||
|
Analyze image using multimodal LLM and return structured chunks.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
image_base64: Base64 encoded image data.
|
||||||
|
mime_type: MIME type of the image.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ImageParseResult with intelligent chunks.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
manager = get_llm_config_manager()
|
||||||
|
client = manager.get_kb_processing_client()
|
||||||
|
|
||||||
|
config = manager.kb_processing_config
|
||||||
|
model = self._model or config.get("model", "gpt-4o-mini")
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": IMAGE_SYSTEM_PROMPT,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": IMAGE_USER_PROMPT,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": f"data:{mime_type};base64,{image_base64}",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
from app.services.llm.base import LLMConfig
|
||||||
|
|
||||||
|
llm_config = LLMConfig(
|
||||||
|
model=model,
|
||||||
|
max_tokens=self._max_tokens,
|
||||||
|
temperature=0.3,
|
||||||
|
timeout_seconds=self._timeout_seconds,
|
||||||
|
)
|
||||||
|
|
||||||
|
response = await client.generate(messages=messages, config=llm_config)
|
||||||
|
|
||||||
|
if not response.content:
|
||||||
|
raise DocumentParseException(
|
||||||
|
"LLM returned empty response for image analysis",
|
||||||
|
parser="image",
|
||||||
|
)
|
||||||
|
|
||||||
|
return self._parse_llm_response(response.content)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"[IMAGE-PARSER] LLM analysis failed: {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
def _parse_llm_response(self, response_content: str) -> ImageParseResult:
|
||||||
|
"""
|
||||||
|
Parse LLM response into structured ImageParseResult.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
response_content: Raw LLM response content.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ImageParseResult with parsed chunks.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
json_str = self._extract_json(response_content)
|
||||||
|
data = json.loads(json_str)
|
||||||
|
|
||||||
|
image_summary = data.get("image_summary", "")
|
||||||
|
chunks_data = data.get("chunks", [])
|
||||||
|
|
||||||
|
chunks = []
|
||||||
|
for chunk_data in chunks_data:
|
||||||
|
chunk = ImageChunk(
|
||||||
|
chunk_index=chunk_data.get("chunk_index", len(chunks)),
|
||||||
|
content=chunk_data.get("content", ""),
|
||||||
|
chunk_type=chunk_data.get("chunk_type", "text"),
|
||||||
|
keywords=chunk_data.get("keywords", []),
|
||||||
|
)
|
||||||
|
if chunk.content.strip():
|
||||||
|
chunks.append(chunk)
|
||||||
|
|
||||||
|
if not chunks:
|
||||||
|
chunks.append(ImageChunk(
|
||||||
|
chunk_index=0,
|
||||||
|
content=response_content,
|
||||||
|
chunk_type="text",
|
||||||
|
keywords=[],
|
||||||
|
))
|
||||||
|
|
||||||
|
raw_text = "\n\n".join([c.content for c in chunks])
|
||||||
|
|
||||||
|
return ImageParseResult(
|
||||||
|
image_summary=image_summary,
|
||||||
|
chunks=chunks,
|
||||||
|
raw_text=raw_text,
|
||||||
|
source_path="",
|
||||||
|
file_size=0,
|
||||||
|
)
|
||||||
|
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
logger.warning(f"[IMAGE-PARSER] Failed to parse JSON response: {e}, using fallback")
|
||||||
|
return ImageParseResult(
|
||||||
|
image_summary="图片内容",
|
||||||
|
chunks=[ImageChunk(
|
||||||
|
chunk_index=0,
|
||||||
|
content=response_content,
|
||||||
|
chunk_type="text",
|
||||||
|
keywords=[],
|
||||||
|
)],
|
||||||
|
raw_text=response_content,
|
||||||
|
source_path="",
|
||||||
|
file_size=0,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _extract_json(self, content: str) -> str:
|
||||||
|
"""
|
||||||
|
Extract JSON from LLM response content.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
content: Raw response content that may contain JSON.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Extracted JSON string.
|
||||||
|
"""
|
||||||
|
content = content.strip()
|
||||||
|
|
||||||
|
if content.startswith("{") and content.endswith("}"):
|
||||||
|
return content
|
||||||
|
|
||||||
|
json_start = content.find("{")
|
||||||
|
json_end = content.rfind("}")
|
||||||
|
|
||||||
|
if json_start != -1 and json_end != -1 and json_end > json_start:
|
||||||
|
return content[json_start:json_end + 1]
|
||||||
|
|
||||||
|
return content
|
||||||
|
|
||||||
|
def _get_mime_type(self, extension: str) -> str:
|
||||||
|
"""Get MIME type for image extension."""
|
||||||
|
mime_types = {
|
||||||
|
".jpg": "image/jpeg",
|
||||||
|
".jpeg": "image/jpeg",
|
||||||
|
".png": "image/png",
|
||||||
|
".gif": "image/gif",
|
||||||
|
".webp": "image/webp",
|
||||||
|
".bmp": "image/bmp",
|
||||||
|
".tiff": "image/tiff",
|
||||||
|
".tif": "image/tiff",
|
||||||
|
}
|
||||||
|
return mime_types.get(extension.lower(), "image/jpeg")
|
||||||
|
|
||||||
|
def get_supported_extensions(self) -> list[str]:
|
||||||
|
"""Get list of supported image extensions."""
|
||||||
|
return ImageParser.SUPPORTED_EXTENSIONS
|
||||||
|
|
@ -0,0 +1,771 @@
|
||||||
|
"""
|
||||||
|
Markdown intelligent chunker with structure-aware splitting.
|
||||||
|
Supports headers, code blocks, tables, lists, and preserves context.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class MarkdownElementType(Enum):
|
||||||
|
"""Types of Markdown elements."""
|
||||||
|
HEADER = "header"
|
||||||
|
PARAGRAPH = "paragraph"
|
||||||
|
CODE_BLOCK = "code_block"
|
||||||
|
INLINE_CODE = "inline_code"
|
||||||
|
TABLE = "table"
|
||||||
|
LIST = "list"
|
||||||
|
BLOCKQUOTE = "blockquote"
|
||||||
|
HORIZONTAL_RULE = "horizontal_rule"
|
||||||
|
IMAGE = "image"
|
||||||
|
LINK = "link"
|
||||||
|
TEXT = "text"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MarkdownElement:
|
||||||
|
"""Represents a parsed Markdown element."""
|
||||||
|
type: MarkdownElementType
|
||||||
|
content: str
|
||||||
|
level: int = 0
|
||||||
|
language: str = ""
|
||||||
|
metadata: dict[str, Any] = field(default_factory=dict)
|
||||||
|
line_start: int = 0
|
||||||
|
line_end: int = 0
|
||||||
|
|
||||||
|
def to_dict(self) -> dict[str, Any]:
|
||||||
|
return {
|
||||||
|
"type": self.type.value,
|
||||||
|
"content": self.content,
|
||||||
|
"level": self.level,
|
||||||
|
"language": self.language,
|
||||||
|
"metadata": self.metadata,
|
||||||
|
"line_start": self.line_start,
|
||||||
|
"line_end": self.line_end,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MarkdownChunk:
|
||||||
|
"""Represents a chunk of Markdown content with context."""
|
||||||
|
chunk_id: str
|
||||||
|
content: str
|
||||||
|
element_type: MarkdownElementType
|
||||||
|
header_context: list[str]
|
||||||
|
level: int = 0
|
||||||
|
language: str = ""
|
||||||
|
metadata: dict[str, Any] = field(default_factory=dict)
|
||||||
|
|
||||||
|
def to_dict(self) -> dict[str, Any]:
|
||||||
|
return {
|
||||||
|
"chunk_id": self.chunk_id,
|
||||||
|
"content": self.content,
|
||||||
|
"element_type": self.element_type.value,
|
||||||
|
"header_context": self.header_context,
|
||||||
|
"level": self.level,
|
||||||
|
"language": self.language,
|
||||||
|
"metadata": self.metadata,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class MarkdownParser:
|
||||||
|
"""
|
||||||
|
Parser for Markdown documents.
|
||||||
|
Extracts structured elements from Markdown text.
|
||||||
|
"""
|
||||||
|
|
||||||
|
HEADER_PATTERN = re.compile(r'^(#{1,6})\s+(.+?)(?:\s+#+)?$', re.MULTILINE)
|
||||||
|
CODE_BLOCK_PATTERN = re.compile(r'^```(\w*)\n(.*?)^```', re.MULTILINE | re.DOTALL)
|
||||||
|
TABLE_PATTERN = re.compile(r'^(\|.+\|)\n(\|[-:\s|]+\|)\n((?:\|.+\|\n?)+)', re.MULTILINE)
|
||||||
|
LIST_PATTERN = re.compile(r'^([ \t]*[-*+]|\d+\.)\s+(.+)$', re.MULTILINE)
|
||||||
|
BLOCKQUOTE_PATTERN = re.compile(r'^>\s*(.+)$', re.MULTILINE)
|
||||||
|
HR_PATTERN = re.compile(r'^[-*_]{3,}\s*$', re.MULTILINE)
|
||||||
|
IMAGE_PATTERN = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
|
||||||
|
LINK_PATTERN = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
|
||||||
|
INLINE_CODE_PATTERN = re.compile(r'`([^`]+)`')
|
||||||
|
|
||||||
|
def parse(self, text: str) -> list[MarkdownElement]:
|
||||||
|
"""
|
||||||
|
Parse Markdown text into structured elements.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: Raw Markdown text
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of MarkdownElement objects
|
||||||
|
"""
|
||||||
|
elements = []
|
||||||
|
lines = text.split('\n')
|
||||||
|
current_pos = 0
|
||||||
|
|
||||||
|
code_block_ranges = self._extract_code_blocks(text, lines, elements)
|
||||||
|
table_ranges = self._extract_tables(text, lines, elements)
|
||||||
|
protected_ranges = code_block_ranges + table_ranges
|
||||||
|
|
||||||
|
self._extract_headers(lines, elements, protected_ranges)
|
||||||
|
self._extract_lists(lines, elements, protected_ranges)
|
||||||
|
self._extract_blockquotes(lines, elements, protected_ranges)
|
||||||
|
self._extract_horizontal_rules(lines, elements, protected_ranges)
|
||||||
|
|
||||||
|
self._fill_paragraphs(lines, elements, protected_ranges)
|
||||||
|
|
||||||
|
elements.sort(key=lambda e: e.line_start)
|
||||||
|
|
||||||
|
return elements
|
||||||
|
|
||||||
|
def _extract_code_blocks(
|
||||||
|
self,
|
||||||
|
text: str,
|
||||||
|
lines: list[str],
|
||||||
|
elements: list[MarkdownElement],
|
||||||
|
) -> list[tuple[int, int]]:
|
||||||
|
"""Extract code blocks with language info."""
|
||||||
|
ranges = []
|
||||||
|
in_code_block = False
|
||||||
|
code_start = 0
|
||||||
|
language = ""
|
||||||
|
code_content = []
|
||||||
|
|
||||||
|
for i, line in enumerate(lines):
|
||||||
|
if line.strip().startswith('```'):
|
||||||
|
if not in_code_block:
|
||||||
|
in_code_block = True
|
||||||
|
code_start = i
|
||||||
|
language = line.strip()[3:].strip()
|
||||||
|
code_content = []
|
||||||
|
else:
|
||||||
|
in_code_block = False
|
||||||
|
elements.append(MarkdownElement(
|
||||||
|
type=MarkdownElementType.CODE_BLOCK,
|
||||||
|
content='\n'.join(code_content),
|
||||||
|
language=language,
|
||||||
|
line_start=code_start,
|
||||||
|
line_end=i,
|
||||||
|
metadata={"language": language},
|
||||||
|
))
|
||||||
|
ranges.append((code_start, i))
|
||||||
|
|
||||||
|
elif in_code_block:
|
||||||
|
code_content.append(line)
|
||||||
|
|
||||||
|
return ranges
|
||||||
|
|
||||||
|
def _extract_tables(
|
||||||
|
self,
|
||||||
|
text: str,
|
||||||
|
lines: list[str],
|
||||||
|
elements: list[MarkdownElement],
|
||||||
|
) -> list[tuple[int, int]]:
|
||||||
|
"""Extract Markdown tables."""
|
||||||
|
ranges = []
|
||||||
|
i = 0
|
||||||
|
|
||||||
|
while i < len(lines):
|
||||||
|
line = lines[i]
|
||||||
|
|
||||||
|
if '|' in line and i + 1 < len(lines):
|
||||||
|
next_line = lines[i + 1]
|
||||||
|
if '|' in next_line and re.match(r'^[\|\-\:\s]+$', next_line.strip()):
|
||||||
|
table_lines = [line, next_line]
|
||||||
|
j = i + 2
|
||||||
|
|
||||||
|
while j < len(lines) and '|' in lines[j]:
|
||||||
|
table_lines.append(lines[j])
|
||||||
|
j += 1
|
||||||
|
|
||||||
|
table_content = '\n'.join(table_lines)
|
||||||
|
headers = [h.strip() for h in line.split('|') if h.strip()]
|
||||||
|
row_count = len(table_lines) - 2
|
||||||
|
|
||||||
|
elements.append(MarkdownElement(
|
||||||
|
type=MarkdownElementType.TABLE,
|
||||||
|
content=table_content,
|
||||||
|
line_start=i,
|
||||||
|
line_end=j - 1,
|
||||||
|
metadata={
|
||||||
|
"headers": headers,
|
||||||
|
"row_count": row_count,
|
||||||
|
},
|
||||||
|
))
|
||||||
|
ranges.append((i, j - 1))
|
||||||
|
i = j
|
||||||
|
continue
|
||||||
|
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
return ranges
|
||||||
|
|
||||||
|
def _is_in_protected_range(self, line_num: int, ranges: list[tuple[int, int]]) -> bool:
|
||||||
|
"""Check if a line is within a protected range."""
|
||||||
|
for start, end in ranges:
|
||||||
|
if start <= line_num <= end:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _extract_headers(
|
||||||
|
self,
|
||||||
|
lines: list[str],
|
||||||
|
elements: list[MarkdownElement],
|
||||||
|
protected_ranges: list[tuple[int, int]],
|
||||||
|
) -> None:
|
||||||
|
"""Extract headers with level info."""
|
||||||
|
for i, line in enumerate(lines):
|
||||||
|
if self._is_in_protected_range(i, protected_ranges):
|
||||||
|
continue
|
||||||
|
|
||||||
|
match = self.HEADER_PATTERN.match(line)
|
||||||
|
if match:
|
||||||
|
level = len(match.group(1))
|
||||||
|
title = match.group(2).strip()
|
||||||
|
|
||||||
|
elements.append(MarkdownElement(
|
||||||
|
type=MarkdownElementType.HEADER,
|
||||||
|
content=title,
|
||||||
|
level=level,
|
||||||
|
line_start=i,
|
||||||
|
line_end=i,
|
||||||
|
metadata={"level": level},
|
||||||
|
))
|
||||||
|
|
||||||
|
def _extract_lists(
|
||||||
|
self,
|
||||||
|
lines: list[str],
|
||||||
|
elements: list[MarkdownElement],
|
||||||
|
protected_ranges: list[tuple[int, int]],
|
||||||
|
) -> None:
|
||||||
|
"""Extract list items."""
|
||||||
|
in_list = False
|
||||||
|
list_start = 0
|
||||||
|
list_items = []
|
||||||
|
list_indent = 0
|
||||||
|
|
||||||
|
for i, line in enumerate(lines):
|
||||||
|
if self._is_in_protected_range(i, protected_ranges):
|
||||||
|
if in_list:
|
||||||
|
self._save_list(elements, list_start, i - 1, list_items)
|
||||||
|
in_list = False
|
||||||
|
list_items = []
|
||||||
|
continue
|
||||||
|
|
||||||
|
match = self.LIST_PATTERN.match(line)
|
||||||
|
if match:
|
||||||
|
indent = len(line) - len(line.lstrip())
|
||||||
|
item_content = match.group(2)
|
||||||
|
|
||||||
|
if not in_list:
|
||||||
|
in_list = True
|
||||||
|
list_start = i
|
||||||
|
list_indent = indent
|
||||||
|
list_items = [(indent, item_content)]
|
||||||
|
else:
|
||||||
|
list_items.append((indent, item_content))
|
||||||
|
else:
|
||||||
|
if in_list:
|
||||||
|
if line.strip() == '':
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
self._save_list(elements, list_start, i - 1, list_items)
|
||||||
|
in_list = False
|
||||||
|
list_items = []
|
||||||
|
|
||||||
|
if in_list:
|
||||||
|
self._save_list(elements, list_start, len(lines) - 1, list_items)
|
||||||
|
|
||||||
|
def _save_list(
|
||||||
|
self,
|
||||||
|
elements: list[MarkdownElement],
|
||||||
|
start: int,
|
||||||
|
end: int,
|
||||||
|
items: list[tuple[int, str]],
|
||||||
|
) -> None:
|
||||||
|
"""Save a list element."""
|
||||||
|
if not items:
|
||||||
|
return
|
||||||
|
|
||||||
|
content = '\n'.join([item[1] for item in items])
|
||||||
|
elements.append(MarkdownElement(
|
||||||
|
type=MarkdownElementType.LIST,
|
||||||
|
content=content,
|
||||||
|
line_start=start,
|
||||||
|
line_end=end,
|
||||||
|
metadata={
|
||||||
|
"item_count": len(items),
|
||||||
|
"is_ordered": False,
|
||||||
|
},
|
||||||
|
))
|
||||||
|
|
||||||
|
def _extract_blockquotes(
|
||||||
|
self,
|
||||||
|
lines: list[str],
|
||||||
|
elements: list[MarkdownElement],
|
||||||
|
protected_ranges: list[tuple[int, int]],
|
||||||
|
) -> None:
|
||||||
|
"""Extract blockquotes."""
|
||||||
|
in_quote = False
|
||||||
|
quote_start = 0
|
||||||
|
quote_lines = []
|
||||||
|
|
||||||
|
for i, line in enumerate(lines):
|
||||||
|
if self._is_in_protected_range(i, protected_ranges):
|
||||||
|
if in_quote:
|
||||||
|
self._save_blockquote(elements, quote_start, i - 1, quote_lines)
|
||||||
|
in_quote = False
|
||||||
|
quote_lines = []
|
||||||
|
continue
|
||||||
|
|
||||||
|
match = self.BLOCKQUOTE_PATTERN.match(line)
|
||||||
|
if match:
|
||||||
|
if not in_quote:
|
||||||
|
in_quote = True
|
||||||
|
quote_start = i
|
||||||
|
quote_lines.append(match.group(1))
|
||||||
|
else:
|
||||||
|
if in_quote:
|
||||||
|
self._save_blockquote(elements, quote_start, i - 1, quote_lines)
|
||||||
|
in_quote = False
|
||||||
|
quote_lines = []
|
||||||
|
|
||||||
|
if in_quote:
|
||||||
|
self._save_blockquote(elements, quote_start, len(lines) - 1, quote_lines)
|
||||||
|
|
||||||
|
def _save_blockquote(
    self,
    elements: list[MarkdownElement],
    start: int,
    end: int,
    lines: list[str],
) -> None:
    """Append a BLOCKQUOTE element for the stripped quote lines, if any."""
    if lines:
        elements.append(
            MarkdownElement(
                type=MarkdownElementType.BLOCKQUOTE,
                content='\n'.join(lines),
                line_start=start,
                line_end=end,
            )
        )
|
||||||
|
|
||||||
|
def _extract_horizontal_rules(
    self,
    lines: list[str],
    elements: list[MarkdownElement],
    protected_ranges: list[tuple[int, int]],
) -> None:
    """Append a single-line HORIZONTAL_RULE element for every rule line
    that falls outside the protected ranges."""
    for idx, raw in enumerate(lines):
        if self._is_in_protected_range(idx, protected_ranges):
            continue
        if not self.HR_PATTERN.match(raw):
            continue
        elements.append(
            MarkdownElement(
                type=MarkdownElementType.HORIZONTAL_RULE,
                content=raw,
                line_start=idx,
                line_end=idx,
            )
        )
|
||||||
|
|
||||||
|
def _fill_paragraphs(
    self,
    lines: list[str],
    elements: list[MarkdownElement],
    protected_ranges: list[tuple[int, int]],
) -> None:
    """Turn every line not yet claimed by another element into PARAGRAPH
    elements, splitting runs on blank lines."""
    # Mark every line already owned by a protected range or an element.
    claimed: set[int] = set()
    for lo, hi in protected_ranges:
        claimed.update(range(lo, hi + 1))
    for elem in elements:
        claimed.update(range(elem.line_start, elem.line_end + 1))

    total = len(lines)
    idx = 0
    while idx < total:
        # Skip claimed lines and blank separators.
        if idx in claimed or not lines[idx].strip():
            idx += 1
            continue

        first = idx
        body: list[str] = []
        while idx < total and idx not in claimed and lines[idx].strip():
            body.append(lines[idx])
            claimed.add(idx)
            idx += 1

        if body:
            elements.append(
                MarkdownElement(
                    type=MarkdownElementType.PARAGRAPH,
                    content='\n'.join(body),
                    line_start=first,
                    line_end=idx - 1,
                )
            )
|
||||||
|
|
||||||
|
|
||||||
|
class MarkdownChunker:
    """
    Intelligent chunker for Markdown documents.

    Features:
    - Structure-aware splitting (headers, code blocks, tables, lists)
    - Context preservation (header hierarchy)
    - Configurable chunk size and overlap
    - Metadata extraction
    """

    def __init__(
        self,
        max_chunk_size: int = 1000,
        min_chunk_size: int = 100,
        chunk_overlap: int = 50,
        preserve_code_blocks: bool = True,
        preserve_tables: bool = True,
        preserve_lists: bool = True,
        include_header_context: bool = True,
    ):
        """
        Args:
            max_chunk_size: Maximum chunk size in characters; larger
                elements are split by the ``_split_*`` strategies.
            min_chunk_size: Minimum chunk size in characters.
                NOTE(review): stored but not enforced by any split strategy
                in this class — confirm whether it is still needed.
            chunk_overlap: Desired overlap between adjacent chunks.
                NOTE(review): stored but never applied — confirm intent.
            preserve_code_blocks: Keep code blocks whole when they fit.
            preserve_tables: Keep tables whole when they fit.
            preserve_lists: Keep lists whole when they fit.
            include_header_context: Record the enclosing header hierarchy
                on every chunk.
        """
        self._max_chunk_size = max_chunk_size
        self._min_chunk_size = min_chunk_size
        self._chunk_overlap = chunk_overlap
        self._preserve_code_blocks = preserve_code_blocks
        self._preserve_tables = preserve_tables
        self._preserve_lists = preserve_lists
        self._include_header_context = include_header_context
        self._parser = MarkdownParser()

    def chunk(self, text: str, doc_id: str = "") -> list[MarkdownChunk]:
        """
        Chunk Markdown text into structured segments.

        Headers are not emitted as chunks; they maintain the header-context
        stack attached to subsequent chunks. Horizontal rules are dropped.

        Args:
            text: Raw Markdown text
            doc_id: Optional document ID for chunk IDs

        Returns:
            List of MarkdownChunk objects
        """
        elements = self._parser.parse(text)
        chunks: list[MarkdownChunk] = []
        header_stack: list[str] = []
        chunk_index = 0

        for elem in elements:
            if elem.type == MarkdownElementType.HEADER:
                # Pop headers at the same or deeper level, then push this one.
                # Guarding on the stack itself (not only inside the loop)
                # prevents an infinite loop if a non-positive level ever
                # reaches this point with an empty stack.
                level = elem.level
                while header_stack and len(header_stack) >= level:
                    header_stack.pop()
                header_stack.append(elem.content)
                continue

            if elem.type == MarkdownElementType.HORIZONTAL_RULE:
                # Purely presentational; carries no content.
                continue

            chunk_content = self._format_element_content(elem)
            if not chunk_content:
                continue

            chunk_id = f"{doc_id}_chunk_{chunk_index}" if doc_id else f"chunk_{chunk_index}"

            header_context: list[str] = []
            if self._include_header_context:
                header_context = header_stack.copy()

            if len(chunk_content) > self._max_chunk_size:
                sub_chunks = self._split_large_element(
                    elem,
                    chunk_id,
                    header_context,
                    chunk_index,
                )
                chunks.extend(sub_chunks)
                chunk_index += len(sub_chunks)
            else:
                chunks.append(MarkdownChunk(
                    chunk_id=chunk_id,
                    content=chunk_content,
                    element_type=elem.type,
                    header_context=header_context,
                    level=elem.level,
                    language=elem.language,
                    metadata=elem.metadata,
                ))
                chunk_index += 1

        return chunks

    def _format_element_content(self, elem: MarkdownElement) -> str:
        """Render an element back to Markdown text for embedding in a chunk."""
        if elem.type == MarkdownElementType.CODE_BLOCK:
            # The parser stores the bare code body; restore the fence.
            lang = elem.language or ""
            return f"```{lang}\n{elem.content}\n```"

        if elem.type == MarkdownElementType.BLOCKQUOTE:
            # The parser strips the "> " markers; restore them per line.
            lines = elem.content.split('\n')
            return '\n'.join([f"> {line}" for line in lines])

        # Tables, lists, paragraphs and anything else pass through verbatim.
        return elem.content

    def _split_large_element(
        self,
        elem: MarkdownElement,
        base_id: str,
        header_context: list[str],
        start_index: int,
    ) -> list[MarkdownChunk]:
        """Split an over-sized element with the strategy matching its type."""
        if elem.type == MarkdownElementType.CODE_BLOCK:
            chunks = self._split_code_block(elem, base_id, header_context, start_index)
        elif elem.type == MarkdownElementType.TABLE:
            chunks = self._split_table(elem, base_id, header_context, start_index)
        elif elem.type == MarkdownElementType.LIST:
            chunks = self._split_list(elem, base_id, header_context, start_index)
        else:
            chunks = self._split_text(elem, base_id, header_context, start_index)

        return chunks

    def _split_code_block(
        self,
        elem: MarkdownElement,
        base_id: str,
        header_context: list[str],
        start_index: int,
    ) -> list[MarkdownChunk]:
        """Split a code block line-wise; every part is re-fenced so it
        remains valid Markdown."""
        chunks: list[MarkdownChunk] = []
        # BUG FIX: elem.language may be None; interpolating it directly
        # produced a literal "```None" fence. Normalize to "" the same way
        # _format_element_content does.
        lang = elem.language or ""
        lines = elem.content.split('\n')
        current_lines: list[str] = []
        current_size = 0
        sub_index = 0

        for line in lines:
            # Flush before this line would push the part over the limit.
            if current_size + len(line) + 1 > self._max_chunk_size and current_lines:
                chunk_content = f"```{lang}\n" + '\n'.join(current_lines) + "\n```"
                chunks.append(MarkdownChunk(
                    chunk_id=f"{base_id}_{sub_index}",
                    content=chunk_content,
                    element_type=MarkdownElementType.CODE_BLOCK,
                    header_context=header_context,
                    language=elem.language,
                    metadata={**elem.metadata, "is_partial": True, "part": sub_index + 1},
                ))
                sub_index += 1
                current_lines = []
                current_size = 0

            current_lines.append(line)
            current_size += len(line) + 1

        if current_lines:
            chunk_content = f"```{lang}\n" + '\n'.join(current_lines) + "\n```"
            chunks.append(MarkdownChunk(
                chunk_id=f"{base_id}_{sub_index}",
                content=chunk_content,
                element_type=MarkdownElementType.CODE_BLOCK,
                header_context=header_context,
                language=elem.language,
                metadata={**elem.metadata, "is_partial": sub_index > 0, "part": sub_index + 1},
            ))

        return chunks

    def _split_table(
        self,
        elem: MarkdownElement,
        base_id: str,
        header_context: list[str],
        start_index: int,
    ) -> list[MarkdownChunk]:
        """Split a table row-wise; the header and separator rows are
        repeated at the top of every part."""
        chunks: list[MarkdownChunk] = []
        lines = elem.content.split('\n')

        # Degenerate table (no separator row): emit as-is.
        if len(lines) < 2:
            return [MarkdownChunk(
                chunk_id=f"{base_id}_0",
                content=elem.content,
                element_type=MarkdownElementType.TABLE,
                header_context=header_context,
                metadata=elem.metadata,
            )]

        header_line = lines[0]
        separator_line = lines[1]
        data_lines = lines[2:]

        current_lines = [header_line, separator_line]
        current_size = len(header_line) + len(separator_line) + 2
        sub_index = 0

        for line in data_lines:
            # Flush only when the part already holds at least one data row.
            if current_size + len(line) + 1 > self._max_chunk_size and len(current_lines) > 2:
                chunks.append(MarkdownChunk(
                    chunk_id=f"{base_id}_{sub_index}",
                    content='\n'.join(current_lines),
                    element_type=MarkdownElementType.TABLE,
                    header_context=header_context,
                    metadata={**elem.metadata, "is_partial": True, "part": sub_index + 1},
                ))
                sub_index += 1
                current_lines = [header_line, separator_line]
                current_size = len(header_line) + len(separator_line) + 2

            current_lines.append(line)
            current_size += len(line) + 1

        if len(current_lines) > 2:
            chunks.append(MarkdownChunk(
                chunk_id=f"{base_id}_{sub_index}",
                content='\n'.join(current_lines),
                element_type=MarkdownElementType.TABLE,
                header_context=header_context,
                metadata={**elem.metadata, "is_partial": sub_index > 0, "part": sub_index + 1},
            ))

        return chunks

    def _split_list(
        self,
        elem: MarkdownElement,
        base_id: str,
        header_context: list[str],
        start_index: int,
    ) -> list[MarkdownChunk]:
        """Split a list item-wise into size-bounded chunks."""
        chunks: list[MarkdownChunk] = []
        items = elem.content.split('\n')
        current_items: list[str] = []
        current_size = 0
        sub_index = 0

        for item in items:
            if current_size + len(item) + 1 > self._max_chunk_size and current_items:
                chunks.append(MarkdownChunk(
                    chunk_id=f"{base_id}_{sub_index}",
                    content='\n'.join(current_items),
                    element_type=MarkdownElementType.LIST,
                    header_context=header_context,
                    metadata={**elem.metadata, "is_partial": True, "part": sub_index + 1},
                ))
                sub_index += 1
                current_items = []
                current_size = 0

            current_items.append(item)
            current_size += len(item) + 1

        if current_items:
            chunks.append(MarkdownChunk(
                chunk_id=f"{base_id}_{sub_index}",
                content='\n'.join(current_items),
                element_type=MarkdownElementType.LIST,
                header_context=header_context,
                metadata={**elem.metadata, "is_partial": sub_index > 0, "part": sub_index + 1},
            ))

        return chunks

    def _split_text(
        self,
        elem: MarkdownElement,
        base_id: str,
        header_context: list[str],
        start_index: int,
    ) -> list[MarkdownChunk]:
        """Split text content at blank-line paragraph boundaries."""
        chunks: list[MarkdownChunk] = []
        text = elem.content
        sub_index = 0

        paragraphs = text.split('\n\n')

        current_content = ""
        current_size = 0

        for para in paragraphs:
            if current_size + len(para) + 2 > self._max_chunk_size and current_content:
                chunks.append(MarkdownChunk(
                    chunk_id=f"{base_id}_{sub_index}",
                    content=current_content.strip(),
                    element_type=elem.type,
                    header_context=header_context,
                    metadata={**elem.metadata, "is_partial": True, "part": sub_index + 1},
                ))
                sub_index += 1
                current_content = ""
                current_size = 0

            current_content += para + "\n\n"
            current_size += len(para) + 2

        if current_content.strip():
            chunks.append(MarkdownChunk(
                chunk_id=f"{base_id}_{sub_index}",
                content=current_content.strip(),
                element_type=elem.type,
                header_context=header_context,
                metadata={**elem.metadata, "is_partial": sub_index > 0, "part": sub_index + 1},
            ))

        return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_markdown(
    text: str,
    doc_id: str = "",
    max_chunk_size: int = 1000,
    min_chunk_size: int = 100,
    preserve_code_blocks: bool = True,
    preserve_tables: bool = True,
    preserve_lists: bool = True,
    include_header_context: bool = True,
) -> list[dict[str, Any]]:
    """
    Convenience function to chunk Markdown text.

    Builds a one-shot :class:`MarkdownChunker` with the given settings and
    returns the resulting chunks serialized to plain dictionaries.

    Args:
        text: Raw Markdown text
        doc_id: Optional document ID
        max_chunk_size: Maximum chunk size in characters
        min_chunk_size: Minimum chunk size in characters
        preserve_code_blocks: Whether to preserve code blocks
        preserve_tables: Whether to preserve tables
        preserve_lists: Whether to preserve lists
        include_header_context: Whether to include header context

    Returns:
        List of chunk dictionaries
    """
    options = {
        "max_chunk_size": max_chunk_size,
        "min_chunk_size": min_chunk_size,
        "preserve_code_blocks": preserve_code_blocks,
        "preserve_tables": preserve_tables,
        "preserve_lists": preserve_lists,
        "include_header_context": include_header_context,
    }
    chunker = MarkdownChunker(**options)
    return [chunk.to_dict() for chunk in chunker.chunk(text, doc_id)]
|
||||||
|
|
@ -0,0 +1,178 @@
|
||||||
|
"""
|
||||||
|
Markdown parser with intelligent chunking.
|
||||||
|
[AC-AISVC-33] Markdown file parsing with structure-aware chunking.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from app.services.document.base import (
|
||||||
|
DocumentParseException,
|
||||||
|
DocumentParser,
|
||||||
|
ParseResult,
|
||||||
|
)
|
||||||
|
from app.services.document.markdown_chunker import (
|
||||||
|
MarkdownChunker,
|
||||||
|
MarkdownElementType,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
ENCODINGS_TO_TRY = ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]
|
||||||
|
|
||||||
|
|
||||||
|
class MarkdownParser(DocumentParser):
    """
    Parser for Markdown files with intelligent chunking.
    [AC-AISVC-33] Structure-aware parsing for Markdown documents.

    Features:
    - Header hierarchy extraction
    - Code block preservation
    - Table structure preservation
    - List grouping
    - Context-aware chunking
    """

    def __init__(
        self,
        encoding: str = "utf-8",
        max_chunk_size: int = 1000,
        min_chunk_size: int = 100,
        preserve_code_blocks: bool = True,
        preserve_tables: bool = True,
        preserve_lists: bool = True,
        include_header_context: bool = True,
        **kwargs: Any,
    ):
        """Store the configuration and build the underlying chunker.

        Unknown keyword arguments are kept in ``_extra_config`` for
        forward compatibility.
        """
        # NOTE(review): `encoding` is stored but _try_encodings always walks
        # ENCODINGS_TO_TRY instead of honoring it — confirm intended.
        self._encoding = encoding
        self._max_chunk_size = max_chunk_size
        self._min_chunk_size = min_chunk_size
        self._preserve_code_blocks = preserve_code_blocks
        self._preserve_tables = preserve_tables
        self._preserve_lists = preserve_lists
        self._include_header_context = include_header_context
        self._extra_config = kwargs

        self._chunker = MarkdownChunker(
            max_chunk_size=max_chunk_size,
            min_chunk_size=min_chunk_size,
            preserve_code_blocks=preserve_code_blocks,
            preserve_tables=preserve_tables,
            preserve_lists=preserve_lists,
            include_header_context=include_header_context,
        )

    def _try_encodings(self, path: Path) -> tuple[str, str]:
        """
        Try multiple encodings to read the file.
        Returns: (text, encoding_used)
        """
        for candidate in ENCODINGS_TO_TRY:
            try:
                text = path.read_text(encoding=candidate)
            except (UnicodeDecodeError, LookupError):
                continue
            logger.info(f"Successfully parsed Markdown with encoding: {candidate}")
            return text, candidate

        # Practically unreachable while "latin-1" (which accepts any byte
        # sequence) remains in ENCODINGS_TO_TRY; kept as a safety net.
        raise DocumentParseException(
            "Failed to decode Markdown file with any known encoding",
            file_path=str(path),
            parser="markdown"
        )

    def parse(self, file_path: str | Path) -> ParseResult:
        """
        Parse a Markdown file and extract structured content.
        [AC-AISVC-33] Structure-aware parsing.
        """
        path = Path(file_path)

        if not path.exists():
            raise DocumentParseException(
                f"File not found: {path}",
                file_path=str(path),
                parser="markdown"
            )

        try:
            text, encoding_used = self._try_encodings(path)

            file_size = path.stat().st_size
            line_count = text.count("\n") + 1

            chunks = self._chunker.chunk(text, doc_id=path.stem)

            # Tally chunks per element type in a single pass.
            # NOTE(review): MarkdownChunker.chunk() folds headers into
            # context rather than emitting HEADER chunks, so header_count
            # is likely always 0 here — confirm intended.
            tally: dict[Any, int] = {}
            for c in chunks:
                tally[c.element_type] = tally.get(c.element_type, 0) + 1

            header_count = tally.get(MarkdownElementType.HEADER, 0)
            code_block_count = tally.get(MarkdownElementType.CODE_BLOCK, 0)
            table_count = tally.get(MarkdownElementType.TABLE, 0)
            list_count = tally.get(MarkdownElementType.LIST, 0)

            logger.info(
                f"Parsed Markdown: {path.name}, lines={line_count}, "
                f"chars={len(text)}, chunks={len(chunks)}, "
                f"headers={header_count}, code_blocks={code_block_count}, "
                f"tables={table_count}, lists={list_count}"
            )

            return ParseResult(
                text=text,
                source_path=str(path),
                file_size=file_size,
                metadata={
                    "format": "markdown",
                    "line_count": line_count,
                    "encoding": encoding_used,
                    "chunk_count": len(chunks),
                    "structure": {
                        "headers": header_count,
                        "code_blocks": code_block_count,
                        "tables": table_count,
                        "lists": list_count,
                    },
                    "chunks": [chunk.to_dict() for chunk in chunks],
                }
            )

        except DocumentParseException:
            # Already carries parser context; propagate unchanged.
            raise
        except Exception as e:
            raise DocumentParseException(
                f"Failed to parse Markdown file: {e}",
                file_path=str(path),
                parser="markdown",
                details={"error": str(e)}
            )

    def get_supported_extensions(self) -> list[str]:
        """Get supported file extensions."""
        return [".md", ".markdown"]

    def get_chunks(self, text: str, doc_id: str = "") -> list[dict[str, Any]]:
        """
        Get structured chunks from Markdown text.

        Args:
            text: Markdown text content
            doc_id: Optional document ID

        Returns:
            List of chunk dictionaries
        """
        return [chunk.to_dict() for chunk in self._chunker.chunk(text, doc_id)]
|
||||||
Loading…
Reference in New Issue