491 lines
16 KiB
Python
491 lines
16 KiB
Python
|
|
"""
|
|||
|
|
Image parser using multimodal LLM.
|
|||
|
|
Supports parsing images into structured text content for knowledge base indexing.
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import asyncio
|
|||
|
|
import base64
|
|||
|
|
import json
|
|||
|
|
import logging
|
|||
|
|
from dataclasses import dataclass, field
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Any
|
|||
|
|
|
|||
|
|
from app.services.document.base import (
|
|||
|
|
DocumentParseException,
|
|||
|
|
DocumentParser,
|
|||
|
|
PageText,
|
|||
|
|
ParseResult,
|
|||
|
|
)
|
|||
|
|
from app.services.llm.factory import LLMUsageType, get_llm_config_manager
|
|||
|
|
|
|||
|
|
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
IMAGE_SYSTEM_PROMPT = """你是一个专业的图像内容分析助手。你的任务是分析图片内容,并将其智能拆分为适合知识库检索的独立数据块。
|
|||
|
|
|
|||
|
|
## 分析要求
|
|||
|
|
1. 仔细分析图片内容,识别其中的文字、图表、数据等信息
|
|||
|
|
2. 根据内容的逻辑结构,智能判断如何拆分为独立的知识条目
|
|||
|
|
3. 每个条目应该是独立、完整、可检索的知识单元
|
|||
|
|
|
|||
|
|
## 输出格式
|
|||
|
|
请严格按照以下 JSON 格式输出,不要添加任何其他内容:
|
|||
|
|
|
|||
|
|
```json
|
|||
|
|
{
|
|||
|
|
"image_summary": "图片整体概述(一句话描述图片主题)",
|
|||
|
|
"total_chunks": <分块总数>,
|
|||
|
|
"chunks": [
|
|||
|
|
{
|
|||
|
|
"chunk_index": 0,
|
|||
|
|
"content": "该分块的完整内容文字",
|
|||
|
|
"chunk_type": "text|table|list|diagram|chart|mixed",
|
|||
|
|
"keywords": ["关键词1", "关键词2"]
|
|||
|
|
}
|
|||
|
|
]
|
|||
|
|
}
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
## 分块策略
|
|||
|
|
- **单一内容**: 如果图片只有一段完整的文字/信息,可以只输出1个分块
|
|||
|
|
- **多段落内容**: 按段落或逻辑单元拆分,每个段落作为独立分块
|
|||
|
|
- **表格数据**: 将表格内容转换为结构化文字,作为一个分块
|
|||
|
|
- **图表数据**: 描述图表内容和数据,作为一个分块
|
|||
|
|
- **列表内容**: 每个列表项可作为独立分块,或合并为相关的一组
|
|||
|
|
- **混合内容**: 根据内容类型分别处理,确保每个分块主题明确
|
|||
|
|
|
|||
|
|
## 注意事项
|
|||
|
|
1. 每个分块的 content 必须是完整、可独立理解的文字
|
|||
|
|
2. chunk_type 用于标识内容类型,便于后续处理
|
|||
|
|
3. keywords 提取该分块的核心关键词,便于检索
|
|||
|
|
4. 确保输出的 JSON 格式正确,可以被解析"""
|
|||
|
|
|
|||
|
|
# User-turn prompt (Chinese) asking the model to analyze the attached image
# and emit the chunked JSON described in IMAGE_SYSTEM_PROMPT. Runtime string.
IMAGE_USER_PROMPT = "请分析这张图片,按照要求的 JSON 格式输出分块结果。"
|
|||
|
|
|
|||
|
|
|
|||
|
|
@dataclass
class ImageChunk:
    """One intelligently-split chunk of content extracted from an image."""
    # Zero-based position of this chunk within the parsed image.
    chunk_index: int
    # Complete, self-contained text of the chunk (usable on its own for retrieval).
    content: str
    # Content category reported by the LLM: text|table|list|diagram|chart|mixed.
    chunk_type: str = "text"
    # Core keywords of this chunk, used to aid retrieval.
    keywords: list[str] = field(default_factory=list)
|
|||
|
|
|
|||
|
|
|
|||
|
|
@dataclass
class ImageParseResult:
    """Full result of parsing one image, including intelligent chunks."""
    # One-sentence summary of the whole image, as produced by the LLM.
    image_summary: str
    # Ordered list of extracted content chunks.
    chunks: list[ImageChunk]
    # All chunk contents joined with blank lines (the flat document text).
    raw_text: str
    # Filesystem path of the source image (filled in by the parser).
    source_path: str
    # Size of the image file in bytes.
    file_size: int
    # Extra parser metadata (format, parser name, MIME type, ...).
    metadata: dict[str, Any] = field(default_factory=dict)
|
|||
|
|
|
|||
|
|
|
|||
|
|
class ImageParser(DocumentParser):
    """
    Image parser using multimodal LLM.

    Supports common image formats and extracts text content using
    vision-capable LLM models (GPT-4V, GPT-4o, etc.).

    Features:
    - Intelligent chunking based on content structure
    - Structured output with keywords and chunk types
    - Support for various content types (text, table, chart, etc.)
    """

    SUPPORTED_EXTENSIONS = [
        ".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif"
    ]

    def __init__(
        self,
        model: str | None = None,
        max_tokens: int = 4096,
        timeout_seconds: int = 120,
    ):
        """
        Args:
            model: Explicit vision model name. When None, the model from the
                KB-processing LLM config is used (default "gpt-4o-mini").
            max_tokens: Maximum number of tokens the LLM may generate.
            timeout_seconds: Timeout for the LLM request.
        """
        self._model = model
        self._max_tokens = max_tokens
        self._timeout_seconds = timeout_seconds

    # ------------------------------------------------------------------
    # Shared helpers (validation / IO / result shaping), used by all three
    # public entry points so the logic lives in exactly one place.
    # ------------------------------------------------------------------

    def _validate_image_path(self, file_path: str | Path) -> tuple[Path, int, str]:
        """Validate existence and format; return (path, file_size, extension).

        Raises:
            DocumentParseException: If the file is missing or the extension
                is not in SUPPORTED_EXTENSIONS.
        """
        path = Path(file_path)
        if not path.exists():
            raise DocumentParseException(
                f"Image file not found: {file_path}",
                file_path=str(path),
                parser="image",
            )

        file_size = path.stat().st_size
        extension = path.suffix.lower()

        if extension not in self.SUPPORTED_EXTENSIONS:
            raise DocumentParseException(
                f"Unsupported image format: {extension}",
                file_path=str(path),
                parser="image",
                details={"supported_formats": self.SUPPORTED_EXTENSIONS},
            )

        return path, file_size, extension

    def _read_image_base64(self, path: Path, extension: str) -> tuple[str, str]:
        """Read the image file; return (base64-encoded payload, MIME type)."""
        with open(path, "rb") as f:
            image_data = f.read()
        image_base64 = base64.b64encode(image_data).decode("utf-8")
        return image_base64, self._get_mime_type(extension)

    def _to_parse_result(
        self,
        result: ImageParseResult,
        path: Path,
        file_size: int,
        extension: str,
        mime_type: str,
    ) -> ParseResult:
        """Convert an ImageParseResult into the generic ParseResult shape."""
        return ParseResult(
            text=result.raw_text,
            source_path=str(path),
            file_size=file_size,
            page_count=1,  # an image is always treated as a single page
            metadata={
                "format": extension,
                "parser": "image",
                "mime_type": mime_type,
                "image_summary": result.image_summary,
                "chunk_count": len(result.chunks),
                "chunks": [
                    {
                        "chunk_index": c.chunk_index,
                        "content": c.content,
                        "chunk_type": c.chunk_type,
                        "keywords": c.keywords,
                    }
                    for c in result.chunks
                ],
            },
            pages=[PageText(page=1, text=result.raw_text)],
        )

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def parse(self, file_path: str | Path) -> ParseResult:
        """
        Parse an image file and extract text content using multimodal LLM.

        Note: This method is synchronous but internally uses async operations.
        For async contexts, use parse_async() instead.

        Args:
            file_path: Path to the image file.

        Returns:
            ParseResult with extracted text content.

        Raises:
            DocumentParseException: If parsing fails.
        """
        path, file_size, extension = self._validate_image_path(file_path)

        try:
            image_base64, mime_type = self._read_image_base64(path, extension)

            # asyncio.run() cannot be called from a thread that already runs
            # an event loop, so detect that case and drive the coroutine on a
            # dedicated worker thread instead.
            try:
                asyncio.get_running_loop()
            except RuntimeError:
                # No running loop in this thread: run the coroutine directly.
                result = asyncio.run(self._analyze_image_async(image_base64, mime_type))
            else:
                import concurrent.futures

                with concurrent.futures.ThreadPoolExecutor() as executor:
                    future = executor.submit(
                        asyncio.run,
                        self._analyze_image_async(image_base64, mime_type),
                    )
                    result = future.result()

            logger.info(
                f"[IMAGE-PARSER] Successfully parsed image: {path.name}, "
                f"size={file_size}, chunks={len(result.chunks)}"
            )

            return self._to_parse_result(result, path, file_size, extension, mime_type)

        except DocumentParseException:
            # Already a domain error with context attached — don't double-wrap
            # (e.g. the empty-LLM-response error raised by _analyze_image_async).
            raise
        except Exception as e:
            logger.error(f"[IMAGE-PARSER] Failed to parse image {path}: {e}")
            raise DocumentParseException(
                f"Failed to parse image: {str(e)}",
                file_path=str(path),
                parser="image",
                details={"error": str(e)},
            ) from e

    async def parse_async(self, file_path: str | Path) -> ParseResult:
        """
        Async version of parse method for use in async contexts.

        Args:
            file_path: Path to the image file.

        Returns:
            ParseResult with extracted text content.

        Raises:
            DocumentParseException: If parsing fails.
        """
        path, file_size, extension = self._validate_image_path(file_path)

        try:
            image_base64, mime_type = self._read_image_base64(path, extension)

            result = await self._analyze_image_async(image_base64, mime_type)

            logger.info(
                f"[IMAGE-PARSER] Successfully parsed image (async): {path.name}, "
                f"size={file_size}, chunks={len(result.chunks)}"
            )

            return self._to_parse_result(result, path, file_size, extension, mime_type)

        except DocumentParseException:
            # Preserve the original domain error instead of re-wrapping it.
            raise
        except Exception as e:
            logger.error(f"[IMAGE-PARSER] Failed to parse image {path}: {e}")
            raise DocumentParseException(
                f"Failed to parse image: {str(e)}",
                file_path=str(path),
                parser="image",
                details={"error": str(e)},
            ) from e

    async def parse_with_chunks(self, file_path: str | Path) -> ImageParseResult:
        """
        Parse image and return structured result with intelligent chunks.

        Args:
            file_path: Path to the image file.

        Returns:
            ImageParseResult with intelligent chunks.

        Raises:
            DocumentParseException: If validation or LLM analysis fails.
        """
        path, file_size, extension = self._validate_image_path(file_path)

        image_base64, mime_type = self._read_image_base64(path, extension)

        result = await self._analyze_image_async(image_base64, mime_type)
        # _parse_llm_response leaves source/file fields blank; fill them here.
        result.source_path = str(path)
        result.file_size = file_size
        result.metadata = {
            "format": extension,
            "parser": "image",
            "mime_type": mime_type,
        }

        return result

    # ------------------------------------------------------------------
    # LLM interaction
    # ------------------------------------------------------------------

    async def _analyze_image_async(self, image_base64: str, mime_type: str) -> ImageParseResult:
        """
        Analyze image using multimodal LLM and return structured chunks.

        Args:
            image_base64: Base64 encoded image data.
            mime_type: MIME type of the image.

        Returns:
            ImageParseResult with intelligent chunks.

        Raises:
            DocumentParseException: If the LLM returns an empty response.
        """
        try:
            manager = get_llm_config_manager()
            client = manager.get_kb_processing_client()

            config = manager.kb_processing_config
            model = self._model or config.get("model", "gpt-4o-mini")

            # OpenAI-style multimodal message: one text part plus one
            # base64 data-URL image part.
            messages = [
                {
                    "role": "system",
                    "content": IMAGE_SYSTEM_PROMPT,
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": IMAGE_USER_PROMPT,
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{image_base64}",
                            },
                        },
                    ],
                },
            ]

            # Imported lazily to avoid a potential import cycle at module load.
            from app.services.llm.base import LLMConfig

            llm_config = LLMConfig(
                model=model,
                max_tokens=self._max_tokens,
                temperature=0.3,  # low temperature for stable, parseable JSON
                timeout_seconds=self._timeout_seconds,
            )

            response = await client.generate(messages=messages, config=llm_config)

            if not response.content:
                raise DocumentParseException(
                    "LLM returned empty response for image analysis",
                    parser="image",
                )

            return self._parse_llm_response(response.content)

        except Exception as e:
            logger.error(f"[IMAGE-PARSER] LLM analysis failed: {e}")
            raise

    def _parse_llm_response(self, response_content: str) -> ImageParseResult:
        """
        Parse LLM response into structured ImageParseResult.

        Falls back to treating the raw response as a single text chunk when
        the response is not valid JSON, so a misbehaving model never causes
        a hard failure here.

        Args:
            response_content: Raw LLM response content.

        Returns:
            ImageParseResult with parsed chunks (source_path/file_size left
            blank for the caller to fill in).
        """
        try:
            json_str = self._extract_json(response_content)
            data = json.loads(json_str)

            image_summary = data.get("image_summary", "")
            chunks_data = data.get("chunks", [])

            chunks = []
            for chunk_data in chunks_data:
                chunk = ImageChunk(
                    # Default the index to the running position if absent.
                    chunk_index=chunk_data.get("chunk_index", len(chunks)),
                    content=chunk_data.get("content", ""),
                    chunk_type=chunk_data.get("chunk_type", "text"),
                    keywords=chunk_data.get("keywords", []),
                )
                # Skip chunks with empty/whitespace-only content.
                if chunk.content.strip():
                    chunks.append(chunk)

            if not chunks:
                # Valid JSON but no usable chunks: keep the raw response so
                # the content is not silently lost.
                chunks.append(ImageChunk(
                    chunk_index=0,
                    content=response_content,
                    chunk_type="text",
                    keywords=[],
                ))

            raw_text = "\n\n".join([c.content for c in chunks])

            return ImageParseResult(
                image_summary=image_summary,
                chunks=chunks,
                raw_text=raw_text,
                source_path="",
                file_size=0,
            )

        except json.JSONDecodeError as e:
            logger.warning(f"[IMAGE-PARSER] Failed to parse JSON response: {e}, using fallback")
            return ImageParseResult(
                image_summary="图片内容",
                chunks=[ImageChunk(
                    chunk_index=0,
                    content=response_content,
                    chunk_type="text",
                    keywords=[],
                )],
                raw_text=response_content,
                source_path="",
                file_size=0,
            )

    def _extract_json(self, content: str) -> str:
        """
        Extract JSON from LLM response content.

        Handles responses wrapped in prose or markdown fences by slicing
        from the first "{" to the last "}".

        Args:
            content: Raw response content that may contain JSON.

        Returns:
            Extracted JSON string (or the stripped input when no braces
            are found — json.loads will then raise for the caller).
        """
        content = content.strip()

        if content.startswith("{") and content.endswith("}"):
            return content

        json_start = content.find("{")
        json_end = content.rfind("}")

        if json_start != -1 and json_end != -1 and json_end > json_start:
            return content[json_start:json_end + 1]

        return content

    def _get_mime_type(self, extension: str) -> str:
        """Get MIME type for image extension (defaults to image/jpeg)."""
        mime_types = {
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".png": "image/png",
            ".gif": "image/gif",
            ".webp": "image/webp",
            ".bmp": "image/bmp",
            ".tiff": "image/tiff",
            ".tif": "image/tiff",
        }
        return mime_types.get(extension.lower(), "image/jpeg")

    def get_supported_extensions(self) -> list[str]:
        """Get list of supported image extensions (a copy; safe to mutate)."""
        return list(self.SUPPORTED_EXTENSIONS)
|