376 lines
12 KiB
Python
376 lines
12 KiB
Python
|
|
"""
|
||
|
|
Unit tests for ImageParser.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import json
|
||
|
|
import pytest
|
||
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
||
|
|
|
||
|
|
from app.services.document.image_parser import (
|
||
|
|
ImageChunk,
|
||
|
|
ImageParseResult,
|
||
|
|
ImageParser,
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
class TestImageParserBasics:
    """Checks of ImageParser's extension list and MIME type lookup."""

    def test_supported_extensions(self):
        """The supported-extension list must equal the expected list exactly."""
        image_parser = ImageParser()
        reported = image_parser.get_supported_extensions()

        # List equality is used, so both membership and ordering are pinned.
        wanted = [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif"]
        assert reported == wanted

    def test_get_mime_type(self):
        """Each extension resolves to its MIME type; unknown ones give image/jpeg."""
        image_parser = ImageParser()

        # (extension, expected MIME type) pairs, checked in the listed order.
        expectations = (
            (".jpg", "image/jpeg"),
            (".jpeg", "image/jpeg"),
            (".png", "image/png"),
            (".gif", "image/gif"),
            (".webp", "image/webp"),
            (".bmp", "image/bmp"),
            (".tiff", "image/tiff"),
            (".tif", "image/tiff"),
            # An unrecognised extension is expected to fall back to JPEG.
            (".unknown", "image/jpeg"),
        )
        for extension, mime_type in expectations:
            assert image_parser._get_mime_type(extension) == mime_type
|
||
|
|
|
||
|
|
|
||
|
|
class TestImageChunkParsing:
    """Checks of how ImageParser extracts and parses JSON from LLM replies."""

    def test_extract_json_from_plain_json(self):
        """A reply that is already plain JSON is returned unchanged."""
        image_parser = ImageParser()

        raw_json = (
            '{"image_summary": "test", "chunks": [{"chunk_index": 0, '
            '"content": "hello", "chunk_type": "text", "keywords": ["key"]}]}'
        )

        assert image_parser._extract_json(raw_json) == raw_json

    def test_extract_json_from_markdown(self):
        """JSON wrapped in a fenced markdown code block is still extracted."""
        image_parser = ImageParser()

        llm_reply = """Here is the analysis:

```json
{"image_summary": "test", "chunks": [{"chunk_index": 0, "content": "hello"}]}
```

Hope this helps!"""

        extracted = image_parser._extract_json(llm_reply)
        for fragment in ("image_summary", "test"):
            assert fragment in extracted

    def test_extract_json_from_text_with_json(self):
        """An object embedded in surrounding prose is picked out of the text."""
        image_parser = ImageParser()

        # NOTE(review): the embedded object uses single quotes, so it is not
        # strictly valid JSON; the assertions below only check substrings.
        prose = "The result is: {'image_summary': 'summary', 'chunks': []}"

        extracted = image_parser._extract_json(prose)
        for fragment in ("image_summary", "chunks"):
            assert fragment in extracted

    def test_parse_llm_response_valid_json(self):
        """A well-formed reply yields the summary and every chunk's fields."""
        image_parser = ImageParser()

        text_chunk = {
            "chunk_index": 0,
            "content": "这是第一块内容",
            "chunk_type": "text",
            "keywords": ["测试", "内容"],
        }
        table_chunk = {
            "chunk_index": 1,
            "content": "这是第二块内容,包含表格数据",
            "chunk_type": "table",
            "keywords": ["表格", "数据"],
        }
        payload = {
            "image_summary": "测试图片",
            "total_chunks": 2,
            "chunks": [text_chunk, table_chunk],
        }

        parsed = image_parser._parse_llm_response(json.dumps(payload))

        assert parsed.image_summary == "测试图片"
        assert len(parsed.chunks) == 2

        first_chunk = parsed.chunks[0]
        assert first_chunk.content == "这是第一块内容"
        assert first_chunk.chunk_type == "text"
        assert first_chunk.keywords == ["测试", "内容"]

        second_chunk = parsed.chunks[1]
        assert second_chunk.chunk_type == "table"
        assert second_chunk.keywords == ["表格", "数据"]

    def test_parse_llm_response_empty_chunks(self):
        """An empty chunk list produces one chunk holding the raw reply."""
        image_parser = ImageParser()

        payload = {"image_summary": "测试", "chunks": []}
        llm_reply = json.dumps(payload)

        parsed = image_parser._parse_llm_response(llm_reply)

        assert len(parsed.chunks) == 1
        assert parsed.chunks[0].content == llm_reply

    def test_parse_llm_response_invalid_json(self):
        """A reply with no JSON at all produces one chunk holding the raw text."""
        image_parser = ImageParser()

        llm_reply = "This is not JSON at all"

        parsed = image_parser._parse_llm_response(llm_reply)

        assert len(parsed.chunks) == 1
        assert parsed.chunks[0].content == "This is not JSON at all"

    def test_parse_llm_response_partial_json(self):
        """A malformed JSON reply produces one chunk holding the raw text."""
        image_parser = ImageParser()

        llm_reply = '{"image_summary": "test" some text here {"chunks": []}'

        parsed = image_parser._parse_llm_response(llm_reply)

        assert len(parsed.chunks) == 1
        assert parsed.chunks[0].content == llm_reply
|
||
|
|
|
||
|
|
|
||
|
|
class TestImageChunkDataClass:
    """Checks of the ImageChunk and ImageParseResult data containers."""

    def test_image_chunk_creation(self):
        """Every field passed to ImageChunk is readable back unchanged."""
        fields = {
            "chunk_index": 0,
            "content": "Test content",
            "chunk_type": "text",
            "keywords": ["test", "content"],
        }
        built = ImageChunk(**fields)

        assert built.chunk_index == 0
        assert built.content == "Test content"
        assert built.chunk_type == "text"
        assert built.keywords == ["test", "content"]

    def test_image_chunk_default_values(self):
        """Omitted fields default to chunk_type "text" and no keywords."""
        built = ImageChunk(chunk_index=0, content="Test")

        assert built.chunk_type == "text"
        assert built.keywords == []

    def test_image_parse_result_creation(self):
        """ImageParseResult exposes the values it was constructed with."""
        first = ImageChunk(chunk_index=0, content="Chunk 1")
        second = ImageChunk(chunk_index=1, content="Chunk 2")

        parse_result = ImageParseResult(
            image_summary="Test summary",
            chunks=[first, second],
            raw_text="Chunk 1\n\nChunk 2",
            source_path="/path/to/image.png",
            file_size=1024,
            metadata={"format": "png"},
        )

        assert parse_result.image_summary == "Test summary"
        assert len(parse_result.chunks) == 2
        assert parse_result.raw_text == "Chunk 1\n\nChunk 2"
        assert parse_result.file_size == 1024
        assert parse_result.metadata["format"] == "png"
|
||
|
|
|
||
|
|
|
||
|
|
class TestChunkTypes:
    """Checks that each chunk_type value survives LLM response parsing."""

    @staticmethod
    def _parse_single_chunk(summary, content, chunk_type, keywords):
        """Parse a one-chunk LLM reply with a fresh parser and return the result."""
        payload = {
            "image_summary": summary,
            "chunks": [
                {
                    "chunk_index": 0,
                    "content": content,
                    "chunk_type": chunk_type,
                    "keywords": keywords,
                }
            ],
        }
        return ImageParser()._parse_llm_response(json.dumps(payload))

    def test_text_chunk_type(self):
        """A "text" chunk keeps its type after parsing."""
        parsed = self._parse_single_chunk(
            "Text content",
            "Plain text content",
            "text",
            ["text"],
        )
        assert parsed.chunks[0].chunk_type == "text"

    def test_table_chunk_type(self):
        """A "table" chunk keeps its type after parsing."""
        parsed = self._parse_single_chunk(
            "Table content",
            "Name | Age\n---- | ---\nJohn | 30",
            "table",
            ["table", "data"],
        )
        assert parsed.chunks[0].chunk_type == "table"

    def test_chart_chunk_type(self):
        """A "chart" chunk keeps its type after parsing."""
        parsed = self._parse_single_chunk(
            "Chart content",
            "Bar chart showing sales data",
            "chart",
            ["chart", "sales"],
        )
        assert parsed.chunks[0].chunk_type == "chart"

    def test_list_chunk_type(self):
        """A "list" chunk keeps its type after parsing."""
        parsed = self._parse_single_chunk(
            "List content",
            "1. First item\n2. Second item\n3. Third item",
            "list",
            ["list", "items"],
        )
        assert parsed.chunks[0].chunk_type == "list"
|
||
|
|
|
||
|
|
|
||
|
|
class TestIntegrationScenarios:
    """End-to-end style checks of response parsing for typical image layouts."""

    def test_single_chunk_scenario(self):
        """A simple image with one block of content yields a single chunk."""
        image_parser = ImageParser()

        only_chunk = {
            "chunk_index": 0,
            "content": "这是一段完整的文档内容,包含所有的信息要点。",
            "chunk_type": "text",
            "keywords": ["文档", "信息"],
        }
        payload = {"image_summary": "简单文档截图", "chunks": [only_chunk]}

        parsed = image_parser._parse_llm_response(json.dumps(payload))

        assert len(parsed.chunks) == 1
        assert parsed.image_summary == "简单文档截图"
        # With a single chunk, raw_text is expected to be that chunk's content.
        assert parsed.raw_text == "这是一段完整的文档内容,包含所有的信息要点。"

    def test_multi_chunk_scenario(self):
        """A multi-section image yields one chunk per section, in order."""
        image_parser = ImageParser()

        sections = [
            {
                "chunk_index": 0,
                "content": "第一章:介绍部分,介绍项目的背景和目标。",
                "chunk_type": "text",
                "keywords": ["第一章", "介绍"],
            },
            {
                "chunk_index": 1,
                "content": "第二章:技术架构,包括前端、后端和数据库设计。",
                "chunk_type": "text",
                "keywords": ["第二章", "架构"],
            },
            {
                "chunk_index": 2,
                "content": "第三章:部署流程,包含开发环境和生产环境配置。",
                "chunk_type": "text",
                "keywords": ["第三章", "部署"],
            },
        ]
        payload = {"image_summary": "多段落文档", "chunks": sections}

        parsed = image_parser._parse_llm_response(json.dumps(payload))

        assert len(parsed.chunks) == 3
        assert "第一章" in parsed.chunks[0].content
        assert "第二章" in parsed.chunks[1].content
        assert "第三章" in parsed.chunks[2].content
        # Three chunks are expected to leave exactly two blank-line separators.
        assert parsed.raw_text.count("\n\n") == 2

    def test_mixed_content_scenario(self):
        """An image mixing prose, a table and a list keeps each chunk's type."""
        image_parser = ImageParser()

        prose_chunk = {
            "chunk_index": 0,
            "content": "产品介绍:本文档介绍我们的核心产品功能。",
            "chunk_type": "text",
            "keywords": ["产品", "功能"],
        }
        table_chunk = {
            "chunk_index": 1,
            "content": "产品规格表:\n| 型号 | 价格 | 库存 |\n| --- | --- | --- |\n| A1 | 100 | 50 |",
            "chunk_type": "table",
            "keywords": ["规格", "价格", "库存"],
        }
        list_chunk = {
            "chunk_index": 2,
            "content": "使用说明:\n1. 打开包装\n2. 连接电源\n3. 按下启动按钮",
            "chunk_type": "list",
            "keywords": ["说明", "步骤"],
        }
        payload = {
            "image_summary": "混合内容图片",
            "chunks": [prose_chunk, table_chunk, list_chunk],
        }

        parsed = image_parser._parse_llm_response(json.dumps(payload))

        assert len(parsed.chunks) == 3
        assert parsed.chunks[0].chunk_type == "text"
        assert parsed.chunks[1].chunk_type == "table"
        assert parsed.chunks[2].chunk_type == "list"
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Allow running this test module directly as a script. pytest.main()
    # returns an exit code; raise it via SystemExit so a failing run is
    # reported to the calling shell or CI job instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|