ai-robot-core/ai-service/tests/test_image_parser.py

376 lines
12 KiB
Python
Raw Permalink Normal View History

"""
Unit tests for ImageParser.
"""
import json
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from app.services.document.image_parser import (
ImageChunk,
ImageParseResult,
ImageParser,
)
class TestImageParserBasics:
    """Smoke tests for ImageParser's extension list and MIME-type lookup."""

    def test_supported_extensions(self):
        """The parser must report exactly the expected extensions, in this order."""
        image_parser = ImageParser()
        reported = image_parser.get_supported_extensions()
        wanted = [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif"]
        # List equality: both membership and ordering are checked.
        assert reported == wanted

    def test_get_mime_type(self):
        """Every known extension maps to its MIME type; an unknown one yields JPEG."""
        image_parser = ImageParser()
        expected_by_extension = {
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".png": "image/png",
            ".gif": "image/gif",
            ".webp": "image/webp",
            ".bmp": "image/bmp",
            ".tiff": "image/tiff",
            ".tif": "image/tiff",
            # Unrecognized extensions are expected to fall back to JPEG.
            ".unknown": "image/jpeg",
        }
        for extension, mime_type in expected_by_extension.items():
            assert image_parser._get_mime_type(extension) == mime_type
class TestImageChunkParsing:
    """Tests for extracting and parsing the JSON payload of an LLM response."""

    def test_extract_json_from_plain_json(self):
        """A response that is already pure JSON is returned unchanged."""
        raw = (
            '{"image_summary": "test", "chunks": [{"chunk_index": 0, '
            '"content": "hello", "chunk_type": "text", "keywords": ["key"]}]}'
        )
        extracted = ImageParser()._extract_json(raw)
        assert extracted == raw

    def test_extract_json_from_markdown(self):
        """JSON wrapped in a fenced markdown block is pulled out of the prose."""
        image_parser = ImageParser()
        # Built line by line so the fixture does not depend on source indentation.
        reply = (
            "Here is the analysis:\n"
            "```json\n"
            '{"image_summary": "test", "chunks": [{"chunk_index": 0, "content": "hello"}]}\n'
            "```\n"
            "Hope this helps!"
        )
        extracted = image_parser._extract_json(reply)
        for fragment in ("image_summary", "test"):
            assert fragment in extracted

    def test_extract_json_from_text_with_json(self):
        """An object embedded in free text keeps its keys after extraction."""
        image_parser = ImageParser()
        # NOTE: the embedded object uses single quotes, so it is not strict JSON;
        # this test only checks that the key names survive extraction.
        reply = "The result is: {'image_summary': 'summary', 'chunks': []}"
        extracted = image_parser._extract_json(reply)
        for fragment in ("image_summary", "chunks"):
            assert fragment in extracted

    def test_parse_llm_response_valid_json(self):
        """A well-formed two-chunk response is parsed field by field."""
        image_parser = ImageParser()
        first_chunk = {
            "chunk_index": 0,
            "content": "这是第一块内容",
            "chunk_type": "text",
            "keywords": ["测试", "内容"],
        }
        second_chunk = {
            "chunk_index": 1,
            "content": "这是第二块内容,包含表格数据",
            "chunk_type": "table",
            "keywords": ["表格", "数据"],
        }
        payload = json.dumps(
            {
                "image_summary": "测试图片",
                "total_chunks": 2,
                "chunks": [first_chunk, second_chunk],
            }
        )

        parsed = image_parser._parse_llm_response(payload)

        assert parsed.image_summary == "测试图片"
        assert len(parsed.chunks) == 2
        head, tail = parsed.chunks[0], parsed.chunks[1]
        assert head.content == "这是第一块内容"
        assert head.chunk_type == "text"
        assert head.keywords == ["测试", "内容"]
        assert tail.chunk_type == "table"
        assert tail.keywords == ["表格", "数据"]

    def test_parse_llm_response_empty_chunks(self):
        """An empty ``chunks`` list must yield one chunk holding the raw response."""
        image_parser = ImageParser()
        payload = json.dumps({"image_summary": "测试", "chunks": []})
        parsed = image_parser._parse_llm_response(payload)
        assert len(parsed.chunks) == 1
        assert parsed.chunks[0].content == payload

    def test_parse_llm_response_invalid_json(self):
        """Non-JSON text must come back as a single chunk with that text."""
        image_parser = ImageParser()
        payload = "This is not JSON at all"
        parsed = image_parser._parse_llm_response(payload)
        assert len(parsed.chunks) == 1
        assert parsed.chunks[0].content == "This is not JSON at all"

    def test_parse_llm_response_partial_json(self):
        """Malformed JSON must come back as a single chunk with the raw response."""
        image_parser = ImageParser()
        payload = '{"image_summary": "test" some text here {"chunks": []}'
        parsed = image_parser._parse_llm_response(payload)
        assert len(parsed.chunks) == 1
        assert parsed.chunks[0].content == payload
class TestImageChunkDataClass:
    """Tests for constructing ImageChunk and ImageParseResult objects."""

    def test_image_chunk_creation(self):
        """All explicitly supplied fields are stored on the chunk."""
        fields = {
            "chunk_index": 0,
            "content": "Test content",
            "chunk_type": "text",
            "keywords": ["test", "content"],
        }
        created = ImageChunk(**fields)
        assert created.chunk_index == 0
        assert created.content == "Test content"
        assert created.chunk_type == "text"
        assert created.keywords == ["test", "content"]

    def test_image_chunk_default_values(self):
        """Omitted fields default to chunk_type "text" and an empty keyword list."""
        minimal = ImageChunk(chunk_index=0, content="Test")
        assert minimal.chunk_type == "text"
        assert minimal.keywords == []

    def test_image_parse_result_creation(self):
        """An ImageParseResult exposes the values it was constructed with."""
        pieces = [
            ImageChunk(chunk_index=position, content=text)
            for position, text in enumerate(("Chunk 1", "Chunk 2"))
        ]
        outcome = ImageParseResult(
            image_summary="Test summary",
            chunks=pieces,
            raw_text="Chunk 1\n\nChunk 2",
            source_path="/path/to/image.png",
            file_size=1024,
            metadata={"format": "png"},
        )
        assert outcome.image_summary == "Test summary"
        assert len(outcome.chunks) == 2
        assert outcome.raw_text == "Chunk 1\n\nChunk 2"
        assert outcome.file_size == 1024
        assert outcome.metadata["format"] == "png"
class TestChunkTypes:
    """Check that each ``chunk_type`` value in a response survives parsing."""

    @staticmethod
    def _parse_single_chunk(summary, content, chunk_type, keywords):
        """Serialize a one-chunk response and return the parser's result for it."""
        image_parser = ImageParser()
        chunk = {
            "chunk_index": 0,
            "content": content,
            "chunk_type": chunk_type,
            "keywords": keywords,
        }
        payload = json.dumps({"image_summary": summary, "chunks": [chunk]})
        return image_parser._parse_llm_response(payload)

    def test_text_chunk_type(self):
        """A chunk declared as "text" is parsed as "text"."""
        parsed = self._parse_single_chunk(
            "Text content", "Plain text content", "text", ["text"]
        )
        assert parsed.chunks[0].chunk_type == "text"

    def test_table_chunk_type(self):
        """A chunk declared as "table" is parsed as "table"."""
        parsed = self._parse_single_chunk(
            "Table content",
            "Name | Age\n---- | ---\nJohn | 30",
            "table",
            ["table", "data"],
        )
        assert parsed.chunks[0].chunk_type == "table"

    def test_chart_chunk_type(self):
        """A chunk declared as "chart" is parsed as "chart"."""
        parsed = self._parse_single_chunk(
            "Chart content",
            "Bar chart showing sales data",
            "chart",
            ["chart", "sales"],
        )
        assert parsed.chunks[0].chunk_type == "chart"

    def test_list_chunk_type(self):
        """A chunk declared as "list" is parsed as "list"."""
        parsed = self._parse_single_chunk(
            "List content",
            "1. First item\n2. Second item\n3. Third item",
            "list",
            ["list", "items"],
        )
        assert parsed.chunks[0].chunk_type == "list"
class TestIntegrationScenarios:
    """Scenario tests feeding complete LLM responses through the parser."""

    @staticmethod
    def _build_response(summary, sections):
        """Return the JSON text for a response.

        ``sections`` is a sequence of ``(content, chunk_type, keywords)``
        tuples; ``chunk_index`` is assigned from each tuple's position.
        """
        chunks = [
            {
                "chunk_index": position,
                "content": content,
                "chunk_type": chunk_type,
                "keywords": keywords,
            }
            for position, (content, chunk_type, keywords) in enumerate(sections)
        ]
        return json.dumps({"image_summary": summary, "chunks": chunks})

    def test_single_chunk_scenario(self):
        """Simple image: one chunk, whose content is also the raw text."""
        image_parser = ImageParser()
        body = "这是一段完整的文档内容,包含所有的信息要点。"
        payload = self._build_response(
            "简单文档截图",
            [(body, "text", ["文档", "信息"])],
        )

        parsed = image_parser._parse_llm_response(payload)

        assert len(parsed.chunks) == 1
        assert parsed.image_summary == "简单文档截图"
        assert parsed.raw_text == "这是一段完整的文档内容,包含所有的信息要点。"

    def test_multi_chunk_scenario(self):
        """Complex image: three text sections kept in order in the result."""
        image_parser = ImageParser()
        payload = self._build_response(
            "多段落文档",
            [
                ("第一章:介绍部分,介绍项目的背景和目标。", "text", ["第一章", "介绍"]),
                ("第二章:技术架构,包括前端、后端和数据库设计。", "text", ["第二章", "架构"]),
                ("第三章:部署流程,包含开发环境和生产环境配置。", "text", ["第三章", "部署"]),
            ],
        )

        parsed = image_parser._parse_llm_response(payload)

        assert len(parsed.chunks) == 3
        for marker, chunk in zip(("第一章", "第二章", "第三章"), parsed.chunks):
            assert marker in chunk.content
        # Three chunks are expected to give exactly two blank-line separators.
        assert parsed.raw_text.count("\n\n") == 2

    def test_mixed_content_scenario(self):
        """Mixed image: text, table and list chunks keep their declared types."""
        image_parser = ImageParser()
        payload = self._build_response(
            "混合内容图片",
            [
                (
                    "产品介绍:本文档介绍我们的核心产品功能。",
                    "text",
                    ["产品", "功能"],
                ),
                (
                    "产品规格表:\n| 型号 | 价格 | 库存 |\n| --- | --- | --- |\n| A1 | 100 | 50 |",
                    "table",
                    ["规格", "价格", "库存"],
                ),
                (
                    "使用说明:\n1. 打开包装\n2. 连接电源\n3. 按下启动按钮",
                    "list",
                    ["说明", "步骤"],
                ),
            ],
        )

        parsed = image_parser._parse_llm_response(payload)

        assert len(parsed.chunks) == 3
        assert parsed.chunks[0].chunk_type == "text"
        assert parsed.chunks[1].chunk_type == "table"
        assert parsed.chunks[2].chunk_type == "list"
if __name__ == "__main__":
    # Allow running this file directly. pytest.main() returns the run's exit
    # code; raise it as SystemExit so a failing run produces a non-zero
    # process status instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))