""" Unit tests for ImageParser. """ import json import pytest from unittest.mock import AsyncMock, MagicMock, patch from app.services.document.image_parser import ( ImageChunk, ImageParseResult, ImageParser, ) class TestImageParserBasics: """Test basic functionality of ImageParser.""" def test_supported_extensions(self): """Test that ImageParser supports correct extensions.""" parser = ImageParser() extensions = parser.get_supported_extensions() expected_extensions = [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif"] assert extensions == expected_extensions def test_get_mime_type(self): """Test MIME type mapping.""" parser = ImageParser() assert parser._get_mime_type(".jpg") == "image/jpeg" assert parser._get_mime_type(".jpeg") == "image/jpeg" assert parser._get_mime_type(".png") == "image/png" assert parser._get_mime_type(".gif") == "image/gif" assert parser._get_mime_type(".webp") == "image/webp" assert parser._get_mime_type(".bmp") == "image/bmp" assert parser._get_mime_type(".tiff") == "image/tiff" assert parser._get_mime_type(".tif") == "image/tiff" assert parser._get_mime_type(".unknown") == "image/jpeg" class TestImageChunkParsing: """Test LLM response parsing functionality.""" def test_extract_json_from_plain_json(self): """Test extracting JSON from plain JSON response.""" parser = ImageParser() json_str = '{"image_summary": "test", "chunks": [{"chunk_index": 0, "content": "hello", "chunk_type": "text", "keywords": ["key"]}]}' result = parser._extract_json(json_str) assert result == json_str def test_extract_json_from_markdown(self): """Test extracting JSON from markdown code block.""" parser = ImageParser() markdown = """Here is the analysis: ```json {"image_summary": "test", "chunks": [{"chunk_index": 0, "content": "hello"}]} ``` Hope this helps!""" result = parser._extract_json(markdown) assert "image_summary" in result assert "test" in result def test_extract_json_from_text_with_json(self): """Test extracting JSON from text with JSON embedded.""" parser = ImageParser() text = "The result is: {'image_summary': 'summary', 'chunks': []}" result = parser._extract_json(text) assert "image_summary" in result assert "chunks" in result def test_parse_llm_response_valid_json(self): """Test parsing valid JSON response from LLM.""" parser = ImageParser() response = json.dumps({ "image_summary": "测试图片", "total_chunks": 2, "chunks": [ { "chunk_index": 0, "content": "这是第一块内容", "chunk_type": "text", "keywords": ["测试", "内容"] }, { "chunk_index": 1, "content": "这是第二块内容,包含表格数据", "chunk_type": "table", "keywords": ["表格", "数据"] } ] }) result = parser._parse_llm_response(response) assert result.image_summary == "测试图片" assert len(result.chunks) == 2 assert result.chunks[0].content == "这是第一块内容" assert result.chunks[0].chunk_type == "text" assert result.chunks[0].keywords == ["测试", "内容"] assert result.chunks[1].chunk_type == "table" assert result.chunks[1].keywords == ["表格", "数据"] def test_parse_llm_response_empty_chunks(self): """Test handling response with empty chunks.""" parser = ImageParser() response = json.dumps({ "image_summary": "测试", "chunks": [] }) result = parser._parse_llm_response(response) assert len(result.chunks) == 1 assert result.chunks[0].content == response def test_parse_llm_response_invalid_json(self): """Test handling invalid JSON response with fallback.""" parser = ImageParser() response = "This is not JSON at all" result = parser._parse_llm_response(response) assert len(result.chunks) == 1 assert result.chunks[0].content == "This is not JSON at all" def test_parse_llm_response_partial_json(self): """Test handling response with partial/invalid JSON uses fallback.""" parser = ImageParser() response = '{"image_summary": "test" some text here {"chunks": []}' result = parser._parse_llm_response(response) assert len(result.chunks) == 1 assert result.chunks[0].content == response class TestImageChunkDataClass: """Test ImageChunk dataclass functionality.""" def test_image_chunk_creation(self): """Test creating ImageChunk.""" chunk = ImageChunk( chunk_index=0, content="Test content", chunk_type="text", keywords=["test", "content"] ) assert chunk.chunk_index == 0 assert chunk.content == "Test content" assert chunk.chunk_type == "text" assert chunk.keywords == ["test", "content"] def test_image_chunk_default_values(self): """Test ImageChunk with default values.""" chunk = ImageChunk(chunk_index=0, content="Test") assert chunk.chunk_type == "text" assert chunk.keywords == [] def test_image_parse_result_creation(self): """Test creating ImageParseResult.""" chunks = [ ImageChunk(chunk_index=0, content="Chunk 1"), ImageChunk(chunk_index=1, content="Chunk 2"), ] result = ImageParseResult( image_summary="Test summary", chunks=chunks, raw_text="Chunk 1\n\nChunk 2", source_path="/path/to/image.png", file_size=1024, metadata={"format": "png"} ) assert result.image_summary == "Test summary" assert len(result.chunks) == 2 assert result.raw_text == "Chunk 1\n\nChunk 2" assert result.file_size == 1024 assert result.metadata["format"] == "png" class TestChunkTypes: """Test different chunk types.""" def test_text_chunk_type(self): """Test text chunk type.""" parser = ImageParser() response = json.dumps({ "image_summary": "Text content", "chunks": [ { "chunk_index": 0, "content": "Plain text content", "chunk_type": "text", "keywords": ["text"] } ] }) result = parser._parse_llm_response(response) assert result.chunks[0].chunk_type == "text" def test_table_chunk_type(self): """Test table chunk type.""" parser = ImageParser() response = json.dumps({ "image_summary": "Table content", "chunks": [ { "chunk_index": 0, "content": "Name | Age\n---- | ---\nJohn | 30", "chunk_type": "table", "keywords": ["table", "data"] } ] }) result = parser._parse_llm_response(response) assert result.chunks[0].chunk_type == "table" def test_chart_chunk_type(self): """Test chart chunk type.""" parser = ImageParser() response = json.dumps({ "image_summary": "Chart content", "chunks": [ { "chunk_index": 0, "content": "Bar chart showing sales data", "chunk_type": "chart", "keywords": ["chart", "sales"] } ] }) result = parser._parse_llm_response(response) assert result.chunks[0].chunk_type == "chart" def test_list_chunk_type(self): """Test list chunk type.""" parser = ImageParser() response = json.dumps({ "image_summary": "List content", "chunks": [ { "chunk_index": 0, "content": "1. First item\n2. Second item\n3. Third item", "chunk_type": "list", "keywords": ["list", "items"] } ] }) result = parser._parse_llm_response(response) assert result.chunks[0].chunk_type == "list" class TestIntegrationScenarios: """Test integration scenarios.""" def test_single_chunk_scenario(self): """Test single chunk scenario - simple image with one main content.""" parser = ImageParser() response = json.dumps({ "image_summary": "简单文档截图", "chunks": [ { "chunk_index": 0, "content": "这是一段完整的文档内容,包含所有的信息要点。", "chunk_type": "text", "keywords": ["文档", "信息"] } ] }) result = parser._parse_llm_response(response) assert len(result.chunks) == 1 assert result.image_summary == "简单文档截图" assert result.raw_text == "这是一段完整的文档内容,包含所有的信息要点。" def test_multi_chunk_scenario(self): """Test multi-chunk scenario - complex image with multiple sections.""" parser = ImageParser() response = json.dumps({ "image_summary": "多段落文档", "chunks": [ { "chunk_index": 0, "content": "第一章:介绍部分,介绍项目的背景和目标。", "chunk_type": "text", "keywords": ["第一章", "介绍"] }, { "chunk_index": 1, "content": "第二章:技术架构,包括前端、后端和数据库设计。", "chunk_type": "text", "keywords": ["第二章", "架构"] }, { "chunk_index": 2, "content": "第三章:部署流程,包含开发环境和生产环境配置。", "chunk_type": "text", "keywords": ["第三章", "部署"] } ] }) result = parser._parse_llm_response(response) assert len(result.chunks) == 3 assert "第一章" in result.chunks[0].content assert "第二章" in result.chunks[1].content assert "第三章" in result.chunks[2].content assert result.raw_text.count("\n\n") == 2 def test_mixed_content_scenario(self): """Test mixed content scenario - text and table.""" parser = ImageParser() response = json.dumps({ "image_summary": "混合内容图片", "chunks": [ { "chunk_index": 0, "content": "产品介绍:本文档介绍我们的核心产品功能。", "chunk_type": "text", "keywords": ["产品", "功能"] }, { "chunk_index": 1, "content": "产品规格表:\n| 型号 | 价格 | 库存 |\n| --- | --- | --- |\n| A1 | 100 | 50 |", "chunk_type": "table", "keywords": ["规格", "价格", "库存"] }, { "chunk_index": 2, "content": "使用说明:\n1. 打开包装\n2. 连接电源\n3. 按下启动按钮", "chunk_type": "list", "keywords": ["说明", "步骤"] } ] }) result = parser._parse_llm_response(response) assert len(result.chunks) == 3 assert result.chunks[0].chunk_type == "text" assert result.chunks[1].chunk_type == "table" assert result.chunks[2].chunk_type == "list" if __name__ == "__main__": pytest.main([__file__, "-v"])