""" Unit tests for LlmJudge. [AC-AISVC-118, AC-AISVC-119] Tests for LLM-based intent arbitration. """ import pytest from unittest.mock import AsyncMock, MagicMock, patch import uuid from app.services.intent.llm_judge import LlmJudge from app.services.intent.models import ( FusionConfig, LlmJudgeInput, LlmJudgeResult, RuleMatchResult, SemanticCandidate, SemanticMatchResult, ) @pytest.fixture def mock_llm_client(): """Create a mock LLM client.""" client = AsyncMock() return client @pytest.fixture def config(): """Create a fusion config.""" return FusionConfig() @pytest.fixture def mock_rule(): """Create a mock intent rule.""" rule = MagicMock() rule.id = uuid.uuid4() rule.name = "Test Intent" return rule class TestLlmJudge: """Tests for LlmJudge class.""" def test_init(self, mock_llm_client, config): """Test LlmJudge initialization.""" judge = LlmJudge(mock_llm_client, config) assert judge._llm_client == mock_llm_client assert judge._config == config def test_should_trigger_disabled(self, mock_llm_client): """Test should_trigger when LLM judge is disabled.""" config = FusionConfig(llm_judge_enabled=False) judge = LlmJudge(mock_llm_client, config) rule_result = RuleMatchResult( rule_id=uuid.uuid4(), rule=MagicMock(), match_type="keyword", matched_text="test", score=1.0, duration_ms=10, ) semantic_result = SemanticMatchResult( candidates=[], top_score=0.8, duration_ms=50, skipped=False, skip_reason=None, ) triggered, reason = judge.should_trigger(rule_result, semantic_result) assert triggered is False assert reason == "disabled" def test_should_trigger_rule_semantic_conflict(self, mock_llm_client, config, mock_rule): """Test should_trigger for rule vs semantic conflict.""" judge = LlmJudge(mock_llm_client, config) rule_result = RuleMatchResult( rule_id=uuid.uuid4(), rule=mock_rule, match_type="keyword", matched_text="test", score=1.0, duration_ms=10, ) other_rule = MagicMock() other_rule.id = uuid.uuid4() other_rule.name = "Other Intent" semantic_result = SemanticMatchResult( candidates=[SemanticCandidate(rule=other_rule, score=0.95)], top_score=0.95, duration_ms=50, skipped=False, skip_reason=None, ) triggered, reason = judge.should_trigger(rule_result, semantic_result) assert triggered is True assert reason == "rule_semantic_conflict" def test_should_trigger_gray_zone(self, mock_llm_client, config, mock_rule): """Test should_trigger for gray zone scenario.""" judge = LlmJudge(mock_llm_client, config) rule_result = RuleMatchResult( rule_id=None, rule=None, match_type=None, matched_text=None, score=0.0, duration_ms=10, ) semantic_result = SemanticMatchResult( candidates=[SemanticCandidate(rule=mock_rule, score=0.5)], top_score=0.5, duration_ms=50, skipped=False, skip_reason=None, ) triggered, reason = judge.should_trigger(rule_result, semantic_result) assert triggered is True assert reason == "gray_zone" def test_should_trigger_multi_intent(self, mock_llm_client, config, mock_rule): """Test should_trigger for multi-intent scenario.""" judge = LlmJudge(mock_llm_client, config) rule_result = RuleMatchResult( rule_id=None, rule=None, match_type=None, matched_text=None, score=0.0, duration_ms=10, ) other_rule = MagicMock() other_rule.id = uuid.uuid4() other_rule.name = "Other Intent" semantic_result = SemanticMatchResult( candidates=[ SemanticCandidate(rule=mock_rule, score=0.8), SemanticCandidate(rule=other_rule, score=0.75), ], top_score=0.8, duration_ms=50, skipped=False, skip_reason=None, ) triggered, reason = judge.should_trigger(rule_result, semantic_result) assert triggered is True assert reason == "multi_intent" def test_should_not_trigger_high_confidence(self, mock_llm_client, config, mock_rule): """Test should_trigger returns False for high confidence match.""" judge = LlmJudge(mock_llm_client, config) rule_result = RuleMatchResult( rule_id=mock_rule.id, rule=mock_rule, match_type="keyword", matched_text="test", score=1.0, duration_ms=10, ) semantic_result = SemanticMatchResult( candidates=[SemanticCandidate(rule=mock_rule, score=0.9)], top_score=0.9, duration_ms=50, skipped=False, skip_reason=None, ) triggered, reason = judge.should_trigger(rule_result, semantic_result) assert triggered is False @pytest.mark.asyncio async def test_judge_success(self, mock_llm_client, config): """Test successful LLM judge.""" from app.services.llm.base import LLMResponse mock_response = LLMResponse( content='{"intent_id": "test-id", "intent_name": "Test", "confidence": 0.85, "reasoning": "Test reasoning"}', model="gpt-4", usage={"total_tokens": 100}, ) mock_llm_client.generate = AsyncMock(return_value=mock_response) judge = LlmJudge(mock_llm_client, config) input_data = LlmJudgeInput( message="test message", candidates=[{"id": "test-id", "name": "Test", "description": "Test intent"}], conflict_type="gray_zone", ) result = await judge.judge(input_data, "tenant-1") assert result.triggered is True assert result.intent_id == "test-id" assert result.intent_name == "Test" assert result.score == 0.85 assert result.reasoning == "Test reasoning" assert result.tokens_used == 100 @pytest.mark.asyncio async def test_judge_timeout(self, mock_llm_client, config): """Test LLM judge timeout.""" import asyncio mock_llm_client.generate = AsyncMock(side_effect=asyncio.TimeoutError()) judge = LlmJudge(mock_llm_client, config) input_data = LlmJudgeInput( message="test message", candidates=[{"id": "test-id", "name": "Test"}], conflict_type="gray_zone", ) result = await judge.judge(input_data, "tenant-1") assert result.triggered is True assert result.intent_id is None assert "timeout" in result.reasoning.lower() @pytest.mark.asyncio async def test_judge_error(self, mock_llm_client, config): """Test LLM judge error handling.""" mock_llm_client.generate = AsyncMock(side_effect=Exception("LLM error")) judge = LlmJudge(mock_llm_client, config) input_data = LlmJudgeInput( message="test message", candidates=[{"id": "test-id", "name": "Test"}], conflict_type="gray_zone", ) result = await judge.judge(input_data, "tenant-1") assert result.triggered is True assert result.intent_id is None assert "error" in result.reasoning.lower() def test_parse_response_valid_json(self, mock_llm_client, config): """Test parsing valid JSON response.""" judge = LlmJudge(mock_llm_client, config) content = '{"intent_id": "test", "confidence": 0.9}' result = judge._parse_response(content) assert result["intent_id"] == "test" assert result["confidence"] == 0.9 def test_parse_response_with_markdown(self, mock_llm_client, config): """Test parsing JSON response with markdown code block.""" judge = LlmJudge(mock_llm_client, config) content = '```json\n{"intent_id": "test", "confidence": 0.9}\n```' result = judge._parse_response(content) assert result["intent_id"] == "test" assert result["confidence"] == 0.9 def test_parse_response_invalid_json(self, mock_llm_client, config): """Test parsing invalid JSON response.""" judge = LlmJudge(mock_llm_client, config) content = "This is not valid JSON" result = judge._parse_response(content) assert result == {} def test_llm_judge_result_empty(self): """Test LlmJudgeResult.empty() class method.""" result = LlmJudgeResult.empty() assert result.intent_id is None assert result.intent_name is None assert result.score == 0.0 assert result.reasoning is None assert result.duration_ms == 0 assert result.tokens_used == 0 assert result.triggered is False