# ai-robot-core/ai-service/tests/test_llm_judge.py
"""
Unit tests for LlmJudge.
[AC-AISVC-118, AC-AISVC-119] Tests for LLM-based intent arbitration.
"""
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
import uuid
from app.services.intent.llm_judge import LlmJudge
from app.services.intent.models import (
FusionConfig,
LlmJudgeInput,
LlmJudgeResult,
RuleMatchResult,
SemanticCandidate,
SemanticMatchResult,
)
@pytest.fixture
def mock_llm_client():
    """Provide an async-capable mock standing in for the LLM client."""
    return AsyncMock()
@pytest.fixture
def config():
    """Provide a FusionConfig built with all-default settings."""
    default_config = FusionConfig()
    return default_config
@pytest.fixture
def mock_rule():
    """Provide a mock intent rule with a random UUID id and a fixed name."""
    fake_rule = MagicMock()
    fake_rule.id = uuid.uuid4()
    fake_rule.name = "Test Intent"
    return fake_rule
class TestLlmJudge:
    """Tests for LlmJudge class.

    [AC-AISVC-118, AC-AISVC-119] Covers the should_trigger decision logic
    (disabled, conflict, gray zone, multi-intent, high confidence), the
    async judge() call paths (success, timeout, generic error), raw
    response parsing, and the LlmJudgeResult.empty() factory.
    """

    # --- shared builders: remove per-test duplication of result objects ---

    @staticmethod
    def _make_rule(name="Other Intent"):
        """Return a MagicMock rule with a random UUID id and the given name."""
        rule = MagicMock()
        rule.id = uuid.uuid4()
        rule.name = name
        return rule

    @staticmethod
    def _keyword_match(rule, rule_id=None):
        """Build a full-confidence keyword RuleMatchResult for *rule*.

        A fresh random UUID is used when no explicit rule_id is given,
        mirroring how the original per-test literals were constructed.
        """
        return RuleMatchResult(
            rule_id=rule_id if rule_id is not None else uuid.uuid4(),
            rule=rule,
            match_type="keyword",
            matched_text="test",
            score=1.0,
            duration_ms=10,
        )

    @staticmethod
    def _no_rule_match():
        """Build a RuleMatchResult representing no rule hit at all."""
        return RuleMatchResult(
            rule_id=None,
            rule=None,
            match_type=None,
            matched_text=None,
            score=0.0,
            duration_ms=10,
        )

    @staticmethod
    def _semantic(candidates, top_score):
        """Build a non-skipped SemanticMatchResult with the given candidates."""
        return SemanticMatchResult(
            candidates=candidates,
            top_score=top_score,
            duration_ms=50,
            skipped=False,
            skip_reason=None,
        )

    def test_init(self, mock_llm_client, config):
        """Test LlmJudge initialization stores its collaborators."""
        judge = LlmJudge(mock_llm_client, config)
        assert judge._llm_client == mock_llm_client
        assert judge._config == config

    def test_should_trigger_disabled(self, mock_llm_client):
        """Test should_trigger when LLM judge is disabled."""
        config = FusionConfig(llm_judge_enabled=False)
        judge = LlmJudge(mock_llm_client, config)
        rule_result = self._keyword_match(MagicMock())
        semantic_result = self._semantic([], 0.8)
        triggered, reason = judge.should_trigger(rule_result, semantic_result)
        assert triggered is False
        assert reason == "disabled"

    def test_should_trigger_rule_semantic_conflict(self, mock_llm_client, config, mock_rule):
        """Test should_trigger when rule and semantic top pick disagree."""
        judge = LlmJudge(mock_llm_client, config)
        rule_result = self._keyword_match(mock_rule)
        other_rule = self._make_rule("Other Intent")
        semantic_result = self._semantic(
            [SemanticCandidate(rule=other_rule, score=0.95)], 0.95
        )
        triggered, reason = judge.should_trigger(rule_result, semantic_result)
        assert triggered is True
        assert reason == "rule_semantic_conflict"

    def test_should_trigger_gray_zone(self, mock_llm_client, config, mock_rule):
        """Test should_trigger for gray zone (no rule hit, mid semantic score)."""
        judge = LlmJudge(mock_llm_client, config)
        rule_result = self._no_rule_match()
        semantic_result = self._semantic(
            [SemanticCandidate(rule=mock_rule, score=0.5)], 0.5
        )
        triggered, reason = judge.should_trigger(rule_result, semantic_result)
        assert triggered is True
        assert reason == "gray_zone"

    def test_should_trigger_multi_intent(self, mock_llm_client, config, mock_rule):
        """Test should_trigger when two semantic candidates score closely."""
        judge = LlmJudge(mock_llm_client, config)
        rule_result = self._no_rule_match()
        other_rule = self._make_rule("Other Intent")
        semantic_result = self._semantic(
            [
                SemanticCandidate(rule=mock_rule, score=0.8),
                SemanticCandidate(rule=other_rule, score=0.75),
            ],
            0.8,
        )
        triggered, reason = judge.should_trigger(rule_result, semantic_result)
        assert triggered is True
        assert reason == "multi_intent"

    def test_should_not_trigger_high_confidence(self, mock_llm_client, config, mock_rule):
        """Test should_trigger returns False when rule and semantic agree."""
        judge = LlmJudge(mock_llm_client, config)
        # Rule and semantic top candidate are the SAME rule here — no conflict.
        rule_result = self._keyword_match(mock_rule, rule_id=mock_rule.id)
        semantic_result = self._semantic(
            [SemanticCandidate(rule=mock_rule, score=0.9)], 0.9
        )
        triggered, reason = judge.should_trigger(rule_result, semantic_result)
        assert triggered is False

    @pytest.mark.asyncio
    async def test_judge_success(self, mock_llm_client, config):
        """Test successful judge(): JSON content is mapped onto the result."""
        from app.services.llm.base import LLMResponse

        mock_response = LLMResponse(
            content='{"intent_id": "test-id", "intent_name": "Test", "confidence": 0.85, "reasoning": "Test reasoning"}',
            model="gpt-4",
            usage={"total_tokens": 100},
        )
        mock_llm_client.generate = AsyncMock(return_value=mock_response)
        judge = LlmJudge(mock_llm_client, config)
        input_data = LlmJudgeInput(
            message="test message",
            candidates=[{"id": "test-id", "name": "Test", "description": "Test intent"}],
            conflict_type="gray_zone",
        )
        result = await judge.judge(input_data, "tenant-1")
        assert result.triggered is True
        assert result.intent_id == "test-id"
        assert result.intent_name == "Test"
        assert result.score == 0.85
        assert result.reasoning == "Test reasoning"
        assert result.tokens_used == 100

    @pytest.mark.asyncio
    async def test_judge_timeout(self, mock_llm_client, config):
        """Test judge() timeout: degrades gracefully with no intent selected."""
        import asyncio

        mock_llm_client.generate = AsyncMock(side_effect=asyncio.TimeoutError())
        judge = LlmJudge(mock_llm_client, config)
        input_data = LlmJudgeInput(
            message="test message",
            candidates=[{"id": "test-id", "name": "Test"}],
            conflict_type="gray_zone",
        )
        result = await judge.judge(input_data, "tenant-1")
        assert result.triggered is True
        assert result.intent_id is None
        assert "timeout" in result.reasoning.lower()

    @pytest.mark.asyncio
    async def test_judge_error(self, mock_llm_client, config):
        """Test judge() error handling: generic failures degrade gracefully."""
        mock_llm_client.generate = AsyncMock(side_effect=Exception("LLM error"))
        judge = LlmJudge(mock_llm_client, config)
        input_data = LlmJudgeInput(
            message="test message",
            candidates=[{"id": "test-id", "name": "Test"}],
            conflict_type="gray_zone",
        )
        result = await judge.judge(input_data, "tenant-1")
        assert result.triggered is True
        assert result.intent_id is None
        assert "error" in result.reasoning.lower()

    def test_parse_response_valid_json(self, mock_llm_client, config):
        """Test parsing a plain valid JSON response."""
        judge = LlmJudge(mock_llm_client, config)
        content = '{"intent_id": "test", "confidence": 0.9}'
        result = judge._parse_response(content)
        assert result["intent_id"] == "test"
        assert result["confidence"] == 0.9

    def test_parse_response_with_markdown(self, mock_llm_client, config):
        """Test parsing a JSON response wrapped in a markdown code fence."""
        judge = LlmJudge(mock_llm_client, config)
        content = '```json\n{"intent_id": "test", "confidence": 0.9}\n```'
        result = judge._parse_response(content)
        assert result["intent_id"] == "test"
        assert result["confidence"] == 0.9

    def test_parse_response_invalid_json(self, mock_llm_client, config):
        """Test parsing invalid JSON falls back to an empty dict."""
        judge = LlmJudge(mock_llm_client, config)
        content = "This is not valid JSON"
        result = judge._parse_response(content)
        assert result == {}

    def test_llm_judge_result_empty(self):
        """Test LlmJudgeResult.empty() returns the all-defaults sentinel."""
        result = LlmJudgeResult.empty()
        assert result.intent_id is None
        assert result.intent_name is None
        assert result.score == 0.0
        assert result.reasoning is None
        assert result.duration_ms == 0
        assert result.tokens_used == 0
        assert result.triggered is False