"""
Unit tests for LlmJudge.

[AC-AISVC-118, AC-AISVC-119] Tests for LLM-based intent arbitration.
"""

import pytest

from unittest.mock import AsyncMock, MagicMock, patch

import uuid

from app.services.intent.llm_judge import LlmJudge

from app.services.intent.models import (
    FusionConfig,
    LlmJudgeInput,
    LlmJudgeResult,
    RuleMatchResult,
    SemanticCandidate,
    SemanticMatchResult,
)

@pytest.fixture
def mock_llm_client():
    """Provide an async mock standing in for the LLM client."""
    return AsyncMock()
@pytest.fixture
def config():
    """Provide a FusionConfig with all defaults."""
    return FusionConfig()
@pytest.fixture
def mock_rule():
    """Provide a mock intent rule with a random id and a fixed name."""
    fake_rule = MagicMock()
    fake_rule.id = uuid.uuid4()
    fake_rule.name = "Test Intent"
    return fake_rule
class TestLlmJudge:
    """Tests for LlmJudge class."""

    # ------------------------------------------------------------------
    # Factory helpers (underscore-prefixed so pytest does not collect them)
    # ------------------------------------------------------------------

    @staticmethod
    def _rule_hit(rule, rule_id=None):
        """Build a RuleMatchResult representing a confident keyword hit."""
        return RuleMatchResult(
            rule_id=rule_id if rule_id is not None else uuid.uuid4(),
            rule=rule,
            match_type="keyword",
            matched_text="test",
            score=1.0,
            duration_ms=10,
        )

    @staticmethod
    def _rule_miss():
        """Build a RuleMatchResult representing no rule match at all."""
        return RuleMatchResult(
            rule_id=None,
            rule=None,
            match_type=None,
            matched_text=None,
            score=0.0,
            duration_ms=10,
        )

    @staticmethod
    def _semantic(candidates, top_score):
        """Build a non-skipped SemanticMatchResult over *candidates*."""
        return SemanticMatchResult(
            candidates=candidates,
            top_score=top_score,
            duration_ms=50,
            skipped=False,
            skip_reason=None,
        )

    @staticmethod
    def _named_rule(name):
        """Build a mock rule with a fresh id and the given name."""
        rule = MagicMock()
        rule.id = uuid.uuid4()
        rule.name = name
        return rule

    # ------------------------------------------------------------------
    # Construction
    # ------------------------------------------------------------------

    def test_init(self, mock_llm_client, config):
        """The constructor stashes the client and config on the instance."""
        arbiter = LlmJudge(mock_llm_client, config)
        assert arbiter._llm_client == mock_llm_client
        assert arbiter._config == config

    # ------------------------------------------------------------------
    # should_trigger
    # ------------------------------------------------------------------

    def test_should_trigger_disabled(self, mock_llm_client):
        """With llm_judge_enabled=False, should_trigger always declines."""
        arbiter = LlmJudge(mock_llm_client, FusionConfig(llm_judge_enabled=False))
        hit = self._rule_hit(MagicMock())
        semantic = self._semantic([], 0.8)

        triggered, reason = arbiter.should_trigger(hit, semantic)
        assert triggered is False
        assert reason == "disabled"

    def test_should_trigger_rule_semantic_conflict(self, mock_llm_client, config, mock_rule):
        """A strong semantic hit on a different rule triggers arbitration."""
        arbiter = LlmJudge(mock_llm_client, config)
        hit = self._rule_hit(mock_rule)
        rival = self._named_rule("Other Intent")
        semantic = self._semantic([SemanticCandidate(rule=rival, score=0.95)], 0.95)

        triggered, reason = arbiter.should_trigger(hit, semantic)
        assert triggered is True
        assert reason == "rule_semantic_conflict"

    def test_should_trigger_gray_zone(self, mock_llm_client, config, mock_rule):
        """A middling semantic score with no rule hit lands in the gray zone."""
        arbiter = LlmJudge(mock_llm_client, config)
        semantic = self._semantic([SemanticCandidate(rule=mock_rule, score=0.5)], 0.5)

        triggered, reason = arbiter.should_trigger(self._rule_miss(), semantic)
        assert triggered is True
        assert reason == "gray_zone"

    def test_should_trigger_multi_intent(self, mock_llm_client, config, mock_rule):
        """Two closely-scored semantic candidates trigger multi-intent review."""
        arbiter = LlmJudge(mock_llm_client, config)
        rival = self._named_rule("Other Intent")
        semantic = self._semantic(
            [
                SemanticCandidate(rule=mock_rule, score=0.8),
                SemanticCandidate(rule=rival, score=0.75),
            ],
            0.8,
        )

        triggered, reason = arbiter.should_trigger(self._rule_miss(), semantic)
        assert triggered is True
        assert reason == "multi_intent"

    def test_should_not_trigger_high_confidence(self, mock_llm_client, config, mock_rule):
        """Rule and semantic agreeing at high confidence needs no arbitration."""
        arbiter = LlmJudge(mock_llm_client, config)
        hit = self._rule_hit(mock_rule, rule_id=mock_rule.id)
        semantic = self._semantic([SemanticCandidate(rule=mock_rule, score=0.9)], 0.9)

        triggered, reason = arbiter.should_trigger(hit, semantic)
        assert triggered is False

    # ------------------------------------------------------------------
    # judge
    # ------------------------------------------------------------------

    @pytest.mark.asyncio
    async def test_judge_success(self, mock_llm_client, config):
        """A well-formed LLM reply is parsed into a fully populated result."""
        from app.services.llm.base import LLMResponse

        reply = LLMResponse(
            content='{"intent_id": "test-id", "intent_name": "Test", "confidence": 0.85, "reasoning": "Test reasoning"}',
            model="gpt-4",
            usage={"total_tokens": 100},
        )
        mock_llm_client.generate = AsyncMock(return_value=reply)

        arbiter = LlmJudge(mock_llm_client, config)
        payload = LlmJudgeInput(
            message="test message",
            candidates=[{"id": "test-id", "name": "Test", "description": "Test intent"}],
            conflict_type="gray_zone",
        )

        verdict = await arbiter.judge(payload, "tenant-1")

        assert verdict.triggered is True
        assert verdict.intent_id == "test-id"
        assert verdict.intent_name == "Test"
        assert verdict.score == 0.85
        assert verdict.reasoning == "Test reasoning"
        assert verdict.tokens_used == 100

    @pytest.mark.asyncio
    async def test_judge_timeout(self, mock_llm_client, config):
        """An LLM timeout degrades gracefully instead of raising."""
        import asyncio

        mock_llm_client.generate = AsyncMock(side_effect=asyncio.TimeoutError())

        arbiter = LlmJudge(mock_llm_client, config)
        payload = LlmJudgeInput(
            message="test message",
            candidates=[{"id": "test-id", "name": "Test"}],
            conflict_type="gray_zone",
        )

        verdict = await arbiter.judge(payload, "tenant-1")

        assert verdict.triggered is True
        assert verdict.intent_id is None
        assert "timeout" in verdict.reasoning.lower()

    @pytest.mark.asyncio
    async def test_judge_error(self, mock_llm_client, config):
        """An unexpected LLM failure degrades gracefully instead of raising."""
        mock_llm_client.generate = AsyncMock(side_effect=Exception("LLM error"))

        arbiter = LlmJudge(mock_llm_client, config)
        payload = LlmJudgeInput(
            message="test message",
            candidates=[{"id": "test-id", "name": "Test"}],
            conflict_type="gray_zone",
        )

        verdict = await arbiter.judge(payload, "tenant-1")

        assert verdict.triggered is True
        assert verdict.intent_id is None
        assert "error" in verdict.reasoning.lower()

    # ------------------------------------------------------------------
    # _parse_response
    # ------------------------------------------------------------------

    def test_parse_response_valid_json(self, mock_llm_client, config):
        """Plain JSON content parses into a dict."""
        arbiter = LlmJudge(mock_llm_client, config)

        parsed = arbiter._parse_response('{"intent_id": "test", "confidence": 0.9}')

        assert parsed["intent_id"] == "test"
        assert parsed["confidence"] == 0.9

    def test_parse_response_with_markdown(self, mock_llm_client, config):
        """JSON wrapped in a markdown code fence still parses."""
        arbiter = LlmJudge(mock_llm_client, config)

        parsed = arbiter._parse_response('```json\n{"intent_id": "test", "confidence": 0.9}\n```')

        assert parsed["intent_id"] == "test"
        assert parsed["confidence"] == 0.9

    def test_parse_response_invalid_json(self, mock_llm_client, config):
        """Unparseable content yields an empty dict rather than raising."""
        arbiter = LlmJudge(mock_llm_client, config)

        assert arbiter._parse_response("This is not valid JSON") == {}

    # ------------------------------------------------------------------
    # LlmJudgeResult
    # ------------------------------------------------------------------

    def test_llm_judge_result_empty(self):
        """LlmJudgeResult.empty() produces an all-default, untriggered result."""
        blank = LlmJudgeResult.empty()

        assert blank.intent_id is None
        assert blank.intent_name is None
        assert blank.score == 0.0
        assert blank.reasoning is None
        assert blank.duration_ms == 0
        assert blank.tokens_used == 0
        assert blank.triggered is False