453 lines
14 KiB
Python
453 lines
14 KiB
Python
|
|
"""
|
||
|
|
Strategy Metrics Service for AI Service.
|
||
|
|
[AC-AISVC-RES-03, AC-AISVC-RES-08] Metrics collection for strategy operations.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import json
|
||
|
|
import logging
|
||
|
|
import time
|
||
|
|
from collections import defaultdict
|
||
|
|
from dataclasses import dataclass, field
|
||
|
|
from datetime import datetime
|
||
|
|
from typing import Any
|
||
|
|
|
||
|
|
from app.schemas.retrieval_strategy import (
|
||
|
|
ReactMode,
|
||
|
|
StrategyMetrics,
|
||
|
|
StrategyType,
|
||
|
|
)
|
||
|
|
|
||
|
|
logger = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class LatencyTracker:
    """
    Latency tracking for a single operation.

    Samples are kept in an in-memory list. Once the list reaches
    ``max_samples`` entries, the oldest samples are dropped so that only the
    most recent ``max_samples // 2`` remain before the new sample is added.
    """

    # Recorded latency samples in milliseconds, oldest first.
    latencies: list[float] = field(default_factory=list)
    # Upper bound on the number of retained samples.
    max_samples: int = 1000

    def record(self, latency_ms: float) -> None:
        """
        Record a latency sample.

        Args:
            latency_ms: Latency of the operation in milliseconds.
        """
        if len(self.latencies) >= self.max_samples:
            # Halve the window instead of popping one element per call, so the
            # trimming cost is amortised. Computing the size as a non-negative
            # number first avoids the floor-division-of-a-negative pitfall
            # (``-1 // 2 == -1`` and ``-0 // 2 == 0``), which let the list
            # exceed ``max_samples`` or never shrink at all.
            keep = max(self.max_samples, 0) // 2
            self.latencies = self.latencies[-keep:] if keep else []
        self.latencies.append(latency_ms)

    def get_percentile(self, percentile: float) -> float:
        """
        Get latency at given percentile using the nearest-rank method.

        Args:
            percentile: Percentile in the range 0-100. Values outside the
                range are clamped to it.

        Returns:
            The sample at the requested percentile, or 0.0 when no samples
            have been recorded.
        """
        import math

        if not self.latencies:
            return 0.0

        ordered = sorted(self.latencies)
        bounded = min(max(percentile, 0.0), 100.0)
        # Nearest rank: the smallest sample such that at least `bounded`
        # percent of all samples are <= it. Rank is 1-based, hence the -1.
        rank = math.ceil(len(ordered) * bounded / 100)
        index = min(max(rank - 1, 0), len(ordered) - 1)
        return ordered[index]

    def get_avg(self) -> float:
        """
        Get average latency.

        Returns:
            Arithmetic mean of the retained samples, or 0.0 when empty.
        """
        if not self.latencies:
            return 0.0
        return sum(self.latencies) / len(self.latencies)
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class StrategyMetricsData:
    """
    Internal metrics data structure.

    Mutable bag of counters plus a latency tracker; one instance is kept per
    strategy key and per route mode by the metrics service.
    """

    # --- request outcome counters ---
    total_requests: int = 0
    successful_requests: int = 0
    failed_requests: int = 0

    # --- latency samples for the requests counted above ---
    latency_tracker: LatencyTracker = field(default_factory=LatencyTracker)

    # --- per-route-mode request counters ---
    direct_route_count: int = 0
    react_route_count: int = 0
    auto_route_count: int = 0

    # --- number of requests flagged as fallback ---
    fallback_count: int = 0

    # --- ISO-8601 timestamp of the most recent update, None until first use ---
    last_updated: str | None = None
|
||
|
|
|
||
|
|
|
||
|
|
class StrategyMetricsService:
    """
    [AC-AISVC-RES-03, AC-AISVC-RES-08] Metrics service for strategy operations.

    Features:
    - Request counting by strategy and route mode
    - Latency tracking with percentiles
    - Fallback and error tracking
    - Metrics export for monitoring

    All counters live in memory on the instance: ``_metrics`` is keyed by
    ``StrategyType.value`` and ``_route_metrics`` by route-mode string.
    """

    def __init__(self):
        self._metrics: dict[str, StrategyMetricsData] = defaultdict(StrategyMetricsData)
        self._route_metrics: dict[str, StrategyMetricsData] = defaultdict(StrategyMetricsData)
        self._current_strategy: StrategyType = StrategyType.DEFAULT
        self._current_react_mode: ReactMode = ReactMode.NON_REACT

    @staticmethod
    def _now_iso() -> str:
        """Return the current UTC time as a naive ISO-8601 string."""
        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12.
        # Switching to datetime.now(timezone.utc) would append "+00:00" to the
        # emitted timestamps, so confirm consumers before changing the format.
        return datetime.utcnow().isoformat()

    def set_current_strategy(
        self,
        strategy: StrategyType,
        react_mode: ReactMode,
    ) -> None:
        """
        Set current strategy for metrics attribution.

        Args:
            strategy: Current active strategy.
            react_mode: Current react mode.
        """
        self._current_strategy = strategy
        self._current_react_mode = react_mode

    def record_request(
        self,
        latency_ms: float,
        success: bool = True,
        route_mode: str | None = None,
        fallback: bool = False,
        strategy: StrategyType | None = None,
    ) -> None:
        """
        [AC-AISVC-RES-03, AC-AISVC-RES-08] Record a retrieval request.

        Args:
            latency_ms: Request latency in milliseconds.
            success: Whether the request was successful.
            route_mode: Route mode used (direct, react, auto).
            fallback: Whether fallback to default occurred.
            strategy: Strategy used (defaults to current).
        """
        effective_strategy = self._current_strategy if strategy is None else strategy
        key = effective_strategy.value

        metrics = self._metrics[key]
        metrics.total_requests += 1

        if success:
            metrics.successful_requests += 1
        else:
            metrics.failed_requests += 1

        metrics.latency_tracker.record(latency_ms)
        metrics.last_updated = self._now_iso()

        if fallback:
            metrics.fallback_count += 1

        if route_mode:
            # Pass the effective strategy along so the per-strategy route
            # counters are credited to the same strategy as the request.
            self._record_route_metric(
                route_mode, latency_ms, success, strategy=effective_strategy
            )

        logger.debug(
            "[AC-AISVC-RES-08] Request recorded: strategy=%s, "
            "latency=%.2fms, success=%s, route=%s",
            key,
            latency_ms,
            success,
            route_mode,
        )

    def _record_route_metric(
        self,
        route_mode: str,
        latency_ms: float,
        success: bool,
        strategy: StrategyType | None = None,
    ) -> None:
        """
        Record metrics for route mode.

        Args:
            route_mode: Route mode (direct, react, auto).
            latency_ms: Request latency.
            success: Whether successful.
            strategy: Strategy whose route counters are incremented
                (defaults to current).
        """
        metrics = self._route_metrics[route_mode]
        metrics.total_requests += 1

        if success:
            metrics.successful_requests += 1
        else:
            metrics.failed_requests += 1

        metrics.latency_tracker.record(latency_ms)
        metrics.last_updated = self._now_iso()

        owner = self._current_strategy if strategy is None else strategy
        strategy_metrics = self._metrics[owner.value]
        # Route modes other than these three are tracked in _route_metrics
        # only; they have no dedicated per-strategy counter.
        if route_mode == "direct":
            strategy_metrics.direct_route_count += 1
        elif route_mode == "react":
            strategy_metrics.react_route_count += 1
        elif route_mode == "auto":
            strategy_metrics.auto_route_count += 1

    def record_strategy_switch(
        self,
        from_strategy: str,
        to_strategy: str,
    ) -> None:
        """
        Record a strategy switch event.

        Emits a JSON event on the "metrics.strategy" logger and an info line
        on the module logger. No counters are modified.

        Args:
            from_strategy: Previous strategy.
            to_strategy: New strategy.
        """
        metrics_logger = logging.getLogger("metrics.strategy")
        metrics_logger.info(
            json.dumps(
                {
                    "event": "strategy_switch",
                    "from_strategy": from_strategy,
                    "to_strategy": to_strategy,
                    "timestamp": self._now_iso(),
                },
                ensure_ascii=False,
            )
        )

        logger.info(
            "[AC-AISVC-RES-03] Strategy switch recorded: %s -> %s",
            from_strategy,
            to_strategy,
        )

    def record_grayscale_request(
        self,
        tenant_id: str,
        strategy_used: str,
        in_grayscale: bool,
    ) -> None:
        """
        [AC-AISVC-RES-03] Record a grayscale request.

        Emits a JSON event on the "metrics.grayscale" logger. No counters are
        modified.

        Args:
            tenant_id: Tenant ID.
            strategy_used: Strategy used for the request.
            in_grayscale: Whether the request was in grayscale group.
        """
        metrics_logger = logging.getLogger("metrics.grayscale")
        metrics_logger.info(
            json.dumps(
                {
                    "event": "grayscale_request",
                    "tenant_id": tenant_id,
                    "strategy_used": strategy_used,
                    "in_grayscale": in_grayscale,
                    "timestamp": self._now_iso(),
                },
                ensure_ascii=False,
            )
        )

    def get_metrics(self, strategy: StrategyType | None = None) -> StrategyMetrics:
        """
        Get metrics for a specific strategy or current strategy.

        The returned ``react_mode`` is always the service's current react
        mode, regardless of which strategy is requested.

        Args:
            strategy: Strategy to get metrics for (defaults to current).

        Returns:
            StrategyMetrics for the strategy.
        """
        effective_strategy = self._current_strategy if strategy is None else strategy
        key = effective_strategy.value

        # Read with .get() so that querying a strategy which has not recorded
        # anything yet does not insert an empty entry into the defaultdict.
        data = self._metrics.get(key)
        if data is None:
            data = StrategyMetricsData()

        return StrategyMetrics(
            strategy=effective_strategy,
            react_mode=self._current_react_mode,
            total_requests=data.total_requests,
            successful_requests=data.successful_requests,
            failed_requests=data.failed_requests,
            avg_latency_ms=round(data.latency_tracker.get_avg(), 2),
            p99_latency_ms=round(data.latency_tracker.get_percentile(99), 2),
            direct_route_count=data.direct_route_count,
            react_route_count=data.react_route_count,
            auto_route_count=data.auto_route_count,
            fallback_count=data.fallback_count,
            last_updated=data.last_updated,
        )

    def get_all_metrics(self) -> dict[str, StrategyMetrics]:
        """
        Get metrics for all strategies.

        Returns:
            Dictionary of strategy name to metrics, with one entry per
            StrategyType member.
        """
        return {
            strategy.value: self.get_metrics(strategy)
            for strategy in StrategyType
        }

    def get_route_metrics(self) -> dict[str, dict[str, Any]]:
        """
        Get metrics by route mode.

        Returns:
            Dictionary of route mode to metrics. Only route modes that have
            recorded at least one request are present.
        """
        return {
            route_mode: {
                "total_requests": data.total_requests,
                "successful_requests": data.successful_requests,
                "failed_requests": data.failed_requests,
                "avg_latency_ms": round(data.latency_tracker.get_avg(), 2),
                "p99_latency_ms": round(data.latency_tracker.get_percentile(99), 2),
                "last_updated": data.last_updated,
            }
            for route_mode, data in self._route_metrics.items()
        }

    def get_performance_summary(self) -> dict[str, Any]:
        """
        [AC-AISVC-RES-08] Get performance summary for monitoring.

        Returns:
            Performance summary dictionary with overall totals, per-strategy
            figures under "strategies" and per-route figures under "routes".
        """
        all_metrics = self.get_all_metrics()

        total_requests = sum(m.total_requests for m in all_metrics.values())
        total_success = sum(m.successful_requests for m in all_metrics.values())
        total_failed = sum(m.failed_requests for m in all_metrics.values())

        # Overall average is computed over the pooled latency samples, so a
        # strategy with many requests weighs more than one with a handful.
        # Averaging the per-strategy averages would weigh them all equally.
        trackers = [data.latency_tracker for data in self._metrics.values()]
        sample_count = sum(len(tracker.latencies) for tracker in trackers)
        overall_avg_latency = (
            sum(sum(tracker.latencies) for tracker in trackers) / sample_count
            if sample_count
            else 0.0
        )

        # Worst per-strategy p99; strategies without samples report 0 and are
        # ignored.
        overall_p99_latency = max(
            (m.p99_latency_ms for m in all_metrics.values() if m.p99_latency_ms > 0),
            default=0.0,
        )

        return {
            "total_requests": total_requests,
            "successful_requests": total_success,
            "failed_requests": total_failed,
            "success_rate": round(total_success / total_requests, 4) if total_requests > 0 else 0.0,
            "avg_latency_ms": round(overall_avg_latency, 2),
            "p99_latency_ms": round(overall_p99_latency, 2),
            "current_strategy": self._current_strategy.value,
            "current_react_mode": self._current_react_mode.value,
            "strategies": {
                name: {
                    "total_requests": m.total_requests,
                    "success_rate": round(
                        m.successful_requests / m.total_requests, 4
                    )
                    if m.total_requests > 0
                    else 0.0,
                    "avg_latency_ms": m.avg_latency_ms,
                    "p99_latency_ms": m.p99_latency_ms,
                }
                for name, m in all_metrics.items()
            },
            "routes": self.get_route_metrics(),
        }

    def reset_metrics(self, strategy: StrategyType | None = None) -> None:
        """
        Reset metrics for a strategy or all strategies.

        Resetting a single strategy leaves the route-mode metrics untouched;
        resetting everything clears both.

        Args:
            strategy: Strategy to reset (None for all).
        """
        if strategy is not None:
            self._metrics[strategy.value] = StrategyMetricsData()
            logger.info("[AC-AISVC-RES-08] Metrics reset for strategy: %s", strategy.value)
        else:
            self._metrics.clear()
            self._route_metrics.clear()
            logger.info("[AC-AISVC-RES-08] All metrics reset")

    def check_performance_threshold(
        self,
        strategy: StrategyType,
        max_latency_ms: float = 5000.0,
        max_error_rate: float = 0.1,
    ) -> dict[str, Any]:
        """
        [AC-AISVC-RES-08] Check if performance is within acceptable thresholds.

        Args:
            strategy: Strategy to check.
            max_latency_ms: Maximum acceptable average latency.
            max_error_rate: Maximum acceptable error rate (0-1).

        Returns:
            Dictionary with check results.
        """
        metrics = self.get_metrics(strategy)

        latency_ok = metrics.avg_latency_ms <= max_latency_ms
        # A strategy with no traffic is treated as having a 0% error rate.
        error_rate = (
            metrics.failed_requests / metrics.total_requests
            if metrics.total_requests > 0
            else 0.0
        )
        error_rate_ok = error_rate <= max_error_rate
        overall_ok = latency_ok and error_rate_ok

        return {
            "strategy": strategy.value,
            "latency_ok": latency_ok,
            "avg_latency_ms": metrics.avg_latency_ms,
            "max_latency_ms": max_latency_ms,
            "error_rate_ok": error_rate_ok,
            "error_rate": round(error_rate, 4),
            "max_error_rate": max_error_rate,
            "overall_ok": overall_ok,
            "recommendation": (
                "Performance within acceptable thresholds"
                if overall_ok
                else "Consider rollback or investigation"
            ),
        }
|
||
|
|
|
||
|
|
|
||
|
|
class MetricsContext:
|
||
|
|
"""
|
||
|
|
Context manager for timing operations.
|
||
|
|
"""
|
||
|
|
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
metrics_service: StrategyMetricsService,
|
||
|
|
route_mode: str | None = None,
|
||
|
|
strategy: StrategyType | None = None,
|
||
|
|
):
|
||
|
|
self._metrics_service = metrics_service
|
||
|
|
self._route_mode = route_mode
|
||
|
|
self._strategy = strategy
|
||
|
|
self._start_time: float | None = None
|
||
|
|
self._success = True
|
||
|
|
|
||
|
|
def __enter__(self) -> "MetricsContext":
|
||
|
|
self._start_time = time.time()
|
||
|
|
return self
|
||
|
|
|
||
|
|
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
|
||
|
|
if self._start_time is None:
|
||
|
|
return
|
||
|
|
|
||
|
|
latency_ms = (time.time() - self._start_time) * 1000
|
||
|
|
success = exc_type is None
|
||
|
|
|
||
|
|
self._metrics_service.record_request(
|
||
|
|
latency_ms=latency_ms,
|
||
|
|
success=success,
|
||
|
|
route_mode=self._route_mode,
|
||
|
|
strategy=self._strategy,
|
||
|
|
)
|
||
|
|
|
||
|
|
def mark_failed(self) -> None:
|
||
|
|
"""Mark the operation as failed."""
|
||
|
|
self._success = False
|
||
|
|
|
||
|
|
|
||
|
|
# Process-wide service instance, created on first use by get_metrics_service().
_metrics_service: StrategyMetricsService | None = None


def get_metrics_service() -> StrategyMetricsService:
    """
    Return the shared StrategyMetricsService, creating it on first call.

    Returns:
        The module-level StrategyMetricsService instance.
    """
    global _metrics_service

    service = _metrics_service
    if service is not None:
        return service

    service = StrategyMetricsService()
    _metrics_service = service
    return service
|