ai-robot-core/ai-service/app/services/retrieval/strategy_metrics.py

453 lines
14 KiB
Python
Raw Normal View History

"""
Strategy Metrics Service for AI Service.
[AC-AISVC-RES-03, AC-AISVC-RES-08] Metrics collection for strategy operations.
"""
import json
import logging
import time
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any

from app.schemas.retrieval_strategy import (
    ReactMode,
    StrategyMetrics,
    StrategyType,
)
logger = logging.getLogger(__name__)
@dataclass
class LatencyTracker:
    """
    Bounded collection of latency samples for a single operation.

    Samples are stored in arrival order in ``latencies``. When the buffer
    already holds ``max_samples`` entries, the oldest half is dropped before
    the next sample is stored, so the statistics describe recent samples
    only.
    """

    latencies: list[float] = field(default_factory=list)
    max_samples: int = 1000

    def record(self, latency_ms: float) -> None:
        """
        Record a latency sample.

        Args:
            latency_ms: Observed latency in milliseconds.
        """
        # A non-positive limit would otherwise disable trimming altogether
        # and let the list grow without bound; always allow one sample.
        limit = max(self.max_samples, 1)
        if len(self.latencies) >= limit:
            # Keep the newest half (rounded up), but never so many that the
            # buffer exceeds the limit once the new sample is appended.
            keep = min((limit + 1) // 2, limit - 1)
            self.latencies = self.latencies[len(self.latencies) - keep:]
        self.latencies.append(latency_ms)

    def get_percentile(self, percentile: float) -> float:
        """
        Get latency at given percentile.

        Args:
            percentile: Requested percentile; values outside 0-100 are
                clamped to that range.

        Returns:
            The sample at the requested rank, or 0.0 when no samples exist.
        """
        if not self.latencies:
            return 0.0
        ordered = sorted(self.latencies)
        # Without the lower clamp a negative percentile truncates to a
        # negative index, which wraps around and returns the largest sample.
        bounded = min(max(percentile, 0.0), 100.0)
        index = min(int(len(ordered) * bounded / 100), len(ordered) - 1)
        return ordered[index]

    def get_avg(self) -> float:
        """
        Get average latency.

        Returns:
            Arithmetic mean of the retained samples, or 0.0 when empty.
        """
        if not self.latencies:
            return 0.0
        return sum(self.latencies) / len(self.latencies)
@dataclass
class StrategyMetricsData:
    """
    Mutable counters for one metrics bucket.

    One instance accumulates the request outcomes, latency samples and
    route/fallback counts that belong to a single key. Every field has a
    zero/empty default, so a freshly constructed instance represents
    "nothing recorded yet".
    """

    # Request outcome counters.
    total_requests: int = 0
    successful_requests: int = 0
    failed_requests: int = 0

    # Latency samples; each instance gets its own tracker.
    latency_tracker: LatencyTracker = field(default_factory=LatencyTracker)

    # How many requests went through each route mode.
    direct_route_count: int = 0
    react_route_count: int = 0
    auto_route_count: int = 0

    # How many requests fell back to the default behaviour.
    fallback_count: int = 0

    # Timestamp string of the most recent update; None until one is recorded.
    last_updated: str | None = None
class StrategyMetricsService:
    """
    [AC-AISVC-RES-03, AC-AISVC-RES-08] Metrics service for strategy operations.

    Features:
    - Request counting by strategy and route mode
    - Latency tracking with percentiles
    - Fallback and error tracking
    - Metrics export for monitoring

    Counters are kept in two per-instance dictionaries: one keyed by
    ``StrategyType.value`` and one keyed by the route mode string.
    """

    # Maps a known route mode to the per-strategy counter it increments.
    _ROUTE_COUNTER_FIELDS = {
        "direct": "direct_route_count",
        "react": "react_route_count",
        "auto": "auto_route_count",
    }

    def __init__(self) -> None:
        # Per-strategy buckets, created lazily on first access.
        self._metrics: dict[str, StrategyMetricsData] = defaultdict(StrategyMetricsData)
        # Per-route-mode buckets, created lazily on first access.
        self._route_metrics: dict[str, StrategyMetricsData] = defaultdict(StrategyMetricsData)
        self._current_strategy: StrategyType = StrategyType.DEFAULT
        self._current_react_mode: ReactMode = ReactMode.NON_REACT

    @staticmethod
    def _utc_timestamp() -> str:
        """Return the current UTC time as a naive ISO-8601 string."""
        # ``datetime.utcnow()`` is deprecated since Python 3.12. Dropping the
        # tzinfo keeps the emitted string format identical to what it produced.
        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    def _apply_sample(
        self,
        data: StrategyMetricsData,
        latency_ms: float,
        success: bool,
    ) -> None:
        """Fold one request outcome and latency sample into a bucket."""
        data.total_requests += 1
        if success:
            data.successful_requests += 1
        else:
            data.failed_requests += 1
        data.latency_tracker.record(latency_ms)
        data.last_updated = self._utc_timestamp()

    def set_current_strategy(
        self,
        strategy: StrategyType,
        react_mode: ReactMode,
    ) -> None:
        """
        Set current strategy for metrics attribution.

        Args:
            strategy: Current active strategy.
            react_mode: Current react mode.
        """
        self._current_strategy = strategy
        self._current_react_mode = react_mode

    def record_request(
        self,
        latency_ms: float,
        success: bool = True,
        route_mode: str | None = None,
        fallback: bool = False,
        strategy: StrategyType | None = None,
    ) -> None:
        """
        [AC-AISVC-RES-03, AC-AISVC-RES-08] Record a retrieval request.

        Args:
            latency_ms: Request latency in milliseconds.
            success: Whether the request was successful.
            route_mode: Route mode used (direct, react, auto).
            fallback: Whether fallback to default occurred.
            strategy: Strategy used (defaults to current).
        """
        effective_strategy = strategy or self._current_strategy
        key = effective_strategy.value
        metrics = self._metrics[key]
        self._apply_sample(metrics, latency_ms, success)
        if fallback:
            metrics.fallback_count += 1
        if route_mode:
            # Pass the effective key so the route counter is credited to the
            # same strategy as the request totals above, not to whichever
            # strategy happens to be current.
            self._record_route_metric(route_mode, latency_ms, success, strategy_key=key)
        logger.debug(
            "[AC-AISVC-RES-08] Request recorded: strategy=%s, "
            "latency=%.2fms, success=%s, route=%s",
            key,
            latency_ms,
            success,
            route_mode,
        )

    def _record_route_metric(
        self,
        route_mode: str,
        latency_ms: float,
        success: bool,
        strategy_key: str | None = None,
    ) -> None:
        """
        Record metrics for route mode.

        Args:
            route_mode: Route mode (direct, react, auto).
            latency_ms: Request latency.
            success: Whether successful.
            strategy_key: Key of the strategy bucket whose route counter is
                incremented (defaults to the current strategy).
        """
        self._apply_sample(self._route_metrics[route_mode], latency_ms, success)
        counter_field = self._ROUTE_COUNTER_FIELDS.get(route_mode)
        if counter_field is None:
            # Unknown route modes are tracked per route only.
            return
        owner_key = strategy_key if strategy_key is not None else self._current_strategy.value
        owner = self._metrics[owner_key]
        setattr(owner, counter_field, getattr(owner, counter_field) + 1)

    def record_strategy_switch(
        self,
        from_strategy: str,
        to_strategy: str,
    ) -> None:
        """
        Record a strategy switch event.

        The event is emitted as a JSON line on the ``metrics.strategy``
        logger and as a readable message on the module logger; no in-memory
        counter is changed.

        Args:
            from_strategy: Previous strategy.
            to_strategy: New strategy.
        """
        metrics_logger = logging.getLogger("metrics.strategy")
        metrics_logger.info(
            json.dumps(
                {
                    "event": "strategy_switch",
                    "from_strategy": from_strategy,
                    "to_strategy": to_strategy,
                    "timestamp": self._utc_timestamp(),
                },
                ensure_ascii=False,
            )
        )
        logger.info(
            "[AC-AISVC-RES-03] Strategy switch recorded: %s -> %s",
            from_strategy,
            to_strategy,
        )

    def record_grayscale_request(
        self,
        tenant_id: str,
        strategy_used: str,
        in_grayscale: bool,
    ) -> None:
        """
        [AC-AISVC-RES-03] Record a grayscale request.

        The event is emitted as a JSON line on the ``metrics.grayscale``
        logger; no in-memory counter is changed.

        Args:
            tenant_id: Tenant ID.
            strategy_used: Strategy used for the request.
            in_grayscale: Whether the request was in grayscale group.
        """
        metrics_logger = logging.getLogger("metrics.grayscale")
        metrics_logger.info(
            json.dumps(
                {
                    "event": "grayscale_request",
                    "tenant_id": tenant_id,
                    "strategy_used": strategy_used,
                    "in_grayscale": in_grayscale,
                    "timestamp": self._utc_timestamp(),
                },
                ensure_ascii=False,
            )
        )

    def get_metrics(self, strategy: StrategyType | None = None) -> StrategyMetrics:
        """
        Get metrics for a specific strategy or current strategy.

        Args:
            strategy: Strategy to get metrics for (defaults to current).

        Returns:
            StrategyMetrics for the strategy. ``react_mode`` always reflects
            the currently configured react mode.
        """
        effective_strategy = strategy or self._current_strategy
        data = self._metrics[effective_strategy.value]
        tracker = data.latency_tracker
        return StrategyMetrics(
            strategy=effective_strategy,
            react_mode=self._current_react_mode,
            total_requests=data.total_requests,
            successful_requests=data.successful_requests,
            failed_requests=data.failed_requests,
            avg_latency_ms=round(tracker.get_avg(), 2),
            p99_latency_ms=round(tracker.get_percentile(99), 2),
            direct_route_count=data.direct_route_count,
            react_route_count=data.react_route_count,
            auto_route_count=data.auto_route_count,
            fallback_count=data.fallback_count,
            last_updated=data.last_updated,
        )

    def get_all_metrics(self) -> dict[str, StrategyMetrics]:
        """
        Get metrics for all strategies.

        Returns:
            Dictionary of strategy name to metrics, one entry per
            ``StrategyType`` member.
        """
        return {strategy.value: self.get_metrics(strategy) for strategy in StrategyType}

    def get_route_metrics(self) -> dict[str, dict[str, Any]]:
        """
        Get metrics by route mode.

        Returns:
            Dictionary of route mode to metrics. Only route modes that have
            been recorded appear.
        """
        return {
            route_mode: {
                "total_requests": data.total_requests,
                "successful_requests": data.successful_requests,
                "failed_requests": data.failed_requests,
                "avg_latency_ms": round(data.latency_tracker.get_avg(), 2),
                "p99_latency_ms": round(data.latency_tracker.get_percentile(99), 2),
                "last_updated": data.last_updated,
            }
            for route_mode, data in self._route_metrics.items()
        }

    def get_performance_summary(self) -> dict[str, Any]:
        """
        [AC-AISVC-RES-08] Get performance summary for monitoring.

        Returns:
            Performance summary dictionary with overall totals, per-strategy
            figures under ``strategies`` and per-route figures under
            ``routes``.
        """
        all_metrics = self.get_all_metrics()
        snapshots = list(all_metrics.values())
        total_requests = sum(m.total_requests for m in snapshots)
        total_success = sum(m.successful_requests for m in snapshots)
        total_failed = sum(m.failed_requests for m in snapshots)

        # NOTE(review): this is an unweighted mean of the per-strategy
        # averages (strategies with a zero average are skipped), not a
        # request-weighted average - confirm this is what dashboards expect.
        avg_latencies = [m.avg_latency_ms for m in snapshots if m.avg_latency_ms > 0]
        overall_avg_latency = (
            sum(avg_latencies) / len(avg_latencies) if avg_latencies else 0.0
        )
        # The overall p99 is reported as the worst per-strategy p99.
        overall_p99_latency = max(
            (m.p99_latency_ms for m in snapshots if m.p99_latency_ms > 0),
            default=0.0,
        )
        success_rate = (
            round(total_success / total_requests, 4) if total_requests > 0 else 0.0
        )

        strategies = {}
        for name, m in all_metrics.items():
            strategies[name] = {
                "total_requests": m.total_requests,
                "success_rate": (
                    round(m.successful_requests / m.total_requests, 4)
                    if m.total_requests > 0
                    else 0.0
                ),
                "avg_latency_ms": m.avg_latency_ms,
                "p99_latency_ms": m.p99_latency_ms,
            }

        return {
            "total_requests": total_requests,
            "successful_requests": total_success,
            "failed_requests": total_failed,
            "success_rate": success_rate,
            "avg_latency_ms": round(overall_avg_latency, 2),
            "p99_latency_ms": round(overall_p99_latency, 2),
            "current_strategy": self._current_strategy.value,
            "current_react_mode": self._current_react_mode.value,
            "strategies": strategies,
            "routes": self.get_route_metrics(),
        }

    def reset_metrics(self, strategy: StrategyType | None = None) -> None:
        """
        Reset metrics for a strategy or all strategies.

        Resetting a single strategy leaves the per-route metrics untouched;
        resetting everything clears both the strategy and route buckets.

        Args:
            strategy: Strategy to reset (None for all).
        """
        if strategy:
            self._metrics[strategy.value] = StrategyMetricsData()
            logger.info("[AC-AISVC-RES-08] Metrics reset for strategy: %s", strategy.value)
        else:
            self._metrics.clear()
            self._route_metrics.clear()
            logger.info("[AC-AISVC-RES-08] All metrics reset")

    def check_performance_threshold(
        self,
        strategy: StrategyType,
        max_latency_ms: float = 5000.0,
        max_error_rate: float = 0.1,
    ) -> dict[str, Any]:
        """
        [AC-AISVC-RES-08] Check if performance is within acceptable thresholds.

        Args:
            strategy: Strategy to check.
            max_latency_ms: Maximum acceptable average latency.
            max_error_rate: Maximum acceptable error rate (0-1).

        Returns:
            Dictionary with check results. A strategy with no recorded
            requests has an error rate of 0.0.
        """
        metrics = self.get_metrics(strategy)
        latency_ok = metrics.avg_latency_ms <= max_latency_ms
        if metrics.total_requests > 0:
            error_rate = metrics.failed_requests / metrics.total_requests
        else:
            error_rate = 0.0
        error_rate_ok = error_rate <= max_error_rate
        overall_ok = latency_ok and error_rate_ok
        return {
            "strategy": strategy.value,
            "latency_ok": latency_ok,
            "avg_latency_ms": metrics.avg_latency_ms,
            "max_latency_ms": max_latency_ms,
            "error_rate_ok": error_rate_ok,
            "error_rate": round(error_rate, 4),
            "max_error_rate": max_error_rate,
            "overall_ok": overall_ok,
            "recommendation": (
                "Performance within acceptable thresholds"
                if overall_ok
                else "Consider rollback or investigation"
            ),
        }
class MetricsContext:
    """
    Context manager for timing operations.

    On exit, the elapsed time of the ``with`` body is reported to the
    metrics service as a single request. The request counts as failed when
    the body raised or when ``mark_failed()`` was called. Exceptions are
    never suppressed.
    """

    def __init__(
        self,
        metrics_service: StrategyMetricsService,
        route_mode: str | None = None,
        strategy: StrategyType | None = None,
    ):
        """
        Args:
            metrics_service: Service that receives the recorded request.
            route_mode: Route mode forwarded to ``record_request``.
            strategy: Strategy forwarded to ``record_request``.
        """
        self._metrics_service = metrics_service
        self._route_mode = route_mode
        self._strategy = strategy
        self._start_time: float | None = None
        self._success = True

    def __enter__(self) -> "MetricsContext":
        # perf_counter is monotonic, so the measured duration is not skewed
        # by wall-clock adjustments the way time.time() differences are.
        self._start_time = time.perf_counter()
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        if self._start_time is None:
            # Never entered; nothing to report.
            return
        latency_ms = (time.perf_counter() - self._start_time) * 1000
        # Honour an explicit mark_failed() as well as a raised exception.
        success = self._success and exc_type is None
        self._metrics_service.record_request(
            latency_ms=latency_ms,
            success=success,
            route_mode=self._route_mode,
            strategy=self._strategy,
        )

    def mark_failed(self) -> None:
        """Mark the operation as failed."""
        self._success = False
# Module-level shared instance; stays None until the first accessor call.
_metrics_service: StrategyMetricsService | None = None


def get_metrics_service() -> StrategyMetricsService:
    """
    Return the shared StrategyMetricsService, creating it on first use.

    Returns:
        The module-level StrategyMetricsService instance.
    """
    global _metrics_service
    if _metrics_service is not None:
        return _metrics_service
    _metrics_service = StrategyMetricsService()
    return _metrics_service