"""
Strategy Metrics Service for AI Service.
[AC-AISVC-RES-03, AC-AISVC-RES-08] Metrics collection for strategy operations.
"""

import json
import logging
import time
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any

from app.schemas.retrieval_strategy import (
    ReactMode,
    StrategyMetrics,
    StrategyType,
)

logger = logging.getLogger(__name__)

# Route modes that have a dedicated per-strategy counter on StrategyMetricsData.
_COUNTED_ROUTE_MODES = ("direct", "react", "auto")


def _utc_now_iso() -> str:
    """
    Return the current UTC time as a naive ISO-8601 string.

    Emits the same format as ``datetime.utcnow().isoformat()`` (no UTC offset
    suffix) without calling the deprecated ``utcnow``.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()


@dataclass
class LatencyTracker:
    """
    Latency tracking for a single operation.

    Keeps a bounded list of latency samples; once ``max_samples`` is reached
    the oldest half is discarded before the next sample is stored.
    """

    latencies: list[float] = field(default_factory=list)
    max_samples: int = 1000

    def record(self, latency_ms: float) -> None:
        """
        Record a latency sample.

        Args:
            latency_ms: Latency in milliseconds.
        """
        if len(self.latencies) >= self.max_samples:
            # Keep the newest floor(max_samples / 2) samples. Computing the
            # count explicitly avoids ``-max_samples // 2`` (which rounds away
            # from zero) and the ``[-0:]`` slice that would keep everything.
            keep = self.max_samples // 2
            self.latencies = self.latencies[-keep:] if keep > 0 else []
        self.latencies.append(latency_ms)

    def get_percentile(self, percentile: float) -> float:
        """
        Get latency at given percentile.

        The sample is picked at index ``int(n * percentile / 100)`` of the
        sorted samples, clamped to the valid index range.

        Args:
            percentile: Percentile in the 0-100 range.

        Returns:
            The selected latency, or 0.0 when no samples are recorded.
        """
        if not self.latencies:
            return 0.0

        sorted_latencies = sorted(self.latencies)
        last_index = len(sorted_latencies) - 1
        index = int(len(sorted_latencies) * percentile / 100)
        # Clamp both ends: >100 would overflow, <0 would wrap to the maximum.
        index = max(0, min(index, last_index))
        return sorted_latencies[index]

    def get_avg(self) -> float:
        """
        Get average latency.

        Returns:
            Mean of the retained samples, or 0.0 when there are none.
        """
        if not self.latencies:
            return 0.0
        return sum(self.latencies) / len(self.latencies)


@dataclass
class StrategyMetricsData:
    """
    Internal metrics data structure.

    Holds request counters, a latency tracker, route counters, the fallback
    counter and the timestamp of the last update for one metrics key.
    """

    total_requests: int = 0
    successful_requests: int = 0
    failed_requests: int = 0
    latency_tracker: LatencyTracker = field(default_factory=LatencyTracker)
    direct_route_count: int = 0
    react_route_count: int = 0
    auto_route_count: int = 0
    fallback_count: int = 0
    last_updated: str | None = None


class StrategyMetricsService:
    """
    [AC-AISVC-RES-03, AC-AISVC-RES-08]
    Metrics service for strategy operations.

    Features:
    - Request counting by strategy and route mode
    - Latency tracking with percentiles
    - Fallback and error tracking
    - Metrics export for monitoring
    """

    def __init__(self) -> None:
        # Per-strategy metrics, keyed by ``StrategyType.value``.
        self._metrics: dict[str, StrategyMetricsData] = defaultdict(StrategyMetricsData)
        # Per-route-mode metrics, keyed by the route mode string.
        self._route_metrics: dict[str, StrategyMetricsData] = defaultdict(StrategyMetricsData)
        self._current_strategy: StrategyType = StrategyType.DEFAULT
        self._current_react_mode: ReactMode = ReactMode.NON_REACT

    def set_current_strategy(
        self,
        strategy: StrategyType,
        react_mode: ReactMode,
    ) -> None:
        """
        Set current strategy for metrics attribution.

        Args:
            strategy: Current active strategy.
            react_mode: Current react mode.
        """
        self._current_strategy = strategy
        self._current_react_mode = react_mode

    def record_request(
        self,
        latency_ms: float,
        success: bool = True,
        route_mode: str | None = None,
        fallback: bool = False,
        strategy: StrategyType | None = None,
    ) -> None:
        """
        [AC-AISVC-RES-03, AC-AISVC-RES-08]
        Record a retrieval request.

        Args:
            latency_ms: Request latency in milliseconds.
            success: Whether the request was successful.
            route_mode: Route mode used (direct, react, auto).
            fallback: Whether fallback to default occurred.
            strategy: Strategy used (defaults to current).
        """
        effective_strategy = strategy if strategy is not None else self._current_strategy
        key = effective_strategy.value

        metrics = self._metrics[key]
        metrics.total_requests += 1
        if success:
            metrics.successful_requests += 1
        else:
            metrics.failed_requests += 1

        metrics.latency_tracker.record(latency_ms)
        metrics.last_updated = _utc_now_iso()

        if fallback:
            metrics.fallback_count += 1

        if route_mode:
            # Pass the effective key so route counters land on the same
            # strategy that received the request counters above.
            self._record_route_metric(route_mode, latency_ms, success, strategy_key=key)

        logger.debug(
            "[AC-AISVC-RES-08] Request recorded: strategy=%s, "
            "latency=%.2fms, success=%s, route=%s",
            key,
            latency_ms,
            success,
            route_mode,
        )

    def _record_route_metric(
        self,
        route_mode: str,
        latency_ms: float,
        success: bool,
        strategy_key: str | None = None,
    ) -> None:
        """
        Record metrics for route mode.

        Every route mode gets an entry in the per-route metrics; only the
        modes "direct", "react" and "auto" also bump a per-strategy counter.

        Args:
            route_mode: Route mode (direct, react, auto).
            latency_ms: Request latency.
            success: Whether successful.
            strategy_key: Metrics key of the strategy whose route counter is
                incremented (defaults to the current strategy).
        """
        metrics = self._route_metrics[route_mode]
        metrics.total_requests += 1
        if success:
            metrics.successful_requests += 1
        else:
            metrics.failed_requests += 1

        metrics.latency_tracker.record(latency_ms)
        metrics.last_updated = _utc_now_iso()

        if route_mode not in _COUNTED_ROUTE_MODES:
            return

        owner_key = strategy_key if strategy_key is not None else self._current_strategy.value
        strategy_metrics = self._metrics[owner_key]
        if route_mode == "direct":
            strategy_metrics.direct_route_count += 1
        elif route_mode == "react":
            strategy_metrics.react_route_count += 1
        else:
            strategy_metrics.auto_route_count += 1

    def record_strategy_switch(
        self,
        from_strategy: str,
        to_strategy: str,
    ) -> None:
        """
        Record a strategy switch event.

        Emits a JSON event on the "metrics.strategy" logger and an info line
        on the module logger; no counters are changed.

        Args:
            from_strategy: Previous strategy.
            to_strategy: New strategy.
        """
        metrics_logger = logging.getLogger("metrics.strategy")
        metrics_logger.info(
            json.dumps(
                {
                    "event": "strategy_switch",
                    "from_strategy": from_strategy,
                    "to_strategy": to_strategy,
                    "timestamp": _utc_now_iso(),
                },
                ensure_ascii=False,
            )
        )

        logger.info(
            "[AC-AISVC-RES-03] Strategy switch recorded: %s -> %s",
            from_strategy,
            to_strategy,
        )

    def record_grayscale_request(
        self,
        tenant_id: str,
        strategy_used: str,
        in_grayscale: bool,
    ) -> None:
        """
        [AC-AISVC-RES-03]
        Record a grayscale request.

        Emits a JSON event on the "metrics.grayscale" logger; no counters are
        changed.

        Args:
            tenant_id: Tenant ID.
            strategy_used: Strategy used for the request.
            in_grayscale: Whether the request was in grayscale group.
        """
        metrics_logger = logging.getLogger("metrics.grayscale")
        metrics_logger.info(
            json.dumps(
                {
                    "event": "grayscale_request",
                    "tenant_id": tenant_id,
                    "strategy_used": strategy_used,
                    "in_grayscale": in_grayscale,
                    "timestamp": _utc_now_iso(),
                },
                ensure_ascii=False,
            )
        )

    def get_metrics(self, strategy: StrategyType | None = None) -> StrategyMetrics:
        """
        Get metrics for a specific strategy or current strategy.

        Args:
            strategy: Strategy to get metrics for (defaults to current).

        Returns:
            StrategyMetrics for the strategy. The ``react_mode`` field always
            reflects the service's current react mode.
        """
        effective_strategy = strategy if strategy is not None else self._current_strategy
        key = effective_strategy.value

        # Use .get so a read does not insert an empty entry into the
        # defaultdict; an unseen strategy reports all-zero metrics.
        data = self._metrics.get(key)
        if data is None:
            data = StrategyMetricsData()

        return StrategyMetrics(
            strategy=effective_strategy,
            react_mode=self._current_react_mode,
            total_requests=data.total_requests,
            successful_requests=data.successful_requests,
            failed_requests=data.failed_requests,
            avg_latency_ms=round(data.latency_tracker.get_avg(), 2),
            p99_latency_ms=round(data.latency_tracker.get_percentile(99), 2),
            direct_route_count=data.direct_route_count,
            react_route_count=data.react_route_count,
            auto_route_count=data.auto_route_count,
            fallback_count=data.fallback_count,
            last_updated=data.last_updated,
        )

    def get_all_metrics(self) -> dict[str, StrategyMetrics]:
        """
        Get metrics for all strategies.

        Returns:
            Dictionary of strategy name to metrics, one entry per
            ``StrategyType`` member.
        """
        return {strategy.value: self.get_metrics(strategy) for strategy in StrategyType}

    def get_route_metrics(self) -> dict[str, dict[str, Any]]:
        """
        Get metrics by route mode.

        Returns:
            Dictionary of route mode to metrics.
        """
        return {
            route_mode: {
                "total_requests": data.total_requests,
                "successful_requests": data.successful_requests,
                "failed_requests": data.failed_requests,
                "avg_latency_ms": round(data.latency_tracker.get_avg(), 2),
                "p99_latency_ms": round(data.latency_tracker.get_percentile(99), 2),
                "last_updated": data.last_updated,
            }
            for route_mode, data in self._route_metrics.items()
        }

    def get_performance_summary(self) -> dict[str, Any]:
        """
        [AC-AISVC-RES-08]
        Get performance summary for monitoring.

        The overall average latency is the unweighted mean of the non-zero
        per-strategy averages; the overall p99 is the largest per-strategy
        p99.

        Returns:
            Performance summary dictionary.
        """
        all_metrics = self.get_all_metrics()

        total_requests = sum(m.total_requests for m in all_metrics.values())
        total_success = sum(m.successful_requests for m in all_metrics.values())
        total_failed = sum(m.failed_requests for m in all_metrics.values())

        avg_latencies = [
            m.avg_latency_ms for m in all_metrics.values() if m.avg_latency_ms > 0
        ]
        overall_avg_latency = (
            sum(avg_latencies) / len(avg_latencies) if avg_latencies else 0.0
        )

        p99_latencies = [
            m.p99_latency_ms for m in all_metrics.values() if m.p99_latency_ms > 0
        ]
        overall_p99_latency = max(p99_latencies) if p99_latencies else 0.0

        return {
            "total_requests": total_requests,
            "successful_requests": total_success,
            "failed_requests": total_failed,
            "success_rate": (
                round(total_success / total_requests, 4) if total_requests > 0 else 0.0
            ),
            "avg_latency_ms": round(overall_avg_latency, 2),
            "p99_latency_ms": round(overall_p99_latency, 2),
            "current_strategy": self._current_strategy.value,
            "current_react_mode": self._current_react_mode.value,
            "strategies": {
                name: {
                    "total_requests": m.total_requests,
                    "success_rate": (
                        round(m.successful_requests / m.total_requests, 4)
                        if m.total_requests > 0
                        else 0.0
                    ),
                    "avg_latency_ms": m.avg_latency_ms,
                    "p99_latency_ms": m.p99_latency_ms,
                }
                for name, m in all_metrics.items()
            },
            "routes": self.get_route_metrics(),
        }

    def reset_metrics(self, strategy: StrategyType | None = None) -> None:
        """
        Reset metrics for a strategy or all strategies.

        Resetting a single strategy leaves the per-route metrics untouched;
        resetting everything clears both per-strategy and per-route metrics.

        Args:
            strategy: Strategy to reset (None for all).
        """
        if strategy is not None:
            self._metrics[strategy.value] = StrategyMetricsData()
            logger.info(
                "[AC-AISVC-RES-08] Metrics reset for strategy: %s", strategy.value
            )
        else:
            self._metrics.clear()
            self._route_metrics.clear()
            logger.info("[AC-AISVC-RES-08] All metrics reset")

    def check_performance_threshold(
        self,
        strategy: StrategyType,
        max_latency_ms: float = 5000.0,
        max_error_rate: float = 0.1,
    ) -> dict[str, Any]:
        """
        [AC-AISVC-RES-08]
        Check if performance is within acceptable thresholds.

        Args:
            strategy: Strategy to check.
            max_latency_ms: Maximum acceptable average latency.
            max_error_rate: Maximum acceptable error rate (0-1).

        Returns:
            Dictionary with check results.
        """
        metrics = self.get_metrics(strategy)

        latency_ok = metrics.avg_latency_ms <= max_latency_ms

        # A strategy with no traffic is treated as having a 0.0 error rate.
        error_rate = (
            metrics.failed_requests / metrics.total_requests
            if metrics.total_requests > 0
            else 0.0
        )
        error_rate_ok = error_rate <= max_error_rate
        overall_ok = latency_ok and error_rate_ok

        return {
            "strategy": strategy.value,
            "latency_ok": latency_ok,
            "avg_latency_ms": metrics.avg_latency_ms,
            "max_latency_ms": max_latency_ms,
            "error_rate_ok": error_rate_ok,
            "error_rate": round(error_rate, 4),
            "max_error_rate": max_error_rate,
            "overall_ok": overall_ok,
            "recommendation": (
                "Performance within acceptable thresholds"
                if overall_ok
                else "Consider rollback or investigation"
            ),
        }


class MetricsContext:
    """
    Context manager for timing operations.

    On exit the elapsed time is reported to the metrics service. The request
    counts as failed when the body raised or ``mark_failed`` was called.
    Exceptions raised inside the ``with`` body are not suppressed.
    """

    def __init__(
        self,
        metrics_service: StrategyMetricsService,
        route_mode: str | None = None,
        strategy: StrategyType | None = None,
    ):
        self._metrics_service = metrics_service
        self._route_mode = route_mode
        self._strategy = strategy
        self._start_time: float | None = None
        self._success = True

    def __enter__(self) -> "MetricsContext":
        # perf_counter is monotonic, so wall-clock adjustments cannot produce
        # negative or inflated latencies.
        self._start_time = time.perf_counter()
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        if self._start_time is None:
            return

        latency_ms = (time.perf_counter() - self._start_time) * 1000
        # Honour an explicit mark_failed() as well as a raised exception.
        success = self._success and exc_type is None

        self._metrics_service.record_request(
            latency_ms=latency_ms,
            success=success,
            route_mode=self._route_mode,
            strategy=self._strategy,
        )

    def mark_failed(self) -> None:
        """Mark the operation as failed."""
        self._success = False


_metrics_service: StrategyMetricsService | None = None


def get_metrics_service() -> StrategyMetricsService:
    """Get or create StrategyMetricsService instance."""
    global _metrics_service
    if _metrics_service is None:
        _metrics_service = StrategyMetricsService()
    return _metrics_service