# ai-robot-core/ai-service/app/services/monitoring/cache.py
"""
Redis cache layer for monitoring data.
[AC-AISVC-91, AC-AISVC-92] Redis-based caching for Dashboard statistics.
"""
import json
import logging
from datetime import datetime, timezone
from typing import Any

import redis.asyncio as redis

from app.core.config import get_settings
logger = logging.getLogger(__name__)
class MonitoringCache:
    """
    [AC-AISVC-91, AC-AISVC-92] Redis cache layer for monitoring data.

    Features:
    - Dashboard stats caching (60s TTL)
    - Incremental counters (90 days TTL)
    - Top N leaderboards

    Every public method is best-effort: Redis failures are logged at
    WARNING level and a neutral value (0 / None / False / []) is returned,
    so callers never crash because the cache is unavailable.
    """

    def __init__(self, redis_client: redis.Redis | None = None):
        """
        Args:
            redis_client: Optional pre-built async client (handy for tests).
                When omitted, a client is created lazily from settings on
                first use.
        """
        self._redis = redis_client
        self._settings = get_settings()
        self._enabled = self._settings.redis_enabled

    async def _get_client(self) -> redis.Redis | None:
        """Return a connected client, or None when caching is disabled or broken."""
        if not self._enabled:
            return None
        if self._redis is None:
            try:
                self._redis = redis.from_url(
                    self._settings.redis_url,
                    encoding="utf-8",
                    decode_responses=True,
                )
            except Exception as e:
                logger.warning("[MonitoringCache] Failed to connect to Redis: %s", e)
                # Disable for the lifetime of this instance so every later
                # call short-circuits instead of retrying (and re-logging).
                self._enabled = False
                return None
        return self._redis

    # ------------------------------------------------------------------
    # Key builders / shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _today() -> str:
        """Today's UTC date as YYYY-MM-DD.

        Uses a timezone-aware now(); datetime.utcnow() is deprecated
        since Python 3.12.
        """
        return datetime.now(timezone.utc).strftime("%Y-%m-%d")

    def _counter_key(self, tenant_id: str, metric: str, date: str | None) -> str:
        """Key format: stats:{tenant_id}:counter:{metric}:{date}."""
        if date is None:
            date = self._today()
        return f"stats:{tenant_id}:counter:{metric}:{date}"

    @staticmethod
    def _dashboard_key(tenant_id: str) -> str:
        """Key format: stats:{tenant_id}:dashboard."""
        return f"stats:{tenant_id}:dashboard"

    @staticmethod
    def _leaderboard_key(tenant_id: str, leaderboard: str) -> str:
        """Key format: stats:{tenant_id}:leaderboard:{name}."""
        return f"stats:{tenant_id}:leaderboard:{leaderboard}"

    async def _incr_and_touch(self, key: str, amount: int) -> int:
        """INCRBY *amount* and refresh the counter TTL.

        The two commands are pipelined into a single round trip so a crash
        between them cannot leave a counter without an expiry.

        Returns:
            New counter value, or 0 on any failure.
        """
        client = await self._get_client()
        if client is None:
            return 0
        try:
            async with client.pipeline(transaction=True) as pipe:
                pipe.incrby(key, amount)
                pipe.expire(key, self._settings.stats_counter_ttl)
                results = await pipe.execute()
            return int(results[0])
        except Exception as e:
            logger.warning("[MonitoringCache] Failed to increment counter: %s", e)
            return 0

    # ------------------------------------------------------------------
    # Counters
    # ------------------------------------------------------------------

    async def incr_counter(
        self,
        tenant_id: str,
        metric: str,
        date: str | None = None,
    ) -> int:
        """
        Increment a counter atomically.

        Key format: stats:{tenant_id}:counter:{metric}:{date}
        TTL: 90 days (7776000 seconds)

        Args:
            tenant_id: Tenant ID for isolation
            metric: Metric name (e.g., 'intent_hit', 'template_use',
                'flow_activate', 'guardrail_block')
            date: Date string (YYYY-MM-DD), defaults to today

        Returns:
            New counter value (0 on failure or when caching is disabled)
        """
        return await self._incr_and_touch(
            self._counter_key(tenant_id, metric, date), 1
        )

    async def incr_counter_by(
        self,
        tenant_id: str,
        metric: str,
        amount: int,
        date: str | None = None,
    ) -> int:
        """
        Increment a counter by a specific amount atomically.

        Key format: stats:{tenant_id}:counter:{metric}:{date}
        TTL: 90 days

        Args:
            tenant_id: Tenant ID for isolation
            metric: Metric name
            amount: Amount to increment by
            date: Date string (YYYY-MM-DD), defaults to today

        Returns:
            New counter value (0 on failure or when caching is disabled)
        """
        return await self._incr_and_touch(
            self._counter_key(tenant_id, metric, date), amount
        )

    async def get_counter(
        self,
        tenant_id: str,
        metric: str,
        date: str | None = None,
    ) -> int:
        """
        Get counter value.

        Args:
            tenant_id: Tenant ID for isolation
            metric: Metric name
            date: Date string (YYYY-MM-DD), defaults to today

        Returns:
            Counter value (0 if not found)
        """
        client = await self._get_client()
        if client is None:
            return 0
        key = self._counter_key(tenant_id, metric, date)
        try:
            value = await client.get(key)
            return int(value) if value else 0
        except Exception as e:
            logger.warning("[MonitoringCache] Failed to get counter: %s", e)
            return 0

    # ------------------------------------------------------------------
    # Dashboard stats
    # ------------------------------------------------------------------

    async def get_dashboard_stats(
        self,
        tenant_id: str,
    ) -> dict[str, Any] | None:
        """
        Get cached Dashboard statistics.

        Key format: stats:{tenant_id}:dashboard
        TTL: 60 seconds

        Args:
            tenant_id: Tenant ID for isolation

        Returns:
            Cached stats dict or None if not found
        """
        client = await self._get_client()
        if client is None:
            return None
        key = self._dashboard_key(tenant_id)
        try:
            data = await client.get(key)
            if data:
                return json.loads(data)
            return None
        except Exception as e:
            logger.warning("[MonitoringCache] Failed to get dashboard stats: %s", e)
            return None

    async def set_dashboard_stats(
        self,
        tenant_id: str,
        stats: dict[str, Any],
    ) -> bool:
        """
        Set Dashboard statistics cache.

        Args:
            tenant_id: Tenant ID for isolation
            stats: Stats dict to cache

        Returns:
            True if successful, False otherwise
        """
        client = await self._get_client()
        if client is None:
            return False
        key = self._dashboard_key(tenant_id)
        try:
            await client.setex(
                key,
                self._settings.dashboard_cache_ttl,
                # default=str serializes datetimes (and anything else
                # non-JSON-native) as their string form.
                json.dumps(stats, default=str),
            )
            return True
        except Exception as e:
            logger.warning("[MonitoringCache] Failed to set dashboard stats: %s", e)
            return False

    async def invalidate_dashboard_stats(
        self,
        tenant_id: str,
    ) -> bool:
        """
        Invalidate Dashboard statistics cache.

        Args:
            tenant_id: Tenant ID for isolation

        Returns:
            True if successful, False otherwise
        """
        client = await self._get_client()
        if client is None:
            return False
        key = self._dashboard_key(tenant_id)
        try:
            await client.delete(key)
            return True
        except Exception as e:
            logger.warning("[MonitoringCache] Failed to invalidate dashboard stats: %s", e)
            return False

    # ------------------------------------------------------------------
    # Leaderboards (Redis sorted sets)
    # ------------------------------------------------------------------

    async def add_to_leaderboard(
        self,
        tenant_id: str,
        leaderboard: str,
        member: str,
        score: float,
    ) -> bool:
        """
        Add/update a member in a leaderboard (sorted set).

        Key format: stats:{tenant_id}:leaderboard:{name}

        Args:
            tenant_id: Tenant ID for isolation
            leaderboard: Leaderboard name (e.g., 'intent_rules', 'templates', 'flows')
            member: Member identifier
            score: Score value

        Returns:
            True if successful
        """
        client = await self._get_client()
        if client is None:
            return False
        key = self._leaderboard_key(tenant_id, leaderboard)
        try:
            async with client.pipeline(transaction=True) as pipe:
                pipe.zadd(key, {member: score})
                pipe.expire(key, self._settings.stats_counter_ttl)
                await pipe.execute()
            return True
        except Exception as e:
            logger.warning("[MonitoringCache] Failed to add to leaderboard: %s", e)
            return False

    async def get_leaderboard(
        self,
        tenant_id: str,
        leaderboard: str,
        limit: int = 5,
        desc: bool = True,
    ) -> list[tuple[str, float]]:
        """
        Get top N members from a leaderboard.

        Args:
            tenant_id: Tenant ID for isolation
            leaderboard: Leaderboard name
            limit: Maximum number of results
            desc: Sort descending (highest first)

        Returns:
            List of (member, score) tuples

        Note:
            Only members with score >= 0 are returned (range lower bound
            is 0), which matches how counters populate these sets.
        """
        client = await self._get_client()
        if client is None:
            return []
        key = self._leaderboard_key(tenant_id, leaderboard)
        try:
            range_fn = client.zrevrangebyscore if desc else client.zrangebyscore
            return await range_fn(
                key,
                min=0,
                max=float("inf"),
                start=0,
                num=limit,
                withscores=True,
            )
        except Exception as e:
            logger.warning("[MonitoringCache] Failed to get leaderboard: %s", e)
            return []

    async def incr_leaderboard_member(
        self,
        tenant_id: str,
        leaderboard: str,
        member: str,
        increment: float = 1.0,
    ) -> bool:
        """
        Increment a member's score in a leaderboard.

        Args:
            tenant_id: Tenant ID for isolation
            leaderboard: Leaderboard name
            member: Member identifier
            increment: Amount to increment

        Returns:
            True if successful
        """
        client = await self._get_client()
        if client is None:
            return False
        key = self._leaderboard_key(tenant_id, leaderboard)
        try:
            async with client.pipeline(transaction=True) as pipe:
                pipe.zincrby(key, increment, member)
                pipe.expire(key, self._settings.stats_counter_ttl)
                await pipe.execute()
            return True
        except Exception as e:
            logger.warning("[MonitoringCache] Failed to incr leaderboard member: %s", e)
            return False

    async def close(self) -> None:
        """Close the Redis connection if one was ever established."""
        if self._redis:
            await self._redis.close()
# Process-wide cache instance, created lazily by get_monitoring_cache().
_monitoring_cache: MonitoringCache | None = None


def get_monitoring_cache() -> MonitoringCache:
    """Return the process-wide MonitoringCache, creating it on first use."""
    global _monitoring_cache
    if _monitoring_cache is not None:
        return _monitoring_cache
    _monitoring_cache = MonitoringCache()
    return _monitoring_cache