""" System Monitoring - Track system resources and performance over time """ import psutil import time from datetime import datetime, timedelta from typing import Dict, List, Optional from collections import deque from dataclasses import dataclass, field import logging logger = logging.getLogger(__name__) @dataclass class SystemSnapshot: """A snapshot of system resources at a point in time""" timestamp: datetime cpu_percent: float ram_percent: float ram_used_gb: float ram_total_gb: float gpu_percent: Optional[float] = None gpu_memory_used_gb: Optional[float] = None gpu_memory_total_gb: Optional[float] = None gpu_temperature: Optional[float] = None @dataclass class ResponseTimeMetric: """Track response times for different operations""" timestamp: datetime operation: str # "chat", "dream", "reflection", etc. duration_ms: float tokens_generated: int success: bool class SystemMonitor: """Track system resources and performance over time""" def __init__(self, history_size: int = 1000): self.system_snapshots: deque = deque(maxlen=history_size) self.response_times: deque = deque(maxlen=history_size) self.start_time = datetime.now() # Try to import GPU monitoring self.gpu_available = False self.pynvml = None self.gpu_handle = None try: import pynvml pynvml.nvmlInit() self.gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0) self.pynvml = pynvml self.gpu_available = True logger.info("[MONITOR] GPU monitoring enabled") except Exception as e: logger.info(f"[MONITOR] GPU monitoring not available: {e}") def capture_snapshot(self) -> SystemSnapshot: """Capture current system state""" memory = psutil.virtual_memory() snapshot = SystemSnapshot( timestamp=datetime.now(), cpu_percent=psutil.cpu_percent(interval=0.1), ram_percent=memory.percent, ram_used_gb=memory.used / (1024**3), ram_total_gb=memory.total / (1024**3) ) # Try to get GPU stats if self.gpu_available and self.pynvml: try: util = self.pynvml.nvmlDeviceGetUtilizationRates(self.gpu_handle) mem_info = self.pynvml.nvmlDeviceGetMemoryInfo(self.gpu_handle) temp = self.pynvml.nvmlDeviceGetTemperature( self.gpu_handle, self.pynvml.NVML_TEMPERATURE_GPU ) snapshot.gpu_percent = float(util.gpu) if util.gpu is not None else None snapshot.gpu_memory_used_gb = float(mem_info.used) / (1024**3) if mem_info.used is not None else None snapshot.gpu_memory_total_gb = float(mem_info.total) / (1024**3) if mem_info.total is not None else None snapshot.gpu_temperature = float(temp) if temp is not None else None except Exception as e: logger.debug(f"[MONITOR] GPU read error: {e}") self.system_snapshots.append(snapshot) return snapshot def log_response_time(self, operation: str, duration_ms: float, tokens: int = 0, success: bool = True): """Log operation timing""" metric = ResponseTimeMetric( timestamp=datetime.now(), operation=operation, duration_ms=duration_ms, tokens_generated=tokens, success=success ) self.response_times.append(metric) logger.debug(f"[MONITOR] {operation}: {duration_ms:.0f}ms ({tokens} tokens)") def get_avg_response_time(self, operation: Optional[str] = None, last_n: Optional[int] = None) -> float: """Get average response time""" metrics = list(self.response_times) if last_n: metrics = metrics[-last_n:] if operation: times = [m.duration_ms for m in metrics if m.operation == operation] else: times = [m.duration_ms for m in metrics] return sum(times) / len(times) if times else 0.0 def get_tokens_per_second(self, operation: Optional[str] = None, last_n: int = 10) -> float: """Calculate tokens per second for recent operations""" metrics = list(self.response_times)[-last_n:] if operation: metrics = [m for m in metrics if m.operation == operation] if not metrics: return 0.0 total_tokens = sum(m.tokens_generated for m in metrics) total_time_s = sum(m.duration_ms for m in metrics) / 1000 return total_tokens / total_time_s if total_time_s > 0 else 0.0 def get_success_rate(self, operation: Optional[str] = None, last_n: int = 100) -> float: """Get success rate for operations""" metrics = list(self.response_times)[-last_n:] if operation: metrics = [m for m in metrics if m.operation == operation] if not metrics: return 1.0 successes = sum(1 for m in metrics if m.success) return successes / len(metrics) def get_current_stats(self) -> Dict: """Get current system stats""" snapshot = self.capture_snapshot() uptime = (datetime.now() - self.start_time).total_seconds() stats = { "timestamp": snapshot.timestamp.isoformat(), "uptime_seconds": uptime, "uptime_formatted": self._format_uptime(uptime), "cpu": { "percent": round(snapshot.cpu_percent, 1) }, "ram": { "percent": round(snapshot.ram_percent, 1), "used_gb": round(snapshot.ram_used_gb, 2), "total_gb": round(snapshot.ram_total_gb, 2) }, "performance": { "avg_response_ms": round(self.get_avg_response_time(last_n=20), 0), "tokens_per_second": round(self.get_tokens_per_second(), 1), "success_rate": round(self.get_success_rate(), 2) } } if snapshot.gpu_percent is not None: stats["gpu"] = { "percent": round(snapshot.gpu_percent if snapshot.gpu_percent is not None else 0.0, 1), "memory_used_gb": round(snapshot.gpu_memory_used_gb if snapshot.gpu_memory_used_gb is not None else 0.0, 2), "memory_total_gb": round(snapshot.gpu_memory_total_gb if snapshot.gpu_memory_total_gb is not None else 0.0, 2), "temperature_c": round(snapshot.gpu_temperature if snapshot.gpu_temperature is not None else 0.0, 1) } return stats def get_performance_summary(self) -> Dict: """Get summary of performance metrics""" operations = set(m.operation for m in self.response_times) summary = { "overall": { "avg_ms": round(self.get_avg_response_time(), 0), "tokens_per_sec": round(self.get_tokens_per_second(), 1), "success_rate": round(self.get_success_rate(), 2) }, "by_operation": {} } for op in operations: summary["by_operation"][op] = { "avg_ms": round(self.get_avg_response_time(op, last_n=20), 0), "count": len([m for m in self.response_times if m.operation == op]), "success_rate": round(self.get_success_rate(op, last_n=20), 2) } return summary def _format_uptime(self, seconds: float) -> str: """Format uptime as human-readable string""" hours = int(seconds // 3600) minutes = int((seconds % 3600) // 60) secs = int(seconds % 60) if hours > 0: return f"{hours}h {minutes}m {secs}s" elif minutes > 0: return f"{minutes}m {secs}s" else: return f"{secs}s" def get_resource_alerts(self) -> List[str]: """Check for resource issues and return alerts""" alerts = [] if not self.system_snapshots: return alerts latest = self.system_snapshots[-1] # CPU alerts if latest.cpu_percent > 90: alerts.append(f"⚠️ HIGH CPU: {latest.cpu_percent:.1f}%") # RAM alerts if latest.ram_percent > 90: alerts.append(f"⚠️ HIGH RAM: {latest.ram_percent:.1f}%") # GPU alerts if latest.gpu_percent is not None: if latest.gpu_percent > 95: alerts.append(f"⚠️ HIGH GPU: {latest.gpu_percent:.1f}%") if latest.gpu_temperature and latest.gpu_temperature > 80: alerts.append(f"🔥 GPU HOT: {latest.gpu_temperature:.1f}°C") # Performance alerts recent_avg = self.get_avg_response_time(last_n=10) if recent_avg > 5000: # 5 seconds alerts.append(f"⏱️ SLOW RESPONSE: {recent_avg:.0f}ms avg") success_rate = self.get_success_rate(last_n=20) if success_rate < 0.9: alerts.append(f"❌ LOW SUCCESS: {success_rate:.0%}") return alerts def export_to_csv(self, filepath: str): """Export system snapshots to CSV""" import csv with open(filepath, 'w', newline='') as f: writer = csv.writer(f) writer.writerow([ 'timestamp', 'cpu_percent', 'ram_percent', 'ram_used_gb', 'gpu_percent', 'gpu_memory_used_gb', 'gpu_temperature' ]) for s in self.system_snapshots: writer.writerow([ s.timestamp.isoformat(), s.cpu_percent, s.ram_percent, s.ram_used_gb, s.gpu_percent or '', s.gpu_memory_used_gb or '', s.gpu_temperature or '' ]) logger.info(f"[MONITOR] Exported {len(self.system_snapshots)} snapshots to {filepath}") def get_timeseries(self, metric: str, hours: int = 24) -> Dict[str, list]: """Return time-series data for a given metric over the last N hours.""" cutoff = datetime.now() - timedelta(hours=hours) snapshots = [s for s in self.system_snapshots if s.timestamp > cutoff] timestamps = [s.timestamp.isoformat() for s in snapshots] metric_map = { "cpu_percent": lambda s: s.cpu_percent, "ram_percent": lambda s: s.ram_percent, "ram_used_gb": lambda s: s.ram_used_gb, "gpu_percent": lambda s: s.gpu_percent if s.gpu_percent is not None else 0.0, "gpu_memory_used_gb": lambda s: s.gpu_memory_used_gb if s.gpu_memory_used_gb is not None else 0.0, "gpu_temperature": lambda s: s.gpu_temperature if s.gpu_temperature is not None else 0.0, } if metric in metric_map: values = [metric_map[metric](s) for s in snapshots] else: values = [] return {"timestamps": timestamps, "values": values} def __del__(self): """Cleanup GPU monitoring""" if self.gpu_available and self.pynvml: try: self.pynvml.nvmlShutdown() except: pass