anderson-ufrj commited on
Commit
35d7096
·
1 Parent(s): 1762def

feat: implement resilience patterns and monitoring endpoints

Browse files

Resilience patterns:
- Circuit breaker pattern to prevent cascading failures
- Bulkhead pattern for resource isolation and protection
- Configurable failure thresholds and recovery timeouts
- Automatic state transitions and health monitoring

Circuit breakers:
- CLOSED → OPEN → HALF_OPEN state management
- Per-service configuration for external dependencies
- Failure rate monitoring and automatic recovery
- Statistics tracking for performance analysis

Bulkhead isolation:
- Semaphore and queue-based resource isolation
- Configurable concurrency limits per resource type
- Timeout handling and rejection policies
- Resource utilization monitoring

Monitoring endpoints:
- /api/v1/resilience/circuit-breakers - Circuit breaker status
- /api/v1/resilience/bulkheads - Resource utilization metrics
- /api/v1/resilience/health - Overall system health
- /api/v1/cqrs/* - CQRS command and query endpoints

Benefits:
- Improved system stability through failure isolation
- Better resource management and utilization
- Comprehensive monitoring and alerting capabilities
- Enhanced observability for production environments

app.py CHANGED
@@ -548,6 +548,16 @@ app.add_middleware(
548
  allow_headers=["*"],
549
  )
550
 
 
 
 
 
 
 
 
 
 
 
551
  # ==================== ENDPOINTS ====================
552
 
553
  @app.get("/", response_model=HealthResponse)
 
548
  allow_headers=["*"],
549
  )
550
 
551
+ # Add compression middleware for better performance
552
+ from src.api.middleware.compression import add_compression_middleware
553
+ add_compression_middleware(
554
+ app,
555
+ minimum_size=1024, # Compress responses larger than 1KB
556
+ gzip_level=6, # Good balance of speed vs compression
557
+ brotli_quality=4, # Fast brotli compression
558
+ exclude_paths={"/health", "/metrics", "/health/metrics"}
559
+ )
560
+
561
  # ==================== ENDPOINTS ====================
562
 
563
  @app.get("/", response_model=HealthResponse)
pyproject.toml CHANGED
@@ -84,6 +84,7 @@ dependencies = [
84
  "python-dotenv>=1.0.0",
85
  "tenacity>=8.2.3",
86
  "pendulum>=3.0.0",
 
87
  ]
88
 
89
  [project.optional-dependencies]
@@ -101,6 +102,7 @@ hf = [
101
  "python-dotenv>=1.0.0",
102
  "numpy>=1.26.3",
103
  "pandas>=2.1.4",
 
104
  ]
105
 
106
  dev = [
 
84
  "python-dotenv>=1.0.0",
85
  "tenacity>=8.2.3",
86
  "pendulum>=3.0.0",
87
+ "orjson>=3.9.10",
88
  ]
89
 
90
  [project.optional-dependencies]
 
102
  "python-dotenv>=1.0.0",
103
  "numpy>=1.26.3",
104
  "pandas>=2.1.4",
105
+ "orjson>=3.9.10",
106
  ]
107
 
108
  dev = [
src/api/app.py CHANGED
@@ -20,7 +20,7 @@ from fastapi.openapi.utils import get_openapi
20
  from src.core import get_logger, settings
21
  from src.core.exceptions import CidadaoAIError, create_error_response
22
  from src.core.audit import audit_logger, AuditEventType, AuditSeverity, AuditContext
23
- from src.api.routes import investigations, analysis, reports, health, auth, oauth, audit, chat, websocket_chat
24
  from src.api.middleware.rate_limiting import RateLimitMiddleware
25
  from src.api.middleware.authentication import AuthenticationMiddleware
26
  from src.api.middleware.logging_middleware import LoggingMiddleware
@@ -150,6 +150,16 @@ app.add_middleware(
150
  expose_headers=["X-RateLimit-Limit", "X-RateLimit-Remaining"]
151
  )
152
 
 
 
 
 
 
 
 
 
 
 
153
 
154
  # Custom OpenAPI schema
155
  def custom_openapi():
@@ -264,6 +274,29 @@ app.include_router(
264
  tags=["WebSocket"]
265
  )
266
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
 
268
  # Global exception handler
269
  @app.exception_handler(CidadaoAIError)
 
20
  from src.core import get_logger, settings
21
  from src.core.exceptions import CidadaoAIError, create_error_response
22
  from src.core.audit import audit_logger, AuditEventType, AuditSeverity, AuditContext
23
+ from src.api.routes import investigations, analysis, reports, health, auth, oauth, audit, chat, websocket_chat, batch, graphql, cqrs, resilience
24
  from src.api.middleware.rate_limiting import RateLimitMiddleware
25
  from src.api.middleware.authentication import AuthenticationMiddleware
26
  from src.api.middleware.logging_middleware import LoggingMiddleware
 
150
  expose_headers=["X-RateLimit-Limit", "X-RateLimit-Remaining"]
151
  )
152
 
153
+ # Add compression middleware
154
+ from src.api.middleware.compression import add_compression_middleware
155
+ add_compression_middleware(
156
+ app,
157
+ minimum_size=1024,
158
+ gzip_level=6,
159
+ brotli_quality=4,
160
+ exclude_paths={"/health", "/metrics", "/health/metrics", "/api/v1/ws"}
161
+ )
162
+
163
 
164
  # Custom OpenAPI schema
165
  def custom_openapi():
 
274
  tags=["WebSocket"]
275
  )
276
 
277
+ app.include_router(
278
+ batch.router,
279
+ tags=["Batch Operations"]
280
+ )
281
+
282
+ # GraphQL endpoint
283
+ app.include_router(
284
+ graphql.router,
285
+ tags=["GraphQL"]
286
+ )
287
+
288
+ # CQRS endpoints
289
+ app.include_router(
290
+ cqrs.router,
291
+ tags=["CQRS"]
292
+ )
293
+
294
+ # Resilience monitoring endpoints
295
+ app.include_router(
296
+ resilience.router,
297
+ tags=["Resilience"]
298
+ )
299
+
300
 
301
  # Global exception handler
302
  @app.exception_handler(CidadaoAIError)
src/api/routes/__init__.py CHANGED
@@ -6,6 +6,6 @@ Date: 2025-01-24
6
  License: Proprietary - All rights reserved
7
  """
8
 
9
- from . import health, investigations, analysis, reports, chat, websocket_chat
10
 
11
- __all__ = ["health", "investigations", "analysis", "reports", "chat", "websocket_chat"]
 
6
  License: Proprietary - All rights reserved
7
  """
8
 
9
+ from . import health, investigations, analysis, reports, chat, websocket_chat, cqrs, resilience
10
 
11
+ __all__ = ["health", "investigations", "analysis", "reports", "chat", "websocket_chat", "cqrs", "resilience"]
src/api/routes/cqrs.py ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CQRS API endpoints for command and query operations.
3
+
4
+ This module provides RESTful endpoints that use the CQRS pattern
5
+ for better scalability and separation of concerns.
6
+ """
7
+
8
+ from typing import Dict, Any, Optional
9
+ from datetime import datetime
10
+
11
+ from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
12
+ from pydantic import BaseModel
13
+
14
+ from src.core import get_logger
15
+ from src.api.auth import get_current_user
16
+ from src.infrastructure.cqrs.commands import (
17
+ CommandBus,
18
+ CreateInvestigationCommand,
19
+ UpdateInvestigationCommand,
20
+ CancelInvestigationCommand,
21
+ ExecuteAgentTaskCommand,
22
+ SendChatMessageCommand
23
+ )
24
+ from src.infrastructure.cqrs.queries import (
25
+ QueryBus,
26
+ GetInvestigationByIdQuery,
27
+ SearchInvestigationsQuery,
28
+ GetInvestigationStatsQuery,
29
+ SearchContractsQuery,
30
+ GetAgentPerformanceQuery
31
+ )
32
+ from src.infrastructure.events.event_bus import get_event_bus
33
+
34
+ logger = get_logger(__name__)
35
+
36
+ router = APIRouter(prefix="/api/v1/cqrs", tags=["CQRS"])
37
+
38
+
39
# Request/Response models
class CreateInvestigationRequest(BaseModel):
    """Payload for creating a new investigation."""

    query: str  # natural-language investigation query
    data_sources: Optional[list[str]] = None  # None means "use all available sources"
    priority: str = "medium"


class UpdateInvestigationRequest(BaseModel):
    """Payload for updating an existing investigation."""

    status: str
    results: Optional[Dict[str, Any]] = None  # optional final/partial results blob


class SearchInvestigationsRequest(BaseModel):
    """Payload for a filtered, sorted, paginated investigation search."""

    filters: Dict[str, Any] = {}  # pydantic copies defaults, so a shared dict is safe here
    sort_by: str = "created_at"
    sort_order: str = "desc"
    limit: int = 20
    offset: int = 0


class SearchContractsRequest(BaseModel):
    """Payload for a filtered, paginated contract search."""

    search_term: Optional[str] = None
    orgao: Optional[str] = None  # government agency filter
    min_value: Optional[float] = None
    max_value: Optional[float] = None
    year: Optional[int] = None
    limit: int = 50
    offset: int = 0


class ExecuteAgentTaskRequest(BaseModel):
    """Payload for dispatching a task to a named agent."""

    agent_name: str
    task_type: str
    payload: Dict[str, Any]
    timeout: Optional[float] = None  # seconds; None lets the bus apply its default
79
+
80
+
81
# Global instances (lazily created, process-wide singletons)
command_bus: Optional[CommandBus] = None
query_bus: Optional[QueryBus] = None


async def get_command_bus() -> CommandBus:
    """Return the shared CommandBus, creating it on first use."""
    global command_bus
    if command_bus is None:
        # The command bus publishes domain events, so it needs the event bus.
        command_bus = CommandBus(await get_event_bus())
    return command_bus


async def get_query_bus() -> QueryBus:
    """Return the shared QueryBus, creating it on first use."""
    global query_bus
    if query_bus is None:
        query_bus = QueryBus()
    return query_bus
101
+
102
+
103
# Command endpoints
@router.post("/investigations", response_model=Dict[str, Any])
async def create_investigation(
    request: CreateInvestigationRequest,
    background_tasks: BackgroundTasks,
    current_user = Depends(get_current_user),
    cmd_bus: CommandBus = Depends(get_command_bus)
):
    """
    Create a new investigation using CQRS command.

    This is the write side of CQRS: the endpoint accepts the mutation,
    publishes the resulting events, and returns only identifiers rather
    than the full resource.
    """
    result = await cmd_bus.execute(
        CreateInvestigationCommand(
            user_id=current_user["sub"],
            query=request.query,
            data_sources=request.data_sources,
            priority=request.priority,
        )
    )

    if not result.success:
        raise HTTPException(status_code=400, detail=result.error)

    return {
        "investigation_id": result.data["investigation_id"],
        "command_id": result.command_id,
        "events_published": result.events_published,
    }
136
+
137
+
138
@router.put("/investigations/{investigation_id}", response_model=Dict[str, Any])
async def update_investigation(
    investigation_id: str,
    request: UpdateInvestigationRequest,
    current_user = Depends(get_current_user),
    cmd_bus: CommandBus = Depends(get_command_bus)
):
    """Update the status (and optionally the results) of an investigation."""
    outcome = await cmd_bus.execute(
        UpdateInvestigationCommand(
            user_id=current_user["sub"],
            investigation_id=investigation_id,
            status=request.status,
            results=request.results,
        )
    )

    if not outcome.success:
        raise HTTPException(status_code=400, detail=outcome.error)

    return {"success": True, "command_id": outcome.command_id}


@router.delete("/investigations/{investigation_id}", response_model=Dict[str, Any])
async def cancel_investigation(
    investigation_id: str,
    reason: Optional[str] = None,
    current_user = Depends(get_current_user),
    cmd_bus: CommandBus = Depends(get_command_bus)
):
    """Cancel an investigation, recording an optional reason."""
    outcome = await cmd_bus.execute(
        CancelInvestigationCommand(
            user_id=current_user["sub"],
            investigation_id=investigation_id,
            reason=reason,
        )
    )

    if not outcome.success:
        raise HTTPException(status_code=400, detail=outcome.error)

    return {"success": True, "command_id": outcome.command_id}
181
+
182
+
183
@router.post("/agents/execute", response_model=Dict[str, Any])
async def execute_agent_task(
    request: ExecuteAgentTaskRequest,
    background_tasks: BackgroundTasks,
    current_user = Depends(get_current_user),
    cmd_bus: CommandBus = Depends(get_command_bus)
):
    """Dispatch a task to the named agent via the command bus."""
    outcome = await cmd_bus.execute(
        ExecuteAgentTaskCommand(
            user_id=current_user["sub"],
            agent_name=request.agent_name,
            task_type=request.task_type,
            payload=request.payload,
            timeout=request.timeout,
        )
    )

    if not outcome.success:
        raise HTTPException(status_code=400, detail=outcome.error)

    # The handler may not attach data; tolerate a missing payload.
    task_id = outcome.data.get("task_id") if outcome.data else None
    return {
        "success": True,
        "command_id": outcome.command_id,
        "task_id": task_id,
    }
209
+
210
+
211
# Query endpoints
#
# NOTE: FastAPI resolves paths in registration order, so the static path
# "/investigations/stats" MUST be registered BEFORE the parameterized
# "/investigations/{investigation_id}" route. In the original ordering,
# GET /investigations/stats was swallowed by the parameterized route with
# investigation_id == "stats".
@router.get("/investigations/stats", response_model=Dict[str, Any])
async def get_investigation_stats(
    date_from: Optional[datetime] = None,
    date_to: Optional[datetime] = None,
    current_user = Depends(get_current_user),
    q_bus: QueryBus = Depends(get_query_bus)
):
    """Get investigation statistics, optionally restricted to a date range."""
    query = GetInvestigationStatsQuery(
        user_id=current_user["sub"],
        date_from=date_from,
        date_to=date_to
    )

    result = await q_bus.execute(query)

    if not result.success:
        raise HTTPException(status_code=400, detail=result.error)

    return {
        "stats": result.data,
        "from_cache": result.from_cache,
        "execution_time_ms": result.execution_time_ms
    }


@router.get("/investigations/{investigation_id}", response_model=Dict[str, Any])
async def get_investigation(
    investigation_id: str,
    include_findings: bool = True,
    include_anomalies: bool = True,
    current_user = Depends(get_current_user),
    q_bus: QueryBus = Depends(get_query_bus)
):
    """
    Get investigation by ID using CQRS query.

    This endpoint demonstrates the query side of CQRS:
    - Optimized for reads
    - Uses caching
    - Returns denormalized data
    """
    query = GetInvestigationByIdQuery(
        user_id=current_user["sub"],
        investigation_id=investigation_id,
        include_findings=include_findings,
        include_anomalies=include_anomalies
    )

    result = await q_bus.execute(query)

    if not result.success:
        # Treat a failed lookup as "not found" rather than a client error.
        raise HTTPException(status_code=404, detail=result.error)

    return {
        "investigation": result.data,
        "from_cache": result.from_cache,
        "execution_time_ms": result.execution_time_ms
    }


@router.post("/investigations/search", response_model=Dict[str, Any])
async def search_investigations(
    request: SearchInvestigationsRequest,
    current_user = Depends(get_current_user),
    q_bus: QueryBus = Depends(get_query_bus)
):
    """Search investigations with filters, sorting and pagination."""
    query = SearchInvestigationsQuery(
        user_id=current_user["sub"],
        filters=request.filters,
        sort_by=request.sort_by,
        sort_order=request.sort_order,
        limit=request.limit,
        offset=request.offset
    )

    result = await q_bus.execute(query)

    if not result.success:
        raise HTTPException(status_code=400, detail=result.error)

    return {
        "investigations": result.data,
        "metadata": result.metadata,
        "execution_time_ms": result.execution_time_ms
    }
299
+
300
+
301
@router.post("/contracts/search", response_model=Dict[str, Any])
async def search_contracts(
    request: SearchContractsRequest,
    current_user = Depends(get_current_user),
    q_bus: QueryBus = Depends(get_query_bus)
):
    """Search contracts with filters, served through the cached query bus."""
    contract_query = SearchContractsQuery(
        user_id=current_user["sub"],
        search_term=request.search_term,
        orgao=request.orgao,
        min_value=request.min_value,
        max_value=request.max_value,
        year=request.year,
        limit=request.limit,
        offset=request.offset,
        use_cache=True,
        cache_ttl=300,  # 5 minutes
    )

    outcome = await q_bus.execute(contract_query)

    if not outcome.success:
        raise HTTPException(status_code=400, detail=outcome.error)

    return {
        "contracts": outcome.data,
        "from_cache": outcome.from_cache,
        "execution_time_ms": outcome.execution_time_ms,
    }


@router.get("/agents/performance", response_model=Dict[str, Any])
async def get_agent_performance(
    agent_name: Optional[str] = None,
    time_period: str = "1h",
    current_user = Depends(get_current_user),
    q_bus: QueryBus = Depends(get_query_bus)
):
    """Get agent performance metrics, cached briefly since they change fast."""
    perf_query = GetAgentPerformanceQuery(
        user_id=current_user["sub"],
        agent_name=agent_name,
        time_period=time_period,
        use_cache=True,
        cache_ttl=60,  # 1 minute for recent metrics
    )

    outcome = await q_bus.execute(perf_query)

    if not outcome.success:
        raise HTTPException(status_code=400, detail=outcome.error)

    return {
        "performance": outcome.data,
        "from_cache": outcome.from_cache,
        "execution_time_ms": outcome.execution_time_ms,
    }
359
+
360
+
361
# Bus statistics endpoints
@router.get("/stats/commands", response_model=Dict[str, Any])
async def get_command_bus_stats(
    current_user = Depends(get_current_user),
    cmd_bus: CommandBus = Depends(get_command_bus)
):
    """Expose command bus statistics (auth required via dependency)."""
    return cmd_bus.get_stats()


@router.get("/stats/queries", response_model=Dict[str, Any])
async def get_query_bus_stats(
    current_user = Depends(get_current_user),
    q_bus: QueryBus = Depends(get_query_bus)
):
    """Expose query bus statistics (auth required via dependency)."""
    return q_bus.get_stats()
378
+
379
+
380
# Health check
@router.get("/health", response_model=Dict[str, Any])
async def cqrs_health_check():
    """
    Check CQRS system health.

    Returns 200 in both cases; the payload's "status" field distinguishes
    healthy from unhealthy so callers always get a parseable body.
    """
    try:
        # Touching each getter forces lazy initialization, so failures surface here.
        await get_command_bus()
        await get_query_bus()
        event_bus = await get_event_bus()

        return {
            "status": "healthy",
            "command_bus": "ready",
            "query_bus": "ready",
            "event_bus": "ready",
            "event_bus_stats": event_bus.get_stats(),
        }
    except Exception as e:
        return {
            "status": "unhealthy",
            "error": str(e),
        }
src/api/routes/resilience.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Resilience monitoring endpoints.
3
+
4
+ This module provides endpoints for monitoring circuit breakers,
5
+ bulkheads, and overall system resilience.
6
+ """
7
+
8
+ from typing import Dict, Any
9
+ from fastapi import APIRouter, HTTPException, Depends
10
+
11
+ from src.core import get_logger
12
+ from src.api.auth import get_current_user
13
+ from src.infrastructure.resilience import circuit_breaker_manager, bulkhead_manager
14
+
15
+ logger = get_logger(__name__)
16
+
17
+ router = APIRouter(prefix="/api/v1/resilience", tags=["Resilience"])
18
+
19
+
20
@router.get("/circuit-breakers", response_model=Dict[str, Any])
async def get_circuit_breaker_stats(
    current_user = Depends(get_current_user)
):
    """
    Get statistics for all circuit breakers.

    The payload contains per-breaker stats, the manager's health status,
    and a summary counting healthy/degraded/failed services.
    """
    try:
        breaker_stats = circuit_breaker_manager.get_all_stats()
        health = circuit_breaker_manager.get_health_status()

        summary = {
            "total_breakers": len(breaker_stats),
            "healthy_services": len(health["healthy_services"]),
            "degraded_services": len(health["degraded_services"]),
            "failed_services": len(health["failed_services"]),
        }
        return {
            "circuit_breakers": breaker_stats,
            "health_status": health,
            "summary": summary,
        }
    except Exception as e:
        logger.error(f"Failed to get circuit breaker stats: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve circuit breaker statistics")
50
+
51
+
52
@router.get("/circuit-breakers/{service_name}", response_model=Dict[str, Any])
async def get_circuit_breaker_stats_by_service(
    service_name: str,
    current_user = Depends(get_current_user)
):
    """
    Get statistics for a specific circuit breaker.

    Args:
        service_name: Name of the service

    Returns:
        Detailed statistics for the specified circuit breaker
    """
    try:
        # get_circuit_breaker creates the breaker on demand, so this never 404s.
        breaker = circuit_breaker_manager.get_circuit_breaker(service_name)
        return {
            "service_name": service_name,
            "circuit_breaker": breaker.get_stats(),
        }
    except Exception as e:
        logger.error(f"Failed to get circuit breaker stats for {service_name}: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to retrieve circuit breaker statistics for {service_name}")


@router.post("/circuit-breakers/{service_name}/reset", response_model=Dict[str, Any])
async def reset_circuit_breaker(
    service_name: str,
    current_user = Depends(get_current_user)
):
    """
    Reset a specific circuit breaker to closed state.

    Args:
        service_name: Name of the service

    Returns:
        Success confirmation including the breaker's new state
    """
    try:
        breaker = circuit_breaker_manager.get_circuit_breaker(service_name)
        await breaker.reset()

        # Audit trail: record who forced the reset.
        logger.info(f"Circuit breaker for {service_name} reset by user {current_user['sub']}")

        return {
            "message": f"Circuit breaker for {service_name} reset successfully",
            "service_name": service_name,
            "new_state": breaker.state.value,
        }
    except Exception as e:
        logger.error(f"Failed to reset circuit breaker for {service_name}: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to reset circuit breaker for {service_name}")


@router.post("/circuit-breakers/reset-all", response_model=Dict[str, Any])
async def reset_all_circuit_breakers(
    current_user = Depends(get_current_user)
):
    """
    Reset all circuit breakers to closed state.

    Returns:
        Success confirmation including who performed the reset
    """
    try:
        await circuit_breaker_manager.reset_all()

        # A global reset is a significant operational action — log at warning level.
        logger.warning(f"All circuit breakers reset by user {current_user['sub']}")

        return {
            "message": "All circuit breakers reset successfully",
            "reset_by": current_user["sub"],
        }
    except Exception as e:
        logger.error(f"Failed to reset all circuit breakers: {e}")
        raise HTTPException(status_code=500, detail="Failed to reset all circuit breakers")
132
+
133
+
134
@router.get("/bulkheads", response_model=Dict[str, Any])
async def get_bulkhead_stats(
    current_user = Depends(get_current_user)
):
    """
    Get statistics for all bulkheads.

    The payload contains per-bulkhead stats, the manager's utilization
    breakdown, and a compact summary of capacity/active/queued counts.
    """
    try:
        bh_stats = bulkhead_manager.get_all_stats()
        usage = bulkhead_manager.get_resource_utilization()

        summary = {
            "total_bulkheads": len(bh_stats),
            "overall_utilization": usage["overall_utilization"],
            "total_capacity": usage["total_capacity"],
            "total_active": usage["total_active"],
            "total_queued": usage["total_queued"],
        }
        return {
            "bulkheads": bh_stats,
            "resource_utilization": usage,
            "summary": summary,
        }
    except Exception as e:
        logger.error(f"Failed to get bulkhead stats: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve bulkhead statistics")


@router.get("/bulkheads/{resource_type}", response_model=Dict[str, Any])
async def get_bulkhead_stats_by_resource(
    resource_type: str,
    current_user = Depends(get_current_user)
):
    """
    Get statistics for a specific bulkhead.

    Args:
        resource_type: Type of resource

    Returns:
        Detailed statistics for the specified bulkhead
    """
    try:
        # get_bulkhead creates the bulkhead on demand, so this never 404s.
        bulkhead = bulkhead_manager.get_bulkhead(resource_type)
        return {
            "resource_type": resource_type,
            "bulkhead": bulkhead.get_stats(),
        }
    except Exception as e:
        logger.error(f"Failed to get bulkhead stats for {resource_type}: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to retrieve bulkhead statistics for {resource_type}")
193
+
194
+
195
@router.get("/health", response_model=Dict[str, Any])
async def get_resilience_health():
    """
    Get overall resilience health status.

    Combines circuit-breaker health and bulkhead utilization into one
    "healthy" / "degraded" / "critical" verdict, plus actionable
    recommendations.

    NOTE(review): unlike the other endpoints in this router, this one has no
    ``get_current_user`` dependency — presumably intentional so orchestrators
    can probe it unauthenticated; confirm.
    """
    try:
        circuit_breaker_health = circuit_breaker_manager.get_health_status()
        bulkhead_utilization = bulkhead_manager.get_resource_utilization()

        # Start from the circuit-breaker verdict.
        overall_health = "healthy"
        if circuit_breaker_health["overall_health"] == "critical":
            overall_health = "critical"
        elif circuit_breaker_health["overall_health"] == "degraded":
            overall_health = "degraded"

        # Escalate based on bulkhead utilization.
        # BUGFIX: the original tested `> 0.9` before `> 0.95`, which made the
        # critical branch unreachable; check the stricter threshold first.
        utilization = bulkhead_utilization["overall_utilization"]
        if utilization > 0.95:
            overall_health = "critical"
        elif utilization > 0.9 and overall_health == "healthy":
            overall_health = "degraded"

        return {
            "overall_health": overall_health,
            "circuit_breakers": {
                "health": circuit_breaker_health["overall_health"],
                "healthy_services": len(circuit_breaker_health["healthy_services"]),
                "failed_services": len(circuit_breaker_health["failed_services"]),
                "health_score": circuit_breaker_health["health_score"]
            },
            "bulkheads": {
                "utilization": utilization,
                "active_operations": bulkhead_utilization["total_active"],
                "total_capacity": bulkhead_utilization["total_capacity"],
                "queued_operations": bulkhead_utilization["total_queued"]
            },
            "recommendations": _generate_health_recommendations(
                circuit_breaker_health,
                bulkhead_utilization
            )
        }
    except Exception as e:
        logger.error(f"Failed to get resilience health: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve resilience health status")
245
+
246
+
247
@router.get("/metrics", response_model=Dict[str, Any])
async def get_resilience_metrics(
    current_user = Depends(get_current_user)
):
    """
    Get comprehensive resilience metrics.

    Aggregates request/failure/timeout/rejection counts across circuit
    breakers and bulkheads and derives overall rates for alerting.
    """
    try:
        cb_stats = circuit_breaker_manager.get_all_stats()
        bh_stats = bulkhead_manager.get_all_stats()

        # Circuit breakers contribute requests, failures and rejections.
        total_requests = sum(s["stats"]["total_requests"] for s in cb_stats.values())
        total_failures = sum(s["stats"]["failed_requests"] for s in cb_stats.values())
        total_rejections = sum(s["stats"]["rejected_requests"] for s in cb_stats.values())

        # Bulkheads additionally contribute timeouts.
        total_timeouts = 0
        for s in bh_stats.values():
            counters = s["stats"]
            total_requests += counters["total_requests"]
            total_failures += counters["failed_requests"]
            total_timeouts += counters["timeout_requests"]
            total_rejections += counters["rejected_requests"]

        # Guard all rate computations against a zero denominator.
        if total_requests > 0:
            success_rate = (total_requests - total_failures) / total_requests
            failure_rate = total_failures / total_requests
            rejection_rate = total_rejections / total_requests
        else:
            success_rate, failure_rate, rejection_rate = 1.0, 0, 0

        return {
            "circuit_breakers": cb_stats,
            "bulkheads": bh_stats,
            "aggregate_metrics": {
                "total_requests": total_requests,
                "total_failures": total_failures,
                "total_timeouts": total_timeouts,
                "total_rejections": total_rejections,
                "success_rate": success_rate,
                "failure_rate": failure_rate,
                "rejection_rate": rejection_rate,
            },
        }
    except Exception as e:
        logger.error(f"Failed to get resilience metrics: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve resilience metrics")
299
+
300
+
301
def _generate_health_recommendations(
    circuit_breaker_health: Dict[str, Any],
    bulkhead_utilization: Dict[str, Any]
) -> list[str]:
    """Build human-readable operational recommendations from the current status."""
    advice: list[str] = []

    # Failing circuit breakers are the most urgent signal.
    failed = circuit_breaker_health["failed_services"]
    if failed:
        advice.append(
            f"⚠️ {len(failed)} services have failing circuit breakers. "
            f"Check: {', '.join(failed)}"
        )

    # Services in HALF_OPEN / recovery should be watched, not acted on yet.
    degraded = circuit_breaker_health["degraded_services"]
    if degraded:
        advice.append(
            f"⚡ {len(degraded)} services are in recovery mode. "
            f"Monitor: {', '.join(degraded)}"
        )

    overall = bulkhead_utilization["overall_utilization"]
    if overall > 0.8:
        advice.append(
            f"📊 High resource utilization ({overall:.1%}). "
            "Consider scaling or optimizing workloads."
        )

    # Call out individual resources running above 90% utilization.
    hot = [
        name
        for name, info in bulkhead_utilization["resources"].items()
        if info["utilization"] > 0.9
    ]
    if hot:
        advice.append(
            f"🔥 High utilization resources: {', '.join(hot)}. "
            "Consider increasing capacity or load balancing."
        )

    queued = bulkhead_utilization["total_queued"]
    if queued > 0:
        advice.append(
            f"⏳ {queued} operations queued. "
            "Monitor queue lengths and processing times."
        )

    # An empty list would read as "no data", so report explicit good health.
    return advice or ["✅ All resilience components are healthy."]
src/core/__init__.py CHANGED
@@ -30,6 +30,7 @@ from .exceptions import (
30
  ValidationError,
31
  )
32
  from .logging import get_logger, setup_logging
 
33
 
34
  __all__ = [
35
  # Config
@@ -58,6 +59,9 @@ __all__ = [
58
  # Logging
59
  "get_logger",
60
  "setup_logging",
 
 
 
61
  ]
62
 
63
  # Initialize logging on import
 
30
  ValidationError,
31
  )
32
  from .logging import get_logger, setup_logging
33
+ from .llm_pool import llm_pool, get_llm_pool
34
 
35
  __all__ = [
36
  # Config
 
59
  # Logging
60
  "get_logger",
61
  "setup_logging",
62
+ # LLM Pool
63
+ "llm_pool",
64
+ "get_llm_pool",
65
  ]
66
 
67
  # Initialize logging on import
src/infrastructure/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Infrastructure components for Cidadão.AI."""
2
+
3
+ from .cqrs import CommandBus, QueryBus
4
+ from .events import EventBus, EventType, Event
5
+ from .websocket import websocket_manager
6
+ from .messaging import QueueService
7
+ from .resilience import circuit_breaker_manager, bulkhead_manager
8
+
9
+ __all__ = [
10
+ "CommandBus",
11
+ "QueryBus",
12
+ "EventBus",
13
+ "EventType",
14
+ "Event",
15
+ "websocket_manager",
16
+ "QueueService",
17
+ "circuit_breaker_manager",
18
+ "bulkhead_manager"
19
+ ]
src/infrastructure/resilience/__init__.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Resilience patterns for Cidadão.AI."""
2
+
3
+ from .circuit_breaker import (
4
+ CircuitBreaker,
5
+ CircuitBreakerConfig,
6
+ CircuitState,
7
+ CircuitBreakerManager,
8
+ CircuitBreakerOpenException,
9
+ CircuitBreakerTimeoutException,
10
+ circuit_breaker_manager,
11
+ circuit_breaker
12
+ )
13
+ from .bulkhead import (
14
+ Bulkhead,
15
+ BulkheadConfig,
16
+ BulkheadType,
17
+ BulkheadManager,
18
+ BulkheadRejectedException,
19
+ BulkheadTimeoutException,
20
+ bulkhead_manager,
21
+ bulkhead
22
+ )
23
+
24
+ __all__ = [
25
+ "CircuitBreaker",
26
+ "CircuitBreakerConfig",
27
+ "CircuitState",
28
+ "CircuitBreakerManager",
29
+ "CircuitBreakerOpenException",
30
+ "CircuitBreakerTimeoutException",
31
+ "circuit_breaker_manager",
32
+ "circuit_breaker",
33
+ "Bulkhead",
34
+ "BulkheadConfig",
35
+ "BulkheadType",
36
+ "BulkheadManager",
37
+ "BulkheadRejectedException",
38
+ "BulkheadTimeoutException",
39
+ "bulkhead_manager",
40
+ "bulkhead"
41
+ ]
src/infrastructure/resilience/bulkhead.py ADDED
@@ -0,0 +1,613 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Bulkhead pattern implementation for resource isolation.
3
+
4
+ This module provides bulkhead functionality to isolate different
5
+ types of operations and prevent resource exhaustion.
6
+ """
7
+
8
+ import asyncio
9
+ from typing import Any, Callable, Optional, Dict, Set
10
+ from datetime import datetime, timedelta
11
+ from enum import Enum
12
+ import time
13
+ from dataclasses import dataclass
14
+ import uuid
15
+
16
+ from src.core import get_logger
17
+
18
+ logger = get_logger(__name__)
19
+
20
+
21
class BulkheadType(str, Enum):
    """Types of bulkhead isolation.

    NOTE(review): only SEMAPHORE and QUEUE have dedicated handling in
    Bulkhead.execute(); THREAD_POOL currently falls through to direct,
    unprotected execution — confirm whether that is intended.
    """
    THREAD_POOL = "thread_pool"  # Thread pool isolation
    SEMAPHORE = "semaphore"  # Semaphore-based isolation
    QUEUE = "queue"  # Queue-based isolation
26
+
27
+
28
@dataclass
class BulkheadConfig:
    """Bulkhead configuration.

    The semaphore/queue sizing derived from ``max_concurrent`` and
    ``queue_size`` is fixed when the Bulkhead is constructed; ``timeout``
    is read on every call.
    """
    max_concurrent: int = 10  # Maximum concurrent operations (semaphore slots / worker count)
    queue_size: Optional[int] = None  # Queue size (None = unlimited)
    timeout: float = 30.0  # Operation timeout in seconds
    bulkhead_type: BulkheadType = BulkheadType.SEMAPHORE  # Isolation strategy
35
+
36
+
37
@dataclass
class BulkheadStats:
    """Bulkhead statistics.

    Mutable counters updated by Bulkhead under its asyncio lock and exposed
    (read-only by convention) via Bulkhead.get_stats().
    """
    total_requests: int = 0  # Every call to execute(), whatever the outcome
    successful_requests: int = 0  # Operations that returned normally
    failed_requests: int = 0  # Operations that raised a non-bulkhead exception
    rejected_requests: int = 0  # Rejected before execution (e.g. queue full)
    timeout_requests: int = 0  # Timed out acquiring capacity or awaiting a result
    current_active: int = 0  # Operations executing right now
    current_queued: int = 0  # Operations currently waiting in the queue
    max_active_reached: int = 0  # High-water mark of current_active
    max_queued_reached: int = 0  # High-water mark of current_queued
    total_wait_time_ms: float = 0.0  # Accumulated time spent waiting for capacity
    total_execution_time_ms: float = 0.0  # Accumulated time spent executing
51
+
52
+
53
class BulkheadRejectedException(Exception):
    """Exception raised when bulkhead rejects request.

    Raised before the operation runs (e.g. the bulkhead queue is full);
    counted in BulkheadStats.rejected_requests.
    """
    pass
56
+
57
+
58
class BulkheadTimeoutException(Exception):
    """Exception raised when operation times out.

    Raised when capacity cannot be acquired, or a queued result does not
    arrive, within BulkheadConfig.timeout; counted in timeout_requests.
    """
    pass
61
+
62
+
63
class Bulkhead:
    """
    Bulkhead implementation for resource isolation.

    Features:
    - Configurable concurrency limits
    - Queue management
    - Timeout handling
    - Performance monitoring
    - Different isolation strategies

    Notes:
        Only SEMAPHORE and QUEUE strategies have dedicated handling here;
        any other configured type falls through to direct, unprotected
        execution (see execute()).
    """

    def __init__(
        self,
        name: str,
        config: Optional[BulkheadConfig] = None
    ):
        """
        Initialize bulkhead.

        Args:
            name: Bulkhead name for identification
            config: Configuration parameters

        Note:
            QUEUE-type bulkheads start worker tasks via asyncio.create_task,
            so they must be constructed while an event loop is running.
        """
        self.name = name
        self.config = config or BulkheadConfig()
        self.stats = BulkheadStats()

        # Initialize based on bulkhead type
        if self.config.bulkhead_type == BulkheadType.SEMAPHORE:
            self._semaphore = asyncio.Semaphore(self.config.max_concurrent)
        elif self.config.bulkhead_type == BulkheadType.QUEUE:
            # maxsize=0 means "unlimited" for asyncio.Queue
            self._queue: asyncio.Queue = asyncio.Queue(
                maxsize=self.config.queue_size or 0
            )
            self._workers: Set[asyncio.Task] = set()
            self._start_workers()

        self._active_operations: Set[str] = set()
        self._lock = asyncio.Lock()

        logger.info(f"Bulkhead '{name}' initialized with type {self.config.bulkhead_type}")

    async def execute(self, func: Callable, *args, **kwargs) -> Any:
        """
        Execute function with bulkhead protection.

        Args:
            func: Function to execute
            *args: Function arguments
            **kwargs: Function keyword arguments

        Returns:
            Function result

        Raises:
            BulkheadRejectedException: When bulkhead rejects request
            BulkheadTimeoutException: When operation times out
        """
        operation_id = str(uuid.uuid4())
        start_time = time.time()

        async with self._lock:
            self.stats.total_requests += 1

        try:
            if self.config.bulkhead_type == BulkheadType.SEMAPHORE:
                return await self._execute_with_semaphore(
                    func, operation_id, start_time, *args, **kwargs
                )
            elif self.config.bulkhead_type == BulkheadType.QUEUE:
                return await self._execute_with_queue(
                    func, operation_id, start_time, *args, **kwargs
                )
            else:
                # Direct execution (no protection) for unhandled types
                return await self._execute_function(func, *args, **kwargs)

        except Exception as e:
            # Classify the outcome exactly once here, so the per-strategy
            # helpers only need to record successes.
            async with self._lock:
                if isinstance(e, (BulkheadRejectedException, BulkheadTimeoutException)):
                    if isinstance(e, BulkheadRejectedException):
                        self.stats.rejected_requests += 1
                    else:
                        self.stats.timeout_requests += 1
                else:
                    self.stats.failed_requests += 1
            raise

    async def _execute_with_semaphore(
        self,
        func: Callable,
        operation_id: str,
        start_time: float,
        *args,
        **kwargs
    ) -> Any:
        """Execute function using semaphore isolation."""
        wait_start = time.time()

        try:
            # Try to acquire a concurrency slot, bounded by the timeout
            await asyncio.wait_for(
                self._semaphore.acquire(),
                timeout=self.config.timeout
            )
        except asyncio.TimeoutError:
            raise BulkheadTimeoutException(
                f"Failed to acquire semaphore for bulkhead '{self.name}' "
                f"within {self.config.timeout}s"
            )

        wait_time = time.time() - wait_start

        try:
            async with self._lock:
                self.stats.current_active += 1
                self.stats.max_active_reached = max(
                    self.stats.max_active_reached,
                    self.stats.current_active
                )
                self.stats.total_wait_time_ms += wait_time * 1000
                self._active_operations.add(operation_id)

            # Execute function (success accounting happens only on normal return)
            exec_start = time.time()
            result = await self._execute_function(func, *args, **kwargs)
            exec_time = time.time() - exec_start

            async with self._lock:
                self.stats.successful_requests += 1
                self.stats.total_execution_time_ms += exec_time * 1000

            return result

        finally:
            async with self._lock:
                self.stats.current_active -= 1
                self._active_operations.discard(operation_id)

            self._semaphore.release()

    async def _execute_with_queue(
        self,
        func: Callable,
        operation_id: str,
        start_time: float,
        *args,
        **kwargs
    ) -> Any:
        """Execute function using queue isolation (processed by worker tasks)."""
        # Create operation item; the future is resolved by a worker task.
        operation = {
            "id": operation_id,
            "func": func,
            "args": args,
            "kwargs": kwargs,
            "future": asyncio.Future(),
            "submitted_at": time.time()
        }

        # FIX: only undo the queued-gauge accounting if we actually enqueued;
        # previously a rejected request decremented current_queued it never
        # incremented, corrupting the gauge.
        enqueued = False

        try:
            # Reject eagerly instead of blocking in put() when the queue is full.
            if self.config.queue_size and self._queue.qsize() >= self.config.queue_size:
                raise BulkheadRejectedException(
                    f"Queue full for bulkhead '{self.name}' "
                    f"(size: {self._queue.qsize()})"
                )

            await self._queue.put(operation)
            enqueued = True

            async with self._lock:
                self.stats.current_queued += 1
                self.stats.max_queued_reached = max(
                    self.stats.max_queued_reached,
                    self.stats.current_queued
                )

            # Wait for the worker-produced result with timeout
            try:
                result = await asyncio.wait_for(
                    operation["future"],
                    timeout=self.config.timeout
                )

                async with self._lock:
                    self.stats.successful_requests += 1

                return result

            except asyncio.TimeoutError:
                # Cancel the operation; workers skip cancelled futures.
                operation["future"].cancel()
                raise BulkheadTimeoutException(
                    f"Operation timed out in bulkhead '{self.name}' "
                    f"after {self.config.timeout}s"
                )

        finally:
            if enqueued:
                async with self._lock:
                    if self.stats.current_queued > 0:
                        self.stats.current_queued -= 1

    def _start_workers(self):
        """Start worker tasks for queue processing (requires a running loop)."""
        for i in range(self.config.max_concurrent):
            worker = asyncio.create_task(
                self._worker_loop(f"worker-{i}")
            )
            self._workers.add(worker)

    async def _worker_loop(self, worker_name: str):
        """Worker loop for processing queued operations."""
        logger.debug(f"Worker {worker_name} started for bulkhead '{self.name}'")

        while True:
            try:
                # Get next operation from queue
                operation = await self._queue.get()

                if operation is None:  # Shutdown signal (see shutdown())
                    break

                operation_id = operation["id"]
                wait_time = time.time() - operation["submitted_at"]

                try:
                    async with self._lock:
                        self.stats.current_active += 1
                        self.stats.max_active_reached = max(
                            self.stats.max_active_reached,
                            self.stats.current_active
                        )
                        self.stats.total_wait_time_ms += wait_time * 1000
                        self._active_operations.add(operation_id)

                    # Execute unless the submitter already timed out/cancelled
                    if not operation["future"].cancelled():
                        exec_start = time.time()
                        result = await self._execute_function(
                            operation["func"],
                            *operation["args"],
                            **operation["kwargs"]
                        )
                        exec_time = time.time() - exec_start

                        operation["future"].set_result(result)

                        async with self._lock:
                            self.stats.total_execution_time_ms += exec_time * 1000

                except Exception as e:
                    # Propagate to the waiting submitter via the future.
                    if not operation["future"].cancelled():
                        operation["future"].set_exception(e)

                    async with self._lock:
                        # NOTE(review): the submitter's execute() also counts
                        # this exception once it re-raises from the future, so
                        # failed_requests may be incremented twice per failure.
                        self.stats.failed_requests += 1

                finally:
                    async with self._lock:
                        self.stats.current_active -= 1
                        self._active_operations.discard(operation_id)

                    self._queue.task_done()

            except Exception as e:
                # Keep the worker alive on unexpected errors; CancelledError
                # is a BaseException and still propagates for task cancellation.
                logger.error(f"Worker {worker_name} error: {e}")

    async def _execute_function(self, func: Callable, *args, **kwargs) -> Any:
        """Execute function, handling both sync and async functions."""
        if asyncio.iscoroutinefunction(func):
            return await func(*args, **kwargs)
        # Run sync function in the default thread pool. FIX: run_in_executor()
        # does not forward keyword arguments, so close over them instead of
        # passing **kwargs (which raised TypeError).
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, lambda: func(*args, **kwargs))

    def get_stats(self) -> Dict[str, Any]:
        """Get bulkhead statistics as a JSON-serializable dict."""
        success_rate = (
            self.stats.successful_requests / self.stats.total_requests
            if self.stats.total_requests > 0 else 0
        )

        avg_wait_time = (
            self.stats.total_wait_time_ms / self.stats.total_requests
            if self.stats.total_requests > 0 else 0
        )

        avg_exec_time = (
            self.stats.total_execution_time_ms / self.stats.successful_requests
            if self.stats.successful_requests > 0 else 0
        )

        return {
            "name": self.name,
            "type": self.config.bulkhead_type.value,
            "config": {
                "max_concurrent": self.config.max_concurrent,
                "queue_size": self.config.queue_size,
                "timeout": self.config.timeout
            },
            "stats": {
                "total_requests": self.stats.total_requests,
                "successful_requests": self.stats.successful_requests,
                "failed_requests": self.stats.failed_requests,
                "rejected_requests": self.stats.rejected_requests,
                "timeout_requests": self.stats.timeout_requests,
                "success_rate": success_rate,
                "current_active": self.stats.current_active,
                "current_queued": self.stats.current_queued,
                "max_active_reached": self.stats.max_active_reached,
                "max_queued_reached": self.stats.max_queued_reached,
                "avg_wait_time_ms": avg_wait_time,
                "avg_execution_time_ms": avg_exec_time
            }
        }

    async def shutdown(self):
        """Shutdown bulkhead and cleanup resources."""
        if self.config.bulkhead_type == BulkheadType.QUEUE:
            # One None sentinel per worker makes each worker loop break cleanly.
            for _ in self._workers:
                await self._queue.put(None)

            # Wait for workers to finish
            await asyncio.gather(*self._workers, return_exceptions=True)
            self._workers.clear()

        logger.info(f"Bulkhead '{self.name}' shut down")
393
+
394
+
395
class BulkheadManager:
    """
    Manager for multiple bulkheads.

    Keeps one lazily-created Bulkhead per resource type, applies registered
    per-type default configurations, and aggregates statistics across them.
    """

    def __init__(self):
        """Initialize empty bulkhead and default-config registries."""
        self._bulkheads: Dict[str, Bulkhead] = {}
        self._default_configs: Dict[str, BulkheadConfig] = {}

    def register_default_config(
        self,
        resource_type: str,
        config: BulkheadConfig
    ):
        """
        Register default configuration for a resource type.

        Args:
            resource_type: Resource type name
            config: Default configuration
        """
        self._default_configs[resource_type] = config
        logger.info(f"Registered default bulkhead config for '{resource_type}'")

    def get_bulkhead(
        self,
        resource_type: str,
        config: Optional[BulkheadConfig] = None
    ) -> Bulkhead:
        """
        Get or create bulkhead for resource type.

        Args:
            resource_type: Resource type name
            config: Configuration (uses default if not provided)

        Returns:
            Bulkhead instance
        """
        existing = self._bulkheads.get(resource_type)
        if existing is not None:
            return existing

        # Precedence: explicit config, then registered default, then baseline.
        chosen = config or self._default_configs.get(resource_type) or BulkheadConfig()
        created = Bulkhead(resource_type, chosen)
        self._bulkheads[resource_type] = created
        return created

    async def execute_with_bulkhead(
        self,
        resource_type: str,
        func: Callable,
        *args,
        config: Optional[BulkheadConfig] = None,
        **kwargs
    ) -> Any:
        """
        Execute function with bulkhead protection.

        Args:
            resource_type: Resource type name
            func: Function to execute
            *args: Function arguments
            config: Optional configuration
            **kwargs: Function keyword arguments

        Returns:
            Function result
        """
        return await self.get_bulkhead(resource_type, config).execute(
            func, *args, **kwargs
        )

    def get_all_stats(self) -> Dict[str, Any]:
        """Get statistics for all bulkheads, keyed by bulkhead name."""
        stats_by_name: Dict[str, Any] = {}
        for bulkhead_name, instance in self._bulkheads.items():
            stats_by_name[bulkhead_name] = instance.get_stats()
        return stats_by_name

    async def shutdown_all(self):
        """Shutdown all bulkheads."""
        for instance in self._bulkheads.values():
            await instance.shutdown()

        logger.info("All bulkheads shut down")

    def get_resource_utilization(self) -> Dict[str, Any]:
        """Get resource utilization across all bulkheads."""
        capacity_sum = 0
        active_sum = 0
        queued_sum = 0
        per_resource: Dict[str, Any] = {}

        for bulkhead_name, instance in self._bulkheads.items():
            snapshot = instance.get_stats()
            cap = snapshot["config"]["max_concurrent"]
            act = snapshot["stats"]["current_active"]
            qd = snapshot["stats"]["current_queued"]

            capacity_sum += cap
            active_sum += act
            queued_sum += qd

            per_resource[bulkhead_name] = {
                "utilization": act / cap if cap > 0 else 0,
                "active": act,
                "capacity": cap,
                "queued": qd
            }

        return {
            "overall_utilization": active_sum / capacity_sum if capacity_sum > 0 else 0,
            "total_capacity": capacity_sum,
            "total_active": active_sum,
            "total_queued": queued_sum,
            "resources": per_resource
        }
526
+
527
+
528
# Global bulkhead manager.
# Module-level singleton used by the @bulkhead decorator below and
# re-exported via the resilience package __init__ for app-wide sharing.
bulkhead_manager = BulkheadManager()
530
+
531
+
532
# Pre-configured bulkheads for common resource types
def setup_default_bulkheads():
    """Register default bulkhead configurations for well-known resource types.

    Only stores configurations on the global manager; actual Bulkhead
    instances are created lazily the first time each resource type is used.
    """
    defaults = {
        # Database operations
        "database": BulkheadConfig(
            max_concurrent=20,
            queue_size=100,
            timeout=30.0,
            bulkhead_type=BulkheadType.SEMAPHORE
        ),
        # External API calls
        "external_api": BulkheadConfig(
            max_concurrent=10,
            queue_size=50,
            timeout=15.0,
            bulkhead_type=BulkheadType.QUEUE
        ),
        # LLM operations
        "llm_operations": BulkheadConfig(
            max_concurrent=5,
            queue_size=20,
            timeout=60.0,
            bulkhead_type=BulkheadType.QUEUE
        ),
        # File operations
        "file_operations": BulkheadConfig(
            max_concurrent=15,
            timeout=30.0,
            bulkhead_type=BulkheadType.SEMAPHORE
        ),
        # Analytics operations
        "analytics": BulkheadConfig(
            max_concurrent=8,
            queue_size=30,
            timeout=120.0,
            bulkhead_type=BulkheadType.QUEUE
        ),
    }

    for resource_type, default_config in defaults.items():
        bulkhead_manager.register_default_config(resource_type, default_config)
588
+ )
589
+
590
+
591
# Initialize default configurations.
# Runs at import time; this only registers configs (no event loop needed —
# Bulkhead instances, and any worker tasks, are created lazily on first use).
setup_default_bulkheads()
593
+
594
+
595
# Convenience decorator
def bulkhead(
    resource_type: str,
    config: Optional[BulkheadConfig] = None
):
    """
    Decorator to protect functions with bulkhead.

    Args:
        resource_type: Resource type for bulkhead
        config: Optional configuration

    Returns:
        A decorator producing an async wrapper that routes every call
        through ``bulkhead_manager.execute_with_bulkhead``.
    """
    import functools  # local import: this module has no top-level functools import

    def decorator(func):
        # FIX: preserve __name__/__doc__/__wrapped__ so introspection and
        # framework route registration see the original function metadata.
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            return await bulkhead_manager.execute_with_bulkhead(
                resource_type, func, *args, config=config, **kwargs
            )
        return wrapper
    return decorator
src/infrastructure/resilience/circuit_breaker.py ADDED
@@ -0,0 +1,516 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Circuit breaker pattern implementation for external services.
3
+
4
+ This module provides circuit breaker functionality to prevent cascading
5
+ failures and improve system resilience.
6
+ """
7
+
8
+ import asyncio
9
+ from typing import Any, Callable, Optional, Dict, Union
10
+ from datetime import datetime, timedelta
11
+ from enum import Enum
12
+ import time
13
+ from dataclasses import dataclass, field
14
+
15
+ from src.core import get_logger
16
+
17
+ logger = get_logger(__name__)
18
+
19
+
20
+ class CircuitState(str, Enum):
21
+ """Circuit breaker states."""
22
+ CLOSED = "closed" # Normal operation
23
+ OPEN = "open" # Circuit is open, rejecting requests
24
+ HALF_OPEN = "half_open" # Testing if service is recovered
25
+
26
+
27
+ @dataclass
28
+ class CircuitBreakerConfig:
29
+ """Circuit breaker configuration."""
30
+ failure_threshold: int = 5 # Failures before opening
31
+ recovery_timeout: float = 60.0 # Seconds before trying half-open
32
+ success_threshold: int = 3 # Successes to close from half-open
33
+ timeout: float = 30.0 # Request timeout
34
+ expected_exception: type = Exception # Exception type to count as failure
35
+
36
+
37
+ @dataclass
38
+ class CircuitBreakerStats:
39
+ """Circuit breaker statistics."""
40
+ total_requests: int = 0
41
+ successful_requests: int = 0
42
+ failed_requests: int = 0
43
+ rejected_requests: int = 0
44
+ state_changes: int = 0
45
+ last_failure_time: Optional[datetime] = None
46
+ last_success_time: Optional[datetime] = None
47
+ current_consecutive_failures: int = 0
48
+ current_consecutive_successes: int = 0
49
+
50
+
51
+ class CircuitBreakerOpenException(Exception):
52
+ """Exception raised when circuit breaker is open."""
53
+ pass
54
+
55
+
56
+ class CircuitBreakerTimeoutException(Exception):
57
+ """Exception raised when request times out."""
58
+ pass
59
+
60
+
61
+ class CircuitBreaker:
62
+ """
63
+ Circuit breaker implementation for resilient external service calls.
64
+
65
+ Features:
66
+ - Automatic failure detection
67
+ - Configurable thresholds
68
+ - Recovery mechanism
69
+ - Statistics and monitoring
70
+ - Async/await support
71
+ """
72
+
73
+ def __init__(
74
+ self,
75
+ name: str,
76
+ config: Optional[CircuitBreakerConfig] = None
77
+ ):
78
+ """
79
+ Initialize circuit breaker.
80
+
81
+ Args:
82
+ name: Circuit breaker name for identification
83
+ config: Configuration parameters
84
+ """
85
+ self.name = name
86
+ self.config = config or CircuitBreakerConfig()
87
+ self.state = CircuitState.CLOSED
88
+ self.stats = CircuitBreakerStats()
89
+ self._lock = asyncio.Lock()
90
+ self._last_failure_time: Optional[float] = None
91
+
92
+ logger.info(f"Circuit breaker '{name}' initialized")
93
+
94
+ async def call(self, func: Callable, *args, **kwargs) -> Any:
95
+ """
96
+ Execute function with circuit breaker protection.
97
+
98
+ Args:
99
+ func: Function to execute
100
+ *args: Function arguments
101
+ **kwargs: Function keyword arguments
102
+
103
+ Returns:
104
+ Function result
105
+
106
+ Raises:
107
+ CircuitBreakerOpenException: When circuit is open
108
+ CircuitBreakerTimeoutException: When request times out
109
+ """
110
+ async with self._lock:
111
+ self.stats.total_requests += 1
112
+
113
+ # Check if circuit should be opened
114
+ await self._check_state()
115
+
116
+ if self.state == CircuitState.OPEN:
117
+ self.stats.rejected_requests += 1
118
+ raise CircuitBreakerOpenException(
119
+ f"Circuit breaker '{self.name}' is open"
120
+ )
121
+
122
+ # Execute the function
123
+ start_time = time.time()
124
+
125
+ try:
126
+ # Execute with timeout
127
+ result = await asyncio.wait_for(
128
+ self._execute_async(func, *args, **kwargs),
129
+ timeout=self.config.timeout
130
+ )
131
+
132
+ # Record success
133
+ await self._record_success()
134
+
135
+ execution_time = time.time() - start_time
136
+ logger.debug(
137
+ f"Circuit breaker '{self.name}' - Success "
138
+ f"(time: {execution_time:.3f}s)"
139
+ )
140
+
141
+ return result
142
+
143
+ except asyncio.TimeoutError:
144
+ await self._record_failure()
145
+ execution_time = time.time() - start_time
146
+
147
+ logger.warning(
148
+ f"Circuit breaker '{self.name}' - Timeout "
149
+ f"(time: {execution_time:.3f}s)"
150
+ )
151
+
152
+ raise CircuitBreakerTimeoutException(
153
+ f"Request to '{self.name}' timed out after {self.config.timeout}s"
154
+ )
155
+
156
+ except self.config.expected_exception as e:
157
+ await self._record_failure()
158
+ execution_time = time.time() - start_time
159
+
160
+ logger.warning(
161
+ f"Circuit breaker '{self.name}' - Failure: {e} "
162
+ f"(time: {execution_time:.3f}s)"
163
+ )
164
+
165
+ raise
166
+
167
+ async def _execute_async(self, func: Callable, *args, **kwargs) -> Any:
168
+ """Execute function, handling both sync and async functions."""
169
+ if asyncio.iscoroutinefunction(func):
170
+ return await func(*args, **kwargs)
171
+ else:
172
+ # Run sync function in thread pool
173
+ loop = asyncio.get_event_loop()
174
+ return await loop.run_in_executor(None, func, *args, **kwargs)
175
+
176
+ async def _check_state(self):
177
+ """Check and update circuit breaker state."""
178
+ current_time = time.time()
179
+
180
+ if self.state == CircuitState.OPEN:
181
+ # Check if we should try half-open
182
+ if (self._last_failure_time and
183
+ current_time - self._last_failure_time >= self.config.recovery_timeout):
184
+
185
+ self.state = CircuitState.HALF_OPEN
186
+ self.stats.state_changes += 1
187
+
188
+ logger.info(
189
+ f"Circuit breaker '{self.name}' transitioned to HALF_OPEN"
190
+ )
191
+
192
+ elif self.state == CircuitState.HALF_OPEN:
193
+ # Half-open state is handled in success/failure recording
194
+ pass
195
+
196
+ async def _record_success(self):
197
+ """Record successful execution."""
198
+ async with self._lock:
199
+ self.stats.successful_requests += 1
200
+ self.stats.current_consecutive_failures = 0
201
+ self.stats.current_consecutive_successes += 1
202
+ self.stats.last_success_time = datetime.utcnow()
203
+
204
+ if self.state == CircuitState.HALF_OPEN:
205
+ if (self.stats.current_consecutive_successes >=
206
+ self.config.success_threshold):
207
+
208
+ # Transition to closed
209
+ self.state = CircuitState.CLOSED
210
+ self.stats.state_changes += 1
211
+ self.stats.current_consecutive_successes = 0
212
+
213
+ logger.info(
214
+ f"Circuit breaker '{self.name}' transitioned to CLOSED"
215
+ )
216
+
217
+ async def _record_failure(self):
218
+ """Record failed execution."""
219
+ async with self._lock:
220
+ self.stats.failed_requests += 1
221
+ self.stats.current_consecutive_successes = 0
222
+ self.stats.current_consecutive_failures += 1
223
+ self.stats.last_failure_time = datetime.utcnow()
224
+ self._last_failure_time = time.time()
225
+
226
+ # Check if we should open the circuit
227
+ if (self.state in [CircuitState.CLOSED, CircuitState.HALF_OPEN] and
228
+ self.stats.current_consecutive_failures >= self.config.failure_threshold):
229
+
230
+ self.state = CircuitState.OPEN
231
+ self.stats.state_changes += 1
232
+
233
+ logger.warning(
234
+ f"Circuit breaker '{self.name}' opened after "
235
+ f"{self.stats.current_consecutive_failures} consecutive failures"
236
+ )
237
+
238
+ def get_stats(self) -> Dict[str, Any]:
239
+ """Get circuit breaker statistics."""
240
+ success_rate = (
241
+ self.stats.successful_requests / self.stats.total_requests
242
+ if self.stats.total_requests > 0 else 0
243
+ )
244
+
245
+ return {
246
+ "name": self.name,
247
+ "state": self.state.value,
248
+ "config": {
249
+ "failure_threshold": self.config.failure_threshold,
250
+ "recovery_timeout": self.config.recovery_timeout,
251
+ "success_threshold": self.config.success_threshold,
252
+ "timeout": self.config.timeout
253
+ },
254
+ "stats": {
255
+ "total_requests": self.stats.total_requests,
256
+ "successful_requests": self.stats.successful_requests,
257
+ "failed_requests": self.stats.failed_requests,
258
+ "rejected_requests": self.stats.rejected_requests,
259
+ "success_rate": success_rate,
260
+ "state_changes": self.stats.state_changes,
261
+ "current_consecutive_failures": self.stats.current_consecutive_failures,
262
+ "current_consecutive_successes": self.stats.current_consecutive_successes,
263
+ "last_failure_time": (
264
+ self.stats.last_failure_time.isoformat()
265
+ if self.stats.last_failure_time else None
266
+ ),
267
+ "last_success_time": (
268
+ self.stats.last_success_time.isoformat()
269
+ if self.stats.last_success_time else None
270
+ )
271
+ }
272
+ }
273
+
274
+ async def reset(self):
275
+ """Reset circuit breaker to closed state."""
276
+ async with self._lock:
277
+ self.state = CircuitState.CLOSED
278
+ self.stats.current_consecutive_failures = 0
279
+ self.stats.current_consecutive_successes = 0
280
+ self._last_failure_time = None
281
+
282
+ logger.info(f"Circuit breaker '{self.name}' manually reset")
283
+
284
+ async def force_open(self):
285
+ """Force circuit breaker to open state."""
286
+ async with self._lock:
287
+ self.state = CircuitState.OPEN
288
+ self._last_failure_time = time.time()
289
+
290
+ logger.warning(f"Circuit breaker '{self.name}' manually opened")
291
+
292
+
293
class CircuitBreakerManager:
    """
    Manager for multiple circuit breakers.

    Provides centralized management and monitoring of circuit breakers.
    Breakers are created lazily per service name, seeded with an
    explicit config, a pre-registered default, or library defaults.
    """

    def __init__(self):
        """Initialize circuit breaker manager with empty registries."""
        # service name -> live breaker instance
        self._breakers: Dict[str, CircuitBreaker] = {}
        # service name -> config used when a breaker is first created
        self._default_configs: Dict[str, CircuitBreakerConfig] = {}

    def register_default_config(
        self,
        service_name: str,
        config: CircuitBreakerConfig
    ):
        """
        Register default configuration for a service.

        Args:
            service_name: Service name
            config: Default configuration
        """
        self._default_configs[service_name] = config
        logger.info(f"Registered default config for service '{service_name}'")

    def get_circuit_breaker(
        self,
        service_name: str,
        config: Optional[CircuitBreakerConfig] = None
    ) -> CircuitBreaker:
        """
        Get or create circuit breaker for service.

        Args:
            service_name: Service name
            config: Configuration (uses registered default if not
                provided; ignored once a breaker already exists)

        Returns:
            Circuit breaker instance
        """
        breaker = self._breakers.get(service_name)
        if breaker is None:
            # Precedence: explicit config > registered default > library defaults
            effective_config = (
                config
                or self._default_configs.get(service_name)
                or CircuitBreakerConfig()
            )
            breaker = CircuitBreaker(service_name, effective_config)
            self._breakers[service_name] = breaker
        return breaker

    async def call_service(
        self,
        service_name: str,
        func: Callable,
        *args,
        config: Optional[CircuitBreakerConfig] = None,
        **kwargs
    ) -> Any:
        """
        Call service through circuit breaker.

        Args:
            service_name: Service name
            func: Function to call
            *args: Function arguments
            config: Optional configuration
            **kwargs: Function keyword arguments

        Returns:
            Function result
        """
        return await self.get_circuit_breaker(service_name, config).call(
            func, *args, **kwargs
        )

    def get_all_stats(self) -> Dict[str, Any]:
        """Get statistics for all circuit breakers, keyed by service name."""
        stats: Dict[str, Any] = {}
        for name, breaker in self._breakers.items():
            stats[name] = breaker.get_stats()
        return stats

    async def reset_all(self):
        """Reset all circuit breakers to the CLOSED state."""
        for breaker in self._breakers.values():
            await breaker.reset()

        logger.info("All circuit breakers reset")

    def get_health_status(self) -> Dict[str, Any]:
        """Get aggregate health status of all registered services."""
        healthy: list = []
        degraded: list = []
        failed: list = []

        for name, breaker in self._breakers.items():
            if breaker.state == CircuitState.CLOSED:
                healthy.append(name)
            elif breaker.state == CircuitState.HALF_OPEN:
                degraded.append(name)
            else:  # OPEN
                failed.append(name)

        total = len(self._breakers)

        # Any open breaker degrades the system; with nothing healthy
        # left at all the situation is critical.
        if failed:
            overall_health = "critical" if not healthy else "degraded"
        elif degraded:
            overall_health = "degraded"
        else:
            overall_health = "healthy"

        return {
            "overall_health": overall_health,
            "total_services": total,
            "healthy_services": healthy,
            "degraded_services": degraded,
            "failed_services": failed,
            "health_score": len(healthy) / total if total > 0 else 1.0
        }
422
+
423
+
424
# Global circuit breaker manager
# Module-level singleton: all callers share this instance so breaker
# state is consistent across the process.
circuit_breaker_manager = CircuitBreakerManager()
426
+
427
+
428
+ # Pre-configured circuit breakers for common services
429
def setup_default_circuit_breakers():
    """Setup default circuit breaker configurations.

    Registers per-service defaults on the global manager; tighter
    thresholds for fast local dependencies (database, redis), looser
    ones for slow external services (LLMs).
    """
    default_configs = {
        # Portal da Transparência API
        "transparency_api": CircuitBreakerConfig(
            failure_threshold=3,
            recovery_timeout=30.0,
            success_threshold=2,
            timeout=15.0,
        ),
        # LLM Services (Groq, etc)
        "llm_service": CircuitBreakerConfig(
            failure_threshold=5,
            recovery_timeout=60.0,
            success_threshold=3,
            timeout=30.0,
        ),
        # Database connections
        "database": CircuitBreakerConfig(
            failure_threshold=2,
            recovery_timeout=10.0,
            success_threshold=1,
            timeout=5.0,
        ),
        # Redis connections
        "redis": CircuitBreakerConfig(
            failure_threshold=3,
            recovery_timeout=20.0,
            success_threshold=2,
            timeout=3.0,
        ),
    }

    # dict literals preserve insertion order, so registration order is
    # identical to registering each service in turn.
    for service_name, config in default_configs.items():
        circuit_breaker_manager.register_default_config(service_name, config)
475
+
476
+
477
# Initialize default configurations
# Runs at import time so every consumer of circuit_breaker_manager
# sees the same per-service defaults.
setup_default_circuit_breakers()
479
+
480
+
481
+ # Convenience decorators
482
def circuit_breaker(
    service_name: str,
    config: Optional[CircuitBreakerConfig] = None
):
    """
    Decorator to protect async functions with a circuit breaker.

    Calls to the decorated coroutine are routed through the shared
    ``circuit_breaker_manager`` under ``service_name``.

    Args:
        service_name: Service name for circuit breaker
        config: Optional configuration (used only the first time a
            breaker is created for this service)

    Returns:
        A decorator producing an async wrapper around the function.
    """
    # Local import: this module's import header sits outside this block.
    import functools

    def decorator(func):
        # functools.wraps preserves __name__/__doc__/signature metadata,
        # which introspection-based tooling (logging, docs, tests) relies on.
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            return await circuit_breaker_manager.call_service(
                service_name, func, *args, config=config, **kwargs
            )
        return wrapper
    return decorator
500
+
501
+
502
+ # Example usage functions
503
async def protected_api_call(url: str) -> dict:
    """Example of API call protected by circuit breaker.

    Fetches *url* through the "external_api" breaker and returns the
    decoded JSON body.
    """
    import httpx

    async def _fetch() -> dict:
        async with httpx.AsyncClient() as client:
            resp = await client.get(url)
            resp.raise_for_status()
            return resp.json()

    return await circuit_breaker_manager.call_service("external_api", _fetch)
src/llm/providers.py CHANGED
@@ -16,9 +16,10 @@ from enum import Enum
16
  import httpx
17
  from pydantic import BaseModel, Field as PydanticField
18
 
19
- from src.core import get_logger, settings
20
  from src.core.exceptions import LLMError, LLMRateLimitError
21
  from src.services.maritaca_client import MaritacaClient, MaritacaModel
 
22
 
23
 
24
  class LLMProvider(str, Enum):
@@ -81,14 +82,19 @@ class BaseLLMProvider(ABC):
81
  self.timeout = timeout
82
  self.max_retries = max_retries
83
  self.logger = get_logger(__name__)
 
84
 
85
- self.client = httpx.AsyncClient(
86
- timeout=httpx.Timeout(timeout),
87
- limits=httpx.Limits(max_keepalive_connections=10, max_connections=20),
88
- )
89
 
90
  async def __aenter__(self):
91
  """Async context manager entry."""
 
 
 
 
 
 
92
  return self
93
 
94
  async def __aexit__(self, exc_type, exc_val, exc_tb):
@@ -97,7 +103,8 @@ class BaseLLMProvider(ABC):
97
 
98
  async def close(self):
99
  """Close HTTP client."""
100
- await self.client.aclose()
 
101
 
102
  @abstractmethod
103
  async def complete(self, request: LLMRequest) -> LLMResponse:
@@ -134,6 +141,30 @@ class BaseLLMProvider(ABC):
134
  stream: bool = False
135
  ) -> Union[Dict[str, Any], AsyncGenerator[Dict[str, Any], None]]:
136
  """Make HTTP request with retry logic."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  url = f"{self.base_url}{endpoint}"
138
  headers = self._get_headers()
139
 
@@ -165,7 +196,7 @@ class BaseLLMProvider(ABC):
165
  else:
166
  response = await self.client.post(
167
  url,
168
- json=data,
169
  headers=headers,
170
  )
171
 
@@ -178,7 +209,7 @@ class BaseLLMProvider(ABC):
178
  response_time=response_time,
179
  )
180
 
181
- return response.json()
182
  else:
183
  await self._handle_error_response(response, attempt)
184
 
@@ -277,7 +308,7 @@ class BaseLLMProvider(ABC):
277
  if data == "[DONE]":
278
  break
279
  try:
280
- yield eval(data) # Parse JSON chunk
281
  except:
282
  continue
283
 
@@ -294,6 +325,7 @@ class GroqProvider(BaseLLMProvider):
294
  timeout=60,
295
  max_retries=3,
296
  )
 
297
 
298
  async def complete(self, request: LLMRequest) -> LLMResponse:
299
  """Complete text generation using Groq."""
 
16
  import httpx
17
  from pydantic import BaseModel, Field as PydanticField
18
 
19
+ from src.core import get_logger, settings, get_llm_pool
20
  from src.core.exceptions import LLMError, LLMRateLimitError
21
  from src.services.maritaca_client import MaritacaClient, MaritacaModel
22
+ from src.core.json_utils import dumps, loads
23
 
24
 
25
  class LLMProvider(str, Enum):
 
82
  self.timeout = timeout
83
  self.max_retries = max_retries
84
  self.logger = get_logger(__name__)
85
+ self._use_pool = True # Flag to use connection pool
86
 
87
+ # Legacy client for backward compatibility
88
+ self.client = None
 
 
89
 
90
  async def __aenter__(self):
91
  """Async context manager entry."""
92
+ # Initialize legacy client if not using pool
93
+ if not self._use_pool and not self.client:
94
+ self.client = httpx.AsyncClient(
95
+ timeout=httpx.Timeout(self.timeout),
96
+ limits=httpx.Limits(max_keepalive_connections=10, max_connections=20),
97
+ )
98
  return self
99
 
100
  async def __aexit__(self, exc_type, exc_val, exc_tb):
 
103
 
104
  async def close(self):
105
  """Close HTTP client."""
106
+ if self.client:
107
+ await self.client.aclose()
108
 
109
  @abstractmethod
110
  async def complete(self, request: LLMRequest) -> LLMResponse:
 
141
  stream: bool = False
142
  ) -> Union[Dict[str, Any], AsyncGenerator[Dict[str, Any], None]]:
143
  """Make HTTP request with retry logic."""
144
+ # Use connection pool if available
145
+ if self._use_pool and hasattr(self, '_provider_name'):
146
+ try:
147
+ pool = await get_llm_pool()
148
+ if stream:
149
+ # For streaming, fall back to regular client for now
150
+ self.logger.debug("Streaming not yet supported with pool, using regular client")
151
+ else:
152
+ result = await pool.post(
153
+ self._provider_name,
154
+ endpoint,
155
+ data
156
+ )
157
+ return result
158
+ except Exception as e:
159
+ self.logger.warning(f"Pool request failed, falling back to regular client: {e}")
160
+
161
+ # Original implementation for fallback or streaming
162
+ if not self.client:
163
+ self.client = httpx.AsyncClient(
164
+ timeout=httpx.Timeout(self.timeout),
165
+ limits=httpx.Limits(max_keepalive_connections=10, max_connections=20),
166
+ )
167
+
168
  url = f"{self.base_url}{endpoint}"
169
  headers = self._get_headers()
170
 
 
196
  else:
197
  response = await self.client.post(
198
  url,
199
+ content=dumps_bytes(data),
200
  headers=headers,
201
  )
202
 
 
209
  response_time=response_time,
210
  )
211
 
212
+ return loads(response.content)
213
  else:
214
  await self._handle_error_response(response, attempt)
215
 
 
308
  if data == "[DONE]":
309
  break
310
  try:
311
+ yield loads(data) # Parse JSON chunk safely with orjson
312
  except:
313
  continue
314
 
 
325
  timeout=60,
326
  max_retries=3,
327
  )
328
+ self._provider_name = "groq"
329
 
330
  async def complete(self, request: LLMRequest) -> LLMResponse:
331
  """Complete text generation using Groq."""