"""
Chat functionality handler for AI-Inferoxy AI Hub.
Handles chat completion requests with streaming responses.
"""

import os
import gradio as gr
import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
from huggingface_hub import InferenceClient
from huggingface_hub.errors import HfHubHTTPError
from requests.exceptions import ConnectionError, Timeout, RequestException
from hf_token_utils import get_proxy_token, report_token_status
from utils import (
    validate_proxy_key, 
    format_error_message,
    render_with_reasoning_toggle
)

# Timeout configuration for inference requests
INFERENCE_TIMEOUT = 120  # 2 minutes max for inference


def chat_respond(
    message,
    history: list[dict[str, str]],
    system_message,
    model_name,
    provider_override,
    max_tokens,
    temperature,
    top_p,
    client_name: str | None = None,
):
    """
    Chat completion function using AI-Inferoxy token management.
    """
    # Validate proxy API key
    is_valid, error_msg = validate_proxy_key()
    if not is_valid:
        yield error_msg
        return
    
    proxy_api_key = os.getenv("PROXY_KEY")
    
    token_id = None
    try:
        # Get token from AI-Inferoxy proxy server with timeout handling
        print(f"πŸ”‘ Chat: Requesting token from proxy...")
        token, token_id = get_proxy_token(api_key=proxy_api_key)
        print(f"βœ… Chat: Got token: {token_id}")
        
        # Use the provider selected in the dropdown, falling back to "auto"
        model = model_name
        provider = provider_override or "auto"

        print(f"πŸ€– Chat: Using model='{model}', provider='{provider}'")
        
        # Prepare messages first
        messages = [{"role": "system", "content": system_message}]
        messages.extend(history)
        messages.append({"role": "user", "content": message})

        print(f"πŸ’¬ Chat: Prepared {len(messages)} messages, creating client...")
        
        # Create client with the selected provider ("auto" when none was specified)
        client = InferenceClient(
            provider=provider,
            api_key=token
        )
        
        print(f"πŸš€ Chat: Client created, starting inference with timeout...")
        
        chat_completion_kwargs = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "stream": True,
            "temperature": temperature,
            "top_p": top_p,
        }

        response = ""
        
        print(f"πŸ“‘ Chat: Making streaming request with {INFERENCE_TIMEOUT}s timeout...")
        
        # Create streaming function for timeout handling
        def create_stream():
            return client.chat_completion(**chat_completion_kwargs)
        
        # Execute with timeout using ThreadPoolExecutor
        with ThreadPoolExecutor(max_workers=1) as executor:
            future = executor.submit(create_stream)
            
            try:
                # Get the stream with timeout
                stream = future.result(timeout=INFERENCE_TIMEOUT)
                print(f"πŸ”„ Chat: Got stream, starting to iterate...")

                # Track streaming time to detect hangs
                last_token_time = time.time()
                token_timeout = 30  # 30 seconds between tokens
                
                for chunk in stream:
                    current_time = time.time()

                    # Check whether we've been waiting too long between chunks
                    # (this only fires once the next chunk arrives, so it flags
                    # long gaps rather than a fully stalled stream)
                    if current_time - last_token_time > token_timeout:
                        raise TimeoutError(f"No response received for {token_timeout} seconds during streaming")

                    choices = chunk.choices
                    token_content = ""
                    if choices and choices[0].delta.content:
                        token_content = choices[0].delta.content
                        last_token_time = current_time  # Reset timer when we get content

                    response += token_content
                    yield response
                    
            except FutureTimeoutError:
                future.cancel()  # Best-effort cancel; a no-op if the call already started running
                raise TimeoutError(f"Chat request timed out after {INFERENCE_TIMEOUT} seconds")
        
        # Report successful token usage
        if token_id:
            report_token_status(token_id, "success", api_key=proxy_api_key, client_name=client_name)
            
    except ConnectionError as e:
        # Handle proxy connection errors
        error_msg = f"Cannot connect to AI-Inferoxy server: {str(e)}"
        print(f"πŸ”Œ Chat connection error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)
        yield format_error_message("Connection Error", "Unable to connect to the proxy server. Please check if it's running.")
        
    except TimeoutError as e:
        # Handle timeout errors
        error_msg = f"Request timed out: {str(e)}"
        print(f"⏰ Chat timeout: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)
        yield format_error_message("Timeout Error", "The request took too long. The server may be overloaded. Please try again.")
        
    except HfHubHTTPError as e:
        # Handle HuggingFace API errors
        error_msg = str(e)
        print(f"πŸ€— Chat HF error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)
        
        # Provide more user-friendly error messages
        if "401" in error_msg:
            yield format_error_message("Authentication Error", "Invalid or expired API token. The proxy will provide a new token on retry.")
        elif "402" in error_msg:
            yield format_error_message("Quota Exceeded", "API quota exceeded. The proxy will try alternative providers.")
        elif "429" in error_msg:
            yield format_error_message("Rate Limited", "Too many requests. Please wait a moment and try again.")
        else:
            yield format_error_message("HuggingFace API Error", error_msg)
        
    except Exception as e:
        # Handle all other errors
        error_msg = str(e)
        print(f"❌ Chat unexpected error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)
        yield format_error_message("Unexpected Error", f"An unexpected error occurred: {error_msg}")


def handle_chat_submit(message, history, system_msg, model_name, provider, max_tokens, temperature, top_p, hf_token: gr.OAuthToken = None, hf_profile: gr.OAuthProfile = None):
    """
    Handle chat submission and manage conversation history with streaming.
    """
    if not message.strip():
        yield history, ""
        return

    # Require sign-in: if no token present, prompt login
    access_token = getattr(hf_token, "token", None) if hf_token is not None else None
    username = getattr(hf_profile, "username", None) if hf_profile is not None else None
    if not access_token:
        assistant_response = format_error_message("Access Required", "Please sign in with Hugging Face (sidebar Login button).")
        current_history = history + [{"role": "assistant", "content": assistant_response}]
        yield current_history, ""
        return
    
    # Add user message to history
    history = history + [{"role": "user", "content": message}]
    
    # Generate response with streaming
    response_generator = chat_respond(
        message, 
        history[:-1],  # Don't include the current message in history for the function
        system_msg, 
        model_name,
        provider,
        max_tokens, 
        temperature, 
        top_p,
        client_name=username
    )
    
    # Stream the assistant response token by token
    assistant_response = ""
    for partial_response in response_generator:
        assistant_response = render_with_reasoning_toggle(partial_response, True)
        # Update history with the current partial response and yield it
        current_history = history + [{"role": "assistant", "content": assistant_response}]
        yield current_history, ""


def handle_chat_retry(history, system_msg, model_name, provider, max_tokens, temperature, top_p, hf_token: gr.OAuthToken = None, hf_profile: gr.OAuthProfile = None, retry_data: gr.RetryData = None):
    """
    Retry the assistant response for the selected message.
    Works with gr.Chatbot.retry() which provides retry_data.index for the message.
    """
    # Require sign-in: if no token present, prompt login
    access_token = getattr(hf_token, "token", None) if hf_token is not None else None
    username = getattr(hf_profile, "username", None) if hf_profile is not None else None
    if not access_token:
        assistant_response = format_error_message("Access Required", "Please sign in with Hugging Face (sidebar Login button).")
        current_history = (history or []) + [{"role": "assistant", "content": assistant_response}]
        yield current_history
        return
    # Guard: empty history
    if not history:
        yield history
        return

    # Determine which assistant message index to retry
    retry_index = None
    try:
        retry_index = getattr(retry_data, "index", None)
    except Exception:
        retry_index = None

    if retry_index is None:
        # Fallback to last assistant message
        retry_index = len(history) - 1

    # Trim history up to the message being retried (exclude that assistant msg)
    trimmed_history = list(history[:retry_index])

    # Find the most recent user message before retry_index
    last_user_idx = None
    for idx in range(retry_index - 1, -1, -1):
        if trimmed_history[idx].get("role") == "user":
            last_user_idx = idx
            break

    # Nothing to retry if no prior user message
    if last_user_idx is None:
        yield history
        return

    # Message to retry and prior conversation context (before that user msg)
    message = trimmed_history[last_user_idx].get("content", "")
    prior_history = trimmed_history[:last_user_idx]

    if not message.strip():
        yield history
        return

    # Stream a new assistant response
    response_generator = chat_respond(
        message,
        prior_history,
        system_msg,
        model_name,
        provider,
        max_tokens,
        temperature,
        top_p,
        client_name=username
    )

    assistant_response = ""
    for partial_response in response_generator:
        assistant_response = render_with_reasoning_toggle(partial_response, True)
        current_history = trimmed_history + [{"role": "assistant", "content": assistant_response}]
        yield current_history
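

# ---------------------------------------------------------------------------
# Minimal wiring sketch (illustrative only; not the original app layout):
# shows how these handlers could be attached to a Gradio Blocks UI. All
# component names, default values, and the model/provider choices below are
# assumptions for demonstration. gr.OAuthToken / gr.OAuthProfile / gr.RetryData
# arguments are injected by Gradio from the type annotations, so they are not
# listed as inputs.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    with gr.Blocks() as demo:
        gr.LoginButton()
        chatbot = gr.Chatbot(type="messages", label="AI-Inferoxy Chat")
        msg_box = gr.Textbox(label="Message")
        system_box = gr.Textbox(value="You are a helpful assistant.", label="System message")
        model_box = gr.Textbox(value="meta-llama/Llama-3.1-8B-Instruct", label="Model")  # hypothetical default
        provider_box = gr.Dropdown(choices=["auto", "hf-inference"], value="auto", label="Provider")
        max_tokens = gr.Slider(1, 4096, value=512, step=1, label="Max tokens")
        temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.05, label="Top-p")

        # Streaming submit: handle_chat_submit yields (history, "") pairs
        msg_box.submit(
            handle_chat_submit,
            inputs=[msg_box, chatbot, system_box, model_box, provider_box,
                    max_tokens, temperature, top_p],
            outputs=[chatbot, msg_box],
        )
        # Retry event: handle_chat_retry yields updated history only
        chatbot.retry(
            handle_chat_retry,
            inputs=[chatbot, system_box, model_box, provider_box,
                    max_tokens, temperature, top_p],
            outputs=[chatbot],
        )

    demo.launch()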