File size: 7,867 Bytes
0d77564
c2e6d7e
0d77564
 
 
 
9a50492
0d77564
 
 
 
 
 
 
 
 
 
 
43333ad
9a50492
0d77564
 
 
 
 
 
 
 
 
 
43333ad
0d77564
43333ad
 
 
 
8c7976b
0d77564
 
c2e6d7e
0d77564
 
 
 
 
 
 
 
 
 
c2e6d7e
0d77564
 
 
 
 
 
 
 
 
 
 
 
 
 
43333ad
 
 
 
0d77564
 
 
 
43333ad
0d77564
 
43333ad
 
 
 
 
 
 
 
 
 
 
 
 
 
0d77564
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c7976b
0d77564
 
 
 
 
c2e6d7e
0d77564
 
8c7976b
0d77564
 
 
 
 
 
 
8c7976b
0d77564
 
 
 
 
 
 
8c7976b
0d77564
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52fc803
0d77564
 
 
 
 
 
 
 
 
 
 
bc34cae
9a50492
52fc803
bc34cae
 
9a50492
0d77564
 
 
 
 
 
43333ad
 
 
 
8c7976b
 
0d77564
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
"""
Text-to-speech functionality handler for AI-Inferoxy AI Hub.
Handles text-to-speech generation with multiple providers.
"""

import os
import gradio as gr
import time
import threading
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
from huggingface_hub import InferenceClient
from huggingface_hub.errors import HfHubHTTPError
from requests.exceptions import ConnectionError, Timeout, RequestException
from hf_token_utils import get_proxy_token, report_token_status
from utils import (
    IMAGE_CONFIG, 
    validate_proxy_key, 
    format_error_message, 
    format_success_message,
    TTS_MODEL_CONFIGS,
)

# Timeout configuration for TTS generation
TTS_GENERATION_TIMEOUT = 300  # 5 minutes max for TTS generation


def generate_text_to_speech(
    text: str,
    model_name: str,
    provider: str,
    voice: str = "af_bella",
    speed: float = 1.0,
    audio_url: str = "",
    exaggeration: float = 0.25,
    temperature: float = 0.7,
    cfg: float = 0.5,
    client_name: str | None = None,
):
    """
    Generate speech from text using the specified model and provider through AI-Inferoxy.

    Args:
        text: Text to convert to speech.
        model_name: Model identifier; also keys into TTS_MODEL_CONFIGS for
            which extra_body parameters the model accepts.
        provider: Inference provider name passed to InferenceClient.
        voice: Voice id, sent only if the model config lists "voice".
        speed: Speech speed, sent only if the model config lists "speed".
        audio_url: Reference audio URL, sent only if listed in the model config.
        exaggeration: Expressiveness knob, sent only if listed in the model config.
        temperature: Sampling temperature, sent only if listed in the model config.
        cfg: CFG weight, sent only if listed in the model config.
        client_name: Optional caller identifier forwarded to the proxy when
            reporting token usage.

    Returns:
        Tuple of (audio, status_message). ``audio`` is ``None`` on any failure,
        with ``status_message`` describing the error.
    """
    # Validate proxy API key
    is_valid, error_msg = validate_proxy_key()
    if not is_valid:
        return None, error_msg
    
    proxy_api_key = os.getenv("PROXY_KEY")
    
    token_id = None
    try:
        # Get token from AI-Inferoxy proxy server with timeout handling
        print(f"πŸ”‘ TTS: Requesting token from proxy...")
        token, token_id = get_proxy_token(api_key=proxy_api_key)
        print(f"βœ… TTS: Got token: {token_id}")
        
        print(f"🎀 TTS: Using model='{model_name}', provider='{provider}', voice='{voice}'")
        
        # Create client with specified provider
        client = InferenceClient(
            provider=provider,
            api_key=token
        )
        
        print(f"πŸš€ TTS: Client created, preparing generation params...")
        
        # Get model configuration
        model_config = TTS_MODEL_CONFIGS.get(model_name, {})
        extra_body_params = model_config.get("extra_body_params", [])
        
        # Prepare generation parameters
        generation_params = {
            "text": text,
            "model": model_name,
            "extra_body": {}
        }
        
        # Only forward the knobs this model actually supports; sending
        # unsupported extra_body keys can cause provider-side errors.
        optional_params = {
            "voice": voice,
            "speed": speed,
            "audio_url": audio_url,
            "exaggeration": exaggeration,
            "temperature": temperature,
            "cfg": cfg,
        }
        for param_name, param_value in optional_params.items():
            if param_name in extra_body_params:
                generation_params["extra_body"][param_name] = param_value
        
        print(f"πŸ“‘ TTS: Making generation request with {TTS_GENERATION_TIMEOUT}s timeout...")
        
        # Create generation function for timeout handling
        def generate_audio_task():
            return client.text_to_speech(**generation_params)
        
        # Execute with timeout. NOTE: the executor is shut down manually with
        # wait=False because a `with` block would call shutdown(wait=True) and
        # block until the stuck task finished, defeating the timeout entirely.
        executor = ThreadPoolExecutor(max_workers=1)
        try:
            future = executor.submit(generate_audio_task)
            try:
                # Generate audio with timeout
                audio = future.result(timeout=TTS_GENERATION_TIMEOUT)
            except FutureTimeoutError:
                # A running future cannot be cancelled; cancel_futures only
                # drops pending work. The worker thread is abandoned so this
                # request can return promptly.
                raise TimeoutError(f"TTS generation timed out after {TTS_GENERATION_TIMEOUT} seconds")
        finally:
            executor.shutdown(wait=False, cancel_futures=True)
        
        print(f"🎡 TTS: Generation completed! Audio type: {type(audio)}")
        
        # Report successful token usage
        if token_id:
            report_token_status(token_id, "success", api_key=proxy_api_key, client_name=client_name)
        
        return audio, format_success_message("Speech generated", f"using {model_name} on {provider} with voice {voice}")
        
    except ConnectionError as e:
        # Handle proxy connection errors
        error_msg = f"Cannot connect to AI-Inferoxy server: {str(e)}"
        print(f"πŸ”Œ TTS connection error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)
        return None, format_error_message("Connection Error", "Unable to connect to the proxy server. Please check if it's running.")
        
    except TimeoutError as e:
        # Handle timeout errors
        error_msg = f"TTS generation timed out: {str(e)}"
        print(f"⏰ TTS timeout: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)
        return None, format_error_message("Timeout Error", f"TTS generation took too long (>{TTS_GENERATION_TIMEOUT//60} minutes). Try shorter text.")
        
    except HfHubHTTPError as e:
        # Handle HuggingFace API errors
        error_msg = str(e)
        print(f"πŸ€— TTS HF error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)
        
        # Provide more user-friendly error messages
        if "401" in error_msg:
            return None, format_error_message("Authentication Error", "Invalid or expired API token. The proxy will provide a new token on retry.")
        elif "402" in error_msg:
            return None, format_error_message("Quota Exceeded", "API quota exceeded. The proxy will try alternative providers.")
        elif "429" in error_msg:
            return None, format_error_message("Rate Limited", "Too many requests. Please wait a moment and try again.")
        else:
            return None, format_error_message("HuggingFace API Error", error_msg)
        
    except Exception as e:
        # Handle all other errors
        error_msg = str(e)
        print(f"❌ TTS unexpected error: {error_msg}")
        if token_id:
            # Pass client_name here too, consistent with the other handlers
            # (the original omitted it only in this branch).
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)
        return None, format_error_message("Unexpected Error", f"An unexpected error occurred: {error_msg}")


def handle_text_to_speech_generation(text_val, model_val, provider_val, voice_val, speed_val, audio_url_val, exaggeration_val, temperature_val, cfg_val, hf_token: gr.OAuthToken = None, hf_profile: gr.OAuthProfile = None):
    """
    Validate a TTS request from the UI and dispatch it to generate_text_to_speech.

    Returns a (audio, message) tuple; audio is None whenever validation fails.
    """
    cleaned_text = text_val.strip() if text_val else ""

    # Reject empty or whitespace-only input before touching the proxy.
    if not cleaned_text:
        return None, format_error_message("Validation Error", "Please enter some text to convert to speech")

    # Cap input length so generation stays within the timeout budget.
    if len(text_val) > 5000:
        return None, format_error_message("Validation Error", "Text is too long. Please keep it under 5000 characters.")

    # Sign-in is mandatory: read token/username off the OAuth objects when present.
    access_token = None if hf_token is None else getattr(hf_token, "token", None)
    username = None if hf_profile is None else getattr(hf_profile, "username", None)
    if not access_token:
        return None, format_error_message("Access Required", "Please sign in with Hugging Face (sidebar Login button).")

    # All checks passed — hand off to the generator.
    return generate_text_to_speech(
        text=cleaned_text,
        model_name=model_val,
        provider=provider_val,
        voice=voice_val,
        speed=speed_val,
        audio_url=audio_url_val,
        exaggeration=exaggeration_val,
        temperature=temperature_val,
        cfg=cfg_val,
        client_name=username
    )