"""
Chat functionality handler for AI-Inferoxy AI Hub.
Handles chat completion requests with streaming responses.
"""
import os
import gradio as gr
import time
import threading
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
from huggingface_hub import InferenceClient
from huggingface_hub.errors import HfHubHTTPError
from requests.exceptions import ConnectionError, Timeout, RequestException
from hf_token_utils import get_proxy_token, report_token_status
from utils import (
    validate_proxy_key,
    format_error_message,
    render_with_reasoning_toggle,
)
# Timeout configuration for inference requests
INFERENCE_TIMEOUT = 120 # 2 minutes max for inference
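# Note: chat_respond() also enforces a 30-second per-chunk stall timeout while
# iterating the stream; INFERENCE_TIMEOUT only bounds how long we wait for the
# streaming call itself to return.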


def chat_respond(
    message,
    history: list[dict[str, str]],
    system_message,
    model_name,
    provider_override,
    max_tokens,
    temperature,
    top_p,
    client_name: str | None = None,
):
"""
Chat completion function using AI-Inferoxy token management.
"""
    # Validate proxy API key
    is_valid, error_msg = validate_proxy_key()
    if not is_valid:
        yield error_msg
        return

    proxy_api_key = os.getenv("PROXY_KEY")
    token_id = None

    try:
        # Get token from AI-Inferoxy proxy server with timeout handling
        print("Chat: Requesting token from proxy...")
        token, token_id = get_proxy_token(api_key=proxy_api_key)
        print(f"Chat: Got token: {token_id}")

        # Enforce explicit provider selection via dropdown
        model = model_name
        provider = provider_override or "auto"
        print(f"Chat: Using model='{model}', provider='{provider}'")

        # Prepare messages first
        messages = [{"role": "system", "content": system_message}]
        messages.extend(history)
        messages.append({"role": "user", "content": message})
        print(f"Chat: Prepared {len(messages)} messages, creating client...")

        # Create client with provider (auto if none specified) and always pass model
        client = InferenceClient(
            provider=provider if provider else "auto",
            api_key=token,
        )
        print("Chat: Client created, starting inference with timeout...")

        chat_completion_kwargs = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "stream": True,
            "temperature": temperature,
            "top_p": top_p,
        }

        response = ""
        print(f"Chat: Making streaming request with {INFERENCE_TIMEOUT}s timeout...")

        # Create streaming function for timeout handling
        def create_stream():
            return client.chat_completion(**chat_completion_kwargs)

        # Execute with timeout using ThreadPoolExecutor
        with ThreadPoolExecutor(max_workers=1) as executor:
            future = executor.submit(create_stream)
            try:
                # Get the stream with timeout
                stream = future.result(timeout=INFERENCE_TIMEOUT)
                print("Chat: Got stream, starting to iterate...")

                # Track streaming time to detect hangs
                last_token_time = time.time()
                token_timeout = 30  # 30 seconds between tokens

                for chunk in stream:
                    current_time = time.time()
                    # Check if we've been waiting too long for a token
                    if current_time - last_token_time > token_timeout:
                        raise TimeoutError(f"No response received for {token_timeout} seconds during streaming")

                    choices = chunk.choices
                    token_content = ""
                    if len(choices) and choices[0].delta.content:
                        token_content = choices[0].delta.content
                        last_token_time = current_time  # Reset timer when we get content

                    response += token_content
                    yield response
            except FutureTimeoutError:
                future.cancel()  # Attempt to cancel; has no effect once the call is already running
                raise TimeoutError(f"Chat request timed out after {INFERENCE_TIMEOUT} seconds")

        # Report successful token usage
        if token_id:
            report_token_status(token_id, "success", api_key=proxy_api_key, client_name=client_name)

    except ConnectionError as e:
        # Handle proxy connection errors
        error_msg = f"Cannot connect to AI-Inferoxy server: {str(e)}"
        print(f"Chat connection error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)
        yield format_error_message("Connection Error", "Unable to connect to the proxy server. Please check if it's running.")
    except TimeoutError as e:
        # Handle timeout errors
        error_msg = f"Request timed out: {str(e)}"
        print(f"Chat timeout: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)
        yield format_error_message("Timeout Error", "The request took too long. The server may be overloaded. Please try again.")
    except HfHubHTTPError as e:
        # Handle HuggingFace API errors
        error_msg = str(e)
        print(f"Chat HF error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)
        # Provide more user-friendly error messages
        if "401" in error_msg:
            yield format_error_message("Authentication Error", "Invalid or expired API token. The proxy will provide a new token on retry.")
        elif "402" in error_msg:
            yield format_error_message("Quota Exceeded", "API quota exceeded. The proxy will try alternative providers.")
        elif "429" in error_msg:
            yield format_error_message("Rate Limited", "Too many requests. Please wait a moment and try again.")
        else:
            yield format_error_message("HuggingFace API Error", error_msg)
    except Exception as e:
        # Handle all other errors
        error_msg = str(e)
        print(f"Chat unexpected error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key, client_name=client_name)
        yield format_error_message("Unexpected Error", f"An unexpected error occurred: {error_msg}")


def handle_chat_submit(message, history, system_msg, model_name, provider, max_tokens, temperature, top_p, hf_token: gr.OAuthToken = None, hf_profile: gr.OAuthProfile = None):
    """
    Handle chat submission and manage conversation history with streaming.
    """
    if not message.strip():
        yield history, ""
        return

    # Require sign-in: if no token present, prompt login
    access_token = getattr(hf_token, "token", None) if hf_token is not None else None
    username = getattr(hf_profile, "username", None) if hf_profile is not None else None
    if not access_token:
        assistant_response = format_error_message("Access Required", "Please sign in with Hugging Face (sidebar Login button).")
        current_history = history + [{"role": "assistant", "content": assistant_response}]
        yield current_history, ""
        return

    # Add user message to history
    history = history + [{"role": "user", "content": message}]

    # Generate response with streaming
    response_generator = chat_respond(
        message,
        history[:-1],  # Don't include the current message in history for the function
        system_msg,
        model_name,
        provider,
        max_tokens,
        temperature,
        top_p,
        client_name=username,
    )

    # Stream the assistant response token by token
    assistant_response = ""
    for partial_response in response_generator:
        assistant_response = render_with_reasoning_toggle(partial_response, True)
        # Update history with the current partial response and yield it
        current_history = history + [{"role": "assistant", "content": assistant_response}]
        yield current_history, ""


def handle_chat_retry(history, system_msg, model_name, provider, max_tokens, temperature, top_p, hf_token: gr.OAuthToken = None, hf_profile: gr.OAuthProfile = None, retry_data=None):
    """
    Retry the assistant response for the selected message.
    Works with gr.Chatbot.retry() which provides retry_data.index for the message.
    """
    # Require sign-in: if no token present, prompt login
    access_token = getattr(hf_token, "token", None) if hf_token is not None else None
    username = getattr(hf_profile, "username", None) if hf_profile is not None else None
    if not access_token:
        assistant_response = format_error_message("Access Required", "Please sign in with Hugging Face (sidebar Login button).")
        current_history = (history or []) + [{"role": "assistant", "content": assistant_response}]
        yield current_history
        return

    # Guard: empty history
    if not history:
        yield history
        return

    # Determine which assistant message index to retry
    retry_index = None
    try:
        retry_index = getattr(retry_data, "index", None)
    except Exception:
        retry_index = None
    if retry_index is None:
        # Fallback to last assistant message
        retry_index = len(history) - 1

    # Trim history up to the message being retried (exclude that assistant msg)
    trimmed_history = list(history[:retry_index])

    # Find the most recent user message before retry_index
    last_user_idx = None
    for idx in range(retry_index - 1, -1, -1):
        if trimmed_history[idx].get("role") == "user":
            last_user_idx = idx
            break

    # Nothing to retry if no prior user message
    if last_user_idx is None:
        yield history
        return

    # Message to retry and prior conversation context (before that user msg)
    message = trimmed_history[last_user_idx].get("content", "")
    prior_history = trimmed_history[:last_user_idx]
    if not message.strip():
        yield history
        return

    # Stream a new assistant response
    response_generator = chat_respond(
        message,
        prior_history,
        system_msg,
        model_name,
        provider,
        max_tokens,
        temperature,
        top_p,
        client_name=username,
    )

    assistant_response = ""
    for partial_response in response_generator:
        assistant_response = render_with_reasoning_toggle(partial_response, True)
        current_history = trimmed_history + [{"role": "assistant", "content": assistant_response}]
        yield current_history