import os
from threading import Thread

import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
# Available models for selection
AVAILABLE_MODELS = [
"Qwen/Qwen2.5-0.5B",
"Qwen/Qwen2.5-1.5B",
"Qwen/Qwen2.5-7B",
"Qwen/Qwen2.5-14B",
"meta-llama/Llama-2-7b-chat-hf",
"microsoft/phi-2",
"bigscience/bloom-560m"
]
# Default model
DEFAULT_MODEL = "Qwen/Qwen2.5-0.5B"
# Check if we're running in a Space or locally
# Hugging Face Spaces set this environment variable
IS_SPACE = os.getenv("SPACE_ID") is not None
# Global variables for model and tokenizer
model = None
tokenizer = None
def load_model(model_name):
    """Load the requested model and tokenizer, replacing any previously loaded ones."""
    global model, tokenizer, MODEL_NAME
    if not IS_SPACE:
        raise ValueError("Model loading is only supported in Hugging Face Spaces.")
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Load with the checkpoint's native dtype and move to CUDA; on ZeroGPU Spaces
    # the `spaces` package manages when the GPU is actually attached.
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
    model.to("cuda")
    model.eval()
    # Keep MODEL_NAME in sync so respond() only reloads when the selection changes
    MODEL_NAME = model_name
    return f"Model {model_name} loaded successfully!"
# MODEL_NAME tracks the currently loaded model; load_model() keeps it in sync
MODEL_NAME = DEFAULT_MODEL
load_model(MODEL_NAME)
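# respond() is a generator: it builds a chat-formatted prompt from the history,
# runs generation, and yields the partial reply as it grows so Gradio can stream it.
# @spaces.GPU requests ZeroGPU hardware for each call, here for up to 120 seconds.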
@spaces.GPU(duration=120)
def respond(
message,
history: list[tuple[str, str]],
model_name,
system_message,
max_tokens,
temperature,
top_p,
repetition_penalty,
top_k,
):
global model, tokenizer
# If model name changed, load the new model
if model_name != MODEL_NAME:
load_model(model_name)
# Prepare the conversation in ChatML format
messages = []
# Add system message if provided
if system_message:
messages.append({"role": "system", "content": system_message})
# Add conversation history
for user_msg, assistant_msg in history:
if user_msg: # Add user message
messages.append({"role": "user", "content": user_msg})
if assistant_msg: # Add assistant message
messages.append({"role": "assistant", "content": assistant_msg})
# Add the current message
messages.append({"role": "user", "content": message})
# Apply the chat template
try:
# Use apply_chat_template which handles different model formats
chat_text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
    except (AttributeError, NotImplementedError, ValueError):
# Fallback for models without chat template
chat_text = f"{system_message}\n\n"
for msg in messages:
if msg["role"] == "system":
continue # Already added at the beginning
elif msg["role"] == "user":
chat_text += f"User: {msg['content']}\n"
elif msg["role"] == "assistant":
chat_text += f"Assistant: {msg['content']}\n\n"
chat_text += "Assistant:"
    # Tokenize the prompt and move it (input_ids and attention_mask) to the model's device
    inputs = tokenizer(chat_text, return_tensors="pt").to(model.device)
# Set up generation parameters
gen_kwargs = {
"max_new_tokens": int(max_tokens),
"temperature": float(temperature),
"top_p": float(top_p),
"top_k": int(top_k),
"repetition_penalty": float(repetition_penalty),
"do_sample": True,
"pad_token_id": tokenizer.eos_token_id
}
    # Stream the response with TextIteratorStreamer: generation runs in a background
    # thread while the streamer yields decoded text chunks as they are produced
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs["streamer"] = streamer
    thread = Thread(target=model.generate, kwargs={**inputs, **gen_kwargs})
    thread.start()

    # Accumulate the partial response and yield it after each new chunk
    response = ""
    for new_text in streamer:
        response += new_text
        yield response.strip()
    thread.join()
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
with gr.Blocks() as demo:
with gr.Row():
gr.Markdown("# 🤖 Multi-Model Chat with Zero GPU")
with gr.Row():
with gr.Column(scale=4):
chatbot = gr.Chatbot(height=600)
msg = gr.Textbox(
placeholder="Ask me anything...",
container=False,
scale=7,
)
submit = gr.Button("Submit", variant="primary")
clear = gr.Button("Clear")
with gr.Column(scale=1):
gr.Markdown("## Model Settings")
model_dropdown = gr.Dropdown(
choices=AVAILABLE_MODELS,
value=DEFAULT_MODEL,
label="Select Model",
info="Choose a model for chat"
)
            load_button = gr.Button("Load Model")
            load_status = gr.Textbox(label="Model Loading Status", interactive=False)
system_message = gr.Textbox(
value="You are a friendly and helpful AI assistant.",
label="System Message",
info="Instructions for the AI"
)
gr.Markdown("## Sampling Parameters")
max_tokens = gr.Slider(
minimum=1, maximum=4096, value=512, step=1,
label="Max New Tokens",
info="Maximum number of tokens to generate"
)
temperature = gr.Slider(
minimum=0.1, maximum=2.0, value=0.7, step=0.1,
label="Temperature",
info="Higher = more creative, Lower = more focused"
)
top_p = gr.Slider(
minimum=0.1, maximum=1.0, value=0.95, step=0.05,
label="Top-p (nucleus sampling)",
info="Cumulative probability cutoff for token selection"
)
repetition_penalty = gr.Slider(
minimum=1.0, maximum=2.0, value=1.1, step=0.05,
label="Repetition Penalty",
info="Penalty for repeating tokens, 1.0 = no penalty"
)
top_k = gr.Slider(
minimum=1, maximum=100, value=50, step=1,
label="Top-k",
info="Number of highest probability tokens to consider"
)
# Function to handle chat
chat_history = gr.State([])
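    # Two-step event pattern: `user` appends the submitted message to the history
    # right away, then `bot` calls respond() and streams the reply into the last turn.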
def user(user_message, history):
return "", history + [[user_message, None]]
def bot(history, model_name, system_msg, max_len, temp, top_p_val, rep_penalty, top_k_val):
user_message = history[-1][0]
history[-1][1] = ""
for response in respond(
user_message,
history[:-1],
model_name,
system_msg,
max_len,
temp,
top_p_val,
rep_penalty,
top_k_val
):
history[-1][1] = response
yield history
def clear_chat():
return [], []
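    # Wire the events: pressing Enter or clicking Submit first runs `user`
    # (unqueued, so the message appears immediately), then `bot` streams the reply.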
msg.submit(
user,
[msg, chat_history],
[msg, chat_history],
queue=False
).then(
bot,
[chat_history, model_dropdown, system_message, max_tokens, temperature, top_p, repetition_penalty, top_k],
chatbot
)
submit.click(
user,
[msg, chat_history],
[msg, chat_history],
queue=False
).then(
bot,
[chat_history, model_dropdown, system_message, max_tokens, temperature, top_p, repetition_penalty, top_k],
chatbot
)
clear.click(clear_chat, None, [chatbot, chat_history])
    load_button.click(
        load_model,
        inputs=[model_dropdown],
        outputs=[load_status]
    )
if __name__ == "__main__":
demo.launch()