import os
from threading import Thread

import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
# Available models for selection
AVAILABLE_MODELS = [
"Qwen/Qwen2.5-0.5B",
"Qwen/Qwen2.5-1.5B",
"Qwen/Qwen2.5-7B",
"Qwen/Qwen2.5-14B",
"meta-llama/Llama-2-7b-chat-hf",
"microsoft/phi-2",
"bigscience/bloom-560m"
]
# Default model
DEFAULT_MODEL = "Qwen/Qwen2.5-0.5B"
# Check if we're running in a Space or locally
# Hugging Face Spaces set this environment variable
IS_SPACE = os.getenv("SPACE_ID") is not None
# Global variables for model and tokenizer
model = None
tokenizer = None
def load_model(model_name):
    """Load the requested model and tokenizer, replacing any previously loaded ones."""
    global model, tokenizer, MODEL_NAME
    if not IS_SPACE:
        raise ValueError("Model loading is only supported in Hugging Face Spaces.")
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Load with the checkpoint's native dtype and move to CUDA; on ZeroGPU Spaces
    # the `spaces` package manages when the GPU is actually attached.
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
    model.to("cuda")
    model.eval()
    # Keep MODEL_NAME in sync so respond() only reloads when the selection changes
    MODEL_NAME = model_name
    return f"Model {model_name} loaded successfully!"
# MODEL_NAME tracks the currently loaded model; load_model() keeps it in sync
MODEL_NAME = DEFAULT_MODEL
load_model(MODEL_NAME)
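# respond() is a generator: it builds a chat-formatted prompt from the history,
# runs generation, and yields the partial reply as it grows so Gradio can stream it.
# @spaces.GPU requests ZeroGPU hardware for each call, here for up to 120 seconds.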
@spaces.GPU(duration=120)
def respond(
message,
history: list[tuple[str, str]],
model_name,
system_message,
max_tokens,
temperature,
top_p,
repetition_penalty,
top_k,
):
global model, tokenizer
# If model name changed, load the new model
if model_name != MODEL_NAME:
load_model(model_name)
# Prepare the conversation in ChatML format
messages = []
# Add system message if provided
if system_message:
messages.append({"role": "system", "content": system_message})
# Add conversation history
for user_msg, assistant_msg in history:
if user_msg: # Add user message
messages.append({"role": "user", "content": user_msg})
if assistant_msg: # Add assistant message
messages.append({"role": "assistant", "content": assistant_msg})
# Add the current message
messages.append({"role": "user", "content": message})
# Apply the chat template
try:
# Use apply_chat_template which handles different model formats
chat_text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
    except (AttributeError, NotImplementedError, ValueError):
# Fallback for models without chat template
chat_text = f"{system_message}\n\n"
for msg in messages:
if msg["role"] == "system":
continue # Already added at the beginning
elif msg["role"] == "user":
chat_text += f"User: {msg['content']}\n"
elif msg["role"] == "assistant":
chat_text += f"Assistant: {msg['content']}\n\n"
chat_text += "Assistant:"
    # Tokenize the prompt and move it (input_ids and attention_mask) to the model's device
    inputs = tokenizer(chat_text, return_tensors="pt").to(model.device)
# Set up generation parameters
gen_kwargs = {
"max_new_tokens": int(max_tokens),
"temperature": float(temperature),
"top_p": float(top_p),
"top_k": int(top_k),
"repetition_penalty": float(repetition_penalty),
"do_sample": True,
"pad_token_id": tokenizer.eos_token_id
}
    # Stream the response with TextIteratorStreamer: generation runs in a background
    # thread while the streamer yields decoded text chunks as they are produced
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs["streamer"] = streamer
    thread = Thread(target=model.generate, kwargs={**inputs, **gen_kwargs})
    thread.start()

    # Accumulate the partial response and yield it after each new chunk
    response = ""
    for new_text in streamer:
        response += new_text
        yield response.strip()
    thread.join()
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
with gr.Blocks() as demo:
with gr.Row():
gr.Markdown("# 🤖 Multi-Model Chat with Zero GPU")
with gr.Row():
with gr.Column(scale=4):
chatbot = gr.Chatbot(height=600)
msg = gr.Textbox(
placeholder="Ask me anything...",
container=False,
scale=7,
)
submit = gr.Button("Submit", variant="primary")
clear = gr.Button("Clear")
with gr.Column(scale=1):
gr.Markdown("## Model Settings")
model_dropdown = gr.Dropdown(
choices=AVAILABLE_MODELS,
value=DEFAULT_MODEL,
label="Select Model",
info="Choose a model for chat"
)
            load_button = gr.Button("Load Model")
            load_status = gr.Textbox(label="Model Loading Status", interactive=False)
system_message = gr.Textbox(
value="You are a friendly and helpful AI assistant.",
label="System Message",
info="Instructions for the AI"
)
gr.Markdown("## Sampling Parameters")
max_tokens = gr.Slider(
minimum=1, maximum=4096, value=512, step=1,
label="Max New Tokens",
info="Maximum number of tokens to generate"
)
temperature = gr.Slider(
minimum=0.1, maximum=2.0, value=0.7, step=0.1,
label="Temperature",
info="Higher = more creative, Lower = more focused"
)
top_p = gr.Slider(
minimum=0.1, maximum=1.0, value=0.95, step=0.05,
label="Top-p (nucleus sampling)",
info="Cumulative probability cutoff for token selection"
)
repetition_penalty = gr.Slider(
minimum=1.0, maximum=2.0, value=1.1, step=0.05,
label="Repetition Penalty",
info="Penalty for repeating tokens, 1.0 = no penalty"
)
top_k = gr.Slider(
minimum=1, maximum=100, value=50, step=1,
label="Top-k",
info="Number of highest probability tokens to consider"
)
# Function to handle chat
chat_history = gr.State([])
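    # Two-step event pattern: `user` appends the submitted message to the history
    # right away, then `bot` calls respond() and streams the reply into the last turn.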
def user(user_message, history):
return "", history + [[user_message, None]]
def bot(history, model_name, system_msg, max_len, temp, top_p_val, rep_penalty, top_k_val):
user_message = history[-1][0]
history[-1][1] = ""
for response in respond(
user_message,
history[:-1],
model_name,
system_msg,
max_len,
temp,
top_p_val,
rep_penalty,
top_k_val
):
history[-1][1] = response
yield history
def clear_chat():
return [], []
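    # Wire the events: pressing Enter or clicking Submit first runs `user`
    # (unqueued, so the message appears immediately), then `bot` streams the reply.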
msg.submit(
user,
[msg, chat_history],
[msg, chat_history],
queue=False
).then(
bot,
[chat_history, model_dropdown, system_message, max_tokens, temperature, top_p, repetition_penalty, top_k],
chatbot
)
submit.click(
user,
[msg, chat_history],
[msg, chat_history],
queue=False
).then(
bot,
[chat_history, model_dropdown, system_message, max_tokens, temperature, top_p, repetition_penalty, top_k],
chatbot
)
clear.click(clear_chat, None, [chatbot, chat_history])
    load_button.click(
        load_model,
        inputs=[model_dropdown],
        outputs=[load_status]
    )
if __name__ == "__main__":
demo.launch()