import os
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Available models for selection
AVAILABLE_MODELS = [
    "Qwen/Qwen2.5-0.5B",
    "Qwen/Qwen2.5-1.5B",
    "Qwen/Qwen2.5-7B",
    "Qwen/Qwen2.5-14B",
    "meta-llama/Llama-2-7b-chat-hf",
    "microsoft/phi-2",
    "bigscience/bloom-560m",
]

# Default model
DEFAULT_MODEL = "Qwen/Qwen2.5-0.5B"

# Check if we're running in a Space or locally.
# Hugging Face Spaces set this environment variable.
IS_SPACE = os.getenv("SPACE_ID") is not None

# Globals for the currently loaded model and tokenizer
model = None
tokenizer = None
current_model_name = None


def load_model(model_name):
    """Load the selected model and tokenizer into the module-level globals."""
    global model, tokenizer, current_model_name
    if not IS_SPACE:
        raise ValueError("Model loading is only supported in Hugging Face Spaces.")
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # device_map="auto" lets accelerate place the weights, so no explicit .to("cuda") is needed
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto").eval()
    current_model_name = model_name
    return f"Model {model_name} loaded successfully!"


# Model configuration: load the default model at startup
MODEL_NAME = DEFAULT_MODEL
load_model(MODEL_NAME)


@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    model_name,
    system_message,
    max_tokens,
    temperature,
    top_p,
    repetition_penalty,
    top_k,
):
    global model, tokenizer

    # If the selected model changed, load the new one
    if model_name != current_model_name:
        load_model(model_name)

    # Prepare the conversation in chat-message (ChatML-style) format
    messages = []

    # Add system message if provided
    if system_message:
        messages.append({"role": "system", "content": system_message})

    # Add conversation history
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # Add the current message
    messages.append({"role": "user", "content": message})

    # Apply the chat template
    try:
        # apply_chat_template handles the prompt format expected by each model
        chat_text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
    except (AttributeError, NotImplementedError, ValueError):
        # Fallback for models without a chat template
        chat_text = f"{system_message}\n\n" if system_message else ""
        for msg in messages:
            if msg["role"] == "system":
                continue  # Already added at the beginning
            elif msg["role"] == "user":
                chat_text += f"User: {msg['content']}\n"
            elif msg["role"] == "assistant":
                chat_text += f"Assistant: {msg['content']}\n\n"
        chat_text += "Assistant:"

    # Tokenize the input
    inputs = tokenizer(chat_text, return_tensors="pt")
    input_ids = inputs["input_ids"].to(model.device)

    # Set up generation parameters
    gen_kwargs = {
        "max_new_tokens": int(max_tokens),
        "temperature": float(temperature),
        "top_p": float(top_p),
        "top_k": int(top_k),
        "repetition_penalty": float(repetition_penalty),
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
    }

    # Stream the response token by token: generation runs in a background
    # thread while TextIteratorStreamer yields decoded text as it is produced.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    thread = Thread(
        target=model.generate,
        kwargs={"input_ids": input_ids, "streamer": streamer, **gen_kwargs},
    )
    thread.start()

    response = ""
    for new_text in streamer:
        response += new_text
        # Yield the response so far
        yield response.strip()
    thread.join()


"""
For information on how to customize the ChatInterface, peruse the gradio docs:
https://www.gradio.app/docs/chatinterface
"""
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("# 🤖 Multi-Model Chat with ZeroGPU")
    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(height=600)
            msg = gr.Textbox(
                placeholder="Ask me anything...",
                container=False,
                scale=7,
            )
            submit = gr.Button("Submit", variant="primary")
            clear = gr.Button("Clear")

        with gr.Column(scale=1):
            gr.Markdown("## Model Settings")
            model_dropdown = gr.Dropdown(
                choices=AVAILABLE_MODELS,
                value=DEFAULT_MODEL,
                label="Select Model",
                info="Choose a model for chat",
            )
            load_button = gr.Button("Load Model")
            load_status = gr.Textbox(label="Model Loading Status")
            system_message = gr.Textbox(
                value="You are a friendly and helpful AI assistant.",
                label="System Message",
                info="Instructions for the AI",
            )

            gr.Markdown("## Sampling Parameters")
            max_tokens = gr.Slider(
                minimum=1,
                maximum=4096,
                value=512,
                step=1,
                label="Max New Tokens",
                info="Maximum number of tokens to generate",
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher = more creative, Lower = more focused",
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.95,
                step=0.05,
                label="Top-p (nucleus sampling)",
                info="Cumulative probability cutoff for token selection",
            )
            repetition_penalty = gr.Slider(
                minimum=1.0,
                maximum=2.0,
                value=1.1,
                step=0.05,
                label="Repetition Penalty",
                info="Penalty for repeating tokens, 1.0 = no penalty",
            )
            top_k = gr.Slider(
                minimum=1,
                maximum=100,
                value=50,
                step=1,
                label="Top-k",
                info="Number of highest probability tokens to consider",
            )

    # State and callbacks to handle the chat
    chat_history = gr.State([])

    def user(user_message, history):
        # Append the user's message to the history and clear the textbox
        return "", history + [[user_message, None]]

    def bot(history, model_name, system_msg, max_len, temp, top_p_val, rep_penalty, top_k_val):
        user_message = history[-1][0]
        history[-1][1] = ""
        for response in respond(
            user_message,
            history[:-1],
            model_name,
            system_msg,
            max_len,
            temp,
            top_p_val,
            rep_penalty,
            top_k_val,
        ):
            history[-1][1] = response
            yield history

    def clear_chat():
        return [], []

    msg.submit(
        user,
        [msg, chat_history],
        [msg, chat_history],
        queue=False,
    ).then(
        bot,
        [chat_history, model_dropdown, system_message, max_tokens, temperature, top_p, repetition_penalty, top_k],
        chatbot,
    )

    submit.click(
        user,
        [msg, chat_history],
        [msg, chat_history],
        queue=False,
    ).then(
        bot,
        [chat_history, model_dropdown, system_message, max_tokens, temperature, top_p, repetition_penalty, top_k],
        chatbot,
    )

    clear.click(clear_chat, None, [chatbot, chat_history])

    load_button.click(
        load_model,
        inputs=[model_dropdown],
        outputs=[load_status],
    )

if __name__ == "__main__":
    demo.launch()