import os
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Available models for selection
AVAILABLE_MODELS = [
    "Qwen/Qwen2.5-0.5B",
    "Qwen/Qwen2.5-1.5B",
    "Qwen/Qwen2.5-7B",
    "Qwen/Qwen2.5-14B",
    "meta-llama/Llama-2-7b-chat-hf",
    "microsoft/phi-2",
    "bigscience/bloom-560m",
]

# Default model
DEFAULT_MODEL = "Qwen/Qwen2.5-0.5B"

# Check if we're running in a Space or locally.
# Hugging Face Spaces set this environment variable.
IS_SPACE = os.getenv("SPACE_ID") is not None

# Globals for the currently loaded model and tokenizer
model = None
tokenizer = None
current_model_name = None


def load_model(model_name):
    """Load the selected model and tokenizer into the module-level globals."""
    global model, tokenizer, current_model_name
    if not IS_SPACE:
        raise ValueError("Model loading is only supported in Hugging Face Spaces.")
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # device_map="auto" lets accelerate place the weights, so no explicit .to("cuda") is needed
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto").eval()
    current_model_name = model_name
    return f"Model {model_name} loaded successfully!"


# Model configuration: load the default model at startup
MODEL_NAME = DEFAULT_MODEL
load_model(MODEL_NAME)


@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    model_name,
    system_message,
    max_tokens,
    temperature,
    top_p,
    repetition_penalty,
    top_k,
):
    global model, tokenizer

    # If the selected model changed, load the new one
    if model_name != current_model_name:
        load_model(model_name)

    # Prepare the conversation in chat-message (ChatML-style) format
    messages = []

    # Add system message if provided
    if system_message:
        messages.append({"role": "system", "content": system_message})

    # Add conversation history
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # Add the current message
    messages.append({"role": "user", "content": message})

    # Apply the chat template
    try:
        # apply_chat_template handles the prompt format expected by each model
        chat_text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
    except (AttributeError, NotImplementedError, ValueError):
        # Fallback for models without a chat template
        chat_text = f"{system_message}\n\n" if system_message else ""
        for msg in messages:
            if msg["role"] == "system":
                continue  # Already added at the beginning
            elif msg["role"] == "user":
                chat_text += f"User: {msg['content']}\n"
            elif msg["role"] == "assistant":
                chat_text += f"Assistant: {msg['content']}\n\n"
        chat_text += "Assistant:"

    # Tokenize the input
    inputs = tokenizer(chat_text, return_tensors="pt")
    input_ids = inputs["input_ids"].to(model.device)

    # Set up generation parameters
    gen_kwargs = {
        "max_new_tokens": int(max_tokens),
        "temperature": float(temperature),
        "top_p": float(top_p),
        "top_k": int(top_k),
        "repetition_penalty": float(repetition_penalty),
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
    }

    # Stream the response token by token: generation runs in a background
    # thread while TextIteratorStreamer yields decoded text as it is produced.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    thread = Thread(
        target=model.generate,
        kwargs={"input_ids": input_ids, "streamer": streamer, **gen_kwargs},
    )
    thread.start()

    response = ""
    for new_text in streamer:
        response += new_text
        # Yield the response so far
        yield response.strip()
    thread.join()


"""
For information on how to customize the ChatInterface, peruse the gradio docs:
https://www.gradio.app/docs/chatinterface
"""
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("# 🤖 Multi-Model Chat with ZeroGPU")
    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(height=600)
            msg = gr.Textbox(
                placeholder="Ask me anything...",
                container=False,
                scale=7,
            )
            submit = gr.Button("Submit", variant="primary")
            clear = gr.Button("Clear")

        with gr.Column(scale=1):
            gr.Markdown("## Model Settings")
            model_dropdown = gr.Dropdown(
                choices=AVAILABLE_MODELS,
                value=DEFAULT_MODEL,
                label="Select Model",
                info="Choose a model for chat",
            )
            load_button = gr.Button("Load Model")
            load_status = gr.Textbox(label="Model Loading Status")
            system_message = gr.Textbox(
                value="You are a friendly and helpful AI assistant.",
                label="System Message",
                info="Instructions for the AI",
            )

            gr.Markdown("## Sampling Parameters")
            max_tokens = gr.Slider(
                minimum=1,
                maximum=4096,
                value=512,
                step=1,
                label="Max New Tokens",
                info="Maximum number of tokens to generate",
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher = more creative, Lower = more focused",
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.95,
                step=0.05,
                label="Top-p (nucleus sampling)",
                info="Cumulative probability cutoff for token selection",
            )
            repetition_penalty = gr.Slider(
                minimum=1.0,
                maximum=2.0,
                value=1.1,
                step=0.05,
                label="Repetition Penalty",
                info="Penalty for repeating tokens, 1.0 = no penalty",
            )
            top_k = gr.Slider(
                minimum=1,
                maximum=100,
                value=50,
                step=1,
                label="Top-k",
                info="Number of highest probability tokens to consider",
            )

    # State and callbacks to handle the chat
    chat_history = gr.State([])

    def user(user_message, history):
        # Append the user's message to the history and clear the textbox
        return "", history + [[user_message, None]]

    def bot(history, model_name, system_msg, max_len, temp, top_p_val, rep_penalty, top_k_val):
        user_message = history[-1][0]
        history[-1][1] = ""
        for response in respond(
            user_message,
            history[:-1],
            model_name,
            system_msg,
            max_len,
            temp,
            top_p_val,
            rep_penalty,
            top_k_val,
        ):
            history[-1][1] = response
            yield history

    def clear_chat():
        return [], []

    msg.submit(
        user,
        [msg, chat_history],
        [msg, chat_history],
        queue=False,
    ).then(
        bot,
        [chat_history, model_dropdown, system_message, max_tokens, temperature, top_p, repetition_penalty, top_k],
        chatbot,
    )

    submit.click(
        user,
        [msg, chat_history],
        [msg, chat_history],
        queue=False,
    ).then(
        bot,
        [chat_history, model_dropdown, system_message, max_tokens, temperature, top_p, repetition_penalty, top_k],
        chatbot,
    )

    clear.click(clear_chat, None, [chatbot, chat_history])

    load_button.click(
        load_model,
        inputs=[model_dropdown],
        outputs=[load_status],
    )

if __name__ == "__main__":
    demo.launch()