import os

import gradio as gr
from llama_cpp import Llama

# Hugging Face access token, read from the environment (secret named "scalable")
HF_TOKEN = os.environ.get("scalable")

MODEL_ID = "FatimaZh/llama-3.2-1b-merged-code-gguf-v2"
GGUF_FILENAME = "merged_fp16_code_v2.gguf"

# Load the GGUF model via llama-cpp
llm = Llama.from_pretrained(
    repo_id=MODEL_ID,
    filename=GGUF_FILENAME,
    hf_token=HF_TOKEN,
    n_ctx=4096,       # context length
    n_gpu_layers=-1,  # -1 = offload all layers to the GPU if one is available
)


def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # 1. Safety: history can be None
    if history is None:
        history = []

    # 2. Truncate history so it doesn't grow forever.
    #    0 = stateless (no previous turns), 2 = last 2 exchanges, etc.
    #    Each exchange is two messages (user + assistant), hence the * 2.
    MAX_HISTORY_TURNS = 0
    if MAX_HISTORY_TURNS <= 0:
        trimmed_history = []
    else:
        trimmed_history = history[-MAX_HISTORY_TURNS * 2:]

    # 3. Build chat messages for llama-cpp
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})

    # Use ONLY the trimmed history (or none)
    messages.extend(trimmed_history)
    messages.append({"role": "user", "content": message})

    # 4. Generate with llama-cpp
    out = llm.create_chat_completion(
        messages=messages,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
    )
    return out["choices"][0]["message"]["content"]


# UI
with gr.Blocks(theme=gr.themes.Soft(), title="Khadija Chatbot") as demo:
    gr.Markdown(
        """
        # 🤖 Khadija's Fine-tuned LLM (GGUF)
        Talk with your fine-tuned Llama model.
        Use the controls on the right to tweak creativity and response length.
        """
    )

    with gr.Row():
        # Chat area
        with gr.Column(scale=3):
            gr.ChatInterface(
                fn=respond,
                type="messages",  # history is a list of {"role", "content"} dicts
                additional_inputs=[
                    gr.Textbox(
                        value="You are a helpful and friendly assistant.",
                        label="System message",
                        lines=2,
                    ),
                    gr.Slider(
                        minimum=1,
                        maximum=1024,
                        value=256,
                        step=1,
                        label="Max new tokens",
                    ),
                    gr.Slider(
                        minimum=0.1,
                        maximum=4.0,
                        value=0.8,
                        step=0.1,
                        label="Temperature",
                    ),
                    gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.9,
                        step=0.05,
                        label="Top-p (nucleus sampling)",
                    ),
                ],
            )

        # Side info
        with gr.Column(scale=1):
            gr.Markdown(
                f"""
                ### ℹ️ Model info
                - **Model repo:** `{MODEL_ID}`
                - **File:** `{GGUF_FILENAME}`
                - **Backend:** `llama-cpp-python`

                ### 💡 Tips
                - Lower **Max new tokens** for faster answers.
                - Lower **Temperature** (≈0.3–0.7) for focused replies.
                - Higher **Temperature** (>1.0) for more creative replies.
                """
            )


if __name__ == "__main__":
    demo.launch(share=True)
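

# ---------------------------------------------------------------------------
# Optional: a streaming variant of `respond` (a minimal sketch, not wired in).
# It assumes llama-cpp-python's OpenAI-style streaming chunks from
# create_chat_completion(stream=True), and relies on the fact that
# gr.ChatInterface accepts a generator function: yielding the growing string
# streams tokens to the UI instead of waiting for the full completion.
# The name `respond_stream` is illustrative, not part of the original app.
# To try it, move this definition above the UI block and pass
# fn=respond_stream to gr.ChatInterface (same additional_inputs).
def respond_stream(message, history, system_message, max_tokens, temperature, top_p):
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    messages.append({"role": "user", "content": message})

    partial = ""
    for chunk in llm.create_chat_completion(
        messages=messages,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        stream=True,  # yields delta chunks instead of one full completion
    ):
        delta = chunk["choices"][0]["delta"]
        if delta.get("content"):  # first chunk carries only {"role": ...}
            partial += delta["content"]
            yield partial  # Gradio replaces the displayed reply on each yield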