import os
import gradio as gr
from llama_cpp import Llama
# HF token from Space secrets
HF_TOKEN = os.environ.get("scalable")
# Your GGUF repo on Hugging Face
MODEL_ID = "FatimaZh/llama-3.2-1b-merged-code-gguf-v2"
# Replace this with the EXACT gguf filename in that repo
GGUF_FILENAME = "merged_fp16_code_v2.gguf" # <-- CHANGE ME
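# If you're not sure of the exact filename, one option (sketch, assuming the
# huggingface_hub package installed alongside this stack) is to list the repo's GGUF files:
#   from huggingface_hub import list_repo_files
#   print([f for f in list_repo_files(MODEL_ID, token=HF_TOKEN) if f.endswith(".gguf")])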
# Load the GGUF model via llama-cpp
llm = Llama.from_pretrained(
    repo_id=MODEL_ID,
    filename=GGUF_FILENAME,
    hf_token=HF_TOKEN,
    n_ctx=4096,       # context length
    n_gpu_layers=-1,  # -1 = all layers on GPU if available, 0 = CPU only
)
def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # 1. Safety: history can be None
    if history is None:
        history = []

    # 2. Truncate history so it doesn't grow forever.
    #    history is a flat list of {"role", "content"} messages, so:
    #      0 = stateless (no previous turns)
    #      2 = keep only the last 2 messages (one user/assistant exchange), etc.
    MAX_HISTORY_TURNS = 0  # for eval, I'd keep this at 0 or very small (e.g. 2)
    if MAX_HISTORY_TURNS <= 0:
        trimmed_history = []
    else:
        trimmed_history = history[-MAX_HISTORY_TURNS:]

    # 3. Build chat messages for llama-cpp
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    # Use ONLY trimmed history (or none)
    messages.extend(trimmed_history)
    messages.append({"role": "user", "content": message})
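    # With MAX_HISTORY_TURNS = 0 (and a non-empty system message), messages ends up as just
    #   [{"role": "system", "content": system_message}, {"role": "user", "content": message}]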
    # 4. Generate with llama-cpp
    out = llm.create_chat_completion(
        messages=messages,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
    )
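    # Note: create_chat_completion(..., stream=True) would yield incremental chunks instead
    # of a single dict; this handler keeps things simple and returns the full reply at once.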
    return out["choices"][0]["message"]["content"]
# UI
with gr.Blocks(theme=gr.themes.Soft(), title="Khadija Chatbot") as demo:
    gr.Markdown(
        """
        # Khadija's Fine-tuned LLM (GGUF)
        Talk with your fine-tuned Llama model.
        Use the controls on the right to tweak creativity and response length.
        """
    )
    with gr.Row():
        # Chat area
        with gr.Column(scale=3):
            gr.ChatInterface(
                fn=respond,
                type="messages",  # history is a list of {"role", "content"} dicts
                additional_inputs=[
                    gr.Textbox(
                        value="You are a helpful and friendly assistant.",
                        label="System message",
                        lines=2,
                    ),
                    gr.Slider(
                        minimum=1,
                        maximum=1024,
                        value=256,
                        step=1,
                        label="Max new tokens",
                    ),
                    gr.Slider(
                        minimum=0.1,
                        maximum=4.0,
                        value=0.8,
                        step=0.1,
                        label="Temperature",
                    ),
                    gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.9,
                        step=0.05,
                        label="Top-p (nucleus sampling)",
                    ),
                ],
            )
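            # The additional_inputs above are passed to respond() after (message, history),
            # in this order: system_message, max_tokens, temperature, top_p.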
        # Side info
        with gr.Column(scale=1):
            gr.Markdown(
                f"""
                ### Model info
                - **Model repo:** `{MODEL_ID}`
                - **File:** `{GGUF_FILENAME}`
                - **Backend:** `llama-cpp-python`

                ### Tips
                - Lower **Max new tokens** for faster answers.
                - Lower **Temperature** (≈0.3-0.7) for focused replies.
                - Higher **Temperature** (>1.0) for more creative replies.
                """
            )
if __name__ == "__main__":
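    # Note: on Hugging Face Spaces the app already gets a public URL, so share=True is
    # usually redundant there; it mainly matters when running this script locally.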
    demo.launch(share=True, show_error=True)