import os
import gradio as gr
from llama_cpp import Llama

# 🔑 HF token, read from the Space secret named "scalable"
HF_TOKEN = os.environ.get("scalable")

# 🧠 Your GGUF repo on Hugging Face
MODEL_ID = "FatimaZh/llama-3.2-1b-merged-code-gguf-v2"

# ❗ Replace this with the EXACT gguf filename in that repo
GGUF_FILENAME = "merged_fp16_code_v2.gguf"  # <-- CHANGE ME
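
# (Optional sketch, commented out) If you're unsure of the exact .gguf filename,
# you can list the repo's files with huggingface_hub and copy the right name into
# GGUF_FILENAME above. list_repo_files is a standard huggingface_hub helper; run
# this once, then leave it commented.
#
# from huggingface_hub import list_repo_files
# gguf_files = [f for f in list_repo_files(MODEL_ID, token=HF_TOKEN) if f.endswith(".gguf")]
# print(gguf_files)  # e.g. ['merged_fp16_code_v2.gguf']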

# 🧠 Load the GGUF model via llama-cpp
llm = Llama.from_pretrained(
    repo_id=MODEL_ID,
    filename=GGUF_FILENAME,
    hf_token=HF_TOKEN,
    n_ctx=4096,      # context length
    n_gpu_layers=-1, # -1 = all layers on GPU if available, 0 = CPU only
)
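# Note (assumption): if the Space has no GPU or llama-cpp-python was built CPU-only,
# n_gpu_layers is effectively ignored and all layers stay on the CPU.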

def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # 👉 1. Safety: history can be None
    if history is None:
        history = []

    # 👉 2. Truncate history so it doesn't grow forever.
    #    With type="messages", each exchange is two history entries
    #    (user + assistant), so keep the last MAX_HISTORY_TURNS * 2 messages.
    #    0 = stateless (no previous turns), 2 = last 2 exchanges, etc.
    MAX_HISTORY_TURNS = 0  # for eval, I'd keep this at 0 or very small (e.g. 2)

    if MAX_HISTORY_TURNS <= 0:
        trimmed_history = []
    else:
        trimmed_history = history[-(MAX_HISTORY_TURNS * 2):]

    # 👉 3. Build chat messages for llama-cpp
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})

    # Use ONLY trimmed history (or none)
    messages.extend(trimmed_history)
    messages.append({"role": "user", "content": message})

    # 👉 4. Generate with llama-cpp
    out = llm.create_chat_completion(
        messages=messages,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
    )

    return out["choices"][0]["message"]["content"]
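
# (Hedged sketch) Quick sanity check of respond() outside the Gradio UI, e.g. from
# a local Python shell; the prompt and sampling values are only illustrative.
#
# print(respond(
#     message="Write a Python function that reverses a string.",
#     history=[],
#     system_message="You are a helpful and friendly assistant.",
#     max_tokens=256,
#     temperature=0.8,
#     top_p=0.9,
# ))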

# 🎨 UI
with gr.Blocks(theme=gr.themes.Soft(), title="Khadija Chatbot") as demo:
    gr.Markdown(
        """
        # 🤖 Khadija's Fine-tuned LLM (GGUF)

        Talk with your fine-tuned Llama model.  
        Use the controls on the right to tweak creativity and response length.
        """
    )

    with gr.Row():
        # 🗨️ Chat area
        with gr.Column(scale=3):
            gr.ChatInterface(
                fn=respond,
                type="messages",  # history is list of {"role","content"}
                additional_inputs=[
                    gr.Textbox(
                        value="You are a helpful and friendly assistant.",
                        label="System message",
                        lines=2,
                    ),
                    gr.Slider(
                        minimum=1,
                        maximum=1024,
                        value=256,
                        step=1,
                        label="Max new tokens",
                    ),
                    gr.Slider(
                        minimum=0.1,
                        maximum=4.0,
                        value=0.8,
                        step=0.1,
                        label="Temperature",
                    ),
                    gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.9,
                        step=0.05,
                        label="Top-p (nucleus sampling)",
                    ),
                ],
            )

        # ℹ️ Side info
        with gr.Column(scale=1):
            gr.Markdown(
                f"""
                ### ℹ️ Model info
                - **Model repo:** `{MODEL_ID}`
                - **File:** `{GGUF_FILENAME}`
                - **Backend:** `llama-cpp-python`

                ### 💡 Tips
                - Lower **Max new tokens** for faster answers.
                - Lower **Temperature** (≈0.3–0.7) for focused replies.
                - Higher **Temperature** (>1.0) for more creative replies.
                """
            )

if __name__ == "__main__":
    demo.launch(share=True, show_error=True)
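
# (Assumed deployment note) On a Hugging Face Space, this app would need roughly
# the following requirements.txt; exact packages/pins are an assumption:
#
#   gradio
#   llama-cpp-python
#   huggingface_hub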