import os
import gradio as gr
from llama_cpp import Llama
# HF token from Space secrets
HF_TOKEN = os.environ.get("scalable")
# Your GGUF repo on Hugging Face
MODEL_ID = "FatimaZh/llama-3.2-1b-merged-code-gguf-v2"
# Replace this with the EXACT gguf filename in that repo
GGUF_FILENAME = "merged_fp16_code_v2.gguf" # <-- CHANGE ME
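# If you're not sure of the exact filename, one option (sketch, assuming the
# huggingface_hub package installed alongside this stack) is to list the repo's GGUF files:
#   from huggingface_hub import list_repo_files
#   print([f for f in list_repo_files(MODEL_ID, token=HF_TOKEN) if f.endswith(".gguf")])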
# Load the GGUF model via llama-cpp
llm = Llama.from_pretrained(
    repo_id=MODEL_ID,
    filename=GGUF_FILENAME,
    hf_token=HF_TOKEN,
    n_ctx=4096,       # context length
    n_gpu_layers=-1,  # -1 = all layers on GPU if available, 0 = CPU only
)
def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # 1. Safety: history can be None
    if history is None:
        history = []

    # 2. Truncate history so it doesn't grow forever.
    #    history is a flat list of {"role", "content"} messages, so:
    #      0 = stateless (no previous turns)
    #      2 = keep only the last 2 messages (one user/assistant exchange), etc.
    MAX_HISTORY_TURNS = 0  # for eval, I'd keep this at 0 or very small (e.g. 2)
    if MAX_HISTORY_TURNS <= 0:
        trimmed_history = []
    else:
        trimmed_history = history[-MAX_HISTORY_TURNS:]

    # 3. Build chat messages for llama-cpp
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    # Use ONLY trimmed history (or none)
    messages.extend(trimmed_history)
    messages.append({"role": "user", "content": message})
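    # With MAX_HISTORY_TURNS = 0 (and a non-empty system message), messages ends up as just
    #   [{"role": "system", "content": system_message}, {"role": "user", "content": message}]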
    # 4. Generate with llama-cpp
    out = llm.create_chat_completion(
        messages=messages,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
    )
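    # Note: create_chat_completion(..., stream=True) would yield incremental chunks instead
    # of a single dict; this handler keeps things simple and returns the full reply at once.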
    return out["choices"][0]["message"]["content"]
# UI
with gr.Blocks(theme=gr.themes.Soft(), title="Khadija Chatbot") as demo:
    gr.Markdown(
        """
        # Khadija's Fine-tuned LLM (GGUF)
        Talk with your fine-tuned Llama model.
        Use the controls on the right to tweak creativity and response length.
        """
    )
    with gr.Row():
        # Chat area
        with gr.Column(scale=3):
            gr.ChatInterface(
                fn=respond,
                type="messages",  # history is a list of {"role", "content"} dicts
                additional_inputs=[
                    gr.Textbox(
                        value="You are a helpful and friendly assistant.",
                        label="System message",
                        lines=2,
                    ),
                    gr.Slider(
                        minimum=1,
                        maximum=1024,
                        value=256,
                        step=1,
                        label="Max new tokens",
                    ),
                    gr.Slider(
                        minimum=0.1,
                        maximum=4.0,
                        value=0.8,
                        step=0.1,
                        label="Temperature",
                    ),
                    gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.9,
                        step=0.05,
                        label="Top-p (nucleus sampling)",
                    ),
                ],
            )
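            # The additional_inputs above are passed to respond() after (message, history),
            # in this order: system_message, max_tokens, temperature, top_p.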
        # Side info
        with gr.Column(scale=1):
            gr.Markdown(
                f"""
                ### Model info
                - **Model repo:** `{MODEL_ID}`
                - **File:** `{GGUF_FILENAME}`
                - **Backend:** `llama-cpp-python`

                ### Tips
                - Lower **Max new tokens** for faster answers.
                - Lower **Temperature** (≈0.3-0.7) for focused replies.
                - Higher **Temperature** (>1.0) for more creative replies.
                """
            )
if __name__ == "__main__":
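    # Note: on Hugging Face Spaces the app already gets a public URL, so share=True is
    # usually redundant there; it mainly matters when running this script locally.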
    demo.launch(share=True, show_error=True)