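"""Gradio chat UI for a fine-tuned Llama 3.2 1B model in GGUF format.

Runs as a Hugging Face Space: the model file is pulled from the Hub with
llama-cpp-python and served through a gr.ChatInterface.
"""
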
import os

import gradio as gr
from llama_cpp import Llama

# 🔐 HF token stored in this Space's secrets (under the name "scalable")
HF_TOKEN = os.environ.get("scalable")
if HF_TOKEN:
    # huggingface_hub reads the HF_TOKEN environment variable, so exporting
    # the secret here lets the download below authenticate against a
    # private or gated repo.
    os.environ.setdefault("HF_TOKEN", HF_TOKEN)

# 🧠 Your GGUF repo on Hugging Face
MODEL_ID = "FatimaZh/llama-3.2-1b-merged-code-gguf-v2"

# ❗ Replace this with the EXACT gguf filename in that repo
GGUF_FILENAME = "merged_fp16_code_v2.gguf"  # <-- CHANGE ME
# 🧠 Load the GGUF model via llama-cpp
llm = Llama.from_pretrained(
repo_id=MODEL_ID,
filename=GGUF_FILENAME,
hf_token=HF_TOKEN,
n_ctx=4096, # context length
n_gpu_layers=-1, # -1 = all layers on GPU if available, 0 = CPU only
)
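
# Note: n_gpu_layers only takes effect if llama-cpp-python was built with GPU
# support (e.g. CUDA or Metal); the default pip wheel is CPU-only, in which
# case the model runs entirely on CPU regardless of this setting.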

def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # 👉 1. Safety: history can be None
    if history is None:
        history = []

    # 👉 2. Truncate history so it doesn't grow forever.
    # One turn = one user message + one assistant reply (2 history entries).
    #   0 = stateless (no previous turns)
    #   2 = keep the last 2 exchanges, etc.
    MAX_HISTORY_TURNS = 0  # for eval, I'd keep this at 0 or very small (e.g. 2)
    if MAX_HISTORY_TURNS <= 0:
        trimmed_history = []
    else:
        trimmed_history = history[-2 * MAX_HISTORY_TURNS:]

    # 👉 3. Build chat messages for llama-cpp
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})

    # Use ONLY the trimmed history (or none)
    messages.extend(trimmed_history)
    messages.append({"role": "user", "content": message})

    # 👉 4. Generate with llama-cpp
    out = llm.create_chat_completion(
        messages=messages,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
    )
    return out["choices"][0]["message"]["content"]
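
# Optional: a streaming variant of `respond` (a minimal sketch; it is not
# wired into the UI below, but you could pass fn=respond_stream to
# gr.ChatInterface to try it). With stream=True, llama-cpp yields completion
# chunks whose new text sits under choices[0]["delta"]["content"], and Gradio
# treats a generator handler as a streaming response, rendering each yielded
# string as the partial reply. For brevity this sketch is stateless (no
# history), matching MAX_HISTORY_TURNS = 0 above.
def respond_stream(message, history, system_message, max_tokens, temperature, top_p):
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    messages.append({"role": "user", "content": message})
    partial = ""
    for chunk in llm.create_chat_completion(
        messages=messages,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        stream=True,
    ):
        delta = chunk["choices"][0]["delta"]
        partial += delta.get("content") or ""  # role-only/empty deltas add nothing
        yield partial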
# 🎨 UI
with gr.Blocks(theme=gr.themes.Soft(), title="Khadija Chatbot") as demo:
gr.Markdown(
"""
# πŸ€– Khadija's Fine-tuned LLM (GGUF)
Talk with your fine-tuned Llama model.
Use the controls on the right to tweak creativity and response length.
"""
)
with gr.Row():
# πŸ—¨οΈ Chat area
with gr.Column(scale=3):
gr.ChatInterface(
fn=respond,
type="messages", # history is list of {"role","content"}
additional_inputs=[
gr.Textbox(
value="You are a helpful and friendly assistant.",
label="System message",
lines=2,
),
gr.Slider(
minimum=1,
maximum=1024,
value=256,
step=1,
label="Max new tokens",
),
gr.Slider(
minimum=0.1,
maximum=4.0,
value=0.8,
step=0.1,
label="Temperature",
),
gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.9,
step=0.05,
label="Top-p (nucleus sampling)",
),
],
)
# ℹ️ Side info
with gr.Column(scale=1):
gr.Markdown(
f"""
### ℹ️ Model info
- **Model repo:** `{MODEL_ID}`
- **File:** `{GGUF_FILENAME}`
- **Backend:** `llama-cpp-python`
### πŸ’‘ Tips
- Lower **Max new tokens** for faster answers.
- Lower **Temperature** (β‰ˆ0.3–0.7) for focused replies.
- Higher **Temperature** (>1.0) for more creative replies.
"""
)
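
# Note: Hugging Face Spaces starts the server itself and ignores share=True
# (Gradio logs a warning there); the flag only matters when this file is run
# locally, where it creates a temporary public gradio.live link.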
if __name__ == "__main__":
    demo.launch(share=True, show_error=True)