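"""Gradio chat UI for a fine-tuned Llama 3.2 1B model in GGUF format.

Runs as a Hugging Face Space: the model file is pulled from the Hub with
llama-cpp-python and served through a gr.ChatInterface.
"""
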
import os

import gradio as gr
from llama_cpp import Llama

# 🔐 HF token stored in this Space's secrets (under the name "scalable")
HF_TOKEN = os.environ.get("scalable")
if HF_TOKEN:
    # huggingface_hub reads the HF_TOKEN environment variable, so exporting
    # the secret here lets the download below authenticate against a
    # private or gated repo.
    os.environ.setdefault("HF_TOKEN", HF_TOKEN)

# 🧠 Your GGUF repo on Hugging Face
MODEL_ID = "FatimaZh/llama-3.2-1b-merged-code-gguf-v2"

# ❗ Replace this with the EXACT gguf filename in that repo
GGUF_FILENAME = "merged_fp16_code_v2.gguf"  # <-- CHANGE ME
# 🧠 Load the GGUF model via llama-cpp
llm = Llama.from_pretrained(
repo_id=MODEL_ID,
filename=GGUF_FILENAME,
hf_token=HF_TOKEN,
n_ctx=4096, # context length
n_gpu_layers=-1, # -1 = all layers on GPU if available, 0 = CPU only
)
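
# Note: n_gpu_layers only takes effect if llama-cpp-python was built with GPU
# support (e.g. CUDA or Metal); the default pip wheel is CPU-only, in which
# case the model runs entirely on CPU regardless of this setting.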

def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # 👉 1. Safety: history can be None
    if history is None:
        history = []

    # 👉 2. Truncate history so it doesn't grow forever.
    # One turn = one user message + one assistant reply (2 history entries).
    #   0 = stateless (no previous turns)
    #   2 = keep the last 2 exchanges, etc.
    MAX_HISTORY_TURNS = 0  # for eval, I'd keep this at 0 or very small (e.g. 2)
    if MAX_HISTORY_TURNS <= 0:
        trimmed_history = []
    else:
        trimmed_history = history[-2 * MAX_HISTORY_TURNS:]

    # 👉 3. Build chat messages for llama-cpp
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})

    # Use ONLY the trimmed history (or none)
    messages.extend(trimmed_history)
    messages.append({"role": "user", "content": message})

    # 👉 4. Generate with llama-cpp
    out = llm.create_chat_completion(
        messages=messages,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
    )
    return out["choices"][0]["message"]["content"]
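
# Optional: a streaming variant of `respond` (a minimal sketch; it is not
# wired into the UI below, but you could pass fn=respond_stream to
# gr.ChatInterface to try it). With stream=True, llama-cpp yields completion
# chunks whose new text sits under choices[0]["delta"]["content"], and Gradio
# treats a generator handler as a streaming response, rendering each yielded
# string as the partial reply. For brevity this sketch is stateless (no
# history), matching MAX_HISTORY_TURNS = 0 above.
def respond_stream(message, history, system_message, max_tokens, temperature, top_p):
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    messages.append({"role": "user", "content": message})
    partial = ""
    for chunk in llm.create_chat_completion(
        messages=messages,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        stream=True,
    ):
        delta = chunk["choices"][0]["delta"]
        partial += delta.get("content") or ""  # role-only/empty deltas add nothing
        yield partial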
# 🎨 UI
with gr.Blocks(theme=gr.themes.Soft(), title="Khadija Chatbot") as demo:
gr.Markdown(
"""
# πŸ€– Khadija's Fine-tuned LLM (GGUF)
Talk with your fine-tuned Llama model.
Use the controls on the right to tweak creativity and response length.
"""
)
with gr.Row():
# πŸ—¨οΈ Chat area
with gr.Column(scale=3):
gr.ChatInterface(
fn=respond,
type="messages", # history is list of {"role","content"}
additional_inputs=[
gr.Textbox(
value="You are a helpful and friendly assistant.",
label="System message",
lines=2,
),
gr.Slider(
minimum=1,
maximum=1024,
value=256,
step=1,
label="Max new tokens",
),
gr.Slider(
minimum=0.1,
maximum=4.0,
value=0.8,
step=0.1,
label="Temperature",
),
gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.9,
step=0.05,
label="Top-p (nucleus sampling)",
),
],
)
# ℹ️ Side info
with gr.Column(scale=1):
gr.Markdown(
f"""
### ℹ️ Model info
- **Model repo:** `{MODEL_ID}`
- **File:** `{GGUF_FILENAME}`
- **Backend:** `llama-cpp-python`
### πŸ’‘ Tips
- Lower **Max new tokens** for faster answers.
- Lower **Temperature** (β‰ˆ0.3–0.7) for focused replies.
- Higher **Temperature** (>1.0) for more creative replies.
"""
)
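
# Note: Hugging Face Spaces starts the server itself and ignores share=True
# (Gradio logs a warning there); the flag only matters when this file is run
# locally, where it creates a temporary public gradio.live link.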
if __name__ == "__main__":
    demo.launch(share=True, show_error=True)