import os

import gradio as gr
from llama_cpp import Llama

# Hugging Face access token, read from the environment (secret named "scalable")
HF_TOKEN = os.environ.get("scalable")

MODEL_ID = "FatimaZh/llama-3.2-1b-merged-code-gguf-v2"
GGUF_FILENAME = "merged_fp16_code_v2.gguf"

# Load the GGUF model via llama-cpp
llm = Llama.from_pretrained(
    repo_id=MODEL_ID,
    filename=GGUF_FILENAME,
    hf_token=HF_TOKEN,
    n_ctx=4096,       # context length
    n_gpu_layers=-1,  # -1 = offload all layers to the GPU if one is available
)


def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # 1. Safety: history can be None
    if history is None:
        history = []

    # 2. Truncate history so it doesn't grow forever.
    #    0 = stateless (no previous turns), 2 = last 2 exchanges, etc.
    #    Each exchange is two messages (user + assistant), hence the * 2.
    MAX_HISTORY_TURNS = 0
    if MAX_HISTORY_TURNS <= 0:
        trimmed_history = []
    else:
        trimmed_history = history[-MAX_HISTORY_TURNS * 2:]

    # 3. Build chat messages for llama-cpp
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})

    # Use ONLY the trimmed history (or none)
    messages.extend(trimmed_history)
    messages.append({"role": "user", "content": message})

    # 4. Generate with llama-cpp
    out = llm.create_chat_completion(
        messages=messages,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
    )
    return out["choices"][0]["message"]["content"]


# UI
with gr.Blocks(theme=gr.themes.Soft(), title="Khadija Chatbot") as demo:
    gr.Markdown(
        """
        # 🤖 Khadija's Fine-tuned LLM (GGUF)
        Talk with your fine-tuned Llama model.
        Use the controls on the right to tweak creativity and response length.
        """
    )

    with gr.Row():
        # Chat area
        with gr.Column(scale=3):
            gr.ChatInterface(
                fn=respond,
                type="messages",  # history is a list of {"role", "content"} dicts
                additional_inputs=[
                    gr.Textbox(
                        value="You are a helpful and friendly assistant.",
                        label="System message",
                        lines=2,
                    ),
                    gr.Slider(
                        minimum=1,
                        maximum=1024,
                        value=256,
                        step=1,
                        label="Max new tokens",
                    ),
                    gr.Slider(
                        minimum=0.1,
                        maximum=4.0,
                        value=0.8,
                        step=0.1,
                        label="Temperature",
                    ),
                    gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.9,
                        step=0.05,
                        label="Top-p (nucleus sampling)",
                    ),
                ],
            )

        # Side info
        with gr.Column(scale=1):
            gr.Markdown(
                f"""
                ### ℹ️ Model info
                - **Model repo:** `{MODEL_ID}`
                - **File:** `{GGUF_FILENAME}`
                - **Backend:** `llama-cpp-python`

                ### 💡 Tips
                - Lower **Max new tokens** for faster answers.
                - Lower **Temperature** (≈0.3–0.7) for focused replies.
                - Higher **Temperature** (>1.0) for more creative replies.
                """
            )


if __name__ == "__main__":
    demo.launch(share=True)
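

# ---------------------------------------------------------------------------
# Optional: a streaming variant of `respond` (a minimal sketch, not wired in).
# It assumes llama-cpp-python's OpenAI-style streaming chunks from
# create_chat_completion(stream=True), and relies on the fact that
# gr.ChatInterface accepts a generator function: yielding the growing string
# streams tokens to the UI instead of waiting for the full completion.
# The name `respond_stream` is illustrative, not part of the original app.
# To try it, move this definition above the UI block and pass
# fn=respond_stream to gr.ChatInterface (same additional_inputs).
def respond_stream(message, history, system_message, max_tokens, temperature, top_p):
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    messages.append({"role": "user", "content": message})

    partial = ""
    for chunk in llm.create_chat_completion(
        messages=messages,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        stream=True,  # yields delta chunks instead of one full completion
    ):
        delta = chunk["choices"][0]["delta"]
        if delta.get("content"):  # first chunk carries only {"role": ...}
            partial += delta["content"]
            yield partial  # Gradio replaces the displayed reply on each yield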