import gradio as gr
import requests
import json
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# llama.cpp server endpoint (OpenAI-compatible chat completions API)
LLAMA_API_URL = "http://localhost:8000/v1/chat/completions"


class QwenChatbot:
    def __init__(self, model="qwen3-14b-q4_k_xl"):
        self.model = model
        self.history = []

    def generate_response(self, user_input, max_new_tokens=512):
        # Qwen3 soft switches: a trailing /think or /no_think toggles the
        # model's reasoning phase. Qwen3 thinks by default, so only
        # /no_think changes the mode; both suffixes are stripped from the
        # prompt before it is sent.
        think_mode = not user_input.endswith("/no_think")
        if user_input.endswith("/think"):
            user_input = user_input[: -len("/think")].strip()
        elif user_input.endswith("/no_think"):
            user_input = user_input[: -len("/no_think")].strip()

        # Format messages for llama.cpp
        messages = self.history + [{"role": "user", "content": user_input}]
        if not think_mode:
            # Prefill an empty think block so the model skips its reasoning
            # phase. This assumes the server continues a trailing assistant
            # message rather than opening a fresh turn.
            messages.append(
                {"role": "assistant", "content": "<think>\n\n</think>\n\n"}
            )

        # Call the llama.cpp API with Qwen3's recommended sampling settings
        # (0.6/0.95 for thinking mode, 0.7/0.8 for non-thinking mode).
        try:
            response = requests.post(
                LLAMA_API_URL,
                json={
                    "model": self.model,
                    "messages": messages,
                    "max_tokens": max_new_tokens,
                    "temperature": 0.6 if think_mode else 0.7,
                    "top_p": 0.95 if think_mode else 0.8,
                    "top_k": 20,
                    "stream": True,
                },
                stream=True,
                timeout=300,
            )
            response.raise_for_status()

            full_response = ""
            for line in response.iter_lines():
                if not line:
                    continue
                # Server-sent events: each line is "data: <json>" and the
                # stream terminates with "data: [DONE]", which is not JSON.
                payload = line.decode("utf-8")
                if payload.startswith("data: "):
                    payload = payload[len("data: "):]
                if payload.strip() == "[DONE]":
                    break
                chunk = json.loads(payload)
                if "choices" in chunk and chunk["choices"]:
                    content = chunk["choices"][0]["delta"].get("content") or ""
                    full_response += content
                    yield full_response

            self.history.append({"role": "user", "content": user_input})
            self.history.append({"role": "assistant", "content": full_response})
        except Exception as e:
            logger.error(f"Error calling llama.cpp API: {e}")
            yield f"Error: {str(e)}"


def chat_function(user_input, history):
    # ChatInterface calls this function fresh on every turn, so a new
    # QwenChatbot would otherwise start with an empty history. Rebuild the
    # server-side context from Gradio's [user, assistant] pairs instead.
    chatbot = QwenChatbot()
    for user_msg, assistant_msg in history:
        chatbot.history.append({"role": "user", "content": user_msg})
        chatbot.history.append({"role": "assistant", "content": assistant_msg})
    yield from chatbot.generate_response(user_input)


demo = gr.ChatInterface(
    fn=chat_function,
    title="Qwen3 GGUF Chatbot (Streaming)",
    description=(
        "Chat with the Qwen3-14B GGUF model via llama.cpp. "
        "End a message with /think for a reasoned response or "
        "/no_think for a direct one."
    ),
    chatbot=gr.Chatbot(height=500),
    textbox=gr.Textbox(placeholder="Type your message..."),
    submit_btn="Send",
    concurrency_limit=1,  # serialize requests; the server handles one stream at a time
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
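
# ---------------------------------------------------------------------------
# Usage sketch (assumptions, not part of the app): the script expects an
# OpenAI-compatible llama.cpp server on localhost:8000. One way to start it,
# assuming a local GGUF file with this (hypothetical) name:
#
#   llama-server -m Qwen3-14B-Q4_K_XL.gguf --port 8000
#
# QwenChatbot can also be exercised without the Gradio UI, e.g.:
#
#   bot = QwenChatbot()
#   last = ""
#   for partial in bot.generate_response("Why is the sky blue? /no_think"):
#       last = partial  # each yield is the full response so far
#   print(last)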