import gradio as gr
import requests
import json
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# llama.cpp server endpoint
LLAMA_API_URL = "http://localhost:8000/v1/chat/completions"
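# The URL above assumes llama.cpp's OpenAI-compatible server is already running
# locally, e.g. (assumed invocation; the GGUF filename is illustrative):
#   llama-server -m Qwen3-14B-Q4_K_XL.gguf --host 0.0.0.0 --port 8000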
class QwenChatbot:
    def __init__(self, model="qwen3-14b-q4_k_xl"):
        self.model = model
        self.history = []
    def generate_response(self, user_input, max_new_tokens=512):
        # Detect the /think and /no_think suffixes. Check /no_think first, because
        # "/no_think".endswith("/think") is also True and would mangle the suffix.
        think_mode = False
        no_think = False
        if user_input.endswith("/no_think"):
            no_think = True
            user_input = user_input[: -len("/no_think")].strip()
        elif user_input.endswith("/think"):
            think_mode = True
            user_input = user_input[: -len("/think")].strip()
        # Format messages for llama.cpp (OpenAI-style chat messages)
        messages = self.history + [{"role": "user", "content": user_input}]
        if no_think:
            # An empty <think></think> prefill nudges the model to skip its reasoning
            # trace, mirroring Qwen3's chat template when thinking is disabled.
            messages.append({"role": "assistant", "content": "<think>\n\n</think>\n\n"})
        # Call the llama.cpp OpenAI-compatible API with streaming enabled
        try:
            response = requests.post(
                LLAMA_API_URL,
                json={
                    "model": self.model,
                    "messages": messages,
                    "max_tokens": max_new_tokens,
                    # Sampling settings differ between thinking and non-thinking mode
                    "temperature": 0.6 if think_mode else 0.7,
                    "top_p": 0.95 if think_mode else 0.8,
                    "top_k": 20,
                    "stream": True,
                },
                stream=True,
            )
            response.raise_for_status()
            full_response = ""
            for line in response.iter_lines():
                if not line:
                    continue
                # Server-sent events arrive as "data: {...}" lines, ending with "data: [DONE]"
                payload = line.decode("utf-8")
                if payload.startswith("data: "):
                    payload = payload[len("data: "):]
                if payload.strip() == "[DONE]":
                    break
                chunk = json.loads(payload)
                if "choices" in chunk and chunk["choices"]:
                    content = chunk["choices"][0]["delta"].get("content") or ""
                    full_response += content
                    yield full_response
            self.history.append({"role": "user", "content": user_input})
            self.history.append({"role": "assistant", "content": full_response})
        except Exception as e:
            logger.error(f"Error calling llama.cpp API: {e}")
            yield f"Error: {str(e)}"
def chat_function(user_input, history):
    chatbot = QwenChatbot()
    # Replay prior turns so the model sees context (history is a list of [user, assistant] pairs)
    for user_msg, bot_msg in history or []:
        chatbot.history.append({"role": "user", "content": user_msg})
        chatbot.history.append({"role": "assistant", "content": bot_msg})
    for response in chatbot.generate_response(user_input):
        yield response
demo = gr.ChatInterface(
    fn=chat_function,
    title="Qwen3 GGUF Chatbot (Streaming)",
    description="Chat with the Qwen3-14B GGUF model via llama.cpp. Use /think for thoughtful responses, /no_think for direct responses.",
    chatbot=gr.Chatbot(height=500),
    textbox=gr.Textbox(placeholder="Type your message..."),
    submit_btn="Send",
    concurrency_limit=1,
    max_batch_size=1
)
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)