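# Gradio front-end for a Qwen3 GGUF model served through llama.cpp's
# OpenAI-compatible chat-completions API.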
import gradio as gr
import requests
import json
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# llama.cpp server endpoint
LLAMA_API_URL = "http://localhost:8000/v1/chat/completions"
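# Example server launch (adjust the model path and flags for your setup; --jinja
# applies the model's own chat template, which Qwen3's <think> handling relies on):
#   llama-server -m Qwen3-14B-Q4_K_XL.gguf --port 8000 --jinja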

class QwenChatbot:
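    """Streaming chat client for a Qwen3 GGUF model behind a llama.cpp server."""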
    def __init__(self, model="qwen3-14b-q4_k_xl"):
        self.model = model
        self.history = []

    def generate_response(self, user_input, max_new_tokens=512):
        # Qwen3 soft switches: a trailing "/think" asks for reasoning mode,
        # "/no_think" (or no suffix) asks for a direct answer.
        think_mode = user_input.endswith("/think")
        if think_mode:
            user_input = user_input[: -len("/think")].strip()
        elif user_input.endswith("/no_think"):
            user_input = user_input[: -len("/no_think")].strip()

        # Format messages for the llama.cpp OpenAI-compatible endpoint
        messages = self.history + [{"role": "user", "content": user_input}]
        if not think_mode:
            # Prefill an empty <think> block so Qwen3 skips its reasoning phase
            # (assumes the server applies the Qwen3 chat template and continues
            # a trailing assistant message).
            messages.append({"role": "assistant", "content": "<think>\n\n</think>\n\n"})

        # Call llama.cpp API
        try:
            response = requests.post(
                LLAMA_API_URL,
                json={
                    "model": self.model,
                    "messages": messages,
                    "max_tokens": max_new_tokens,
                    "temperature": 0.6 if think_mode else 0.7,
                    "top_p": 0.95 if think_mode else 0.8,
                    "top_k": 20,
                    "stream": True
                },
                stream=True
            )
            response.raise_for_status()

            full_response = ""
            for line in response.iter_lines():
                if line:
                    chunk = json.loads(line.decode("utf-8").replace("data: ", ""))
                    if "choices" in chunk and chunk["choices"]:
                        content = chunk["choices"][0]["delta"].get("content", "")
                        full_response += content
                        yield full_response

            self.history.append({"role": "user", "content": user_input})
            self.history.append({"role": "assistant", "content": full_response})

        except Exception as e:
            logger.error(f"Error calling llama.cpp API: {e}")
            yield f"Error: {str(e)}"

def chat_function(user_input, history):
    chatbot = QwenChatbot()
    # Rebuild context from Gradio's history ([user, assistant] pairs) so multi-turn chat works
    for user_msg, assistant_msg in history:
        chatbot.history.append({"role": "user", "content": user_msg})
        chatbot.history.append({"role": "assistant", "content": assistant_msg})
    for response in chatbot.generate_response(user_input):
        yield response

demo = gr.ChatInterface(
    fn=chat_function,
    title="Qwen3 GGUF Chatbot (Streaming)",
    description="Chat with a Qwen3-14B GGUF model served by llama.cpp. Append /think for step-by-step reasoning or /no_think for a direct answer (the default).",
    chatbot=gr.Chatbot(height=500),
    textbox=gr.Textbox(placeholder="Type your message..."),
    submit_btn="Send",
    concurrency_limit=1,
    max_batch_size=1
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)