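# Gradio front-end for a Qwen3 GGUF model served through llama.cpp's
# OpenAI-compatible chat-completions API.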
import gradio as gr
import requests
import json
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# llama.cpp server endpoint
LLAMA_API_URL = "http://localhost:8000/v1/chat/completions"
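# Example server launch (adjust the model path and flags for your setup; --jinja
# applies the model's own chat template, which Qwen3's <think> handling relies on):
#   llama-server -m Qwen3-14B-Q4_K_XL.gguf --port 8000 --jinja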

class QwenChatbot:
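    """Streaming chat client for a Qwen3 GGUF model behind a llama.cpp server."""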
    def __init__(self, model="qwen3-14b-q4_k_xl"):
        self.model = model
        self.history = []

    def generate_response(self, user_input, max_new_tokens=512):
        # Qwen3 soft switches: a trailing "/think" asks for reasoning mode,
        # "/no_think" (or no suffix) asks for a direct answer.
        think_mode = user_input.endswith("/think")
        if think_mode:
            user_input = user_input[: -len("/think")].strip()
        elif user_input.endswith("/no_think"):
            user_input = user_input[: -len("/no_think")].strip()

        # Format messages for the llama.cpp OpenAI-compatible endpoint
        messages = self.history + [{"role": "user", "content": user_input}]
        if not think_mode:
            # Prefill an empty <think> block so Qwen3 skips its reasoning phase
            # (assumes the server applies the Qwen3 chat template and continues
            # a trailing assistant message).
            messages.append({"role": "assistant", "content": "<think>\n\n</think>\n\n"})

        # Call llama.cpp API
        try:
            response = requests.post(
                LLAMA_API_URL,
                json={
                    "model": self.model,
                    "messages": messages,
                    "max_tokens": max_new_tokens,
                    "temperature": 0.6 if think_mode else 0.7,
                    "top_p": 0.95 if think_mode else 0.8,
                    "top_k": 20,
                    "stream": True
                },
                stream=True
            )
            response.raise_for_status()

            full_response = ""
            for line in response.iter_lines():
                if line:
                    chunk = json.loads(line.decode("utf-8").replace("data: ", ""))
                    if "choices" in chunk and chunk["choices"]:
                        content = chunk["choices"][0]["delta"].get("content", "")
                        full_response += content
                        yield full_response

            self.history.append({"role": "user", "content": user_input})
            self.history.append({"role": "assistant", "content": full_response})

        except Exception as e:
            logger.error(f"Error calling llama.cpp API: {e}")
            yield f"Error: {str(e)}"

def chat_function(user_input, history):
    chatbot = QwenChatbot()
    # Rebuild context from Gradio's history ([user, assistant] pairs) so multi-turn chat works
    for user_msg, assistant_msg in history:
        chatbot.history.append({"role": "user", "content": user_msg})
        chatbot.history.append({"role": "assistant", "content": assistant_msg})
    for response in chatbot.generate_response(user_input):
        yield response

demo = gr.ChatInterface(
    fn=chat_function,
    title="Qwen3 GGUF Chatbot (Streaming)",
    description="Chat with a Qwen3-14B GGUF model served by llama.cpp. Append /think for step-by-step reasoning or /no_think for a direct answer (the default).",
    chatbot=gr.Chatbot(height=500),
    textbox=gr.Textbox(placeholder="Type your message..."),
    submit_btn="Send",
    concurrency_limit=1,
    max_batch_size=1
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)