# QWEN3-GGUF / tapp.py
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import logging
import shutil
import stat
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class QwenChatbot:
    # https://huggingface.co/unsloth/Qwen3-14B-GGUF/resolve/main/Qwen3-14B-UD-Q4_K_XL.gguf?download=true
    def __init__(self, model_repo="unsloth/Qwen3-14B-GGUF", gguf_file="Qwen3-14B-UD-Q4_K_XL.gguf"):
        # Use the Hugging Face cache directory
        self.cache_dir = os.getenv("HUGGINGFACE_HUB_CACHE", "./cache")
        logger.info(f"Using cache directory: {self.cache_dir}")
        # Ensure cache directory exists and is writable
        try:
            os.makedirs(self.cache_dir, exist_ok=True)
            cache_stat = os.stat(self.cache_dir)
            logger.info(f"Cache directory permissions: {oct(cache_stat.st_mode & 0o777)}")
            logger.info(f"Cache directory owner: UID {cache_stat.st_uid}, GID {cache_stat.st_gid}")
            if not (cache_stat.st_mode & stat.S_IWUSR):
                logger.error(f"Cache directory is not writable: {self.cache_dir}")
                raise PermissionError(f"Cache directory is not writable: {self.cache_dir}")
        except PermissionError as e:
            logger.error(f"Permission error with cache directory: {e}")
            raise
        except Exception as e:
            logger.error(f"Failed to set up cache directory: {e}")
            raise
        # Check disk space (assume 15GB needed)
        total, used, free = shutil.disk_usage(self.cache_dir)
        free_mb = free / (1024 * 1024)
        if free_mb < 15000:
            logger.error(f"Insufficient disk space: {free_mb:.2f}MB available")
            raise RuntimeError("Insufficient disk space")
        # Download model manually
        logger.info(f"Downloading model from {model_repo}/{gguf_file}")
        try:
            model_path = hf_hub_download(
                repo_id=model_repo,
                filename=gguf_file,
                cache_dir=self.cache_dir
            )
            logger.info(f"Model downloaded to: {model_path}")
            # Fix file permissions
            os.chmod(model_path, 0o644)
            file_stat = os.stat(model_path)
            logger.info(f"Model file permissions: {oct(file_stat.st_mode & 0o777)}")
            logger.info(f"Model file owner: UID {file_stat.st_uid}, GID {file_stat.st_gid}")
            # Fix parent directory permissions
            parent_dir = os.path.dirname(model_path)
            os.chmod(parent_dir, 0o755)
            logger.info(f"Set parent directory permissions to 0o755: {parent_dir}")
        except Exception as e:
            logger.error(f"Failed to download model: {e}")
            raise
        # Load model
        logger.info(f"Loading GGUF model from: {model_path}")
        try:
            self.llm = Llama(
                model_path=model_path,
                n_ctx=1024,  # Reduced for lower memory usage
                n_threads=4,
                n_batch=128,  # Reduced for lower memory usage
                n_gpu_layers=1 if os.getenv("CUDA_VISIBLE_DEVICES") else 0,  # GPU offloading if available
                verbose=True
            )
            logger.info("Model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise
        self.history = []
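
    # A different quantization or a smaller Qwen3 GGUF can be swapped in via the
    # constructor arguments (hypothetical repo/file names; check the Hub for the
    # exact quant filenames available):
    #   chatbot = QwenChatbot(model_repo="unsloth/Qwen3-8B-GGUF",
    #                         gguf_file="Qwen3-8B-Q4_K_M.gguf")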
    def generate_response(self, user_input, max_new_tokens=512):
        think_mode = user_input.endswith("/think")
        if think_mode:
            user_input = user_input.replace("/think", "").strip()
        elif user_input.endswith("/no_think"):
            user_input = user_input.replace("/no_think", "").strip()
        messages = self.history + [{"role": "user", "content": user_input}]
        prompt = self._format_chatml(messages, think_mode=think_mode)
        gen_kwargs = {
            "max_tokens": max_new_tokens,
            "stream": True,
            "temperature": 0.6 if think_mode else 0.7,
            "top_p": 0.95 if think_mode else 0.8,
            "top_k": 20,
            "min_p": 0.0,
            "stop": ["<|im_end|>"]
        }
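        # Note: these values follow the sampling settings commonly recommended for
        # Qwen3's thinking mode (temp 0.6, top_p 0.95) and non-thinking mode
        # (temp 0.7, top_p 0.8); treat them as a starting point, not a requirement.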
        full_response = ""
        for chunk in self.llm(prompt, **gen_kwargs):
            new_text = chunk["choices"][0]["text"]
            full_response += new_text
            yield full_response
        self.history.append({"role": "user", "content": user_input})
        self.history.append({"role": "assistant", "content": full_response})
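
    # Hypothetical direct use of the streaming generator outside Gradio; each
    # yielded value is the response accumulated so far, so the final value is
    # the complete reply:
    #   bot = QwenChatbot()
    #   for partial in bot.generate_response("Summarize GGUF quantization /no_think"):
    #       print(partial, end="\r")
    #   print()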
    def _format_chatml(self, messages, think_mode=False):
        # Build a ChatML-style prompt for Qwen3 from the running message history.
        prompt = ""
        for msg in messages:
            if msg["role"] == "user":
                prompt += f"<|im_start|>user\n{msg['content']}<|im_end|>\n"
            elif msg["role"] == "assistant":
                prompt += f"<|im_start|>assistant\n{msg['content']}<|im_end|>\n"
        prompt += "<|im_start|>assistant\n"
        if not think_mode:
            # Pre-fill an empty think block to suppress reasoning when /think was
            # not requested, mirroring the Qwen3 chat template with thinking disabled.
            prompt += "<think>\n\n</think>\n\n"
        return prompt
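
    # For reference, a single user turn with think_mode=False produces a prompt
    # roughly like the following (illustrative comment only):
    #
    #   <|im_start|>user
    #   Hello<|im_end|>
    #   <|im_start|>assistant
    #   <think>
    #
    #   </think>
    #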
try:
    chatbot = QwenChatbot()
except Exception as e:
    logger.error(f"Failed to initialize chatbot: {e}")
    raise
def chat_function(user_input, history):
    # The Gradio-provided history argument is ignored; the chatbot keeps its own history.
    yield from chatbot.generate_response(user_input)
demo = gr.ChatInterface(
    fn=chat_function,
    title="Qwen3 GGUF Chatbot (Streaming)",
    description="Chat with Qwen3-14B GGUF model. Use /think for thoughtful responses.",
    chatbot=gr.Chatbot(height=500),
    textbox=gr.Textbox(placeholder="Type your message..."),
    submit_btn="Send",
    concurrency_limit=1,
    max_batch_size=1
)
demo.launch()
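
# To run locally (assumes the imported packages are installed, e.g.
# `pip install gradio llama-cpp-python huggingface_hub`):
#   python tapp.py
# Gradio serves the UI on http://127.0.0.1:7860 by default; set HUGGINGFACE_HUB_CACHE
# to control where the multi-gigabyte GGUF file is cached.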