import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import logging
import shutil
import stat

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class QwenChatbot:
    # https://huggingface.co/unsloth/Qwen3-14B-GGUF/resolve/main/Qwen3-14B-UD-Q4_K_XL.gguf?download=true
    def __init__(self, model_repo="unsloth/Qwen3-14B-GGUF", gguf_file="Qwen3-14B-UD-Q4_K_XL.gguf"):
        # Use the Hugging Face cache directory
        self.cache_dir = os.getenv("HUGGINGFACE_HUB_CACHE", "./cache")
        logger.info(f"Using cache directory: {self.cache_dir}")

        # Ensure cache directory exists and is writable
        try:
            os.makedirs(self.cache_dir, exist_ok=True)
            cache_stat = os.stat(self.cache_dir)
            logger.info(f"Cache directory permissions: {oct(cache_stat.st_mode & 0o777)}")
            logger.info(f"Cache directory owner: UID {cache_stat.st_uid}, GID {cache_stat.st_gid}")
            if not (cache_stat.st_mode & stat.S_IWUSR):
                logger.error(f"Cache directory is not writable: {self.cache_dir}")
                raise PermissionError(f"Cache directory is not writable: {self.cache_dir}")
        except PermissionError as e:
            logger.error(f"Permission error with cache directory: {e}")
            raise
        except Exception as e:
            logger.error(f"Failed to set up cache directory: {e}")
            raise

        # Check disk space (assume 15GB needed for the download plus headroom)
        total, used, free = shutil.disk_usage(self.cache_dir)
        free_mb = free / (1024 * 1024)
        if free_mb < 15000:
            logger.error(f"Insufficient disk space: {free_mb:.2f}MB available")
            raise RuntimeError("Insufficient disk space")

        # Download model manually
        logger.info(f"Downloading model from {model_repo}/{gguf_file}")
        try:
            model_path = hf_hub_download(
                repo_id=model_repo,
                filename=gguf_file,
                cache_dir=self.cache_dir
            )
            logger.info(f"Model downloaded to: {model_path}")

            # Fix file permissions
            os.chmod(model_path, 0o644)
            file_stat = os.stat(model_path)
            logger.info(f"Model file permissions: {oct(file_stat.st_mode & 0o777)}")
            logger.info(f"Model file owner: UID {file_stat.st_uid}, GID {file_stat.st_gid}")

            # Fix parent directory permissions
            parent_dir = os.path.dirname(model_path)
            os.chmod(parent_dir, 0o755)
            logger.info(f"Set parent directory permissions to 0o755: {parent_dir}")
        except Exception as e:
            logger.error(f"Failed to download model: {e}")
            raise

        # Load model
        logger.info(f"Loading GGUF model from: {model_path}")
        try:
            self.llm = Llama(
                model_path=model_path,
                n_ctx=1024,    # Reduced for lower memory usage
                n_threads=4,
                n_batch=128,   # Reduced for lower memory usage
                n_gpu_layers=1 if os.getenv("CUDA_VISIBLE_DEVICES") else 0,  # GPU offloading if available
                verbose=True
            )
            logger.info("Model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise

        self.history = []
    def generate_response(self, user_input, max_new_tokens=512):
        # Qwen3 soft switches: a trailing "/think" requests a reasoning trace,
        # "/no_think" requests a direct answer. Strip the switch before prompting.
        think_mode = user_input.endswith("/think")
        if think_mode:
            user_input = user_input[: -len("/think")].strip()
        elif user_input.endswith("/no_think"):
            user_input = user_input[: -len("/no_think")].strip()

        messages = self.history + [{"role": "user", "content": user_input}]
        prompt = self._format_chatml(messages, think_mode=think_mode)

        # Sampling settings follow the Qwen3 recommendations:
        # thinking mode uses temperature 0.6 / top_p 0.95, non-thinking 0.7 / 0.8.
        gen_kwargs = {
            "max_tokens": max_new_tokens,
            "stream": True,
            "temperature": 0.6 if think_mode else 0.7,
            "top_p": 0.95 if think_mode else 0.8,
            "top_k": 20,
            "min_p": 0.0,
            "stop": ["<|im_end|>"]
        }

        full_response = ""
        for chunk in self.llm(prompt, **gen_kwargs):
            new_text = chunk["choices"][0]["text"]
            full_response += new_text
            yield full_response

        self.history.append({"role": "user", "content": user_input})
        self.history.append({"role": "assistant", "content": full_response})
    def _format_chatml(self, messages, think_mode=False):
        prompt = ""
        for msg in messages:
            if msg["role"] == "user":
                prompt += f"<|im_start|>user\n{msg['content']}<|im_end|>\n"
            elif msg["role"] == "assistant":
                prompt += f"<|im_start|>assistant\n{msg['content']}<|im_end|>\n"
        prompt += "<|im_start|>assistant\n"
        if not think_mode:
            # An empty <think> block is how Qwen3's chat template disables the
            # reasoning trace, so it is added only when thinking is NOT requested.
            prompt += "<think>\n\n</think>\n\n"
        return prompt
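# For reference, a single-turn prompt rendered by _format_chatml looks roughly like:
#
#   <|im_start|>user
#   Hello<|im_end|>
#   <|im_start|>assistant
#
# With think_mode=False an empty "<think>\n\n</think>" block follows the assistant
# tag to request a direct answer without a reasoning trace.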
try:
    chatbot = QwenChatbot()
except Exception as e:
    logger.error(f"Failed to initialize chatbot: {e}")
    raise


def chat_function(user_input, history):
    yield from chatbot.generate_response(user_input)


demo = gr.ChatInterface(
    fn=chat_function,
    title="Qwen3 GGUF Chatbot (Streaming)",
    description="Chat with the Qwen3-14B GGUF model. Append /think for a reasoning trace or /no_think for a direct answer.",
    chatbot=gr.Chatbot(height=500),
    textbox=gr.Textbox(placeholder="Type your message..."),
    submit_btn="Send",
    concurrency_limit=1,
    max_batch_size=1
)

demo.launch()
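# The Space presumably also ships a requirements.txt; a minimal sketch inferred from
# the imports above (version pins are assumptions, not taken from the original):
#
#   gradio
#   llama-cpp-python
#   huggingface_hub
#
# Optional local smoke test (left commented out because demo.launch() blocks; it
# assumes the GGUF file is already downloaded and the model loads on this machine):
#
#   for partial in chatbot.generate_response("Hello /no_think"):
#       pass
#   print(partial)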