import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import logging
import shutil
import stat
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class QwenChatbot:
    # https://huggingface.co/unsloth/Qwen3-14B-GGUF/resolve/main/Qwen3-14B-UD-Q4_K_XL.gguf?download=true
    def __init__(self, model_repo="unsloth/Qwen3-14B-GGUF", gguf_file="Qwen3-14B-UD-Q4_K_XL.gguf"):
        # Use the Hugging Face cache directory
        self.cache_dir = os.getenv("HUGGINGFACE_HUB_CACHE", "./cache")
        logger.info(f"Using cache directory: {self.cache_dir}")
        # Ensure cache directory exists and is writable
        try:
            os.makedirs(self.cache_dir, exist_ok=True)
            cache_stat = os.stat(self.cache_dir)
            logger.info(f"Cache directory permissions: {oct(cache_stat.st_mode & 0o777)}")
            logger.info(f"Cache directory owner: UID {cache_stat.st_uid}, GID {cache_stat.st_gid}")
            if not (cache_stat.st_mode & stat.S_IWUSR):
                logger.error(f"Cache directory is not writable: {self.cache_dir}")
                raise PermissionError(f"Cache directory is not writable: {self.cache_dir}")
        except PermissionError as e:
            logger.error(f"Permission error with cache directory: {e}")
            raise
        except Exception as e:
            logger.error(f"Failed to set up cache directory: {e}")
            raise
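        # Context (an assumption, based on typical Hugging Face Spaces setups):
        # the container runs as a non-root user, so HUGGINGFACE_HUB_CACHE must
        # point at a writable location. os.access(self.cache_dir, os.W_OK)
        # would test the effective user's write permission more directly than
        # the owner-write bit checked above.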
        # Check disk space (assume 15GB needed)
        total, used, free = shutil.disk_usage(self.cache_dir)
        free_mb = free / (1024 * 1024)
        if free_mb < 15000:
            logger.error(f"Insufficient disk space: {free_mb:.2f}MB available")
            raise RuntimeError("Insufficient disk space")
        # Download model manually
        logger.info(f"Downloading model from {model_repo}/{gguf_file}")
        try:
            model_path = hf_hub_download(
                repo_id=model_repo,
                filename=gguf_file,
                cache_dir=self.cache_dir
            )
            logger.info(f"Model downloaded to: {model_path}")
            # Fix file permissions
            os.chmod(model_path, 0o644)
            file_stat = os.stat(model_path)
            logger.info(f"Model file permissions: {oct(file_stat.st_mode & 0o777)}")
            logger.info(f"Model file owner: UID {file_stat.st_uid}, GID {file_stat.st_gid}")
            # Fix parent directory permissions
            parent_dir = os.path.dirname(model_path)
            os.chmod(parent_dir, 0o755)
            logger.info(f"Set parent directory permissions to 0o755: {parent_dir}")
        except Exception as e:
            logger.error(f"Failed to download model: {e}")
            raise
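        # For reference: hf_hub_download stores files under
        # <cache_dir>/models--unsloth--Qwen3-14B-GGUF/snapshots/<revision>/ and
        # returns that path, so restarts reuse the cached copy as long as
        # cache_dir persists between runs.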
        # Load model
        logger.info(f"Loading GGUF model from: {model_path}")
        try:
            self.llm = Llama(
                model_path=model_path,
                n_ctx=1024,  # Reduced for lower memory usage
                n_threads=4,
                n_batch=128,  # Reduced for lower memory usage
                n_gpu_layers=1 if os.getenv("CUDA_VISIBLE_DEVICES") else 0,  # GPU offloading if available
                verbose=True
            )
            logger.info("Model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise
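        # Note on n_gpu_layers above: in llama-cpp-python it counts transformer
        # layers offloaded to the GPU, so 1 offloads a single layer and -1
        # offloads all layers (offloading only helps when a CUDA-enabled build
        # of llama-cpp-python is installed).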
        self.history = []

    def generate_response(self, user_input, max_new_tokens=512):
        # Qwen3 soft switch: a trailing /think or /no_think toggles thinking mode.
        # removesuffix (rather than replace) strips only the trailing marker, so
        # a "/think" in the middle of a message is left intact.
        think_mode = user_input.endswith("/think")
        if think_mode:
            user_input = user_input.removesuffix("/think").strip()
        elif user_input.endswith("/no_think"):
            user_input = user_input.removesuffix("/no_think").strip()
        messages = self.history + [{"role": "user", "content": user_input}]
        prompt = self._format_chatml(messages, think_mode=think_mode)
        # Sampling settings follow the Qwen3 model card recommendations for
        # thinking vs. non-thinking mode
        gen_kwargs = {
            "max_tokens": max_new_tokens,
            "stream": True,
            "temperature": 0.6 if think_mode else 0.7,
            "top_p": 0.95 if think_mode else 0.8,
            "top_k": 20,
            "min_p": 0.0,
            "stop": ["<|im_end|>"]
        }
        full_response = ""
        for chunk in self.llm(prompt, **gen_kwargs):
            new_text = chunk["choices"][0]["text"]
            full_response += new_text
            yield full_response
        self.history.append({"role": "user", "content": user_input})
        self.history.append({"role": "assistant", "content": full_response})
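
    # Note: self.history grows without bound and, with n_ctx=1024, a long
    # conversation will eventually exceed the context window; a fuller version
    # would trim or summarize old turns.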
    def _format_chatml(self, messages, think_mode=False):
        prompt = ""
        for msg in messages:
            if msg["role"] == "user":
                prompt += f"<|im_start|>user\n{msg['content']}<|im_end|>\n"
            elif msg["role"] == "assistant":
                prompt += f"<|im_start|>assistant\n{msg['content']}<|im_end|>\n"
        prompt += "<|im_start|>assistant\n"
        if not think_mode:
            # Qwen3 convention: an empty <think> block suppresses the thinking
            # phase, so it is emitted only when thinking mode is off
            prompt += "<think>\n\n</think>\n\n"
        return prompt
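
# For illustration (derived from _format_chatml above): with thinking disabled,
# a single-turn prompt looks like
#   <|im_start|>user
#   Hello<|im_end|>
#   <|im_start|>assistant
#   <think>
#
#   </think>
#
# and generation stops at the next <|im_end|> via the stop list.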
try:
    chatbot = QwenChatbot()
except Exception as e:
    logger.error(f"Failed to initialize chatbot: {e}")
    raise

def chat_function(user_input, history):
    # Gradio supplies its own history argument, but state lives in
    # chatbot.history, which is shared across all sessions of the Space
    yield from chatbot.generate_response(user_input)
demo = gr.ChatInterface(
    fn=chat_function,
    title="Qwen3 GGUF Chatbot (Streaming)",
    description="Chat with Qwen3-14B GGUF model. Use /think for thoughtful responses.",
    chatbot=gr.Chatbot(height=500),
    textbox=gr.Textbox(placeholder="Type your message..."),
    submit_btn="Send",
    concurrency_limit=1,
    max_batch_size=1
)

demo.launch()
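
# A minimal sketch of this Space's requirements.txt (an assumption; the file
# itself is not shown here):
#   gradio
#   llama-cpp-python
#   huggingface_hub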