# QWEN3-GGUF / tapp.py
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import logging
import shutil
import stat
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class QwenChatbot:
    # https://huggingface.co/unsloth/Qwen3-14B-GGUF/resolve/main/Qwen3-14B-UD-Q4_K_XL.gguf?download=true
    def __init__(self, model_repo="unsloth/Qwen3-14B-GGUF", gguf_file="Qwen3-14B-UD-Q4_K_XL.gguf"):
        # Use the Hugging Face cache directory
        self.cache_dir = os.getenv("HUGGINGFACE_HUB_CACHE", "./cache")
        logger.info(f"Using cache directory: {self.cache_dir}")
        # Ensure cache directory exists and is writable
        try:
            os.makedirs(self.cache_dir, exist_ok=True)
            cache_stat = os.stat(self.cache_dir)
            logger.info(f"Cache directory permissions: {oct(cache_stat.st_mode & 0o777)}")
            logger.info(f"Cache directory owner: UID {cache_stat.st_uid}, GID {cache_stat.st_gid}")
            if not (cache_stat.st_mode & stat.S_IWUSR):
                logger.error(f"Cache directory is not writable: {self.cache_dir}")
                raise PermissionError(f"Cache directory is not writable: {self.cache_dir}")
        except PermissionError as e:
            logger.error(f"Permission error with cache directory: {e}")
            raise
        except Exception as e:
            logger.error(f"Failed to set up cache directory: {e}")
            raise
        # Check disk space (assume 15GB needed)
        total, used, free = shutil.disk_usage(self.cache_dir)
        free_mb = free / (1024 * 1024)
        if free_mb < 15000:
            logger.error(f"Insufficient disk space: {free_mb:.2f}MB available")
            raise RuntimeError("Insufficient disk space")
        # Download model manually
        logger.info(f"Downloading model from {model_repo}/{gguf_file}")
        try:
            model_path = hf_hub_download(
                repo_id=model_repo,
                filename=gguf_file,
                cache_dir=self.cache_dir
            )
            logger.info(f"Model downloaded to: {model_path}")
            # Fix file permissions
            os.chmod(model_path, 0o644)
            file_stat = os.stat(model_path)
            logger.info(f"Model file permissions: {oct(file_stat.st_mode & 0o777)}")
            logger.info(f"Model file owner: UID {file_stat.st_uid}, GID {file_stat.st_gid}")
            # Fix parent directory permissions
            parent_dir = os.path.dirname(model_path)
            os.chmod(parent_dir, 0o755)
            logger.info(f"Set parent directory permissions to 0o755: {parent_dir}")
        except Exception as e:
            logger.error(f"Failed to download model: {e}")
            raise
        # Load model
        logger.info(f"Loading GGUF model from: {model_path}")
        try:
            self.llm = Llama(
                model_path=model_path,
                n_ctx=1024,  # Reduced for lower memory usage
                n_threads=4,
                n_batch=128,  # Reduced for lower memory usage
                n_gpu_layers=1 if os.getenv("CUDA_VISIBLE_DEVICES") else 0,  # GPU offloading if available
                verbose=True
            )
            logger.info("Model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise
        self.history = []
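
    # A different quantization or a smaller Qwen3 GGUF can be swapped in via the
    # constructor arguments (hypothetical repo/file names; check the Hub for the
    # exact quant filenames available):
    #   chatbot = QwenChatbot(model_repo="unsloth/Qwen3-8B-GGUF",
    #                         gguf_file="Qwen3-8B-Q4_K_M.gguf")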
    def generate_response(self, user_input, max_new_tokens=512):
        think_mode = user_input.endswith("/think")
        if think_mode:
            user_input = user_input.replace("/think", "").strip()
        elif user_input.endswith("/no_think"):
            user_input = user_input.replace("/no_think", "").strip()
        messages = self.history + [{"role": "user", "content": user_input}]
        prompt = self._format_chatml(messages, think_mode=think_mode)
        gen_kwargs = {
            "max_tokens": max_new_tokens,
            "stream": True,
            "temperature": 0.6 if think_mode else 0.7,
            "top_p": 0.95 if think_mode else 0.8,
            "top_k": 20,
            "min_p": 0.0,
            "stop": ["<|im_end|>"]
        }
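        # Note: these values follow the sampling settings commonly recommended for
        # Qwen3's thinking mode (temp 0.6, top_p 0.95) and non-thinking mode
        # (temp 0.7, top_p 0.8); treat them as a starting point, not a requirement.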
        full_response = ""
        for chunk in self.llm(prompt, **gen_kwargs):
            new_text = chunk["choices"][0]["text"]
            full_response += new_text
            yield full_response
        self.history.append({"role": "user", "content": user_input})
        self.history.append({"role": "assistant", "content": full_response})
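
    # Hypothetical direct use of the streaming generator outside Gradio; each
    # yielded value is the response accumulated so far, so the final value is
    # the complete reply:
    #   bot = QwenChatbot()
    #   for partial in bot.generate_response("Summarize GGUF quantization /no_think"):
    #       print(partial, end="\r")
    #   print()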
    def _format_chatml(self, messages, think_mode=False):
        # Build a ChatML-style prompt for Qwen3 from the running message history.
        prompt = ""
        for msg in messages:
            if msg["role"] == "user":
                prompt += f"<|im_start|>user\n{msg['content']}<|im_end|>\n"
            elif msg["role"] == "assistant":
                prompt += f"<|im_start|>assistant\n{msg['content']}<|im_end|>\n"
        prompt += "<|im_start|>assistant\n"
        if not think_mode:
            # Pre-fill an empty think block to suppress reasoning when /think was
            # not requested, mirroring the Qwen3 chat template with thinking disabled.
            prompt += "<think>\n\n</think>\n\n"
        return prompt
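
    # For reference, a single user turn with think_mode=False produces a prompt
    # roughly like the following (illustrative comment only):
    #
    #   <|im_start|>user
    #   Hello<|im_end|>
    #   <|im_start|>assistant
    #   <think>
    #
    #   </think>
    #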
try:
    chatbot = QwenChatbot()
except Exception as e:
    logger.error(f"Failed to initialize chatbot: {e}")
    raise
def chat_function(user_input, history):
    # The Gradio-provided history argument is ignored; the chatbot keeps its own history.
    yield from chatbot.generate_response(user_input)
demo = gr.ChatInterface(
    fn=chat_function,
    title="Qwen3 GGUF Chatbot (Streaming)",
    description="Chat with Qwen3-14B GGUF model. Use /think for thoughtful responses.",
    chatbot=gr.Chatbot(height=500),
    textbox=gr.Textbox(placeholder="Type your message..."),
    submit_btn="Send",
    concurrency_limit=1,
    max_batch_size=1
)
demo.launch()
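
# To run locally (assumes the imported packages are installed, e.g.
# `pip install gradio llama-cpp-python huggingface_hub`):
#   python tapp.py
# Gradio serves the UI on http://127.0.0.1:7860 by default; set HUGGINGFACE_HUB_CACHE
# to control where the multi-gigabyte GGUF file is cached.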