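"""Gradio chat demo for a quantized Qwen3-14B GGUF model served with llama-cpp-python.

The GGUF file is fetched from the Hugging Face Hub into a local cache, loaded with
llama.cpp, and exposed through a streaming gr.ChatInterface. Appending /think or
/no_think to a message toggles the model's reasoning trace.
"""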
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import logging
import shutil
import stat

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class QwenChatbot:
    #https://huggingface.co/unsloth/Qwen3-14B-GGUF/resolve/main/Qwen3-14B-UD-Q4_K_XL.gguf?download=true
    def __init__(self, model_repo="unsloth/Qwen3-14B-GGUF", gguf_file="Qwen3-14B-UD-Q4_K_XL.gguf"):
        # Use the Hugging Face cache directory
        self.cache_dir = os.getenv("HUGGINGFACE_HUB_CACHE", "./cache")
        logger.info(f"Using cache directory: {self.cache_dir}")

        # Ensure cache directory exists and is writable
        try:
            os.makedirs(self.cache_dir, exist_ok=True)
            cache_stat = os.stat(self.cache_dir)
            logger.info(f"Cache directory permissions: {oct(cache_stat.st_mode & 0o777)}")
            logger.info(f"Cache directory owner: UID {cache_stat.st_uid}, GID {cache_stat.st_gid}")
            if not (cache_stat.st_mode & stat.S_IWUSR):
                logger.error(f"Cache directory is not writable: {self.cache_dir}")
                raise PermissionError(f"Cache directory is not writable: {self.cache_dir}")
        except PermissionError as e:
            logger.error(f"Permission error with cache directory: {e}")
            raise
        except Exception as e:
            logger.error(f"Failed to set up cache directory: {e}")
            raise

        # Check disk space (roughly 15 GB needed for the GGUF download plus headroom)
        total, used, free = shutil.disk_usage(self.cache_dir)
        free_mb = free / (1024 * 1024)
        if free_mb < 15000:
            logger.error(f"Insufficient disk space: {free_mb:.2f}MB available, ~15GB required")
            raise RuntimeError(f"Insufficient disk space: {free_mb:.2f}MB available, ~15GB required")

        # Download model manually
        logger.info(f"Downloading model from {model_repo}/{gguf_file}")
        try:
            model_path = hf_hub_download(
                repo_id=model_repo,
                filename=gguf_file,
                cache_dir=self.cache_dir
            )
            logger.info(f"Model downloaded to: {model_path}")
            # Fix file permissions
            os.chmod(model_path, 0o644)
            file_stat = os.stat(model_path)
            logger.info(f"Model file permissions: {oct(file_stat.st_mode & 0o777)}")
            logger.info(f"Model file owner: UID {file_stat.st_uid}, GID {file_stat.st_gid}")
            # Fix parent directory permissions
            parent_dir = os.path.dirname(model_path)
            os.chmod(parent_dir, 0o755)
            logger.info(f"Set parent directory permissions to 0o755: {parent_dir}")
        except Exception as e:
            logger.error(f"Failed to download model: {e}")
            raise

        # Load model
        logger.info(f"Loading GGUF model from: {model_path}")
        try:
            self.llm = Llama(
                model_path=model_path,
                n_ctx=1024,  # Reduced for lower memory usage
                n_threads=4,
                n_batch=128,  # Reduced for lower memory usage
                n_gpu_layers=1 if os.getenv("CUDA_VISIBLE_DEVICES") else 0,  # GPU offloading if available
                verbose=True
            )
            logger.info("Model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise

        self.history = []

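    # Streamed generation with Qwen3-style "soft switches": a trailing /think or /no_think
    # on the user message toggles the reasoning trace, and the sampling settings follow the
    # Qwen3 recommendations (temperature 0.6 / top_p 0.95 with thinking, 0.7 / 0.8 without).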
    def generate_response(self, user_input, max_new_tokens=512):
        think_mode = user_input.endswith("/think")
        if think_mode:
            # Strip only the trailing switch, not every occurrence in the message.
            user_input = user_input[: -len("/think")].strip()
        elif user_input.endswith("/no_think"):
            user_input = user_input[: -len("/no_think")].strip()

        messages = self.history + [{"role": "user", "content": user_input}]
        prompt = self._format_chatml(messages, think_mode=think_mode)

        gen_kwargs = {
            "max_tokens": max_new_tokens,
            "stream": True,
            "temperature": 0.6 if think_mode else 0.7,
            "top_p": 0.95 if think_mode else 0.8,
            "top_k": 20,
            "min_p": 0.0,
            "stop": ["<|im_end|>"]
        }

        full_response = ""
        for chunk in self.llm(prompt, **gen_kwargs):
            new_text = chunk["choices"][0]["text"]
            full_response += new_text
            yield full_response

        self.history.append({"role": "user", "content": user_input})
        self.history.append({"role": "assistant", "content": full_response})

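    # Manually build a ChatML prompt (<|im_start|>role ... <|im_end|>), the chat format
    # Qwen3 models use; system messages are not used in this app.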
    def _format_chatml(self, messages, think_mode=False):
        prompt = ""
        for msg in messages:
            if msg["role"] == "user":
                prompt += f"<|im_start|>user\n{msg['content']}<|im_end|>\n"
            elif msg["role"] == "assistant":
                prompt += f"<|im_start|>assistant\n{msg['content']}<|im_end|>\n"
        prompt += "<|im_start|>assistant\n"
        # Qwen3 convention: prefilling an empty <think> block disables the reasoning
        # trace, so add it only when thinking was NOT requested.
        if not think_mode:
            prompt += "<think>\n\n</think>\n\n"
        return prompt

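# Instantiate the chatbot at import time so the large download and model load happen
# once, before the Gradio interface starts serving requests.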
try:
    chatbot = QwenChatbot()
except Exception as e:
    logger.error(f"Failed to initialize chatbot: {e}")
    raise

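# gr.ChatInterface passes (message, history); the Gradio-side history is ignored because
# QwenChatbot keeps its own running transcript across turns.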
def chat_function(user_input, history):
    yield from chatbot.generate_response(user_input)

demo = gr.ChatInterface(
    fn=chat_function,
    title="Qwen3 GGUF Chatbot (Streaming)",
    description="Chat with Qwen3-14B GGUF model. Use /think for thoughtful responses.",
    chatbot=gr.Chatbot(height=500),
    textbox=gr.Textbox(placeholder="Type your message..."),
    submit_btn="Send",
    concurrency_limit=1,
    max_batch_size=1
)

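# Launch with Gradio defaults (http://127.0.0.1:7860 locally; a Hugging Face Space
# supplies its own host/port configuration via environment variables).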
demo.launch()