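"""Gradio chat demo for a quantized Qwen3-14B GGUF model served with llama-cpp-python.

The GGUF file is fetched from the Hugging Face Hub into a local cache, loaded with
llama.cpp, and exposed through a streaming gr.ChatInterface. Appending /think or
/no_think to a message toggles the model's reasoning trace.
"""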
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import logging
import shutil
import stat

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class QwenChatbot:
    #https://huggingface.co/unsloth/Qwen3-14B-GGUF/resolve/main/Qwen3-14B-UD-Q4_K_XL.gguf?download=true
    def __init__(self, model_repo="unsloth/Qwen3-14B-GGUF", gguf_file="Qwen3-14B-UD-Q4_K_XL.gguf"):
        # Use the Hugging Face cache directory
        self.cache_dir = os.getenv("HUGGINGFACE_HUB_CACHE", "./cache")
        logger.info(f"Using cache directory: {self.cache_dir}")

        # Ensure cache directory exists and is writable
        try:
            os.makedirs(self.cache_dir, exist_ok=True)
            cache_stat = os.stat(self.cache_dir)
            logger.info(f"Cache directory permissions: {oct(cache_stat.st_mode & 0o777)}")
            logger.info(f"Cache directory owner: UID {cache_stat.st_uid}, GID {cache_stat.st_gid}")
            if not (cache_stat.st_mode & stat.S_IWUSR):
                logger.error(f"Cache directory is not writable: {self.cache_dir}")
                raise PermissionError(f"Cache directory is not writable: {self.cache_dir}")
        except PermissionError as e:
            logger.error(f"Permission error with cache directory: {e}")
            raise
        except Exception as e:
            logger.error(f"Failed to set up cache directory: {e}")
            raise

        # Check disk space (roughly 15 GB needed for the GGUF download plus headroom)
        total, used, free = shutil.disk_usage(self.cache_dir)
        free_mb = free / (1024 * 1024)
        if free_mb < 15000:
            logger.error(f"Insufficient disk space: {free_mb:.2f}MB available, ~15GB required")
            raise RuntimeError(f"Insufficient disk space: {free_mb:.2f}MB available, ~15GB required")

        # Download model manually
        logger.info(f"Downloading model from {model_repo}/{gguf_file}")
        try:
            model_path = hf_hub_download(
                repo_id=model_repo,
                filename=gguf_file,
                cache_dir=self.cache_dir
            )
            logger.info(f"Model downloaded to: {model_path}")
            # Fix file permissions
            os.chmod(model_path, 0o644)
            file_stat = os.stat(model_path)
            logger.info(f"Model file permissions: {oct(file_stat.st_mode & 0o777)}")
            logger.info(f"Model file owner: UID {file_stat.st_uid}, GID {file_stat.st_gid}")
            # Fix parent directory permissions
            parent_dir = os.path.dirname(model_path)
            os.chmod(parent_dir, 0o755)
            logger.info(f"Set parent directory permissions to 0o755: {parent_dir}")
        except Exception as e:
            logger.error(f"Failed to download model: {e}")
            raise

        # Load model
        logger.info(f"Loading GGUF model from: {model_path}")
        try:
            self.llm = Llama(
                model_path=model_path,
                n_ctx=1024,  # Reduced for lower memory usage
                n_threads=4,
                n_batch=128,  # Reduced for lower memory usage
                n_gpu_layers=1 if os.getenv("CUDA_VISIBLE_DEVICES") else 0,  # GPU offloading if available
                verbose=True
            )
            logger.info("Model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise

        self.history = []

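    # Streamed generation with Qwen3-style "soft switches": a trailing /think or /no_think
    # on the user message toggles the reasoning trace, and the sampling settings follow the
    # Qwen3 recommendations (temperature 0.6 / top_p 0.95 with thinking, 0.7 / 0.8 without).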
    def generate_response(self, user_input, max_new_tokens=512):
        think_mode = user_input.endswith("/think")
        if think_mode:
            # Strip only the trailing switch, not every occurrence in the message.
            user_input = user_input[: -len("/think")].strip()
        elif user_input.endswith("/no_think"):
            user_input = user_input[: -len("/no_think")].strip()

        messages = self.history + [{"role": "user", "content": user_input}]
        prompt = self._format_chatml(messages, think_mode=think_mode)

        gen_kwargs = {
            "max_tokens": max_new_tokens,
            "stream": True,
            "temperature": 0.6 if think_mode else 0.7,
            "top_p": 0.95 if think_mode else 0.8,
            "top_k": 20,
            "min_p": 0.0,
            "stop": ["<|im_end|>"]
        }

        full_response = ""
        for chunk in self.llm(prompt, **gen_kwargs):
            new_text = chunk["choices"][0]["text"]
            full_response += new_text
            yield full_response

        self.history.append({"role": "user", "content": user_input})
        self.history.append({"role": "assistant", "content": full_response})

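    # Manually build a ChatML prompt (<|im_start|>role ... <|im_end|>), the chat format
    # Qwen3 models use; system messages are not used in this app.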
    def _format_chatml(self, messages, think_mode=False):
        prompt = ""
        for msg in messages:
            if msg["role"] == "user":
                prompt += f"<|im_start|>user\n{msg['content']}<|im_end|>\n"
            elif msg["role"] == "assistant":
                prompt += f"<|im_start|>assistant\n{msg['content']}<|im_end|>\n"
        prompt += "<|im_start|>assistant\n"
        # Qwen3 convention: prefilling an empty <think> block disables the reasoning
        # trace, so add it only when thinking was NOT requested.
        if not think_mode:
            prompt += "<think>\n\n</think>\n\n"
        return prompt

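# Instantiate the chatbot at import time so the large download and model load happen
# once, before the Gradio interface starts serving requests.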
try:
    chatbot = QwenChatbot()
except Exception as e:
    logger.error(f"Failed to initialize chatbot: {e}")
    raise

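# gr.ChatInterface passes (message, history); the Gradio-side history is ignored because
# QwenChatbot keeps its own running transcript across turns.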
def chat_function(user_input, history):
    yield from chatbot.generate_response(user_input)

demo = gr.ChatInterface(
    fn=chat_function,
    title="Qwen3 GGUF Chatbot (Streaming)",
    description="Chat with Qwen3-14B GGUF model. Use /think for thoughtful responses.",
    chatbot=gr.Chatbot(height=500),
    textbox=gr.Textbox(placeholder="Type your message..."),
    submit_btn="Send",
    concurrency_limit=1,
    max_batch_size=1
)

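# Launch with Gradio defaults (http://127.0.0.1:7860 locally; a Hugging Face Space
# supplies its own host/port configuration via environment variables).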
demo.launch()