import os

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "whidbeysea/luther-phi3-merged"

# Directory for offloading weights that don't fit on the GPU
offload_directory = "./offload_dir"
os.makedirs(offload_directory, exist_ok=True)

# 4-bit NF4 quantization with bfloat16 compute on GPU; allow fp32 CPU offload
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",              # Automatically place layers on GPU/CPU
    quantization_config=bnb_config,
    offload_folder=offload_directory,
    offload_state_dict=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)


def chat(message, history):
    prompt = f"[INST] {message} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt")
    # Move inputs to the same device as the model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    outputs = model.generate(**inputs, max_new_tokens=200)
    # Decode only the newly generated tokens, skipping the echoed prompt
    generated = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(generated, skip_special_tokens=True)


gr.ChatInterface(chat).launch(share=True)