import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import os
model_name = "whidbeysea/luther-phi3-merged"
offload_directory = "./offload_dir"
os.makedirs(offload_directory, exist_ok=True)
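
# 4-bit NF4 quantization keeps the weights small enough for a modest GPU;
# double quantization also quantizes the quantization constants for extra
# savings, and fp32 CPU offload lets layers that do not fit on the GPU
# spill to system RAM.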
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,  # bfloat16 compute on GPU
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # let accelerate place layers across GPU/CPU automatically
    quantization_config=bnb_config,
    offload_folder=offload_directory,
    offload_state_dict=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
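
# Note: the [INST] ... [/INST] prompt below is Mistral/Llama-2 style. If this
# Phi-3 fine-tune was instead trained on Phi-3's native chat template, a hedged
# alternative is to let the tokenizer build the prompt:
#   prompt = tokenizer.apply_chat_template(
#       [{"role": "user", "content": message}],
#       tokenize=False, add_generation_prompt=True,
#   )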
def chat(message, history):
    prompt = f"<s>[INST] {message} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt")
    # Move inputs to wherever the model's layers live; hard-coding "cuda"
    # breaks when device_map offloads to CPU or no GPU is available.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    outputs = model.generate(**inputs, max_new_tokens=200, pad_token_id=tokenizer.eos_token_id)
    # Decode only the newly generated tokens: outputs is a batch of sequences,
    # and each sequence starts with an echo of the prompt tokens.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
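
# `history` is currently ignored, so every turn is stateless. A minimal
# multi-turn sketch, assuming pair-style Gradio history and the same [INST]
# format as above:
#   turns = "".join(f"<s>[INST] {u} [/INST] {a}</s>" for u, a in history)
#   prompt = turns + f"<s>[INST] {message} [/INST]"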
gr.ChatInterface(chat).launch(share=True)  # share=True serves a temporary public Gradio link