import torch
from unsloth import FastLanguageModel
from peft import PeftModel
import gradio as gr

# Run on the GPU when one is available; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the base model and tokenizer. dtype=None lets Unsloth pick an
# appropriate dtype for the hardware (bfloat16 where supported, else float16).
base_model_name = "unsloth/Llama-3.2-3B-Instruct"
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=False,
)
base_model.to(device)
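
# Note: setting load_in_4bit=True would instead load a 4-bit quantized model
# (via bitsandbytes) to cut VRAM use at some quality cost; quantized models
# are placed on the GPU at load time and should not be moved with .to(device).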

# Attach the fine-tuned LoRA adapter on top of the base model.
lora_model_name = "oskaralf/lora_model"
model = PeftModel.from_pretrained(base_model, lora_model_name)
model.to(device)
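
# Optional: for deployment, the adapter weights can be folded into the base
# model with PEFT's merge_and_unload(), which returns a plain transformers
# model. Sketch only, left commented out because the rest of this script
# expects the PEFT-wrapped model:
# model = model.merge_and_unload()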

# Switch Unsloth into its optimized inference mode.
FastLanguageModel.for_inference(model)


def chatbot(input_text):
    # Tokenize the prompt and move it to the same device as the model.
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=64,
    )
    # Decode only the newly generated tokens so the reply does not echo the prompt.
    response = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )
    return response
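
# Since the base checkpoint is an Instruct model, wrapping the prompt in its
# chat template usually yields better replies. A minimal sketch of that
# variant (hypothetical helper, assuming the tokenizer ships a chat template,
# as Llama 3.2 Instruct does):
def chatbot_with_template(input_text):
    messages = [{"role": "user", "content": input_text}]
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(device)
    outputs = model.generate(input_ids=input_ids, max_new_tokens=64)
    return tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)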


# Expose the chatbot through a simple Gradio text-in/text-out interface.
iface = gr.Interface(fn=chatbot, inputs="text", outputs="text", title="Chatbot")
iface.launch()