from unsloth import FastLanguageModel
from peft import PeftModel
import gradio as gr

# Load the 4-bit quantized base model and its tokenizer with Unsloth.
base_model_name = "unsloth/Llama-3.2-3B-Instruct"
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    max_seq_length=2048,
    dtype=None,          # None lets Unsloth auto-detect (e.g. bfloat16 on recent GPUs)
    load_in_4bit=True,   # 4-bit quantization to cut VRAM usage
)

# Attach the fine-tuned LoRA adapter on top of the frozen base weights.
lora_model_name = "oskaralf/lora_model"
model = PeftModel.from_pretrained(base_model, lora_model_name)
FastLanguageModel.for_inference(model)  # switch Unsloth into its fast inference mode
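
# Note: if the Hub repo contains an adapter_config.json, Unsloth can usually
# load base model and adapter in a single step instead of the two-step approach
# above (assumed behavior, worth verifying against the Unsloth docs):
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name=lora_model_name, max_seq_length=2048, load_in_4bit=True
# )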

def chatbot(input_text):
    # Format the message with the Llama 3 chat template the instruct model expects.
    messages = [{"role": "user", "content": input_text}]
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)  # use the model's device rather than hard-coding "cuda"
    outputs = model.generate(input_ids=input_ids, max_new_tokens=64)
    # Decode only the newly generated tokens, not the echoed prompt.
    return tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
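
# Optional smoke test before starting the UI; the prompt is illustrative only.
# print(chatbot("In one sentence, what does a LoRA adapter do?"))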

# Expose the chatbot through a minimal Gradio text-in / text-out interface.
iface = gr.Interface(fn=chatbot, inputs="text", outputs="text", title="Chatbot")
iface.launch()
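
# Tip: launching with iface.launch(share=True) serves a temporary public URL,
# which is handy when the script runs on a remote machine such as a Colab VM.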