# Lab2 / app.py
import torch
torch.cuda.is_available = lambda: False # Force torch to disable CUDA
from unsloth import FastLanguageModel
# Force CPU mode
device = "cpu"
# Load the base model in CPU mode
base_model_name = "unsloth/Llama-3.2-3B-Instruct"
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    max_seq_length=2048,
    dtype=torch.float32,   # full precision for CPU (half precision needs a GPU)
    load_in_4bit=False,    # 4-bit quantization requires CUDA, so keep it disabled
)
base_model.to(device)
# Apply LoRA adapters in CPU mode
from peft import PeftModel
lora_model_name = "oskaralf/lora_model" # Replace with your LoRA model path
model = PeftModel.from_pretrained(base_model, lora_model_name)
model.to(device)
# Prepare for inference in CPU mode
FastLanguageModel.for_inference(model)
# Gradio interface
import gradio as gr
def chatbot(input_text):
    # Tokenize the user input and generate a short completion on the CPU
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=64,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response
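
# Llama-3.2-Instruct checkpoints are trained on a chat format, so plain text prompts
# can yield off-format replies. Below is a minimal sketch of a template-aware variant,
# assuming the tokenizer ships a chat template; the name chatbot_chat is illustrative
# and not part of the original app (the Gradio interface still uses chatbot above).
def chatbot_chat(input_text):
    messages = [{"role": "user", "content": input_text}]
    # apply_chat_template wraps the message in the model's expected prompt format
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(device)
    outputs = model.generate(input_ids=input_ids, max_new_tokens=64)
    # Decode only the newly generated tokens, not the echoed prompt
    return tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)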
iface = gr.Interface(fn=chatbot, inputs="text", outputs="text", title="Chatbot")
iface.launch()