import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

# Replace with your model repository ID
model_repo_id = "ubiodee/Plutuslearn-Llama-3.2-3B-Instruct"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_repo_id)

# Load the base model and apply the PEFT adapter
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto"
)
model = PeftModel.from_pretrained(base_model, model_repo_id)

# Define the prediction function
def predict(text):
    # Move inputs to the model's device instead of assuming CUDA is available
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    # max_new_tokens caps only the generated continuation; adjust parameters as needed
    outputs = model.generate(**inputs, max_new_tokens=100)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Create Gradio interface
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="Input Text"),
    outputs=gr.Textbox(label="Model Output"),
    title="My Model Demo",
    description="Test the fine-tuned model hosted on Hugging Face."
)

# Launch the app
demo.launch()
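
# Optional: because the base model is instruct-tuned, raw prompts may respond better
# when wrapped in the Llama 3.2 chat template. A minimal sketch of an alternative to
# predict() above -- the function name predict_chat is illustrative, and it assumes
# the tokenizer ships a chat template (the Llama 3.2 Instruct tokenizer does).
def predict_chat(text):
    messages = [{"role": "user", "content": text}]
    # Build the prompt in the format the instruct model was trained on
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(input_ids, max_new_tokens=100)
    # Drop the prompt tokens so only the newly generated reply is returned
    return tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)

# To use it, pass fn=predict_chat to gr.Interface instead of fn=predict.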