"""Gradio demo: text generation with Llama 3.2 1B Instruct plus a LoRA adapter.

The base model, adapter and tokenizer are loaded once at import time; the
Gradio interface then calls ``generate_text`` for every request.
"""

import torch
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import gradio as gr

# Step 1: Load base model
base_model_name = "meta-llama/Llama-3.2-1B-Instruct"
adapter_repo = "MegaTronX/Llama-3.2-1B-Instruct-Selectolax-QLoRA"

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Step 2: Load LoRA adapter on top of the base model
model_with_adapter = PeftModel.from_pretrained(
    base_model,
    adapter_repo,
    device_map="auto",
)
print(f"Loaded LoRA adapter from {adapter_repo}")

# Verify configuration. `.config` is the wrapped base model's config; the
# adapter's own settings are exposed through `.peft_config`, so print both.
print(model_with_adapter.config)
print(model_with_adapter.peft_config)

# Step 3: Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Prompts longer than this many tokens are truncated by the tokenizer.
MAX_PROMPT_TOKENS = 1024


# Step 4: Define inference function
@spaces.GPU(duration=120)
def generate_text(prompt, max_length=1024):
    """Generate a continuation of ``prompt`` with the adapter-augmented model.

    Args:
        prompt: Text to continue. It is truncated to ``MAX_PROMPT_TOKENS``
            tokens before generation.
        max_length: Cap on the total sequence length (prompt tokens plus
            newly generated tokens) passed to ``generate``. Coerced to
            ``int`` because a UI slider may deliver a float.

    Returns:
        The decoded sequence (prompt followed by the generated text) with
        special tokens stripped.

    Raises:
        gr.Error: If the tokenized prompt already reaches ``max_length``,
            leaving no room to generate anything.
    """
    max_length = int(max_length)

    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_PROMPT_TOKENS,
    )

    # `max_length` counts the prompt as well. When the prompt already fills
    # it, `generate` either rejects the call or produces nothing new
    # (depending on the transformers version), so fail early with a message
    # the user can act on.
    prompt_tokens = encoded["input_ids"].shape[-1]
    if prompt_tokens >= max_length:
        raise gr.Error(
            f"The prompt is {prompt_tokens} tokens long, which leaves no room "
            f"to generate within Max Length = {max_length}. "
            "Increase Max Length or shorten the prompt."
        )

    inputs = encoded.to("cuda")
    outputs = model_with_adapter.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Step 5: Create Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Prompt", placeholder="Enter your text prompt here..."),
        gr.Slider(label="Max Length", minimum=50, maximum=1024, step=10, value=256),
    ],
    outputs="text",
    title="LLaMA + LoRA Text Generator",
    description="Generate text using a LLaMA model with LoRA adapters.",
)

# Step 6: Launch Gradio app
if __name__ == "__main__":
    iface.launch()