%%writefile app.py
import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

# Define Streamlit interface
st.title("Llama-2-7b-Chat Fine-Tuned Model")
st.write("This app demonstrates a fine-tuned Llama-2-7b model using QLoRA.")

# Input text prompt
prompt = st.text_input("Enter your prompt:", value="What is an open-source LLM?")

# Model settings
st.write("Loading the model...")

# Base model and the dataset used for fine-tuning
model_name = "NousResearch/Llama-2-7b-chat-hf"
dataset_name = "mlabonne/guanaco-llama2-1k"

# QLoRA parameters (kept for reference; not used at inference time)
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# bitsandbytes 4-bit quantization parameters
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Load the base model in 4-bit on GPU 0
device_map = {"": 0}
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Run inference when the button is clicked
if st.button("Generate"):
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
    result = pipe(f"[INST] {prompt} [/INST]")
    st.write(result[0]['generated_text'])
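Once app.py has been written, it can be served with the Streamlit CLI. A minimal sketch of the launch step, assuming Streamlit is installed in the notebook environment; the tunneling step is only needed on a hosted runtime such as Colab, and localtunnel is just one option, not part of the original code:

# Launch the app (Streamlit serves on port 8501 by default) and log output
!streamlit run app.py &> streamlit_logs.txt &
# Optional: expose the port from a hosted notebook, e.g. via localtunnel
!npx localtunnel --port 8501

On a local machine, running streamlit run app.py in a terminal and opening the printed URL is sufficient.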