%%writefile app.py
import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
# Define the Streamlit interface
st.title("Llama-2-7b-Chat Fine-Tuned Model")
st.write("This app demonstrates a fine-tuned Llama-2-7b model using QLoRA.")

# Input text prompt
prompt = st.text_input("Enter your prompt:", value="What is an open-source LLM?")
# Model settings
st.write("Loading the model...")

# Base model and the dataset used for fine-tuning
# (the dataset name is kept for reference; it is not loaded at inference time)
model_name = "NousResearch/Llama-2-7b-chat-hf"
dataset_name = "mlabonne/guanaco-llama2-1k"
# QLoRA parameters (these document the fine-tuning configuration;
# only the 4-bit quantization settings below are used by this app)
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# bitsandbytes 4-bit quantization settings
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Load the entire model on GPU 0
device_map = {"": 0}
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
# Run inference, wrapping the prompt in Llama-2's [INST] chat template
if st.button("Generate"):
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
    result = pipe(f"<s>[INST] {prompt} [/INST]")
    st.write(result[0]['generated_text'])
prompt = "What is open-source LLM?" | |
print(generate_text(prompt)) | |
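For the Space to build and run the app, it also needs a requirements.txt listing the Python dependencies. Here is a minimal sketch, written with the same %%writefile convention as above; the package list is assumed from the imports in app.py (accelerate is required by transformers when device_map is used, and bitsandbytes provides the 4-bit quantization), and you may want to pin versions:

%%writefile requirements.txt
streamlit
torch
transformers
accelerate
bitsandbytes

To test the app locally before pushing it to the Space, launch it with: streamlit run app.py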