import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from peft import PeftModel  # PEFT is used to attach the LoRA adapters to the base model
# Define your fine-tuned model ID on Hugging Face Hub
# Make sure this matches the repo_id you used when pushing
model_id = "whidbeysea/gemma-2b-it-fine-tuned-catechism"
# Define the base model ID (the original model you fine-tuned)
base_model_id = "google/gemma-2b-it"
# Set the device to use (GPU if available, otherwise CPU)
# You might need to adjust this based on your Space hardware and configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Load the tokenizer
try:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    print("Tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading tokenizer from {model_id}: {e}")
    tokenizer = None
# Load the base model
# You might need to specify the dtype and quantization based on how you trained
# For a T4 High-RAM, using bfloat16 might be possible, or load_in_4bit=True
# Since we fine-tuned with LoRA, we load the base model first.
base_model = None
if device == "cuda":
    try:
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_id,
            torch_dtype=torch.bfloat16,  # try bfloat16 for a T4 High-RAM GPU
            # load_in_4bit=True  # you might need this if bfloat16 is not enough (see the 4-bit sketch below)
        )
        print(f"Base model '{base_model_id}' loaded successfully on GPU.")
    except Exception as e:
        print(f"Error loading base model '{base_model_id}' on GPU: {e}")
        print("Trying to load on CPU or with different settings...")
        try:
            base_model = AutoModelForCausalLM.from_pretrained(base_model_id)
            print(f"Base model '{base_model_id}' loaded successfully on CPU.")
        except Exception as e_cpu:
            print(f"Error loading base model '{base_model_id}' on CPU: {e_cpu}")
            base_model = None
else:  # Load on CPU
    try:
        base_model = AutoModelForCausalLM.from_pretrained(base_model_id)
        print(f"Base model '{base_model_id}' loaded successfully on CPU.")
    except Exception as e:
        print(f"Error loading base model '{base_model_id}' on CPU: {e}")
        base_model = None
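# The commented-out sketch below shows one way to do the 4-bit loading hinted at above,
# using transformers' BitsAndBytesConfig (this assumes the bitsandbytes and accelerate
# packages are installed and a GPU is available). It is an optional alternative, not part
# of the current flow; uncomment and swap it in for the GPU from_pretrained call if
# bfloat16 alone does not fit in memory.
#
# from transformers import BitsAndBytesConfig
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,                      # quantize weights to 4 bits at load time
#     bnb_4bit_quant_type="nf4",              # NormalFloat4 quantization
#     bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for compute during inference
# )
# base_model = AutoModelForCausalLM.from_pretrained(
#     base_model_id,
#     quantization_config=bnb_config,
#     device_map="auto",                      # let accelerate place the layers on the GPU
# )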
model = None
if base_model and tokenizer:
    # Load the PEFT model (LoRA adapters) on top of the base model
    try:
        model = PeftModel.from_pretrained(base_model, model_id)
        print(f"PEFT model loaded from {model_id}.")
        # Move the model to the specified device
        model.to(device)
        print("Model moved to device.")
        # Optional: merge the LoRA adapters into the base model for potentially faster inference.
        # This may require more memory, so test whether it works on your Space hardware.
        # print("Merging LoRA adapters...")
        # model = model.merge_and_unload()
        # print("LoRA adapters merged.")
    except Exception as e:
        print(f"Error loading PEFT model or moving to device: {e}")
        model = None
generator = None
if model and tokenizer:
    # Create a Hugging Face pipeline for text generation
    try:
        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device=0 if device == "cuda" else -1,  # GPU device 0 if CUDA, otherwise CPU (-1)
            # Generation parameters (adjust max_new_tokens, temperature, top_p, top_k, etc. as needed)
            max_new_tokens=200,
            do_sample=True,  # sampling must be enabled for temperature/top_p to take effect
            temperature=0.7,
            top_p=0.95,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id,  # set pad token id for generation
        )
        print("Text generation pipeline created.")
    except Exception as e:
        print(f"Error creating text generation pipeline: {e}")
        generator = None
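# For reference, the commented-out sketch below shows how the same generation could be done
# without the pipeline helper, by tokenizing manually and calling model.generate directly.
# It is an illustrative alternative (the app itself uses the pipeline above); the function
# name generate_direct is hypothetical.
#
# def generate_direct(prompt: str) -> str:
#     inputs = tokenizer(prompt, return_tensors="pt").to(device)
#     output_ids = model.generate(
#         **inputs,
#         max_new_tokens=200,
#         do_sample=True,
#         temperature=0.7,
#         top_p=0.95,
#         repetition_penalty=1.2,
#         pad_token_id=tokenizer.eos_token_id,
#     )
#     # Decode only the newly generated tokens (everything after the prompt).
#     return tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)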
def generate_answer(question):
    """
    Generates an answer using the fine-tuned model based on the input question.
    """
    if generator is None:
        return "Error: Model or pipeline not loaded."
    # Format the prompt to match the training data format:
    # we used "Question: [question]\nAnswer: [answer]" during training.
    prompt = f"Question: {question}\nAnswer:"
    try:
        # Generate text; the pipeline handles tokenization and generation.
        response = generator(prompt)
        # Extract the generated text; the pipeline output is a list of dictionaries.
        if response and len(response) > 0 and 'generated_text' in response[0]:
            generated_text = response[0]['generated_text']
            # Post-process the generated text to extract only the answer part.
            # This depends on how the model was trained to respond after "Answer:".
            answer_start_marker = "Answer:"
            if answer_start_marker in generated_text:
                answer = generated_text.split(answer_start_marker, 1)[1].strip()
                # Clean up the answer: drop blank lines and stop at the first follow-on
                # "Question:" line, so any subsequent Q&A the model generates is not included.
                lines = answer.split('\n')
                cleaned_answer_lines = []
                for line in lines:
                    cleaned_line = line.strip()
                    if cleaned_line.startswith("Question:"):
                        break  # stop before any follow-on Q&A pair
                    if cleaned_line:
                        cleaned_answer_lines.append(cleaned_line)
                answer = "\n".join(cleaned_answer_lines)
            else:
                # If the marker is not found, return the generated text with the prompt removed.
                # This can happen if the model does not follow the expected format.
                answer = generated_text.split(prompt, 1)[-1].strip()
            return answer if answer else "Could not generate a relevant answer."
        else:
            return "Error: Could not generate text from the model."
    except Exception as e:
        return f"An error occurred during text generation: {e}"
# Create the Gradio interface
iface = gr.Interface(
    fn=generate_answer,
    inputs=gr.Textbox(label="Enter your question:"),
    outputs=gr.Textbox(label="Generated Answer:"),
    title="LutherAI Catechism Chatbot",
    description="Ask questions about Luther's Large Catechism.",
    allow_flagging="never",  # Disable flagging for this example
)
# Launch the Gradio interface
iface.launch()