import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from peft import PeftModel  # Import PeftModel

# Define your fine-tuned model ID on Hugging Face Hub
# Make sure this matches the repo_id you used when pushing
model_id = "whidbeysea/gemma-2b-it-fine-tuned-catechism"

# Define the base model ID (the original model you fine-tuned)
base_model_id = "google/gemma-2b-it"

# Set the device to use (GPU if available, otherwise CPU)
# You might need to adjust this based on your Space hardware and configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the tokenizer
try:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    print("Tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading tokenizer from {model_id}: {e}")
    tokenizer = None

# Load the base model
# You might need to specify the dtype and quantization based on how you trained.
# On a T4 High-RAM instance, bfloat16 may work (T4s lack native bfloat16 support,
# so float16 or 4-bit loading can be safer); a commented 4-bit sketch follows this block.
# Since we fine-tuned with LoRA, we load the base model first.
base_model = None
if device == "cuda":
    try:
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_id,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,  # Try bfloat16 for T4 High-RAM
            # load_in_4bit=True  # You might need this if bfloat16 is not enough
        )
        print(f"Base model '{base_model_id}' loaded successfully on GPU.")
    except Exception as e:
        print(f"Error loading base model '{base_model_id}' on GPU: {e}")
        print("Trying to load on CPU or with different settings...")
        try:
            base_model = AutoModelForCausalLM.from_pretrained(base_model_id)
            print(f"Base model '{base_model_id}' loaded successfully on CPU.")
        except Exception as e_cpu:
            print(f"Error loading base model '{base_model_id}' on CPU: {e_cpu}")
            base_model = None
else:  # Load on CPU
    try:
        base_model = AutoModelForCausalLM.from_pretrained(base_model_id)
        print(f"Base model '{base_model_id}' loaded successfully on CPU.")
    except Exception as e:
        print(f"Error loading base model '{base_model_id}' on CPU: {e}")
        base_model = None
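
# The 4-bit alternative mentioned above, for when bfloat16 still exhausts GPU memory.
# This is only a sketch and not part of the original setup; it assumes the
# bitsandbytes package is installed in the Space:
#
# from transformers import BitsAndBytesConfig
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
# )
# base_model = AutoModelForCausalLM.from_pretrained(
#     base_model_id,
#     quantization_config=bnb_config,
# )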

model = None
if base_model and tokenizer:
    # Load the PEFT model (LoRA adapters)
    try:
        model = PeftModel.from_pretrained(base_model, model_id)
        print(f"PEFT model loaded from {model_id}.")
        # Move the model to the specified device
        model.to(device)
        print("Model moved to device.")
        # Optional: merge the LoRA adapters with the base model for potentially faster inference.
        # This might require more memory, so test if it works on your Space hardware.
        # print("Merging LoRA adapters...")
        # model = model.merge_and_unload()
        # print("LoRA adapters merged.")
    except Exception as e:
        print(f"Error loading PEFT model or moving to device: {e}")
        model = None

generator = None
if model and tokenizer:
    # Create a Hugging Face pipeline for text generation
    try:
        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device=0 if device == "cuda" else -1,  # Use GPU device 0 if cuda, else CPU (-1)
            # Add other parameters as needed for generation (e.g., max_new_tokens, temperature, top_p, top_k)
            max_new_tokens=200,
            do_sample=True,  # Sampling must be enabled for temperature/top_p to take effect
            temperature=0.7,
            top_p=0.95,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id,  # Set pad token id for generation
        )
        print("Text generation pipeline created.")
    except Exception as e:
        print(f"Error creating text generation pipeline: {e}")
        generator = None
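
# Optional sanity check of the raw pipeline output: a list of dicts with a
# 'generated_text' key, which the post-processing below relies on. The example
# question is only illustrative:
#
# if generator is not None:
#     sample = generator("Question: What is the First Commandment?\nAnswer:")
#     print(sample[0]["generated_text"])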

def generate_answer(question):
    """
    Generates an answer using the fine-tuned model based on the input question.
    """
    if generator is None:
        return "Error: Model or pipeline not loaded."

    # Format the prompt to match the training data format:
    # we used "Question: [question]\nAnswer: [answer]" during training
    prompt = f"Question: {question}\nAnswer:"
    try:
        # Generate text; the pipeline handles tokenization and generation
        response = generator(prompt)

        # Extract the generated text
        # The pipeline output is typically a list of dictionaries
        if response and len(response) > 0 and 'generated_text' in response[0]:
            generated_text = response[0]['generated_text']

            # Post-process the generated text to extract only the answer part.
            # This depends on how your model was trained to respond after "Answer:".
            # Find the start of the answer after the prompt.
            answer_start_marker = "Answer:"
            if answer_start_marker in generated_text:
                answer = generated_text.split(answer_start_marker, 1)[1].strip()
                # Clean up the answer: drop blank lines and any follow-on
                # "Question:" turns the model may have generated
                lines = answer.split('\n')
                cleaned_answer_lines = []
                for line in lines:
                    cleaned_line = line.strip()
                    if cleaned_line and not cleaned_line.startswith("Question:"):  # Avoid including subsequent Q&A if generated
                        cleaned_answer_lines.append(cleaned_line)
                answer = "\n".join(cleaned_answer_lines)
            else:
                # If the marker is not found, return the generated text after the prompt.
                # This might happen if the model doesn't follow the expected format.
                answer = generated_text.split(prompt, 1)[-1].strip()  # Attempt to remove the prompt

            return answer if answer else "Could not generate a relevant answer."
        else:
            return "Error: Could not generate text from the model."
    except Exception as e:
        return f"An error occurred during text generation: {e}"

# Create the Gradio interface
iface = gr.Interface(
    fn=generate_answer,
    inputs=gr.Textbox(label="Enter your question:"),
    outputs=gr.Textbox(label="Generated Answer:"),
    title="LutherAI Catechism Chatbot",
    description="Ask questions about Luther's Large Catechism.",
    allow_flagging="never",  # Disable flagging for this example
)

# Launch the Gradio interface
iface.launch()