import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from peft import PeftModel  # Needed to attach the LoRA adapters to the base model

# Your fine-tuned adapter repo on the Hugging Face Hub
# (must match the repo_id you used when pushing)
model_id = "whidbeysea/gemma-2b-it-fine-tuned-catechism"

# The base model that was fine-tuned
base_model_id = "google/gemma-2b-it"

# Use the GPU if available, otherwise fall back to the CPU.
# You may need to adjust this to match your Space hardware.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the tokenizer
try:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    print("Tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading tokenizer from {model_id}: {e}")
    tokenizer = None

# Load the base model.
# Match the dtype/quantization to how you trained. Because the fine-tuning used
# LoRA, the base model is loaded first and the adapters are applied afterwards.
# bfloat16 keeps memory usage down; on a T4, float16 is often the better-supported
# half-precision dtype, and load_in_4bit=True is an option if memory is still tight.
base_model = None
if device == "cuda":
    try:
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_id,
            torch_dtype=torch.bfloat16,
            # load_in_4bit=True,  # You might need this if half precision alone does not fit
        )
        print(f"Base model '{base_model_id}' loaded successfully on GPU.")
    except Exception as e:
        print(f"Error loading base model '{base_model_id}' on GPU: {e}")
        print("Trying to load on CPU instead...")
        try:
            base_model = AutoModelForCausalLM.from_pretrained(base_model_id)
            print(f"Base model '{base_model_id}' loaded successfully on CPU.")
        except Exception as e_cpu:
            print(f"Error loading base model '{base_model_id}' on CPU: {e_cpu}")
            base_model = None
else:
    # Load on CPU
    try:
        base_model = AutoModelForCausalLM.from_pretrained(base_model_id)
        print(f"Base model '{base_model_id}' loaded successfully on CPU.")
    except Exception as e:
        print(f"Error loading base model '{base_model_id}' on CPU: {e}")
        base_model = None

model = None
if base_model and tokenizer:
    # Load the PEFT model (LoRA adapters)
    try:
        model = PeftModel.from_pretrained(base_model, model_id)
        print(f"PEFT model loaded from {model_id}.")

        # Move the model to the selected device
        model.to(device)
        print("Model moved to device.")

        # Optional: merge the LoRA adapters into the base model for potentially
        # faster inference. Merging needs extra memory, so test it on your Space hardware.
        # print("Merging LoRA adapters...")
        # model = model.merge_and_unload()
        # print("LoRA adapters merged.")
    except Exception as e:
        print(f"Error loading PEFT model or moving it to the device: {e}")
        model = None

generator = None
if model and tokenizer:
    # Create a Hugging Face pipeline for text generation
    try:
        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device=0 if device == "cuda" else -1,  # GPU device 0 if CUDA, otherwise CPU
            # Adjust generation parameters as needed (max_new_tokens, temperature, top_p, top_k, ...)
            max_new_tokens=200,
            do_sample=True,  # Required for temperature/top_p to take effect
            temperature=0.7,
            top_p=0.95,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id,  # Set pad token id for generation
        )
        print("Text generation pipeline created.")
    except Exception as e:
        print(f"Error creating text generation pipeline: {e}")
        generator = None


def generate_answer(question):
    """
    Generates an answer using the fine-tuned model based on the input question.
    """
    if generator is None:
        return "Error: Model or pipeline not loaded."

    # Format the prompt to match the training data:
    # "Question: [question]\nAnswer: [answer]"
    prompt = f"Question: {question}\nAnswer:"

    try:
        # The pipeline handles tokenization and generation and returns a list of dicts.
        response = generator(prompt)

        if response and len(response) > 0 and "generated_text" in response[0]:
            generated_text = response[0]["generated_text"]

            # The output includes the prompt, so keep only the text after "Answer:".
            answer_start_marker = "Answer:"
            if answer_start_marker in generated_text:
                answer = generated_text.split(answer_start_marker, 1)[1].strip()

                # Drop blank lines and any follow-on "Question:" lines the model
                # may have generated after its answer.
                lines = answer.split("\n")
                cleaned_answer_lines = []
                for line in lines:
                    cleaned_line = line.strip()
                    if cleaned_line and not cleaned_line.startswith("Question:"):
                        cleaned_answer_lines.append(cleaned_line)
                answer = "\n".join(cleaned_answer_lines)
            else:
                # If the marker is missing (the model did not follow the expected
                # format), fall back to stripping the prompt from the output.
                answer = generated_text.split(prompt, 1)[-1].strip()

            return answer if answer else "Could not generate a relevant answer."
        else:
            return "Error: Could not generate text from the model."
    except Exception as e:
        return f"An error occurred during text generation: {e}"


# Create the Gradio interface
iface = gr.Interface(
    fn=generate_answer,
    inputs=gr.Textbox(label="Enter your question:"),
    outputs=gr.Textbox(label="Generated Answer:"),
    title="LutherAI Catechism Chatbot",
    description="Ask questions about Luther's Large Catechism.",
    allow_flagging="never",  # Disable flagging for this app
)

# Launch the Gradio interface
iface.launch()