import os

import docx
import faiss
import pandas as pd
import streamlit as st
from groq import Groq
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

# Initialize the Groq API client (expects GROQ_API_KEY in the environment)
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Initialize the SentenceTransformer embedding model
embedder_model = SentenceTransformer("all-MiniLM-L6-v2")


# Helper function to extract text from a PDF
def extract_text_from_pdf(file):
    pdf_reader = PdfReader(file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() may return None for image-only pages
        text += page.extract_text() or ""
    return text


# Helper function to extract text from an Excel workbook
def extract_text_from_excel(file):
    df = pd.read_excel(file)
    return df.to_string()


# Helper function to extract text from a Word document
def extract_text_from_docx(file):
    doc = docx.Document(file)
    return "\n".join(para.text for para in doc.paragraphs)


# Split text into fixed-size character chunks
def chunk_text(text, chunk_size=512):
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]


# Embed the chunks and store them in a FAISS index
def create_faiss_index(texts, model):
    embeddings = model.encode(texts)
    # L2-normalize so that L2 distance ranks results like cosine similarity
    embeddings = normalize(embeddings)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings


# Retrieve the top-k most similar chunks for a query
def retrieve_context(query, index, texts, model, top_k=5):
    query_embedding = model.encode([query])
    # Normalize the query the same way as the indexed embeddings,
    # otherwise the L2 distances are not comparable
    query_embedding = normalize(query_embedding)
    distances, indices = index.search(query_embedding, top_k)
    retrieved_texts = [texts[i] for i in indices[0]]
    return "\n".join(retrieved_texts)


# Send the retrieved context and the question to the Groq chat API
def query_groq_api(context, question):
    try:
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"Context: {context}\nQuestion: {question}"},
            ],
            model="llama-3.3-70b-versatile",
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error querying Groq API: {e}"


# Streamlit app
def main():
    st.title("RAG-based Document Q&A")
    st.write("Upload a document, and ask questions based on its content.")

    uploaded_file = st.file_uploader("Upload your document", type=["pdf", "xlsx", "docx", "txt"])
    user_question = st.text_input("Enter your question:")

    if uploaded_file is not None:
        # Extract text based on the file type
        if uploaded_file.name.endswith(".pdf"):
            context = extract_text_from_pdf(uploaded_file)
        elif uploaded_file.name.endswith(".xlsx"):
            context = extract_text_from_excel(uploaded_file)
        elif uploaded_file.name.endswith(".docx"):
            context = extract_text_from_docx(uploaded_file)
        elif uploaded_file.name.endswith(".txt"):
            context = uploaded_file.read().decode("utf-8")
        else:
            st.error("Unsupported file format!")
            return

        # Chunk the extracted text into smaller segments
        chunks = chunk_text(context)

        # Build a FAISS index over the chunk embeddings
        # (note: this is rebuilt on every Streamlit rerun)
        index, _embeddings = create_faiss_index(chunks, embedder_model)

        if user_question and st.button("Submit Question"):
            st.write("Answer:")
            # Retrieve the most relevant chunks for the question
            retrieved_context = retrieve_context(user_question, index, chunks, embedder_model)
            # Ask the LLM, grounded in the retrieved context
            answer = query_groq_api(retrieved_context, user_question)
            st.success(answer)


if __name__ == "__main__":
    main()
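
# Usage sketch (assumptions: the script is saved as app.py and you have a
# Groq API key; the filename and key value below are illustrative, not
# mandated by the code above):
#
#   pip install streamlit PyPDF2 pandas python-docx sentence-transformers \
#       faiss-cpu groq scikit-learn openpyxl
#   export GROQ_API_KEY="your-key-here"
#   streamlit run app.py
#
# openpyxl is listed because pandas.read_excel needs it for .xlsx files.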