import os
import PyPDF2
from qdrant_client import QdrantClient
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
import streamlit as st
import hashlib

# Load environment variables from .env
# This runs before any os.getenv() call below so values defined in the file
# are visible to them.
load_dotenv(".env")

# Initialize Azure OpenAI (as in notebook)
# Module-level chat model shared by the whole script; every setting except
# the temperature is read from the environment.
llm = AzureChatOpenAI(
    temperature=0,
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    model=os.getenv("AZURE_OPENAI_MODEL_NAME")  # Must match deployment name
)

# Qdrant configuration from environment
# NOTE: QDRANT_URL is populated from the QDRANT_CLOUD_URL variable, not from a
# variable named QDRANT_URL. Either value is None when the variable is unset.
QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
QDRANT_URL = os.getenv('QDRANT_CLOUD_URL')

# Helper functions from notebook

def load_pdf_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The file is opened in binary mode and read with ``PyPDF2.PdfReader``.
    Each page contributes its extracted text followed by a newline; a page
    whose ``extract_text()`` result is falsy contributes only the newline.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pages = PyPDF2.PdfReader(pdf_file).pages
        # Build the result inside the ``with`` block: pages are read from the
        # still-open file handle.
        return "".join((page.extract_text() or "") + "\n" for page in pages)

def split_text(text, chunk_size=800, chunk_overlap=150):
    """Split ``text`` into sentence-aligned chunks with character overlap.

    The text is cut on the literal delimiter ``'. '`` and the pieces are
    packed greedily into chunks. When adding a piece would push the current
    chunk past ``chunk_size``, the chunk is emitted and the next one starts
    with the last ``chunk_overlap`` characters of the emitted chunk.

    Args:
        text: The text to split. ``None``, empty, or whitespace-only input
            yields an empty list.
        chunk_size: Soft upper bound, in characters, used when deciding
            whether another piece still fits. A single piece longer than
            this is kept whole, and the overlap prefix is not counted, so a
            chunk may exceed it.
        chunk_overlap: Number of trailing characters carried over into the
            next chunk. ``0`` disables overlap. No overlap is carried when
            the emitted chunk is not longer than this value.

    Returns:
        A list of stripped, non-empty chunk strings.
    """
    if not text or not text.strip():
        return []

    sentences = text.split('. ')
    last_index = len(sentences) - 1
    chunks, chunk = [], ''
    for index, sentence in enumerate(sentences):
        # split() removed the '. ' after every piece except the last one, so
        # only those pieces get their period back. Re-adding it to the last
        # piece would invent punctuation ("World." -> "World..").
        piece = sentence if index == last_index else sentence + '.'
        piece = piece.strip()
        if not piece:
            # e.g. the empty tail produced by text that ends in '. '
            continue

        if len(chunk) + len(piece) <= chunk_size:
            chunk += piece + ' '
            continue

        if chunk:
            chunks.append(chunk.strip())
        # Start the new chunk with the tail of the previous one as overlap.
        overlap = chunk[-chunk_overlap:] if chunk_overlap and len(chunk) > chunk_overlap else ''
        chunk = overlap + piece + ' '

    if chunk:
        chunks.append(chunk.strip())
    return chunks

# Azure embeddings helper

def _azure_base(url: str | None) -> str | None:
    if not url:
        return None
    idx = url.find("/openai")
    return url[:idx] if idx > 0 else url


def _init_azure_embedder():
    """Create the ``AzureOpenAIEmbeddings`` client from environment variables.

    The API key, endpoint and API version each prefer an embedding-specific
    variable (``AZURE_OPENAI_EMBEDDING_*``) and fall back to the variable the
    chat model uses when that one is unset or empty. The endpoint is passed
    through ``_azure_base`` first. The model name has no fallback.
    """
    def _env(primary, fallback):
        # ``or`` (not a getenv default) so an empty value also falls back.
        return os.getenv(primary) or os.getenv(fallback)

    raw_endpoint = _env("AZURE_OPENAI_EMBEDDING_ENDPOINT", "AZURE_OPENAI_ENDPOINT")
    settings = {
        "api_key": _env("AZURE_OPENAI_EMBEDDING_API_KEY", "AZURE_OPENAI_KEY"),
        "azure_endpoint": _azure_base(raw_endpoint),
        "api_version": _env("AZURE_OPENAI_EMBEDDING_API_VERSION", "AZURE_OPENAI_API_VERSION"),
        "model": os.getenv("AZURE_OPENAI_EMBEDDING_MODEL_NAME"),
    }
    return AzureOpenAIEmbeddings(**settings)

# Streamlit UI
st.title("Chatbot using PDF Documents")

# Sidebar: upload PDFs
with st.sidebar:
    st.header("Upload PDFs")
    uploaded_files = st.file_uploader(
        "Upload one or more PDF files",
        type=["pdf"],
        accept_multiple_files=True
    )


def _files_signature(files):
    """Return a SHA-1 fingerprint of the uploaded files, or None if there are none.

    Each file contributes ``name:size:sha1(content)``; the entries are sorted
    so the result does not depend on upload order. The fingerprint is used to
    detect whether the set of uploads changed between Streamlit reruns.
    """
    if not files:
        return None
    entries = []
    for uf in files:
        # Read each file's bytes once; getvalue() does not move the stream
        # position, the read() fallback rewinds first.
        if hasattr(uf, 'getvalue'):
            data = uf.getvalue()
        else:
            uf.seek(0)
            data = uf.read() or b''
        entries.append(f"{uf.name}:{len(data)}:{hashlib.sha1(data).hexdigest()}")
    return hashlib.sha1("|".join(sorted(entries)).encode()).hexdigest()


# Points sent to Qdrant per upsert call, so a large document set is not
# serialized into one oversized request.
_UPSERT_BATCH_SIZE = 128

# Automatically process when files are uploaded or changed
files_sig = _files_signature(uploaded_files)
if uploaded_files:
    if not QDRANT_URL or not QDRANT_API_KEY:
        # QDRANT_URL is read from the QDRANT_CLOUD_URL variable, so the
        # message names the variable the user actually has to set.
        st.error("QDRANT_CLOUD_URL or QDRANT_API_KEY is missing in the .env file.")
    elif files_sig != st.session_state.get('files_sig'):
        with st.spinner("Processing PDFs and building index..."):
            # Load and process uploaded PDF(s) with metadata and better chunking
            pdf_chunks, pdf_meta = [], []
            for uf in uploaded_files:
                try:
                    uf.seek(0)
                    reader = PyPDF2.PdfReader(uf)
                    for page_idx, page in enumerate(reader.pages, start=1):
                        page_text = page.extract_text() or ""
                        if not page_text.strip():
                            continue
                        for ch in split_text(page_text, chunk_size=800, chunk_overlap=150):
                            pdf_chunks.append(ch)
                            pdf_meta.append({"source": uf.name, "page": page_idx})
                except Exception as e:
                    # Best effort: report the unreadable file and keep going
                    # with the remaining uploads.
                    st.error(f"Failed to read {uf.name}: {e}")

            if pdf_chunks:
                # Generate embeddings using Azure OpenAI Embeddings
                embedder = _init_azure_embedder()
                embeddings = embedder.embed_documents(pdf_chunks)

                # Initialize Qdrant (always recreate to match embedding dimension)
                # NOTE(review): recent qdrant-client releases deprecate
                # recreate_collection in favour of collection_exists /
                # delete_collection / create_collection - confirm against the
                # pinned client version before switching.
                client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
                collection_name = 'pdf-chatbot-collection'
                client.recreate_collection(
                    collection_name=collection_name,
                    vectors_config={"size": len(embeddings[0]), "distance": "Cosine"}
                )

                # Index embeddings with metadata
                points = [
                    {
                        "id": i,
                        "vector": emb,
                        "payload": {"text": chunk, **meta}
                    }
                    for i, (emb, chunk, meta) in enumerate(zip(embeddings, pdf_chunks, pdf_meta))
                ]
                for start in range(0, len(points), _UPSERT_BATCH_SIZE):
                    client.upsert(
                        collection_name=collection_name,
                        points=points[start:start + _UPSERT_BATCH_SIZE]
                    )

                # Store in session for querying
                st.session_state['qdrant_client'] = client
                st.session_state['collection_name'] = collection_name
                st.session_state['embedder'] = embedder
                st.session_state['index_ready'] = True
            else:
                # Nothing could be extracted (e.g. scanned or unreadable
                # PDFs): leave any existing collection alone and do not offer
                # the chat UI for an empty index.
                st.session_state['index_ready'] = False

            # Remember this upload set either way so it is not re-processed
            # on every rerun.
            st.session_state['files_sig'] = files_sig

        if pdf_chunks:
            st.success("Index built successfully. You can now ask questions.")
        else:
            st.warning("No extractable text was found in the uploaded PDFs, so no index was built.")

# Text cleaning utility for retrieved chunks

def clean_text(t: str) -> str:
    """Tidy a retrieved text chunk for display or prompting.

    Steps, in order: non-breaking spaces and tabs become plain spaces; a
    hyphen directly followed by a newline is removed so "exam-\\nple" reads
    "example"; every line is stripped; runs of spaces collapse to one space;
    the whole result is stripped. Line breaks inside the text are kept.

    Returns an empty string for falsy input.
    """
    if not t:
        return ""

    # U+00A0 (non-breaking space) and TAB -> ordinary space, then rejoin
    # words that were hyphenated across a line break.
    normalized = t.translate({0x00A0: ' ', 0x0009: ' '}).replace('-\n', '')

    tidy_lines = []
    for raw_line in normalized.splitlines():
        # Splitting on a single space and dropping the empty pieces collapses
        # every run of spaces to exactly one.
        words = [piece for piece in raw_line.strip().split(' ') if piece]
        tidy_lines.append(' '.join(words))

    return '\n'.join(tidy_lines).strip()

# Retrieval logic — synthesize a single structured answer with history-aware prompting

def retrieve_answer(query, top_k=4):
    """Answer ``query`` from the indexed PDFs using retrieval plus the chat model.

    Embeds the query, fetches the ``top_k`` nearest chunks from the Qdrant
    collection stored in ``st.session_state``, and asks ``llm`` to answer from
    those chunks only, with the most recent chat turns included for
    continuity.

    Args:
        query: The user's question.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        The model's answer as a string, followed by a de-duplicated
        "Sources:" line when chunks were retrieved and the model did not
        report that nothing was found. If the index has not been built yet,
        a fixed explanatory message is returned instead.
    """
    embedder = st.session_state.get('embedder')
    client = st.session_state.get('qdrant_client')
    collection_name = st.session_state.get('collection_name')

    if not embedder or not client or not collection_name:
        return "Index not initialized. Upload PDFs to build the index first."

    query_emb = embedder.embed_query(query)
    # NOTE(review): recent qdrant-client releases deprecate search() in favour
    # of query_points() - confirm against the pinned client version.
    hits = client.search(collection_name=collection_name, query_vector=query_emb, limit=top_k)
    contexts, citations = [], []
    for h in hits:
        payload = getattr(h, 'payload', {}) or {}
        text = clean_text(payload.get('text', ''))
        if not text:
            continue
        src = payload.get('source', 'document')
        page = payload.get('page', None)
        contexts.append(text)
        citations.append(f"{src} (page {page})" if page else src)

    context_block = "\n\n---\n\n".join(contexts)

    # Build system prompt to enforce structured, user-friendly answers (generic for any PDF)
    system_prompt = (
        "You are a reliable retrieval-augmented assistant that answers questions about any kind of PDF content "
        "(technical, legal, scientific, financial, educational, etc.). Use ONLY the provided context snippets. "
        "Do not speculate or invent facts. If the information is not present, reply exactly: 'Not found in documents.' "
        "Return a clear, structured, user-friendly response with: a brief summary, bullet-point key facts, and a short conclusion. "
        "Include short citations with source filename and page numbers when available. Be concise and neutral."
    )

    # Include brief chat history for continuity (last 3 exchanges).
    # The chat UI stores the current question in session history before
    # calling this function; drop that trailing entry so the question is not
    # sent twice (once as history, once in the final prompt below).
    history = list(st.session_state.get('messages', []))
    if history and history[-1].get('role') == 'user' and history[-1].get('content') == query:
        history.pop()
    history_msgs = []
    for m in history[-6:]:
        role = m.get('role')
        content = m.get('content', '')
        if role == 'user':
            history_msgs.append(HumanMessage(content=content))
        elif role == 'assistant':
            history_msgs.append(AIMessage(content=content))

    user_content = (
        f"CONTEXT:\n{context_block}\n\n"
        f"QUESTION: {query}\n\n"
        "Format:\n# Answer\n\n- Bullet points of key facts\n\nConclusion\n\nCitations: list source and page numbers if available."
    )

    messages = [SystemMessage(content=system_prompt), *history_msgs, HumanMessage(content=user_content)]
    result = llm.invoke(messages)
    answer_text = getattr(result, 'content', str(result))
    if not isinstance(answer_text, str):
        # Guard the string operations below against non-string content.
        answer_text = str(answer_text)

    # Do not attach sources to an answer that says nothing was found.
    not_found = answer_text.strip().strip("'\"") == "Not found in documents."
    if citations and not not_found:
        answer_text += "\n\nSources: " + "; ".join(dict.fromkeys(citations))
    return answer_text

# Simple chat-style UI (only shown after index is ready)
# 'index_ready' is set by the indexing step; it is absent (None) until then.
ready = st.session_state.get('index_ready')
# Create the conversation history on the first run of the session.
if 'messages' not in st.session_state:
    st.session_state['messages'] = []

if ready:
    # Streamlit reruns the whole script on each interaction, so replay the
    # stored conversation before handling any new input.
    for msg in st.session_state['messages']:
        with st.chat_message(msg['role']):
            st.markdown(msg['content'])

    user_input = st.chat_input("Ask a question about the uploaded PDFs")
    if user_input:
        # The user turn is stored before retrieve_answer() runs, so it is
        # already part of the session history when the answer is generated.
        st.session_state['messages'].append({"role": "user", "content": user_input})
        with st.chat_message("user"):
            st.markdown(user_input)

        with st.chat_message("assistant"):
            with st.spinner("Retrieving answer..."):
                answer_text = retrieve_answer(user_input, top_k=4)
            st.markdown(answer_text)
        # Persist the assistant turn so it is replayed on the next rerun.
        st.session_state['messages'].append({"role": "assistant", "content": answer_text})
else:
    st.caption("Upload PDFs in the sidebar to start chatting.")