"""Streamlit chatbot that answers questions about uploaded PDF documents.

Flow: uploaded PDFs are split into overlapping text chunks, embedded with
Azure OpenAI embeddings and indexed in a Qdrant collection.  Questions are
answered by retrieving the closest chunks and asking an Azure OpenAI chat
model to synthesize a structured answer from them.
"""
import hashlib
import os

import PyPDF2
import streamlit as st
from dotenv import load_dotenv
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from qdrant_client import QdrantClient

# Load environment variables from .env
load_dotenv(".env")

# Initialize Azure OpenAI (as in notebook)
llm = AzureChatOpenAI(
    temperature=0,
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    model=os.getenv("AZURE_OPENAI_MODEL_NAME"),  # Must match deployment name
)

# Qdrant configuration from environment
QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
QDRANT_URL = os.getenv('QDRANT_CLOUD_URL')


# Helper functions from notebook
def load_pdf_text(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    Each page's text is followed by a newline; pages for which no text can
    be extracted contribute an empty string.
    """
    with open(pdf_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # Join once instead of growing a string page by page.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)


def split_text(text, chunk_size=800, chunk_overlap=150):
    """Split *text* into sentence-aligned chunks of roughly *chunk_size* chars.

    Sentences are detected by the ``'. '`` separator.  When a chunk is full,
    the next chunk starts with the last *chunk_overlap* characters of the
    previous one so context carries across chunk boundaries.  Because of that
    overlap (and because a single sentence is never cut), a chunk may exceed
    *chunk_size*.

    Returns a list of stripped chunk strings; empty or whitespace-only input
    yields an empty list.
    """
    if not text or not text.strip():
        return []

    parts = text.split('. ')
    # Only pieces that were followed by the separator lost their period;
    # the final piece is kept verbatim so "B." does not become "B..".
    sentences = [part + '.' for part in parts[:-1]]
    sentences.append(parts[-1])

    chunks, chunk = [], ''
    for sentence in sentences:
        next_piece = sentence.strip()
        if not next_piece:
            continue
        if len(chunk) + len(next_piece) <= chunk_size:
            chunk += next_piece + ' '
        else:
            if chunk:
                chunks.append(chunk.strip())
            # start new chunk with overlap
            overlap = chunk[-chunk_overlap:] if chunk_overlap and len(chunk) > chunk_overlap else ''
            chunk = overlap + next_piece + ' '
    if chunk:
        chunks.append(chunk.strip())
    return chunks


# Azure embeddings helper
def _azure_base(url: str | None) -> str | None:
    """Return *url* truncated before its first ``/openai`` path segment.

    ``None`` or an empty string yields ``None``; a URL without ``/openai``
    (or one that starts with it) is returned unchanged.
    """
    if not url:
        return None
    idx = url.find("/openai")
    return url[:idx] if idx > 0 else url


def _init_azure_embedder():
    """Build the Azure OpenAI embeddings client from environment variables.

    Embedding-specific variables take precedence; the key, endpoint and API
    version fall back to the chat model's settings when they are unset.
    """
    return AzureOpenAIEmbeddings(
        api_key=os.getenv("AZURE_OPENAI_EMBEDDING_API_KEY") or os.getenv("AZURE_OPENAI_KEY"),
        azure_endpoint=_azure_base(
            os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT") or os.getenv("AZURE_OPENAI_ENDPOINT")
        ),
        api_version=os.getenv("AZURE_OPENAI_EMBEDDING_API_VERSION") or os.getenv("AZURE_OPENAI_API_VERSION"),
        model=os.getenv("AZURE_OPENAI_EMBEDDING_MODEL_NAME"),
    )


def _read_upload_bytes(uf) -> bytes:
    """Return the full content of an uploaded file object as bytes."""
    if hasattr(uf, 'getvalue'):
        return uf.getvalue()
    # Plain file-like object: rewind first so a previous read cannot truncate it.
    uf.seek(0)
    return uf.read() or b''


def _files_signature(files):
    """Return a SHA-1 hex digest identifying the uploaded file set, or None.

    The digest covers each file's name, size and content hash, and is
    independent of upload order.  It is used only to detect changes between
    Streamlit reruns, not for security.
    """
    if not files:
        return None
    entries = []
    for uf in files:
        data = _read_upload_bytes(uf)  # read once, reuse for size and hash
        entries.append(f"{uf.name}:{len(data)}:{hashlib.sha1(data).hexdigest()}")
    return hashlib.sha1("|".join(sorted(entries)).encode()).hexdigest()


def _extract_chunks(files):
    """Split every page of every uploaded PDF into chunks.

    Returns ``(chunks, metadata)`` as parallel lists, where each metadata
    entry holds the ``source`` filename and 1-based ``page`` number.  A file
    that cannot be read is reported in the UI and skipped.
    """
    chunks, meta = [], []
    for uf in files:
        try:
            uf.seek(0)
            reader = PyPDF2.PdfReader(uf)
            for page_idx, page in enumerate(reader.pages, start=1):
                page_text = page.extract_text() or ""
                if not page_text.strip():
                    continue
                for ch in split_text(page_text, chunk_size=800, chunk_overlap=150):
                    chunks.append(ch)
                    meta.append({"source": uf.name, "page": page_idx})
        except Exception as e:  # UI boundary: report and continue with other files
            st.error(f"Failed to read {uf.name}: {e}")
    return chunks, meta


# Streamlit UI
st.title("Chatbot using PDF Documents")

# Sidebar: upload PDFs
with st.sidebar:
    st.header("Upload PDFs")
    uploaded_files = st.file_uploader(
        "Upload one or more PDF files",
        type=["pdf"],
        accept_multiple_files=True,
    )

# Automatically process when files are uploaded or changed
files_sig = _files_signature(uploaded_files)

if uploaded_files:
    if not QDRANT_URL or not QDRANT_API_KEY:
        # QDRANT_URL is read from the QDRANT_CLOUD_URL environment variable.
        st.error("QDRANT_CLOUD_URL or QDRANT_API_KEY is missing in the .env file.")
    elif files_sig != st.session_state.get('files_sig'):
        with st.spinner("Processing PDFs and building index..."):
            # Load and process uploaded PDF(s) with metadata and better chunking
            pdf_chunks, pdf_meta = _extract_chunks(uploaded_files)

            if not pdf_chunks:
                # Nothing to index: do not claim success, and remember the
                # signature so the same files are not reprocessed every rerun.
                st.session_state['index_ready'] = False
                st.session_state['files_sig'] = files_sig
                st.warning("No extractable text was found in the uploaded PDFs.")
            else:
                # Generate embeddings using Azure OpenAI Embeddings
                embedder = _init_azure_embedder()
                embeddings = embedder.embed_documents(pdf_chunks)

                # Initialize Qdrant (always recreate to match embedding dimension)
                # NOTE(review): recreate_collection may be deprecated in newer
                # qdrant-client releases -- confirm against the pinned version.
                client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
                collection_name = 'pdf-chatbot-collection'
                dim = len(embeddings[0])
                client.recreate_collection(
                    collection_name=collection_name,
                    vectors_config={"size": dim, "distance": "Cosine"},
                )

                # Index embeddings with metadata
                points = [
                    {"id": i, "vector": emb, "payload": {"text": chunk, **meta}}
                    for i, (emb, chunk, meta) in enumerate(zip(embeddings, pdf_chunks, pdf_meta))
                ]
                if points:
                    client.upsert(collection_name=collection_name, points=points)

                # Store in session for querying
                st.session_state['qdrant_client'] = client
                st.session_state['collection_name'] = collection_name
                st.session_state['embedder'] = embedder
                st.session_state['index_ready'] = True
                st.session_state['files_sig'] = files_sig
                st.success("Index built successfully. You can now ask questions.")


# Text cleaning utility for retrieved chunks
def clean_text(t: str) -> str:
    """Normalize whitespace and de-hyphenate text extracted from a PDF.

    Non-breaking spaces and tabs become spaces, words hyphenated across a
    line break are rejoined, every line is stripped, and runs of spaces are
    collapsed to a single space.  Falsy input returns ``""``.
    """
    if not t:
        return ""
    # Normalize whitespace
    t = t.replace('\u00A0', ' ').replace('\t', ' ')
    # Fix hyphenation across line breaks: "exam-\nple" -> "example"
    t = t.replace('-\n', '')
    # Strip each line, then collapse multiple spaces
    t = '\n'.join(line.strip() for line in t.splitlines())
    while '  ' in t:  # two spaces: loop ends once no double space remains
        t = t.replace('  ', ' ')
    # Trim
    return t.strip()


# Retrieval logic - synthesize a single structured answer with history-aware prompting
def retrieve_answer(query, top_k=4):
    """Answer *query* from the indexed PDFs and return the answer text.

    Embeds the query, fetches the *top_k* nearest chunks from Qdrant, and
    asks the chat model to answer using only those chunks plus the last few
    chat turns.  A de-duplicated "Sources" line is appended when citations
    are available.  Returns a fixed notice if no index exists in the session.
    """
    embedder = st.session_state.get('embedder')
    client = st.session_state.get('qdrant_client')
    collection_name = st.session_state.get('collection_name')
    if not embedder or not client or not collection_name:
        return "Index not initialized. Upload PDFs to build the index first."

    query_emb = embedder.embed_query(query)
    hits = client.search(collection_name=collection_name, query_vector=query_emb, limit=top_k)

    contexts, citations = [], []
    for h in hits:
        payload = getattr(h, 'payload', {}) or {}
        text = clean_text(payload.get('text', ''))
        src = payload.get('source', 'document')
        page = payload.get('page', None)
        if text:
            contexts.append(text)
            citations.append(f"{src} (page {page})" if page else src)
    context_block = "\n\n---\n\n".join(contexts[:top_k]) if contexts else ""

    # Build system prompt to enforce structured, user-friendly answers (generic for any PDF)
    system_prompt = (
        "You are a reliable retrieval-augmented assistant that answers questions about any kind of PDF content "
        "(technical, legal, scientific, financial, educational, etc.). Use ONLY the provided context snippets. "
        "Do not speculate or invent facts. If the information is not present, reply exactly: 'Not found in documents.' "
        "Return a clear, structured, user-friendly response with: a brief summary, bullet-point key facts, and a short conclusion. "
        "Include short citations with source filename and page numbers when available. Be concise and neutral."
    )

    # Include brief chat history for continuity (last 3 exchanges)
    past = list(st.session_state.get('messages', []))
    # The chat UI stores the current question before calling this function;
    # drop it here so it is not sent twice (history + QUESTION below).
    if past and past[-1].get('role') == 'user' and past[-1].get('content') == query:
        past.pop()
    history_msgs = []
    for m in past[-6:]:
        role = m.get('role')
        content = m.get('content', '')
        if role == 'user':
            history_msgs.append(HumanMessage(content=content))
        elif role == 'assistant':
            history_msgs.append(AIMessage(content=content))

    user_content = (
        f"CONTEXT:\n{context_block}\n\n"
        f"QUESTION: {query}\n\n"
        "Format:\n# Answer\n\n- Bullet points of key facts\n\nConclusion\n\nCitations: list source and page numbers if available."
    )

    messages = [SystemMessage(content=system_prompt), *history_msgs, HumanMessage(content=user_content)]
    result = llm.invoke(messages)
    answer_text = getattr(result, 'content', str(result))
    if citations:
        # dict.fromkeys de-duplicates while keeping first-seen order
        answer_text += "\n\nSources: " + "; ".join(dict.fromkeys(citations))
    return answer_text


# Simple chat-style UI (only shown after index is ready)
ready = st.session_state.get('index_ready')
if 'messages' not in st.session_state:
    st.session_state['messages'] = []

if ready:
    for msg in st.session_state['messages']:
        with st.chat_message(msg['role']):
            st.markdown(msg['content'])

    user_input = st.chat_input("Ask a question about the uploaded PDFs")
    if user_input:
        st.session_state['messages'].append({"role": "user", "content": user_input})
        with st.chat_message("user"):
            st.markdown(user_input)
        with st.chat_message("assistant"):
            with st.spinner("Retrieving answer..."):
                answer_text = retrieve_answer(user_input, top_k=4)
            st.markdown(answer_text)
        st.session_state['messages'].append({"role": "assistant", "content": answer_text})
else:
    st.caption("Upload PDFs in the sidebar to start chatting.")