import os
import uuid
from typing import Dict, List, Optional

import streamlit as st
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
import chromadb

# Try importing Groq client
try:
    from groq import Groq
except ImportError:
    Groq = None

# Config
SIMILARITY_THRESHOLD = 0.20
TOP_K = 3


# Utility Functions
def load_api_key() -> Optional[str]:
    """Return the Groq API key, or ``None`` when no key can be found.

    The ``GROQ_API_KEY`` environment variable is checked first. When it is
    unset or empty, the token stored by ``huggingface_hub`` is used as a
    fallback.
    """
    api_key = os.environ.get("GROQ_API_KEY")
    if api_key:
        return api_key
    try:
        from huggingface_hub import HfFolder
        return HfFolder.get_token()
    except Exception:
        # Deliberate best-effort: huggingface_hub may be missing or may fail
        # to read a token; callers treat a falsy key as "not configured".
        return None


def setup_groq() -> "Optional[Groq]":
    """Build a Groq client from the configured API key.

    Returns:
        The client, or ``None`` after reporting the problem through
        ``st.error`` when the key is missing, the ``groq`` package is not
        installed, or the client constructor raises.
    """
    api_key = load_api_key()
    if not api_key:
        st.error("❌ Missing GROQ_API_KEY in environment or Hugging Face secrets.")
        return None
    if Groq is None:
        st.error("❌ Groq library not installed. Please add `groq` to requirements.txt.")
        return None
    try:
        return Groq(api_key=api_key)
    except Exception as e:
        st.error(f"Failed to initialize Groq client: {e}")
        return None


@st.cache_resource
def load_embedding_model(model_name: str = "all-MiniLM-L6-v2") -> SentenceTransformer:
    """Load and cache the embedding model."""
    return SentenceTransformer(model_name)


def pdf_to_chunks(uploaded_file, chunk_size: int = 500, overlap: int = 50) -> List[Dict]:
    """Convert a PDF into overlapping word-based text chunks.

    Args:
        uploaded_file: Anything ``PdfReader`` accepts (path or file-like).
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks of a page.

    Returns:
        A list of ``{"page_number": int, "text": str}`` dicts (pages are
        numbered from 1). Empty when the PDF cannot be read or has no
        extractable text; read errors are reported through ``st.error``.
    """
    try:
        reader = PdfReader(uploaded_file)
    except Exception as e:
        st.error(f"Error reading PDF: {e}")
        return []

    chunks = []
    # A non-positive step would never advance, so always move by >= 1 word.
    step = max(1, chunk_size - overlap)
    for page_num, page in enumerate(reader.pages, start=1):
        try:
            text = page.extract_text() or ""
        except Exception:
            # One unreadable page should not abort the whole document.
            text = ""
        if not text.strip():
            continue

        words = text.split()
        for start in range(0, len(words), step):
            chunk_text = " ".join(words[start:start + chunk_size])
            if chunk_text.strip():
                chunks.append({
                    "page_number": page_num,
                    "text": chunk_text,
                })
            # Once a window reaches the last word, any later window would be
            # a pure subset of this one; stop instead of emitting duplicates.
            if start + chunk_size >= len(words):
                break
    return chunks


def create_vector_database(chunks: List[Dict], embedding_model: SentenceTransformer) -> Optional[str]:
    """Create a new ChromaDB collection with embeddings and return its name.

    On success the collection name is also stored in
    ``st.session_state.collection_name`` and ``st.session_state.vector_db``.

    Returns:
        The collection name, or ``None`` (after ``st.error``) when there are
        no chunks or any step fails.
    """
    if not chunks:
        st.error("No text chunks extracted from PDF.")
        return None

    client = chromadb.Client()
    # A UUID suffix avoids the name collisions a small random integer causes
    # when several PDFs are processed by the same process.
    collection_name = f"pdf_chunks_{uuid.uuid4().hex}"
    try:
        collection = client.create_collection(collection_name)
    except Exception as e:
        st.error(f"Error creating collection: {e}")
        return None

    texts = [c["text"] for c in chunks]
    ids = [str(i) for i in range(len(chunks))]

    # Encode in batches for safety
    embeddings = []
    batch_size = 64
    try:
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            emb = embedding_model.encode(batch)
            embeddings.extend(emb.tolist() if hasattr(emb, 'tolist') else list(map(list, emb)))
    except Exception as e:
        st.error(f"Error computing embeddings: {e}")
        return None

    try:
        collection.add(
            embeddings=embeddings,
            documents=texts,
            ids=ids,
            metadatas=chunks,
        )
    except Exception as e:
        st.error(f"Error adding embeddings: {e}")
        return None

    # Store only the collection name (not object) in session_state
    st.session_state.collection_name = collection_name
    # Also store a simple flag in vector_db for UI readiness
    st.session_state.vector_db = collection_name
    return collection_name


def _distance_to_similarity(distance) -> float:
    """Map a distance reported by the vector store to a similarity score.

    A missing distance is treated as a perfect match (1.0), a value that
    cannot be converted to a number as no match (0.0). Otherwise the score is
    ``1 - distance`` floored at 0, so larger distances never score higher.
    """
    if distance is None:
        return 1.0
    try:
        value = float(distance)
    except (TypeError, ValueError):
        return 0.0
    # NOTE(review): `1 - distance` presumes a metric where 0 means identical
    # and ~1 means unrelated; confirm it suits the collection's distance space.
    return max(0.0, 1.0 - value)


def query_vector_database(query: str, embedding_model: SentenceTransformer, top_k: int = TOP_K) -> List[Dict]:
    """Query ChromaDB for the chunks most relevant to ``query``.

    Reads the active collection name from ``st.session_state``.

    Returns:
        A list of ``{"text", "page_number", "similarity"}`` dicts in the order
        returned by the store; empty (after ``st.error``) on any failure.
    """
    if "collection_name" not in st.session_state:
        st.error("No active collection found. Upload and process a PDF first.")
        return []

    try:
        client = chromadb.Client()
        collection = client.get_collection(st.session_state.collection_name)
    except Exception as e:
        st.error(f"Error accessing collection: {e}")
        return []

    try:
        query_embedding = embedding_model.encode([query]).tolist()
    except Exception as e:
        st.error(f"Error encoding query: {e}")
        return []

    try:
        results = collection.query(
            query_embeddings=query_embedding,
            n_results=top_k,
        )
    except Exception as e:
        st.error(f"Error querying database: {e}")
        return []

    # Each field holds one inner list per query; a field may be absent or
    # None, so fall back to a single empty result list.
    documents = (results.get("documents") or [[]])[0]
    metadatas = (results.get("metadatas") or [[]])[0]
    dists = (results.get("distances") or [[]])[0]

    relevant_chunks = []
    for i, doc in enumerate(documents):
        meta = (metadatas[i] if i < len(metadatas) else None) or {}
        distance = dists[i] if i < len(dists) else None
        relevant_chunks.append({
            "text": doc,
            "page_number": meta.get("page_number", "N/A"),
            "similarity": _distance_to_similarity(distance),
        })
    return relevant_chunks


def generate_answer_with_groq(client, query: str, relevant_chunks: List[Dict]) -> str:
    """Generate an answer from the Groq LLM using the retrieved context.

    Args:
        client: A Groq client (or any object exposing ``chat.completions`` or
            a generic ``create`` method).
        query: The user's question.
        relevant_chunks: Dicts with ``page_number`` and ``text`` keys.

    Returns:
        The model's answer, or an ``"Error generating answer: ..."`` string
        when anything raises.
    """
    try:
        context_parts = [f"[Page {c['page_number']}]: {c['text']}" for c in relevant_chunks]
        context = "\n\n".join(context_parts) if context_parts else ""

        prompt = f"""Based ONLY on the following context from a PDF document, answer the user's question.

Context:
{context}

Question: {query}

Instructions:
- Answer using ONLY the information provided in the context above
- If the context does not contain enough information to answer the question, reply exactly: ❌ Insufficient evidence
- Always include page citations in your answer using the format [Page X]
- Be accurate and concise
- Do not add information not present in the context

Answer:"""

        if hasattr(client, "chat") and hasattr(client.chat, "completions"):
            chat_resp = client.chat.completions.create(
                model="llama-3.1-8b-instant",
                messages=[
                    {"role": "system", "content": "You are a strict assistant that only uses provided context."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1,
                max_tokens=500,
            )
        else:
            # Fallback generic call
            chat_resp = client.create(prompt=prompt, max_tokens=500)

        if hasattr(chat_resp, "choices"):
            return chat_resp.choices[0].message.content
        elif isinstance(chat_resp, dict):
            choices = chat_resp.get("choices") or []
            if choices:
                return (
                    choices[0].get("message", {}).get("content")
                    or choices[0].get("text")
                    or str(choices[0])
                )
        return str(chat_resp)
    except Exception as e:
        return f"Error generating answer: {e}"


# STREAMLIT UI
def main():
    """Main Streamlit application."""
    # Page configuration with wide layout for centered design
    st.set_page_config(
        page_title="PageMentor",
        page_icon="📚",
        layout="wide"
    )

    # Custom CSS for professional styling and centered layout
    st.markdown(""" """, unsafe_allow_html=True)

    # Header Section with gradient background
    # NOTE(review): this copy of the file appears truncated from here on (the
    # HTML markup and the rest of the UI are missing). Only the closing of the
    # string literal below was added so the module parses; restore the
    # remainder of main() and its invocation from the original source.
    st.markdown("""
Upload a PDF document above to start your learning journey.
Support for textbooks, research papers, articles, and more!
""")