# app.py
import os

from flask import Flask, request, render_template, jsonify
from PyPDF2 import PdfReader
from bertopic import BERTopic
from huggingface_hub import InferenceClient
from sklearn.feature_extraction.text import CountVectorizer

# --- Initialization ---

# Initialize the Flask application.
app = Flask(__name__)

# --- Model and Client Setup ---

# Initialize the BERTopic model.
# BERTopic is a topic-modeling library that groups documents (in this case,
# pages of a PDF) into topics based on their content. We pass an explicit
# CountVectorizer to avoid a potential issue with BERTopic's default
# vectorizer on some deployment platforms.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    min_topic_size=5,  # A topic is only kept if it has at least 5 documents (pages).
    verbose=True,      # Print progress during topic modeling.
)

# Initialize the Hugging Face Inference Client.
# This client connects to the Hugging Face Hub and calls a hosted
# large language model (LLM) for the chat functionality.
client = None  # Fall back to None if initialization fails.
try:
    # It's best practice to store sensitive keys as environment variables.
    # The app looks for a secret named HF_TOKEN.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("Warning: HF_TOKEN environment variable is not set. The AI Chat feature may not work.")
    # Create an instance of the client with the token.
    client = InferenceClient(token=hf_token)
except Exception as e:
    # If there is an error (e.g., library not installed, network issue),
    # print it and leave the client set to None.
    print(f"Error initializing Hugging Face InferenceClient: {e}")
    client = None

# --- Application Routes ---

@app.route('/')
def index():
    """
    Render the main page of the application.

    This route serves 'index.html', which contains the frontend UI for
    uploading a PDF and interacting with the app.
    """
    return render_template('index.html')


@app.route('/process', methods=['POST'])
def process_pdf():
    """
    Process the uploaded PDF file.

    Extracts text, performs topic modeling, and returns data for a
    knowledge graph visualization.
    """
    # Check that a file was included in the request.
    if 'pdf_file' not in request.files:
        return jsonify({"error": "No PDF file provided"}), 400

    file = request.files['pdf_file']
    # An empty filename means the user clicked submit without selecting a file.
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400

    try:
        # --- PDF Text Extraction ---
        reader = PdfReader(file.stream)
        # Extract text from each page once, keeping only pages that contain text.
        text_chunks = [text for page in reader.pages if (text := page.extract_text())]
        if not text_chunks:
            return jsonify({"error": "Could not extract any text from the provided PDF."}), 400

        # --- Topic Modeling ---
        # fit_transform trains the BERTopic model on the text chunks.
        topics, _ = topic_model.fit_transform(text_chunks)
        # Get detailed information about the generated topics (names, document counts, etc.).
        topic_info = topic_model.get_topic_info()

        # --- Knowledge Graph Creation ---
        # Represent the topics as a network graph (nodes and links).
        nodes = []
        links = []
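        # The JSON built below is intended for a force-directed graph view on
        # the frontend (an assumption about index.html, which is not shown
        # here). Its shape is:
        #   {"nodes": [{"id": ..., "name": ..., "val": ...}, ...],
        #    "links": [{"source": ..., "target": ...}, ...]}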
        # Add a central node to represent the entire document. It gets a unique
        # ID and a larger 'val' so it renders bigger in the visualization.
        # Cast to a plain Python int: pandas returns numpy integers, which
        # jsonify cannot serialize.
        doc_node_id = int(topic_info['Topic'].max()) + 1
        nodes.append({"id": doc_node_id, "name": "Main Document", "val": 20})

        # Iterate through the topics found by BERTopic.
        for _, row in topic_info.iterrows():
            topic_id = row['Topic']
            topic_name = row['Name']  # A descriptive name like "0_data_science_models_learning".

            # Topic -1 is a special case in BERTopic: it collects all outliers
            # (documents that don't fit well into any other topic). Skip it.
            if topic_id == -1:
                continue

            # Add a node for each valid topic.
            nodes.append({"id": int(topic_id), "name": topic_name, "val": 5})
            # Link the central document node to this topic node.
            links.append({"source": doc_node_id, "target": int(topic_id)})

        # Return the graph data as a JSON response.
        return jsonify({"nodes": nodes, "links": links})

    except Exception as e:
        # General error handling for any part of the process.
        print(f"Error processing PDF: {e}")
        return jsonify({"error": "An error occurred while processing the PDF."}), 500


@app.route('/chat', methods=['POST'])
def chat():
    """
    Handle chat requests by sending the user's message to the AI Tutor model.
    """
    # Check that the Hugging Face client was initialized successfully.
    if not client:
        return jsonify({"error": "AI Chat service is not available."}), 503

    data = request.get_json(silent=True) or {}
    user_message = data.get("message")
    if not user_message:
        return jsonify({"error": "No message provided in the chat request"}), 400

    # --- AI Persona and Prompt Engineering ---
    # This system prompt defines the personality and goal of the AI.
    system_prompt = (
        "You are the 'Funny Tutor'. Your goal is to explain concepts in a "
        "light-hearted, humorous, and engaging way. Use wacky analogies and "
        "jokes to make learning fun, but ensure the core information is still accurate."
    )

    # This prompt format is structured for the Zephyr-7B model. It separates
    # the system instructions, the user's query, and the placeholder for the
    # AI's response.
    prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{user_message}\n<|assistant|>"

    try:
        # --- Call the Hugging Face Inference API ---
        response_text = ""
        # Stream the response token by token, which feels more responsive.
        for token in client.text_generation(
            prompt,
            model="HuggingFaceH4/zephyr-7b-beta",  # The specific model to use.
            max_new_tokens=250,                    # Limit the length of the response.
            stream=True,
        ):
            response_text += token
        return jsonify({"reply": response_text})
    except Exception as e:
        print(f"Error with AI chat: {e}")
        return jsonify({"error": "The AI tutor is currently unavailable."}), 503


# --- Run the Application ---
if __name__ == '__main__':
    # Run the app in debug mode for local testing. When deployed on a platform
    # like Hugging Face Spaces, a production web server such as Gunicorn is
    # used instead of this development server.
    app.run(debug=True, port=5000)
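

# --- Manual Testing (sketch) ---
# A minimal way to exercise both endpoints against the local development
# server, using the `requests` library (not a dependency of this app);
# "sample.pdf" and the example message are placeholders.
#
#   import requests
#
#   with open("sample.pdf", "rb") as f:
#       graph = requests.post("http://localhost:5000/process",
#                             files={"pdf_file": f}).json()
#
#   reply = requests.post("http://localhost:5000/chat",
#                         json={"message": "Explain topic modeling"}).json()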