Update app.py
app.py
CHANGED
# app.py
import os
import json
from flask import Flask, request, render_template, jsonify
from pypdf import PdfReader
from bertopic import BERTopic
from huggingface_hub import InferenceClient
from sklearn.feature_extraction.text import CountVectorizer

# --- Initialization ---

# Initialize the Flask application
app = Flask(__name__)

# --- Model and Client Setup ---

# Initialize the BERTopic model.
# BERTopic is a powerful library for topic modeling. It groups documents
# (in this case, pages of a PDF) into topics based on their content.
# We use CountVectorizer with specific parameters to prevent a potential
# issue with BERTopic's default vectorizer on some deployment platforms.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    min_topic_size=5,  # A topic will only be considered if it has at least 5 documents (pages).
    verbose=True  # Prints progress during topic modeling.
)
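# Note: BERTopic embeds documents with a sentence-transformers model by
# default; those weights are downloaded on first use, so the first
# /process request after a cold start can take noticeably longer.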

# Initialize the Hugging Face Inference Client.
# This client allows us to connect to the Hugging Face Hub and use a hosted
# large language model (LLM) for the chat functionality.
client = None  # Initialize client as None
try:
    # It's best practice to store sensitive keys as environment variables.
    # The app will look for a secret named HF_TOKEN.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("Warning: HF_TOKEN environment variable is not set. The AI Chat feature may not work.")

    # Create an instance of the client with the token.
    client = InferenceClient(token=hf_token)

except Exception as e:
    # If there's an error (e.g., library not installed, network issue),
    # print the error and set the client to None.
    print(f"Error initializing Hugging Face InferenceClient: {e}")
    client = None

# --- Application Routes ---

@app.route('/')
def index():
    """
    Render the main page of the application.
    This route serves the 'index.html' file, which should contain the
    frontend UI for uploading a PDF and interacting with the app.
    """
    return render_template('index.html')

@app.route('/process', methods=['POST'])
def process_pdf():
    """
    Process the uploaded PDF file. This function extracts text, performs
    topic modeling, and returns data for a knowledge graph visualization.
    """
    # Check if a file was included in the request
    if 'pdf_file' not in request.files:
        return jsonify({"error": "No PDF file provided"}), 400

    file = request.files['pdf_file']

    # Check if the filename is empty (e.g., user clicked submit without selecting a file)
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400

    try:
        # --- PDF Text Extraction ---
        reader = PdfReader(file.stream)
        # Extract text from each page, creating a list of strings.
        # We only include pages that actually contain text.
        text_chunks = [page.extract_text() for page in reader.pages if page.extract_text()]

        if not text_chunks:
            return jsonify({"error": "Could not extract any text from the provided PDF."}), 400

        # --- Topic Modeling ---
        # The fit_transform method trains the BERTopic model on the text chunks.
        topics, _ = topic_model.fit_transform(text_chunks)
        # Get detailed information about the generated topics (names, document counts, etc.).
        topic_info = topic_model.get_topic_info()
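        # Illustrative topic_info contents (a pandas DataFrame; actual rows
        # depend on the PDF). Topic -1 collects outlier pages:
        #    Topic  Count  Name
        #    -1     3      -1_the_and_of_to
        #     0     12     0_learning_models_data_ai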

        # --- Knowledge Graph Creation ---
        # We will represent the topics as a network graph (nodes and links).
        nodes = []
        links = []

        # Add a central node to represent the entire document.
        # We give it a unique ID and a larger 'val' to make it bigger in the visualization.
        # Cast to a plain int: pandas returns a numpy integer, which jsonify cannot serialize.
        doc_node_id = int(topic_info['Topic'].max()) + 1
        nodes.append({"id": doc_node_id, "name": "Main Document", "val": 20})

        # Iterate through the topics found by BERTopic.
        for index, row in topic_info.iterrows():
            topic_id = row['Topic']
            topic_name = row['Name']  # A descriptive name like "0_data_science_models_learning"

            # Topic -1 is a special case in BERTopic, containing all outliers (documents
            # that don't fit well into any other topic). We'll skip it for the graph.
            if topic_id == -1:
                continue

            # Add a node for each valid topic.
            nodes.append({"id": int(topic_id), "name": topic_name, "val": 5})
            # Create a link from the central document node to this topic node.
            links.append({"source": doc_node_id, "target": int(topic_id)})

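        # For illustration, the serialized payload might look like:
        # {"nodes": [{"id": 3, "name": "Main Document", "val": 20},
        #            {"id": 0, "name": "0_learning_models_data_ai", "val": 5}],
        #  "links": [{"source": 3, "target": 0}]}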
        # Return the graph data as a JSON response.
        return jsonify({"nodes": nodes, "links": links})

    except Exception as e:
        # General error handling for any part of the process.
        print(f"Error processing PDF: {e}")
        return jsonify({"error": "An error occurred while processing the PDF."}), 500

@app.route('/chat', methods=['POST'])
def chat():
    """
    Handle chat requests by sending the user's message to the AI Tutor model.
    """
    # Check if the Hugging Face client was initialized successfully.
    if not client:
        return jsonify({"error": "AI Chat service is not available."}), 503

    data = request.get_json()
    user_message = data.get("message")

    if not user_message:
        return jsonify({"error": "No message provided in the chat request"}), 400

    # --- AI Persona and Prompt Engineering ---
    # This system prompt defines the personality and goal of the AI.
    system_prompt = "You are the 'Funny Tutor'. Your goal is to explain concepts in a light-hearted, humorous, and engaging way. Use wacky analogies and jokes to make learning fun, but ensure the core information is still accurate."

    # This prompt format is specifically structured for the Zephyr-7B model.
    # It clearly separates the system's instructions, the user's query, and
    # the placeholder for the AI's response.
    prompt = f"<s><|system|>\n{system_prompt}</s>\n<|user|>\n{user_message}</s>\n<|assistant|>"
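    # For example, with user_message = "What is gravity?", the prompt renders as:
    #   <s><|system|>
    #   You are the 'Funny Tutor'. ...(rest of the system prompt)</s>
    #   <|user|>
    #   What is gravity?</s>
    #   <|assistant|>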

    try:
        # --- Call Hugging Face Inference API ---
        response_text = ""
        # We use streaming to get the response token by token, which feels more responsive.
        for token in client.text_generation(
            prompt,
            model="HuggingFaceH4/zephyr-7b-beta",  # The specific model to use
            max_new_tokens=250,  # Limit the length of the response
            stream=True
        ):
            response_text += token

        return jsonify({"reply": response_text})

    except Exception as e:
        print(f"Error with AI chat: {e}")
        return jsonify({"error": "The AI tutor is currently unavailable."}), 503

# --- Run the Application ---
if __name__ == '__main__':
    # This block runs the app in debug mode for local testing.
    # When deployed on a platform like Hugging Face Spaces, a production
    # web server like Gunicorn is used instead of this development server.
    app.run(debug=True, port=5000)
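
For quick local testing, a sketch like the following can exercise both endpoints. It assumes the app is running on localhost:5000, that a sample.pdf exists in the working directory, and that the third-party requests library is installed; none of this is part of app.py itself.

# test_client.py (hypothetical helper, not part of the app)
import requests

BASE = "http://localhost:5000"

# Upload a PDF and print the knowledge-graph payload.
with open("sample.pdf", "rb") as f:
    r = requests.post(f"{BASE}/process", files={"pdf_file": f})
print(r.status_code, r.json())

# Ask the Funny Tutor a question.
r = requests.post(f"{BASE}/chat", json={"message": "Explain entropy, please."})
print(r.status_code, r.json())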