# app.py
import os

from flask import Flask, request, render_template, jsonify
from PyPDF2 import PdfReader
from bertopic import BERTopic
from huggingface_hub import InferenceClient
from sklearn.feature_extraction.text import CountVectorizer

# --- Initialization ---

# Initialize the Flask application.
app = Flask(__name__)

# --- Model and Client Setup ---

# Initialize the BERTopic model.
# BERTopic is a topic-modeling library that groups documents (in this case,
# pages of a PDF) into topics based on their content. We pass an explicit
# CountVectorizer to avoid a potential issue with BERTopic's default
# vectorizer on some deployment platforms.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    min_topic_size=5,  # A topic is only kept if it has at least 5 documents (pages).
    verbose=True,      # Print progress during topic modeling.
)

# Initialize the Hugging Face Inference Client.
# This client connects to the Hugging Face Hub and calls a hosted
# large language model (LLM) for the chat functionality.
client = None  # Fall back to None if initialization fails.
try:
    # It's best practice to store sensitive keys as environment variables.
    # The app looks for a secret named HF_TOKEN.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("Warning: HF_TOKEN environment variable is not set. The AI Chat feature may not work.")
    # Create an instance of the client with the token.
    client = InferenceClient(token=hf_token)
except Exception as e:
    # If there is an error (e.g., library not installed, network issue),
    # print it and leave the client set to None.
    print(f"Error initializing Hugging Face InferenceClient: {e}")
    client = None

# --- Application Routes ---

@app.route('/')
def index():
    """
    Render the main page of the application.

    This route serves 'index.html', which contains the frontend UI for
    uploading a PDF and interacting with the app.
    """
    return render_template('index.html')


@app.route('/process', methods=['POST'])
def process_pdf():
    """
    Process the uploaded PDF file.

    Extracts text, performs topic modeling, and returns data for a
    knowledge graph visualization.
    """
    # Check that a file was included in the request.
    if 'pdf_file' not in request.files:
        return jsonify({"error": "No PDF file provided"}), 400

    file = request.files['pdf_file']
    # An empty filename means the user clicked submit without selecting a file.
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400

    try:
        # --- PDF Text Extraction ---
        reader = PdfReader(file.stream)
        # Extract text from each page once, keeping only pages that contain text.
        text_chunks = [text for page in reader.pages if (text := page.extract_text())]
        if not text_chunks:
            return jsonify({"error": "Could not extract any text from the provided PDF."}), 400

        # --- Topic Modeling ---
        # fit_transform trains the BERTopic model on the text chunks.
        topics, _ = topic_model.fit_transform(text_chunks)
        # Get detailed information about the generated topics (names, document counts, etc.).
        topic_info = topic_model.get_topic_info()

        # --- Knowledge Graph Creation ---
        # Represent the topics as a network graph (nodes and links).
        nodes = []
        links = []
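        # The JSON built below is intended for a force-directed graph view on
        # the frontend (an assumption about index.html, which is not shown
        # here). Its shape is:
        #   {"nodes": [{"id": ..., "name": ..., "val": ...}, ...],
        #    "links": [{"source": ..., "target": ...}, ...]}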
        # Add a central node to represent the entire document. It gets a unique
        # ID and a larger 'val' so it renders bigger in the visualization.
        # Cast to a plain Python int: pandas returns numpy integers, which
        # jsonify cannot serialize.
        doc_node_id = int(topic_info['Topic'].max()) + 1
        nodes.append({"id": doc_node_id, "name": "Main Document", "val": 20})

        # Iterate through the topics found by BERTopic.
        for _, row in topic_info.iterrows():
            topic_id = row['Topic']
            topic_name = row['Name']  # A descriptive name like "0_data_science_models_learning".

            # Topic -1 is a special case in BERTopic: it collects all outliers
            # (documents that don't fit well into any other topic). Skip it.
            if topic_id == -1:
                continue

            # Add a node for each valid topic.
            nodes.append({"id": int(topic_id), "name": topic_name, "val": 5})
            # Link the central document node to this topic node.
            links.append({"source": doc_node_id, "target": int(topic_id)})

        # Return the graph data as a JSON response.
        return jsonify({"nodes": nodes, "links": links})

    except Exception as e:
        # General error handling for any part of the process.
        print(f"Error processing PDF: {e}")
        return jsonify({"error": "An error occurred while processing the PDF."}), 500


@app.route('/chat', methods=['POST'])
def chat():
    """
    Handle chat requests by sending the user's message to the AI Tutor model.
    """
    # Check that the Hugging Face client was initialized successfully.
    if not client:
        return jsonify({"error": "AI Chat service is not available."}), 503

    data = request.get_json(silent=True) or {}
    user_message = data.get("message")
    if not user_message:
        return jsonify({"error": "No message provided in the chat request"}), 400

    # --- AI Persona and Prompt Engineering ---
    # This system prompt defines the personality and goal of the AI.
    system_prompt = (
        "You are the 'Funny Tutor'. Your goal is to explain concepts in a "
        "light-hearted, humorous, and engaging way. Use wacky analogies and "
        "jokes to make learning fun, but ensure the core information is still accurate."
    )

    # This prompt format is structured for the Zephyr-7B model. It separates
    # the system instructions, the user's query, and the placeholder for the
    # AI's response.
    prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{user_message}\n<|assistant|>"

    try:
        # --- Call the Hugging Face Inference API ---
        response_text = ""
        # Stream the response token by token, which feels more responsive.
        for token in client.text_generation(
            prompt,
            model="HuggingFaceH4/zephyr-7b-beta",  # The specific model to use.
            max_new_tokens=250,                    # Limit the length of the response.
            stream=True,
        ):
            response_text += token
        return jsonify({"reply": response_text})
    except Exception as e:
        print(f"Error with AI chat: {e}")
        return jsonify({"error": "The AI tutor is currently unavailable."}), 503


# --- Run the Application ---
if __name__ == '__main__':
    # Run the app in debug mode for local testing. When deployed on a platform
    # like Hugging Face Spaces, a production web server such as Gunicorn is
    # used instead of this development server.
    app.run(debug=True, port=5000)
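

# --- Manual Testing (sketch) ---
# A minimal way to exercise both endpoints against the local development
# server, using the `requests` library (not a dependency of this app);
# "sample.pdf" and the example message are placeholders.
#
#   import requests
#
#   with open("sample.pdf", "rb") as f:
#       graph = requests.post("http://localhost:5000/process",
#                             files={"pdf_file": f}).json()
#
#   reply = requests.post("http://localhost:5000/chat",
#                         json={"message": "Explain topic modeling"}).json()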