Update app.py
app.py
CHANGED
# app.py
import os
import json
from flask import Flask, request, render_template, jsonify
from pypdf import PdfReader
from bertopic import BERTopic
from huggingface_hub import InferenceClient
from sklearn.feature_extraction.text import CountVectorizer

# --- Initialization ---

# Initialize the Flask application
app = Flask(__name__)

# --- Model and Client Setup ---

# Initialize the BERTopic model.
# BERTopic is a powerful library for topic modeling. It groups documents
# (in this case, pages of a PDF) into topics based on their content.
# We use CountVectorizer with specific parameters to prevent a potential
# issue with BERTopic's default vectorizer on some deployment platforms.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    min_topic_size=5,  # A topic will only be considered if it has at least 5 documents (pages).
    verbose=True  # Prints progress during topic modeling.
)
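# Note: BERTopic embeds documents with a sentence-transformers model by
# default; those weights are downloaded on first use, so the first
# /process request after a cold start can take noticeably longer.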

# Initialize the Hugging Face Inference Client.
# This client allows us to connect to the Hugging Face Hub and use a hosted
# large language model (LLM) for the chat functionality.
client = None  # Initialize client as None
try:
    # It's best practice to store sensitive keys as environment variables.
    # The app will look for a secret named HF_TOKEN.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("Warning: HF_TOKEN environment variable is not set. The AI Chat feature may not work.")

    # Create an instance of the client with the token.
    client = InferenceClient(token=hf_token)

except Exception as e:
    # If there's an error (e.g., library not installed, network issue),
    # print the error and set the client to None.
    print(f"Error initializing Hugging Face InferenceClient: {e}")
    client = None

# --- Application Routes ---

@app.route('/')
def index():
    """
    Render the main page of the application.
    This route serves the 'index.html' file, which should contain the
    frontend UI for uploading a PDF and interacting with the app.
    """
    return render_template('index.html')

@app.route('/process', methods=['POST'])
def process_pdf():
    """
    Process the uploaded PDF file. This function extracts text, performs
    topic modeling, and returns data for a knowledge graph visualization.
    """
    # Check if a file was included in the request
    if 'pdf_file' not in request.files:
        return jsonify({"error": "No PDF file provided"}), 400

    file = request.files['pdf_file']

    # Check if the filename is empty (e.g., user clicked submit without selecting a file)
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400

    try:
        # --- PDF Text Extraction ---
        reader = PdfReader(file.stream)
        # Extract text from each page, creating a list of strings.
        # We only include pages that actually contain text.
        text_chunks = [page.extract_text() for page in reader.pages if page.extract_text()]

        if not text_chunks:
            return jsonify({"error": "Could not extract any text from the provided PDF."}), 400

        # --- Topic Modeling ---
        # The fit_transform method trains the BERTopic model on the text chunks.
        topics, _ = topic_model.fit_transform(text_chunks)
        # Get detailed information about the generated topics (names, document counts, etc.).
        topic_info = topic_model.get_topic_info()
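        # Illustrative topic_info contents (a pandas DataFrame; actual rows
        # depend on the PDF). Topic -1 collects outlier pages:
        #    Topic  Count  Name
        #    -1     3      -1_the_and_of_to
        #     0     12     0_learning_models_data_ai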

        # --- Knowledge Graph Creation ---
        # We will represent the topics as a network graph (nodes and links).
        nodes = []
        links = []

        # Add a central node to represent the entire document.
        # We give it a unique ID and a larger 'val' to make it bigger in the visualization.
        # Cast to a plain int: pandas returns a numpy integer, which jsonify cannot serialize.
        doc_node_id = int(topic_info['Topic'].max()) + 1
        nodes.append({"id": doc_node_id, "name": "Main Document", "val": 20})

        # Iterate through the topics found by BERTopic.
        for index, row in topic_info.iterrows():
            topic_id = row['Topic']
            topic_name = row['Name']  # A descriptive name like "0_data_science_models_learning"

            # Topic -1 is a special case in BERTopic, containing all outliers (documents
            # that don't fit well into any other topic). We'll skip it for the graph.
            if topic_id == -1:
                continue

            # Add a node for each valid topic.
            nodes.append({"id": int(topic_id), "name": topic_name, "val": 5})
            # Create a link from the central document node to this topic node.
            links.append({"source": doc_node_id, "target": int(topic_id)})

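        # For illustration, the serialized payload might look like:
        # {"nodes": [{"id": 3, "name": "Main Document", "val": 20},
        #            {"id": 0, "name": "0_learning_models_data_ai", "val": 5}],
        #  "links": [{"source": 3, "target": 0}]}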
        # Return the graph data as a JSON response.
        return jsonify({"nodes": nodes, "links": links})

    except Exception as e:
        # General error handling for any part of the process.
        print(f"Error processing PDF: {e}")
        return jsonify({"error": "An error occurred while processing the PDF."}), 500

@app.route('/chat', methods=['POST'])
def chat():
    """
    Handle chat requests by sending the user's message to the AI Tutor model.
    """
    # Check if the Hugging Face client was initialized successfully.
    if not client:
        return jsonify({"error": "AI Chat service is not available."}), 503

    data = request.get_json()
    user_message = data.get("message")

    if not user_message:
        return jsonify({"error": "No message provided in the chat request"}), 400

    # --- AI Persona and Prompt Engineering ---
    # This system prompt defines the personality and goal of the AI.
    system_prompt = "You are the 'Funny Tutor'. Your goal is to explain concepts in a light-hearted, humorous, and engaging way. Use wacky analogies and jokes to make learning fun, but ensure the core information is still accurate."

    # This prompt format is specifically structured for the Zephyr-7B model.
    # It clearly separates the system's instructions, the user's query, and
    # the placeholder for the AI's response.
    prompt = f"<s><|system|>\n{system_prompt}</s>\n<|user|>\n{user_message}</s>\n<|assistant|>"
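    # For example, with user_message = "What is gravity?", the prompt renders as:
    #   <s><|system|>
    #   You are the 'Funny Tutor'. ...(rest of the system prompt)</s>
    #   <|user|>
    #   What is gravity?</s>
    #   <|assistant|>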

    try:
        # --- Call Hugging Face Inference API ---
        response_text = ""
        # We use streaming to get the response token by token, which feels more responsive.
        for token in client.text_generation(
            prompt,
            model="HuggingFaceH4/zephyr-7b-beta",  # The specific model to use
            max_new_tokens=250,  # Limit the length of the response
            stream=True
        ):
            response_text += token

        return jsonify({"reply": response_text})

    except Exception as e:
        print(f"Error with AI chat: {e}")
        return jsonify({"error": "The AI tutor is currently unavailable."}), 503

# --- Run the Application ---
if __name__ == '__main__':
    # This block runs the app in debug mode for local testing.
    # When deployed on a platform like Hugging Face Spaces, a production
    # web server like Gunicorn is used instead of this development server.
    app.run(debug=True, port=5000)
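
For quick local testing, a sketch like the following can exercise both endpoints. It assumes the app is running on localhost:5000, that a sample.pdf exists in the working directory, and that the third-party requests library is installed; none of this is part of app.py itself.

# test_client.py (hypothetical helper, not part of the app)
import requests

BASE = "http://localhost:5000"

# Upload a PDF and print the knowledge-graph payload.
with open("sample.pdf", "rb") as f:
    r = requests.post(f"{BASE}/process", files={"pdf_file": f})
print(r.status_code, r.json())

# Ask the Funny Tutor a question.
r = requests.post(f"{BASE}/chat", json={"message": "Explain entropy, please."})
print(r.status_code, r.json())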