Samanfatima563474 committed
Commit d0f2c71 · verified · 1 Parent(s): da37d01

Update app.py

Files changed (1): app.py (+90 -40)
app.py CHANGED
@@ -1,3 +1,4 @@
+# app.py
 import os
 import json
 from flask import Flask, request, render_template, jsonify
@@ -7,83 +8,126 @@ from huggingface_hub import InferenceClient
 from sklearn.feature_extraction.text import CountVectorizer
 
 # --- Initialization ---
+
+# Initialize the Flask application
 app = Flask(__name__)
 
-# Initialize the BERTopic model
-# We use CountVectorizer to prevent an issue with BERTopic on some platforms
+# --- Model and Client Setup ---
+
+# Initialize the BERTopic model.
+# BERTopic is a powerful library for topic modeling. It groups documents
+# (in this case, pages of a PDF) into topics based on their content.
+# We use CountVectorizer with specific parameters to prevent a potential
+# issue with BERTopic's default vectorizer on some deployment platforms.
 vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
 topic_model = BERTopic(
     vectorizer_model=vectorizer_model,
-    min_topic_size=5, # Consider topics with at least 5 documents
-    verbose=True
+    min_topic_size=5, # A topic will only be considered if it has at least 5 documents (pages).
+    verbose=True # Prints progress during topic modeling.
 )
 
-# Initialize the Hugging Face Inference Client
-# Make sure to set your HF_TOKEN in the Space's secrets
+# Initialize the Hugging Face Inference Client.
+# This client allows us to connect to the Hugging Face Hub and use a hosted
+# large language model (LLM) for the chat functionality.
+client = None # Initialize client as None
 try:
+    # It's best practice to store sensitive keys as environment variables.
+    # The app will look for a secret named HF_TOKEN.
     hf_token = os.environ.get("HF_TOKEN")
     if not hf_token:
-        print("Warning: HF_TOKEN not set. AI Chat may not work.")
+        print("Warning: HF_TOKEN environment variable is not set. The AI Chat feature may not work.")
+
+    # Create an instance of the client with the token.
     client = InferenceClient(token=hf_token)
+
 except Exception as e:
-    print(f"Error initializing InferenceClient: {e}")
+    # If there's an error (e.g., library not installed, network issue),
+    # print the error and set the client to None.
+    print(f"Error initializing Hugging Face InferenceClient: {e}")
     client = None
 
-# --- Routes ---
+# --- Application Routes ---
+
 @app.route('/')
 def index():
-    """Render the main page."""
+    """
+    Render the main page of the application.
+    This route serves the 'index.html' file, which should contain the
+    frontend UI for uploading a PDF and interacting with the app.
+    """
     return render_template('index.html')
 
 @app.route('/process', methods=['POST'])
 def process_pdf():
-    """Process the uploaded PDF to extract topics and create a knowledge graph."""
+    """
+    Process the uploaded PDF file. This function extracts text, performs
+    topic modeling, and returns data for a knowledge graph visualization.
+    """
+    # Check if a file was included in the request
     if 'pdf_file' not in request.files:
         return jsonify({"error": "No PDF file provided"}), 400
-
+
     file = request.files['pdf_file']
+
+    # Check if the filename is empty (e.g., user clicked submit without selecting a file)
     if file.filename == '':
         return jsonify({"error": "No selected file"}), 400
 
     try:
-        # Extract text from PDF
+        # --- PDF Text Extraction ---
         reader = PdfReader(file.stream)
+        # Extract text from each page, creating a list of strings.
+        # We only include pages that actually contain text.
         text_chunks = [page.extract_text() for page in reader.pages if page.extract_text()]
 
         if not text_chunks:
-            return jsonify({"error": "Could not extract text from PDF."}), 400
-
-        # Generate topics using BERTopic
+            return jsonify({"error": "Could not extract any text from the provided PDF."}), 400
+
+        # --- Topic Modeling ---
+        # The fit_transform method trains the BERTopic model on the text chunks.
        topics, _ = topic_model.fit_transform(text_chunks)
+        # Get detailed information about the generated topics (names, document counts, etc.).
        topic_info = topic_model.get_topic_info()
 
-        # Create graph data
+        # --- Knowledge Graph Creation ---
+        # We will represent the topics as a network graph (nodes and links).
        nodes = []
        links = []
 
-        # Add a central node for the document itself
+        # Add a central node to represent the entire document.
+        # We give it a unique ID and a larger 'val' to make it bigger in the visualization.
        doc_node_id = topic_info['Topic'].max() + 1
-        nodes.append({"id": doc_node_id, "name": "Main Document", "val": 20}) # Larger value for the main node
-
+        nodes.append({"id": doc_node_id, "name": "Main Document", "val": 20})
+
+        # Iterate through the topics found by BERTopic.
        for index, row in topic_info.iterrows():
            topic_id = row['Topic']
-            topic_name = row['Name']
-            if topic_id == -1: continue # Skip outlier topic
-
-            # Add node for each topic
-            nodes.append({"id": topic_id, "name": topic_name, "val": 5})
-            # Link each topic to the central document node
-            links.append({"source": doc_node_id, "target": topic_id})
-
+            topic_name = row['Name'] # A descriptive name like "0_data_science_models_learning"
+
+            # Topic -1 is a special case in BERTopic, containing all outliers (documents
+            # that don't fit well into any other topic). We'll skip it for the graph.
+            if topic_id == -1:
+                continue
+
+            # Add a node for each valid topic.
+            nodes.append({"id": int(topic_id), "name": topic_name, "val": 5})
+            # Create a link from the central document node to this topic node.
+            links.append({"source": doc_node_id, "target": int(topic_id)})
+
+        # Return the graph data as a JSON response.
        return jsonify({"nodes": nodes, "links": links})
 
    except Exception as e:
+        # General error handling for any part of the process.
        print(f"Error processing PDF: {e}")
        return jsonify({"error": "An error occurred while processing the PDF."}), 500
 
 @app.route('/chat', methods=['POST'])
 def chat():
-    """Handle chat requests with the AI Tutor."""
+    """
+    Handle chat requests by sending the user's message to the AI Tutor model.
+    """
+    # Check if the Hugging Face client was initialized successfully.
    if not client:
        return jsonify({"error": "AI Chat service is not available."}), 503
 
@@ -91,32 +135,38 @@ def chat():
    user_message = data.get("message")
 
    if not user_message:
-        return jsonify({"error": "No message provided"}), 400
+        return jsonify({"error": "No message provided in the chat request"}), 400
 
-    # Define the persona and prompt for the AI
+    # --- AI Persona and Prompt Engineering ---
+    # This system prompt defines the personality and goal of the AI.
    system_prompt = "You are the 'Funny Tutor'. Your goal is to explain concepts in a light-hearted, humorous, and engaging way. Use wacky analogies and jokes to make learning fun, but ensure the core information is still accurate."
 
-    # We use the recommended chat template for Zephyr models
+    # This prompt format is specifically structured for the Zephyr-7B model.
+    # It clearly separates the system's instructions, the user's query, and
+    # the placeholder for the AI's response.
    prompt = f"<s><|system|>\n{system_prompt}</s>\n<|user|>\n{user_message}</s>\n<|assistant|>"
-
+
    try:
-        # Call the Hugging Face Inference API
+        # --- Call Hugging Face Inference API ---
        response_text = ""
+        # We use streaming to get the response token by token, which feels more responsive.
        for token in client.text_generation(
            prompt,
-            model="HuggingFaceH4/zephyr-7b-beta",
-            max_new_tokens=250,
+            model="HuggingFaceH4/zephyr-7b-beta", # The specific model to use
+            max_new_tokens=250, # Limit the length of the response
            stream=True
        ):
            response_text += token
-
+
        return jsonify({"reply": response_text})
 
    except Exception as e:
        print(f"Error with AI chat: {e}")
        return jsonify({"error": "The AI tutor is currently unavailable."}), 503
 
-# --- Run the App ---
+# --- Run the Application ---
 if __name__ == '__main__':
-    # This is for local testing only. Gunicorn is used in production.
-    app.run(debug=True, port=5000)
+    # This block runs the app in debug mode for local testing.
+    # When deployed on a platform like Hugging Face Spaces, a production
+    # web server like Gunicorn is used instead of this development server.
+    app.run(debug=True, port=5000)
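One follow-up the diff suggests but does not complete: the new code casts topic_id to a plain int before serialization, yet doc_node_id is still taken from topic_info['Topic'].max() + 1, which is a NumPy integer that Flask's jsonify typically cannot serialize. A minimal sketch of the same cast applied to the document node (hypothetical, not part of this commit):

# Hypothetical follow-up, not part of this commit: topic_info['Topic'] is a
# pandas Series, so .max() returns a NumPy integer. Casting it to a plain int
# mirrors the int(topic_id) casts this commit already adds.
doc_node_id = int(topic_info['Topic'].max()) + 1
nodes.append({"id": doc_node_id, "name": "Main Document", "val": 20})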
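The new comment notes that streaming "feels more responsive", but the handler still buffers every token and returns one JSON blob, so the browser sees nothing until generation finishes. Below is a sketch of how the same call could stream tokens to the client using Flask's generator-based responses; the /chat-stream route name is hypothetical, and the sketch assumes the app and client objects and the Zephyr prompt format from app.py.

# Hypothetical variant, assuming the app and client objects defined in app.py.
# Streams tokens to the browser as they arrive instead of buffering the reply.
from flask import Response, request, stream_with_context

@app.route('/chat-stream', methods=['POST'])
def chat_stream():
    user_message = request.get_json().get("message", "")
    # Same Zephyr-style template as the /chat route, with a shortened system prompt.
    prompt = f"<s><|system|>\nYou are the 'Funny Tutor'.</s>\n<|user|>\n{user_message}</s>\n<|assistant|>"

    def generate():
        # text_generation(..., stream=True) yields plain string tokens.
        for token in client.text_generation(
            prompt,
            model="HuggingFaceH4/zephyr-7b-beta",
            max_new_tokens=250,
            stream=True,
        ):
            yield token

    return Response(stream_with_context(generate()), mimetype="text/plain")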
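To exercise the two endpoints end to end, here is a minimal smoke test using the requests library. It assumes the app is running locally via python app.py (port 5000) and that a sample.pdf exists in the working directory; both names are placeholders, not part of the commit.

# Minimal smoke test; localhost:5000 and sample.pdf are assumptions.
import requests

BASE = "http://localhost:5000"

# /process expects a multipart upload under the field name 'pdf_file' and
# returns {"nodes": [...], "links": [...]} for the knowledge graph.
with open("sample.pdf", "rb") as f:
    graph = requests.post(f"{BASE}/process", files={"pdf_file": f})
print(graph.status_code, graph.json())

# /chat expects JSON with a 'message' key and returns {"reply": "..."}.
chat = requests.post(f"{BASE}/chat", json={"message": "Explain recursion, but make it funny."})
print(chat.status_code, chat.json())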