milwright committed on
Commit
c347de0
·
1 Parent(s): 51d0f03

Debug + enhance data visualization

Browse files
Files changed (3) hide show
  1. LICENSE +3 -2
  2. advanced_scraper_ui.py +77 -9
  3. requirements.txt +1 -1
LICENSE CHANGED
@@ -1,11 +1,12 @@
1
  GNU GENERAL PUBLIC LICENSE
2
- Version 3, 29 June 2007
 
3
 
4
  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5
  Everyone is permitted to copy and distribute verbatim copies
6
  of this license document, but changing it is not allowed.
7
 
8
- Preamble
9
 
10
  The GNU General Public License is a free, copyleft license for
11
  software and other kinds of works.
 
1
  GNU GENERAL PUBLIC LICENSE
2
+
3
+ Version 3, 29 June 2007
4
 
5
  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
6
  Everyone is permitted to copy and distribute verbatim copies
7
  of this license document, but changing it is not allowed.
8
 
9
+ Preamble
10
 
11
  The GNU General Public License is a free, copyleft license for
12
  software and other kinds of works.
advanced_scraper_ui.py CHANGED
@@ -113,6 +113,12 @@ def filter_results(results, filters):
113
  def create_data_visualization(results):
114
  """Create data visualizations based on results"""
115
  try:
 
 
 
 
 
 
116
  # Combine all results
117
  all_posts = []
118
  for subreddit, posts in results.items():
@@ -143,6 +149,7 @@ def create_data_visualization(results):
143
  if 'subreddit' not in df.columns:
144
  missing_columns.append('subreddit')
145
  st.error(f"Required column(s) missing: {', '.join(missing_columns)}")
 
146
  return
147
 
148
  # Create tabs for different visualizations
@@ -154,9 +161,23 @@ def create_data_visualization(results):
154
  st.subheader("Score Distribution")
155
  fig = px.histogram(df, x="score", color="subreddit", nbins=20,
156
  title="Distribution of Post Scores")
157
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  except Exception as e:
159
  st.error(f"Error creating Score Distribution: {str(e)}")
 
160
 
161
  # Posts by Subreddit
162
  with viz_tab2:
@@ -164,11 +185,26 @@ def create_data_visualization(results):
164
  st.subheader("Posts by Subreddit")
165
  subreddit_counts = df['subreddit'].value_counts().reset_index()
166
  subreddit_counts.columns = ['subreddit', 'count']
 
167
  fig = px.bar(subreddit_counts, x='subreddit', y='count',
168
  title="Number of Matching Posts by Subreddit")
169
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  except Exception as e:
171
  st.error(f"Error creating Posts by Subreddit chart: {str(e)}")
 
172
 
173
  # Time Analysis
174
  with viz_tab3:
@@ -188,8 +224,21 @@ def create_data_visualization(results):
188
 
189
  fig = px.histogram(df, x="hour_of_day", nbins=24,
190
  title="Posts by Hour of Day")
191
- fig.update_layout(xaxis_title="Hour of Day (UTC)")
192
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  except Exception as e:
194
  st.error(f"Error processing dates: {str(e)}")
195
  else:
@@ -332,8 +381,18 @@ def main():
332
 
333
  # Show detailed post view
334
  st.subheader("Post Details")
335
- post_index = st.slider(f"Select post from r/{subreddit}",
336
- 0, max(0, len(posts)-1), 0)
 
 
 
 
 
 
 
 
 
 
337
 
338
  if len(posts) > 0:
339
  post = posts[post_index]
@@ -393,9 +452,18 @@ def main():
393
  # Tab 2: Visualizations
394
  with tab2:
395
  if st.session_state.results:
396
- # Apply current filters to visualization data
397
- filtered_results = filter_results(st.session_state.results, st.session_state.filters)
398
- create_data_visualization(filtered_results)
 
 
 
 
 
 
 
 
 
399
  else:
400
  st.info("Run a search to generate visualizations.")
401
 
 
113
  def create_data_visualization(results):
114
  """Create data visualizations based on results"""
115
  try:
116
+ # Check if we have any data
117
+ total_posts = sum(len(posts) for posts in results.values())
118
+ if total_posts == 0:
119
+ st.warning("No posts found matching your search criteria. Try adjusting your filters.")
120
+ return
121
+
122
  # Combine all results
123
  all_posts = []
124
  for subreddit, posts in results.items():
 
149
  if 'subreddit' not in df.columns:
150
  missing_columns.append('subreddit')
151
  st.error(f"Required column(s) missing: {', '.join(missing_columns)}")
152
+ st.write("Available columns:", df.columns.tolist())
153
  return
154
 
155
  # Create tabs for different visualizations
 
161
  st.subheader("Score Distribution")
162
  fig = px.histogram(df, x="score", color="subreddit", nbins=20,
163
  title="Distribution of Post Scores")
164
+ fig.update_layout(
165
+ xaxis_title="Score (Upvotes)",
166
+ yaxis_title="Number of Posts",
167
+ legend_title="Subreddit"
168
+ )
169
+ # Add error handling with detailed output
170
+ try:
171
+ st.plotly_chart(fig, use_container_width=True)
172
+ except Exception as e:
173
+ st.error(f"Error rendering plotly chart: {str(e)}")
174
+ # More detailed error info
175
+ import traceback
176
+ st.code(traceback.format_exc())
177
+ st.write("Figure data type:", type(fig))
178
  except Exception as e:
179
  st.error(f"Error creating Score Distribution: {str(e)}")
180
+ st.write("DataFrame head:", df.head())
181
 
182
  # Posts by Subreddit
183
  with viz_tab2:
 
185
  st.subheader("Posts by Subreddit")
186
  subreddit_counts = df['subreddit'].value_counts().reset_index()
187
  subreddit_counts.columns = ['subreddit', 'count']
188
+
189
  fig = px.bar(subreddit_counts, x='subreddit', y='count',
190
  title="Number of Matching Posts by Subreddit")
191
+ fig.update_layout(
192
+ xaxis_title="Subreddit",
193
+ yaxis_title="Number of Posts"
194
+ )
195
+
196
+ # Add error handling with detailed output
197
+ try:
198
+ st.plotly_chart(fig, use_container_width=True)
199
+ except Exception as e:
200
+ st.error(f"Error rendering plotly chart: {str(e)}")
201
+ # More detailed error info
202
+ import traceback
203
+ st.code(traceback.format_exc())
204
+ st.write("Figure data type:", type(fig))
205
  except Exception as e:
206
  st.error(f"Error creating Posts by Subreddit chart: {str(e)}")
207
+ st.write("DataFrame unique subreddits:", df['subreddit'].unique())
208
 
209
  # Time Analysis
210
  with viz_tab3:
 
224
 
225
  fig = px.histogram(df, x="hour_of_day", nbins=24,
226
  title="Posts by Hour of Day")
227
+ fig.update_layout(
228
+ xaxis_title="Hour of Day (UTC)",
229
+ yaxis_title="Number of Posts",
230
+ xaxis=dict(tickmode='linear', tick0=0, dtick=1) # Ensure all hours are shown
231
+ )
232
+
233
+ # Add error handling with detailed output
234
+ try:
235
+ st.plotly_chart(fig, use_container_width=True)
236
+ except Exception as e:
237
+ st.error(f"Error rendering plotly chart: {str(e)}")
238
+ # More detailed error info
239
+ import traceback
240
+ st.code(traceback.format_exc())
241
+ st.write("Figure data type:", type(fig))
242
  except Exception as e:
243
  st.error(f"Error processing dates: {str(e)}")
244
  else:
 
381
 
382
  # Show detailed post view
383
  st.subheader("Post Details")
384
+
385
+ # Handle the case where there are no posts or only one post
386
+ if len(posts) == 0:
387
+ st.info(f"No posts found to display details.")
388
+ elif len(posts) == 1:
389
+ # For a single post, no need for a slider
390
+ post_index = 0
391
+ st.info(f"Displaying the only post found.")
392
+ else:
393
+ # For multiple posts, create a slider
394
+ post_index = st.slider(f"Select post from r/{subreddit} ({len(posts)} posts)",
395
+ 0, len(posts)-1, 0)
396
 
397
  if len(posts) > 0:
398
  post = posts[post_index]
 
452
  # Tab 2: Visualizations
453
  with tab2:
454
  if st.session_state.results:
455
+ # Display loading state while generating visualizations
456
+ with st.spinner("Generating visualizations..."):
457
+ # Apply current filters to visualization data
458
+ filtered_results = filter_results(st.session_state.results, st.session_state.filters)
459
+
460
+ # Check if we have any results after filtering
461
+ total_posts = sum(len(posts) for posts in filtered_results.values())
462
+ if total_posts == 0:
463
+ st.warning("No posts match your current filters. Try adjusting your filter criteria.")
464
+ else:
465
+ # Continue with visualization
466
+ create_data_visualization(filtered_results)
467
  else:
468
  st.info("Run a search to generate visualizations.")
469
 
requirements.txt CHANGED
@@ -4,4 +4,4 @@ streamlit>=1.3.0
4
  plotly>=5.5.0
5
  matplotlib>=3.5.0
6
  python-dotenv>=0.20.0
7
- pyarrow>=6.0.0 # Arrow for DataFrame serialization
 
4
  plotly>=5.5.0
5
  matplotlib>=3.5.0
6
  python-dotenv>=0.20.0
7
+ pyarrow>=6.0.0 # Arrow for DataFrame serialization