Spaces:
Running
Running
Debug + enhance data visualization
Browse files
- LICENSE +3 -2
- advanced_scraper_ui.py +77 -9
- requirements.txt +1 -1
LICENSE
CHANGED
@@ -1,11 +1,12 @@
|
|
1 |
GNU GENERAL PUBLIC LICENSE
|
2 |
-
|
|
|
3 |
|
4 |
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
|
5 |
Everyone is permitted to copy and distribute verbatim copies
|
6 |
of this license document, but changing it is not allowed.
|
7 |
|
8 |
-
|
9 |
|
10 |
The GNU General Public License is a free, copyleft license for
|
11 |
software and other kinds of works.
|
|
|
1 |
GNU GENERAL PUBLIC LICENSE
|
2 |
+
|
3 |
+
Version 3, 29 June 2007
|
4 |
|
5 |
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
|
6 |
Everyone is permitted to copy and distribute verbatim copies
|
7 |
of this license document, but changing it is not allowed.
|
8 |
|
9 |
+
Preamble
|
10 |
|
11 |
The GNU General Public License is a free, copyleft license for
|
12 |
software and other kinds of works.
|
advanced_scraper_ui.py
CHANGED
@@ -113,6 +113,12 @@ def filter_results(results, filters):
|
|
113 |
def create_data_visualization(results):
|
114 |
"""Create data visualizations based on results"""
|
115 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
# Combine all results
|
117 |
all_posts = []
|
118 |
for subreddit, posts in results.items():
|
@@ -143,6 +149,7 @@ def create_data_visualization(results):
|
|
143 |
if 'subreddit' not in df.columns:
|
144 |
missing_columns.append('subreddit')
|
145 |
st.error(f"Required column(s) missing: {', '.join(missing_columns)}")
|
|
|
146 |
return
|
147 |
|
148 |
# Create tabs for different visualizations
|
@@ -154,9 +161,23 @@ def create_data_visualization(results):
|
|
154 |
st.subheader("Score Distribution")
|
155 |
fig = px.histogram(df, x="score", color="subreddit", nbins=20,
|
156 |
title="Distribution of Post Scores")
|
157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
except Exception as e:
|
159 |
st.error(f"Error creating Score Distribution: {str(e)}")
|
|
|
160 |
|
161 |
# Posts by Subreddit
|
162 |
with viz_tab2:
|
@@ -164,11 +185,26 @@ def create_data_visualization(results):
|
|
164 |
st.subheader("Posts by Subreddit")
|
165 |
subreddit_counts = df['subreddit'].value_counts().reset_index()
|
166 |
subreddit_counts.columns = ['subreddit', 'count']
|
|
|
167 |
fig = px.bar(subreddit_counts, x='subreddit', y='count',
|
168 |
title="Number of Matching Posts by Subreddit")
|
169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
except Exception as e:
|
171 |
st.error(f"Error creating Posts by Subreddit chart: {str(e)}")
|
|
|
172 |
|
173 |
# Time Analysis
|
174 |
with viz_tab3:
|
@@ -188,8 +224,21 @@ def create_data_visualization(results):
|
|
188 |
|
189 |
fig = px.histogram(df, x="hour_of_day", nbins=24,
|
190 |
title="Posts by Hour of Day")
|
191 |
-
fig.update_layout(
|
192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
193 |
except Exception as e:
|
194 |
st.error(f"Error processing dates: {str(e)}")
|
195 |
else:
|
@@ -332,8 +381,18 @@ def main():
|
|
332 |
|
333 |
# Show detailed post view
|
334 |
st.subheader("Post Details")
|
335 |
-
|
336 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
337 |
|
338 |
if len(posts) > 0:
|
339 |
post = posts[post_index]
|
@@ -393,9 +452,18 @@ def main():
|
|
393 |
# Tab 2: Visualizations
|
394 |
with tab2:
|
395 |
if st.session_state.results:
|
396 |
-
#
|
397 |
-
|
398 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
399 |
else:
|
400 |
st.info("Run a search to generate visualizations.")
|
401 |
|
|
|
113 |
def create_data_visualization(results):
|
114 |
"""Create data visualizations based on results"""
|
115 |
try:
|
116 |
+
# Check if we have any data
|
117 |
+
total_posts = sum(len(posts) for posts in results.values())
|
118 |
+
if total_posts == 0:
|
119 |
+
st.warning("No posts found matching your search criteria. Try adjusting your filters.")
|
120 |
+
return
|
121 |
+
|
122 |
# Combine all results
|
123 |
all_posts = []
|
124 |
for subreddit, posts in results.items():
|
|
|
149 |
if 'subreddit' not in df.columns:
|
150 |
missing_columns.append('subreddit')
|
151 |
st.error(f"Required column(s) missing: {', '.join(missing_columns)}")
|
152 |
+
st.write("Available columns:", df.columns.tolist())
|
153 |
return
|
154 |
|
155 |
# Create tabs for different visualizations
|
|
|
161 |
st.subheader("Score Distribution")
|
162 |
fig = px.histogram(df, x="score", color="subreddit", nbins=20,
|
163 |
title="Distribution of Post Scores")
|
164 |
+
fig.update_layout(
|
165 |
+
xaxis_title="Score (Upvotes)",
|
166 |
+
yaxis_title="Number of Posts",
|
167 |
+
legend_title="Subreddit"
|
168 |
+
)
|
169 |
+
# Add error handling with detailed output
|
170 |
+
try:
|
171 |
+
st.plotly_chart(fig, use_container_width=True)
|
172 |
+
except Exception as e:
|
173 |
+
st.error(f"Error rendering plotly chart: {str(e)}")
|
174 |
+
# More detailed error info
|
175 |
+
import traceback
|
176 |
+
st.code(traceback.format_exc())
|
177 |
+
st.write("Figure data type:", type(fig))
|
178 |
except Exception as e:
|
179 |
st.error(f"Error creating Score Distribution: {str(e)}")
|
180 |
+
st.write("DataFrame head:", df.head())
|
181 |
|
182 |
# Posts by Subreddit
|
183 |
with viz_tab2:
|
|
|
185 |
st.subheader("Posts by Subreddit")
|
186 |
subreddit_counts = df['subreddit'].value_counts().reset_index()
|
187 |
subreddit_counts.columns = ['subreddit', 'count']
|
188 |
+
|
189 |
fig = px.bar(subreddit_counts, x='subreddit', y='count',
|
190 |
title="Number of Matching Posts by Subreddit")
|
191 |
+
fig.update_layout(
|
192 |
+
xaxis_title="Subreddit",
|
193 |
+
yaxis_title="Number of Posts"
|
194 |
+
)
|
195 |
+
|
196 |
+
# Add error handling with detailed output
|
197 |
+
try:
|
198 |
+
st.plotly_chart(fig, use_container_width=True)
|
199 |
+
except Exception as e:
|
200 |
+
st.error(f"Error rendering plotly chart: {str(e)}")
|
201 |
+
# More detailed error info
|
202 |
+
import traceback
|
203 |
+
st.code(traceback.format_exc())
|
204 |
+
st.write("Figure data type:", type(fig))
|
205 |
except Exception as e:
|
206 |
st.error(f"Error creating Posts by Subreddit chart: {str(e)}")
|
207 |
+
st.write("DataFrame unique subreddits:", df['subreddit'].unique())
|
208 |
|
209 |
# Time Analysis
|
210 |
with viz_tab3:
|
|
|
224 |
|
225 |
fig = px.histogram(df, x="hour_of_day", nbins=24,
|
226 |
title="Posts by Hour of Day")
|
227 |
+
fig.update_layout(
|
228 |
+
xaxis_title="Hour of Day (UTC)",
|
229 |
+
yaxis_title="Number of Posts",
|
230 |
+
xaxis=dict(tickmode='linear', tick0=0, dtick=1) # Ensure all hours are shown
|
231 |
+
)
|
232 |
+
|
233 |
+
# Add error handling with detailed output
|
234 |
+
try:
|
235 |
+
st.plotly_chart(fig, use_container_width=True)
|
236 |
+
except Exception as e:
|
237 |
+
st.error(f"Error rendering plotly chart: {str(e)}")
|
238 |
+
# More detailed error info
|
239 |
+
import traceback
|
240 |
+
st.code(traceback.format_exc())
|
241 |
+
st.write("Figure data type:", type(fig))
|
242 |
except Exception as e:
|
243 |
st.error(f"Error processing dates: {str(e)}")
|
244 |
else:
|
|
|
381 |
|
382 |
# Show detailed post view
|
383 |
st.subheader("Post Details")
|
384 |
+
|
385 |
+
# Handle the case where there are no posts or only one post
|
386 |
+
if len(posts) == 0:
|
387 |
+
st.info(f"No posts found to display details.")
|
388 |
+
elif len(posts) == 1:
|
389 |
+
# For a single post, no need for a slider
|
390 |
+
post_index = 0
|
391 |
+
st.info(f"Displaying the only post found.")
|
392 |
+
else:
|
393 |
+
# For multiple posts, create a slider
|
394 |
+
post_index = st.slider(f"Select post from r/{subreddit} ({len(posts)} posts)",
|
395 |
+
0, len(posts)-1, 0)
|
396 |
|
397 |
if len(posts) > 0:
|
398 |
post = posts[post_index]
|
|
|
452 |
# Tab 2: Visualizations
|
453 |
with tab2:
|
454 |
if st.session_state.results:
|
455 |
+
# Display loading state while generating visualizations
|
456 |
+
with st.spinner("Generating visualizations..."):
|
457 |
+
# Apply current filters to visualization data
|
458 |
+
filtered_results = filter_results(st.session_state.results, st.session_state.filters)
|
459 |
+
|
460 |
+
# Check if we have any results after filtering
|
461 |
+
total_posts = sum(len(posts) for posts in filtered_results.values())
|
462 |
+
if total_posts == 0:
|
463 |
+
st.warning("No posts match your current filters. Try adjusting your filter criteria.")
|
464 |
+
else:
|
465 |
+
# Continue with visualization
|
466 |
+
create_data_visualization(filtered_results)
|
467 |
else:
|
468 |
st.info("Run a search to generate visualizations.")
|
469 |
|
requirements.txt
CHANGED
@@ -4,4 +4,4 @@ streamlit>=1.3.0
|
|
4 |
plotly>=5.5.0
|
5 |
matplotlib>=3.5.0
|
6 |
python-dotenv>=0.20.0
|
7 |
-
pyarrow>=6.0.0 # Arrow for DataFrame serialization
|
|
|
4 |
plotly>=5.5.0
|
5 |
matplotlib>=3.5.0
|
6 |
python-dotenv>=0.20.0
|
7 |
+
pyarrow>=6.0.0 # Arrow for DataFrame serialization
|