milwright committed on
Commit
d4ba5c3
·
1 Parent(s): de4f577

Fix data visualization issues with robust error handling

Browse files
Files changed (1) hide show
  1. advanced_scraper_ui.py +86 -40
advanced_scraper_ui.py CHANGED
@@ -109,47 +109,93 @@ def filter_results(results, filters):
109
 
110
  def create_data_visualization(results):
111
  """Create data visualizations based on results"""
112
- # Combine all results
113
- all_posts = []
114
- for subreddit, posts in results.items():
115
- for post in posts:
116
- post['subreddit'] = subreddit
117
- all_posts.append(post)
118
-
119
- if not all_posts:
120
- st.warning("No data to visualize.")
121
- return
122
-
123
- df = pd.DataFrame(all_posts)
124
-
125
- # Create tabs for different visualizations
126
- viz_tab1, viz_tab2, viz_tab3 = st.tabs(["Score Distribution", "Posts by Subreddit", "Time Analysis"])
127
-
128
- with viz_tab1:
129
- st.subheader("Score Distribution")
130
- fig = px.histogram(df, x="score", color="subreddit", nbins=20,
131
- title="Distribution of Post Scores")
132
- st.plotly_chart(fig, use_container_width=True)
133
-
134
- with viz_tab2:
135
- st.subheader("Posts by Subreddit")
136
- subreddit_counts = df['subreddit'].value_counts().reset_index()
137
- subreddit_counts.columns = ['subreddit', 'count']
138
- fig = px.bar(subreddit_counts, x='subreddit', y='count',
139
- title="Number of Matching Posts by Subreddit")
140
- st.plotly_chart(fig, use_container_width=True)
141
-
142
- with viz_tab3:
143
- st.subheader("Time Analysis")
144
- # Convert created_utc to datetime if it's not already
145
- if 'created_utc' in df.columns:
146
- df['created_date'] = pd.to_datetime(df['created_utc'])
147
- df['hour_of_day'] = df['created_date'].dt.hour
148
 
149
- fig = px.histogram(df, x="hour_of_day", nbins=24,
150
- title="Posts by Hour of Day")
151
- fig.update_layout(xaxis_title="Hour of Day (UTC)")
152
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
  def main():
155
  # Suppress the "No secrets files found" warning
 
109
 
110
  def create_data_visualization(results):
111
  """Create data visualizations based on results"""
112
+ try:
113
+ # Combine all results
114
+ all_posts = []
115
+ for subreddit, posts in results.items():
116
+ for post in posts:
117
+ try:
118
+ post_copy = post.copy()
119
+ post_copy['subreddit'] = subreddit
120
+ all_posts.append(post_copy)
121
+ except Exception as e:
122
+ st.warning(f"Skipping post due to error: {str(e)}")
123
+
124
+ if not all_posts:
125
+ st.warning("No data to visualize.")
126
+ return
127
+
128
+ # Create DataFrame with error handling
129
+ try:
130
+ df = pd.DataFrame(all_posts)
131
+ except Exception as e:
132
+ st.error(f"Could not create DataFrame: {str(e)}")
133
+ return
134
+
135
+ # Basic data validation
136
+ if 'score' not in df.columns or 'subreddit' not in df.columns:
137
+ missing_columns = []
138
+ if 'score' not in df.columns:
139
+ missing_columns.append('score')
140
+ if 'subreddit' not in df.columns:
141
+ missing_columns.append('subreddit')
142
+ st.error(f"Required column(s) missing: {', '.join(missing_columns)}")
143
+ return
 
 
 
 
144
 
145
+ # Create tabs for different visualizations
146
+ viz_tab1, viz_tab2, viz_tab3 = st.tabs(["Score Distribution", "Posts by Subreddit", "Time Analysis"])
147
+
148
+ # Score Distribution
149
+ with viz_tab1:
150
+ try:
151
+ st.subheader("Score Distribution")
152
+ fig = px.histogram(df, x="score", color="subreddit", nbins=20,
153
+ title="Distribution of Post Scores")
154
+ st.plotly_chart(fig, use_container_width=True)
155
+ except Exception as e:
156
+ st.error(f"Error creating Score Distribution: {str(e)}")
157
+
158
+ # Posts by Subreddit
159
+ with viz_tab2:
160
+ try:
161
+ st.subheader("Posts by Subreddit")
162
+ subreddit_counts = df['subreddit'].value_counts().reset_index()
163
+ subreddit_counts.columns = ['subreddit', 'count']
164
+ fig = px.bar(subreddit_counts, x='subreddit', y='count',
165
+ title="Number of Matching Posts by Subreddit")
166
+ st.plotly_chart(fig, use_container_width=True)
167
+ except Exception as e:
168
+ st.error(f"Error creating Posts by Subreddit chart: {str(e)}")
169
+
170
+ # Time Analysis
171
+ with viz_tab3:
172
+ try:
173
+ st.subheader("Time Analysis")
174
+ if 'created_utc' in df.columns:
175
+ try:
176
+ # Handle different date formats
177
+ df['created_date'] = pd.to_datetime(df['created_utc'], errors='coerce')
178
+
179
+ # Check if conversion was successful
180
+ if df['created_date'].isna().all():
181
+ st.warning("Could not parse date formats properly.")
182
+ return
183
+
184
+ df['hour_of_day'] = df['created_date'].dt.hour
185
+
186
+ fig = px.histogram(df, x="hour_of_day", nbins=24,
187
+ title="Posts by Hour of Day")
188
+ fig.update_layout(xaxis_title="Hour of Day (UTC)")
189
+ st.plotly_chart(fig, use_container_width=True)
190
+ except Exception as e:
191
+ st.error(f"Error processing dates: {str(e)}")
192
+ else:
193
+ st.warning("No date information available for Time Analysis.")
194
+ except Exception as e:
195
+ st.error(f"Error creating Time Analysis: {str(e)}")
196
+
197
+ except Exception as e:
198
+ st.error(f"Data visualization failed: {str(e)}")
199
 
200
  def main():
201
  # Suppress the "No secrets files found" warning