# Add warning suppression at the very beginning, before any other imports
import warnings
warnings.filterwarnings("ignore", message="No secrets files found.*")

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import time
import os
import json
import traceback
from datetime import datetime
from dotenv import load_dotenv
from enhanced_scraper import EnhancedRedditScraper

# Disable static file serving to prevent the warning
os.environ['STREAMLIT_SERVER_ENABLE_STATIC_SERVING'] = 'false'

# Note: Page configuration and session state initialization are handled in app.py

# Functions
def initialize_scraper(client_id, client_secret, user_agent):
    """Initialize the scraper with API credentials"""
    try:
        scraper = EnhancedRedditScraper(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )
        st.session_state.scraper = scraper
        return True
    except Exception as e:
        st.error(f"Failed to initialize scraper: {str(e)}")
        return False


def run_search(subreddits, keywords, limit, sort_by, include_comments, include_selftext, min_score):
    """Run the search with provided parameters"""
    if not st.session_state.scraper:
        st.error("Scraper not initialized. Please set up API credentials first.")
        return False

    try:
        with st.spinner("Scraping Reddit..."):
            if len(subreddits) == 1:
                # Single subreddit search
                results = st.session_state.scraper.scrape_subreddit(
                    subreddit_name=subreddits[0],
                    keywords=keywords,
                    limit=limit,
                    sort_by=sort_by,
                    include_comments=include_comments,
                    include_selftext=include_selftext,
                    min_score=min_score
                )
                st.session_state.results = {subreddits[0]: results}
            else:
                # Multiple subreddit search
                results = st.session_state.scraper.search_multiple_subreddits(
                    subreddits=subreddits,
                    keywords=keywords,
                    limit=limit,
                    sort_by=sort_by,
                    include_comments=include_comments,
                    include_selftext=include_selftext,
                    min_score=min_score
                )
                st.session_state.results = results

            # Add to search history
            search_info = {
                'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'subreddits': subreddits,
                'keywords': keywords,
                'total_results': sum(len(posts) for posts in st.session_state.results.values())
            }
            st.session_state.search_history.append(search_info)

        return True
    except Exception as e:
        st.error(f"Search failed: {str(e)}")
        return False


def filter_results(results, filters):
    """Apply filters to results"""
    filtered = {}
    for subreddit, posts in results.items():
        filtered_posts = []
        for post in posts:
            # Apply score filter
            if post['score'] < filters['min_score']:
                continue

            # Apply date filters if set
            if filters['date_from'] or filters['date_to']:
                post_date = datetime.strptime(post['created_utc'], '%Y-%m-%d %H:%M:%S')
                if filters['date_from'] and post_date < filters['date_from']:
                    continue
                if filters['date_to'] and post_date > filters['date_to']:
                    continue

            # Filter for posts with comments if requested
            if filters['show_only_with_comments'] and (
                    'matching_comments' not in post or not post['matching_comments']):
                continue

            filtered_posts.append(post)
        filtered[subreddit] = filtered_posts
    return filtered
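
# For reference: filter_results above relies on each post dict carrying at
# least 'score' (int), 'created_utc' (string formatted '%Y-%m-%d %H:%M:%S'),
# and optionally 'matching_comments' (list). The full schema is defined by
# EnhancedRedditScraper; this is only a sketch of one conforming record,
# and the 'title' field is illustrative (not read in this module):
#
#   {
#       'title': 'Registration question',
#       'score': 42,
#       'created_utc': '2024-01-15 09:30:00',
#       'matching_comments': ['I had the same issue last semester...'],
#   }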


def create_data_visualization(results):
    """Create data visualizations based on results"""
    try:
        # Check if we have any data
        total_posts = sum(len(posts) for posts in results.values())
        if total_posts == 0:
            st.warning("No posts found matching your search criteria. Try adjusting your filters.")
            return

        # Combine all results
        all_posts = []
        for subreddit, posts in results.items():
            for post in posts:
                try:
                    post_copy = post.copy()
                    post_copy['subreddit'] = subreddit
                    all_posts.append(post_copy)
                except Exception as e:
                    st.warning(f"Skipping post due to error: {str(e)}")

        if not all_posts:
            st.warning("No data to visualize.")
            return

        # Create DataFrame with error handling
        try:
            df = pd.DataFrame(all_posts)
        except Exception as e:
            st.error(f"Could not create DataFrame: {str(e)}")
            return

        # Basic data validation
        missing_columns = [col for col in ('score', 'subreddit') if col not in df.columns]
        if missing_columns:
            st.error(f"Required column(s) missing: {', '.join(missing_columns)}")
            st.write("Available columns:", df.columns.tolist())
            return

        # Create tabs for different visualizations
        viz_tab1, viz_tab2, viz_tab3 = st.tabs(["Score Distribution", "Posts by Subreddit", "Time Analysis"])

        # Score Distribution
        with viz_tab1:
            try:
                fig = px.histogram(df, x="score", color="subreddit", nbins=20,
                                   title="Distribution of Post Scores")
                fig.update_layout(
                    xaxis_title="Score (Upvotes)",
                    yaxis_title="Number of Posts",
                    legend_title="Subreddit"
                )
                # Add error handling with detailed output
                try:
                    st.plotly_chart(fig, use_container_width=True)
                except Exception as e:
                    st.error(f"Error rendering plotly chart: {str(e)}")
                    st.code(traceback.format_exc())
                    st.write("Figure data type:", type(fig))
            except Exception as e:
                st.error(f"Error creating Score Distribution: {str(e)}")
                st.write("DataFrame head:", df.head())

        # Posts by Subreddit
        with viz_tab2:
            try:
                subreddit_counts = df['subreddit'].value_counts().reset_index()
                subreddit_counts.columns = ['subreddit', 'count']
                fig = px.bar(subreddit_counts, x='subreddit', y='count',
                             title="Number of Matching Posts by Subreddit")
                fig.update_layout(
                    xaxis_title="Subreddit",
                    yaxis_title="Number of Posts"
                )
                # Add error handling with detailed output
                try:
                    st.plotly_chart(fig, use_container_width=True)
                except Exception as e:
                    st.error(f"Error rendering plotly chart: {str(e)}")
                    st.code(traceback.format_exc())
                    st.write("Figure data type:", type(fig))
            except Exception as e:
                st.error(f"Error creating Posts by Subreddit chart: {str(e)}")
                st.write("DataFrame unique subreddits:", df['subreddit'].unique())

        # Time Analysis
        with viz_tab3:
            try:
                if 'created_utc' in df.columns:
                    try:
                        # Handle different date formats
                        df['created_date'] = pd.to_datetime(df['created_utc'], errors='coerce')

                        # Check if conversion was successful
                        if df['created_date'].isna().all():
                            st.warning("Could not parse date formats properly.")
                            return

                        df['hour_of_day'] = df['created_date'].dt.hour
                        fig = px.histogram(df, x="hour_of_day", nbins=24,
                                           title="Posts by Hour of Day")
                        fig.update_layout(
                            xaxis_title="Hour of Day (UTC)",
                            yaxis_title="Number of Posts",
                            xaxis=dict(tickmode='linear', tick0=0, dtick=1)  # Ensure all hours are shown
                        )
                        # Add error handling with detailed output
                        try:
                            st.plotly_chart(fig, use_container_width=True)
                        except Exception as e:
                            st.error(f"Error rendering plotly chart: {str(e)}")
                            st.code(traceback.format_exc())
                            st.write("Figure data type:", type(fig))
                    except Exception as e:
                        st.error(f"Error processing dates: {str(e)}")
                else:
                    st.warning("No date information available for Time Analysis.")
            except Exception as e:
                st.error(f"Error creating Time Analysis: {str(e)}")

    except Exception as e:
        st.error(f"Data visualization failed: {str(e)}")
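
# Typical call site (a sketch only; the actual wiring happens in the tab code
# in main(), which is truncated below): apply the session-state filters first,
# then hand the filtered dict to the visualization function:
#
#   filtered = filter_results(st.session_state.results, st.session_state.filters)
#   create_data_visualization(filtered)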

def main():
    # Suppress the "No secrets files found" warning
    warnings.filterwarnings("ignore", message="No secrets files found.*")

    # Ensure session state variables are initialized
    if 'results' not in st.session_state:
        st.session_state['results'] = None
    if 'scraper' not in st.session_state:
        st.session_state['scraper'] = None
    if 'search_history' not in st.session_state:
        st.session_state['search_history'] = []
    if 'filters' not in st.session_state:
        st.session_state['filters'] = {
            'min_score': 0,
            'date_from': None,
            'date_to': None,
            'show_only_with_comments': False
        }

    # Header using Streamlit's native heading components
    st.title("Reddit Scraper")
    st.header("Data Collection Tool")

    # Sidebar for configuration
    with st.sidebar:
        st.header("Configuration")

        # Search Parameters
        st.subheader("Search Parameters")

        # Multiple subreddit input
        subreddits_input = st.text_area("Subreddits (one per line)", value="cuny\ncollegequestions")
        subreddits = [s.strip() for s in subreddits_input.split("\n") if s.strip()]

        # Keywords input
        keywords_input = st.text_area("Keywords (one per line)", value="question\nhelp\nconfused")
        keywords = [k.strip() for k in keywords_input.split("\n") if k.strip()]

        # Other parameters
        limit = st.slider("Number of posts to scan per subreddit", 10, 200, 50)
        sort_by = st.selectbox("Sort posts by", ["hot", "new", "top", "rising"], index=0)
        include_selftext = st.checkbox("Include post content in search", value=True)
        include_comments = st.checkbox("Include comments in search", value=True)
        min_score = st.slider("Minimum score (upvotes)", 0, 1000, 0)

        # Action buttons
        search_col, clear_col = st.columns(2)
        with search_col:
            search_button = st.button("Run Search", type="primary", use_container_width=True)
        with clear_col:
            clear_button = st.button("Clear Results", type="secondary", use_container_width=True)

    # Main interface tabs
    tab1, tab2, tab3, tab4, tab5 = st.tabs(["Results", "Visualizations", "Export", "History", "API Credentials"])

    # Handle Actions
    if clear_button:
        st.session_state.results = None
        st.rerun()

    if search_button:
        if not subreddits:
            st.error("Please enter at least one subreddit to search.")
        elif not keywords:
            st.error("Please enter at least one keyword to search.")
        else:
            success = run_search(
                subreddits=subreddits,
                keywords=keywords,
                limit=limit,
                sort_by=sort_by,
                include_comments=include_comments,
                include_selftext=include_selftext,
                min_score=min_score
            )
            if success:
                st.success(f"Search completed! Found results in {len(st.session_state.results)} subreddits.")

    # Tab 1: Results
    with tab1:
        if st.session_state.results:
            # Post-search filters
            st.markdown('