import re
from collections import Counter
from functools import lru_cache

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
import streamlit as st
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Download NLTK resources (run once)
# nltk.download() normally reports a failed download by returning False rather
# than raising, so the return value is checked in addition to catching errors.
_NLTK_RESOURCES = ('punkt', 'stopwords', 'wordnet', 'punkt_tab')
try:
    _failed_resources = [
        name for name in _NLTK_RESOURCES if not nltk.download(name, quiet=True)
    ]
except Exception as e:  # best effort: the data may already be installed locally
    print(f"Warning: Could not download NLTK data: {e}")
    print("Please run: import nltk; nltk.download('all') in Python console")
else:
    if _failed_resources:
        print(f"Warning: Could not download NLTK data: {', '.join(_failed_resources)}")
        print("Please run: import nltk; nltk.download('all') in Python console")

# Compiled once; preprocess_text is applied to every row of the dataset.
_HTML_TAG_RE = re.compile(r'</?[a-z][^<>]*>')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


@lru_cache(maxsize=1)
def _get_stop_words():
    """Return the English stop-word set, loaded from the NLTK corpus on first use."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _get_lemmatizer():
    """Return a shared WordNetLemmatizer instance, created on first use."""
    return WordNetLemmatizer()


# 1. Text Preprocessing Function
def preprocess_text(text):
    """Normalize a raw review into a space-separated string of lemmatized tokens.

    Steps: lowercase, strip HTML tags, drop every character that is not an
    ASCII letter or whitespace, tokenize, remove English stop words, lemmatize.

    Args:
        text: Raw review text. Anything that is not a ``str`` (for example a
            NaN cell from pandas) is treated as an empty review.

    Returns:
        The cleaned text; an empty string when nothing survives cleaning.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Raw reviews may contain HTML markup such as "<br />". Replace tags with a
    # space so the tag name does not survive as a bogus token ("br") and the
    # words on either side of the tag are not glued together.
    text = _HTML_TAG_RE.sub(' ', text)
    # Remove special characters and digits
    text = _NON_LETTER_RE.sub('', text)

    stop_words = _get_stop_words()
    lemmatize = _get_lemmatizer().lemmatize
    tokens = [
        lemmatize(word) for word in word_tokenize(text) if word not in stop_words
    ]
    return ' '.join(tokens)


# 2. Load and Preprocess Dataset
def load_and_preprocess_data(file_path="IMDB Dataset.csv"):
    """Load the review CSV, clean the text, and encode sentiment as 0/1.

    Args:
        file_path: Path to a CSV file with ``review`` and ``sentiment`` columns.

    Returns:
        A DataFrame with an added ``cleaned_review`` column and an integer
        ``sentiment`` column (1 for positive, 0 for negative), or ``None`` when
        the file is missing or lacks a required column.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: Could not find file '{file_path}'")
        print("Please make sure the CSV file is in the same directory as this script.")
        return None

    missing_columns = {'review', 'sentiment'} - set(df.columns)
    if missing_columns:
        print(f"Error: '{file_path}' is missing required column(s): {sorted(missing_columns)}")
        return None

    # Convert sentiment to binary (1 for positive, 0 for negative). An explicit
    # map + astype(int) guarantees an integer label column; Series.replace
    # relies on silent downcasting that newer pandas releases deprecate.
    # Already-encoded 0/1 labels pass through unchanged.
    labels = df['sentiment'].map({'positive': 1, 'negative': 0, 1: 1, 0: 0})

    unusable = labels.isna() | df['review'].isna()
    if unusable.any():
        print(
            f"Warning: dropping {int(unusable.sum())} row(s) with a missing review "
            "or an unrecognized sentiment label"
        )
        df = df.loc[~unusable].copy()
        labels = labels.loc[~unusable]
    df['sentiment'] = labels.astype(int)

    # Apply preprocessing to reviews
    df['cleaned_review'] = df['review'].apply(preprocess_text)

    return df
# 3. Train and Evaluate Models
def train_and_evaluate(df):
    """Train Logistic Regression and Naive Bayes classifiers on TF-IDF features.

    The data is split before the vectorizer is fitted, so the vocabulary and
    IDF weights are learned from the training reviews only and the reported
    test metrics are not inflated by information from the held-out set.

    Args:
        df: DataFrame with a ``cleaned_review`` text column and a ``sentiment``
            label column.

    Returns:
        A tuple ``(vectorizer, lr_model, nb_model, X_test, y_test)`` where
        ``X_test`` is the TF-IDF matrix of the held-out reviews.
    """
    # Split data (20% held out, fixed seed for reproducibility)
    train_text, test_text, y_train, y_test = train_test_split(
        df['cleaned_review'], df['sentiment'], test_size=0.2, random_state=42
    )

    # Convert text to TF-IDF features: fit on the training split only, then
    # apply the same fitted transform to the test split.
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_text)
    X_test = vectorizer.transform(test_text)

    # Logistic Regression
    lr_model = LogisticRegression(max_iter=1000)
    lr_model.fit(X_train, y_train)
    lr_predictions = lr_model.predict(X_test)
    lr_accuracy = accuracy_score(y_test, lr_predictions)
    print("Logistic Regression Accuracy:", lr_accuracy)
    print("Logistic Regression Classification Report:\n",
          classification_report(y_test, lr_predictions))

    # Naive Bayes
    nb_model = MultinomialNB()
    nb_model.fit(X_train, y_train)
    nb_predictions = nb_model.predict(X_test)
    nb_accuracy = accuracy_score(y_test, nb_predictions)
    print("Naive Bayes Accuracy:", nb_accuracy)
    print("Naive Bayes Classification Report:\n",
          classification_report(y_test, nb_predictions))

    return vectorizer, lr_model, nb_model, X_test, y_test


def _count_words(reviews):
    """Count whitespace-separated tokens across an iterable of review strings."""
    counts = Counter()
    for review in reviews:
        # Updating per review avoids building one giant joined string.
        counts.update(review.split())
    return counts


def _plot_top_words(ax, word_counts, title):
    """Draw a horizontal bar chart of ``(word, count)`` pairs on ``ax``."""
    if word_counts:  # leave the panel empty rather than plotting no data
        sns.barplot(
            x=[count for _, count in word_counts],
            y=[word for word, _ in word_counts],
            ax=ax,
        )
    ax.set_title(title)


# 4. Visualize Frequent Words
def visualize_frequent_words(df, top_n=10, output_path='word_frequency.png'):
    """Save side-by-side bar charts of the most frequent words per sentiment.

    Args:
        df: DataFrame with ``cleaned_review`` text and a ``sentiment`` column
            holding 1 (positive) or 0 (negative).
        top_n: Number of most common words to show per class.
        output_path: Where the figure is written.
    """
    # Separate positive and negative reviews
    positive_reviews = df.loc[df['sentiment'] == 1, 'cleaned_review']
    negative_reviews = df.loc[df['sentiment'] == 0, 'cleaned_review']

    # Get top words for each class
    positive_freq = _count_words(positive_reviews).most_common(top_n)
    negative_freq = _count_words(negative_reviews).most_common(top_n)

    # Plot on an explicit figure so no global pyplot state is left behind
    fig, (positive_ax, negative_ax) = plt.subplots(1, 2, figsize=(12, 5))
    _plot_top_words(positive_ax, positive_freq, f'Top {top_n} Positive Words')
    _plot_top_words(negative_ax, negative_freq, f'Top {top_n} Negative Words')
    fig.tight_layout()
    fig.savefig(output_path)
    plt.close(fig)
# 5. Streamlit App for Model Deployment
def run_streamlit_app(vectorizer, lr_model, nb_model):
    """Render the sentiment-prediction UI and score the entered review.

    Args:
        vectorizer: Fitted text vectorizer used to transform the cleaned review.
        lr_model: Fitted Logistic Regression classifier.
        nb_model: Fitted Naive Bayes classifier.
    """
    st.title("IMDb Review Sentiment Analysis")
    st.write("Enter a movie review to predict its sentiment (positive or negative).")

    # Text input
    user_input = st.text_area("Enter your review:", "")

    if not st.button("Predict Sentiment"):
        return

    # Whitespace-only input counts as empty.
    if not user_input.strip():
        st.write("Please enter a review.")
        return

    # Preprocess input
    cleaned_input = preprocess_text(user_input)
    if not cleaned_input:
        # Only stop words / symbols were entered: the feature vector would be
        # all zeros and any "prediction" would be meaningless.
        st.write("Please enter a review with a few more descriptive words.")
        return
    input_vector = vectorizer.transform([cleaned_input])

    # Predict with both models and display results
    for heading, model in (
        ("Logistic Regression", lr_model),
        ("Naive Bayes", nb_model),
    ):
        prediction = model.predict(input_vector)[0]
        probabilities = model.predict_proba(input_vector)[0]
        st.write(f"### {heading} Prediction")
        st.write(f"Sentiment: {'Positive' if prediction == 1 else 'Negative'}")
        st.write(f"Confidence: {max(probabilities):.2f}")


class _DataLoadError(Exception):
    """Raised when the dataset cannot be loaded, so the failure is not cached."""


def _train_pipeline(file_path):
    """Load the data, train both models, and save the word-frequency chart.

    Returns:
        A tuple ``(vectorizer, lr_model, nb_model)``.

    Raises:
        _DataLoadError: If ``load_and_preprocess_data`` returns ``None``.
    """
    df = load_and_preprocess_data(file_path)
    if df is None:
        raise _DataLoadError(file_path)

    # Train and evaluate models
    vectorizer, lr_model, nb_model, _X_test, _y_test = train_and_evaluate(df)

    # Visualize frequent words
    visualize_frequent_words(df)

    return vectorizer, lr_model, nb_model


def main():
    """Train (or reuse cached) models, then run the Streamlit app."""
    file_path = "IMDB Dataset.csv"

    # Streamlit re-executes this script on every widget interaction. Caching
    # the pipeline keeps the dataset from being re-cleaned and the models from
    # being retrained on each button click. Requires a Streamlit release that
    # provides st.cache_resource.
    cached_pipeline = st.cache_resource(_train_pipeline)

    try:
        vectorizer, lr_model, nb_model = cached_pipeline(file_path)
    except _DataLoadError:
        print("Exiting due to data loading error.")
        # Console output is not visible in the browser; surface it there too.
        st.error("Could not load the dataset. See the console output for details.")
        return

    # Run Streamlit app
    run_streamlit_app(vectorizer, lr_model, nb_model)


# Main execution
if __name__ == "__main__":
    main()