MON3EMPASHA
/

imdb-movie-review-sentiment-analysis

@@ -1,141 +0,0 @@
-import pandas as pd
-import nltk
-import re
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize
-from nltk.stem import WordNetLemmatizer
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.model_selection import train_test_split
-from sklearn.linear_model import LogisticRegression
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
-import matplotlib.pyplot as plt
-import seaborn as sns
-import streamlit as st
-from collections import Counter
-import pickle
-import os
-# Download the NLTK resources used by the preprocessing step.
-# nltk.download() reports most failures (e.g. no network) by returning False
-# instead of raising, so the return value is checked in addition to catching
-# exceptions. Each resource is attempted independently so one failure does not
-# skip the remaining downloads.
-_nltk_failures = []
-for _resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
-    try:
-        # quiet=True keeps the downloader from printing on every Streamlit rerun.
-        if not nltk.download(_resource, quiet=True):
-            _nltk_failures.append(_resource)
-    except Exception as e:  # best-effort: report the problem and keep the app running
-        _nltk_failures.append(f"{_resource} ({e})")
-if _nltk_failures:
-    st.error(f"Could not download NLTK data: {', '.join(_nltk_failures)}")
-# Text Preprocessing Function
-# The stopword set and the lemmatizer do not depend on the input text, so they
-# are created on first use and reused by every later call instead of being
-# rebuilt for each review.
-_TEXT_TOOLS = {}
-def _get_text_tools():
-    """Return the shared (stop_words, lemmatizer) pair, building it on first use."""
-    if not _TEXT_TOOLS:
-        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
-        _TEXT_TOOLS['lemmatizer'] = WordNetLemmatizer()
-    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['lemmatizer']
-def preprocess_text(text):
-    """Normalize a review into a space-separated string of lemmatized tokens.
-
-    The text is lowercased, every character that is not an ASCII letter or
-    whitespace is removed, the result is tokenized, English stopwords are
-    dropped and the remaining tokens are lemmatized.
-
-    Args:
-        text: The raw review. Non-string values (for example the NaN that
-            pandas produces for an empty CSV cell) are treated as empty text.
-
-    Returns:
-        The cleaned tokens joined by single spaces; '' for non-string input.
-    """
-    # Guard against missing values so DataFrame.apply does not fail on .lower().
-    if not isinstance(text, str):
-        return ''
-    # Lowercase
-    text = text.lower()
-    # Remove special characters and digits
-    text = re.sub(r'[^a-zA-Z\s]', '', text)
-    stop_words, lemmatizer = _get_text_tools()
-    # Tokenize, drop stopwords, lemmatize, and join tokens back to a string
-    return ' '.join(
-        lemmatizer.lemmatize(word)
-        for word in word_tokenize(text)
-        if word not in stop_words
-    )
-# Load and Preprocess Dataset
-def load_and_preprocess_data(file_path="IMDB Dataset.csv"):
-    """Read the review CSV and add the columns needed for training.
-
-    Args:
-        file_path: Path of the CSV file. It must provide a 'review' column and
-            a 'sentiment' column.
-
-    Returns:
-        The DataFrame with an added 'cleaned_review' column (the output of
-        preprocess_text) and 'sentiment' encoded as 1 for 'positive' and 0 for
-        'negative' (any other label becomes NaN), or None when the file does
-        not exist; in that case an error is shown in the Streamlit app.
-    """
-    # Keep the try body to the one call that can raise FileNotFoundError.
-    try:
-        df = pd.read_csv(file_path)
-    except FileNotFoundError:
-        st.error(f"Could not find file '{file_path}'")
-        return None
-    # Apply preprocessing to reviews
-    df['cleaned_review'] = df['review'].apply(preprocess_text)
-    # Convert sentiment to binary (1 for positive, 0 for negative).
-    # Series.map yields a numeric column directly, whereas Series.replace on a
-    # string column depends on an implicit downcast that pandas has deprecated.
-    df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
-    return df
-# Train Models
-def train_models(df):
-    """Train Logistic Regression and Naive Bayes sentiment classifiers.
-
-    Args:
-        df: DataFrame with a 'cleaned_review' text column and a 'sentiment'
-            label column.
-
-    Returns:
-        A tuple (vectorizer, lr_model, nb_model, lr_accuracy, nb_accuracy):
-        the fitted TF-IDF vectorizer, the two fitted models, and each model's
-        accuracy on the held-out 20% test split.
-    """
-    # Split the raw text first: fitting TF-IDF on the whole corpus would let
-    # the test reviews influence the vocabulary and IDF weights (data leakage)
-    # and inflate the reported accuracies.
-    text_train, text_test, y_train, y_test = train_test_split(
-        df['cleaned_review'], df['sentiment'], test_size=0.2, random_state=42
-    )
-    # Convert text to TF-IDF features, learning them from the training split only
-    vectorizer = TfidfVectorizer(max_features=5000)
-    X_train = vectorizer.fit_transform(text_train)
-    X_test = vectorizer.transform(text_test)
-    # Logistic Regression
-    lr_model = LogisticRegression(max_iter=1000)
-    lr_model.fit(X_train, y_train)
-    lr_accuracy = accuracy_score(y_test, lr_model.predict(X_test))
-    # Naive Bayes
-    nb_model = MultinomialNB()
-    nb_model.fit(X_train, y_train)
-    nb_accuracy = accuracy_score(y_test, nb_model.predict(X_test))
-    return vectorizer, lr_model, nb_model, lr_accuracy, nb_accuracy
-# Streamlit App
-# Streamlit re-executes the script on every widget interaction. Caching the
-# trained artifacts keeps a button click from reloading the CSV and retraining
-# both models. NOTE(review): st.cache_resource requires Streamlit >= 1.18.
-@st.cache_resource(show_spinner=False)
-def _load_trained_models():
-    """Load the dataset and train the models; return train_models' tuple or None."""
-    df = load_and_preprocess_data()
-    if df is None:
-        return None
-    return train_models(df)
-def _show_prediction(title, model, input_vector):
-    """Render one model's predicted sentiment and its confidence under *title*."""
-    st.subheader(title)
-    if model.predict(input_vector)[0] == 1:
-        st.success("Positive Sentiment")
-    else:
-        st.error("Negative Sentiment")
-    st.write(f"Confidence: {max(model.predict_proba(input_vector)[0]):.2%}")
-def main():
-    """Run the Streamlit page: train (or reuse) the models and classify a review."""
-    st.title("IMDb Review Sentiment Analysis")
-    st.write("This app analyzes movie reviews to predict whether they are positive or negative.")
-    # Load data and train models; only this step runs under the spinner.
-    with st.spinner("Loading data and training models..."):
-        artifacts = _load_trained_models()
-    if artifacts is None:
-        # Drop the cached failure so the load is retried on the next rerun,
-        # e.g. after the CSV file has been put in place.
-        _load_trained_models.clear()
-        st.error("Failed to load data. Please check if 'IMDB Dataset.csv' is in the same directory.")
-        return
-    vectorizer, lr_model, nb_model, lr_accuracy, nb_accuracy = artifacts
-    st.success("Models trained successfully!")
-    # Display model accuracies
-    col1, col2 = st.columns(2)
-    with col1:
-        st.metric("Logistic Regression Accuracy", f"{lr_accuracy:.2%}")
-    with col2:
-        st.metric("Naive Bayes Accuracy", f"{nb_accuracy:.2%}")
-    # Text input for prediction
-    st.subheader("Predict Sentiment")
-    user_input = st.text_area("Enter a movie review:", height=150)
-    if not st.button("Predict Sentiment"):
-        return
-    # Whitespace-only input carries no review text, so treat it as empty.
-    if not user_input.strip():
-        st.warning("Please enter a review.")
-        return
-    # Preprocess input with the same pipeline that was used for training
-    cleaned_input = preprocess_text(user_input)
-    input_vector = vectorizer.transform([cleaned_input])
-    # Predict with both models and display the results side by side
-    col1, col2 = st.columns(2)
-    with col1:
-        _show_prediction("Logistic Regression", lr_model, input_vector)
-    with col2:
-        _show_prediction("Naive Bayes", nb_model, input_vector)
-if __name__ == "__main__":
-    main()