import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
from collections import Counter

# Download the NLTK resources used by preprocess_text(). This runs on every
# execution of the script, not just once.
try:
    # nltk.download() signals most failures (for example, no network access)
    # by returning False instead of raising, so the return value has to be
    # checked as well as catching exceptions.
    _failed_nltk_packages = [
        package
        for package in ('punkt', 'stopwords', 'wordnet', 'punkt_tab')
        if not nltk.download(package)
    ]
    if _failed_nltk_packages:
        print(f"Warning: Could not download NLTK data: {', '.join(_failed_nltk_packages)}")
        print("Please run: import nltk; nltk.download('all') in Python console")
except Exception as e:
    print(f"Warning: Could not download NLTK data: {e}")
    print("Please run: import nltk; nltk.download('all') in Python console")

# 1. Text Preprocessing Function
# Patterns are compiled once because preprocess_text() runs for every review.
# The tag pattern is applied after lowercasing, hence the lowercase class.
_HTML_TAG_RE = re.compile(r'</?[a-z][^<>]*>')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Filled on first use so the stop-word corpus is read once, not once per call.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it once."""
    if not _NLP_RESOURCES:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Store both together so a failure above never leaves a half-filled cache.
        _NLP_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalize a raw review into a space-separated string of lemmas.

    Steps: lowercase, replace HTML tags (e.g. ``<br />``) with a space, drop
    every character that is not an ASCII letter or whitespace, tokenize,
    remove English stop words and lemmatize the remaining tokens.

    Args:
        text: The raw review. Anything that is not a ``str`` (such as ``None``
            or a NaN read from a CSV) is treated as an empty review.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing
        usable remains.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Tags must go before the letter filter; otherwise "<br />" survives as
    # the junk token "br".
    text = _HTML_TAG_RE.sub(' ', text)
    text = _NON_LETTER_RE.sub('', text)
    if not text.strip():
        # Nothing left to tokenize, so skip the NLTK work entirely.
        return ''

    stop_words, lemmatizer = _get_nlp_resources()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(tokens)

# 2. Load and Preprocess Dataset
def load_and_preprocess_data(file_path="IMDB Dataset.csv"):
    """Load the review CSV and prepare the columns used for training.

    Adds a ``cleaned_review`` column (``preprocess_text`` applied to
    ``review``) and recodes ``sentiment`` as integers: 1 for positive,
    0 for negative.

    Args:
        file_path: Path to a CSV file with ``review`` and ``sentiment`` columns.

    Returns:
        The prepared DataFrame, or ``None`` if ``file_path`` does not exist.

    Raises:
        KeyError: If a required column is missing from the file.
        ValueError: If ``sentiment`` holds a value other than
            ``'positive'``/``'negative'`` (or an already encoded 1/0).
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: Could not find file '{file_path}'")
        print("Please make sure the CSV file is in the same directory as this script.")
        return None

    missing_columns = [name for name in ('review', 'sentiment') if name not in df.columns]
    if missing_columns:
        raise KeyError(f"'{file_path}' is missing required column(s): {missing_columns}")

    # Empty cells are read as NaN (a float); turn them into empty strings so
    # the string-based cleaning step does not fail on them.
    df['cleaned_review'] = df['review'].fillna('').astype(str).apply(preprocess_text)

    # map() plus an explicit cast always yields an integer column. The previous
    # replace() call relied on pandas implicitly downcasting the result, which
    # pandas has deprecated. Already encoded 1/0 values pass through unchanged.
    label_codes = {'positive': 1, 'negative': 0, 1: 1, 0: 0}
    encoded = df['sentiment'].map(label_codes)
    unrecognised = encoded.isna()
    if unrecognised.any():
        unexpected = sorted(df.loc[unrecognised, 'sentiment'].astype(str).unique())
        raise ValueError(
            f"Unexpected sentiment label(s) in '{file_path}': {unexpected}; "
            "expected 'positive' or 'negative'."
        )
    df['sentiment'] = encoded.astype('int64')
    return df

# 3. Train and Evaluate Models
def _fit_and_report(name, model, X_train, y_train, X_test, y_test):
    """Fit *model*, print its test accuracy and classification report, return it."""
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f"{name} Accuracy:", accuracy_score(y_test, predictions))
    print(f"{name} Classification Report:\n", classification_report(y_test, predictions))
    return model


def train_and_evaluate(df):
    """Train and evaluate Logistic Regression and Naive Bayes on TF-IDF features.

    Args:
        df: DataFrame with a ``cleaned_review`` text column and a
            ``sentiment`` label column.

    Returns:
        ``(vectorizer, lr_model, nb_model, X_test, y_test)``: the fitted
        TF-IDF vectorizer, both fitted models, and the held-out features and
        labels used for the printed evaluation.
    """
    # Split the raw text first so the vectorizer only ever sees training data.
    # Fitting TF-IDF on the whole corpus would leak test-set vocabulary and
    # document frequencies into the features and inflate the reported scores.
    train_text, test_text, y_train, y_test = train_test_split(
        df['cleaned_review'], df['sentiment'], test_size=0.2, random_state=42
    )

    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_text)
    X_test = vectorizer.transform(test_text)

    lr_model = _fit_and_report(
        "Logistic Regression", LogisticRegression(max_iter=1000),
        X_train, y_train, X_test, y_test,
    )
    nb_model = _fit_and_report(
        "Naive Bayes", MultinomialNB(),
        X_train, y_train, X_test, y_test,
    )

    return vectorizer, lr_model, nb_model, X_test, y_test

# 4. Visualize Frequent Words
def visualize_frequent_words(df):
    """Save a two-panel bar chart of the ten most frequent words per sentiment.

    Reads the ``cleaned_review`` and ``sentiment`` columns of ``df``, counts
    whitespace-separated words separately for sentiment 1 and sentiment 0, and
    writes the chart to ``word_frequency.png``.
    """
    cleaned = df['cleaned_review']
    sentiment = df['sentiment']

    # (subplot position, sentiment code, panel title)
    panels = (
        (1, 1, 'Top 10 Positive Words'),
        (2, 0, 'Top 10 Negative Words'),
    )

    # Tally the words of each class before any drawing starts.
    rankings = {}
    for _, code, _ in panels:
        tally = Counter()
        for review in cleaned[sentiment == code]:
            tally.update(review.split())
        rankings[code] = tally.most_common(10)

    plt.figure(figsize=(12, 5))
    for position, code, title in panels:
        ranking = rankings[code]
        plt.subplot(1, 2, position)
        sns.barplot(
            x=[count for _, count in ranking],
            y=[word for word, _ in ranking],
        )
        plt.title(title)

    plt.tight_layout()
    plt.savefig('word_frequency.png')
    plt.close()

# 5. Streamlit App for Model Deployment
def run_streamlit_app(vectorizer, lr_model, nb_model):
    """Render the Streamlit page that classifies a user-supplied review.

    Args:
        vectorizer: Fitted vectorizer used to turn the cleaned review into
            model features.
        lr_model: Fitted Logistic Regression classifier.
        nb_model: Fitted Naive Bayes classifier.
    """
    st.title("IMDb Review Sentiment Analysis")
    st.write("Enter a movie review to predict its sentiment (positive or negative).")

    # Text input
    user_input = st.text_area("Enter your review:", "")

    if not st.button("Predict Sentiment"):
        return

    # A whitespace-only string is truthy but contains no review.
    if not user_input.strip():
        st.write("Please enter a review.")
        return

    cleaned_input = preprocess_text(user_input)
    input_vector = vectorizer.transform([cleaned_input])

    # With no known word in the input every feature is zero, so a "prediction"
    # would not depend on what the user typed. Say so instead of showing one.
    if input_vector.nnz == 0:
        st.write("The review contains no words the models recognise. Please enter a more detailed review.")
        return

    for heading, model in (("Logistic Regression", lr_model), ("Naive Bayes", nb_model)):
        prediction = model.predict(input_vector)[0]
        probabilities = model.predict_proba(input_vector)[0]

        st.write(f"### {heading} Prediction")
        st.write(f"Sentiment: {'Positive' if prediction == 1 else 'Negative'}")
        # Confidence is the probability of the most likely class.
        st.write(f"Confidence: {max(probabilities):.2f}")

def _build_models(file_path):
    """Load the dataset, train both models and save the word-frequency chart.

    Returns:
        ``(vectorizer, lr_model, nb_model)``, or ``None`` when the dataset
        could not be loaded.
    """
    df = load_and_preprocess_data(file_path)
    if df is None:
        return None

    # The held-out split is only needed for the metrics printed during training.
    vectorizer, lr_model, nb_model, _X_test, _y_test = train_and_evaluate(df)
    visualize_frequent_words(df)
    return vectorizer, lr_model, nb_model


# Main execution
if __name__ == "__main__":
    file_path = "IMDB Dataset.csv"

    # Streamlit re-executes this script on every widget interaction. Without
    # caching, each button click would re-clean the dataset and retrain both
    # models before a prediction could be shown.
    cached_build_models = st.cache_resource(_build_models)
    models = cached_build_models(file_path)

    if models is not None:
        # Run Streamlit app
        run_streamlit_app(*models)
    else:
        # Drop the cached failure so a later run can succeed once the file exists.
        cached_build_models.clear()
        st.error(f"Could not load the dataset from '{file_path}'.")
        print("Exiting due to data loading error.")