import streamlit as st
import joblib
import json
import re
import os

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import plotly.graph_objects as go

# Download NLTK resources (no-op if they are already present)
try:
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)  # needed by word_tokenize on newer NLTK releases
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
except Exception:
    pass


class SentimentAnalyzer:
    def __init__(self, model_dir="saved_models"):
        try:
            # Load vectorizer and models
            self.vectorizer = joblib.load(f"{model_dir}/tfidf_vectorizer.pkl")
            self.lr_model = joblib.load(f"{model_dir}/logistic_regression_model.pkl")
            self.nb_model = joblib.load(f"{model_dir}/naive_bayes_model.pkl")

            # Load metadata (accuracies, sample counts, preprocessing steps, ...)
            with open(f"{model_dir}/model_metadata.json", 'r') as f:
                self.metadata = json.load(f)

            self.models_loaded = True
        except Exception as e:
            st.error(f"Error loading models: {e}")
            self.models_loaded = False

    def preprocess_text(self, text):
        """Apply the same cleaning pipeline the models were trained with."""
        # Lowercase
        text = text.lower()

        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Tokenize
        tokens = word_tokenize(text)

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]

        # Lemmatize
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

        # Join tokens back into a single string
        return ' '.join(tokens)

    def predict(self, text, model_type='both'):
        """Return predictions for 'lr', 'nb', or 'both' models."""
        if not self.models_loaded:
            return None

        # Preprocess and vectorize the input text
        cleaned_text = self.preprocess_text(text)
        text_vector = self.vectorizer.transform([cleaned_text])

        results = {}

        if model_type in ('lr', 'both'):
            lr_pred = self.lr_model.predict(text_vector)[0]
            lr_prob = self.lr_model.predict_proba(text_vector)[0]
            results['logistic_regression'] = {
                'prediction': 'positive' if lr_pred == 1 else 'negative',
                'confidence': float(max(lr_prob)),
                'probabilities': {
                    'negative': float(lr_prob[0]),
                    'positive': float(lr_prob[1])
                }
            }

        if model_type in ('nb', 'both'):
            nb_pred = self.nb_model.predict(text_vector)[0]
            nb_prob = self.nb_model.predict_proba(text_vector)[0]
            results['naive_bayes'] = {
                'prediction': 'positive' if nb_pred == 1 else 'negative',
                'confidence': float(max(nb_prob)),
                'probabilities': {
                    'negative': float(nb_prob[0]),
                    'positive': float(nb_prob[1])
                }
            }

        return results


def display_result(title, result):
    """Render one model's prediction, confidence, and class probabilities."""
    st.subheader(title)

    if result['prediction'] == 'positive':
        st.success("✅ Positive Sentiment")
    else:
        st.error("❌ Negative Sentiment")

    st.metric("Confidence", f"{result['confidence']:.2%}")

    # Progress bars for the class probabilities
    st.write("**Probabilities:**")
    st.progress(result['probabilities']['positive'])
    st.write(f"Positive: {result['probabilities']['positive']:.2%}")
    st.progress(result['probabilities']['negative'])
    st.write(f"Negative: {result['probabilities']['negative']:.2%}")


def main():
    st.set_page_config(
        page_title="IMDb Sentiment Analysis",
        page_icon="đŸŽŦ",
        layout="wide"
    )

    st.title("đŸŽŦ IMDb Review Sentiment Analysis")
    st.markdown("---")

    # Check if models exist
    if not os.path.exists("saved_models"):
        st.error("❌ Models not found! Please run `python train_and_save_model.py` first to train and save the models.")
        st.info("This will create the 'saved_models' directory with your trained models.")
        return

    # Initialize analyzer
    with st.spinner("Loading models..."):
        analyzer = SentimentAnalyzer()

    if not analyzer.models_loaded:
        st.error("Failed to load models. Please check that the model files exist in the 'saved_models' directory.")
        return

    # Display model info
    st.success("✅ Models loaded successfully!")

    # Model performance metrics
    col1, col2 = st.columns(2)
    with col1:
        st.metric("Logistic Regression Accuracy", f"{analyzer.metadata['lr_accuracy']:.2%}")
    with col2:
        st.metric("Naive Bayes Accuracy", f"{analyzer.metadata['nb_accuracy']:.2%}")

    st.markdown("---")

    # Input section
    st.subheader("📝 Enter a Movie Review")

    # Text input; pre-filled when a sample review is picked in the sidebar
    user_input = st.text_area(
        "Write your movie review here:",
        value=st.session_state.get("user_input", ""),
        height=150,
        placeholder="Example: This movie was absolutely fantastic! The acting was superb and the plot was engaging..."
    )

    # Model selection
    model_choice = st.selectbox(
        "Choose model for prediction:",
        ["Both Models", "Logistic Regression Only", "Naive Bayes Only"],
        help="Select which model(s) to use for prediction"
    )

    # Prediction button
    if st.button("🔍 Analyze Sentiment", type="primary"):
        if user_input.strip():
            with st.spinner("Analyzing sentiment..."):
                # Map the selectbox label to the predict() parameter
                model_type = 'both'
                if model_choice == "Logistic Regression Only":
                    model_type = 'lr'
                elif model_choice == "Naive Bayes Only":
                    model_type = 'nb'

                # Get predictions
                results = analyzer.predict(user_input, model_type)

            if results:
                st.markdown("---")
                st.subheader("📊 Analysis Results")

                # Display results
                if model_type == 'both':
                    col1, col2 = st.columns(2)
                    with col1:
                        display_result("🤖 Logistic Regression", results['logistic_regression'])
                    with col2:
                        display_result("🧠 Naive Bayes", results['naive_bayes'])
                else:
                    # Single-model result
                    if model_type == 'lr':
                        display_result("🤖 Logistic Regression", results['logistic_regression'])
                    else:
                        display_result("🤖 Naive Bayes", results['naive_bayes'])

                # Model comparison chart (only when both models ran)
                if model_type == 'both':
                    st.markdown("---")
                    st.subheader("📈 Model Comparison")

                    models = list(results.keys())
                    confidences = [results[model]['confidence'] for model in models]
                    predictions = [results[model]['prediction'] for model in models]

                    fig = go.Figure(data=[
                        go.Bar(
                            x=models,
                            y=confidences,
                            text=[f"{conf:.2%}" for conf in confidences],
                            textposition='auto',
                            marker_color=['green' if pred == 'positive' else 'red' for pred in predictions]
                        )
                    ])
                    fig.update_layout(
                        title="Model Confidence Comparison",
                        xaxis_title="Model",
                        yaxis_title="Confidence",
                        yaxis_range=[0, 1]
                    )
                    st.plotly_chart(fig, use_container_width=True)
            else:
                st.error("Failed to get predictions. Please try again.")
        else:
            st.warning("âš ī¸ Please enter a review to analyze.")

    # Sidebar with additional info
    with st.sidebar:
        st.header("â„šī¸ About")
        st.write("This app uses machine learning models to analyze the sentiment of movie reviews.")
        st.write("**Models:**")
        st.write("- Logistic Regression")
        st.write("- Naive Bayes")

        st.header("📋 Model Details")
        st.write(f"**Training Samples:** {analyzer.metadata['training_samples']:,}")
        st.write(f"**Test Samples:** {analyzer.metadata['test_samples']:,}")
        st.write(f"**Features:** {analyzer.metadata['max_features']:,}")

        st.header("🔧 Preprocessing Steps")
        for step in analyzer.metadata['preprocessing_steps']:
            st.write(f"- {step.replace('_', ' ').title()}")

        st.header("📊 Sample Reviews")
        sample_reviews = [
            "This movie was absolutely fantastic! I loved every minute of it.",
            "Terrible film, waste of time. Don't watch it.",
            "It was okay, nothing special but not bad either.",
            "Amazing performance by the actors, great storyline!",
            "Boring and predictable plot, poor acting."
        ]

        for i, review in enumerate(sample_reviews, 1):
            if st.button(f"Sample {i}", key=f"sample_{i}"):
                # Stash the sample text so the text area picks it up on rerun
                st.session_state.user_input = review
                st.rerun()


if __name__ == "__main__":
    main()
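
# ---------------------------------------------------------------------------
# Usage sketch (comments only; nothing here executes). The file and key names
# below are taken from the loading code above; the concrete values and the
# filename app.py are illustrative assumptions, not outputs of the real
# training run.
#
# Train the models, then launch the app:
#
#   python train_and_save_model.py
#   streamlit run app.py
#
# SentimentAnalyzer expects a saved_models/ directory laid out like this:
#
#   saved_models/
#       tfidf_vectorizer.pkl
#       logistic_regression_model.pkl
#       naive_bayes_model.pkl
#       model_metadata.json
#
# model_metadata.json must contain at least the keys this script reads, e.g.:
#
#   {
#       "lr_accuracy": 0.88,
#       "nb_accuracy": 0.85,
#       "training_samples": 40000,
#       "test_samples": 10000,
#       "max_features": 5000,
#       "preprocessing_steps": ["lowercase", "remove_special_characters",
#                               "tokenize", "remove_stopwords", "lemmatize"]
#   }
# ---------------------------------------------------------------------------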