MON3EMPASHA committed
Commit 5a2ee52 · 1 Parent(s): 3242546
Files changed (2)
  1. .gitignore +6 -0
  2. streamlit_app.py +0 -141
.gitignore ADDED
@@ -0,0 +1,6 @@
+ huggingface_deploy.py
+ IMDB Dataset.csv
+ sentiment_analysis.py
+ train_and_save_model.py
+ *.pyc
+ __pycache__/
streamlit_app.py DELETED
@@ -1,141 +0,0 @@
- import pandas as pd
- import nltk
- import re
- from nltk.corpus import stopwords
- from nltk.tokenize import word_tokenize
- from nltk.stem import WordNetLemmatizer
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.model_selection import train_test_split
- from sklearn.linear_model import LogisticRegression
- from sklearn.naive_bayes import MultinomialNB
- from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
- import matplotlib.pyplot as plt
- import seaborn as sns
- import streamlit as st
- from collections import Counter
- import pickle
- import os
-
- # Download NLTK resources
- try:
-     nltk.download('punkt')
-     nltk.download('stopwords')
-     nltk.download('wordnet')
-     nltk.download('punkt_tab')
- except Exception as e:
-     st.error(f"Could not download NLTK data: {e}")
-
- # Text Preprocessing Function
- def preprocess_text(text):
-     # Lowercase
-     text = text.lower()
-     # Remove special characters and digits
-     text = re.sub(r'[^a-zA-Z\s]', '', text)
-     # Tokenize
-     tokens = word_tokenize(text)
-     # Remove stopwords
-     stop_words = set(stopwords.words('english'))
-     tokens = [word for word in tokens if word not in stop_words]
-     # Lemmatize
-     lemmatizer = WordNetLemmatizer()
-     tokens = [lemmatizer.lemmatize(word) for word in tokens]
-     # Join tokens back to string
-     return ' '.join(tokens)
-
- # Load and Preprocess Dataset
- def load_and_preprocess_data(file_path="IMDB Dataset.csv"):
-     try:
-         df = pd.read_csv(file_path)
-         # Apply preprocessing to reviews
-         df['cleaned_review'] = df['review'].apply(preprocess_text)
-         # Convert sentiment to binary (1 for positive, 0 for negative)
-         df['sentiment'] = df['sentiment'].replace({'positive': 1, 'negative': 0})
-         return df
-     except FileNotFoundError:
-         st.error(f"Could not find file '{file_path}'")
-         return None
-
- # Train Models
- def train_models(df):
-     # Convert text to TF-IDF features
-     vectorizer = TfidfVectorizer(max_features=5000)
-     X = vectorizer.fit_transform(df['cleaned_review'])
-     y = df['sentiment']
-
-     # Split data
-     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-     # Logistic Regression
-     lr_model = LogisticRegression(max_iter=1000)
-     lr_model.fit(X_train, y_train)
-     lr_predictions = lr_model.predict(X_test)
-     lr_accuracy = accuracy_score(y_test, lr_predictions)
-
-     # Naive Bayes
-     nb_model = MultinomialNB()
-     nb_model.fit(X_train, y_train)
-     nb_predictions = nb_model.predict(X_test)
-     nb_accuracy = accuracy_score(y_test, nb_predictions)
-
-     return vectorizer, lr_model, nb_model, lr_accuracy, nb_accuracy
-
- # Streamlit App
- def main():
-     st.title("IMDb Review Sentiment Analysis")
-     st.write("This app analyzes movie reviews to predict whether they are positive or negative.")
-
-     # Load data and train models
-     with st.spinner("Loading data and training models..."):
-         df = load_and_preprocess_data()
-         if df is not None:
-             vectorizer, lr_model, nb_model, lr_accuracy, nb_accuracy = train_models(df)
-             st.success("Models trained successfully!")
-
-             # Display model accuracies
-             col1, col2 = st.columns(2)
-             with col1:
-                 st.metric("Logistic Regression Accuracy", f"{lr_accuracy:.2%}")
-             with col2:
-                 st.metric("Naive Bayes Accuracy", f"{nb_accuracy:.2%}")
-
-             # Text input for prediction
-             st.subheader("Predict Sentiment")
-             user_input = st.text_area("Enter a movie review:", height=150)
-
-             if st.button("Predict Sentiment"):
-                 if user_input:
-                     # Preprocess input
-                     cleaned_input = preprocess_text(user_input)
-                     input_vector = vectorizer.transform([cleaned_input])
-
-                     # Predict with both models
-                     lr_prediction = lr_model.predict(input_vector)[0]
-                     lr_prob = lr_model.predict_proba(input_vector)[0]
-                     nb_prediction = nb_model.predict(input_vector)[0]
-                     nb_prob = nb_model.predict_proba(input_vector)[0]
-
-                     # Display results
-                     col1, col2 = st.columns(2)
-
-                     with col1:
-                         st.subheader("Logistic Regression")
-                         if lr_prediction == 1:
-                             st.success("Positive Sentiment")
-                         else:
-                             st.error("Negative Sentiment")
-                         st.write(f"Confidence: {max(lr_prob):.2%}")
-
-                     with col2:
-                         st.subheader("Naive Bayes")
-                         if nb_prediction == 1:
-                             st.success("Positive Sentiment")
-                         else:
-                             st.error("Negative Sentiment")
-                         st.write(f"Confidence: {max(nb_prob):.2%}")
-                 else:
-                     st.warning("Please enter a review.")
-         else:
-             st.error("Failed to load data. Please check if 'IMDB Dataset.csv' is in the same directory.")
-
- if __name__ == "__main__":
-     main()
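
The new .gitignore names a train_and_save_model.py, which suggests the deleted app's retrain-on-every-run approach is being replaced by a train-once-then-load workflow. That file is not part of this diff, so the following is only a hypothetical sketch of such a step, reusing the TF-IDF and model settings from the deleted streamlit_app.py; the NLTK cleanup (preprocess_text) is omitted here for brevity, and the output file name is illustrative.

import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hypothetical sketch; the real train_and_save_model.py is not shown in this commit.
df = pd.read_csv("IMDB Dataset.csv")
df["sentiment"] = df["sentiment"].replace({"positive": 1, "negative": 0})

# Same featurization as the deleted app: top-5000 TF-IDF terms.
# (The deleted app also lemmatized and removed stopwords first; skipped here.)
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["review"])
y = df["sentiment"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit both classifiers once, offline.
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
nb_model = MultinomialNB().fit(X_train, y_train)

# Persist the fitted vectorizer and models so a deployed app can unpickle
# them at startup instead of retraining on every run.
with open("sentiment_models.pkl", "wb") as f:
    pickle.dump({"vectorizer": vectorizer, "lr": lr_model, "nb": nb_model}, f)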