"""Generate deployment artifacts: a Hugging Face model card, a Kaggle
notebook, a Dockerfile, and a docker-compose file."""
import os
import json

def create_huggingface_config():
"""Create Hugging Face model card and configuration"""
# Create model card
model_card = """---
language: en
tags:
- sentiment-analysis
- text-classification
- nltk
- scikit-learn
license: mit
---
# IMDb Sentiment Analysis Model
This model analyzes the sentiment of IMDb movie reviews to classify them as positive or negative.
## Model Details
- **Model Type**: Ensemble of Logistic Regression and Naive Bayes
- **Vectorizer**: TF-IDF with 5000 features
- **Accuracy**:
- Logistic Regression: ~88.47%
- Naive Bayes: ~85.2%
## Usage
The model is a set of scikit-learn estimators stored as joblib pickles, so it cannot be loaded through the `transformers` pipeline. Download the files from the Hub and load them directly:
```python
from huggingface_hub import hf_hub_download
import joblib

# Download and load the vectorizer and one classifier
vectorizer = joblib.load(hf_hub_download("your-username/imdb-sentiment", "tfidf_vectorizer.pkl"))
model = joblib.load(hf_hub_download("your-username/imdb-sentiment", "logistic_regression_model.pkl"))

# Apply the same preprocessing as training (see Preprocessing below),
# then vectorize and predict: 1 = positive, 0 = negative
features = vectorizer.transform(["movie absolutely fantastic loved every minute"])
print(model.predict(features))
```
## Training Data
The model was trained on the IMDb dataset containing 50,000 movie reviews with binary sentiment labels.
## Preprocessing
Each review passes through the following steps (a code sketch follows the list):
1. Text lowercase conversion
2. Special character removal
3. Tokenization using NLTK
4. Stopword removal
5. Lemmatization using WordNet
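
A minimal sketch of these steps, mirroring the preprocessing used in the deployment notebook (assumes the NLTK punkt, stopwords, and wordnet resources are downloaded):

```python
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def preprocess(text):
    text = text.lower()                                   # 1. lowercase
    text = re.sub(r"[^a-zA-Z\\s]", "", text)              # 2. strip special characters
    tokens = word_tokenize(text)                          # 3. tokenize
    stop_words = set(stopwords.words("english"))
    tokens = [t for t in tokens if t not in stop_words]   # 4. remove stopwords
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(t) for t in tokens)  # 5. lemmatize
```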
## Model Architecture
- **Feature Extraction**: TF-IDF Vectorizer (5000 features)
- **Classification**:
- Logistic Regression with L2 regularization
  - Multinomial Naive Bayes (assembled as sketched below)
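
A hedged sketch of how this architecture is typically assembled (`train_texts` and `train_labels` are toy placeholders, not the real IMDb data):

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

train_texts = ["great movie loved it", "terrible film boring plot"]  # toy placeholders
train_labels = [1, 0]                                                # 1 = positive, 0 = negative

vectorizer = TfidfVectorizer(max_features=5000)   # TF-IDF, capped at 5000 features
X = vectorizer.fit_transform(train_texts)

lr = LogisticRegression(penalty="l2").fit(X, train_labels)  # L2-regularized
nb = MultinomialNB().fit(X, train_labels)
```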
## Performance
- **Logistic Regression**: 88.47% accuracy
- **Naive Bayes**: 85.2% accuracy
- **Ensemble**: Improved robustness and confidence
## Citation
If you use this model in your research, please cite:
```bibtex
@misc{imdb-sentiment-analysis,
author = {Your Name},
title = {IMDb Sentiment Analysis Model},
year = {2024},
publisher = {Hugging Face},
url = {https://huggingface.co/your-username/imdb-sentiment}
}
```
"""
with open("README.md", "w") as f:
f.write(model_card)
# Create .gitattributes
gitattributes = """*.pkl filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.json filter=lfs diff=lfs merge=lfs -text
"""
with open(".gitattributes", "w") as f:
f.write(gitattributes)
print("Created Hugging Face configuration files")
def create_kaggle_notebook():
"""Create a Kaggle notebook for model deployment"""
notebook_code = '''{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# IMDb Sentiment Analysis Model Deployment\\n",
"\\n",
"This notebook demonstrates how to use the trained sentiment analysis model for IMDb reviews.\\n",
"\\n",
"## Model Details\\n",
"- **Logistic Regression Accuracy**: ~88.47%\\n",
"- **Naive Bayes Accuracy**: ~85.2%\\n",
"- **Vectorizer**: TF-IDF with 5000 features\\n",
"- **Preprocessing**: Lowercase, tokenization, stopword removal, lemmatization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install required packages\\n",
"!pip install nltk scikit-learn joblib pandas numpy"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import joblib\\n",
"import json\\n",
"import re\\n",
"import nltk\\n",
"from nltk.corpus import stopwords\\n",
"from nltk.tokenize import word_tokenize\\n",
"from nltk.stem import WordNetLemmatizer\\n",
"import pandas as pd\\n",
"import numpy as np\\n",
"\\n",
"# Download NLTK resources\\n",
"nltk.download('punkt')\\n",
"nltk.download('stopwords')\\n",
"nltk.download('wordnet')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class SentimentAnalyzer:\\n",
" def __init__(self, model_dir=\\"saved_models\\"):\\n",
" # Load models\\n",
" self.vectorizer = joblib.load(f\\"{model_dir}/tfidf_vectorizer.pkl\\")\\n",
" self.lr_model = joblib.load(f\\"{model_dir}/logistic_regression_model.pkl\\")\\n",
" self.nb_model = joblib.load(f\\"{model_dir}/naive_bayes_model.pkl\\")\\n",
" \\n",
" # Load metadata\\n",
" with open(f\\"{model_dir}/model_metadata.json\\", \\"r\\") as f:\\n",
" self.metadata = json.load(f)\\n",
" \\n",
" def preprocess_text(self, text):\\n",
" # Lowercase\\n",
" text = text.lower()\\n",
" # Remove special characters and digits\\n",
" text = re.sub(r\\"[^a-zA-Z\\\\s]\\", \\"\\", text)\\n",
" # Tokenize\\n",
" tokens = word_tokenize(text)\\n",
" # Remove stopwords\\n",
" stop_words = set(stopwords.words(\\"english\\"))\\n",
" tokens = [word for word in tokens if word not in stop_words]\\n",
" # Lemmatize\\n",
" lemmatizer = WordNetLemmatizer()\\n",
" tokens = [lemmatizer.lemmatize(word) for word in tokens]\\n",
" # Join tokens back to string\\n",
" return \\" \\".join(tokens)\\n",
" \\n",
" def predict(self, text, model_type=\\"both\\"):\\n",
" # Preprocess text\\n",
" cleaned_text = self.preprocess_text(text)\\n",
" \\n",
" # Vectorize\\n",
" text_vector = self.vectorizer.transform([cleaned_text])\\n",
" \\n",
" results = {}\\n",
" \\n",
" if model_type in [\\"lr\\", \\"both\\"]:\\n",
" lr_pred = self.lr_model.predict(text_vector)[0]\\n",
" lr_prob = self.lr_model.predict_proba(text_vector)[0]\\n",
" results[\\"logistic_regression\\"] = {\\n",
" \\"prediction\\": \\"positive\\" if lr_pred == 1 else \\"negative\\",\\n",
" \\"confidence\\": float(max(lr_prob)),\\n",
" \\"probabilities\\": {\\n",
" \\"negative\\": float(lr_prob[0]),\\n",
" \\"positive\\": float(lr_prob[1])\\n",
" }\\n",
" }\\n",
" \\n",
" if model_type in [\\"nb\\", \\"both\\"]:\\n",
" nb_pred = self.nb_model.predict(text_vector)[0]\\n",
" nb_prob = self.nb_model.predict_proba(text_vector)[0]\\n",
" results[\\"naive_bayes\\"] = {\\n",
" \\"prediction\\": \\"positive\\" if nb_pred == 1 else \\"negative\\",\\n",
" \\"confidence\\": float(max(nb_prob)),\\n",
" \\"probabilities\\": {\\n",
" \\"negative\\": float(nb_prob[0]),\\n",
" \\"positive\\": float(nb_prob[1])\\n",
" }\\n",
" }\\n",
" \\n",
" return results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Initialize analyzer\\n",
"analyzer = SentimentAnalyzer()\\n",
"\\n",
"print(\\"Model loaded successfully!\\")\\n",
"print(f\\"Logistic Regression Accuracy: {analyzer.metadata['lr_accuracy']:.2%}\\")\\n",
"print(f\\"Naive Bayes Accuracy: {analyzer.metadata['nb_accuracy']:.2%}\\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test with sample reviews\\n",
"test_reviews = [\\n",
" \\"This movie was absolutely fantastic! I loved every minute of it.\\",\\n",
" \\"Terrible film, waste of time. Don\\"t watch it.\\",\\n",
" \\"It was okay, nothing special but not bad either.\\",\\n",
" \\"Amazing performance by the actors, great storyline!\\",\\n",
" \\"Boring and predictable plot, poor acting.\\"\\n",
"]\\n",
"\\n",
"for review in test_reviews:\\n",
" print(f\\"\\nReview: {review}\\")\\n",
" results = analyzer.predict(review)\\n",
" for model, result in results.items():\\n",
" print(f\\"{model}: {result['prediction']} (confidence: {result['confidence']:.2f})\\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Interactive prediction\\n",
"def predict_sentiment(review):\\n",
" results = analyzer.predict(review)\\n",
" print(f\\"Review: {review}\\")\\n",
" print(\\"Results:\\")\\n",
" for model, result in results.items():\\n",
" print(f\\" {model}: {result['prediction']} (confidence: {result['confidence']:.2%})\\")\\n",
" return results\\n",
"\\n",
"# Example usage\\n",
"# predict_sentiment(\\"Your review here\\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}'''
with open("kaggle_notebook.ipynb", "w") as f:
f.write(notebook_code)
print("Created Kaggle notebook")
def create_dockerfile():
"""Create Dockerfile for containerized deployment"""
dockerfile = '''FROM python:3.9-slim
WORKDIR /app
# Install system dependencies
RUN apt-get update && apt-get install -y \\
gcc \\
&& rm -rf /var/lib/apt/lists/*
# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Download NLTK data
RUN python -c "import nltk; nltk.download('punkt'); nltk.download('punkt_tab'); nltk.download('stopwords'); nltk.download('wordnet')"
# Copy model files
COPY saved_models/ ./saved_models/
COPY inference.py .
COPY streamlit_deployment.py .
# Expose port
EXPOSE 8501
# Run Streamlit app
CMD ["streamlit", "run", "streamlit_deployment.py", "--server.port=8501", "--server.address=0.0.0.0"]'''
with open("Dockerfile", "w") as f:
f.write(dockerfile)
print("Created Dockerfile")
def create_docker_compose():
"""Create docker-compose.yml for easy deployment"""
compose = '''version: '3.8'
services:
sentiment-analysis:
build: .
ports:
- "8501:8501"
volumes:
- ./saved_models:/app/saved_models
environment:
- STREAMLIT_SERVER_PORT=8501
- STREAMLIT_SERVER_ADDRESS=0.0.0.0'''
with open("docker-compose.yml", "w") as f:
f.write(compose)
print("Created docker-compose.yml")
if __name__ == "__main__":
print("Creating deployment configurations...")
# Check if models exist
if not os.path.exists("saved_models"):
print("❌ Models not found! Please run 'python train_and_save_model.py' first.")
        raise SystemExit(1)
# Create deployment files
create_huggingface_config()
create_kaggle_notebook()
create_dockerfile()
create_docker_compose()
print("\n✅ Deployment files created!")
print("\n📋 Next steps:")
print("1. For Hugging Face: Upload the entire directory to HF Hub")
print("2. For Kaggle: Upload kaggle_notebook.ipynb to Kaggle")
print("3. For Docker: Run 'docker-compose up'")
print("4. For Streamlit Cloud: Push to GitHub and connect to Streamlit Cloud")