import os


def create_huggingface_config():
    """Create Hugging Face model card and configuration"""

    model_card = """---
language: en
tags:
- sentiment-analysis
- text-classification
- nltk
- scikit-learn
license: mit
---

# IMDb Sentiment Analysis Model

This model classifies IMDb movie reviews as positive or negative.

## Model Details

- **Model Type**: Ensemble of Logistic Regression and Naive Bayes
- **Vectorizer**: TF-IDF with 5000 features
- **Accuracy**:
  - Logistic Regression: ~88.47%
  - Naive Bayes: ~85.2%

## Usage

Because the model is a scikit-learn pipeline serialized with joblib rather than a `transformers` checkpoint, download the files from the Hub and load them with joblib:

```python
import joblib
from huggingface_hub import hf_hub_download

# Download the serialized vectorizer and classifier from the Hub
repo = "your-username/imdb-sentiment"
vectorizer = joblib.load(hf_hub_download(repo, "tfidf_vectorizer.pkl"))
model = joblib.load(hf_hub_download(repo, "logistic_regression_model.pkl"))

# Vectorize and predict (apply the same preprocessing used in training first)
features = vectorizer.transform(["This movie was absolutely fantastic!"])
print("positive" if model.predict(features)[0] == 1 else "negative")
```

## Training Data

The model was trained on the IMDb dataset containing 50,000 movie reviews with binary sentiment labels.

## Preprocessing

1. Lowercase conversion
2. Special character removal
3. Tokenization using NLTK
4. Stopword removal
5. Lemmatization using WordNet
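
A minimal sketch of the same preprocessing (assumes the NLTK `punkt`, `stopwords`, and `wordnet` resources are already downloaded):

```python
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def preprocess(text):
    # Lowercase and strip everything except letters and whitespace
    text = re.sub(r"[^a-zA-Z\\s]", "", text.lower())
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    return " ".join(lemmatizer.lemmatize(w) for w in word_tokenize(text) if w not in stop_words)
```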

## Model Architecture

- **Feature Extraction**: TF-IDF Vectorizer (5000 features)
- **Classification**:
  - Logistic Regression with L2 regularization
  - Multinomial Naive Bayes

## Performance

- **Logistic Regression**: 88.47% accuracy
- **Naive Bayes**: 85.2% accuracy
- **Ensemble**: Improved robustness and confidence (see the sketch below)
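
One plausible soft-voting combination (an illustration only; the training script's exact ensembling method is not shown here):

```python
import numpy as np

def ensemble_predict(vectorizer, lr_model, nb_model, text):
    # Average the two models' class probabilities and pick the larger
    X = vectorizer.transform([text])
    probs = (lr_model.predict_proba(X)[0] + nb_model.predict_proba(X)[0]) / 2
    return "positive" if np.argmax(probs) == 1 else "negative"
```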

## Citation

If you use this model in your research, please cite:

```bibtex
@misc{imdb-sentiment-analysis,
  author = {Your Name},
  title = {IMDb Sentiment Analysis Model},
  year = {2024},
  publisher = {Hugging Face},
  url = {https://huggingface.co/your-username/imdb-sentiment}
}
```
"""

    with open("README.md", "w") as f:
        f.write(model_card)

    gitattributes = """*.pkl filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.json filter=lfs diff=lfs merge=lfs -text
"""

    with open(".gitattributes", "w") as f:
        f.write(gitattributes)

    print("Created Hugging Face configuration files")


def create_kaggle_notebook():
    """Create a Kaggle notebook for model deployment"""

    notebook_code = '''{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# IMDb Sentiment Analysis Model Deployment\\n",
    "\\n",
    "This notebook demonstrates how to use the trained sentiment analysis model for IMDb reviews.\\n",
    "\\n",
    "## Model Details\\n",
    "- **Logistic Regression Accuracy**: ~88.47%\\n",
    "- **Naive Bayes Accuracy**: ~85.2%\\n",
    "- **Vectorizer**: TF-IDF with 5000 features\\n",
    "- **Preprocessing**: Lowercase, tokenization, stopword removal, lemmatization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install required packages\\n",
    "!pip install nltk scikit-learn joblib pandas numpy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import joblib\\n",
    "import json\\n",
    "import re\\n",
    "import nltk\\n",
    "from nltk.corpus import stopwords\\n",
    "from nltk.tokenize import word_tokenize\\n",
    "from nltk.stem import WordNetLemmatizer\\n",
    "import pandas as pd\\n",
    "import numpy as np\\n",
    "\\n",
    "# Download NLTK resources\\n",
    "nltk.download('punkt')\\n",
    "nltk.download('stopwords')\\n",
    "nltk.download('wordnet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class SentimentAnalyzer:\\n",
    "    def __init__(self, model_dir=\\"saved_models\\"):\\n",
    "        # Load models\\n",
    "        self.vectorizer = joblib.load(f\\"{model_dir}/tfidf_vectorizer.pkl\\")\\n",
    "        self.lr_model = joblib.load(f\\"{model_dir}/logistic_regression_model.pkl\\")\\n",
    "        self.nb_model = joblib.load(f\\"{model_dir}/naive_bayes_model.pkl\\")\\n",
    "\\n",
    "        # Load metadata\\n",
    "        with open(f\\"{model_dir}/model_metadata.json\\", \\"r\\") as f:\\n",
    "            self.metadata = json.load(f)\\n",
    "\\n",
    "    def preprocess_text(self, text):\\n",
    "        # Lowercase\\n",
    "        text = text.lower()\\n",
    "        # Remove special characters and digits\\n",
    "        text = re.sub(r\\"[^a-zA-Z\\\\s]\\", \\"\\", text)\\n",
    "        # Tokenize\\n",
    "        tokens = word_tokenize(text)\\n",
    "        # Remove stopwords\\n",
    "        stop_words = set(stopwords.words(\\"english\\"))\\n",
    "        tokens = [word for word in tokens if word not in stop_words]\\n",
    "        # Lemmatize\\n",
    "        lemmatizer = WordNetLemmatizer()\\n",
    "        tokens = [lemmatizer.lemmatize(word) for word in tokens]\\n",
    "        # Join tokens back to string\\n",
    "        return \\" \\".join(tokens)\\n",
    "\\n",
    "    def predict(self, text, model_type=\\"both\\"):\\n",
    "        # Preprocess text\\n",
    "        cleaned_text = self.preprocess_text(text)\\n",
    "\\n",
    "        # Vectorize\\n",
    "        text_vector = self.vectorizer.transform([cleaned_text])\\n",
    "\\n",
    "        results = {}\\n",
    "\\n",
    "        if model_type in [\\"lr\\", \\"both\\"]:\\n",
    "            lr_pred = self.lr_model.predict(text_vector)[0]\\n",
    "            lr_prob = self.lr_model.predict_proba(text_vector)[0]\\n",
    "            results[\\"logistic_regression\\"] = {\\n",
    "                \\"prediction\\": \\"positive\\" if lr_pred == 1 else \\"negative\\",\\n",
    "                \\"confidence\\": float(max(lr_prob)),\\n",
    "                \\"probabilities\\": {\\n",
    "                    \\"negative\\": float(lr_prob[0]),\\n",
    "                    \\"positive\\": float(lr_prob[1])\\n",
    "                }\\n",
    "            }\\n",
    "\\n",
    "        if model_type in [\\"nb\\", \\"both\\"]:\\n",
    "            nb_pred = self.nb_model.predict(text_vector)[0]\\n",
    "            nb_prob = self.nb_model.predict_proba(text_vector)[0]\\n",
    "            results[\\"naive_bayes\\"] = {\\n",
    "                \\"prediction\\": \\"positive\\" if nb_pred == 1 else \\"negative\\",\\n",
    "                \\"confidence\\": float(max(nb_prob)),\\n",
    "                \\"probabilities\\": {\\n",
    "                    \\"negative\\": float(nb_prob[0]),\\n",
    "                    \\"positive\\": float(nb_prob[1])\\n",
    "                }\\n",
    "            }\\n",
    "\\n",
    "        return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize analyzer\\n",
    "analyzer = SentimentAnalyzer()\\n",
    "\\n",
    "print(\\"Model loaded successfully!\\")\\n",
    "print(f\\"Logistic Regression Accuracy: {analyzer.metadata['lr_accuracy']:.2%}\\")\\n",
    "print(f\\"Naive Bayes Accuracy: {analyzer.metadata['nb_accuracy']:.2%}\\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test with sample reviews\\n",
    "test_reviews = [\\n",
    "    \\"This movie was absolutely fantastic! I loved every minute of it.\\",\\n",
    "    \\"Terrible film, waste of time. Don't watch it.\\",\\n",
    "    \\"It was okay, nothing special but not bad either.\\",\\n",
    "    \\"Amazing performance by the actors, great storyline!\\",\\n",
    "    \\"Boring and predictable plot, poor acting.\\"\\n",
    "]\\n",
    "\\n",
    "for review in test_reviews:\\n",
    "    print(f\\"\\\\nReview: {review}\\")\\n",
    "    results = analyzer.predict(review)\\n",
    "    for model, result in results.items():\\n",
    "        print(f\\"{model}: {result['prediction']} (confidence: {result['confidence']:.2f})\\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Interactive prediction\\n",
    "def predict_sentiment(review):\\n",
    "    results = analyzer.predict(review)\\n",
    "    print(f\\"Review: {review}\\")\\n",
    "    print(\\"Results:\\")\\n",
    "    for model, result in results.items():\\n",
    "        print(f\\"  {model}: {result['prediction']} (confidence: {result['confidence']:.2%})\\")\\n",
    "    return results\\n",
    "\\n",
    "# Example usage\\n",
    "# predict_sentiment(\\"Your review here\\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}'''

    with open("kaggle_notebook.ipynb", "w") as f:
        f.write(notebook_code)

    print("Created Kaggle notebook")


def create_dockerfile():
    """Create Dockerfile for containerized deployment"""

    dockerfile = '''FROM python:3.9-slim

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \\
    gcc \\
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download NLTK data
RUN python -c "import nltk; nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet')"

# Copy model files
COPY saved_models/ ./saved_models/
COPY inference.py .
COPY streamlit_deployment.py .

# Expose port
EXPOSE 8501

# Run Streamlit app
CMD ["streamlit", "run", "streamlit_deployment.py", "--server.port=8501", "--server.address=0.0.0.0"]'''

    with open("Dockerfile", "w") as f:
        f.write(dockerfile)

    print("Created Dockerfile")


def create_docker_compose():
    """Create docker-compose.yml for easy deployment"""

    compose = '''version: '3.8'

services:
  sentiment-analysis:
    build: .
    ports:
      - "8501:8501"
    volumes:
      - ./saved_models:/app/saved_models
    environment:
      - STREAMLIT_SERVER_PORT=8501
      - STREAMLIT_SERVER_ADDRESS=0.0.0.0'''

    with open("docker-compose.yml", "w") as f:
        f.write(compose)

    print("Created docker-compose.yml")


if __name__ == "__main__":
    print("Creating deployment configurations...")

    if not os.path.exists("saved_models"):
        print("❌ Models not found! Please run 'python train_and_save_model.py' first.")
        raise SystemExit(1)

    create_huggingface_config()
    create_kaggle_notebook()
    create_dockerfile()
    create_docker_compose()

    print("\n✅ Deployment files created!")
    print("\n📋 Next steps:")
    print("1. For Hugging Face: Upload the entire directory to the HF Hub")
    print("2. For Kaggle: Upload kaggle_notebook.ipynb to Kaggle")
    print("3. For Docker: Run 'docker-compose up'")
    print("4. For Streamlit Cloud: Push to GitHub and connect to Streamlit Cloud")