import os


def create_huggingface_config():
    """Create Hugging Face model card and configuration"""

    model_card = """---
language: en
tags:
- sentiment-analysis
- text-classification
- nltk
- scikit-learn
license: mit
---

# IMDb Sentiment Analysis Model

This model classifies IMDb movie reviews as positive or negative.

## Model Details

- **Model Type**: Ensemble of Logistic Regression and Naive Bayes
- **Vectorizer**: TF-IDF with 5000 features
- **Accuracy**:
  - Logistic Regression: ~88.47%
  - Naive Bayes: ~85.2%

## Usage

Because the model is a scikit-learn pipeline serialized with joblib rather than a `transformers` checkpoint, download the files from the Hub and load them with joblib:

```python
import joblib
from huggingface_hub import hf_hub_download

# Download the serialized vectorizer and classifier from the Hub
repo = "your-username/imdb-sentiment"
vectorizer = joblib.load(hf_hub_download(repo, "tfidf_vectorizer.pkl"))
model = joblib.load(hf_hub_download(repo, "logistic_regression_model.pkl"))

# Vectorize and predict (apply the same preprocessing used in training first)
features = vectorizer.transform(["This movie was absolutely fantastic!"])
print("positive" if model.predict(features)[0] == 1 else "negative")
```

## Training Data

The model was trained on the IMDb dataset containing 50,000 movie reviews with binary sentiment labels.

## Preprocessing

1. Lowercase conversion
2. Special character removal
3. Tokenization using NLTK
4. Stopword removal
5. Lemmatization using WordNet
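
A minimal sketch of the same preprocessing (assumes the NLTK `punkt`, `stopwords`, and `wordnet` resources are already downloaded):

```python
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def preprocess(text):
    # Lowercase and strip everything except letters and whitespace
    text = re.sub(r"[^a-zA-Z\\s]", "", text.lower())
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    return " ".join(lemmatizer.lemmatize(w) for w in word_tokenize(text) if w not in stop_words)
```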

## Model Architecture

- **Feature Extraction**: TF-IDF Vectorizer (5000 features)
- **Classification**:
  - Logistic Regression with L2 regularization
  - Multinomial Naive Bayes

## Performance

- **Logistic Regression**: 88.47% accuracy
- **Naive Bayes**: 85.2% accuracy
- **Ensemble**: Improved robustness and confidence (see the sketch below)
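
One plausible soft-voting combination (an illustration only; the training script's exact ensembling method is not shown here):

```python
import numpy as np

def ensemble_predict(vectorizer, lr_model, nb_model, text):
    # Average the two models' class probabilities and pick the larger
    X = vectorizer.transform([text])
    probs = (lr_model.predict_proba(X)[0] + nb_model.predict_proba(X)[0]) / 2
    return "positive" if np.argmax(probs) == 1 else "negative"
```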

## Citation

If you use this model in your research, please cite:

```bibtex
@misc{imdb-sentiment-analysis,
  author = {Your Name},
  title = {IMDb Sentiment Analysis Model},
  year = {2024},
  publisher = {Hugging Face},
  url = {https://huggingface.co/your-username/imdb-sentiment}
}
```
"""

    with open("README.md", "w") as f:
        f.write(model_card)

    gitattributes = """*.pkl filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.json filter=lfs diff=lfs merge=lfs -text
"""

    with open(".gitattributes", "w") as f:
        f.write(gitattributes)

    print("Created Hugging Face configuration files")


def create_kaggle_notebook():
    """Create a Kaggle notebook for model deployment"""

    notebook_code = '''{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# IMDb Sentiment Analysis Model Deployment\\n",
    "\\n",
    "This notebook demonstrates how to use the trained sentiment analysis model for IMDb reviews.\\n",
    "\\n",
    "## Model Details\\n",
    "- **Logistic Regression Accuracy**: ~88.47%\\n",
    "- **Naive Bayes Accuracy**: ~85.2%\\n",
    "- **Vectorizer**: TF-IDF with 5000 features\\n",
    "- **Preprocessing**: Lowercase, tokenization, stopword removal, lemmatization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install required packages\\n",
    "!pip install nltk scikit-learn joblib pandas numpy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import joblib\\n",
    "import json\\n",
    "import re\\n",
    "import nltk\\n",
    "from nltk.corpus import stopwords\\n",
    "from nltk.tokenize import word_tokenize\\n",
    "from nltk.stem import WordNetLemmatizer\\n",
    "import pandas as pd\\n",
    "import numpy as np\\n",
    "\\n",
    "# Download NLTK resources\\n",
    "nltk.download('punkt')\\n",
    "nltk.download('stopwords')\\n",
    "nltk.download('wordnet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class SentimentAnalyzer:\\n",
    "    def __init__(self, model_dir=\\"saved_models\\"):\\n",
    "        # Load models\\n",
    "        self.vectorizer = joblib.load(f\\"{model_dir}/tfidf_vectorizer.pkl\\")\\n",
    "        self.lr_model = joblib.load(f\\"{model_dir}/logistic_regression_model.pkl\\")\\n",
    "        self.nb_model = joblib.load(f\\"{model_dir}/naive_bayes_model.pkl\\")\\n",
    "\\n",
    "        # Load metadata\\n",
    "        with open(f\\"{model_dir}/model_metadata.json\\", \\"r\\") as f:\\n",
    "            self.metadata = json.load(f)\\n",
    "\\n",
    "    def preprocess_text(self, text):\\n",
    "        # Lowercase\\n",
    "        text = text.lower()\\n",
    "        # Remove special characters and digits\\n",
    "        text = re.sub(r\\"[^a-zA-Z\\\\s]\\", \\"\\", text)\\n",
    "        # Tokenize\\n",
    "        tokens = word_tokenize(text)\\n",
    "        # Remove stopwords\\n",
    "        stop_words = set(stopwords.words(\\"english\\"))\\n",
    "        tokens = [word for word in tokens if word not in stop_words]\\n",
    "        # Lemmatize\\n",
    "        lemmatizer = WordNetLemmatizer()\\n",
    "        tokens = [lemmatizer.lemmatize(word) for word in tokens]\\n",
    "        # Join tokens back to string\\n",
    "        return \\" \\".join(tokens)\\n",
    "\\n",
    "    def predict(self, text, model_type=\\"both\\"):\\n",
    "        # Preprocess text\\n",
    "        cleaned_text = self.preprocess_text(text)\\n",
    "\\n",
    "        # Vectorize\\n",
    "        text_vector = self.vectorizer.transform([cleaned_text])\\n",
    "\\n",
    "        results = {}\\n",
    "\\n",
    "        if model_type in [\\"lr\\", \\"both\\"]:\\n",
    "            lr_pred = self.lr_model.predict(text_vector)[0]\\n",
    "            lr_prob = self.lr_model.predict_proba(text_vector)[0]\\n",
    "            results[\\"logistic_regression\\"] = {\\n",
    "                \\"prediction\\": \\"positive\\" if lr_pred == 1 else \\"negative\\",\\n",
    "                \\"confidence\\": float(max(lr_prob)),\\n",
    "                \\"probabilities\\": {\\n",
    "                    \\"negative\\": float(lr_prob[0]),\\n",
    "                    \\"positive\\": float(lr_prob[1])\\n",
    "                }\\n",
    "            }\\n",
    "\\n",
    "        if model_type in [\\"nb\\", \\"both\\"]:\\n",
    "            nb_pred = self.nb_model.predict(text_vector)[0]\\n",
    "            nb_prob = self.nb_model.predict_proba(text_vector)[0]\\n",
    "            results[\\"naive_bayes\\"] = {\\n",
    "                \\"prediction\\": \\"positive\\" if nb_pred == 1 else \\"negative\\",\\n",
    "                \\"confidence\\": float(max(nb_prob)),\\n",
    "                \\"probabilities\\": {\\n",
    "                    \\"negative\\": float(nb_prob[0]),\\n",
    "                    \\"positive\\": float(nb_prob[1])\\n",
    "                }\\n",
    "            }\\n",
    "\\n",
    "        return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize analyzer\\n",
    "analyzer = SentimentAnalyzer()\\n",
    "\\n",
    "print(\\"Model loaded successfully!\\")\\n",
    "print(f\\"Logistic Regression Accuracy: {analyzer.metadata['lr_accuracy']:.2%}\\")\\n",
    "print(f\\"Naive Bayes Accuracy: {analyzer.metadata['nb_accuracy']:.2%}\\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test with sample reviews\\n",
    "test_reviews = [\\n",
    "    \\"This movie was absolutely fantastic! I loved every minute of it.\\",\\n",
    "    \\"Terrible film, waste of time. Don't watch it.\\",\\n",
    "    \\"It was okay, nothing special but not bad either.\\",\\n",
    "    \\"Amazing performance by the actors, great storyline!\\",\\n",
    "    \\"Boring and predictable plot, poor acting.\\"\\n",
    "]\\n",
    "\\n",
    "for review in test_reviews:\\n",
    "    print(f\\"\\\\nReview: {review}\\")\\n",
    "    results = analyzer.predict(review)\\n",
    "    for model, result in results.items():\\n",
    "        print(f\\"{model}: {result['prediction']} (confidence: {result['confidence']:.2f})\\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Interactive prediction\\n",
    "def predict_sentiment(review):\\n",
    "    results = analyzer.predict(review)\\n",
    "    print(f\\"Review: {review}\\")\\n",
    "    print(\\"Results:\\")\\n",
    "    for model, result in results.items():\\n",
    "        print(f\\"  {model}: {result['prediction']} (confidence: {result['confidence']:.2%})\\")\\n",
    "    return results\\n",
    "\\n",
    "# Example usage\\n",
    "# predict_sentiment(\\"Your review here\\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}'''

    with open("kaggle_notebook.ipynb", "w") as f:
        f.write(notebook_code)

    print("Created Kaggle notebook")


def create_dockerfile():
    """Create Dockerfile for containerized deployment"""

    dockerfile = '''FROM python:3.9-slim

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \\
    gcc \\
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download NLTK data
RUN python -c "import nltk; nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet')"

# Copy model files
COPY saved_models/ ./saved_models/
COPY inference.py .
COPY streamlit_deployment.py .

# Expose port
EXPOSE 8501

# Run Streamlit app
CMD ["streamlit", "run", "streamlit_deployment.py", "--server.port=8501", "--server.address=0.0.0.0"]'''

    with open("Dockerfile", "w") as f:
        f.write(dockerfile)

    print("Created Dockerfile")


def create_docker_compose():
    """Create docker-compose.yml for easy deployment"""

    compose = '''version: '3.8'

services:
  sentiment-analysis:
    build: .
    ports:
      - "8501:8501"
    volumes:
      - ./saved_models:/app/saved_models
    environment:
      - STREAMLIT_SERVER_PORT=8501
      - STREAMLIT_SERVER_ADDRESS=0.0.0.0'''

    with open("docker-compose.yml", "w") as f:
        f.write(compose)

    print("Created docker-compose.yml")


if __name__ == "__main__":
    print("Creating deployment configurations...")

    if not os.path.exists("saved_models"):
        print("❌ Models not found! Please run 'python train_and_save_model.py' first.")
        raise SystemExit(1)

    create_huggingface_config()
    create_kaggle_notebook()
    create_dockerfile()
    create_docker_compose()

    print("\n✅ Deployment files created!")
    print("\n📋 Next steps:")
    print("1. For Hugging Face: Upload the entire directory to the HF Hub")
    print("2. For Kaggle: Upload kaggle_notebook.ipynb to Kaggle")
    print("3. For Docker: Run 'docker-compose up'")
    print("4. For Streamlit Cloud: Push to GitHub and connect to Streamlit Cloud")