import os import json import joblib from transformers import pipeline import torch def create_huggingface_config(): """Create Hugging Face model card and configuration""" # Create model card model_card = """--- language: en tags: - sentiment-analysis - text-classification - nltk - scikit-learn license: mit --- # IMDb Sentiment Analysis Model This model analyzes the sentiment of IMDb movie reviews to classify them as positive or negative. ## Model Details - **Model Type**: Ensemble of Logistic Regression and Naive Bayes - **Vectorizer**: TF-IDF with 5000 features - **Accuracy**: - Logistic Regression: ~88.47% - Naive Bayes: ~85.2% ## Usage ```python from transformers import pipeline # Load the model classifier = pipeline("text-classification", model="your-username/imdb-sentiment") # Make predictions result = classifier("This movie was absolutely fantastic!") print(result) ``` ## Training Data The model was trained on the IMDb dataset containing 50,000 movie reviews with binary sentiment labels. ## Preprocessing 1. Text lowercase conversion 2. Special character removal 3. Tokenization using NLTK 4. Stopword removal 5. Lemmatization using WordNet ## Model Architecture - **Feature Extraction**: TF-IDF Vectorizer (5000 features) - **Classification**: - Logistic Regression with L2 regularization - Multinomial Naive Bayes ## Performance - **Logistic Regression**: 88.47% accuracy - **Naive Bayes**: 85.2% accuracy - **Ensemble**: Improved robustness and confidence ## Citation If you use this model in your research, please cite: ```bibtex @misc{imdb-sentiment-analysis, author = {Your Name}, title = {IMDb Sentiment Analysis Model}, year = {2024}, publisher = {Hugging Face}, url = {https://huggingface.co/your-username/imdb-sentiment} } ``` """ with open("README.md", "w") as f: f.write(model_card) # Create .gitattributes gitattributes = """*.pkl filter=lfs diff=lfs merge=lfs -text *.joblib filter=lfs diff=lfs merge=lfs -text *.json filter=lfs diff=lfs merge=lfs -text """ with open(".gitattributes", "w") as f: f.write(gitattributes) print("Created Hugging Face configuration files") def create_kaggle_notebook(): """Create a Kaggle notebook for model deployment""" notebook_code = '''{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# IMDb Sentiment Analysis Model Deployment\\n", "\\n", "This notebook demonstrates how to use the trained sentiment analysis model for IMDb reviews.\\n", "\\n", "## Model Details\\n", "- **Logistic Regression Accuracy**: ~88.47%\\n", "- **Naive Bayes Accuracy**: ~85.2%\\n", "- **Vectorizer**: TF-IDF with 5000 features\\n", "- **Preprocessing**: Lowercase, tokenization, stopword removal, lemmatization" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Install required packages\\n", "!pip install nltk scikit-learn joblib pandas numpy" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import joblib\\n", "import json\\n", "import re\\n", "import nltk\\n", "from nltk.corpus import stopwords\\n", "from nltk.tokenize import word_tokenize\\n", "from nltk.stem import WordNetLemmatizer\\n", "import pandas as pd\\n", "import numpy as np\\n", "\\n", "# Download NLTK resources\\n", "nltk.download('punkt')\\n", "nltk.download('stopwords')\\n", "nltk.download('wordnet')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class SentimentAnalyzer:\\n", " def __init__(self, model_dir=\\"saved_models\\"):\\n", " # Load models\\n", " self.vectorizer = joblib.load(f\\"{model_dir}/tfidf_vectorizer.pkl\\")\\n", " self.lr_model = joblib.load(f\\"{model_dir}/logistic_regression_model.pkl\\")\\n", " self.nb_model = joblib.load(f\\"{model_dir}/naive_bayes_model.pkl\\")\\n", " \\n", " # Load metadata\\n", " with open(f\\"{model_dir}/model_metadata.json\\", \\"r\\") as f:\\n", " self.metadata = json.load(f)\\n", " \\n", " def preprocess_text(self, text):\\n", " # Lowercase\\n", " text = text.lower()\\n", " # Remove special characters and digits\\n", " text = re.sub(r\\"[^a-zA-Z\\\\s]\\", \\"\\", text)\\n", " # Tokenize\\n", " tokens = word_tokenize(text)\\n", " # Remove stopwords\\n", " stop_words = set(stopwords.words(\\"english\\"))\\n", " tokens = [word for word in tokens if word not in stop_words]\\n", " # Lemmatize\\n", " lemmatizer = WordNetLemmatizer()\\n", " tokens = [lemmatizer.lemmatize(word) for word in tokens]\\n", " # Join tokens back to string\\n", " return \\" \\".join(tokens)\\n", " \\n", " def predict(self, text, model_type=\\"both\\"):\\n", " # Preprocess text\\n", " cleaned_text = self.preprocess_text(text)\\n", " \\n", " # Vectorize\\n", " text_vector = self.vectorizer.transform([cleaned_text])\\n", " \\n", " results = {}\\n", " \\n", " if model_type in [\\"lr\\", \\"both\\"]:\\n", " lr_pred = self.lr_model.predict(text_vector)[0]\\n", " lr_prob = self.lr_model.predict_proba(text_vector)[0]\\n", " results[\\"logistic_regression\\"] = {\\n", " \\"prediction\\": \\"positive\\" if lr_pred == 1 else \\"negative\\",\\n", " \\"confidence\\": float(max(lr_prob)),\\n", " \\"probabilities\\": {\\n", " \\"negative\\": float(lr_prob[0]),\\n", " \\"positive\\": float(lr_prob[1])\\n", " }\\n", " }\\n", " \\n", " if model_type in [\\"nb\\", \\"both\\"]:\\n", " nb_pred = self.nb_model.predict(text_vector)[0]\\n", " nb_prob = self.nb_model.predict_proba(text_vector)[0]\\n", " results[\\"naive_bayes\\"] = {\\n", " \\"prediction\\": \\"positive\\" if nb_pred == 1 else \\"negative\\",\\n", " \\"confidence\\": float(max(nb_prob)),\\n", " \\"probabilities\\": {\\n", " \\"negative\\": float(nb_prob[0]),\\n", " \\"positive\\": float(nb_prob[1])\\n", " }\\n", " }\\n", " \\n", " return results" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Initialize analyzer\\n", "analyzer = SentimentAnalyzer()\\n", "\\n", "print(\\"Model loaded successfully!\\")\\n", "print(f\\"Logistic Regression Accuracy: {analyzer.metadata['lr_accuracy']:.2%}\\")\\n", "print(f\\"Naive Bayes Accuracy: {analyzer.metadata['nb_accuracy']:.2%}\\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Test with sample reviews\\n", "test_reviews = [\\n", " \\"This movie was absolutely fantastic! I loved every minute of it.\\",\\n", " \\"Terrible film, waste of time. Don\\"t watch it.\\",\\n", " \\"It was okay, nothing special but not bad either.\\",\\n", " \\"Amazing performance by the actors, great storyline!\\",\\n", " \\"Boring and predictable plot, poor acting.\\"\\n", "]\\n", "\\n", "for review in test_reviews:\\n", " print(f\\"\\nReview: {review}\\")\\n", " results = analyzer.predict(review)\\n", " for model, result in results.items():\\n", " print(f\\"{model}: {result['prediction']} (confidence: {result['confidence']:.2f})\\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Interactive prediction\\n", "def predict_sentiment(review):\\n", " results = analyzer.predict(review)\\n", " print(f\\"Review: {review}\\")\\n", " print(\\"Results:\\")\\n", " for model, result in results.items():\\n", " print(f\\" {model}: {result['prediction']} (confidence: {result['confidence']:.2%})\\")\\n", " return results\\n", "\\n", "# Example usage\\n", "# predict_sentiment(\\"Your review here\\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }''' with open("kaggle_notebook.ipynb", "w") as f: f.write(notebook_code) print("Created Kaggle notebook") def create_dockerfile(): """Create Dockerfile for containerized deployment""" dockerfile = '''FROM python:3.9-slim WORKDIR /app # Install system dependencies RUN apt-get update && apt-get install -y \\ gcc \\ && rm -rf /var/lib/apt/lists/* # Copy requirements and install Python dependencies COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt # Download NLTK data RUN python -c "import nltk; nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet')" # Copy model files COPY saved_models/ ./saved_models/ COPY inference.py . COPY streamlit_deployment.py . # Expose port EXPOSE 8501 # Run Streamlit app CMD ["streamlit", "run", "streamlit_deployment.py", "--server.port=8501", "--server.address=0.0.0.0"]''' with open("Dockerfile", "w") as f: f.write(dockerfile) print("Created Dockerfile") def create_docker_compose(): """Create docker-compose.yml for easy deployment""" compose = '''version: '3.8' services: sentiment-analysis: build: . ports: - "8501:8501" volumes: - ./saved_models:/app/saved_models environment: - STREAMLIT_SERVER_PORT=8501 - STREAMLIT_SERVER_ADDRESS=0.0.0.0''' with open("docker-compose.yml", "w") as f: f.write(compose) print("Created docker-compose.yml") if __name__ == "__main__": print("Creating deployment configurations...") # Check if models exist if not os.path.exists("saved_models"): print("āŒ Models not found! Please run 'python train_and_save_model.py' first.") exit(1) # Create deployment files create_huggingface_config() create_kaggle_notebook() create_dockerfile() create_docker_compose() print("\nāœ… Deployment files created!") print("\nšŸ“‹ Next steps:") print("1. For Hugging Face: Upload the entire directory to HF Hub") print("2. For Kaggle: Upload kaggle_notebook.ipynb to Kaggle") print("3. For Docker: Run 'docker-compose up'") print("4. For Streamlit Cloud: Push to GitHub and connect to Streamlit Cloud")