import os
import json
import joblib
from transformers import pipeline
import torch

def create_huggingface_config():
    """Create Hugging Face model card and configuration"""
    
    # Create model card
    model_card = """---
language: en
tags:
- sentiment-analysis
- text-classification
- nltk
- scikit-learn
license: mit
---

# IMDb Sentiment Analysis Model

This model analyzes the sentiment of IMDb movie reviews to classify them as positive or negative.

## Model Details

- **Model Type**: Ensemble of Logistic Regression and Naive Bayes
- **Vectorizer**: TF-IDF with 5000 features
- **Accuracy**: 
  - Logistic Regression: ~88.47%
  - Naive Bayes: ~85.2%

## Usage

```python
from transformers import pipeline

# Load the model
classifier = pipeline("text-classification", model="your-username/imdb-sentiment")

# Make predictions
result = classifier("This movie was absolutely fantastic!")
print(result)
```

## Training Data

The model was trained on the IMDb dataset containing 50,000 movie reviews with binary sentiment labels.

## Preprocessing

1. Text lowercase conversion
2. Special character removal
3. Tokenization using NLTK
4. Stopword removal
5. Lemmatization using WordNet

## Model Architecture

- **Feature Extraction**: TF-IDF Vectorizer (5000 features)
- **Classification**: 
  - Logistic Regression with L2 regularization
  - Multinomial Naive Bayes

## Performance

- **Logistic Regression**: 88.47% accuracy
- **Naive Bayes**: 85.2% accuracy
- **Ensemble**: Improved robustness and confidence

## Citation

If you use this model in your research, please cite:

```bibtex
@misc{imdb-sentiment-analysis,
  author = {Your Name},
  title = {IMDb Sentiment Analysis Model},
  year = {2024},
  publisher = {Hugging Face},
  url = {https://huggingface.co/your-username/imdb-sentiment}
}
```
"""
    
    with open("README.md", "w") as f:
        f.write(model_card)
    
    # Create .gitattributes
    gitattributes = """*.pkl filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.json filter=lfs diff=lfs merge=lfs -text
"""
    
    with open(".gitattributes", "w") as f:
        f.write(gitattributes)
    
    print("Created Hugging Face configuration files")

def create_kaggle_notebook():
    """Create a Kaggle notebook for model deployment"""
    
    notebook_code = '''{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# IMDb Sentiment Analysis Model Deployment\\n",
    "\\n",
    "This notebook demonstrates how to use the trained sentiment analysis model for IMDb reviews.\\n",
    "\\n",
    "## Model Details\\n",
    "- **Logistic Regression Accuracy**: ~88.47%\\n",
    "- **Naive Bayes Accuracy**: ~85.2%\\n",
    "- **Vectorizer**: TF-IDF with 5000 features\\n",
    "- **Preprocessing**: Lowercase, tokenization, stopword removal, lemmatization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install required packages\\n",
    "!pip install nltk scikit-learn joblib pandas numpy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import joblib\\n",
    "import json\\n",
    "import re\\n",
    "import nltk\\n",
    "from nltk.corpus import stopwords\\n",
    "from nltk.tokenize import word_tokenize\\n",
    "from nltk.stem import WordNetLemmatizer\\n",
    "import pandas as pd\\n",
    "import numpy as np\\n",
    "\\n",
    "# Download NLTK resources\\n",
    "nltk.download('punkt')\\n",
    "nltk.download('stopwords')\\n",
    "nltk.download('wordnet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class SentimentAnalyzer:\\n",
    "    def __init__(self, model_dir=\\"saved_models\\"):\\n",
    "        # Load models\\n",
    "        self.vectorizer = joblib.load(f\\"{model_dir}/tfidf_vectorizer.pkl\\")\\n",
    "        self.lr_model = joblib.load(f\\"{model_dir}/logistic_regression_model.pkl\\")\\n",
    "        self.nb_model = joblib.load(f\\"{model_dir}/naive_bayes_model.pkl\\")\\n",
    "        \\n",
    "        # Load metadata\\n",
    "        with open(f\\"{model_dir}/model_metadata.json\\", \\"r\\") as f:\\n",
    "            self.metadata = json.load(f)\\n",
    "    \\n",
    "    def preprocess_text(self, text):\\n",
    "        # Lowercase\\n",
    "        text = text.lower()\\n",
    "        # Remove special characters and digits\\n",
    "        text = re.sub(r\\"[^a-zA-Z\\\\s]\\", \\"\\", text)\\n",
    "        # Tokenize\\n",
    "        tokens = word_tokenize(text)\\n",
    "        # Remove stopwords\\n",
    "        stop_words = set(stopwords.words(\\"english\\"))\\n",
    "        tokens = [word for word in tokens if word not in stop_words]\\n",
    "        # Lemmatize\\n",
    "        lemmatizer = WordNetLemmatizer()\\n",
    "        tokens = [lemmatizer.lemmatize(word) for word in tokens]\\n",
    "        # Join tokens back to string\\n",
    "        return \\" \\".join(tokens)\\n",
    "    \\n",
    "    def predict(self, text, model_type=\\"both\\"):\\n",
    "        # Preprocess text\\n",
    "        cleaned_text = self.preprocess_text(text)\\n",
    "        \\n",
    "        # Vectorize\\n",
    "        text_vector = self.vectorizer.transform([cleaned_text])\\n",
    "        \\n",
    "        results = {}\\n",
    "        \\n",
    "        if model_type in [\\"lr\\", \\"both\\"]:\\n",
    "            lr_pred = self.lr_model.predict(text_vector)[0]\\n",
    "            lr_prob = self.lr_model.predict_proba(text_vector)[0]\\n",
    "            results[\\"logistic_regression\\"] = {\\n",
    "                \\"prediction\\": \\"positive\\" if lr_pred == 1 else \\"negative\\",\\n",
    "                \\"confidence\\": float(max(lr_prob)),\\n",
    "                \\"probabilities\\": {\\n",
    "                    \\"negative\\": float(lr_prob[0]),\\n",
    "                    \\"positive\\": float(lr_prob[1])\\n",
    "                }\\n",
    "            }\\n",
    "        \\n",
    "        if model_type in [\\"nb\\", \\"both\\"]:\\n",
    "            nb_pred = self.nb_model.predict(text_vector)[0]\\n",
    "            nb_prob = self.nb_model.predict_proba(text_vector)[0]\\n",
    "            results[\\"naive_bayes\\"] = {\\n",
    "                \\"prediction\\": \\"positive\\" if nb_pred == 1 else \\"negative\\",\\n",
    "                \\"confidence\\": float(max(nb_prob)),\\n",
    "                \\"probabilities\\": {\\n",
    "                    \\"negative\\": float(nb_prob[0]),\\n",
    "                    \\"positive\\": float(nb_prob[1])\\n",
    "                }\\n",
    "            }\\n",
    "        \\n",
    "        return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize analyzer\\n",
    "analyzer = SentimentAnalyzer()\\n",
    "\\n",
    "print(\\"Model loaded successfully!\\")\\n",
    "print(f\\"Logistic Regression Accuracy: {analyzer.metadata['lr_accuracy']:.2%}\\")\\n",
    "print(f\\"Naive Bayes Accuracy: {analyzer.metadata['nb_accuracy']:.2%}\\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test with sample reviews\\n",
    "test_reviews = [\\n",
    "    \\"This movie was absolutely fantastic! I loved every minute of it.\\",\\n",
    "    \\"Terrible film, waste of time. Don\\"t watch it.\\",\\n",
    "    \\"It was okay, nothing special but not bad either.\\",\\n",
    "    \\"Amazing performance by the actors, great storyline!\\",\\n",
    "    \\"Boring and predictable plot, poor acting.\\"\\n",
    "]\\n",
    "\\n",
    "for review in test_reviews:\\n",
    "    print(f\\"\\nReview: {review}\\")\\n",
    "    results = analyzer.predict(review)\\n",
    "    for model, result in results.items():\\n",
    "        print(f\\"{model}: {result['prediction']} (confidence: {result['confidence']:.2f})\\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Interactive prediction\\n",
    "def predict_sentiment(review):\\n",
    "    results = analyzer.predict(review)\\n",
    "    print(f\\"Review: {review}\\")\\n",
    "    print(\\"Results:\\")\\n",
    "    for model, result in results.items():\\n",
    "        print(f\\"  {model}: {result['prediction']} (confidence: {result['confidence']:.2%})\\")\\n",
    "    return results\\n",
    "\\n",
    "# Example usage\\n",
    "# predict_sentiment(\\"Your review here\\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}'''
    
    with open("kaggle_notebook.ipynb", "w") as f:
        f.write(notebook_code)
    
    print("Created Kaggle notebook")

def create_dockerfile():
    """Create Dockerfile for containerized deployment"""
    
    dockerfile = '''FROM python:3.9-slim

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \\
    gcc \\
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download NLTK data
RUN python -c "import nltk; nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet')"

# Copy model files
COPY saved_models/ ./saved_models/
COPY inference.py .
COPY streamlit_deployment.py .

# Expose port
EXPOSE 8501

# Run Streamlit app
CMD ["streamlit", "run", "streamlit_deployment.py", "--server.port=8501", "--server.address=0.0.0.0"]'''
    
    with open("Dockerfile", "w") as f:
        f.write(dockerfile)
    
    print("Created Dockerfile")

def create_docker_compose():
    """Create docker-compose.yml for easy deployment"""
    
    compose = '''version: '3.8'

services:
  sentiment-analysis:
    build: .
    ports:
      - "8501:8501"
    volumes:
      - ./saved_models:/app/saved_models
    environment:
      - STREAMLIT_SERVER_PORT=8501
      - STREAMLIT_SERVER_ADDRESS=0.0.0.0'''
    
    with open("docker-compose.yml", "w") as f:
        f.write(compose)
    
    print("Created docker-compose.yml")

if __name__ == "__main__":
    print("Creating deployment configurations...")
    
    # Check if models exist
    if not os.path.exists("saved_models"):
        print("❌ Models not found! Please run 'python train_and_save_model.py' first.")
        exit(1)
    
    # Create deployment files
    create_huggingface_config()
    create_kaggle_notebook()
    create_dockerfile()
    create_docker_compose()
    
    print("\n✅ Deployment files created!")
    print("\n📋 Next steps:")
    print("1. For Hugging Face: Upload the entire directory to HF Hub")
    print("2. For Kaggle: Upload kaggle_notebook.ipynb to Kaggle")
    print("3. For Docker: Run 'docker-compose up'")
    print("4. For Streamlit Cloud: Push to GitHub and connect to Streamlit Cloud")