# -*- coding: utf-8 -*-
"""emotion-matcher.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1LTiGD09mHJRCtPkBO3f3XnYPACLCB_ro

## 1. Dataset
"""

import pandas as pd

# Define the file paths for each dataset split
splits = {
    'train': 'simplified/train-00000-of-00001.parquet',
    'validation': 'simplified/validation-00000-of-00001.parquet',
    'test': 'simplified/test-00000-of-00001.parquet'
}

# Load the training split from the Hugging Face Hub using the hf:// protocol
df = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/" + splits["train"])

# Preview the first few rows of the dataset
print(df.head())

# View dataset shape
print("Dataset shape:", df.shape)

# View basic column information
print("\nColumn names:", df.columns.tolist())

# View detailed info
df.info()

# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

# Check for duplicated rows (convert unhashable list columns to string first)
print("\nNumber of duplicated rows:")
print(df.astype(str).duplicated().sum())

# Check how many unique combinations of emotion labels exist
print("\nNumber of unique label combinations:")
print(df["labels"].apply(lambda x: tuple(x)).nunique())

# Compute text lengths in number of words
df["text_length"] = df["text"].apply(lambda x: len(x.split()))

# Plot histogram of text lengths
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.hist(df["text_length"], bins=50)
plt.title("Distribution of Text Lengths (in words)")
plt.xlabel("Number of words")
plt.ylabel("Number of samples")
plt.grid(True)
plt.show()

# Count how many emotion labels each text has
df["num_labels"] = df["labels"].apply(len)

# Plot distribution of label counts per sample
plt.figure(figsize=(8, 5))
df["num_labels"].value_counts().sort_index().plot(kind="bar")
plt.xlabel("Number of emotion labels")
plt.ylabel("Number of samples")
plt.title("Distribution of Emotion Labels per Sample")
plt.show()

# Count frequency of each individual emotion label
from collections import Counter

# Flatten the list of labels across all samples
all_labels = [label for labels in df["labels"] for label in labels]
label_counts = Counter(all_labels)

# Convert to DataFrame for plotting
emotion_freq = pd.DataFrame.from_dict(label_counts, orient='index', columns=['count'])
emotion_freq = emotion_freq.sort_values(by='count', ascending=False)

# Plot the frequency of each emotion
emotion_freq.plot(kind='bar', figsize=(15, 5), legend=False)
plt.title("Frequency of Each Emotion Label")
plt.xlabel("Emotion Label ID")
plt.ylabel("Number of Occurrences")
plt.show()

# Create a binary (one-hot) matrix of emotions per sample
import numpy as np
import seaborn as sns

num_labels = max([max(l.tolist()) if len(l) > 0 else 0 for l in df["labels"]]) + 1
emotion_matrix = np.zeros((len(df), num_labels), dtype=int)
for i, labels in enumerate(df["labels"]):
    for label in labels:
        emotion_matrix[i, label] = 1

# Compute the label co-occurrence matrix
co_occurrence = np.dot(emotion_matrix.T, emotion_matrix)

# Plot heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(co_occurrence, cmap="Blues", linewidths=0.5)
plt.title("Emotion Co-occurrence Heatmap")
plt.xlabel("Emotion Label ID")
plt.ylabel("Emotion Label ID")
plt.show()
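# The raw counts above are dominated by frequent labels (GoEmotions is heavily
# skewed toward "neutral"), which can mask weaker pairings. As an optional
# extra (an addition, not part of the generated notebook), the sketch below
# rescales each row by that label's total frequency, giving an estimate of
# P(column label | row label).
label_totals = np.maximum(emotion_matrix.sum(axis=0), 1)  # guard against divide-by-zero
conditional = co_occurrence / label_totals[:, None]

plt.figure(figsize=(12, 10))
sns.heatmap(conditional, cmap="Blues", linewidths=0.5)
plt.title("Conditional Co-occurrence: P(column | row)")
plt.xlabel("Emotion Label ID")
plt.ylabel("Emotion Label ID")
plt.show()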
# Display 5 random rows
print("Sample text examples with emotion labels:")
print(df.sample(5)[["text", "labels"]])

# Define the emotion label ID -> name mapping manually (based on the GoEmotions documentation)
id2label = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval',
    'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
    'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
    'gratitude', 'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief', 'remorse',
    'sadness', 'surprise', 'neutral'
]

def decode_labels(label_ids):
    return [id2label[i] for i in label_ids]

# Display 5 random samples with readable label names
print("Sample text examples with emotion label names:")
sample_df = df.sample(5).copy()  # .copy() avoids a SettingWithCopyWarning on the next line
sample_df["label_names"] = sample_df["labels"].apply(decode_labels)
print(sample_df[["text", "label_names"]])

# Word cloud of the full corpus
from wordcloud import WordCloud

all_text = " ".join(df["text"])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)

plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Most Frequent Words in All Text Samples")
plt.show()

# Clean the text data
import re
import string

def clean_text(text):
    text = text.lower()
    text = re.sub(r"\[.*?\]", "", text)               # drop bracketed tokens such as [NAME]
    text = text.translate(str.maketrans('', '', string.punctuation))  # strip punctuation
    text = re.sub(r"\d+", "", text)                   # strip digits
    text = re.sub(r"\s+", " ", text).strip()          # collapse whitespace
    return text

df["clean_text"] = df["text"].apply(clean_text)
print("Sample cleaned texts:")
print(df[["text", "clean_text"]].sample(5))

# Plot the label distribution with readable names
label_counts = Counter([label for sublist in df["labels"] for label in sublist])
label_df = pd.DataFrame.from_dict(label_counts, orient="index", columns=["count"])
label_df.index.name = "label_id"
label_df = label_df.sort_index()
label_df["label_name"] = label_df.index.map(lambda i: id2label[i])

plt.figure(figsize=(14, 6))
sns.barplot(x="label_name", y="count", data=label_df)
plt.xticks(rotation=45, ha="right")
plt.title("Distribution of Emotion Labels in Training Set")
plt.xlabel("Emotion")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

# Sentence embeddings
from sentence_transformers import SentenceTransformer
import torch

model = SentenceTransformer('all-MiniLM-L6-v2')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# Embed a 3,000-sample subset to keep t-SNE and clustering fast
sample_df = df.sample(n=3000, random_state=42).reset_index(drop=True)
embeddings = model.encode(sample_df["clean_text"].tolist(), show_progress_bar=True, device=device)
sample_df["embedding"] = embeddings.tolist()

# t-SNE visualization
from sklearn.manifold import TSNE

X = np.array(sample_df["embedding"].tolist())
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_embedded = tsne.fit_transform(X)

sample_df["x"] = X_embedded[:, 0]
sample_df["y"] = X_embedded[:, 1]

plt.figure(figsize=(10, 6))
plt.scatter(sample_df["x"], sample_df["y"], alpha=0.5)
plt.title("t-SNE Projection of Text Embeddings")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.show()

# K-Means clustering on the embeddings
from sklearn.cluster import KMeans

num_clusters = 8
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
sample_df["cluster"] = kmeans.fit_predict(X)

plt.figure(figsize=(10, 6))
scatter = plt.scatter(sample_df["x"], sample_df["y"], c=sample_df["cluster"], cmap='tab10', alpha=0.6)
plt.title(f"K-Means Clustering (k={num_clusters}) on t-SNE Projection")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.colorbar(scatter, label="Cluster")
plt.show()
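# Optional sanity check (an addition, not in the original notebook): the choice
# of k = 8 above is arbitrary, and the silhouette score gives a rough way to
# compare a few values of k on the same embeddings; higher is better.
from sklearn.metrics import silhouette_score

for k in (4, 6, 8, 10, 12):
    cluster_ids = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X)
    print(f"k={k}: silhouette score = {silhouette_score(X, cluster_ids):.3f}")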
# Recommendation function
from sentence_transformers import util

EMBEDDINGS = torch.tensor(sample_df['embedding'].tolist(), device=device)

def recommend_similar_emotions(user_input):
    if not user_input.strip():
        return "Please enter some text."
    # Embed the query and rank all stored samples by cosine similarity
    user_embedding = model.encode(user_input, convert_to_tensor=True, device=device)
    similarities = util.cos_sim(user_embedding, EMBEDDINGS)[0]
    top_indices = similarities.argsort(descending=True)[:5]

    # Return the five most similar texts with their emotion labels
    results = []
    for idx in top_indices:
        row = sample_df.iloc[idx.item()]
        results.append(f"{row['text']}\nEmotions: {row['labels']}")
    return "\n\n".join(results)

# Gradio app
import gradio as gr

demo = gr.Interface(
    fn=recommend_similar_emotions,
    inputs=gr.Textbox(lines=2, placeholder="Type your situation or feeling..."),
    outputs="text",
    title="Emotion Matcher",
    description="Describe how you feel, and get similar examples with emotion labels."
)

demo.launch()
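# Optional variant (an addition, not part of the generated notebook): the
# matcher above prints raw numeric label IDs. Since id2label and decode_labels
# are already defined, a hypothetical wrapper like recommend_with_names below
# returns readable emotion names instead; pass it as fn= to gr.Interface in
# place of recommend_similar_emotions.
def recommend_with_names(user_input):
    if not user_input.strip():
        return "Please enter some text."
    user_embedding = model.encode(user_input, convert_to_tensor=True, device=device)
    similarities = util.cos_sim(user_embedding, EMBEDDINGS)[0]
    top_indices = similarities.argsort(descending=True)[:5]
    results = []
    for idx in top_indices:
        row = sample_df.iloc[idx.item()]
        names = ", ".join(decode_labels(row["labels"]))  # e.g. "joy, optimism"
        results.append(f"{row['text']}\nEmotions: {names}")
    return "\n\n".join(results)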