# NOTE: "Spaces: Sleeping Sleeping" -- appears to be web-page header residue captured with this file; not program text.
# -*- coding: utf-8 -*-
"""emotion-matcher.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1LTiGD09mHJRCtPkBO3f3XnYPACLCB_ro

## 1. Dataset
"""
import pandas as pd

# Define the file paths for each dataset split
splits = {
    'train': 'simplified/train-00000-of-00001.parquet',
    'validation': 'simplified/validation-00000-of-00001.parquet',
    'test': 'simplified/test-00000-of-00001.parquet',
}

# Load the training set from HuggingFace Hub using the hf:// protocol
df = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/" + splits["train"])

# Preview the first few rows of the dataset
print(df.head())

# View dataset shape
print("Dataset shape:", df.shape)

# View basic column information
print("\nColumn names:", df.columns.tolist())

# View detailed info
df.info()

# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

# Check for duplicated rows (convert unhashable columns to string)
print("\nNumber of duplicated rows:")
print(df.astype(str).duplicated().sum())

# Check how many unique combinations of emotion labels exist.
# Each label collection is turned into a tuple so it is hashable for nunique().
print("\nNumber of unique label combinations:")
print(df["labels"].apply(tuple).nunique())
# Compute text lengths in number of whitespace-separated words
df["text_length"] = df["text"].apply(lambda text: len(text.split()))

# Plot histogram of text lengths
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.hist(df["text_length"], bins=50)
plt.title("Distribution of Text Lengths (in words)")
plt.xlabel("Number of words")
plt.ylabel("Number of samples")
plt.grid(True)
plt.show()

# Count how many emotion labels each text has
df["num_labels"] = df["labels"].apply(len)

# Plot distribution (sort_index keeps the bars ordered by label count)
plt.figure(figsize=(8, 5))
df["num_labels"].value_counts().sort_index().plot(kind="bar")
plt.xlabel("Number of emotion labels")
plt.ylabel("Number of samples")
plt.title("Distribution of Emotion Labels per Sample")
plt.show()
# Count frequency of each individual emotion label
from collections import Counter

# Flatten the list of labels across all samples
all_labels = [label for labels in df["labels"] for label in labels]
label_counts = Counter(all_labels)

# Convert to DataFrame for plotting, most frequent label first
emotion_freq = pd.DataFrame.from_dict(label_counts, orient='index', columns=['count'])
emotion_freq = emotion_freq.sort_values(by='count', ascending=False)

# Plot the frequency of each emotion
emotion_freq.plot(kind='bar', figsize=(15, 5), legend=False)
plt.title("Frequency of Each Emotion Label")
plt.xlabel("Emotion Label ID")
plt.ylabel("Number of Occurrences")
plt.show()
# Create a binary matrix for emotions
import numpy as np
import seaborn as sns

# One column per label ID: the width is the largest ID seen in the data, plus one.
# Rows without labels are skipped; default=0 keeps a width of 1 when no row
# carries any label instead of raising on an empty sequence.
num_labels = max(
    (int(max(ids)) for ids in df["labels"] if len(ids) > 0),
    default=0,
) + 1

# emotion_matrix[i, j] == 1 when sample i carries label j
emotion_matrix = np.zeros((len(df), num_labels), dtype=int)
for i, labels in enumerate(df["labels"]):
    for label in labels:
        emotion_matrix[i, label] = 1

# Compute co-occurrence matrix: entry (a, b) counts samples tagged with both a and b
co_occurrence = np.dot(emotion_matrix.T, emotion_matrix)

# Plot heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(co_occurrence, cmap="Blues", linewidths=0.5)
plt.title("Emotion Co-occurrence Heatmap")
plt.xlabel("Emotion Label ID")
plt.ylabel("Emotion Label ID")
plt.show()
# Display 5 random rows (unseeded, so the selection differs between runs)
print("Sample text examples with emotion labels:")
print(df.sample(5)[["text", "labels"]])
# Define emotion label ID to name mapping manually (based on GoEmotions documentation).
# The position in the list is the label ID.
id2label = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval',
    'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
    'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
    'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
    'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise',
    'neutral',
]


def decode_labels(label_ids):
    """Translate integer label IDs into their emotion names.

    Args:
        label_ids: Iterable of integer indices into ``id2label``.

    Returns:
        A list of emotion names in the same order as ``label_ids``.

    Raises:
        IndexError: If an ID lies outside the range of ``id2label``.
    """
    return [id2label[i] for i in label_ids]
# Display 5 random samples with readable label names
print("Sample text examples with emotion label names:")
sample_df = df.sample(5)
sample_df["label_names"] = sample_df["labels"].apply(decode_labels)
print(sample_df[["text", "label_names"]])
# Word cloud
from wordcloud import WordCloud

# Build the cloud from the raw (uncleaned) text of every sample joined together
all_text = " ".join(df["text"])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)

plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Most Frequent Words in All Text Samples")
plt.show()
# Clean the text data
import re
import string

# Built once: clean_text is applied to every row of the DataFrame, so the
# translation table and patterns should not be rebuilt on each call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_BRACKETED_RE = re.compile(r"\[.*?\]")
_DIGITS_RE = re.compile(r"\d+")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Normalize a piece of text for embedding.

    Steps, in order: lowercase; drop square-bracketed spans such as
    ``[NAME]`` (presumably the dataset's masked tokens); strip ASCII
    punctuation; remove digit runs; collapse whitespace runs to a single
    space and trim both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    # Brackets must go before punctuation stripping, otherwise the "[" / "]"
    # delimiters vanish and the placeholder word is left behind.
    text = _BRACKETED_RE.sub("", text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = _DIGITS_RE.sub("", text)
    return _WHITESPACE_RE.sub(" ", text).strip()
df["clean_text"] = df["text"].apply(clean_text)
print("Sample cleaned texts:")
print(df[["text", "clean_text"]].sample(5))

# Plot label distribution
label_counts = Counter(label for sublist in df["labels"] for label in sublist)
label_df = pd.DataFrame.from_dict(label_counts, orient="index", columns=["count"])
label_df.index.name = "label_id"
# Sort by label ID so the bars follow the order of id2label
label_df = label_df.sort_index()
label_df["label_name"] = label_df.index.map(lambda i: id2label[i])

plt.figure(figsize=(14, 6))
sns.barplot(x="label_name", y="count", data=label_df)
plt.xticks(rotation=45, ha="right")
plt.title("Distribution of Emotion Labels in Training Set")
plt.xlabel("Emotion")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()
# Embeddings
from sentence_transformers import SentenceTransformer
import torch

model = SentenceTransformer('all-MiniLM-L6-v2')
# Use the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# Embed a fixed, seeded 3000-row sample rather than the whole training set;
# the index is reset so positional row i lines up with embedding i.
sample_df = df.sample(n=3000, random_state=42).reset_index(drop=True)
embeddings = model.encode(sample_df["clean_text"].tolist(), show_progress_bar=True, device=device)
sample_df["embedding"] = embeddings.tolist()
# t-SNE visualization
from sklearn.manifold import TSNE

# Rebuild a 2-D array (one row per sample) from the per-row embedding lists
X = np.array(sample_df["embedding"].tolist())
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_embedded = tsne.fit_transform(X)
sample_df["x"] = X_embedded[:, 0]
sample_df["y"] = X_embedded[:, 1]

plt.figure(figsize=(10, 6))
plt.scatter(sample_df["x"], sample_df["y"], alpha=0.5)
plt.title("t-SNE Projection of Text Embeddings")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.show()
# KMeans Clustering
from sklearn.cluster import KMeans

num_clusters = 8
# NOTE(review): the default for n_init changed from 10 to 'auto' in
# scikit-learn 1.4; pin it explicitly if cluster assignments must be
# reproducible across library versions.
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
# Clusters are fitted on the full embeddings (X); the t-SNE coordinates are
# only used to draw them.
sample_df["cluster"] = kmeans.fit_predict(X)

plt.figure(figsize=(10, 6))
scatter = plt.scatter(sample_df["x"], sample_df["y"], c=sample_df["cluster"], cmap='tab10', alpha=0.6)
plt.title(f"K-Means Clustering (k={num_clusters}) on t-SNE Projection")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.colorbar(scatter, label="Cluster")
plt.show()
# Recommendation Function
from sentence_transformers import util

# Row i of EMBEDDINGS is the embedding stored in row i of sample_df, kept as a
# tensor on the model's device so it can be compared against query embeddings.
EMBEDDINGS = torch.tensor(sample_df['embedding'].tolist(), device=device)
def recommend_similar_emotions(user_input):
    """Return the five sampled texts most similar to ``user_input``.

    The input is embedded with the module-level ``model`` and ranked by
    cosine similarity against ``EMBEDDINGS``; the matching rows are read
    from ``sample_df``.

    Args:
        user_input: Free-form text from the UI. ``None`` and blank or
            whitespace-only strings are treated as missing input.

    Returns:
        A prompt asking for text when the input is missing; otherwise the
        matched texts, most similar first, each followed by an
        ``Emotions:`` line listing its emotion names, with entries
        separated by a blank line.
    """
    # Check for None before calling .strip(), which would raise on it.
    if not user_input or not user_input.strip():
        return "Please enter some text."

    user_embedding = model.encode(user_input, convert_to_tensor=True, device=device)
    # cos_sim returns a 1 x N matrix for a single query; take its only row.
    similarities = util.cos_sim(user_embedding, EMBEDDINGS)[0]
    top_indices = similarities.argsort(descending=True)[:5]

    results = []
    for idx in top_indices.tolist():
        row = sample_df.iloc[idx]
        # Show emotion names rather than the raw numeric label array.
        emotions = ", ".join(decode_labels(row["labels"]))
        results.append(f"{row['text']}\nEmotions: {emotions}")
    return "\n\n".join(results)
# Gradio App
import gradio as gr

# Single text box in, plain text out; the handler does the similarity lookup.
demo = gr.Interface(
    fn=recommend_similar_emotions,
    inputs=gr.Textbox(lines=2, placeholder="Type your situation or feeling..."),
    outputs="text",
    title="Emotion Matcher",
    description="Describe how you feel, and get similar examples with emotion labels.",
)
demo.launch()