# emotion-matcher / app.py
# vova631's picture
# Update app.py
# 69d8d00 verified
# -*- coding: utf-8 -*-
"""emotion-matcher.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1LTiGD09mHJRCtPkBO3f3XnYPACLCB_ro
## 1. Dataset
"""
import pandas as pd
# Parquet file of every GoEmotions split, relative to the dataset repository root
splits = {
    'train': 'simplified/train-00000-of-00001.parquet',
    'validation': 'simplified/validation-00000-of-00001.parquet',
    'test': 'simplified/test-00000-of-00001.parquet',
}

# Read the training split directly from the HuggingFace Hub (hf:// protocol)
train_path = "hf://datasets/google-research-datasets/go_emotions/" + splits["train"]
df = pd.read_parquet(train_path)

# First rows, overall shape and column names
print(df.head())
print("Dataset shape:", df.shape)
print("\nColumn names:", list(df.columns))

# Dtypes / non-null counts
df.info()

# Missing values per column
print("Missing values per column:")
print(df.isna().sum())

# Fully duplicated rows; everything is stringified first because the label
# column holds unhashable containers
print("\nNumber of duplicated rows:")
duplicate_mask = df.astype(str).duplicated()
print(duplicate_mask.sum())

# Distinct combinations of emotion labels
print("\nNumber of unique label combinations:")
print(len({tuple(ids) for ids in df["labels"]}))
# Length of every sample in whitespace-separated words
df["text_length"] = [len(text.split()) for text in df["text"]]

# Histogram of the sample lengths
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10,6))
ax.hist(df["text_length"], bins=50)
ax.set_title("Distribution of Text Lengths (in words)")
ax.set_xlabel("Number of words")
ax.set_ylabel("Number of samples")
ax.grid(True)
plt.show()

# How many emotion labels are attached to each sample
df["num_labels"] = [len(ids) for ids in df["labels"]]

# Bar chart: number of samples per label count
samples_per_label_count = df["num_labels"].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(8,5))
samples_per_label_count.plot(kind="bar", ax=ax)
ax.set_xlabel("Number of emotion labels")
ax.set_ylabel("Number of samples")
ax.set_title("Distribution of Emotion Labels per Sample")
plt.show()
# Occurrences of each individual emotion label across the whole split
from collections import Counter

# One flat list holding every label of every sample
all_labels = []
for sample_labels in df["labels"]:
    all_labels.extend(sample_labels)
label_counts = Counter(all_labels)

# Tabulate the counts, most frequent label first
emotion_freq = (
    pd.DataFrame.from_dict(label_counts, orient='index', columns=['count'])
    .sort_values(by='count', ascending=False)
)

# Bar chart of the label frequencies
ax = emotion_freq.plot(kind='bar', figsize=(15,5), legend=False)
ax.set_title("Frequency of Each Emotion Label")
ax.set_xlabel("Emotion Label ID")
ax.set_ylabel("Number of Occurrences")
plt.show()
# Build a binary (samples x emotions) indicator matrix
import numpy as np
import seaborn as sns

# Number of columns = highest label ID seen + 1. Samples without labels are
# skipped, and a frame with no labels at all falls back to a single column
# instead of raising on max() of an empty sequence. Using the builtin max()
# directly works for both NumPy arrays and plain lists of IDs.
num_labels = max(
    (int(max(ids)) for ids in df["labels"] if len(ids) > 0),
    default=0,
) + 1
emotion_matrix = np.zeros((len(df), num_labels), dtype=int)
for i, labels in enumerate(df["labels"]):
    for label in labels:
        emotion_matrix[i, label] = 1

# Co-occurrence counts: entry (a, b) is the number of samples tagged with
# both label a and label b; the diagonal holds each label's own frequency.
co_occurrence = emotion_matrix.T @ emotion_matrix

# Plot heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(co_occurrence, cmap="Blues", linewidths=0.5)
plt.title("Emotion Co-occurrence Heatmap")
plt.xlabel("Emotion Label ID")
plt.ylabel("Emotion Label ID")
plt.show()
# Print five randomly drawn samples next to their raw label IDs
print("Sample text examples with emotion labels:")
random_rows = df.sample(5)
print(random_rows[["text", "labels"]])
# Emotion names indexed by label ID, defined manually (based on GoEmotions documentation)
id2label = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval',
    'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
    'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
    'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
    'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise',
    'neutral'
]


def decode_labels(label_ids):
    """Translate emotion label IDs into their readable names.

    Args:
        label_ids: Iterable of integer label IDs, each a valid index into
            ``id2label``.

    Returns:
        A list with the emotion name for every ID, in input order.

    Raises:
        IndexError: If an ID lies outside the range of ``id2label``.
    """
    return [id2label[label_id] for label_id in label_ids]
# Print five random samples again, this time with emotion names instead of IDs
print("Sample text examples with emotion label names:")
sample_df = df.sample(5)
sample_df = sample_df.assign(label_names=sample_df["labels"].apply(decode_labels))
print(sample_df.loc[:, ["text", "label_names"]])
# Word cloud over the concatenation of all raw texts
from wordcloud import WordCloud
all_text = " ".join(df["text"])
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(all_text)

# Render the cloud as an image without axis decorations
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
ax.set_title("Most Frequent Words in All Text Samples")
plt.show()
# Clean the text data
import re
import string
# Built once at import time so clean_text does no per-call setup work when it
# is applied to every row of the dataset.
_BRACKETED_RE = re.compile(r"\[.*?\]")
_DIGITS_RE = re.compile(r"\d+")
_WHITESPACE_RE = re.compile(r"\s+")
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise a raw text sample.

    Steps, in order: lowercase, drop square-bracketed spans (e.g. ``[NAME]``),
    delete ASCII punctuation, delete digit runs, collapse whitespace runs to a
    single space and strip both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; may be empty when nothing but removable
        characters was present.
    """
    text = text.lower()
    # Bracketed spans must go before punctuation removal, otherwise the
    # brackets disappear and their content would be kept as ordinary words.
    text = _BRACKETED_RE.sub("", text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = _DIGITS_RE.sub("", text)
    return _WHITESPACE_RE.sub(" ", text).strip()
# Store the normalised version of every text in its own column
df["clean_text"] = [clean_text(raw_text) for raw_text in df["text"]]

# Spot-check a few raw/cleaned pairs
print("Sample cleaned texts:")
text_pairs = df[["text", "clean_text"]]
print(text_pairs.sample(5))
# Per-emotion frequency table for the training split
label_counts = Counter()
for sublist in df["labels"]:
    label_counts.update(sublist)

# One row per label ID, ordered by ID, with the readable name alongside
label_df = (
    pd.DataFrame.from_dict(label_counts, orient="index", columns=["count"])
    .rename_axis("label_id")
    .sort_index()
)
label_df["label_name"] = [id2label[i] for i in label_df.index]

# Bar chart of the frequencies; names are rotated so they stay readable
plt.figure(figsize=(14, 6))
sns.barplot(data=label_df, x="label_name", y="count")
plt.xticks(rotation=45, ha="right")
plt.title("Distribution of Emotion Labels in Training Set")
plt.xlabel("Emotion")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()
# Sentence embeddings for a fixed random subset of the cleaned texts
from sentence_transformers import SentenceTransformer
import torch

# Run on the GPU when one is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

# Reproducible 3000-row sample with a fresh 0..n-1 index
sample_df = df.sample(n=3000, random_state=42).reset_index(drop=True)
clean_sentences = sample_df["clean_text"].tolist()
embeddings = model.encode(clean_sentences, show_progress_bar=True, device=device)

# Keep each vector next to its row as a plain Python list
sample_df["embedding"] = embeddings.tolist()
# Project the embeddings to two dimensions with t-SNE
from sklearn.manifold import TSNE

# Stack the per-row vectors back into one 2-D array
X = np.array(list(sample_df["embedding"]))
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_embedded = tsne.fit_transform(X)

# Keep the 2-D coordinates on the sample frame for the later plots
sample_df["x"] = X_embedded[:, 0]
sample_df["y"] = X_embedded[:, 1]

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(sample_df["x"], sample_df["y"], alpha=0.5)
ax.set_title("t-SNE Projection of Text Embeddings")
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
plt.show()
# Cluster the full-dimensional embeddings with K-Means
from sklearn.cluster import KMeans
num_clusters = 8
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
sample_df["cluster"] = kmeans.fit(X).labels_

# Colour the t-SNE coordinates by cluster assignment
fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(
    sample_df["x"],
    sample_df["y"],
    c=sample_df["cluster"],
    cmap='tab10',
    alpha=0.6,
)
ax.set_title(f"K-Means Clustering (k={num_clusters}) on t-SNE Projection")
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
fig.colorbar(scatter, ax=ax, label="Cluster")
plt.show()
# Recommendation function: setup
from sentence_transformers import util

# All sample embeddings as a single tensor on the inference device, built once
# so every query can be compared against it directly
embedding_rows = list(sample_df['embedding'])
EMBEDDINGS = torch.tensor(embedding_rows, device=device)
def recommend_similar_emotions(user_input):
    """Return the five sample texts most similar to ``user_input``.

    The query is embedded with the module-level ``model`` and compared by
    cosine similarity against ``EMBEDDINGS``; the best-matching rows of
    ``sample_df`` are reported together with their emotion names.

    Args:
        user_input: Free text describing a situation or feeling. ``None`` and
            blank strings are treated as missing input.

    Returns:
        A string with one entry per match (text, then an ``Emotions:`` line),
        entries separated by a blank line, or a prompt asking for text when
        the input is missing.
    """
    # Guard against None as well as blank text so the call never fails on .strip().
    if user_input is None or not user_input.strip():
        return "Please enter some text."

    # The sample embeddings were computed from clean_text output, so the query
    # gets the same normalisation. If cleaning removes everything (e.g. input
    # made only of digits or punctuation), fall back to the stripped raw text.
    query = clean_text(user_input) or user_input.strip()

    user_embedding = model.encode(query, convert_to_tensor=True, device=device)
    similarities = util.cos_sim(user_embedding, EMBEDDINGS)[0]
    top_indices = similarities.argsort(descending=True)[:5].tolist()

    results = []
    for idx in top_indices:
        row = sample_df.iloc[idx]
        # Show readable emotion names rather than the raw label-ID array.
        emotions = ", ".join(decode_labels(row['labels']))
        results.append(f"{row['text']}\nEmotions: {emotions}")
    return "\n\n".join(results)
# Gradio front end: one free-text box in, plain text out
import gradio as gr

# Two-line input box for the user's description
feeling_box = gr.Textbox(lines=2, placeholder="Type your situation or feeling...")

demo = gr.Interface(
    fn=recommend_similar_emotions,
    inputs=feeling_box,
    outputs="text",
    title="Emotion Matcher",
    description="Describe how you feel, and get similar examples with emotion labels.",
)

# Start serving the interface
demo.launch()