# -*- coding: utf-8 -*-
"""emotion-matcher.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1LTiGD09mHJRCtPkBO3f3XnYPACLCB_ro

## 1. Dataset
"""

import pandas as pd

# Define the file paths for each dataset split
splits = {
    'train': 'simplified/train-00000-of-00001.parquet',
    'validation': 'simplified/validation-00000-of-00001.parquet',
    'test': 'simplified/test-00000-of-00001.parquet'
}

# Load the training split from the Hugging Face Hub using the hf:// protocol
df = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/" + splits["train"])

# Preview the first few rows of the dataset
print(df.head())

# View dataset shape
print("Dataset shape:", df.shape)

# View basic column information
print("\nColumn names:", df.columns.tolist())

# View detailed info
df.info()

# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

# Check for duplicated rows (convert unhashable list columns to string first)
print("\nNumber of duplicated rows:")
print(df.astype(str).duplicated().sum())

# Check how many unique combinations of emotion labels exist
print("\nNumber of unique label combinations:")
print(df["labels"].apply(lambda x: tuple(x)).nunique())

# Compute text lengths in number of words
df["text_length"] = df["text"].apply(lambda x: len(x.split()))

# Plot histogram of text lengths
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.hist(df["text_length"], bins=50)
plt.title("Distribution of Text Lengths (in words)")
plt.xlabel("Number of words")
plt.ylabel("Number of samples")
plt.grid(True)
plt.show()

# Count how many emotion labels each text has
df["num_labels"] = df["labels"].apply(len)

# Plot distribution of label counts per sample
plt.figure(figsize=(8, 5))
df["num_labels"].value_counts().sort_index().plot(kind="bar")
plt.xlabel("Number of emotion labels")
plt.ylabel("Number of samples")
plt.title("Distribution of Emotion Labels per Sample")
plt.show()

# Count frequency of each individual emotion label
from collections import Counter

# Flatten the list of labels across all samples
all_labels = [label for labels in df["labels"] for label in labels]
label_counts = Counter(all_labels)

# Convert to DataFrame for plotting
emotion_freq = pd.DataFrame.from_dict(label_counts, orient='index', columns=['count'])
emotion_freq = emotion_freq.sort_values(by='count', ascending=False)

# Plot the frequency of each emotion
emotion_freq.plot(kind='bar', figsize=(15, 5), legend=False)
plt.title("Frequency of Each Emotion Label")
plt.xlabel("Emotion Label ID")
plt.ylabel("Number of Occurrences")
plt.show()

# Create a binary (one-hot) matrix of emotions per sample
import numpy as np
import seaborn as sns

num_labels = max([max(l.tolist()) if len(l) > 0 else 0 for l in df["labels"]]) + 1
emotion_matrix = np.zeros((len(df), num_labels), dtype=int)
for i, labels in enumerate(df["labels"]):
    for label in labels:
        emotion_matrix[i, label] = 1

# Compute the label co-occurrence matrix
co_occurrence = np.dot(emotion_matrix.T, emotion_matrix)

# Plot heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(co_occurrence, cmap="Blues", linewidths=0.5)
plt.title("Emotion Co-occurrence Heatmap")
plt.xlabel("Emotion Label ID")
plt.ylabel("Emotion Label ID")
plt.show()
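# The raw counts above are dominated by frequent labels (GoEmotions is heavily
# skewed toward "neutral"), which can mask weaker pairings. As an optional
# extra (an addition, not part of the generated notebook), the sketch below
# rescales each row by that label's total frequency, giving an estimate of
# P(column label | row label).
label_totals = np.maximum(emotion_matrix.sum(axis=0), 1)  # guard against divide-by-zero
conditional = co_occurrence / label_totals[:, None]

plt.figure(figsize=(12, 10))
sns.heatmap(conditional, cmap="Blues", linewidths=0.5)
plt.title("Conditional Co-occurrence: P(column | row)")
plt.xlabel("Emotion Label ID")
plt.ylabel("Emotion Label ID")
plt.show()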
# Display 5 random rows
print("Sample text examples with emotion labels:")
print(df.sample(5)[["text", "labels"]])

# Define the emotion label ID -> name mapping manually (based on the GoEmotions documentation)
id2label = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval',
    'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
    'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
    'gratitude', 'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief', 'remorse',
    'sadness', 'surprise', 'neutral'
]

def decode_labels(label_ids):
    return [id2label[i] for i in label_ids]

# Display 5 random samples with readable label names
print("Sample text examples with emotion label names:")
sample_df = df.sample(5).copy()  # .copy() avoids a SettingWithCopyWarning on the next line
sample_df["label_names"] = sample_df["labels"].apply(decode_labels)
print(sample_df[["text", "label_names"]])

# Word cloud of the full corpus
from wordcloud import WordCloud

all_text = " ".join(df["text"])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)

plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Most Frequent Words in All Text Samples")
plt.show()

# Clean the text data
import re
import string

def clean_text(text):
    text = text.lower()
    text = re.sub(r"\[.*?\]", "", text)               # drop bracketed tokens such as [NAME]
    text = text.translate(str.maketrans('', '', string.punctuation))  # strip punctuation
    text = re.sub(r"\d+", "", text)                   # strip digits
    text = re.sub(r"\s+", " ", text).strip()          # collapse whitespace
    return text

df["clean_text"] = df["text"].apply(clean_text)
print("Sample cleaned texts:")
print(df[["text", "clean_text"]].sample(5))

# Plot the label distribution with readable names
label_counts = Counter([label for sublist in df["labels"] for label in sublist])
label_df = pd.DataFrame.from_dict(label_counts, orient="index", columns=["count"])
label_df.index.name = "label_id"
label_df = label_df.sort_index()
label_df["label_name"] = label_df.index.map(lambda i: id2label[i])

plt.figure(figsize=(14, 6))
sns.barplot(x="label_name", y="count", data=label_df)
plt.xticks(rotation=45, ha="right")
plt.title("Distribution of Emotion Labels in Training Set")
plt.xlabel("Emotion")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

# Sentence embeddings
from sentence_transformers import SentenceTransformer
import torch

model = SentenceTransformer('all-MiniLM-L6-v2')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# Embed a 3,000-sample subset to keep t-SNE and clustering fast
sample_df = df.sample(n=3000, random_state=42).reset_index(drop=True)
embeddings = model.encode(sample_df["clean_text"].tolist(), show_progress_bar=True, device=device)
sample_df["embedding"] = embeddings.tolist()

# t-SNE visualization
from sklearn.manifold import TSNE

X = np.array(sample_df["embedding"].tolist())
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_embedded = tsne.fit_transform(X)

sample_df["x"] = X_embedded[:, 0]
sample_df["y"] = X_embedded[:, 1]

plt.figure(figsize=(10, 6))
plt.scatter(sample_df["x"], sample_df["y"], alpha=0.5)
plt.title("t-SNE Projection of Text Embeddings")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.show()

# K-Means clustering on the embeddings
from sklearn.cluster import KMeans

num_clusters = 8
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
sample_df["cluster"] = kmeans.fit_predict(X)

plt.figure(figsize=(10, 6))
scatter = plt.scatter(sample_df["x"], sample_df["y"], c=sample_df["cluster"], cmap='tab10', alpha=0.6)
plt.title(f"K-Means Clustering (k={num_clusters}) on t-SNE Projection")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.colorbar(scatter, label="Cluster")
plt.show()
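# Optional sanity check (an addition, not in the original notebook): the choice
# of k = 8 above is arbitrary, and the silhouette score gives a rough way to
# compare a few values of k on the same embeddings; higher is better.
from sklearn.metrics import silhouette_score

for k in (4, 6, 8, 10, 12):
    cluster_ids = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X)
    print(f"k={k}: silhouette score = {silhouette_score(X, cluster_ids):.3f}")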
# Recommendation function
from sentence_transformers import util

EMBEDDINGS = torch.tensor(sample_df['embedding'].tolist(), device=device)

def recommend_similar_emotions(user_input):
    if not user_input.strip():
        return "Please enter some text."
    # Embed the query and rank all stored samples by cosine similarity
    user_embedding = model.encode(user_input, convert_to_tensor=True, device=device)
    similarities = util.cos_sim(user_embedding, EMBEDDINGS)[0]
    top_indices = similarities.argsort(descending=True)[:5]

    # Return the five most similar texts with their emotion labels
    results = []
    for idx in top_indices:
        row = sample_df.iloc[idx.item()]
        results.append(f"{row['text']}\nEmotions: {row['labels']}")
    return "\n\n".join(results)

# Gradio app
import gradio as gr

demo = gr.Interface(
    fn=recommend_similar_emotions,
    inputs=gr.Textbox(lines=2, placeholder="Type your situation or feeling..."),
    outputs="text",
    title="Emotion Matcher",
    description="Describe how you feel, and get similar examples with emotion labels."
)

demo.launch()
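# Optional variant (an addition, not part of the generated notebook): the
# matcher above prints raw numeric label IDs. Since id2label and decode_labels
# are already defined, a hypothetical wrapper like recommend_with_names below
# returns readable emotion names instead; pass it as fn= to gr.Interface in
# place of recommend_similar_emotions.
def recommend_with_names(user_input):
    if not user_input.strip():
        return "Please enter some text."
    user_embedding = model.encode(user_input, convert_to_tensor=True, device=device)
    similarities = util.cos_sim(user_embedding, EMBEDDINGS)[0]
    top_indices = similarities.argsort(descending=True)[:5]
    results = []
    for idx in top_indices:
        row = sample_df.iloc[idx.item()]
        names = ", ".join(decode_labels(row["labels"]))  # e.g. "joy, optimism"
        results.append(f"{row['text']}\nEmotions: {names}")
    return "\n\n".join(results)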