Spaces:

hantupocong
/

Text-to-MalaySignLanguage

Sleeping

App Files Files Community

Text-to-MalaySignLanguage / data.py

hantupocong

Update data.py

0af5be2 verified about 1 month ago

raw

history blame contribute delete

8.62 kB

	import torch
	import torch.nn as nn
	from torch.utils.data import Dataset, DataLoader, random_split
	import pandas as pd
	import numpy as np
	from transformers import BertTokenizer
	import os
	from pose_format import Pose
	import matplotlib.pyplot as plt
	from matplotlib import animation
	from fastdtw import fastdtw # Keep this import
	from scipy.spatial.distance import cosine
	from config import MAX_TEXT_LEN, TARGET_NUM_FRAMES, BATCH_SIZE, TEACHER_FORCING_RATIO, SMOOTHING_ENABLED

	# ===== KEYPOINT SELECTION =====
	# Half-open [start, stop) landmark ranges that are kept from every pose frame.
	# Presumably 25 body joints followed by two 21-joint hand groups — confirm
	# against the landmark layout of the pose files.
	_KEYPOINT_RANGES = ((0, 25), (501, 522), (522, 543))
	selected_keypoint_indices = list(
	    np.concatenate([np.arange(start, stop) for start, stop in _KEYPOINT_RANGES])
	)
	NUM_KEYPOINTS = len(selected_keypoint_indices)
	# Each keypoint contributes three coordinates to the flattened pose vector.
	POSE_DIM = 3 * NUM_KEYPOINTS

	# Run on the GPU when one is available, otherwise fall back to the CPU.
	_device_name = "cuda" if torch.cuda.is_available() else "cpu"
	device = torch.device(_device_name)

	# ===== SMOOTHING =====
	def selective_smoothing(preds):
	    """Smooth the leading body channels of a pose sequence along the time axis.

	    A 3-tap ``[0.25, 0.5, 0.25]`` filter is applied to the first ``25 * 3``
	    feature channels of every interior frame. The first and last frame, and
	    every channel from ``25 * 3`` onward, are returned unchanged.

	    Args:
	        preds: Tensor indexed as ``[batch, time, feature]``.

	    Returns:
	        A new tensor with the same shape as ``preds``; ``preds`` itself is
	        not modified.
	    """
	    smoothed = preds.clone()
	    # Fewer than three frames means there is no interior frame to filter.
	    if preds.shape[1] < 3:
	        return smoothed

	    body_indices = slice(0, 25 * 3)
	    # All interior frames are filtered in one vectorised assignment. Every
	    # operand is read from ``preds`` (never from ``smoothed``), so the result
	    # matches a frame-by-frame loop exactly.
	    smoothed[:, 1:-1, body_indices] = (
	        0.25 * preds[:, :-2, body_indices]
	        + 0.5 * preds[:, 1:-1, body_indices]
	        + 0.25 * preds[:, 2:, body_indices]
	    )
	    return smoothed


	# ===== HAND INDEX SETUP =====
	# Per-channel weights over the flattened pose vector: channels from 15 * 3
	# onward are weighted 3x, every earlier channel keeps weight 1.
	# NOTE(review): the body block covers 25 joints (channels 0..74), so this
	# range also includes body joints 15-24 — confirm that starting at joint 15
	# rather than 25 is intended.
	hand_indices = list(range(15 * 3, POSE_DIM))
	joint_weights = torch.ones(POSE_DIM, device=device)
	joint_weights[hand_indices] = 3.0

	# ===== GLOBAL NORMALIZATION =====
	def _fit_to_length(frames, target_len):
	    """Zero-pad or uniformly subsample a (T, D) array to exactly ``target_len`` rows."""
	    # float64 keeps the statistics accumulated downstream in double precision.
	    frames = np.asarray(frames, dtype=np.float64)
	    n_frames = frames.shape[0]
	    if n_frames < target_len:
	        padding = np.zeros((target_len - n_frames, frames.shape[1]))
	        return np.concatenate([frames, padding], axis=0)
	    picks = np.linspace(0, n_frames - 1, target_len).astype(int)
	    return frames[picks]


	def compute_global_mean_std(pose_folder, csv_file):
	    """Compute per-channel mean and std of the selected keypoints over a dataset.

	    Every pose file listed in the ``filename`` column of ``csv_file`` is read
	    from ``pose_folder``, reduced to ``selected_keypoint_indices``, flattened
	    to one row per frame and padded/subsampled to ``TARGET_NUM_FRAMES`` rows.
	    Only coordinates whose confidence is above 0.5 contribute to the
	    statistics, so zero padding never biases them.

	    Args:
	        pose_folder: Directory containing the pose files.
	        csv_file: CSV file with a ``filename`` column.

	    Returns:
	        ``(mean, std)``: two plain float64 arrays with one entry per flattened
	        pose channel. Channels with zero spread get ``std == 1e-8``.

	    Raises:
	        ValueError: If ``csv_file`` lists no pose files.
	    """
	    data = pd.read_csv(csv_file)
	    # Loop-invariant: build the index array once instead of once per file.
	    keypoints = np.array(selected_keypoint_indices)

	    fitted_poses = []
	    fitted_masks = []
	    for filename in data["filename"]:
	        pose_path = os.path.join(pose_folder, filename)
	        with open(pose_path, "rb") as f:
	            pose = Pose.read(f.read())

	        # np.ma.getdata unwraps masked arrays up front so the returned
	        # statistics are plain ndarrays; validity is tracked through the
	        # confidence-based mask below.
	        # (T, 1, K, 3) -> (T, K, 3)
	        pose_data = np.squeeze(np.ma.getdata(pose.body.data), axis=1)[:, keypoints, :]
	        # (T, 1, K) -> (T, K)
	        confidence = np.squeeze(np.ma.getdata(pose.body.confidence), axis=1)[:, keypoints]

	        # Flatten to (T, K*3); the confidence is repeated for x, y and z.
	        pose_flat = pose_data.reshape(pose_data.shape[0], -1)
	        mask_flat = np.repeat(confidence, 3, axis=1) > 0.5

	        # Padded rows receive mask 0 and therefore carry no weight.
	        fitted_poses.append(_fit_to_length(pose_flat, TARGET_NUM_FRAMES))
	        fitted_masks.append(_fit_to_length(mask_flat, TARGET_NUM_FRAMES))

	    if not fitted_poses:
	        raise ValueError(f"No pose files listed in {csv_file!r}; cannot compute statistics.")

	    # (num_files * TARGET_NUM_FRAMES, K*3)
	    stacked_poses = np.vstack(fitted_poses)
	    stacked_masks = np.vstack(fitted_masks)

	    # Mask-weighted mean and variance; the epsilon guards channels that have
	    # no valid sample at all.
	    valid_counts = stacked_masks.sum(axis=0) + 1e-8
	    mean = (stacked_poses * stacked_masks).sum(axis=0) / valid_counts
	    variance = (stacked_masks * (stacked_poses - mean) ** 2).sum(axis=0) / valid_counts
	    std = np.sqrt(variance)

	    std[std == 0] = 1e-8  # Avoid division by zero when normalising.
	    return mean, std

	# Annotation CSV and on-disk cache of the normalisation statistics.
	CSV_FILE = "annotated.csv"
	mean_path = "global_mean.npy"
	std_path = "global_std.npy"

	if os.path.exists(mean_path) and os.path.exists(std_path):
	    print("Loading global mean and std from file.")
	    GLOBAL_MEAN = np.load(mean_path)
	    GLOBAL_STD = np.load(std_path)
	else:
	    print("Computing global mean and std from dataset.")
	    # The hard-coded POSE_FOLDER ("/content/drive/MyDrive/pose/words/ase") was
	    # commented out, so this branch used to fail with NameError whenever the
	    # cached .npy files were missing. Use a module-level POSE_FOLDER if one
	    # is defined, otherwise the POSE_FOLDER environment variable.
	    _pose_folder = globals().get("POSE_FOLDER") or os.environ.get("POSE_FOLDER")
	    if not _pose_folder:
	        raise RuntimeError(
	            f"{mean_path} / {std_path} not found and no pose folder is configured; "
	            "set the POSE_FOLDER environment variable to recompute them."
	        )
	    GLOBAL_MEAN, GLOBAL_STD = compute_global_mean_std(_pose_folder, CSV_FILE)
	    # np.save should store plain arrays; getdata unwraps a MaskedArray and
	    # returns an ordinary ndarray unchanged.
	    GLOBAL_MEAN = np.ma.getdata(GLOBAL_MEAN)
	    GLOBAL_STD = np.ma.getdata(GLOBAL_STD)

	    np.save(mean_path, GLOBAL_MEAN)
	    np.save(std_path, GLOBAL_STD)

	# float32 tensor copies of the statistics on the active device.
	GLOBAL_MEAN_T = torch.tensor(GLOBAL_MEAN, dtype=torch.float32, device=device)
	GLOBAL_STD_T = torch.tensor(GLOBAL_STD, dtype=torch.float32, device=device)


	class TextToPoseDataset(Dataset):
	    """Dataset pairing tokenized text with fixed-length, normalized pose sequences.

	    Each row of the CSV must provide a ``filename`` (a pose file inside
	    ``pose_folder``) and a ``text`` column.
	    """

	    def __init__(self, csv_file, pose_folder, tokenizer, is_train=True):
	        """Read the annotation CSV and remember where the pose files live.

	        Args:
	            csv_file: CSV with ``filename`` and ``text`` columns.
	            pose_folder: Directory containing the pose files.
	            tokenizer: Callable tokenizer; invoked with ``padding``,
	                ``truncation``, ``max_length`` and ``return_tensors``.
	            is_train: When true, random augmentations are applied per item.
	        """
	        self.data = pd.read_csv(csv_file)
	        self.pose_folder = pose_folder
	        self.tokenizer = tokenizer
	        self.is_train = is_train  # augmentations run only during training

	    def __len__(self):
	        """Return the number of annotated samples."""
	        return len(self.data)

	    @staticmethod
	    def _fit_to_length(frames, target_len):
	        """Zero-pad or uniformly subsample a (T, D) array to ``target_len`` rows."""
	        n_frames = frames.shape[0]
	        if n_frames < target_len:
	            padding = np.zeros((target_len - n_frames, frames.shape[1]))
	            return np.concatenate([frames, padding], axis=0)
	        picks = np.linspace(0, n_frames - 1, target_len).astype(int)
	        return frames[picks]

	    def load_pose_data_and_mask(self, filename):
	        """Read one pose file and keep only the selected keypoints.

	        Returns:
	            ``(pose_data, confidence)`` with axis 1 of the stored arrays
	            squeezed away and the keypoint axis reduced to
	            ``selected_keypoint_indices``.
	        """
	        pose_path = os.path.join(self.pose_folder, filename)
	        with open(pose_path, "rb") as f:
	            pose = Pose.read(f.read())

	        keypoints = np.array(selected_keypoint_indices)
	        pose_data = np.squeeze(pose.body.data, axis=1)[:, keypoints, :]
	        confidence = np.squeeze(pose.body.confidence, axis=1)[:, keypoints]

	        return pose_data, confidence

	    def apply_augmentations(self, pose_data, confidence):
	        """Randomly warp, mirror and jitter a pose sequence.

	        The inputs are never modified; augmented copies are returned.

	        Args:
	            pose_data: Array indexed as ``[frame, keypoint, coordinate]``.
	            confidence: Array indexed as ``[frame, keypoint]``.

	        Returns:
	            ``(pose_data, confidence)`` after augmentation.
	        """
	        n_frames = pose_data.shape[0]

	        # Temporal warp: resample frame indices with small noise (only for
	        # clips longer than the target length, with probability 0.5).
	        if n_frames > TARGET_NUM_FRAMES and np.random.rand() < 0.5:
	            indices = np.linspace(0, n_frames - 1, TARGET_NUM_FRAMES)
	            jitter = np.random.uniform(-0.5, 0.5, size=indices.shape)
	            indices = np.clip(indices + jitter, 0, n_frames - 1).astype(int)
	            pose_data = pose_data[indices]
	            confidence = confidence[indices]

	        # The steps below edit the array in place, so work on a private copy
	        # to leave the caller's data untouched.
	        pose_data = pose_data.copy()

	        # Mirror: negate the x coordinate (probability 0.3).
	        # NOTE(review): this flips about x = 0 and does not swap left/right
	        # hand channels — confirm that suits the coordinate system in use.
	        if np.random.rand() < 0.3:
	            pose_data[..., 0] *= -1

	        # Jitter: small Gaussian noise on every coordinate (probability 0.3).
	        if np.random.rand() < 0.3:
	            pose_data += np.random.normal(0, 0.02, pose_data.shape)

	        return pose_data, confidence

	    def __getitem__(self, idx):
	        """Build one training sample.

	        Returns:
	            A 5-tuple ``(input_ids, attention_mask, pose, mask, text)``.
	            ``pose`` is the normalized ``(TARGET_NUM_FRAMES, K*3)`` float
	            tensor, ``mask`` has the same shape with 1.0 where the keypoint
	            confidence exceeds 0.5 and 0.0 elsewhere (including padding), and
	            ``text`` is the raw ``text`` value of the row.
	        """
	        row = self.data.iloc[idx]
	        filename = row["filename"]
	        text = row["text"]

	        encoded = self.tokenizer(
	            text, padding="max_length", truncation=True,
	            max_length=MAX_TEXT_LEN, return_tensors="pt"
	        )

	        pose_data, confidence = self.load_pose_data_and_mask(filename)

	        if self.is_train:
	            pose_data, confidence = self.apply_augmentations(pose_data, confidence)

	        # Flatten to (T, K*3); the confidence is repeated for x, y and z.
	        pose_flat = pose_data.reshape(pose_data.shape[0], -1)
	        mask_flat = (np.repeat(confidence, 3, axis=1) > 0.5).astype(np.float32)

	        # Pad with zeros or subsample uniformly to the fixed sequence length.
	        padded_pose = self._fit_to_length(pose_flat, TARGET_NUM_FRAMES)
	        padded_mask = self._fit_to_length(mask_flat, TARGET_NUM_FRAMES)

	        # Normalize with the dataset-wide statistics.
	        normalized_pose = (padded_pose - GLOBAL_MEAN) / GLOBAL_STD

	        return (
	            encoded.input_ids.squeeze(0),
	            encoded.attention_mask.squeeze(0),
	            torch.tensor(normalized_pose).float(),
	            torch.tensor(padded_mask).float(),
	            text
	        )


	def collate_fn(batch):
	    """Merge a list of dataset samples into one batch.

	    Each sample is a 5-tuple ``(input_ids, attention_mask, pose, mask, text)``.
	    The four tensor fields are stacked along a new leading batch dimension;
	    the texts are gathered into a plain list.

	    Returns:
	        ``(input_ids, attention_masks, poses, masks, texts)``.
	    """
	    token_ids, attention, pose_seqs, valid_masks, texts = zip(*batch)
	    stacked = [
	        torch.stack(column)
	        for column in (token_ids, attention, pose_seqs, valid_masks)
	    ]
	    return (*stacked, list(texts))