import os
import argparse
from collections import defaultdict
import logging

import numpy as np
import torch
from torch import nn
from scipy.io import loadmat

from configs.default import get_cfg_defaults


def _reset_parameters(model):
    """Apply Xavier-uniform initialization to every multi-dimensional parameter.

    Parameters with a single dimension or fewer are left untouched.

    Args:
        model: object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
    """
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)


def get_video_style(video_name, style_type):
    """Derive a style label from an underscore-separated video name.

    The first four ``_``-separated fields of ``video_name`` are read as
    person id, direction, emotion and level; any further fields are ignored.

    Args:
        video_name (str): name with at least four ``_``-separated fields.
        style_type (str): ``"id_dir_emo_level"`` (first four fields re-joined
            with ``_``) or ``"emotion"`` (third field only).

    Returns:
        str: the style label.

    Raises:
        ValueError: if ``style_type`` is unknown, or (from tuple unpacking)
            if ``video_name`` has fewer than four fields.
    """
    person_id, direction, emotion, level, *_ = video_name.split("_")
    if style_type == "id_dir_emo_level":
        return "_".join([person_id, direction, emotion, level])
    if style_type == "emotion":
        return emotion
    raise ValueError("Unknown style type")


def get_style_video_lists(video_list, style_type):
    """Group video names by their style label.

    Args:
        video_list (Iterable[str]): video names accepted by ``get_video_style``.
        style_type (str): style type forwarded to ``get_video_style``.

    Returns:
        defaultdict[str, list]: style label -> video names, in input order.
    """
    style2video_list = defaultdict(list)
    for video in video_list:
        style2video_list[get_video_style(video, style_type)].append(video)
    return style2video_list


def _load_face3d_exp(path):
    """Load per-frame expression coefficients from a ``mat`` or ``txt`` file.

    Returns a 2-D ``numpy.ndarray`` with one row per frame.

    Raises:
        ValueError: if ``path`` ends in neither ``mat`` nor ``txt``.
    """
    if path.endswith("mat"):
        # Columns 80:144 of "coeff" are the expression 3DMM range.
        return loadmat(path)["coeff"][:, 80:144]
    if path.endswith("txt"):
        # ndmin=2 keeps a single-frame file as (1, D) instead of collapsing
        # it to (D,), which would make shape[0] the feature dimension.
        return np.loadtxt(path, ndmin=2)
    raise ValueError("Invalid 3DMM file extension")


def _resolve_clip_start(length, clip_num_frames, start_idx):
    """Turn a ``start_idx`` spec ("random", "middle" or an integer) into an index.

    Raises:
        ValueError: if the spec is not recognised, or if "random"/"middle" is
            requested for a sequence shorter than ``clip_num_frames``.
    """
    if isinstance(start_idx, str):
        if start_idx not in ("random", "middle"):
            raise ValueError(f"Invalid start_idx {start_idx}")
        if length < clip_num_frames:
            # Without this guard "middle" produces a negative index (silently
            # slicing from the end) and "random" fails inside numpy with an
            # unrelated-looking message.
            raise ValueError(
                f"Cannot take a {clip_num_frames}-frame clip from a sequence "
                f"of {length} frames"
            )
        if start_idx == "random":
            return np.random.randint(low=0, high=length - clip_num_frames + 1)
        return (length - clip_num_frames + 1) // 2
    # Accept NumPy integer scalars as well as built-in ints.
    if isinstance(start_idx, (int, np.integer)):
        return int(start_idx)
    raise ValueError(f"Invalid start_idx {start_idx}")


def get_face3d_clip(video_name, video_root_dir, num_frames, start_idx, dtype=torch.float32):
    """Load a fixed-length clip of expression coefficients for one video.

    Args:
        video_name (str): file name of the 3DMM file (``mat`` or ``txt``),
            relative to ``video_root_dir``.
        video_root_dir (str): directory containing the file.
        num_frames (int): number of frames in the clip.
        start_idx (str | int): ``"random"`` for a uniformly sampled start,
            ``"middle"`` for a centred clip, or an explicit integer start
            frame. An explicit index is not range-checked, so a start too
            close to the end yields a shorter clip.
        dtype (torch.dtype, optional): dtype of the returned tensor.
            Defaults to torch.float32.

    Returns:
        torch.Tensor: rows ``start : start + num_frames`` of the loaded
        coefficients.

    Raises:
        ValueError: on an unsupported file extension, an invalid
            ``start_idx``, or when ``"random"``/``"middle"`` is requested
            for a video with fewer than ``num_frames`` frames.
    """
    video_path = os.path.join(video_root_dir, video_name)
    face3d_exp = _load_face3d_exp(video_path)

    clip_start_idx = _resolve_clip_start(face3d_exp.shape[0], num_frames, start_idx)
    face3d_clip = face3d_exp[clip_start_idx : clip_start_idx + num_frames]
    return torch.tensor(face3d_clip, dtype=dtype)


def get_video_style_clip(video_path, style_max_len, start_idx="random", dtype=torch.float32):
    """Load a style clip of exactly ``style_max_len`` frames, zero-padding if short.

    Args:
        video_path (str): path to the 3DMM file (``mat`` or ``txt``).
        style_max_len (int): number of frames in the returned clip.
        start_idx (str | int, optional): ``"random"``, ``"middle"`` or an
            explicit integer start frame. Only consulted when the video has
            at least ``style_max_len`` frames. Defaults to ``"random"``.
        dtype (torch.dtype, optional): dtype of the returned clip.
            Defaults to torch.float32.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: ``(face3d_clip, pad_mask)``.
        ``pad_mask`` is a boolean tensor of length ``style_max_len`` that is
        ``True`` on zero-padded frames and ``False`` on real frames.

    Raises:
        ValueError: on an unsupported file extension or an invalid
            ``start_idx``.
    """
    face3d_exp = torch.tensor(_load_face3d_exp(video_path), dtype=dtype)
    length = face3d_exp.shape[0]

    if length >= style_max_len:
        clip_start_idx = _resolve_clip_start(length, style_max_len, start_idx)
        face3d_clip = face3d_exp[clip_start_idx : clip_start_idx + style_max_len]
        pad_mask = torch.zeros(style_max_len, dtype=torch.bool)
    else:
        # The padding must use the requested dtype; otherwise torch.cat
        # promotes (or rejects) the mix and the result dtype is wrong.
        padding = torch.zeros(style_max_len - length, face3d_exp.shape[1], dtype=dtype)
        face3d_clip = torch.cat((face3d_exp, padding), dim=0)
        pad_mask = torch.arange(style_max_len) >= length

    return face3d_clip, pad_mask


def get_audio_name_from_video(video_name):
    """Map a video file name to its audio sequence file name.

    The last four characters of ``video_name`` are dropped and ``_seq.json``
    is appended, i.e. a dot plus three-character extension is assumed.
    """
    return video_name[:-4] + "_seq.json"


def get_audio_window(audio, win_size, pad_value=31):
    """Build a symmetric context window around every audio frame.

    Args:
        audio (Sequence | numpy.ndarray): per-frame values, shape (N,).
        win_size (int): number of neighbouring frames taken on each side.
        pad_value (optional): value used for window positions that fall
            before the first or after the last frame. Defaults to 31
            (NOTE(review): presumably the padding/silence id - confirm).

    Returns:
        audio_wins (numpy.ndarray): (N, 2 * win_size + 1); row ``i`` holds
        frames ``i - win_size .. i + win_size``.
    """
    num_frames = len(audio)
    ph_frames = [
        [
            audio[i] if 0 <= i < num_frames else pad_value
            for i in range(center - win_size, center + win_size + 1)
        ]
        for center in range(num_frames)
    ]
    return np.array(ph_frames)


def setup_config():
    """Parse command-line arguments and build the frozen config.

    The config starts from ``get_cfg_defaults()``, is overridden by
    ``--config_file`` when one is given, then by the trailing ``opts``.

    Returns:
        tuple: ``(args, cfg)`` - the parsed ``argparse.Namespace`` and the
        frozen config object.
    """
    parser = argparse.ArgumentParser(description="voice2pose main program")
    parser.add_argument("--config_file", default="", metavar="FILE", help="path to config file")
    parser.add_argument("--resume_from", type=str, default=None, help="the checkpoint to resume from")
    parser.add_argument("--test_only", action="store_true", help="perform testing and evaluation only")
    parser.add_argument("--demo_input", type=str, default=None, help="path to input for demo")
    parser.add_argument("--checkpoint", type=str, default=None, help="the checkpoint to test with")
    parser.add_argument("--tag", type=str, default="", help="tag for the experiment")
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        help="local rank for DistributedDataParallel",
    )
    parser.add_argument(
        "--master_port",
        type=str,
        default="12345",
    )
    args = parser.parse_args()

    cfg = get_cfg_defaults()
    # --config_file defaults to ""; only merge when a file was actually given
    # so the defaults (plus opts) remain usable on their own.
    if args.config_file:
        cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    return args, cfg


def setup_logger(base_path, exp_name):
    """Attach a file handler and a console handler to the root logger.

    The root logger level is set to INFO and the log file is written to
    ``{base_path}/{exp_name}.log``.

    Args:
        base_path (str): directory for the log file.
        exp_name (str): log file name without the ``.log`` suffix.
    """
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)

    log_formatter = logging.Formatter("%(asctime)s [%(levelname)-0.5s] %(message)s")

    log_path = "{0}/{1}.log".format(base_path, exp_name)
    file_handler = logging.FileHandler(log_path)
    file_handler.setFormatter(log_formatter)
    root_logger.addHandler(file_handler)

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(log_formatter)
    root_logger.addHandler(console_handler)

    # NOTE(review): this restricts whichever handler is first on the root
    # logger to ERROR. That is the file handler above only when the root
    # logger had no handlers before this call - confirm the intent.
    root_logger.handlers[0].setLevel(logging.ERROR)

    logging.info("log path: %s", log_path)


def get_pose_params(mat_path):
    """Get pose parameters from mat file

    Args:
        mat_path (str): path of mat file

    Returns:
        pose_params (numpy.ndarray): shape (L_video, 9), angle, translation, crop parameters
    """
    mat_dict = loadmat(mat_path)

    np_3dmm = mat_dict["coeff"]
    angles = np_3dmm[:, 224:227]
    translations = np_3dmm[:, 254:257]

    # The crop parameters are the last three columns of "transform_params".
    crop = mat_dict["transform_params"][:, -3:]

    return np.concatenate((angles, translations, crop), axis=1)


def obtain_seq_index(index, num_frames, radius):
    """Return the frame indices in a window around ``index``, clamped to range.

    Args:
        index (int): centre frame.
        num_frames (int): total number of frames; indices are clamped to
            ``[0, num_frames - 1]``.
        radius (int): number of frames taken on each side of ``index``.

    Returns:
        list[int]: ``2 * radius + 1`` indices; out-of-range positions repeat
        the first or last valid frame.
    """
    last = num_frames - 1
    return [min(max(item, 0), last) for item in range(index - radius, index + radius + 1)]