diff --git a/AR/__init__.py b/AR/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/AR/data/__init__.py b/AR/data/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/AR/data/bucket_sampler.py b/AR/data/bucket_sampler.py deleted file mode 100644 index 45f91d8eb4eaee7867fee29e89971b843fa23f38..0000000000000000000000000000000000000000 --- a/AR/data/bucket_sampler.py +++ /dev/null @@ -1,163 +0,0 @@ -# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/bucket_sampler.py -# reference: https://github.com/lifeiteng/vall-e -import itertools -import math -import random -from random import shuffle -from typing import Iterator -from typing import Optional -from typing import TypeVar - -import torch -import torch.distributed as dist -from torch.utils.data import Dataset -from torch.utils.data import Sampler - -__all__ = [ - "DistributedBucketSampler", -] - -T_co = TypeVar("T_co", covariant=True) - - -class DistributedBucketSampler(Sampler[T_co]): - r""" - sort the dataset wrt. input length - divide samples into buckets - sort within buckets - divide buckets into batches - sort batches - """ - - def __init__( - self, - dataset: Dataset, - num_replicas: Optional[int] = None, - rank: Optional[int] = None, - shuffle: bool = True, - seed: int = 0, - drop_last: bool = False, - batch_size: int = 32, - ) -> None: - if num_replicas is None: - if not dist.is_available(): - raise RuntimeError("Requires distributed package to be available") - num_replicas = dist.get_world_size() if torch.cuda.is_available() else 1 - if rank is None: - if not dist.is_available(): - raise RuntimeError("Requires distributed package to be available") - rank = dist.get_rank() if torch.cuda.is_available() else 0 - if torch.cuda.is_available(): - torch.cuda.set_device(rank) - if rank >= num_replicas or rank < 0: - raise ValueError( - "Invalid rank {}, rank should be in the interval" - " [0, {}]".format(rank, num_replicas - 1) - ) - self.dataset = dataset - self.num_replicas = num_replicas - self.rank = rank - self.epoch = 0 - self.drop_last = drop_last - # If the dataset length is evenly divisible by # of replicas, then there - # is no need to drop any data, since the dataset will be split equally. - if ( - self.drop_last and len(self.dataset) % self.num_replicas != 0 - ): # type: ignore[arg-type] - # Split to nearest available length that is evenly divisible. - # This is to ensure each rank receives the same amount of data when - # using this Sampler. 
- self.num_samples = math.ceil( - (len(self.dataset) - self.num_replicas) - / self.num_replicas # type: ignore[arg-type] - ) - else: - self.num_samples = math.ceil( - len(self.dataset) / self.num_replicas - ) # type: ignore[arg-type] - self.total_size = self.num_samples * self.num_replicas - self.shuffle = shuffle - self.seed = seed - self.batch_size = batch_size - self.id_with_length = self._get_sample_lengths() - self.id_buckets = self.make_buckets(bucket_width=2.0) - - def _get_sample_lengths(self): - id_with_lengths = [] - for i in range(len(self.dataset)): - id_with_lengths.append((i, self.dataset.get_sample_length(i))) - id_with_lengths.sort(key=lambda x: x[1]) - return id_with_lengths - - def make_buckets(self, bucket_width: float = 2.0): - buckets = [] - cur = [] - max_sec = bucket_width - for id, sec in self.id_with_length: - if sec < max_sec: - cur.append(id) - else: - buckets.append(cur) - cur = [id] - max_sec += bucket_width - if len(cur) > 0: - buckets.append(cur) - return buckets - - def __iter__(self) -> Iterator[T_co]: - if self.shuffle: - # deterministically shuffle based on epoch and seed - g = torch.Generator() - g.manual_seed(self.seed + self.epoch) - random.seed(self.epoch + self.seed) - shuffled_bucket = [] - for buc in self.id_buckets: - buc_copy = buc.copy() - shuffle(buc_copy) - shuffled_bucket.append(buc_copy) - grouped_batch_size = self.batch_size * self.num_replicas - shuffled_bucket = list(itertools.chain(*shuffled_bucket)) - n_batch = int(math.ceil(len(shuffled_bucket) / grouped_batch_size)) - batches = [ - shuffled_bucket[b * grouped_batch_size : (b + 1) * grouped_batch_size] - for b in range(n_batch) - ] - shuffle(batches) - indices = list(itertools.chain(*batches)) - else: - # type: ignore[arg-type] - indices = list(range(len(self.dataset))) - - if not self.drop_last: - # add extra samples to make it evenly divisible - padding_size = self.total_size - len(indices) - if padding_size <= len(indices): - indices += indices[:padding_size] - else: - indices += (indices * math.ceil(padding_size / len(indices)))[ - :padding_size - ] - else: - # remove tail of data to make it evenly divisible. - indices = indices[: self.total_size] - assert len(indices) == self.total_size - - # subsample - indices = indices[self.rank : self.total_size : self.num_replicas] - assert len(indices) == self.num_samples - - return iter(indices) - - def __len__(self) -> int: - return self.num_samples - - def set_epoch(self, epoch: int) -> None: - r""" - Sets the epoch for this sampler. When :attr:`shuffle=True`, this ensures all replicas - use a different random ordering for each epoch. Otherwise, the next iteration of this - sampler will yield the same ordering. - - Args: - epoch (int): Epoch number. 
- """ - self.epoch = epoch diff --git a/AR/data/data_module.py b/AR/data/data_module.py deleted file mode 100644 index cb947959b3f2fdb0951e6a6383399a9fb3826536..0000000000000000000000000000000000000000 --- a/AR/data/data_module.py +++ /dev/null @@ -1,76 +0,0 @@ -# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py -# reference: https://github.com/lifeiteng/vall-e -from pytorch_lightning import LightningDataModule -from AR.data.bucket_sampler import DistributedBucketSampler -from AR.data.dataset import Text2SemanticDataset -from torch.utils.data import DataLoader - - -class Text2SemanticDataModule(LightningDataModule): - def __init__( - self, - config, - train_semantic_path, - train_phoneme_path, - dev_semantic_path=None, - dev_phoneme_path=None, - ): - super().__init__() - self.config = config - self.train_semantic_path = train_semantic_path - self.train_phoneme_path = train_phoneme_path - self.dev_semantic_path = dev_semantic_path - self.dev_phoneme_path = dev_phoneme_path - self.num_workers = self.config["data"]["num_workers"] - - def prepare_data(self): - pass - - def setup(self, stage=None, output_logs=False): - self._train_dataset = Text2SemanticDataset( - phoneme_path=self.train_phoneme_path, - semantic_path=self.train_semantic_path, - max_sec=self.config["data"]["max_sec"], - pad_val=self.config["data"]["pad_val"], - ) - self._dev_dataset = self._train_dataset - # self._dev_dataset = Text2SemanticDataset( - # phoneme_path=self.dev_phoneme_path, - # semantic_path=self.dev_semantic_path, - # max_sample=self.config['data']['max_eval_sample'], - # max_sec=self.config['data']['max_sec'], - # pad_val=self.config['data']['pad_val']) - - def train_dataloader(self): - batch_size=self.config["train"]["batch_size"]//2 if self.config["train"].get("if_dpo",False)==True else self.config["train"]["batch_size"] - batch_size = max(min(batch_size,len(self._train_dataset)//4),1)#防止不保存 - sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size) - return DataLoader( - self._train_dataset, - batch_size=batch_size, - sampler=sampler, - collate_fn=self._train_dataset.collate, - num_workers=self.num_workers, - persistent_workers=True, - prefetch_factor=16, - ) - - def val_dataloader(self): - return DataLoader( - self._dev_dataset, - batch_size=1, - shuffle=False, - collate_fn=self._train_dataset.collate, - num_workers=max(self.num_workers, 12), - persistent_workers=True, - prefetch_factor=16, - ) - - # 这个会使用到嘛? 
- def test_dataloader(self): - return DataLoader( - self._dev_dataset, - batch_size=1, - shuffle=False, - collate_fn=self._train_dataset.collate, - ) diff --git a/AR/data/dataset.py b/AR/data/dataset.py deleted file mode 100644 index 9d2dfe8a2a6b48bc4703f547a5be77f26ed90a8d..0000000000000000000000000000000000000000 --- a/AR/data/dataset.py +++ /dev/null @@ -1,323 +0,0 @@ -# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/dataset.py -# reference: https://github.com/lifeiteng/vall-e -import pdb -import sys - -# sys.path.append("/data/docker/liujing04/gpt-vits/mq-vits-s1bert_no_bert") -import traceback, os -from typing import Dict -from typing import List - -import numpy as np -import pandas as pd -import torch, json -from torch.utils.data import DataLoader -from torch.utils.data import Dataset -from transformers import AutoTokenizer - -version = os.environ.get('version',None) - -from text import cleaned_text_to_sequence - -# from config import exp_dir - - -def batch_sequences(sequences: List[np.array], axis: int = 0, pad_value: int = 0): - seq = sequences[0] - ndim = seq.ndim - if axis < 0: - axis += ndim - dtype = seq.dtype - pad_value = dtype.type(pad_value) - seq_lengths = [seq.shape[axis] for seq in sequences] - max_length = np.max(seq_lengths) - - padded_sequences = [] - for seq, length in zip(sequences, seq_lengths): - padding = ( - [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * (ndim - axis - 1) - ) - padded_seq = np.pad(seq, padding, mode="constant", constant_values=pad_value) - padded_sequences.append(padded_seq) - batch = np.stack(padded_sequences) - return batch - - -class Text2SemanticDataset(Dataset): - """dataset class for text tokens to semantic model training.""" - - def __init__( - self, - phoneme_path: str, - semantic_path: str, - max_sample: int = None, - max_sec: int = 100, - pad_val: int = 1024, - # min value of phoneme/sec - min_ps_ratio: int = 3, - # max value of phoneme/sec - max_ps_ratio: int = 25, - ) -> None: - super().__init__() - - self.semantic_data = pd.read_csv( - semantic_path, delimiter="\t", encoding="utf-8" - ) - # get dict - self.path2 = phoneme_path # "%s/2-name2text.txt"%exp_dir#phoneme_path - self.path3 = "%s/3-bert" % ( - os.path.dirname(phoneme_path) - ) # "%s/3-bert"%exp_dir#bert_dir - self.path6 = semantic_path # "%s/6-name2semantic.tsv"%exp_dir#semantic_path - assert os.path.exists(self.path2) - assert os.path.exists(self.path6) - self.phoneme_data = {} - with open(self.path2, "r", encoding="utf8") as f: - lines = f.read().strip("\n").split("\n") - - for line in lines: - tmp = line.split("\t") - if len(tmp) != 4: - continue - self.phoneme_data[tmp[0]] = [tmp[1], tmp[2], tmp[3]] - - # self.phoneme_data = np.load(phoneme_path, allow_pickle=True).item() - # pad for semantic tokens - self.PAD: int = pad_val - # self.hz = 25 - # with open("/data/docker/liujing04/gpt-vits/mq-vits-s1bert_no_bert/configs/s2.json", "r") as f:data = f.read() - # data=json.loads(data)["model"]["semantic_frame_rate"]#50hz - # self.hz=int(data[:-2])# - self.hz = int(os.environ.get("hz", "25hz")[:-2]) - - # max seconds of semantic token - self.max_sec = max_sec - self.min_ps_ratio = min_ps_ratio - self.max_ps_ratio = max_ps_ratio - - if max_sample is not None: - self.semantic_data = self.semantic_data[:max_sample] - - # {idx: (semantic, phoneme)} - # semantic list, phoneme list - self.semantic_phoneme = [] - self.item_names = [] - - self.inited = False - - if not self.inited: - # 调用初始化函数 - self.init_batch() - self.inited = 
True - del self.semantic_data - del self.phoneme_data - # self.tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext-large") - # self.tokenizer = AutoTokenizer.from_pretrained("/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large") - - def init_batch(self): - semantic_data_len = len(self.semantic_data) - phoneme_data_len = len(self.phoneme_data.keys()) - print("semantic_data_len:", semantic_data_len) - print("phoneme_data_len:", phoneme_data_len) - print(self.semantic_data) - idx = 0 - num_not_in = 0 - num_deleted_bigger = 0 - num_deleted_ps = 0 - for i in range(semantic_data_len): - # 先依次遍历 - # get str - item_name = self.semantic_data.iloc[i,0] - # print(self.phoneme_data) - try: - phoneme, word2ph, text = self.phoneme_data[item_name] - except Exception: - traceback.print_exc() - # print(f"{item_name} not in self.phoneme_data !") - num_not_in += 1 - continue - - semantic_str = self.semantic_data.iloc[i,1] - # get token list - semantic_ids = [int(idx) for idx in semantic_str.split(" ")] - # (T), 是否需要变成 (1, T) -> 不需要,因为需要求 len - # 过滤掉太长的样本 - if ( - len(semantic_ids) > self.max_sec * self.hz - ): #########1###根据token个数推测总时长过滤时长60s(config里)#40*25=1k - num_deleted_bigger += 1 - continue - # (T, ), 这个速度不会很慢,所以可以在一开始就处理,无需在 __getitem__ 里面单个处理#### - phoneme = phoneme.split(" ") - - try: - phoneme_ids = cleaned_text_to_sequence(phoneme, version) - except: - traceback.print_exc() - # print(f"{item_name} not in self.phoneme_data !") - num_not_in += 1 - continue - # if len(phoneme_ids) >400:###########2:改为恒定限制为semantic/2.5就行 - if ( - len(phoneme_ids) > self.max_sec * self.hz / 2.5 - ): ###########2:改为恒定限制为semantic/2.5就行 - num_deleted_ps += 1 - continue - # if len(semantic_ids) > 1000:###########3 - # num_deleted_bigger += 1 - # continue - - ps_ratio = len(phoneme_ids) / (len(semantic_ids) / self.hz) - - if ( - ps_ratio > self.max_ps_ratio or ps_ratio < self.min_ps_ratio - ): ##########4#3~25#每秒多少个phone - num_deleted_ps += 1 - # print(item_name) - continue - - self.semantic_phoneme.append((semantic_ids, phoneme_ids)) - idx += 1 - self.item_names.append(item_name) - - min_num = 100 # 20直接不补#30补了也不存ckpt - leng = len(self.semantic_phoneme) - if leng < min_num: - tmp1 = self.semantic_phoneme - tmp2 = self.item_names - self.semantic_phoneme = [] - self.item_names = [] - for _ in range(max(2, int(min_num / leng))): - self.semantic_phoneme += tmp1 - self.item_names += tmp2 - if num_not_in > 0: - print(f"there are {num_not_in} semantic datas not in phoneme datas") - if num_deleted_bigger > 0: - print( - f"deleted {num_deleted_bigger} audios who's duration are bigger than {self.max_sec} seconds" - ) - if num_deleted_ps > 0: - # 4702 for LibriTTS, LirbriTTS 是标注数据, 是否需要筛?=> 需要,有值为 100 的极端值 - print( - f"deleted {num_deleted_ps} audios who's phoneme/sec are bigger than {self.max_ps_ratio} or smaller than {self.min_ps_ratio}" - ) - """ - there are 31 semantic datas not in phoneme datas - deleted 34 audios who's duration are bigger than 54 seconds - deleted 3190 audios who's phoneme/sec are bigger than 25 or smaller than 3 - dataset.__len__(): 366463 - - """ - # 345410 for LibriTTS - print("dataset.__len__():", self.__len__()) - - def __get_item_names__(self) -> List[str]: - return self.item_names - - def __len__(self) -> int: - return len(self.semantic_phoneme) - - def __getitem__(self, idx: int) -> Dict: - semantic_ids, phoneme_ids = self.semantic_phoneme[idx] - item_name = self.item_names[idx] - phoneme_ids_len = len(phoneme_ids) - # semantic tokens 
target - semantic_ids_len = len(semantic_ids) - - flag = 0 - path_bert = "%s/%s.pt" % (self.path3, item_name) - if os.path.exists(path_bert) == True: - bert_feature = torch.load(path_bert, map_location="cpu") - else: - flag = 1 - if flag == 1: - # bert_feature=torch.zeros_like(phoneme_ids,dtype=torch.float32) - bert_feature = None - else: - assert bert_feature.shape[-1] == len(phoneme_ids) - return { - "idx": idx, - "phoneme_ids": phoneme_ids, - "phoneme_ids_len": phoneme_ids_len, - "semantic_ids": semantic_ids, - "semantic_ids_len": semantic_ids_len, - "bert_feature": bert_feature, - } - - def get_sample_length(self, idx: int): - semantic_ids = self.semantic_phoneme[idx][0] - sec = 1.0 * len(semantic_ids) / self.hz - return sec - - def collate(self, examples: List[Dict]) -> Dict: - sample_index: List[int] = [] - phoneme_ids: List[torch.Tensor] = [] - phoneme_ids_lens: List[int] = [] - semantic_ids: List[torch.Tensor] = [] - semantic_ids_lens: List[int] = [] - # return - - for item in examples: - sample_index.append(item["idx"]) - phoneme_ids.append(np.array(item["phoneme_ids"], dtype=np.int64)) - semantic_ids.append(np.array(item["semantic_ids"], dtype=np.int64)) - phoneme_ids_lens.append(item["phoneme_ids_len"]) - semantic_ids_lens.append(item["semantic_ids_len"]) - - # pad 0 - phoneme_ids = batch_sequences(phoneme_ids) - semantic_ids = batch_sequences(semantic_ids, pad_value=self.PAD) - - # # convert each batch to torch.tensor - phoneme_ids = torch.tensor(phoneme_ids) - semantic_ids = torch.tensor(semantic_ids) - phoneme_ids_lens = torch.tensor(phoneme_ids_lens) - semantic_ids_lens = torch.tensor(semantic_ids_lens) - bert_padded = torch.FloatTensor(len(examples), 1024, max(phoneme_ids_lens)) - bert_padded.zero_() - - for idx, item in enumerate(examples): - bert = item["bert_feature"] - if bert != None: - bert_padded[idx, :, : bert.shape[-1]] = bert - - return { - # List[int] - "ids": sample_index, - # torch.Tensor (B, max_phoneme_length) - "phoneme_ids": phoneme_ids, - # torch.Tensor (B) - "phoneme_ids_len": phoneme_ids_lens, - # torch.Tensor (B, max_semantic_ids_length) - "semantic_ids": semantic_ids, - # torch.Tensor (B) - "semantic_ids_len": semantic_ids_lens, - # torch.Tensor (B, 1024, max_phoneme_length) - "bert_feature": bert_padded, - } - - -if __name__ == "__main__": - root_dir = "/data/docker/liujing04/gpt-vits/prepare/dump_mix/" - dataset = Text2SemanticDataset( - phoneme_path=root_dir + "phoneme_train.npy", - semantic_path=root_dir + "semantic_train.tsv", - ) - - batch_size = 12 - dataloader = DataLoader( - dataset, batch_size=batch_size, collate_fn=dataset.collate, shuffle=False - ) - for i, batch in enumerate(dataloader): - if i % 1000 == 0: - print(i) - # if i == 0: - # print('batch["ids"]:', batch["ids"]) - # print('batch["phoneme_ids"]:', batch["phoneme_ids"], - # batch["phoneme_ids"].shape) - # print('batch["phoneme_ids_len"]:', batch["phoneme_ids_len"], - # batch["phoneme_ids_len"].shape) - # print('batch["semantic_ids"]:', batch["semantic_ids"], - # batch["semantic_ids"].shape) - # print('batch["semantic_ids_len"]:', batch["semantic_ids_len"], - # batch["semantic_ids_len"].shape) diff --git a/AR/models/__init__.py b/AR/models/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/AR/modules/embedding.py b/AR/models/embedding.py similarity index 100% rename from AR/modules/embedding.py rename to AR/models/embedding.py diff --git a/AR/models/t2s_lightning_module.py 
b/AR/models/t2s_lightning_module.py deleted file mode 100644 index 2dd3f392893f1ea08a6e848f2ff2d9be1a425f15..0000000000000000000000000000000000000000 --- a/AR/models/t2s_lightning_module.py +++ /dev/null @@ -1,141 +0,0 @@ -# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py -# reference: https://github.com/lifeiteng/vall-e -import os, sys - -now_dir = os.getcwd() -sys.path.append(now_dir) -from typing import Dict - -import torch -from pytorch_lightning import LightningModule -from AR.models.t2s_model import Text2SemanticDecoder -from AR.modules.lr_schedulers import WarmupCosineLRSchedule -from AR.modules.optim import ScaledAdam - -class Text2SemanticLightningModule(LightningModule): - def __init__(self, config, output_dir, is_train=True): - super().__init__() - self.config = config - self.top_k = 3 - self.model = Text2SemanticDecoder(config=config, top_k=self.top_k) - pretrained_s1 = config.get("pretrained_s1") - if pretrained_s1 and is_train: - # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) - print( - self.load_state_dict( - torch.load(pretrained_s1, map_location="cpu")["weight"] - ) - ) - if is_train: - self.automatic_optimization = False - self.save_hyperparameters() - self.eval_dir = output_dir / "eval" - self.eval_dir.mkdir(parents=True, exist_ok=True) - - def training_step(self, batch: Dict, batch_idx: int): - opt = self.optimizers() - scheduler = self.lr_schedulers() - forward=self.model.forward if self.config["train"].get("if_dpo",False)==True else self.model.forward_old - loss, acc = forward( - batch["phoneme_ids"], - batch["phoneme_ids_len"], - batch["semantic_ids"], - batch["semantic_ids_len"], - batch["bert_feature"], - ) - self.manual_backward(loss) - if batch_idx > 0 and batch_idx % 4 == 0: - opt.step() - opt.zero_grad() - scheduler.step() - - self.log( - "total_loss", - loss, - on_step=True, - on_epoch=True, - prog_bar=True, - sync_dist=True, - ) - self.log( - "lr", - scheduler.get_last_lr()[0], - on_epoch=True, - prog_bar=True, - sync_dist=True, - ) - self.log( - f"top_{self.top_k}_acc", - acc, - on_step=True, - on_epoch=True, - prog_bar=True, - sync_dist=True, - ) - - def validation_step(self, batch: Dict, batch_idx: int): - return - - # # get loss - # loss, acc = self.model.forward( - # batch['phoneme_ids'], batch['phoneme_ids_len'], - # batch['semantic_ids'], batch['semantic_ids_len'], - # batch['bert_feature'] - # ) - # - # self.log( - # "val_total_loss", - # loss, - # on_step=True, - # on_epoch=True, - # prog_bar=True, - # sync_dist=True) - # self.log( - # f"val_top_{self.top_k}_acc", - # acc, - # on_step=True, - # on_epoch=True, - # prog_bar=True, - # sync_dist=True) - # - # # get infer output - # semantic_len = batch['semantic_ids'].size(1) - # prompt_len = min(int(semantic_len * 0.5), 150) - # prompt = batch['semantic_ids'][:, :prompt_len] - # pred_semantic = self.model.infer(batch['phoneme_ids'], - # batch['phoneme_ids_len'], prompt, - # batch['bert_feature'] - # ) - # save_name = f'semantic_toks_{batch_idx}.pt' - # save_path = os.path.join(self.eval_dir, save_name) - # torch.save(pred_semantic.detach().cpu(), save_path) - - def configure_optimizers(self): - model_parameters = self.model.parameters() - parameters_names = [] - parameters_names.append( - [name_param_pair[0] for name_param_pair in self.model.named_parameters()] - ) - lm_opt = ScaledAdam( - model_parameters, - lr=0.01, - betas=(0.9, 0.95), - clipping_scale=2.0, - 
parameters_names=parameters_names, - show_dominant_parameters=False, - clipping_update_period=1000, - ) - - return { - "optimizer": lm_opt, - "lr_scheduler": { - "scheduler": WarmupCosineLRSchedule( - lm_opt, - init_lr=self.config["optimizer"]["lr_init"], - peak_lr=self.config["optimizer"]["lr"], - end_lr=self.config["optimizer"]["lr_end"], - warmup_steps=self.config["optimizer"]["warmup_steps"], - total_steps=self.config["optimizer"]["decay_steps"], - ) - }, - } diff --git a/AR/models/t2s_lightning_module_onnx.py b/AR/models/t2s_lightning_module_onnx.py deleted file mode 100644 index 487edb015203ebd70f9f6d3475ef892e28c57927..0000000000000000000000000000000000000000 --- a/AR/models/t2s_lightning_module_onnx.py +++ /dev/null @@ -1,107 +0,0 @@ -# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py -# reference: https://github.com/lifeiteng/vall-e -import os, sys - -now_dir = os.getcwd() -sys.path.append(now_dir) -from typing import Dict - -import torch -from pytorch_lightning import LightningModule -from AR.models.t2s_model_onnx import Text2SemanticDecoder -from AR.modules.lr_schedulers import WarmupCosineLRSchedule -from AR.modules.optim import ScaledAdam - - -class Text2SemanticLightningModule(LightningModule): - def __init__(self, config, output_dir, is_train=True): - super().__init__() - self.config = config - self.top_k = 3 - self.model = Text2SemanticDecoder(config=config, top_k=self.top_k) - pretrained_s1 = config.get("pretrained_s1") - if pretrained_s1 and is_train: - # print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"])) - print( - self.load_state_dict( - torch.load(pretrained_s1, map_location="cpu")["weight"] - ) - ) - if is_train: - self.automatic_optimization = False - self.save_hyperparameters() - self.eval_dir = output_dir / "eval" - self.eval_dir.mkdir(parents=True, exist_ok=True) - - def training_step(self, batch: Dict, batch_idx: int): - opt = self.optimizers() - scheduler = self.lr_schedulers() - loss, acc = self.model.forward( - batch["phoneme_ids"], - batch["phoneme_ids_len"], - batch["semantic_ids"], - batch["semantic_ids_len"], - batch["bert_feature"], - ) - self.manual_backward(loss) - if batch_idx > 0 and batch_idx % 4 == 0: - opt.step() - opt.zero_grad() - scheduler.step() - - self.log( - "total_loss", - loss, - on_step=True, - on_epoch=True, - prog_bar=True, - sync_dist=True, - ) - self.log( - "lr", - scheduler.get_last_lr()[0], - on_epoch=True, - prog_bar=True, - sync_dist=True, - ) - self.log( - f"top_{self.top_k}_acc", - acc, - on_step=True, - on_epoch=True, - prog_bar=True, - sync_dist=True, - ) - - def validation_step(self, batch: Dict, batch_idx: int): - return - - def configure_optimizers(self): - model_parameters = self.model.parameters() - parameters_names = [] - parameters_names.append( - [name_param_pair[0] for name_param_pair in self.model.named_parameters()] - ) - lm_opt = ScaledAdam( - model_parameters, - lr=0.01, - betas=(0.9, 0.95), - clipping_scale=2.0, - parameters_names=parameters_names, - show_dominant_parameters=False, - clipping_update_period=1000, - ) - - return { - "optimizer": lm_opt, - "lr_scheduler": { - "scheduler": WarmupCosineLRSchedule( - lm_opt, - init_lr=self.config["optimizer"]["lr_init"], - peak_lr=self.config["optimizer"]["lr"], - end_lr=self.config["optimizer"]["lr_end"], - warmup_steps=self.config["optimizer"]["warmup_steps"], - total_steps=self.config["optimizer"]["decay_steps"], - ) - }, - } diff --git a/AR/models/t2s_model.py 
b/AR/models/t2s_model.py deleted file mode 100644 index cecfcbc13349aadf8836a936f84fb742ec1f24ee..0000000000000000000000000000000000000000 --- a/AR/models/t2s_model.py +++ /dev/null @@ -1,586 +0,0 @@ -# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py -# reference: https://github.com/lifeiteng/vall-e -import torch -import random -import numpy as np - -from tqdm import tqdm -from typing import List -from AR.models.utils import make_pad_mask -from AR.models.utils import ( - topk_sampling, - sample, - logits_to_probs, - multinomial_sample_one_no_sync, - dpo_loss, - make_reject_y, - get_batch_logps -) -from AR.modules.embedding import SinePositionalEmbedding -from AR.modules.embedding import TokenEmbedding -from AR.modules.transformer import LayerNorm -from AR.modules.transformer import TransformerEncoder -from AR.modules.transformer import TransformerEncoderLayer -from torch import nn -from torch.nn import functional as F -from torchmetrics.classification import MulticlassAccuracy - -default_config = { - "embedding_dim": 512, - "hidden_dim": 512, - "num_head": 8, - "num_layers": 12, - "num_codebook": 8, - "p_dropout": 0.0, - "vocab_size": 1024 + 1, - "phoneme_vocab_size": 512, - "EOS": 1024, -} - - -@torch.jit.script -class T2SMLP: - def __init__(self, w1, b1, w2, b2): - self.w1 = w1 - self.b1 = b1 - self.w2 = w2 - self.b2 = b2 - - def forward(self, x): - x = F.relu(F.linear(x, self.w1, self.b1)) - x = F.linear(x, self.w2, self.b2) - return x - - -@torch.jit.script -class T2SBlock: - def __init__( - self, - num_heads, - hidden_dim: int, - mlp: T2SMLP, - qkv_w, - qkv_b, - out_w, - out_b, - norm_w1, - norm_b1, - norm_eps1, - norm_w2, - norm_b2, - norm_eps2, - ): - self.num_heads = num_heads - self.mlp = mlp - self.hidden_dim: int = hidden_dim - self.qkv_w = qkv_w - self.qkv_b = qkv_b - self.out_w = out_w - self.out_b = out_b - self.norm_w1 = norm_w1 - self.norm_b1 = norm_b1 - self.norm_eps1 = norm_eps1 - self.norm_w2 = norm_w2 - self.norm_b2 = norm_b2 - self.norm_eps2 = norm_eps2 - - def process_prompt(self, x, attn_mask: torch.Tensor): - q, k, v = F.linear(x, self.qkv_w, self.qkv_b).chunk(3, dim=-1) - - batch_size = q.shape[0] - q_len = q.shape[1] - kv_len = k.shape[1] - - k_cache = k - v_cache = v - - q = q.view(batch_size, q_len, self.num_heads, -1).transpose(1, 2) - k = k_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) - v = v_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) - - attn = F.scaled_dot_product_attention(q, k, v, ~attn_mask) - - attn = attn.permute(2, 0, 1, 3).reshape(batch_size, -1, self.hidden_dim) - attn = F.linear(attn, self.out_w, self.out_b) - - x = F.layer_norm( - x + attn, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1 - ) - x = F.layer_norm( - x + self.mlp.forward(x), - [self.hidden_dim], - self.norm_w2, - self.norm_b2, - self.norm_eps2, - ) - return x, k_cache, v_cache - - def decode_next_token(self, x, k_cache, v_cache): - q, k, v = F.linear(x, self.qkv_w, self.qkv_b).chunk(3, dim=-1) - - k_cache = torch.cat([k_cache, k], dim=1) - v_cache = torch.cat([v_cache, v], dim=1) - kv_len = k_cache.shape[1] - - batch_size = q.shape[0] - q_len = q.shape[1] - - q = q.view(batch_size, q_len, self.num_heads, -1).transpose(1, 2) - k = k_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) - v = v_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2) - - attn = F.scaled_dot_product_attention(q, k, v) - - attn = attn.permute(2, 0, 1, 
3).reshape(batch_size, -1, self.hidden_dim) - attn = F.linear(attn, self.out_w, self.out_b) - - x = F.layer_norm( - x + attn, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1 - ) - x = F.layer_norm( - x + self.mlp.forward(x), - [self.hidden_dim], - self.norm_w2, - self.norm_b2, - self.norm_eps2, - ) - return x, k_cache, v_cache - - -@torch.jit.script -class T2STransformer: - def __init__(self, num_blocks: int, blocks: List[T2SBlock]): - self.num_blocks: int = num_blocks - self.blocks = blocks - - def process_prompt( - self, x, attn_mask: torch.Tensor): - k_cache: List[torch.Tensor] = [] - v_cache: List[torch.Tensor] = [] - for i in range(self.num_blocks): - x, k_cache_, v_cache_ = self.blocks[i].process_prompt(x, attn_mask) - k_cache.append(k_cache_) - v_cache.append(v_cache_) - return x, k_cache, v_cache - - def decode_next_token( - self, x, k_cache: List[torch.Tensor], v_cache: List[torch.Tensor] - ): - for i in range(self.num_blocks): - x, k_cache[i], v_cache[i] = self.blocks[i].decode_next_token(x, k_cache[i], v_cache[i]) - return x, k_cache, v_cache - - -class Text2SemanticDecoder(nn.Module): - def __init__(self, config, norm_first=False, top_k=3): - super(Text2SemanticDecoder, self).__init__() - self.model_dim = config["model"]["hidden_dim"] - self.embedding_dim = config["model"]["embedding_dim"] - self.num_head = config["model"]["head"] - self.num_layers = config["model"]["n_layer"] - self.norm_first = norm_first - self.vocab_size = config["model"]["vocab_size"] - self.phoneme_vocab_size = config["model"]["phoneme_vocab_size"] - self.p_dropout = config["model"]["dropout"] - self.EOS = config["model"]["EOS"] - self.norm_first = norm_first - assert self.EOS == self.vocab_size - 1 - # should be same as num of kmeans bin - # assert self.EOS == 1024 - self.bert_proj = nn.Linear(1024, self.embedding_dim) - self.ar_text_embedding = TokenEmbedding( - self.embedding_dim, self.phoneme_vocab_size, self.p_dropout - ) - self.ar_text_position = SinePositionalEmbedding( - self.embedding_dim, dropout=0.1, scale=False, alpha=True - ) - self.ar_audio_embedding = TokenEmbedding( - self.embedding_dim, self.vocab_size, self.p_dropout - ) - self.ar_audio_position = SinePositionalEmbedding( - self.embedding_dim, dropout=0.1, scale=False, alpha=True - ) - - self.h = TransformerEncoder( - TransformerEncoderLayer( - d_model=self.model_dim, - nhead=self.num_head, - dim_feedforward=self.model_dim * 4, - dropout=0.1, - batch_first=True, - norm_first=norm_first, - ), - num_layers=self.num_layers, - norm=LayerNorm(self.model_dim) if norm_first else None, - ) - - self.ar_predict_layer = nn.Linear(self.model_dim, self.vocab_size, bias=False) - self.loss_fct = nn.CrossEntropyLoss(reduction="sum") - - self.ar_accuracy_metric = MulticlassAccuracy( - self.vocab_size, - top_k=top_k, - average="micro", - multidim_average="global", - ignore_index=self.EOS, - ) - - blocks = [] - - for i in range(self.num_layers): - layer = self.h.layers[i] - t2smlp = T2SMLP( - layer.linear1.weight, - layer.linear1.bias, - layer.linear2.weight, - layer.linear2.bias - ) - # (layer.self_attn.in_proj_weight, layer.self_attn.in_proj_bias) - block = T2SBlock( - self.num_head, - self.model_dim, - t2smlp, - layer.self_attn.in_proj_weight, - layer.self_attn.in_proj_bias, - layer.self_attn.out_proj.weight, - layer.self_attn.out_proj.bias, - layer.norm1.weight, - layer.norm1.bias, - layer.norm1.eps, - layer.norm2.weight, - layer.norm2.bias, - layer.norm2.eps - ) - - blocks.append(block) - - self.t2s_transformer = 
T2STransformer(self.num_layers, blocks) - - def make_input_data(self, x, x_lens, y, y_lens, bert_feature): - x = self.ar_text_embedding(x) - x = x + self.bert_proj(bert_feature.transpose(1, 2)) - x = self.ar_text_position(x) - x_mask = make_pad_mask(x_lens) - - y_mask = make_pad_mask(y_lens) - y_mask_int = y_mask.type(torch.int64) - codes = y.type(torch.int64) * (1 - y_mask_int) - - # Training - # AR Decoder - y, targets = self.pad_y_eos(codes, y_mask_int, eos_id=self.EOS) - x_len = x_lens.max() - y_len = y_lens.max() - y_emb = self.ar_audio_embedding(y) - y_pos = self.ar_audio_position(y_emb) - - xy_padding_mask = torch.concat([x_mask, y_mask], dim=1) - - ar_xy_padding_mask = xy_padding_mask - - x_attn_mask = F.pad( - torch.zeros((x_len, x_len), dtype=torch.bool, device=x.device), - (0, y_len), - value=True, - ) - - y_attn_mask = F.pad( - torch.triu( - torch.ones(y_len, y_len, dtype=torch.bool, device=x.device), - diagonal=1, - ), - (x_len, 0), - value=False, - ) - - xy_attn_mask = torch.concat([x_attn_mask, y_attn_mask], dim=0) - bsz, src_len = x.shape[0], x_len + y_len - _xy_padding_mask = ( - ar_xy_padding_mask.view(bsz, 1, 1, src_len) - .expand(-1, self.num_head, -1, -1) - .reshape(bsz * self.num_head, 1, src_len) - ) - xy_attn_mask = xy_attn_mask.logical_or(_xy_padding_mask) - new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype) - new_attn_mask.masked_fill_(xy_attn_mask, float("-inf")) - xy_attn_mask = new_attn_mask - # x 和完整的 y 一次性输入模型 - xy_pos = torch.concat([x, y_pos], dim=1) - - return xy_pos, xy_attn_mask, targets - - def forward(self, x, x_lens, y, y_lens, bert_feature): - """ - x: phoneme_ids - y: semantic_ids - """ - - reject_y, reject_y_lens = make_reject_y(y, y_lens) - - xy_pos, xy_attn_mask, targets = self.make_input_data(x, x_lens, y, y_lens, bert_feature) - - xy_dec, _ = self.h( - (xy_pos, None), - mask=xy_attn_mask, - ) - x_len = x_lens.max() - logits = self.ar_predict_layer(xy_dec[:, x_len:]) - - ###### DPO ############# - reject_xy_pos, reject_xy_attn_mask, reject_targets = self.make_input_data(x, x_lens, reject_y, reject_y_lens, bert_feature) - - reject_xy_dec, _ = self.h( - (reject_xy_pos, None), - mask=reject_xy_attn_mask, - ) - x_len = x_lens.max() - reject_logits = self.ar_predict_layer(reject_xy_dec[:, x_len:]) - - # loss - # from feiteng: 每次 duration 越多, 梯度更新也应该更多, 所以用 sum - - loss_1 = F.cross_entropy(logits.permute(0, 2, 1), targets, reduction="sum") - acc = self.ar_accuracy_metric(logits.permute(0, 2, 1).detach(), targets).item() - - A_logits, R_logits = get_batch_logps(logits, reject_logits, targets, reject_targets) - loss_2, _, _ = dpo_loss(A_logits, R_logits, 0, 0, 0.2, reference_free=True) - - loss = loss_1 + loss_2 - - return loss, acc - - def forward_old(self, x, x_lens, y, y_lens, bert_feature): - """ - x: phoneme_ids - y: semantic_ids - """ - x = self.ar_text_embedding(x) - x = x + self.bert_proj(bert_feature.transpose(1, 2)) - x = self.ar_text_position(x) - x_mask = make_pad_mask(x_lens) - - y_mask = make_pad_mask(y_lens) - y_mask_int = y_mask.type(torch.int64) - codes = y.type(torch.int64) * (1 - y_mask_int) - - # Training - # AR Decoder - y, targets = self.pad_y_eos(codes, y_mask_int, eos_id=self.EOS) - x_len = x_lens.max() - y_len = y_lens.max() - y_emb = self.ar_audio_embedding(y) - y_pos = self.ar_audio_position(y_emb) - - xy_padding_mask = torch.concat([x_mask, y_mask], dim=1) - ar_xy_padding_mask = xy_padding_mask - - x_attn_mask = F.pad( - torch.zeros((x_len, x_len), dtype=torch.bool, device=x.device), - (0, y_len), - value=True, - ) 
- y_attn_mask = F.pad( - torch.triu( - torch.ones(y_len, y_len, dtype=torch.bool, device=x.device), - diagonal=1, - ), - (x_len, 0), - value=False, - ) - xy_attn_mask = torch.concat([x_attn_mask, y_attn_mask], dim=0) - bsz, src_len = x.shape[0], x_len + y_len - _xy_padding_mask = ( - ar_xy_padding_mask.view(bsz, 1, 1, src_len) - .expand(-1, self.num_head, -1, -1) - .reshape(bsz * self.num_head, 1, src_len) - ) - xy_attn_mask = xy_attn_mask.logical_or(_xy_padding_mask) - new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype) - new_attn_mask.masked_fill_(xy_attn_mask, float("-inf")) - xy_attn_mask = new_attn_mask - # x 和完整的 y 一次性输入模型 - xy_pos = torch.concat([x, y_pos], dim=1) - xy_dec, _ = self.h( - (xy_pos, None), - mask=xy_attn_mask, - ) - logits = self.ar_predict_layer(xy_dec[:, x_len:]).permute(0, 2, 1) - # loss - # from feiteng: 每次 duration 越多, 梯度更新也应该更多, 所以用 sum - loss = F.cross_entropy(logits, targets, reduction="sum") - acc = self.ar_accuracy_metric(logits.detach(), targets).item() - return loss, acc - - # 需要看下这个函数和 forward 的区别以及没有 semantic 的时候 prompts 输入什么 - def infer( - self, - x, - x_lens, - prompts, - bert_feature, - top_k: int = -100, - early_stop_num: int = -1, - temperature: float = 1.0, - ): - x = self.ar_text_embedding(x) - x = x + self.bert_proj(bert_feature.transpose(1, 2)) - x = self.ar_text_position(x) - - # AR Decoder - y = prompts - prefix_len = y.shape[1] - x_len = x.shape[1] - x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool) - stop = False - for _ in tqdm(range(1500)): - y_emb = self.ar_audio_embedding(y) - y_pos = self.ar_audio_position(y_emb) - # x 和逐渐增长的 y 一起输入给模型 - xy_pos = torch.concat([x, y_pos], dim=1) - y_len = y.shape[1] - x_attn_mask_pad = F.pad( - x_attn_mask, - (0, y_len), - value=True, - ) - y_attn_mask = F.pad( - torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1), - (x_len, 0), - value=False, - ) - xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to( - y.device - ) - - xy_dec, _ = self.h( - (xy_pos, None), - mask=xy_attn_mask, - ) - logits = self.ar_predict_layer(xy_dec[:, -1]) - samples = topk_sampling( - logits, top_k=top_k, top_p=1.0, temperature=temperature - ) - - if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: - print("use early stop num:", early_stop_num) - stop = True - - if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: - # print(torch.argmax(logits, dim=-1)[0] == self.EOS, samples[0, 0] == self.EOS) - stop = True - if stop: - if prompts.shape[1] == y.shape[1]: - y = torch.concat([y, torch.zeros_like(samples)], dim=1) - print("bad zero prediction") - print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]") - break - # 本次生成的 semantic_ids 和之前的 y 构成新的 y - # print(samples.shape)#[1,1]#第一个1是bs - # import os - # os._exit(2333) - y = torch.concat([y, samples], dim=1) - return y - - def pad_y_eos(self, y, y_mask_int, eos_id): - targets = F.pad(y, (0, 1), value=0) + eos_id * F.pad( - y_mask_int, (0, 1), value=1 - ) - # 错位 - return targets[:, :-1], targets[:, 1:] - - def infer_panel( - self, - x, #####全部文本token - x_lens, - prompts, ####参考音频token - bert_feature, - top_k: int = -100, - top_p: int = 100, - early_stop_num: int = -1, - temperature: float = 1.0, - ): - x = self.ar_text_embedding(x) - x = x + self.bert_proj(bert_feature.transpose(1, 2)) - x = self.ar_text_position(x) - - # AR Decoder - y = prompts - - x_len = x.shape[1] - x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool) - stop = False - # print(1111111,self.num_layers) - - k_cache 
= None - v_cache = None - ################### first step ########################## - if y is not None: - y_emb = self.ar_audio_embedding(y) - y_len = y_emb.shape[1] - prefix_len = y.shape[1] - y_pos = self.ar_audio_position(y_emb) - xy_pos = torch.concat([x, y_pos], dim=1) - ref_free = False - else: - y_emb = None - y_len = 0 - prefix_len = 0 - y_pos = None - xy_pos = x - y = torch.zeros(x.shape[0], 0, dtype=torch.int, device=x.device) - prompts = y - ref_free = True - - x_attn_mask_pad = F.pad( - x_attn_mask, - (0, y_len), ###xx的纯0扩展到xx纯0+xy纯1,(x,x+y) - value=True, - ) - y_attn_mask = F.pad( ###yy的右上1扩展到左边xy的0,(y,x+y) - torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1), - (x_len, 0), - value=False, - ) - xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to( - x.device - ) - - for idx in tqdm(range(1500)): - if xy_attn_mask is not None: - xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask) - else: - xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache) - - logits = self.ar_predict_layer( - xy_dec[:, -1] - ) - - if idx == 0: - xy_attn_mask = None - logits = logits[:, :-1] - samples = sample( - logits[0], y, top_k=top_k, top_p=top_p, repetition_penalty=1.35, temperature=temperature - )[0].unsqueeze(0) - - y = torch.concat([y, samples], dim=1) - - if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: - print("use early stop num:", early_stop_num) - stop = True - - if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: - stop = True - if stop: - if y.shape[1] == 0: - y = torch.concat([y, torch.zeros_like(samples)], dim=1) - print("bad zero prediction") - print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]") - break - - ####################### update next step ################################### - y_emb = self.ar_audio_embedding(y[:, -1:]) - xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to(dtype=y_emb.dtype,device=y_emb.device) - - if ref_free: - return y[:, :-1], 0 - return y[:, :-1], idx - 1 \ No newline at end of file diff --git a/AR/models/t2s_model_abc.py b/AR/models/t2s_model_abc.py index 2daec038ef78e40be1d4f3fb51393ed54381b9df..295e830a377a81bc9e8807ebec10c84965c8e438 100644 --- a/AR/models/t2s_model_abc.py +++ b/AR/models/t2s_model_abc.py @@ -12,10 +12,10 @@ import torch.nn.functional as F from torch.cuda.graphs import CUDAGraph from torch.profiler import ProfilerAction, tensorboard_trace_handler -from AR.modules.embedding import ( +from AR.models.embedding import ( SinePositionalEmbeddingNested as SinePositionalEmbedding, ) -from AR.modules.embedding import TokenEmbedding +from AR.models.embedding import TokenEmbedding Tensor = torch.Tensor diff --git a/AR/models/t2s_model_flash_attn.py b/AR/models/t2s_model_flash_attn.py index 5e781795f8a991f5c7444e214bab16cc725546c7..96ae0067108e9b80f9e717e4df7ef3f76fa191a2 100644 --- a/AR/models/t2s_model_flash_attn.py +++ b/AR/models/t2s_model_flash_attn.py @@ -9,6 +9,10 @@ import torch import torch.nn as nn from tqdm import tqdm +from AR.models.embedding import ( + SinePositionalEmbeddingNested as SinePositionalEmbedding, +) +from AR.models.embedding import TokenEmbedding from AR.models.structs import T2SRequest, T2SResult, T2SSession from AR.models.t2s_model_abc import ( AttentionABC, @@ -20,10 +24,6 @@ from AR.models.t2s_model_abc import ( TransformerBlockABC, TransformerDecoderABC, ) -from AR.modules.embedding import ( - 
SinePositionalEmbeddingNested as SinePositionalEmbedding, -) -from AR.modules.embedding import TokenEmbedding Tensor = torch.Tensor diff --git a/AR/models/t2s_model_onnx.py b/AR/models/t2s_model_onnx.py deleted file mode 100644 index 7834297d5561c87021c946cbe0671424b4a3466f..0000000000000000000000000000000000000000 --- a/AR/models/t2s_model_onnx.py +++ /dev/null @@ -1,338 +0,0 @@ -# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py -# reference: https://github.com/lifeiteng/vall-e -import torch -from tqdm import tqdm - -from AR.modules.embedding_onnx import SinePositionalEmbedding -from AR.modules.embedding_onnx import TokenEmbedding -from AR.modules.transformer_onnx import LayerNorm -from AR.modules.transformer_onnx import TransformerEncoder -from AR.modules.transformer_onnx import TransformerEncoderLayer -from torch import nn -from torch.nn import functional as F -from torchmetrics.classification import MulticlassAccuracy - -default_config = { - "embedding_dim": 512, - "hidden_dim": 512, - "num_head": 8, - "num_layers": 12, - "num_codebook": 8, - "p_dropout": 0.0, - "vocab_size": 1024 + 1, - "phoneme_vocab_size": 512, - "EOS": 1024, -} - -inf_tensor_value = torch.FloatTensor([-float("Inf")]).float() - -def logits_to_probs( - logits, - previous_tokens = None, - temperature: float = 1.0, - top_k = None, - top_p = None, - repetition_penalty: float = 1.0, -): - previous_tokens = previous_tokens.squeeze() - if previous_tokens is not None and repetition_penalty != 1.0: - previous_tokens = previous_tokens.long() - score = torch.gather(logits, dim=0, index=previous_tokens) - score = torch.where( - score < 0, score * repetition_penalty, score / repetition_penalty - ) - logits.scatter_(dim=0, index=previous_tokens, src=score) - - if top_p is not None and top_p < 1.0: - sorted_logits, sorted_indices = torch.sort(logits, descending=True) - cum_probs = torch.cumsum( - torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1 - ) - sorted_indices_to_remove = cum_probs > top_p - sorted_indices_to_remove[0] = False # keep at least one option - indices_to_remove = sorted_indices_to_remove.scatter( - dim=0, index=sorted_indices, src=sorted_indices_to_remove - ) - logits = logits.masked_fill(indices_to_remove, -float("Inf")) - - logits = logits / max(temperature, 1e-5) - - if top_k is not None: - v, _ = torch.topk(logits, top_k) - pivot = v.select(-1, -1).unsqueeze(-1) - logits = torch.where(logits < pivot, inf_tensor_value, logits) - - probs = torch.nn.functional.softmax(logits, dim=-1) - return probs - - -def multinomial_sample_one_no_sync( - probs_sort -): # Does multinomial sampling without a cuda synchronization - q = torch.randn_like(probs_sort) - return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) - - -def sample( - logits, - previous_tokens, - **sampling_kwargs, -): - probs = logits_to_probs( - logits=logits, previous_tokens=previous_tokens, **sampling_kwargs - ) - idx_next = multinomial_sample_one_no_sync(probs) - return idx_next, probs - - -class OnnxEncoder(nn.Module): - def __init__(self, ar_text_embedding, bert_proj, ar_text_position): - super().__init__() - self.ar_text_embedding = ar_text_embedding - self.bert_proj = bert_proj - self.ar_text_position = ar_text_position - - def forward(self, x, bert_feature): - x = self.ar_text_embedding(x) - x = x + self.bert_proj(bert_feature.transpose(1, 2)) - return self.ar_text_position(x) - - -class T2SFirstStageDecoder(nn.Module): - def __init__(self, ar_audio_embedding, 
ar_audio_position, h, ar_predict_layer, loss_fct, ar_accuracy_metric, - top_k, early_stop_num, num_layers): - super().__init__() - self.ar_audio_embedding = ar_audio_embedding - self.ar_audio_position = ar_audio_position - self.h = h - self.ar_predict_layer = ar_predict_layer - self.loss_fct = loss_fct - self.ar_accuracy_metric = ar_accuracy_metric - self.top_k = top_k - self.early_stop_num = early_stop_num - self.num_layers = num_layers - - def forward(self, x, prompt): - y = prompt - x_example = x[:,:,0] * 0.0 - #N, 1, 512 - cache = { - "all_stage": self.num_layers, - "k": None, - "v": None, - "y_emb": None, - "first_infer": 1, - "stage": 0, - } - - y_emb = self.ar_audio_embedding(y) - - cache["y_emb"] = y_emb - y_pos = self.ar_audio_position(y_emb) - - xy_pos = torch.concat([x, y_pos], dim=1) - - y_example = y_pos[:,:,0] * 0.0 - x_attn_mask = torch.matmul(x_example.transpose(0, 1) , x_example).bool() - y_attn_mask = torch.ones_like(torch.matmul(y_example.transpose(0, 1), y_example), dtype=torch.int64) - y_attn_mask = torch.cumsum(y_attn_mask, dim=1) - torch.cumsum( - torch.ones_like(y_example.transpose(0, 1), dtype=torch.int64), dim=0 - ) - y_attn_mask = y_attn_mask > 0 - - x_y_pad = torch.matmul(x_example.transpose(0, 1), y_example).bool() - y_x_pad = torch.matmul(y_example.transpose(0, 1), x_example).bool() - x_attn_mask_pad = torch.cat([x_attn_mask, torch.ones_like(x_y_pad)], dim=1) - y_attn_mask = torch.cat([y_x_pad, y_attn_mask], dim=1) - xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0) - cache["k"] = torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512)))\ - .unsqueeze(1).repeat(self.num_layers, 1, 1, 1) - cache["v"] = torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512)))\ - .unsqueeze(1).repeat(self.num_layers, 1, 1, 1) - - xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache) - logits = self.ar_predict_layer(xy_dec[:, -1]) - samples = sample(logits[0], y, top_k=self.top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0) - - y = torch.concat([y, samples], dim=1) - - return y, cache["k"], cache["v"], cache["y_emb"], x_example - - -class T2SStageDecoder(nn.Module): - def __init__(self, ar_audio_embedding, ar_audio_position, h, ar_predict_layer, loss_fct, ar_accuracy_metric, - top_k, early_stop_num, num_layers): - super().__init__() - self.ar_audio_embedding = ar_audio_embedding - self.ar_audio_position = ar_audio_position - self.h = h - self.ar_predict_layer = ar_predict_layer - self.loss_fct = loss_fct - self.ar_accuracy_metric = ar_accuracy_metric - self.top_k = top_k - self.early_stop_num = early_stop_num - self.num_layers = num_layers - - def forward(self, y, k, v, y_emb, x_example): - cache = { - "all_stage": self.num_layers, - "k": torch.nn.functional.pad(k, (0, 0, 0, 0, 0, 1)), - "v": torch.nn.functional.pad(v, (0, 0, 0, 0, 0, 1)), - "y_emb": y_emb, - "first_infer": 0, - "stage": 0, - } - - y_emb = torch.cat( - [cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], 1 - ) - cache["y_emb"] = y_emb - y_pos = self.ar_audio_position(y_emb) - - xy_pos = y_pos[:, -1:] - - y_example = y_pos[:,:,0] * 0.0 - - xy_attn_mask = torch.cat([x_example, y_example], dim=1) - xy_attn_mask = torch.zeros_like(xy_attn_mask, dtype=torch.bool) - - xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache) - logits = self.ar_predict_layer(xy_dec[:, -1]) - samples = sample(logits[0], y, top_k=self.top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0) - - y = torch.concat([y, samples], dim=1) - - return y, cache["k"], cache["v"], 
cache["y_emb"], logits, samples - - -class Text2SemanticDecoder(nn.Module): - def __init__(self, config, norm_first=False, top_k=3): - super(Text2SemanticDecoder, self).__init__() - self.model_dim = config["model"]["hidden_dim"] - self.embedding_dim = config["model"]["embedding_dim"] - self.num_head = config["model"]["head"] - self.num_layers = config["model"]["n_layer"] - self.norm_first = norm_first - self.vocab_size = config["model"]["vocab_size"] - self.phoneme_vocab_size = config["model"]["phoneme_vocab_size"] - self.p_dropout = float(config["model"]["dropout"]) - self.EOS = config["model"]["EOS"] - self.norm_first = norm_first - assert self.EOS == self.vocab_size - 1 - self.bert_proj = nn.Linear(1024, self.embedding_dim) - self.ar_text_embedding = TokenEmbedding(self.embedding_dim, self.phoneme_vocab_size, self.p_dropout) - self.ar_text_position = SinePositionalEmbedding(self.embedding_dim, dropout=0.1, scale=False, alpha=True) - self.ar_audio_embedding = TokenEmbedding(self.embedding_dim, self.vocab_size, self.p_dropout) - self.ar_audio_position = SinePositionalEmbedding(self.embedding_dim, dropout=0.1, scale=False, alpha=True) - self.h = TransformerEncoder( - TransformerEncoderLayer( - d_model=self.model_dim, - nhead=self.num_head, - dim_feedforward=self.model_dim * 4, - dropout=0.1, - batch_first=True, - norm_first=norm_first, - ), - num_layers=self.num_layers, - norm=LayerNorm(self.model_dim) if norm_first else None, - ) - self.ar_predict_layer = nn.Linear(self.model_dim, self.vocab_size, bias=False) - self.loss_fct = nn.CrossEntropyLoss(reduction="sum") - self.ar_accuracy_metric = MulticlassAccuracy( - self.vocab_size, - top_k=top_k, - average="micro", - multidim_average="global", - ignore_index=self.EOS, - ) - self.top_k = torch.LongTensor([1]) - self.early_stop_num = torch.LongTensor([-1]) - - def init_onnx(self): - self.onnx_encoder = OnnxEncoder(self.ar_text_embedding, self.bert_proj, self.ar_text_position) - self.first_stage_decoder = T2SFirstStageDecoder(self.ar_audio_embedding, self.ar_audio_position, self.h, - self.ar_predict_layer, self.loss_fct, self.ar_accuracy_metric, self.top_k, self.early_stop_num, - self.num_layers) - self.stage_decoder = T2SStageDecoder(self.ar_audio_embedding, self.ar_audio_position, self.h, - self.ar_predict_layer, self.loss_fct, self.ar_accuracy_metric, self.top_k, self.early_stop_num, - self.num_layers) - - def forward(self, x, prompts, bert_feature): - early_stop_num = self.early_stop_num - prefix_len = prompts.shape[1] - - x = self.onnx_encoder(x, bert_feature) - y, k, v, y_emb, stage, x_example = self.first_stage_decoder(x, prompts) - - stop = False - for idx in range(1, 1500): - enco = self.stage_decoder(y, k, v, y_emb, stage, x_example) - y, k, v, y_emb, stage, logits, samples = enco - if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: - stop = True - if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: - stop = True - if stop: - break - y[0, -1] = 0 - return y, idx - - def infer(self, x, prompts, bert_feature): - top_k = self.top_k - early_stop_num = self.early_stop_num - - x = self.onnx_encoder(x, bert_feature) - - y = prompts - prefix_len = y.shape[1] - x_len = x.shape[1] - x_example = x[:,:,0] * 0.0 - x_attn_mask = torch.matmul(x_example.transpose(0, 1), x_example) - x_attn_mask = torch.zeros_like(x_attn_mask, dtype=torch.bool) - - stop = False - cache = { - "all_stage": self.num_layers, - "k": [None] * self.num_layers, - "v": [None] * self.num_layers, - "y_emb": None, - "first_infer": 1, 
- "stage": 0, - } - for idx in range(1500): - if cache["first_infer"] == 1: - y_emb = self.ar_audio_embedding(y) - else: - y_emb = torch.cat( - [cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], 1 - ) - cache["y_emb"] = y_emb - y_pos = self.ar_audio_position(y_emb) - if cache["first_infer"] == 1: - xy_pos = torch.concat([x, y_pos], dim=1) - else: - xy_pos = y_pos[:, -1:] - y_len = y_pos.shape[1] - if cache["first_infer"] == 1: - x_attn_mask_pad = F.pad(x_attn_mask, (0, y_len), value=True) - y_attn_mask = F.pad( - torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1), - (x_len, 0), value=False - ) - xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0) - else: - xy_attn_mask = torch.zeros((1, x_len + y_len), dtype=torch.bool) - xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache) - logits = self.ar_predict_layer(xy_dec[:, -1]) - samples = sample(logits[0], y, top_k=top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0) - if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: - stop = True - if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS: - stop = True - if stop: - if prompts.shape[1] == y.shape[1]: - y = torch.concat([y, torch.zeros_like(samples)], dim=1) - break - y = torch.concat([y, samples], dim=1) - cache["first_infer"] = 0 - return y, idx \ No newline at end of file diff --git a/AR/modules/__init__.py b/AR/modules/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/AR/modules/activation.py b/AR/modules/activation.py deleted file mode 100644 index 5ca888b5e557777be5a09482a84ce00967f67236..0000000000000000000000000000000000000000 --- a/AR/modules/activation.py +++ /dev/null @@ -1,428 +0,0 @@ -# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py -from typing import Optional -from typing import Tuple -import torch -from torch import Tensor -from torch.nn import Linear -from torch.nn import Module -from torch.nn.init import constant_ -from torch.nn.init import xavier_normal_ -from torch.nn.init import xavier_uniform_ -from torch.nn.modules.linear import NonDynamicallyQuantizableLinear -from torch.nn.parameter import Parameter - -from torch.nn import functional as F -from AR.modules.patched_mha_with_cache import multi_head_attention_forward_patched - -F.multi_head_attention_forward = multi_head_attention_forward_patched - - -class MultiheadAttention(Module): - r"""Allows the model to jointly attend to information - from different representation subspaces as described in the paper: - `Attention Is All You Need `_. - - Multi-Head Attention is defined as: - - .. math:: - \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O - - where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`. - - ``forward()`` will use a special optimized implementation if all of the following - conditions are met: - - - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor. This - restriction will be loosened in the future.) 
- - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor argument ``requires_grad`` - - training is disabled (using ``.eval()``) - - dropout is 0 - - ``add_bias_kv`` is ``False`` - - ``add_zero_attn`` is ``False`` - - ``batch_first`` is ``True`` and the input is batched - - ``kdim`` and ``vdim`` are equal to ``embed_dim`` - - at most one of ``key_padding_mask`` or ``attn_mask`` is passed - - if a `NestedTensor `_ is passed, neither ``key_padding_mask`` - nor ``attn_mask`` is passed - - If the optimized implementation is in use, a - `NestedTensor `_ can be passed for - ``query``/``key``/``value`` to represent padding more efficiently than using a - padding mask. In this case, a `NestedTensor `_ - will be returned, and an additional speedup proportional to the fraction of the input - that is padding can be expected. - - Args: - embed_dim: Total dimension of the model. - num_heads: Number of parallel attention heads. Note that ``embed_dim`` will be split - across ``num_heads`` (i.e. each head will have dimension ``embed_dim // num_heads``). - dropout: Dropout probability on ``attn_output_weights``. Default: ``0.0`` (no dropout). - bias: If specified, adds bias to input / output projection layers. Default: ``True``. - add_bias_kv: If specified, adds bias to the key and value sequences at dim=0. Default: ``False``. - add_zero_attn: If specified, adds a new batch of zeros to the key and value sequences at dim=1. - Default: ``False``. - kdim: Total number of features for keys. Default: ``None`` (uses ``kdim=embed_dim``). - vdim: Total number of features for values. Default: ``None`` (uses ``vdim=embed_dim``). - batch_first: If ``True``, then the input and output tensors are provided - as (batch, seq, feature). Default: ``False`` (seq, batch, feature). 
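Apart from the extra ``cache`` keyword (threaded through to the patched attention kernel), calling this wrapper looks just like ``torch.nn.MultiheadAttention``. A brief usage sketch, assuming the module is importable from the pre-deletion tree and using illustrative dimensions:

import torch
from AR.modules.activation import MultiheadAttention  # this file

attn = MultiheadAttention(embed_dim=512, num_heads=8, batch_first=True).eval()
x = torch.randn(1, 10, 512)        # (batch, seq, feature) because batch_first=True
with torch.no_grad():
    out, weights = attn(x, x, x)   # cache=None: behaves like ordinary self-attention
print(out.shape)                   # torch.Size([1, 10, 512])

With ``eval()``, ``no_grad()``, zero dropout and ``batch_first=True``, the checks in ``forward`` below can route such a call onto the optimized native path; passing an ``attn_mask`` (as the cached inference path does) routes it onto the patched implementation instead.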
- - Examples:: - - >>> # xdoctest: +SKIP - >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) - >>> attn_output, attn_output_weights = multihead_attn(query, key, value) - - """ - __constants__ = ["batch_first"] - bias_k: Optional[torch.Tensor] - bias_v: Optional[torch.Tensor] - - def __init__( - self, - embed_dim, - num_heads, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - kdim=None, - vdim=None, - batch_first=False, - linear1_cls=Linear, - linear2_cls=Linear, - device=None, - dtype=None, - ) -> None: - factory_kwargs = {"device": device, "dtype": dtype} - super(MultiheadAttention, self).__init__() - self.embed_dim = embed_dim - self.kdim = kdim if kdim is not None else embed_dim - self.vdim = vdim if vdim is not None else embed_dim - self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim - - self.num_heads = num_heads - self.dropout = dropout - self.batch_first = batch_first - self.head_dim = embed_dim // num_heads - assert ( - self.head_dim * num_heads == self.embed_dim - ), "embed_dim must be divisible by num_heads" - - if add_bias_kv: - self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) - self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) - else: - self.bias_k = self.bias_v = None - - if linear1_cls == Linear: - if not self._qkv_same_embed_dim: - self.q_proj_weight = Parameter( - torch.empty((embed_dim, embed_dim), **factory_kwargs) - ) - self.k_proj_weight = Parameter( - torch.empty((embed_dim, self.kdim), **factory_kwargs) - ) - self.v_proj_weight = Parameter( - torch.empty((embed_dim, self.vdim), **factory_kwargs) - ) - self.register_parameter("in_proj_weight", None) - else: - self.in_proj_weight = Parameter( - torch.empty((3 * embed_dim, embed_dim), **factory_kwargs) - ) - self.register_parameter("q_proj_weight", None) - self.register_parameter("k_proj_weight", None) - self.register_parameter("v_proj_weight", None) - - if bias: - self.in_proj_bias = Parameter( - torch.empty(3 * embed_dim, **factory_kwargs) - ) - else: - self.register_parameter("in_proj_bias", None) - self.out_proj = NonDynamicallyQuantizableLinear( - embed_dim, embed_dim, bias=bias, **factory_kwargs - ) - - self._reset_parameters() - else: - if not self._qkv_same_embed_dim: - raise NotImplementedError - else: - self.in_proj_linear = linear1_cls( - embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs - ) - self.in_proj_weight = self.in_proj_linear.weight - - self.register_parameter("q_proj_weight", None) - self.register_parameter("k_proj_weight", None) - self.register_parameter("v_proj_weight", None) - - if bias: - self.in_proj_bias = self.in_proj_linear.bias - else: - self.register_parameter("in_proj_bias", None) - - self.out_proj = linear2_cls( - embed_dim, embed_dim, bias=bias, **factory_kwargs - ) - - if self.bias_k is not None: - xavier_normal_(self.bias_k) - if self.bias_v is not None: - xavier_normal_(self.bias_v) - - self.add_zero_attn = add_zero_attn - - def _reset_parameters(self): - if self._qkv_same_embed_dim: - xavier_uniform_(self.in_proj_weight) - else: - xavier_uniform_(self.q_proj_weight) - xavier_uniform_(self.k_proj_weight) - xavier_uniform_(self.v_proj_weight) - - if self.in_proj_bias is not None: - constant_(self.in_proj_bias, 0.0) - constant_(self.out_proj.bias, 0.0) - - if self.bias_k is not None: - xavier_normal_(self.bias_k) - if self.bias_v is not None: - xavier_normal_(self.bias_v) - - def __setstate__(self, state): - # Support loading old MultiheadAttention checkpoints generated 
by v1.1.0 - if "_qkv_same_embed_dim" not in state: - state["_qkv_same_embed_dim"] = True - - super(MultiheadAttention, self).__setstate__(state) - - def forward( - self, - query: Tensor, - key: Tensor, - value: Tensor, - key_padding_mask: Optional[Tensor] = None, - need_weights: bool = True, - attn_mask: Optional[Tensor] = None, - average_attn_weights: bool = True, - cache=None, - ) -> Tuple[Tensor, Optional[Tensor]]: - r""" - Args: - query: Query embeddings of shape :math:`(L, E_q)` for unbatched input, :math:`(L, N, E_q)` when ``batch_first=False`` - or :math:`(N, L, E_q)` when ``batch_first=True``, where :math:`L` is the target sequence length, - :math:`N` is the batch size, and :math:`E_q` is the query embedding dimension ``embed_dim``. - Queries are compared against key-value pairs to produce the output. - See "Attention Is All You Need" for more details. - key: Key embeddings of shape :math:`(S, E_k)` for unbatched input, :math:`(S, N, E_k)` when ``batch_first=False`` - or :math:`(N, S, E_k)` when ``batch_first=True``, where :math:`S` is the source sequence length, - :math:`N` is the batch size, and :math:`E_k` is the key embedding dimension ``kdim``. - See "Attention Is All You Need" for more details. - value: Value embeddings of shape :math:`(S, E_v)` for unbatched input, :math:`(S, N, E_v)` when - ``batch_first=False`` or :math:`(N, S, E_v)` when ``batch_first=True``, where :math:`S` is the source - sequence length, :math:`N` is the batch size, and :math:`E_v` is the value embedding dimension ``vdim``. - See "Attention Is All You Need" for more details. - key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key`` - to ignore for the purpose of attention (i.e. treat as "padding"). For unbatched `query`, shape should be :math:`(S)`. - Binary and byte masks are supported. - For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for - the purpose of attention. For a float mask, it will be directly added to the corresponding ``key`` value. - need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``. - Default: ``True``. - attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape - :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size, - :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be - broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch. - Binary, byte, and float masks are supported. For a binary mask, a ``True`` value indicates that the - corresponding position is not allowed to attend. For a byte mask, a non-zero value indicates that the - corresponding position is not allowed to attend. For a float mask, the mask values will be added to - the attention weight. - average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across - heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an - effect when ``need_weights=True``. Default: ``True`` (i.e. average weights across heads) - - Outputs: - - **attn_output** - Attention outputs of shape :math:`(L, E)` when input is unbatched, - :math:`(L, N, E)` when ``batch_first=False`` or :math:`(N, L, E)` when ``batch_first=True``, - where :math:`L` is the target sequence length, :math:`N` is the batch size, and :math:`E` is the - embedding dimension ``embed_dim``. 
- - **attn_output_weights** - Only returned when ``need_weights=True``. If ``average_attn_weights=True``, - returns attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or - :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and - :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per - head of shape :math:`(\text{num\_heads}, L, S)` when input is unbatched or :math:`(N, \text{num\_heads}, L, S)`. - - .. note:: - `batch_first` argument is ignored for unbatched inputs. - """ - is_batched = query.dim() == 3 - if key_padding_mask is not None: - _kpm_dtype = key_padding_mask.dtype - if _kpm_dtype != torch.bool and not torch.is_floating_point( - key_padding_mask - ): - raise AssertionError( - "only bool and floating types of key_padding_mask are supported" - ) - why_not_fast_path = "" - if not is_batched: - why_not_fast_path = ( - f"input not batched; expected query.dim() of 3 but got {query.dim()}" - ) - elif query is not key or key is not value: - # When lifting this restriction, don't forget to either - # enforce that the dtypes all match or test cases where - # they don't! - why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)" - elif self.in_proj_bias is not None and query.dtype != self.in_proj_bias.dtype: - why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match" - elif ( - self.in_proj_weight is not None and query.dtype != self.in_proj_weight.dtype - ): - # this case will fail anyway, but at least they'll get a useful error message. - why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match" - elif self.training: - why_not_fast_path = "training is enabled" - elif not self.batch_first: - why_not_fast_path = "batch_first was not True" - elif self.bias_k is not None: - why_not_fast_path = "self.bias_k was not None" - elif self.bias_v is not None: - why_not_fast_path = "self.bias_v was not None" - elif self.dropout: - why_not_fast_path = f"dropout was {self.dropout}, required zero" - elif self.add_zero_attn: - why_not_fast_path = "add_zero_attn was enabled" - elif not self._qkv_same_embed_dim: - why_not_fast_path = "_qkv_same_embed_dim was not True" - elif attn_mask is not None: - why_not_fast_path = "attn_mask was not None" - elif query.is_nested and key_padding_mask is not None: - why_not_fast_path = ( - "key_padding_mask is not supported with NestedTensor input" - ) - elif self.num_heads % 2 == 1: - why_not_fast_path = "num_heads is odd" - elif torch.is_autocast_enabled(): - why_not_fast_path = "autocast is enabled" - - if not why_not_fast_path: - tensor_args = ( - query, - key, - value, - self.in_proj_weight, - self.in_proj_bias, - self.out_proj.weight, - self.out_proj.bias, - ) - # We have to use list comprehensions below because TorchScript does not support - # generator expressions. 
- if torch.overrides.has_torch_function(tensor_args): - why_not_fast_path = "some Tensor argument has_torch_function" - elif not all( - [ - (x is None or x.is_cuda or "cpu" in str(x.device)) - for x in tensor_args - ] - ): - why_not_fast_path = "some Tensor argument is neither CUDA nor CPU" - elif torch.is_grad_enabled() and any( - [x is not None and x.requires_grad for x in tensor_args] - ): - why_not_fast_path = ( - "grad is enabled and at least one of query or the " - "input/output projection weights or biases requires_grad" - ) - if not why_not_fast_path: - return torch._native_multi_head_attention( - query, - key, - value, - self.embed_dim, - self.num_heads, - self.in_proj_weight, - self.in_proj_bias, - self.out_proj.weight, - self.out_proj.bias, - key_padding_mask if key_padding_mask is not None else attn_mask, - need_weights, - average_attn_weights, - 1 - if key_padding_mask is not None - else 0 - if attn_mask is not None - else None, - ) - - any_nested = query.is_nested or key.is_nested or value.is_nested - assert not any_nested, ( - "MultiheadAttention does not support NestedTensor outside of its fast path. " - + f"The fast path was not hit because {why_not_fast_path}" - ) - - if self.batch_first and is_batched: - # make sure that the transpose op does not affect the "is" property - if key is value: - if query is key: - query = key = value = query.transpose(1, 0) - else: - query, key = [x.transpose(1, 0) for x in (query, key)] - value = key - else: - query, key, value = [x.transpose(1, 0) for x in (query, key, value)] - - if not self._qkv_same_embed_dim: - attn_output, attn_output_weights = F.multi_head_attention_forward( - query, - key, - value, - self.embed_dim, - self.num_heads, - self.in_proj_weight, - self.in_proj_bias, - self.bias_k, - self.bias_v, - self.add_zero_attn, - self.dropout, - self.out_proj.weight, - self.out_proj.bias, - training=self.training, - key_padding_mask=key_padding_mask, - need_weights=need_weights, - attn_mask=attn_mask, - use_separate_proj_weight=True, - q_proj_weight=self.q_proj_weight, - k_proj_weight=self.k_proj_weight, - v_proj_weight=self.v_proj_weight, - average_attn_weights=average_attn_weights, - cache=cache, - ) - else: - attn_output, attn_output_weights = F.multi_head_attention_forward( - query, - key, - value, - self.embed_dim, - self.num_heads, - self.in_proj_weight, - self.in_proj_bias, - self.bias_k, - self.bias_v, - self.add_zero_attn, - self.dropout, - self.out_proj.weight, - self.out_proj.bias, - training=self.training, - key_padding_mask=key_padding_mask, - need_weights=need_weights, - attn_mask=attn_mask, - average_attn_weights=average_attn_weights, - cache=cache, - ) - if self.batch_first and is_batched: - return attn_output.transpose(1, 0), attn_output_weights - else: - return attn_output, attn_output_weights diff --git a/AR/modules/activation_onnx.py b/AR/modules/activation_onnx.py deleted file mode 100644 index b54acd999f165cb7d47a9388461e3f31164cd380..0000000000000000000000000000000000000000 --- a/AR/modules/activation_onnx.py +++ /dev/null @@ -1,178 +0,0 @@ -# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py -from typing import Optional -from typing import Tuple -import torch -from torch import Tensor -from torch.nn import Linear -from torch.nn import Module -from torch.nn.init import constant_ -from torch.nn.init import xavier_normal_ -from torch.nn.init import xavier_uniform_ -from torch.nn.modules.linear import NonDynamicallyQuantizableLinear -from torch.nn.parameter import Parameter - 
-from torch.nn import functional as F -from AR.modules.patched_mha_with_cache_onnx import multi_head_attention_forward_patched - - -class MultiheadAttention(Module): - __constants__ = ["batch_first"] - bias_k: Optional[torch.Tensor] - bias_v: Optional[torch.Tensor] - - def __init__( - self, - embed_dim, - num_heads, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - kdim=None, - vdim=None, - batch_first=False, - linear1_cls=Linear, - linear2_cls=Linear, - device=None, - dtype=None, - ) -> None: - factory_kwargs = {"device": device, "dtype": dtype} - super(MultiheadAttention, self).__init__() - self.embed_dim = embed_dim - self.kdim = kdim if kdim is not None else embed_dim - self.vdim = vdim if vdim is not None else embed_dim - self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim - - self.num_heads = num_heads - self.dropout = dropout - self.batch_first = batch_first - self.head_dim = embed_dim // num_heads - assert ( - self.head_dim * num_heads == self.embed_dim - ), "embed_dim must be divisible by num_heads" - - if add_bias_kv: - self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) - self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) - else: - self.bias_k = self.bias_v = None - - if linear1_cls == Linear: - if not self._qkv_same_embed_dim: - self.q_proj_weight = Parameter( - torch.empty((embed_dim, embed_dim), **factory_kwargs) - ) - self.k_proj_weight = Parameter( - torch.empty((embed_dim, self.kdim), **factory_kwargs) - ) - self.v_proj_weight = Parameter( - torch.empty((embed_dim, self.vdim), **factory_kwargs) - ) - self.register_parameter("in_proj_weight", None) - else: - self.in_proj_weight = Parameter( - torch.empty((3 * embed_dim, embed_dim), **factory_kwargs) - ) - self.register_parameter("q_proj_weight", None) - self.register_parameter("k_proj_weight", None) - self.register_parameter("v_proj_weight", None) - - if bias: - self.in_proj_bias = Parameter( - torch.empty(3 * embed_dim, **factory_kwargs) - ) - else: - self.register_parameter("in_proj_bias", None) - self.out_proj = NonDynamicallyQuantizableLinear( - embed_dim, embed_dim, bias=bias, **factory_kwargs - ) - - self._reset_parameters() - else: - if not self._qkv_same_embed_dim: - raise NotImplementedError - else: - self.in_proj_linear = linear1_cls( - embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs - ) - self.in_proj_weight = self.in_proj_linear.weight - - self.register_parameter("q_proj_weight", None) - self.register_parameter("k_proj_weight", None) - self.register_parameter("v_proj_weight", None) - - if bias: - self.in_proj_bias = self.in_proj_linear.bias - else: - self.register_parameter("in_proj_bias", None) - - self.out_proj = linear2_cls( - embed_dim, embed_dim, bias=bias, **factory_kwargs - ) - - if self.bias_k is not None: - xavier_normal_(self.bias_k) - if self.bias_v is not None: - xavier_normal_(self.bias_v) - - self.add_zero_attn = add_zero_attn - - def _reset_parameters(self): - if self._qkv_same_embed_dim: - xavier_uniform_(self.in_proj_weight) - else: - xavier_uniform_(self.q_proj_weight) - xavier_uniform_(self.k_proj_weight) - xavier_uniform_(self.v_proj_weight) - - if self.in_proj_bias is not None: - constant_(self.in_proj_bias, 0.0) - constant_(self.out_proj.bias, 0.0) - - if self.bias_k is not None: - xavier_normal_(self.bias_k) - if self.bias_v is not None: - xavier_normal_(self.bias_v) - - def __setstate__(self, state): - # Support loading old MultiheadAttention checkpoints generated by v1.1.0 - if 
"_qkv_same_embed_dim" not in state: - state["_qkv_same_embed_dim"] = True - - super(MultiheadAttention, self).__setstate__(state) - - def forward( - self, - query: Tensor, - key: Tensor, - value: Tensor, - key_padding_mask: Optional[Tensor] = None, - need_weights: bool = True, - attn_mask: Optional[Tensor] = None, - average_attn_weights: bool = True, - cache=None, - ) -> Tuple[Tensor, Optional[Tensor]]: - any_nested = query.is_nested or key.is_nested or value.is_nested - query = key = value = query.transpose(1, 0) - attn_output = multi_head_attention_forward_patched( - query, - key, - value, - self.embed_dim, - self.num_heads, - self.in_proj_weight, - self.in_proj_bias, - self.bias_k, - self.bias_v, - self.add_zero_attn, - self.dropout, - self.out_proj.weight, - self.out_proj.bias, - training=self.training, - key_padding_mask=key_padding_mask, - need_weights=need_weights, - attn_mask=attn_mask, - average_attn_weights=average_attn_weights, - cache=cache, - ) - return attn_output.transpose(1, 0) diff --git a/AR/modules/embedding_onnx.py b/AR/modules/embedding_onnx.py deleted file mode 100644 index b93405b45005d6101da1a3947d3d188fc460ae4e..0000000000000000000000000000000000000000 --- a/AR/modules/embedding_onnx.py +++ /dev/null @@ -1,63 +0,0 @@ -# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py -import math - -import torch -from torch import nn - - -class TokenEmbedding(nn.Module): - def __init__( - self, - embedding_dim: int, - vocab_size: int, - dropout: float = 0.0, - ): - super().__init__() - - self.vocab_size = vocab_size - self.embedding_dim = embedding_dim - - self.dropout = torch.nn.Dropout(p=dropout) - self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim) - - @property - def weight(self) -> torch.Tensor: - return self.word_embeddings.weight - - def embedding(self, index: int) -> torch.Tensor: - return self.word_embeddings.weight[index : index + 1] - - def forward(self, x: torch.Tensor): - x = self.word_embeddings(x) - x = self.dropout(x) - return x - - -class SinePositionalEmbedding(nn.Module): - def __init__( - self, - embedding_dim: int, - dropout: float = 0.0, - scale: bool = False, - alpha: bool = False, - ): - super().__init__() - self.embedding_dim = embedding_dim - self.x_scale = math.sqrt(embedding_dim) if scale else 1.0 - self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha) - self.dropout = torch.nn.Dropout(p=dropout) - self.reverse = False - self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim)) - - def extend_pe(self, x): - position = torch.cumsum(torch.ones_like(x[:,:,0]), dim=1).transpose(0, 1) - scpe = (position * self.div_term).unsqueeze(0) - pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0) - pe = pe.contiguous().view(1, -1, self.embedding_dim) - return pe - - def forward(self, x: torch.Tensor) -> torch.Tensor: - pe = self.extend_pe(x) - output = x.unsqueeze(-1) if x.ndim == 2 else x - output = output * self.x_scale + self.alpha * pe - return self.dropout(output) diff --git a/AR/modules/lr_schedulers.py b/AR/modules/lr_schedulers.py deleted file mode 100644 index b8867467381cd58e14ad09ac4434512bc46186fc..0000000000000000000000000000000000000000 --- a/AR/modules/lr_schedulers.py +++ /dev/null @@ -1,83 +0,0 @@ -# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py -# reference: https://github.com/lifeiteng/vall-e -import math - -import torch -from matplotlib import pyplot as 
plt -from torch import nn -from torch.optim import Adam - - -class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler): - """ - Implements Warmup learning rate schedule until 'warmup_steps', going from 'init_lr' to 'peak_lr' for multiple optimizers. - """ - - def __init__( - self, - optimizer, - init_lr, - peak_lr, - end_lr, - warmup_steps=10000, - total_steps=400000, - current_step=0, - ): - self.init_lr = init_lr - self.peak_lr = peak_lr - self.end_lr = end_lr - self.optimizer = optimizer - self._warmup_rate = (peak_lr - init_lr) / warmup_steps - self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps) - self._current_step = current_step - self.lr = init_lr - self.warmup_steps = warmup_steps - self.total_steps = total_steps - self._last_lr = [self.lr] - - def set_lr(self, lr): - self._last_lr = [g["lr"] for g in self.optimizer.param_groups] - for g in self.optimizer.param_groups: - # g['lr'] = lr - g["lr"] = self.end_lr ###锁定用线性 - - def step(self): - if self._current_step < self.warmup_steps: - lr = self.init_lr + self._warmup_rate * self._current_step - - elif self._current_step > self.total_steps: - lr = self.end_lr - - else: - decay_ratio = (self._current_step - self.warmup_steps) / ( - self.total_steps - self.warmup_steps - ) - if decay_ratio < 0.0 or decay_ratio > 1.0: - raise RuntimeError( - "Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings." - ) - coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) - lr = self.end_lr + coeff * (self.peak_lr - self.end_lr) - - self.lr = lr = self.end_lr = 0.002 ###锁定用线性###不听话,直接锁定! - self.set_lr(lr) - self.lr = lr - self._current_step += 1 - return self.lr - - -if __name__ == "__main__": - m = nn.Linear(10, 10) - opt = Adam(m.parameters(), lr=1e-4) - s = WarmupCosineLRSchedule( - opt, 1e-6, 2e-4, 1e-6, warmup_steps=2000, total_steps=20000, current_step=0 - ) - lrs = [] - for i in range(25000): - s.step() - lrs.append(s.lr) - print(s.lr) - - plt.plot(lrs) - plt.plot(range(0, 25000), lrs) - plt.show() diff --git a/AR/modules/optim.py b/AR/modules/optim.py deleted file mode 100644 index 98785f05b2e3aa123ae22955d677b9f68efec256..0000000000000000000000000000000000000000 --- a/AR/modules/optim.py +++ /dev/null @@ -1,622 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey) -# -# See ../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import contextlib -import logging -from collections import defaultdict -from typing import List -from typing import Tuple - -import torch -from torch import Tensor -from torch.optim import Optimizer - - -class BatchedOptimizer(Optimizer): - """ - This class adds to class Optimizer the capability to optimize parameters in batches: - it will stack the parameters and their grads for you so the optimizer can work - on tensors with an extra leading dimension. This is intended for speed with GPUs, - as it reduces the number of kernels launched in the optimizer. 
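Concretely, `batched_params` below groups parameters by the key `(str(dtype), *shape)`, so every tensor that shares a dtype and shape is stacked into a single batched parameter. A small illustration of the grouping (the model here is only an example):

import torch
from collections import defaultdict
from torch import nn

model = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 8))
groups = defaultdict(list)
for name, p in model.named_parameters():
    groups[(str(p.dtype), *p.shape)].append(name)
print(dict(groups))
# {('torch.float32', 8, 8): ['0.weight', '1.weight'],
#  ('torch.float32', 8): ['0.bias', '1.bias']}

The two weight matrices are then updated as one (2, 8, 8) tensor and the two biases as one (2, 8) tensor, which is exactly what cuts down the number of optimizer kernel launches.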
- - Args: - params: - """ - - def __init__(self, params, defaults): - super(BatchedOptimizer, self).__init__(params, defaults) - - @contextlib.contextmanager - def batched_params(self, param_group, group_params_names): - """ - This function returns (technically, yields) a list of - of tuples (p, state), where - p is a `fake` parameter that is stacked (over axis 0) from real parameters - that share the same shape, and its gradient is also stacked; - `state` is the state corresponding to this batch of parameters - (it will be physically located in the "state" for one of the real - parameters, the last one that has any particular shape and dtype). - - This function is decorated as a context manager so that it can - write parameters back to their "real" locations. - - The idea is, instead of doing: - - for p in group["params"]: - state = self.state[p] - ... - - you can do: - - with self.batched_params(group["params"]) as batches: - for p, state, p_names in batches: - ... - - - Args: - group: a parameter group, which is a list of parameters; should be - one of self.param_groups. - group_params_names: name for each parameter in group, - which is List[str]. - """ - batches = defaultdict( - list - ) # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter - batches_names = defaultdict( - list - ) # `batches` maps from tuple (dtype_as_str,*shape) to list of str - - assert len(param_group) == len(group_params_names) - for p, named_p in zip(param_group, group_params_names): - key = (str(p.dtype), *p.shape) - batches[key].append(p) - batches_names[key].append(named_p) - - batches_names_keys = list(batches_names.keys()) - sorted_idx = sorted( - range(len(batches_names)), key=lambda i: batches_names_keys[i]) - batches_names = [ - batches_names[batches_names_keys[idx]] for idx in sorted_idx - ] - batches = [batches[batches_names_keys[idx]] for idx in sorted_idx] - - stacked_params_dict = dict() - - # turn batches into a list, in deterministic order. - # tuples will contain tuples of (stacked_param, state, stacked_params_names), - # one for each batch in `batches`. - tuples = [] - - for batch, batch_names in zip(batches, batches_names): - p = batch[0] - # we arbitrarily store the state in the - # state corresponding to the 1st parameter in the - # group. class Optimizer will take care of saving/loading state. - state = self.state[p] - p_stacked = torch.stack(batch) - grad = torch.stack([ - torch.zeros_like(p) if p.grad is None else p.grad for p in batch - ]) - p_stacked.grad = grad - stacked_params_dict[key] = p_stacked - tuples.append((p_stacked, state, batch_names)) - - yield tuples # <-- calling code will do the actual optimization here! - - for ((stacked_params, _state, _names), batch) in zip(tuples, batches): - for i, p in enumerate(batch): # batch is list of Parameter - p.copy_(stacked_params[i]) - - -class ScaledAdam(BatchedOptimizer): - """ - Implements 'Scaled Adam', a variant of Adam where we scale each parameter's update - proportional to the norm of that parameter; and also learn the scale of the parameter, - in log space, subject to upper and lower limits (as if we had factored each parameter as - param = underlying_param * log_scale.exp()) - - - Args: - params: The parameters or param_groups to optimize (like other Optimizer subclasses) - lr: The learning rate. We will typically use a learning rate schedule that starts - at 0.03 and decreases over time, i.e. much higher than other common - optimizers. - clipping_scale: (e.g. 
2.0) - A scale for gradient-clipping: if specified, the normalized gradients - over the whole model will be clipped to have 2-norm equal to - `clipping_scale` times the median 2-norm over the most recent period - of `clipping_update_period` minibatches. By "normalized gradients", - we mean after multiplying by the rms parameter value for this tensor - [for non-scalars]; this is appropriate because our update is scaled - by this quantity. - betas: beta1,beta2 are momentum constants for regular momentum, and moving sum-sq grad. - Must satisfy 0 < beta <= beta2 < 1. - scalar_lr_scale: A scaling factor on the learning rate, that we use to update the - scale of each parameter tensor and scalar parameters of the mode.. - If each parameter were decomposed - as p * p_scale.exp(), where (p**2).mean().sqrt() == 1.0, scalar_lr_scale - would be a the scaling factor on the learning rate of p_scale. - eps: A general-purpose epsilon to prevent division by zero - param_min_rms: Minimum root-mean-square value of parameter tensor, for purposes of - learning the scale on the parameters (we'll constrain the rms of each non-scalar - parameter tensor to be >= this value) - param_max_rms: Maximum root-mean-square value of parameter tensor, for purposes of - learning the scale on the parameters (we'll constrain the rms of each non-scalar - parameter tensor to be <= this value) - scalar_max: Maximum absolute value for scalar parameters (applicable if your - model has any parameters with numel() == 1). - size_update_period: The periodicity, in steps, with which we update the size (scale) - of the parameter tensor. This is provided to save a little time - in the update. - clipping_update_period: if clipping_scale is specified, this is the period - """ - - def __init__( - self, - params, - lr=3e-02, - clipping_scale=None, - betas=(0.9, 0.98), - scalar_lr_scale=0.1, - eps=1.0e-08, - param_min_rms=1.0e-05, - param_max_rms=3.0, - scalar_max=10.0, - size_update_period=4, - clipping_update_period=100, - parameters_names=None, - show_dominant_parameters=True, ): - - assert parameters_names is not None, ( - "Please prepare parameters_names," - "which is a List[List[str]]. Each List[str] is for a group" - "and each str is for a parameter") - defaults = dict( - lr=lr, - clipping_scale=clipping_scale, - betas=betas, - scalar_lr_scale=scalar_lr_scale, - eps=eps, - param_min_rms=param_min_rms, - param_max_rms=param_max_rms, - scalar_max=scalar_max, - size_update_period=size_update_period, - clipping_update_period=clipping_update_period, ) - - super(ScaledAdam, self).__init__(params, defaults) - assert len(self.param_groups) == len(parameters_names) - self.parameters_names = parameters_names - self.show_dominant_parameters = show_dominant_parameters - - def __setstate__(self, state): - super(ScaledAdam, self).__setstate__(state) - - @torch.no_grad() - def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - with torch.enable_grad(): - loss = closure() - - batch = True - - for group, group_params_names in zip(self.param_groups, - self.parameters_names): - - with self.batched_params(group["params"], - group_params_names) as batches: - - # batches is list of pairs (stacked_param, state). stacked_param is like - # a regular parameter, and will have a .grad, but the 1st dim corresponds to - # a stacking dim, it is not a real dim. 
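Stripped of the batching, the gradient clipping and the learned log-scale, the per-tensor update that `_step` performs further down is Adam-style normalisation whose step size is multiplied by the parameter's own RMS, which is the "scaled" part described in the class docstring. A simplified single-tensor sketch (bias correction and the periodic `param_rms` refresh are omitted):

import torch

def scaled_adam_step_sketch(p, grad, state, lr=0.03, betas=(0.9, 0.98),
                            eps=1e-8, param_min_rms=1e-5):
    beta1, beta2 = betas
    # second moment, as in Adam
    state["exp_avg_sq"].mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
    normalised = grad / (state["exp_avg_sq"].sqrt() + eps)
    # the step is proportional to the parameter's root-mean-square value
    param_rms = (p ** 2).mean().sqrt().clamp(min=param_min_rms)
    state["delta"].mul_(beta1).add_(normalised * (-lr) * (1 - beta1) * param_rms)
    p.add_(state["delta"])

p = torch.randn(16, 16)
state = {"exp_avg_sq": torch.zeros_like(p), "delta": torch.zeros_like(p)}
scaled_adam_step_sketch(p, torch.randn(16, 16), state)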
- - if (len(batches[0][1]) == - 0): # if len(first state) == 0: not yet initialized - clipping_scale = 1 - else: - clipping_scale = self._get_clipping_scale(group, batches) - - for p, state, _ in batches: - # Perform optimization step. - # grad is not going to be None, we handled that when creating the batches. - grad = p.grad - if grad.is_sparse: - raise RuntimeError( - "ScaledAdam optimizer does not support sparse gradients" - ) - # State initialization - if len(state) == 0: - self._init_state(group, p, state) - - self._step_one_batch(group, p, state, clipping_scale) - - return loss - - def _init_state(self, group: dict, p: Tensor, state: dict): - """ - Initializes state dict for parameter 'p'. Assumes that dim 0 of tensor p - is actually the batch dimension, corresponding to batched-together - parameters of a given shape. - - - Args: - group: Dict to look up configuration values. - p: The parameter that we are initializing the state for - state: Dict from string to whatever state we are initializing - """ - size_update_period = group["size_update_period"] - - state["step"] = 0 - - kwargs = {"device": p.device, "dtype": p.dtype} - - # 'delta' implements conventional momentum. There are - # several different kinds of update going on, so rather than - # compute "exp_avg" like in Adam, we store and decay a - # parameter-change "delta", which combines all forms of - # update. this is equivalent to how it's done in Adam, - # except for the first few steps. - state["delta"] = torch.zeros_like( - p, memory_format=torch.preserve_format) - - batch_size = p.shape[0] - numel = p.numel() // batch_size - numel = p.numel() - - if numel > 1: - # "param_rms" just periodically records the scalar root-mean-square value of - # the parameter tensor. - # it has a shape like (batch_size, 1, 1, 1, 1) - param_rms = ( - (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()) - state["param_rms"] = param_rms - - state["scale_exp_avg_sq"] = torch.zeros_like(param_rms) - state["scale_grads"] = torch.zeros(size_update_period, - *param_rms.shape, **kwargs) - - # exp_avg_sq is the weighted sum of scaled gradients. as in Adam. - state["exp_avg_sq"] = torch.zeros_like( - p, memory_format=torch.preserve_format) - - def _get_clipping_scale(self, - group: dict, - tuples: List[Tuple[Tensor, dict, List[str]]] - ) -> float: - """ - Returns a scalar factor <= 1.0 that dictates gradient clipping, i.e. we will scale the gradients - by this amount before applying the rest of the update. - - Args: - group: the parameter group, an item in self.param_groups - tuples: a list of tuples of (param, state, param_names) - where param is a batched set of parameters, - with a .grad (1st dim is batch dim) - and state is the state-dict where optimization parameters are kept. - param_names is a List[str] while each str is name for a parameter - in batched set of parameters "param". - """ - assert len(tuples) >= 1 - clipping_scale = group["clipping_scale"] - (first_p, first_state, _) = tuples[0] - step = first_state["step"] - if clipping_scale is None or step == 0: - # no clipping. return early on step == 0 because the other - # parameters' state won't have been initialized yet. 
- return 1.0 - clipping_update_period = group["clipping_update_period"] - - tot_sumsq = torch.tensor(0.0, device=first_p.device) - for (p, state, param_names) in tuples: - grad = p.grad - if grad.is_sparse: - raise RuntimeError( - "ScaledAdam optimizer does not support sparse gradients") - if p.numel() == p.shape[0]: # a batch of scalars - tot_sumsq += (grad**2).sum() # sum() to change shape [1] to [] - else: - tot_sumsq += ((grad * state["param_rms"])**2).sum() - - tot_norm = tot_sumsq.sqrt() - if "model_norms" not in first_state: - first_state["model_norms"] = torch.zeros( - clipping_update_period, device=p.device) - first_state["model_norms"][step % clipping_update_period] = tot_norm - - if step % clipping_update_period == 0: - # Print some stats. - # We don't reach here if step == 0 because we would have returned - # above. - sorted_norms = first_state["model_norms"].sort()[0].to("cpu") - quartiles = [] - for n in range(0, 5): - index = min( - clipping_update_period - 1, - (clipping_update_period // 4) * n, ) - quartiles.append(sorted_norms[index].item()) - - median = quartiles[2] - threshold = clipping_scale * median - first_state["model_norm_threshold"] = threshold - percent_clipped = (first_state["num_clipped"] * 100.0 / - clipping_update_period - if "num_clipped" in first_state else 0.0) - first_state["num_clipped"] = 0 - quartiles = " ".join(["%.3e" % x for x in quartiles]) - logging.info( - f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, " - f"threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}" - ) - - if step < clipping_update_period: - return 1.0 # We have not yet estimated a norm to clip to. - else: - try: - model_norm_threshold = first_state["model_norm_threshold"] - except KeyError: - logging.info( - "Warning: model_norm_threshold not in state: possibly " - "you changed config when restarting, adding clipping_scale option?" - ) - return 1.0 - ans = min(1.0, (model_norm_threshold / (tot_norm + 1.0e-20)).item()) - if ans < 1.0: - first_state["num_clipped"] += 1 - if ans < 0.1: - logging.warn( - f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}" - ) - if self.show_dominant_parameters: - assert p.shape[0] == len(param_names) - self._show_gradient_dominating_parameter(tuples, tot_sumsq) - return ans - - def _show_gradient_dominating_parameter( - self, tuples: List[Tuple[Tensor, dict, List[str]]], - tot_sumsq: Tensor): - """ - Show information of parameter wihch dominanting tot_sumsq. - - Args: - tuples: a list of tuples of (param, state, param_names) - where param is a batched set of parameters, - with a .grad (1st dim is batch dim) - and state is the state-dict where optimization parameters are kept. - param_names is a List[str] while each str is name for a parameter - in batched set of parameters "param". - tot_sumsq: sumsq of all parameters. Though it's could be calculated - from tuples, we still pass it to save some time. - """ - all_sumsq_orig = {} - for (p, state, batch_param_names) in tuples: - # p is a stacked batch parameters. - batch_grad = p.grad - if p.numel() == p.shape[0]: # a batch of scalars - batch_sumsq_orig = batch_grad**2 - # Dummpy values used by following `zip` statement. 
- batch_rms_orig = torch.ones(p.shape[0]) - else: - batch_rms_orig = state["param_rms"] - batch_sumsq_orig = ((batch_grad * batch_rms_orig)**2).sum( - dim=list(range(1, batch_grad.ndim))) - - for name, sumsq_orig, rms, grad in zip(batch_param_names, - batch_sumsq_orig, - batch_rms_orig, batch_grad): - - proportion_orig = sumsq_orig / tot_sumsq - all_sumsq_orig[name] = (proportion_orig, sumsq_orig, rms, grad) - - assert torch.isclose( - sum([value[0] for value in all_sumsq_orig.values()]).cpu(), - torch.tensor(1.0), ) - sorted_by_proportion = { - k: v - for k, v in sorted( - all_sumsq_orig.items(), - key=lambda item: item[1][0], - reverse=True, ) - } - dominant_param_name = next(iter(sorted_by_proportion)) - (dominant_proportion, dominant_sumsq, dominant_rms, - dominant_grad, ) = sorted_by_proportion[dominant_param_name] - logging.info(f"Parameter Dominanting tot_sumsq {dominant_param_name}" - f" with proportion {dominant_proportion:.2f}," - f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)" - f"={dominant_sumsq:.3e}," - f" grad_sumsq = {(dominant_grad**2).sum():.3e}," - f" orig_rms_sq={(dominant_rms**2).item():.3e}") - - def _step_one_batch(self, - group: dict, - p: Tensor, - state: dict, - clipping_scale: float): - """ - Do the step for one parameter, which is actually going to be a batch of - `real` parameters, with dim 0 as the batch dim. - Args: - group: dict to look up configuration values - p: parameter to update (actually multiple parameters stacked together - as a batch) - state: state-dict for p, to look up the optimizer state - """ - lr = group["lr"] - size_update_period = group["size_update_period"] - beta1 = group["betas"][0] - - grad = p.grad - if clipping_scale != 1.0: - grad = grad * clipping_scale - step = state["step"] - delta = state["delta"] - - delta.mul_(beta1) - batch_size = p.shape[0] - numel = p.numel() // batch_size - if numel > 1: - # Update the size/scale of p, and set param_rms - scale_grads = state["scale_grads"] - scale_grads[step % size_update_period] = (p * grad).sum( - dim=list(range(1, p.ndim)), keepdim=True) - if step % size_update_period == size_update_period - 1: - param_rms = state["param_rms"] # shape: (batch_size, 1, 1, ..) - param_rms.copy_((p**2) - .mean(dim=list(range(1, p.ndim)), keepdim=True) - .sqrt()) - if step > 0: - # self._size_update() learns the overall scale on the - # parameter, by shrinking or expanding it. - self._size_update(group, scale_grads, p, state) - - if numel == 1: - # For parameters with 1 element we just use regular Adam. - # Updates delta. - self._step_scalar(group, p, state) - else: - self._step(group, p, state) - - state["step"] = step + 1 - - def _size_update(self, - group: dict, - scale_grads: Tensor, - p: Tensor, - state: dict) -> None: - """ - Called only where p.numel() > 1, this updates the scale of the parameter. - If we imagine: p = underlying_param * scale.exp(), and we are doing - gradient descent on underlying param and on scale, this function does the update - on `scale`. - - Args: - group: dict to look up configuration values - scale_grads: a tensor of shape (size_update_period, batch_size, 1, 1,...) containing - grads w.r.t. the scales. 
- p: The parameter to update - state: The state-dict of p - """ - - param_rms = state["param_rms"] - beta1, beta2 = group["betas"] - size_lr = group["lr"] * group["scalar_lr_scale"] - param_min_rms = group["param_min_rms"] - param_max_rms = group["param_max_rms"] - eps = group["eps"] - step = state["step"] - batch_size = p.shape[0] - - size_update_period = scale_grads.shape[0] - # correct beta2 for the size update period: we will have - # faster decay at this level. - beta2_corr = beta2**size_update_period - - scale_exp_avg_sq = state[ - "scale_exp_avg_sq"] # shape: (batch_size, 1, 1, ..) - scale_exp_avg_sq.mul_(beta2_corr).add_( - (scale_grads**2).mean(dim=0), # mean over dim `size_update_period` - alpha=1 - beta2_corr, ) # shape is (batch_size, 1, 1, ...) - - # The 1st time we reach here is when size_step == 1. - size_step = (step + 1) // size_update_period - bias_correction2 = 1 - beta2_corr**size_step - # we don't bother with bias_correction1; this will help prevent divergence - # at the start of training. - - denom = scale_exp_avg_sq.sqrt() + eps - - scale_step = (-size_lr * (bias_correction2**0.5) * - scale_grads.sum(dim=0) / denom) - - is_too_small = param_rms < param_min_rms - is_too_large = param_rms > param_max_rms - - # when the param gets too small, just don't shrink it any further. - scale_step.masked_fill_(is_too_small, 0.0) - # when it gets too large, stop it from getting any larger. - scale_step.masked_fill_(is_too_large, -size_lr * size_update_period) - delta = state["delta"] - # the factor of (1-beta1) relates to momentum. - delta.add_(p * scale_step, alpha=(1 - beta1)) - - def _step(self, group: dict, p: Tensor, state: dict): - """ - This function does the core update of self.step(), in the case where the members of - the batch have more than 1 element. - - Args: - group: A dict which will be used to look up configuration values - p: The parameter to be updated - grad: The grad of p - state: The state-dict corresponding to parameter p - - This function modifies p. - """ - grad = p.grad - lr = group["lr"] - beta1, beta2 = group["betas"] - eps = group["eps"] - param_min_rms = group["param_min_rms"] - step = state["step"] - - exp_avg_sq = state["exp_avg_sq"] - exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2)) - - this_step = state["step"] - (state["zero_step"] - if "zero_step" in state else 0) - bias_correction2 = 1 - beta2**(this_step + 1) - if bias_correction2 < 0.99: - # note: not in-place. - exp_avg_sq = exp_avg_sq * (1.0 / bias_correction2) - - denom = exp_avg_sq.sqrt() - denom += eps - grad = grad / denom - - alpha = -lr * (1 - beta1) * state["param_rms"].clamp(min=param_min_rms) - - delta = state["delta"] - delta.add_(grad * alpha) - p.add_(delta) - - def _step_scalar(self, group: dict, p: Tensor, state: dict): - """ - A simplified form of the core update for scalar tensors, where we cannot get a good - estimate of the parameter rms. - """ - beta1, beta2 = group["betas"] - scalar_max = group["scalar_max"] - eps = group["eps"] - lr = group["lr"] * group["scalar_lr_scale"] - grad = p.grad - - exp_avg_sq = state["exp_avg_sq"] # shape: (batch_size,) - exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) - - # bias_correction2 is like in Adam. Don't bother with bias_correction1; - # slower update at the start will help stability anyway. 
- bias_correction2 = 1 - beta2**(state["step"] + 1) - denom = (exp_avg_sq / bias_correction2).sqrt() + eps - - delta = state["delta"] - delta.add_(grad / denom, alpha=-lr * (1 - beta1)) - p.clamp_(min=-scalar_max, max=scalar_max) - p.add_(delta) diff --git a/AR/modules/patched_mha_with_cache.py b/AR/modules/patched_mha_with_cache.py deleted file mode 100644 index 7be241dadd378fc9312916f60433ba4b7aa7c764..0000000000000000000000000000000000000000 --- a/AR/modules/patched_mha_with_cache.py +++ /dev/null @@ -1,465 +0,0 @@ -from torch.nn.functional import * -from torch.nn.functional import ( - _mha_shape_check, - _canonical_mask, - _none_or_dtype, - _in_projection_packed, -) -from torch.nn import functional as F -import torch -# Tensor = torch.Tensor -# from typing import Callable, List, Optional, Tuple, Union - - -def multi_head_attention_forward_patched( - query: Tensor, - key: Tensor, - value: Tensor, - embed_dim_to_check: int, - num_heads: int, - in_proj_weight: Optional[Tensor], - in_proj_bias: Optional[Tensor], - bias_k: Optional[Tensor], - bias_v: Optional[Tensor], - add_zero_attn: bool, - dropout_p: float, - out_proj_weight: Tensor, - out_proj_bias: Optional[Tensor], - training: bool = True, - key_padding_mask: Optional[Tensor] = None, - need_weights: bool = True, - attn_mask: Optional[Tensor] = None, - use_separate_proj_weight: bool = False, - q_proj_weight: Optional[Tensor] = None, - k_proj_weight: Optional[Tensor] = None, - v_proj_weight: Optional[Tensor] = None, - static_k: Optional[Tensor] = None, - static_v: Optional[Tensor] = None, - average_attn_weights: bool = True, - is_causal: bool = False, - cache=None, -) -> Tuple[Tensor, Optional[Tensor]]: - r""" - Args: - query, key, value: map a query and a set of key-value pairs to an output. - See "Attention Is All You Need" for more details. - embed_dim_to_check: total dimension of the model. - num_heads: parallel attention heads. - in_proj_weight, in_proj_bias: input projection weight and bias. - bias_k, bias_v: bias of the key and value sequences to be added at dim=0. - add_zero_attn: add a new batch of zeros to the key and - value sequences at dim=1. - dropout_p: probability of an element to be zeroed. - out_proj_weight, out_proj_bias: the output projection weight and bias. - training: apply dropout if is ``True``. - key_padding_mask: if provided, specified padding elements in the key will - be ignored by the attention. This is an binary mask. When the value is True, - the corresponding value on the attention layer will be filled with -inf. - need_weights: output attn_output_weights. - Default: `True` - Note: `needs_weight` defaults to `True`, but should be set to `False` - For best performance when attention weights are not nedeeded. - *Setting needs_weights to `True` - leads to a significant performance degradation.* - attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all - the batches while a 3D mask allows to specify a different mask for the entries of each batch. - is_causal: If specified, applies a causal mask as attention mask, and ignores - attn_mask for computing scaled dot product attention. - Default: ``False``. - .. warning:: - is_causal is provides a hint that the attn_mask is the - causal mask.Providing incorrect hints can result in - incorrect execution, including forward and backward - compatibility. - use_separate_proj_weight: the function accept the proj. weights for query, key, - and value in different forms. 
If false, in_proj_weight will be used, which is - a combination of q_proj_weight, k_proj_weight, v_proj_weight. - q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias. - static_k, static_v: static key and value used for attention operators. - average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across heads. - Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an effect - when ``need_weights=True.``. Default: True - - - Shape: - Inputs: - - query: :math:`(L, E)` or :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is - the embedding dimension. - - key: :math:`(S, E)` or :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is - the embedding dimension. - - value: :math:`(S, E)` or :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is - the embedding dimension. - - key_padding_mask: :math:`(S)` or :math:`(N, S)` where N is the batch size, S is the source sequence length. - If a FloatTensor is provided, it will be directly added to the value. - If a BoolTensor is provided, the positions with the - value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. - - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length. - 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length, - S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked - positions. If a BoolTensor is provided, positions with ``True`` - are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor - is provided, it will be added to the attention weight. - - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length, - N is the batch size, E is the embedding dimension. E/num_heads is the head dimension. - - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length, - N is the batch size, E is the embedding dimension. E/num_heads is the head dimension. - - Outputs: - - attn_output: :math:`(L, E)` or :math:`(L, N, E)` where L is the target sequence length, N is the batch size, - E is the embedding dimension. - - attn_output_weights: Only returned when ``need_weights=True``. If ``average_attn_weights=True``, returns - attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or - :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and - :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per - head of shape :math:`(num_heads, L, S)` when input is unbatched or :math:`(N, num_heads, L, S)`. 
- """ - tens_ops = ( - query, - key, - value, - in_proj_weight, - in_proj_bias, - bias_k, - bias_v, - out_proj_weight, - out_proj_bias, - ) - if has_torch_function(tens_ops): - return handle_torch_function( - multi_head_attention_forward, - tens_ops, - query, - key, - value, - embed_dim_to_check, - num_heads, - in_proj_weight, - in_proj_bias, - bias_k, - bias_v, - add_zero_attn, - dropout_p, - out_proj_weight, - out_proj_bias, - training=training, - key_padding_mask=key_padding_mask, - need_weights=need_weights, - attn_mask=attn_mask, - is_causal=is_causal, - use_separate_proj_weight=use_separate_proj_weight, - q_proj_weight=q_proj_weight, - k_proj_weight=k_proj_weight, - v_proj_weight=v_proj_weight, - static_k=static_k, - static_v=static_v, - average_attn_weights=average_attn_weights, - cache=cache, - ) - - is_batched = _mha_shape_check( - query, key, value, key_padding_mask, attn_mask, num_heads - ) - - # For unbatched input, we unsqueeze at the expected batch-dim to pretend that the input - # is batched, run the computation and before returning squeeze the - # batch dimension so that the output doesn't carry this temporary batch dimension. - if not is_batched: - # unsqueeze if the input is unbatched - query = query.unsqueeze(1) - key = key.unsqueeze(1) - value = value.unsqueeze(1) - if key_padding_mask is not None: - key_padding_mask = key_padding_mask.unsqueeze(0) - - # set up shape vars - tgt_len, bsz, embed_dim = query.shape - src_len, _, _ = key.shape - - key_padding_mask = _canonical_mask( - mask=key_padding_mask, - mask_name="key_padding_mask", - other_type=_none_or_dtype(attn_mask), - other_name="attn_mask", - target_type=query.dtype, - ) - - if is_causal and attn_mask is None: - raise RuntimeError( - "Need attn_mask if specifying the is_causal hint. " - "You may use the Transformer module method " - "`generate_square_subsequent_mask` to create this mask." - ) - - if is_causal and key_padding_mask is None and not need_weights: - # when we have a kpm or need weights, we need attn_mask - # Otherwise, we use the is_causal hint go as is_causal - # indicator to SDPA. - attn_mask = None - else: - attn_mask = _canonical_mask( - mask=attn_mask, - mask_name="attn_mask", - other_type=None, - other_name="", - target_type=query.dtype, - check_other=False, - ) - - if key_padding_mask is not None: - # We have the attn_mask, and use that to merge kpm into it. - # Turn off use of is_causal hint, as the merged mask is no - # longer causal. 
- is_causal = False - - assert ( - embed_dim == embed_dim_to_check - ), f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}" - if isinstance(embed_dim, torch.Tensor): - # embed_dim can be a tensor when JIT tracing - head_dim = embed_dim.div(num_heads, rounding_mode="trunc") - else: - head_dim = embed_dim // num_heads - assert ( - head_dim * num_heads == embed_dim - ), f"embed_dim {embed_dim} not divisible by num_heads {num_heads}" - if use_separate_proj_weight: - # allow MHA to have different embedding dimensions when separate projection weights are used - assert ( - key.shape[:2] == value.shape[:2] - ), f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}" - else: - assert ( - key.shape == value.shape - ), f"key shape {key.shape} does not match value shape {value.shape}" - - # - # compute in-projection - # - if not use_separate_proj_weight: - assert ( - in_proj_weight is not None - ), "use_separate_proj_weight is False but in_proj_weight is None" - q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias) - else: - assert ( - q_proj_weight is not None - ), "use_separate_proj_weight is True but q_proj_weight is None" - assert ( - k_proj_weight is not None - ), "use_separate_proj_weight is True but k_proj_weight is None" - assert ( - v_proj_weight is not None - ), "use_separate_proj_weight is True but v_proj_weight is None" - if in_proj_bias is None: - b_q = b_k = b_v = None - else: - b_q, b_k, b_v = in_proj_bias.chunk(3) - q, k, v = _in_projection( - query, - key, - value, - q_proj_weight, - k_proj_weight, - v_proj_weight, - b_q, - b_k, - b_v, - ) - if cache != None: - if cache["first_infer"] == 1: - cache["k"][cache["stage"]] = k - # print(0,cache["k"].shape) - cache["v"][cache["stage"]] = v - else: ###12个layer每个都要留自己的cache_kv - # print(1,cache["k"].shape) - cache["k"][cache["stage"]] = torch.cat( - [cache["k"][cache["stage"]], k], 0 - ) ##本来时序是1,但是proj的时候可能transpose了所以时序到0维了 - cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]], v], 0) - # print(2, cache["k"].shape) - src_len = cache["k"][cache["stage"]].shape[0] - k = cache["k"][cache["stage"]] - v = cache["v"][cache["stage"]] - # if attn_mask is not None: - # attn_mask=attn_mask[-1:,] - # print(attn_mask.shape,attn_mask) - cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] - # print(2333,cache) - # prep attention mask - - attn_mask = _canonical_mask( - mask=attn_mask, - mask_name="attn_mask", - other_type=None, - other_name="", - target_type=q.dtype, - check_other=False, - ) - - if attn_mask is not None: - # ensure attn_mask's dim is 3 - if attn_mask.dim() == 2: - correct_2d_size = (tgt_len, src_len) - if attn_mask.shape != correct_2d_size: - raise RuntimeError( - f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}." - ) - attn_mask = attn_mask.unsqueeze(0) - elif attn_mask.dim() == 3: - correct_3d_size = (bsz * num_heads, tgt_len, src_len) - if attn_mask.shape != correct_3d_size: - raise RuntimeError( - f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}." - ) - else: - raise RuntimeError( - f"attn_mask's dimension {attn_mask.dim()} is not supported" - ) - - # add bias along batch dimension (currently second) - if bias_k is not None and bias_v is not None: - assert static_k is None, "bias cannot be added to static key." - assert static_v is None, "bias cannot be added to static value." 
- k = torch.cat([k, bias_k.repeat(1, bsz, 1)]) - v = torch.cat([v, bias_v.repeat(1, bsz, 1)]) - if attn_mask is not None: - attn_mask = pad(attn_mask, (0, 1)) - if key_padding_mask is not None: - key_padding_mask = pad(key_padding_mask, (0, 1)) - else: - assert bias_k is None - assert bias_v is None - - # - # reshape q, k, v for multihead attention and make em batch first - # - q = q.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1) - if static_k is None: - k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1) - else: - # TODO finish disentangling control flow so we don't do in-projections when statics are passed - assert ( - static_k.size(0) == bsz * num_heads - ), f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}" - assert ( - static_k.size(2) == head_dim - ), f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}" - k = static_k - if static_v is None: - v = v.view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1) - else: - # TODO finish disentangling control flow so we don't do in-projections when statics are passed - assert ( - static_v.size(0) == bsz * num_heads - ), f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}" - assert ( - static_v.size(2) == head_dim - ), f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}" - v = static_v - - # add zero attention along batch dimension (now first) - if add_zero_attn: - zero_attn_shape = (bsz * num_heads, 1, head_dim) - k = torch.cat( - [k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1 - ) - v = torch.cat( - [v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1 - ) - if attn_mask is not None: - attn_mask = pad(attn_mask, (0, 1)) - if key_padding_mask is not None: - key_padding_mask = pad(key_padding_mask, (0, 1)) - - # update source sequence length after adjustments - src_len = k.size(1) - - # merge key padding and attention masks - if key_padding_mask is not None: - assert key_padding_mask.shape == ( - bsz, - src_len, - ), f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}" - key_padding_mask = ( - key_padding_mask.view(bsz, 1, 1, src_len) - .expand(-1, num_heads, -1, -1) - .reshape(bsz * num_heads, 1, src_len) - ) - if attn_mask is None: - attn_mask = key_padding_mask - else: - attn_mask = attn_mask + key_padding_mask - - # adjust dropout probability - if not training: - dropout_p = 0.0 - - # - # (deep breath) calculate attention and out projection - # - - if need_weights: - B, Nt, E = q.shape - q_scaled = q / math.sqrt(E) - - assert not ( - is_causal and attn_mask is None - ), "FIXME: is_causal not implemented for need_weights" - - if attn_mask is not None: - attn_output_weights = torch.baddbmm( - attn_mask, q_scaled, k.transpose(-2, -1) - ) - else: - attn_output_weights = torch.bmm(q_scaled, k.transpose(-2, -1)) - attn_output_weights = softmax(attn_output_weights, dim=-1) - if dropout_p > 0.0: - attn_output_weights = dropout(attn_output_weights, p=dropout_p) - - attn_output = torch.bmm(attn_output_weights, v) - - attn_output = ( - attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim) - ) - attn_output = linear(attn_output, out_proj_weight, out_proj_bias) - attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1)) - - # optionally average attention weights over heads - attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len) - if average_attn_weights: - attn_output_weights = 
attn_output_weights.mean(dim=1) - - if not is_batched: - # squeeze the output if input was unbatched - attn_output = attn_output.squeeze(1) - attn_output_weights = attn_output_weights.squeeze(0) - return attn_output, attn_output_weights - else: - # attn_mask can be either (L,S) or (N*num_heads, L, S) - # if attn_mask's shape is (1, L, S) we need to unsqueeze to (1, 1, L, S) - # in order to match the input for SDPA of (N, num_heads, L, S) - if attn_mask is not None: - if attn_mask.size(0) == 1 and attn_mask.dim() == 3: - attn_mask = attn_mask.unsqueeze(0) - else: - attn_mask = attn_mask.view(bsz, num_heads, -1, src_len) - - q = q.view(bsz, num_heads, tgt_len, head_dim) - k = k.view(bsz, num_heads, src_len, head_dim) - v = v.view(bsz, num_heads, src_len, head_dim) - - # with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): - attn_output = scaled_dot_product_attention( - q, k, v, attn_mask, dropout_p, is_causal - ) - - attn_output = ( - attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim) - ) - - attn_output = linear(attn_output, out_proj_weight, out_proj_bias) - attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1)) - if not is_batched: - # squeeze the output if input was unbatched - attn_output = attn_output.squeeze(1) - return attn_output, None diff --git a/AR/modules/patched_mha_with_cache_onnx.py b/AR/modules/patched_mha_with_cache_onnx.py deleted file mode 100644 index 14bdb550a09a2f1dac610ea653848689e6443b4d..0000000000000000000000000000000000000000 --- a/AR/modules/patched_mha_with_cache_onnx.py +++ /dev/null @@ -1,92 +0,0 @@ -from torch.nn.functional import * -from torch.nn.functional import ( - _mha_shape_check, - _canonical_mask, - _none_or_dtype, - _in_projection_packed, -) - -def multi_head_attention_forward_patched( - query, - key, - value, - embed_dim_to_check: int, - num_heads: int, - in_proj_weight, - in_proj_bias: Optional[Tensor], - bias_k: Optional[Tensor], - bias_v: Optional[Tensor], - add_zero_attn: bool, - dropout_p: float, - out_proj_weight: Tensor, - out_proj_bias: Optional[Tensor], - training: bool = True, - key_padding_mask: Optional[Tensor] = None, - need_weights: bool = True, - attn_mask: Optional[Tensor] = None, - use_separate_proj_weight: bool = False, - q_proj_weight: Optional[Tensor] = None, - k_proj_weight: Optional[Tensor] = None, - v_proj_weight: Optional[Tensor] = None, - static_k: Optional[Tensor] = None, - static_v: Optional[Tensor] = None, - average_attn_weights: bool = True, - is_causal: bool = False, - cache=None, -) -> Tuple[Tensor, Optional[Tensor]]: - - # set up shape vars - _, _, embed_dim = query.shape - attn_mask = _canonical_mask( - mask=attn_mask, - mask_name="attn_mask", - other_type=None, - other_name="", - target_type=query.dtype, - check_other=False, - ) - head_dim = embed_dim // num_heads - - proj_qkv = linear(query, in_proj_weight, in_proj_bias) - proj_qkv = proj_qkv.unflatten(-1, (3, query.size(-1))).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous() - q, k, v = proj_qkv[0], proj_qkv[1], proj_qkv[2] - - if cache["first_infer"] == 1: - cache["k"][cache["stage"]] = k - cache["v"][cache["stage"]] = v - else: - cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0) - cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0) - k = cache["k"][cache["stage"]] - v = cache["v"][cache["stage"]] - cache["stage"] = (cache["stage"] + 1) % cache["all_stage"] - - attn_mask = _canonical_mask( - mask=attn_mask, - 
mask_name="attn_mask", - other_type=None, - other_name="", - target_type=q.dtype, - check_other=False, - ) - attn_mask = attn_mask.unsqueeze(0) - - q = q.view(-1, num_heads, head_dim).transpose(0, 1) - k = k.view(-1, num_heads, head_dim).transpose(0, 1) - v = v.view(-1, num_heads, head_dim).transpose(0, 1) - - dropout_p = 0.0 - attn_mask = attn_mask.unsqueeze(0) - q = q.view(num_heads, -1, head_dim).unsqueeze(0) - k = k.view(num_heads, -1, head_dim).unsqueeze(0) - v = v.view(num_heads, -1, head_dim).unsqueeze(0) - attn_output = scaled_dot_product_attention( - q, k, v, attn_mask, dropout_p, is_causal - ) - attn_output = ( - attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim) - ) - attn_output = linear(attn_output, out_proj_weight, out_proj_bias) - attn_output = attn_output.view(-1, 1, attn_output.size(1)) - - return attn_output diff --git a/AR/modules/scaling.py b/AR/modules/scaling.py deleted file mode 100644 index 9256a8cbf342b6a259c48fb8821fed0492c649fd..0000000000000000000000000000000000000000 --- a/AR/modules/scaling.py +++ /dev/null @@ -1,335 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -import math -import random -from typing import Optional -from typing import Tuple -from typing import Union - -import torch -import torch.nn as nn -from torch import Tensor - - -class DoubleSwishFunction(torch.autograd.Function): - """ - double_swish(x) = x * torch.sigmoid(x-1) - This is a definition, originally motivated by its close numerical - similarity to swish(swish(x)), where swish(x) = x * sigmoid(x). - - Memory-efficient derivative computation: - double_swish(x) = x * s, where s(x) = torch.sigmoid(x-1) - double_swish'(x) = d/dx double_swish(x) = x * s'(x) + x' * s(x) = x * s'(x) + s(x). - Now, s'(x) = s(x) * (1-s(x)). - double_swish'(x) = x * s'(x) + s(x). - = x * s(x) * (1-s(x)) + s(x). - = double_swish(x) * (1-s(x)) + s(x) - ... so we just need to remember s(x) but not x itself. - """ - - @staticmethod - def forward(ctx, x: Tensor) -> Tensor: - requires_grad = x.requires_grad - x_dtype = x.dtype - if x.dtype == torch.float16: - x = x.to(torch.float32) - - s = torch.sigmoid(x - 1.0) - y = x * s - - if requires_grad: - deriv = y * (1 - s) + s - # notes on derivative of x * sigmoid(x - 1): - # https://www.wolframalpha.com/input?i=d%2Fdx+%28x+*+sigmoid%28x-1%29%29 - # min \simeq -0.043638. Take floor as -0.043637 so it's a lower bund - # max \simeq 1.1990. Take ceil to be 1.2 so it's an upper bound. - # the combination of "+ torch.rand_like(deriv)" and casting to torch.uint8 (which - # floors), should be expectation-preserving. - floor = -0.043637 - ceil = 1.2 - d_scaled = (deriv - floor) * (255.0 / (ceil - floor)) + torch.rand_like( - deriv - ) - if __name__ == "__main__": - # for self-testing only. 
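The docstring above derives `double_swish'(x) = double_swish(x) * (1 - s(x)) + s(x)` with `s(x) = sigmoid(x - 1)`, and the forward stores that derivative as a `uint8` scaled into `[-0.043637, 1.2]` to save memory. A quick stand-alone check of both the identity and the round-trip error of that quantization (a sketch, separate from the autograd `Function` itself):

```python
import torch

x = torch.linspace(-6.0, 6.0, steps=1001, dtype=torch.float64, requires_grad=True)
s = torch.sigmoid(x - 1.0)
y = x * s                                    # double_swish(x)
deriv_formula = y * (1 - s) + s              # the identity claimed in the docstring

(autograd_deriv,) = torch.autograd.grad(y.sum(), x)
print(torch.allclose(deriv_formula, autograd_deriv))     # True

floor, ceil = -0.043637, 1.2                 # bounds quoted in the comments above
d_scaled = (deriv_formula - floor) * (255.0 / (ceil - floor))
d_int = d_scaled.round().to(torch.uint8)     # the forward adds rand and floors instead
restored = d_int.to(torch.float64) * ((ceil - floor) / 255.0) + floor
print((restored - deriv_formula).abs().max())            # quantization error around 2.4e-3
```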
- assert d_scaled.min() >= 0.0 - assert d_scaled.max() < 256.0 - d_int = d_scaled.to(torch.uint8) - ctx.save_for_backward(d_int) - if x.dtype == torch.float16 or torch.is_autocast_enabled(): - y = y.to(torch.float16) - return y - - @staticmethod - def backward(ctx, y_grad: Tensor) -> Tensor: - (d,) = ctx.saved_tensors - # the same constants as used in forward pass. - floor = -0.043637 - ceil = 1.2 - d = d * ((ceil - floor) / 255.0) + floor - return y_grad * d - - -class DoubleSwish(torch.nn.Module): - def forward(self, x: Tensor) -> Tensor: - """Return double-swish activation function which is an approximation to Swish(Swish(x)), - that we approximate closely with x * sigmoid(x-1). - """ - if torch.jit.is_scripting() or torch.jit.is_tracing(): - return x * torch.sigmoid(x - 1.0) - return DoubleSwishFunction.apply(x) - - -class ActivationBalancerFunction(torch.autograd.Function): - @staticmethod - def forward( - ctx, - x: Tensor, - scale_factor: Tensor, - sign_factor: Optional[Tensor], - channel_dim: int, - ) -> Tensor: - if channel_dim < 0: - channel_dim += x.ndim - ctx.channel_dim = channel_dim - xgt0 = x > 0 - if sign_factor is None: - ctx.save_for_backward(xgt0, scale_factor) - else: - ctx.save_for_backward(xgt0, scale_factor, sign_factor) - return x - - @staticmethod - def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None]: - if len(ctx.saved_tensors) == 3: - xgt0, scale_factor, sign_factor = ctx.saved_tensors - for _ in range(ctx.channel_dim, x_grad.ndim - 1): - scale_factor = scale_factor.unsqueeze(-1) - sign_factor = sign_factor.unsqueeze(-1) - factor = sign_factor + scale_factor * (xgt0.to(x_grad.dtype) - 0.5) - else: - xgt0, scale_factor = ctx.saved_tensors - for _ in range(ctx.channel_dim, x_grad.ndim - 1): - scale_factor = scale_factor.unsqueeze(-1) - factor = scale_factor * (xgt0.to(x_grad.dtype) - 0.5) - neg_delta_grad = x_grad.abs() * factor - return ( - x_grad - neg_delta_grad, - None, - None, - None, - ) - - -def _compute_scale_factor( - x: Tensor, - channel_dim: int, - min_abs: float, - max_abs: float, - gain_factor: float, - max_factor: float, -) -> Tensor: - if channel_dim < 0: - channel_dim += x.ndim - sum_dims = [d for d in range(x.ndim) if d != channel_dim] - x_abs_mean = torch.mean(x.abs(), dim=sum_dims).to(torch.float32) - - if min_abs == 0.0: - below_threshold = 0.0 - else: - # below_threshold is 0 if x_abs_mean > min_abs, can be at most max_factor if - # x_abs)_mean , min_abs. - below_threshold = ((min_abs - x_abs_mean) * (gain_factor / min_abs)).clamp( - min=0, max=max_factor - ) - - above_threshold = ((x_abs_mean - max_abs) * (gain_factor / max_abs)).clamp( - min=0, max=max_factor - ) - - return below_threshold - above_threshold - - -def _compute_sign_factor( - x: Tensor, - channel_dim: int, - min_positive: float, - max_positive: float, - gain_factor: float, - max_factor: float, -) -> Tensor: - if channel_dim < 0: - channel_dim += x.ndim - sum_dims = [d for d in range(x.ndim) if d != channel_dim] - proportion_positive = torch.mean((x > 0).to(torch.float32), dim=sum_dims) - if min_positive == 0.0: - factor1 = 0.0 - else: - # 0 if proportion_positive >= min_positive, else can be - # as large as max_factor. - factor1 = ( - (min_positive - proportion_positive) * (gain_factor / min_positive) - ).clamp_(min=0, max=max_factor) - - if max_positive == 1.0: - factor2 = 0.0 - else: - # 0 if self.proportion_positive <= max_positive, else can be - # as large as -max_factor. 
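`_compute_scale_factor` above yields, per channel, a positive correction when the mean absolute activation drops below `min_abs` and a negative one when it exceeds `max_abs`, both clamped to `max_factor`; the backward pass then uses this to rescale gradients so undersized channels are encouraged to grow and oversized ones to shrink. A small worked example with the class defaults assumed (`min_abs=0.2`, `max_abs=100.0`, `max_factor=0.04`) and `gain_factor=0.02`, i.e. the default `scale_gain_factor` without the `/ prob` rescaling:

```python
import torch

min_abs, max_abs = 0.2, 100.0
gain_factor, max_factor = 0.02, 0.04

# per-channel mean |x|: one channel too small, one in range, one far too large
x_abs_mean = torch.tensor([0.05, 1.0, 400.0])

below = ((min_abs - x_abs_mean) * (gain_factor / min_abs)).clamp(min=0, max=max_factor)
above = ((x_abs_mean - max_abs) * (gain_factor / max_abs)).clamp(min=0, max=max_factor)
print(below - above)   # tensor([ 0.0150,  0.0000, -0.0400])
```

A positive entry increases the gradient pressure that grows that channel's magnitude; a negative entry does the opposite.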
- factor2 = ( - (proportion_positive - max_positive) * (gain_factor / (1.0 - max_positive)) - ).clamp_(min=0, max=max_factor) - sign_factor = factor1 - factor2 - # require min_positive != 0 or max_positive != 1: - assert not isinstance(sign_factor, float) - return sign_factor - - -class ActivationBalancer(torch.nn.Module): - """ - Modifies the backpropped derivatives of a function to try to encourage, for - each channel, that it is positive at least a proportion `threshold` of the - time. It does this by multiplying negative derivative values by up to - (1+max_factor), and positive derivative values by up to (1-max_factor), - interpolated from 1 at the threshold to those extremal values when none - of the inputs are positive. - - Args: - num_channels: the number of channels - channel_dim: the dimension/axis corresponding to the channel, e.g. - -1, 0, 1, 2; will be interpreted as an offset from x.ndim if negative. - min_positive: the minimum, per channel, of the proportion of the time - that (x > 0), below which we start to modify the derivatives. - max_positive: the maximum, per channel, of the proportion of the time - that (x > 0), above which we start to modify the derivatives. - max_factor: the maximum factor by which we modify the derivatives for - either the sign constraint or the magnitude constraint; - e.g. with max_factor=0.02, the the derivatives would be multiplied by - values in the range [0.98..1.02]. - sign_gain_factor: determines the 'gain' with which we increase the - change in gradient once the constraints on min_positive and max_positive - are violated. - scale_gain_factor: determines the 'gain' with which we increase the - change in gradient once the constraints on min_abs and max_abs - are violated. - min_abs: the minimum average-absolute-value difference from the mean - value per channel, which we allow, before we start to modify - the derivatives to prevent this. - max_abs: the maximum average-absolute-value difference from the mean - value per channel, which we allow, before we start to modify - the derivatives to prevent this. - min_prob: determines the minimum probability with which we modify the - gradients for the {min,max}_positive and {min,max}_abs constraints, - on each forward(). This is done randomly to prevent all layers - from doing it at the same time. Early in training we may use - higher probabilities than this; it will decay to this value. - """ - - def __init__( - self, - num_channels: int, - channel_dim: int, - min_positive: float = 0.05, - max_positive: float = 0.95, - max_factor: float = 0.04, - sign_gain_factor: float = 0.01, - scale_gain_factor: float = 0.02, - min_abs: float = 0.2, - max_abs: float = 100.0, - min_prob: float = 0.1, - ): - super(ActivationBalancer, self).__init__() - self.num_channels = num_channels - self.channel_dim = channel_dim - self.min_positive = min_positive - self.max_positive = max_positive - self.max_factor = max_factor - self.min_abs = min_abs - self.max_abs = max_abs - self.min_prob = min_prob - self.sign_gain_factor = sign_gain_factor - self.scale_gain_factor = scale_gain_factor - - # count measures how many times the forward() function has been called. - # We occasionally sync this to a tensor called `count`, that exists to - # make sure it is synced to disk when we load and save the model. 
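A usage sketch for the balancer whose constructor is shown above, assuming it is imported from this repo's `AR.modules.scaling`; the balancer is an identity on forward values and only rescales gradients in the backward pass, so it composes cleanly with `DoubleSwish`:

```python
import torch
import torch.nn as nn
from AR.modules.scaling import ActivationBalancer, DoubleSwish  # module path assumed from this repo

ff = nn.Sequential(
    nn.Linear(16, 32),
    # per output channel (last dim): keep activations positive 5%-95% of the time
    # and keep the mean absolute value between 0.2 and 100.0 (the defaults, shown explicitly)
    ActivationBalancer(32, channel_dim=-1, min_positive=0.05, max_positive=0.95,
                       min_abs=0.2, max_abs=100.0),
    DoubleSwish(),
)

x = torch.randn(8, 16)
out = ff(x)              # forward values pass through the balancer unchanged
out.sum().backward()     # gradients may get nudged per channel, with probability at most 0.5 per call
print(out.shape)         # torch.Size([8, 32])
```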
- self.cpu_count = 0 - self.register_buffer("count", torch.tensor(0, dtype=torch.int64)) - - def forward(self, x: Tensor) -> Tensor: - if torch.jit.is_scripting() or not x.requires_grad or torch.jit.is_tracing(): - return _no_op(x) - - count = self.cpu_count - self.cpu_count += 1 - - if random.random() < 0.01: - # Occasionally sync self.cpu_count with self.count. - # count affects the decay of 'prob'. don't do this on every iter, - # because syncing with the GPU is slow. - self.cpu_count = max(self.cpu_count, self.count.item()) - self.count.fill_(self.cpu_count) - - # the prob of doing some work exponentially decreases from 0.5 till it hits - # a floor at min_prob (==0.1, by default) - prob = max(self.min_prob, 0.5 ** (1 + (count / 4000.0))) - - if random.random() < prob: - sign_gain_factor = 0.5 - if self.min_positive != 0.0 or self.max_positive != 1.0: - sign_factor = _compute_sign_factor( - x, - self.channel_dim, - self.min_positive, - self.max_positive, - gain_factor=self.sign_gain_factor / prob, - max_factor=self.max_factor, - ) - else: - sign_factor = None - - scale_factor = _compute_scale_factor( - x.detach(), - self.channel_dim, - min_abs=self.min_abs, - max_abs=self.max_abs, - gain_factor=self.scale_gain_factor / prob, - max_factor=self.max_factor, - ) - return ActivationBalancerFunction.apply( - x, - scale_factor, - sign_factor, - self.channel_dim, - ) - else: - return _no_op(x) - - -def BalancedDoubleSwish( - d_model, channel_dim=-1, max_abs=10.0, min_prob=0.25 -) -> nn.Sequential: - """ - ActivationBalancer -> DoubleSwish - """ - balancer = ActivationBalancer( - d_model, channel_dim=channel_dim, max_abs=max_abs, min_prob=min_prob - ) - return nn.Sequential( - balancer, - DoubleSwish(), - ) diff --git a/AR/modules/transformer.py b/AR/modules/transformer.py deleted file mode 100644 index 7921f48e70bd6849897389d8e0a39d1ac4062b97..0000000000000000000000000000000000000000 --- a/AR/modules/transformer.py +++ /dev/null @@ -1,378 +0,0 @@ -# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/transformer.py -import copy -import numbers -from functools import partial -from typing import Any -from typing import Callable -from typing import List -from typing import Optional -from typing import Tuple -from typing import Union - -import torch -from AR.modules.activation import MultiheadAttention -from AR.modules.scaling import BalancedDoubleSwish -from torch import nn -from torch import Tensor -from torch.nn import functional as F - -_shape_t = Union[int, List[int], torch.Size] - - -class LayerNorm(nn.Module): - __constants__ = ["normalized_shape", "eps", "elementwise_affine"] - normalized_shape: Tuple[int, ...] 
- eps: float - elementwise_affine: bool - - def __init__( - self, - normalized_shape: _shape_t, - eps: float = 1e-5, - elementwise_affine: bool = True, - device=None, - dtype=None, - ) -> None: - factory_kwargs = {"device": device, "dtype": dtype} - super(LayerNorm, self).__init__() - if isinstance(normalized_shape, numbers.Integral): - # mypy error: incompatible types in assignment - normalized_shape = (normalized_shape,) # type: ignore[assignment] - self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type] - self.eps = eps - self.elementwise_affine = elementwise_affine - if self.elementwise_affine: - self.weight = nn.Parameter( - torch.empty(self.normalized_shape, **factory_kwargs) - ) - self.bias = nn.Parameter( - torch.empty(self.normalized_shape, **factory_kwargs) - ) - else: - self.register_parameter("weight", None) - self.register_parameter("bias", None) - - self.reset_parameters() - - def reset_parameters(self) -> None: - if self.elementwise_affine: - nn.init.ones_(self.weight) - nn.init.zeros_(self.bias) - - def forward(self, input: Tensor, embedding: Any = None) -> Tensor: - if isinstance(input, tuple): - input, embedding = input - return ( - F.layer_norm( - input, - self.normalized_shape, - self.weight, - self.bias, - self.eps, - ), - embedding, - ) - - assert embedding is None - return F.layer_norm( - input, self.normalized_shape, self.weight, self.bias, self.eps - ) - - def extra_repr(self) -> str: - return ( - "{normalized_shape}, eps={eps}, " - "elementwise_affine={elementwise_affine}".format(**self.__dict__) - ) - - -class IdentityNorm(nn.Module): - def __init__( - self, - d_model: int, - eps: float = 1e-5, - device=None, - dtype=None, - ) -> None: - super(IdentityNorm, self).__init__() - - def forward(self, input: Tensor, embedding: Any = None) -> Tensor: - if isinstance(input, tuple): - return input - - assert embedding is None - return input - - -class TransformerEncoder(nn.Module): - r"""TransformerEncoder is a stack of N encoder layers. Users can build the - BERT(https://arxiv.org/abs/1810.04805) model with corresponding parameters. - - Args: - encoder_layer: an instance of the TransformerEncoderLayer() class (required). - num_layers: the number of sub-encoder-layers in the encoder (required). - norm: the layer normalization component (optional). - enable_nested_tensor: if True, input will automatically convert to nested tensor - (and convert back on output). This will improve the overall performance of - TransformerEncoder when padding rate is high. Default: ``True`` (enabled). - - Examples:: - >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8) - >>> transformer_encoder = TransformerEncoder(encoder_layer, num_layers=6) - >>> src = torch.rand(10, 32, 512) - >>> out = transformer_encoder(src) - """ - __constants__ = ["norm"] - - def __init__(self, encoder_layer, num_layers, norm=None): - super(TransformerEncoder, self).__init__() - self.layers = _get_clones(encoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - - def forward( - self, - src: Tensor, - mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, - return_layer_states: bool = False, - cache=None, - ) -> Tensor: - r"""Pass the input through the encoder layers in turn. - - Args: - src: the sequence to the encoder (required). - mask: the mask for the src sequence (optional). - src_key_padding_mask: the mask for the src keys per batch (optional). - return_layer_states: return layers' state (optional). 
- - Shape: - see the docs in Transformer class. - """ - if return_layer_states: - layer_states = [] # layers' output - output = src - for mod in self.layers: - output = mod( - output, - src_mask=mask, - src_key_padding_mask=src_key_padding_mask, - cache=cache, - ) - layer_states.append(output[0]) - - if self.norm is not None: - output = self.norm(output) - - return layer_states, output - - output = src - for mod in self.layers: - output = mod( - output, - src_mask=mask, - src_key_padding_mask=src_key_padding_mask, - cache=cache, - ) - - if self.norm is not None: - output = self.norm(output) - - return output - - -class TransformerEncoderLayer(nn.Module): - __constants__ = ["batch_first", "norm_first"] - - def __init__( - self, - d_model: int, - nhead: int, - dim_feedforward: int = 2048, - dropout: float = 0.1, - activation: Union[str, Callable[[Tensor], Tensor]] = F.relu, - batch_first: bool = False, - norm_first: bool = False, - device=None, - dtype=None, - linear1_self_attention_cls: nn.Module = nn.Linear, - linear2_self_attention_cls: nn.Module = nn.Linear, - linear1_feedforward_cls: nn.Module = nn.Linear, - linear2_feedforward_cls: nn.Module = nn.Linear, - layer_norm_cls: nn.Module = LayerNorm, - layer_norm_eps: float = 1e-5, - adaptive_layer_norm=False, - ) -> None: - factory_kwargs = {"device": device, "dtype": dtype} - super(TransformerEncoderLayer, self).__init__() - # print(233333333333,d_model,nhead) - # import os - # os._exit(2333333) - self.self_attn = MultiheadAttention( - d_model, # 512 16 - nhead, - dropout=dropout, - batch_first=batch_first, - linear1_cls=linear1_self_attention_cls, - linear2_cls=linear2_self_attention_cls, - **factory_kwargs, - ) - - # Implementation of Feedforward model - self.linear1 = linear1_feedforward_cls( - d_model, dim_feedforward, **factory_kwargs - ) - self.dropout = nn.Dropout(dropout) - self.linear2 = linear2_feedforward_cls( - dim_feedforward, d_model, **factory_kwargs - ) - - self.norm_first = norm_first - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(dropout) - - # Legacy string support for activation function. - if isinstance(activation, str): - activation = _get_activation_fn(activation) - elif isinstance(activation, partial): - activation = activation(d_model) - elif activation == BalancedDoubleSwish: - activation = BalancedDoubleSwish(d_model) - - # # We can't test self.activation in forward() in TorchScript, - # # so stash some information about it instead. 
- # if activation is F.relu or isinstance(activation, torch.nn.ReLU): - # self.activation_relu_or_gelu = 1 - # elif activation is F.gelu or isinstance(activation, torch.nn.GELU): - # self.activation_relu_or_gelu = 2 - # else: - # self.activation_relu_or_gelu = 0 - self.activation = activation - - norm1 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs) - if layer_norm_cls == IdentityNorm: - norm2 = BalancedBasicNorm(d_model, eps=layer_norm_eps, **factory_kwargs) - else: - norm2 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs) - - if adaptive_layer_norm: - self.norm1 = AdaptiveLayerNorm(d_model, norm1) - self.norm2 = AdaptiveLayerNorm(d_model, norm2) - else: - self.norm1 = norm1 - self.norm2 = norm2 - - def __setstate__(self, state): - super(TransformerEncoderLayer, self).__setstate__(state) - if not hasattr(self, "activation"): - self.activation = F.relu - - def forward( - self, - src: Tensor, - src_mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, - cache=None, - ) -> Tensor: - r"""Pass the input through the encoder layer. - - Args: - src: the sequence to the encoder layer (required). - src_mask: the mask for the src sequence (optional). - src_key_padding_mask: the mask for the src keys per batch (optional). - - Shape: - see the docs in Transformer class. - """ - x, stage_embedding = src, None - is_src_tuple = False - if isinstance(src, tuple): - x, stage_embedding = src - is_src_tuple = True - - if src_key_padding_mask is not None: - _skpm_dtype = src_key_padding_mask.dtype - if _skpm_dtype != torch.bool and not torch.is_floating_point( - src_key_padding_mask - ): - raise AssertionError( - "only bool and floating types of key_padding_mask are supported" - ) - - if self.norm_first: - x = x + self._sa_block( - self.norm1(x, stage_embedding), - src_mask, - src_key_padding_mask, - cache=cache, - ) - x = x + self._ff_block(self.norm2(x, stage_embedding)) - else: - x = self.norm1( - x + self._sa_block(x, src_mask, src_key_padding_mask, cache=cache), - stage_embedding, - ) - x = self.norm2(x + self._ff_block(x), stage_embedding) - - if is_src_tuple: - return (x, stage_embedding) - return x - - # self-attention block - def _sa_block( - self, - x: Tensor, - attn_mask: Optional[Tensor], - key_padding_mask: Optional[Tensor], - cache=None, - ) -> Tensor: - # print(x.shape,attn_mask.shape,key_padding_mask) - # torch.Size([1, 188, 512]) torch.Size([188, 188]) None - # import os - # os._exit(23333) - x = self.self_attn( - x, - x, - x, - attn_mask=attn_mask, - key_padding_mask=key_padding_mask, - need_weights=False, - cache=cache, - )[0] - return self.dropout1(x) - - # feed forward block - def _ff_block(self, x: Tensor) -> Tensor: - x = self.linear2(self.dropout(self.activation(self.linear1(x)))) - return self.dropout2(x) - - -class AdaptiveLayerNorm(nn.Module): - r"""Adaptive Layer Normalization""" - - def __init__(self, d_model, norm) -> None: - super(AdaptiveLayerNorm, self).__init__() - self.project_layer = nn.Linear(d_model, 2 * d_model) - self.norm = norm - self.d_model = d_model - self.eps = self.norm.eps - - def forward(self, input: Tensor, embedding: Tensor = None) -> Tensor: - if isinstance(input, tuple): - input, embedding = input - weight, bias = torch.split( - self.project_layer(embedding), - split_size_or_sections=self.d_model, - dim=-1, - ) - return (weight * self.norm(input) + bias, embedding) - - weight, bias = torch.split( - self.project_layer(embedding), - split_size_or_sections=self.d_model, - dim=-1, - ) - return weight * 
self.norm(input) + bias - - -def _get_clones(module, N): - return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) diff --git a/AR/modules/transformer_onnx.py b/AR/modules/transformer_onnx.py deleted file mode 100644 index a3f68b43e7a4f3c8d989140009477e07a2d44d19..0000000000000000000000000000000000000000 --- a/AR/modules/transformer_onnx.py +++ /dev/null @@ -1,292 +0,0 @@ -# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/transformer.py -import copy -import numbers -from functools import partial -from typing import Any -from typing import Callable -from typing import List -from typing import Optional -from typing import Tuple -from typing import Union - -import torch -from AR.modules.activation_onnx import MultiheadAttention -from AR.modules.scaling import BalancedDoubleSwish -from torch import nn -from torch import Tensor -from torch.nn import functional as F - -_shape_t = Union[int, List[int], torch.Size] - - -class LayerNorm(nn.Module): - __constants__ = ["normalized_shape", "eps", "elementwise_affine"] - normalized_shape: Tuple[int, ...] - eps: float - elementwise_affine: bool - - def __init__( - self, - normalized_shape: _shape_t, - eps: float = 1e-5, - elementwise_affine: bool = True, - device=None, - dtype=None, - ) -> None: - factory_kwargs = {"device": device, "dtype": dtype} - super(LayerNorm, self).__init__() - if isinstance(normalized_shape, numbers.Integral): - # mypy error: incompatible types in assignment - normalized_shape = (normalized_shape,) # type: ignore[assignment] - self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type] - self.eps = eps - self.elementwise_affine = elementwise_affine - if self.elementwise_affine: - self.weight = nn.Parameter( - torch.empty(self.normalized_shape, **factory_kwargs) - ) - self.bias = nn.Parameter( - torch.empty(self.normalized_shape, **factory_kwargs) - ) - else: - self.register_parameter("weight", None) - self.register_parameter("bias", None) - - self.reset_parameters() - - def reset_parameters(self) -> None: - if self.elementwise_affine: - nn.init.ones_(self.weight) - nn.init.zeros_(self.bias) - - def forward(self, input: Tensor, embedding: Any = None) -> Tensor: - if isinstance(input, tuple): - input, embedding = input - return ( - F.layer_norm( - input, - self.normalized_shape, - self.weight, - self.bias, - self.eps, - ), - embedding, - ) - - assert embedding is None - return F.layer_norm( - input, self.normalized_shape, self.weight, self.bias, self.eps - ) - - def extra_repr(self) -> str: - return ( - "{normalized_shape}, eps={eps}, " - "elementwise_affine={elementwise_affine}".format(**self.__dict__) - ) - - -class IdentityNorm(nn.Module): - def __init__( - self, - d_model: int, - eps: float = 1e-5, - device=None, - dtype=None, - ) -> None: - super(IdentityNorm, self).__init__() - - def forward(self, input: Tensor, embedding: Any = None) -> Tensor: - if isinstance(input, tuple): - return input - - assert embedding is None - return input - - -class TransformerEncoder(nn.Module): - r"""TransformerEncoder is a stack of N encoder layers. Users can build the - BERT(https://arxiv.org/abs/1810.04805) model with corresponding parameters. - - Args: - encoder_layer: an instance of the TransformerEncoderLayer() class (required). - num_layers: the number of sub-encoder-layers in the encoder (required). - norm: the layer normalization component (optional). - enable_nested_tensor: if True, input will automatically convert to nested tensor - (and convert back on output). 
This will improve the overall performance of - TransformerEncoder when padding rate is high. Default: ``True`` (enabled). - - Examples:: - >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8) - >>> transformer_encoder = TransformerEncoder(encoder_layer, num_layers=6) - >>> src = torch.rand(10, 32, 512) - >>> out = transformer_encoder(src) - """ - __constants__ = ["norm"] - - def __init__(self, encoder_layer, num_layers, norm=None): - super(TransformerEncoder, self).__init__() - self.layers = _get_clones(encoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - - def forward( - self, - src: Tensor, - mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, - return_layer_states: bool = False, - cache=None, - ) -> Tensor: - output = src - for mod in self.layers: - output = mod( - output, - src_mask=mask, - src_key_padding_mask=src_key_padding_mask, - cache=cache, - ) - - if self.norm is not None: - output = self.norm(output) - - return output - - -class TransformerEncoderLayer(nn.Module): - __constants__ = ["batch_first", "norm_first"] - def __init__( - self, - d_model: int, - nhead: int, - dim_feedforward: int = 2048, - dropout: float = 0.1, - activation: Union[str, Callable[[Tensor], Tensor]] = F.relu, - batch_first: bool = False, - norm_first: bool = False, - device=None, - dtype=None, - linear1_self_attention_cls: nn.Module = nn.Linear, - linear2_self_attention_cls: nn.Module = nn.Linear, - linear1_feedforward_cls: nn.Module = nn.Linear, - linear2_feedforward_cls: nn.Module = nn.Linear, - layer_norm_cls: nn.Module = LayerNorm, - layer_norm_eps: float = 1e-5, - adaptive_layer_norm=False, - ) -> None: - factory_kwargs = {"device": device, "dtype": dtype} - super(TransformerEncoderLayer, self).__init__() - self.self_attn = MultiheadAttention( - d_model, # 512 16 - nhead, - dropout=dropout, - batch_first=batch_first, - linear1_cls=linear1_self_attention_cls, - linear2_cls=linear2_self_attention_cls, - **factory_kwargs, - ) - self.linear1 = linear1_feedforward_cls( - d_model, dim_feedforward, **factory_kwargs - ) - self.dropout = nn.Dropout(dropout) - self.linear2 = linear2_feedforward_cls( - dim_feedforward, d_model, **factory_kwargs - ) - self.norm_first = norm_first - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(dropout) - if isinstance(activation, str): - activation = _get_activation_fn(activation) - elif isinstance(activation, partial): - activation = activation(d_model) - elif activation == BalancedDoubleSwish: - activation = BalancedDoubleSwish(d_model) - self.activation = activation - - norm1 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs) - if layer_norm_cls == IdentityNorm: - norm2 = BalancedBasicNorm(d_model, eps=layer_norm_eps, **factory_kwargs) - else: - norm2 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs) - - if adaptive_layer_norm: - self.norm1 = AdaptiveLayerNorm(d_model, norm1) - self.norm2 = AdaptiveLayerNorm(d_model, norm2) - else: - self.norm1 = norm1 - self.norm2 = norm2 - - def __setstate__(self, state): - super(TransformerEncoderLayer, self).__setstate__(state) - if not hasattr(self, "activation"): - self.activation = F.relu - - def forward( - self, - src: Tensor, - src_mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, - cache=None, - ) -> Tensor: - x = src - stage_embedding = None - x = self.norm1( - x + self._sa_block(x, src_mask, src_key_padding_mask, cache=cache), - stage_embedding, - ) - x = self.norm2(x + self._ff_block(x), 
stage_embedding) - - return x - - def _sa_block( - self, - x: Tensor, - attn_mask: Optional[Tensor], - key_padding_mask: Optional[Tensor], - cache=None, - ) -> Tensor: - x = self.self_attn( - x, - x, - x, - attn_mask=attn_mask, - key_padding_mask=key_padding_mask, - need_weights=False, - cache=cache, - ) - return self.dropout1(x) - - def _ff_block(self, x: Tensor) -> Tensor: - x = self.linear2(self.dropout(self.activation(self.linear1(x)))) - return self.dropout2(x) - - -class AdaptiveLayerNorm(nn.Module): - r"""Adaptive Layer Normalization""" - - def __init__(self, d_model, norm) -> None: - super(AdaptiveLayerNorm, self).__init__() - self.project_layer = nn.Linear(d_model, 2 * d_model) - self.norm = norm - self.d_model = d_model - self.eps = self.norm.eps - - def forward(self, input: Tensor, embedding: Tensor = None) -> Tensor: - if isinstance(input, tuple): - input, embedding = input - weight, bias = torch.split( - self.project_layer(embedding), - split_size_or_sections=self.d_model, - dim=-1, - ) - return (weight * self.norm(input) + bias, embedding) - - weight, bias = torch.split( - self.project_layer(embedding), - split_size_or_sections=self.d_model, - dim=-1, - ) - return weight * self.norm(input) + bias - - -def _get_clones(module, N): - return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) diff --git a/AR/text_processing/__init__.py b/AR/text_processing/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/AR/text_processing/phonemizer.py b/AR/text_processing/phonemizer.py deleted file mode 100644 index 9c5f58fb74da836764cc9d71b8556e979f2b2830..0000000000000000000000000000000000000000 --- a/AR/text_processing/phonemizer.py +++ /dev/null @@ -1,79 +0,0 @@ -# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py -# reference: https://github.com/lifeiteng/vall-e -import itertools -import re -from typing import Dict -from typing import List - -import regex -from gruut import sentences -from gruut.const import Sentence -from gruut.const import Word -from AR.text_processing.symbols import SYMBOL_TO_ID - - -class GruutPhonemizer: - def __init__(self, language: str): - self._phonemizer = sentences - self.lang = language - self.symbol_to_id = SYMBOL_TO_ID - self._special_cases_dict: Dict[str] = { - r"\.\.\.": "... ", - ";": "; ", - ":": ": ", - ",": ", ", - r"\.": ". ", - "!": "! ", - r"\?": "? 
", - "—": "—", - "…": "… ", - "«": "«", - "»": "»", - } - self._punctuation_regexp: str = ( - rf"([{''.join(self._special_cases_dict.keys())}])" - ) - - def _normalize_punctuation(self, text: str) -> str: - text = regex.sub(rf"\pZ+{self._punctuation_regexp}", r"\1", text) - text = regex.sub(rf"{self._punctuation_regexp}(\pL)", r"\1 \2", text) - text = regex.sub(r"\pZ+", r" ", text) - return text.strip() - - def _convert_punctuation(self, word: Word) -> str: - if not word.phonemes: - return "" - if word.phonemes[0] in ["‖", "|"]: - return word.text.strip() - - phonemes = "".join(word.phonemes) - # remove modifier characters ˈˌː with regex - phonemes = re.sub(r"[ˈˌː͡]", "", phonemes) - return phonemes.strip() - - def phonemize(self, text: str, espeak: bool = False) -> str: - text_to_phonemize: str = self._normalize_punctuation(text) - sents: List[Sentence] = [ - sent - for sent in self._phonemizer(text_to_phonemize, lang="en-us", espeak=espeak) - ] - words: List[str] = [ - self._convert_punctuation(word) for word in itertools.chain(*sents) - ] - return " ".join(words) - - def transform(self, phonemes): - # convert phonemes to ids - # dictionary is in symbols.py - return [self.symbol_to_id[p] for p in phonemes if p in self.symbol_to_id.keys()] - - -if __name__ == "__main__": - phonemizer = GruutPhonemizer("en-us") - # text -> IPA - phonemes = phonemizer.phonemize("Hello, wor-ld ?") - print("phonemes:", phonemes) - print("len(phonemes):", len(phonemes)) - phoneme_ids = phonemizer.transform(phonemes) - print("phoneme_ids:", phoneme_ids) - print("len(phoneme_ids):", len(phoneme_ids)) diff --git a/AR/text_processing/symbols.py b/AR/text_processing/symbols.py deleted file mode 100644 index 7d754a78b1fd5b3d89768585e1891404bb318118..0000000000000000000000000000000000000000 --- a/AR/text_processing/symbols.py +++ /dev/null @@ -1,10 +0,0 @@ -# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py -# reference: https://github.com/lifeiteng/vall-e -PAD = "_" -PUNCTUATION = ';:,.!?¡¿—…"«»“” ' -LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" -IPA_LETTERS = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" -SYMBOLS = [PAD] + list(PUNCTUATION) + list(LETTERS) + list(IPA_LETTERS) -SPACE_ID = SYMBOLS.index(" ") -SYMBOL_TO_ID = {s: i for i, s in enumerate(SYMBOLS)} -ID_TO_SYMBOL = {i: s for i, s in enumerate(SYMBOLS)} diff --git a/AR/utils/__init__.py b/AR/utils/__init__.py deleted file mode 100644 index c2eaf61adcfee96d6e7ec8fd70a7603e18afb567..0000000000000000000000000000000000000000 --- a/AR/utils/__init__.py +++ /dev/null @@ -1,37 +0,0 @@ -import re - - -def str2bool(str): - return True if str.lower() == 'true' else False - - -def get_newest_ckpt(string_list): - # 定义一个正则表达式模式,用于匹配字符串中的数字 - pattern = r'epoch=(\d+)-step=(\d+)\.ckpt' - - # 使用正则表达式提取每个字符串中的数字信息,并创建一个包含元组的列表 - extracted_info = [] - for string in string_list: - match = re.match(pattern, string) - if match: - epoch = int(match.group(1)) - step = int(match.group(2)) - extracted_info.append((epoch, step, string)) - # 按照 epoch 后面的数字和 step 后面的数字进行排序 - sorted_info = sorted( - extracted_info, key=lambda x: (x[0], x[1]), reverse=True) - # 获取最新的 ckpt 文件名 - newest_ckpt = sorted_info[0][2] - return newest_ckpt - - -# 文本存在且不为空时 return True -def check_txt_file(file_path): - try: - with open(file_path, 'r') as file: - text = file.readline().strip() - assert text.strip() != '' - return text - except Exception: - return False 
- return False diff --git a/AR/utils/initialize.py b/AR/utils/initialize.py deleted file mode 100644 index 17ff9f92e51c8941973139d6e34d4a1c7cd8daaa..0000000000000000000000000000000000000000 --- a/AR/utils/initialize.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python3 -"""Initialize modules for espnet2 neural networks.""" -import torch -from typeguard import check_argument_types - - -def initialize(model: torch.nn.Module, init: str): - """Initialize weights of a neural network module. - - Parameters are initialized using the given method or distribution. - - Custom initialization routines can be implemented into submodules - as function `espnet_initialization_fn` within the custom module. - - Args: - model: Target. - init: Method of initialization. - """ - assert check_argument_types() - print("init with", init) - - # weight init - for p in model.parameters(): - if p.dim() > 1: - if init == "xavier_uniform": - torch.nn.init.xavier_uniform_(p.data) - elif init == "xavier_normal": - torch.nn.init.xavier_normal_(p.data) - elif init == "kaiming_uniform": - torch.nn.init.kaiming_uniform_(p.data, nonlinearity="relu") - elif init == "kaiming_normal": - torch.nn.init.kaiming_normal_(p.data, nonlinearity="relu") - else: - raise ValueError("Unknown initialization: " + init) - # bias init - for name, p in model.named_parameters(): - if ".bias" in name and p.dim() == 1: - p.data.zero_() diff --git a/AR/utils/io.py b/AR/utils/io.py deleted file mode 100644 index 52f1f3c991506a9ea5d3fdbc71ded61e914694f1..0000000000000000000000000000000000000000 --- a/AR/utils/io.py +++ /dev/null @@ -1,34 +0,0 @@ -import sys - -import torch -import yaml - - -def load_yaml_config(path): - with open(path) as f: - config = yaml.full_load(f) - return config - - -def save_config_to_yaml(config, path): - assert path.endswith(".yaml") - with open(path, "w") as f: - f.write(yaml.dump(config)) - f.close() - - -def write_args(args, path): - args_dict = dict( - (name, getattr(args, name)) for name in dir(args) if not name.startswith("_") - ) - with open(path, "a") as args_file: - args_file.write("==> torch version: {}\n".format(torch.__version__)) - args_file.write( - "==> cudnn version: {}\n".format(torch.backends.cudnn.version()) - ) - args_file.write("==> Cmd:\n") - args_file.write(str(sys.argv)) - args_file.write("\n==> args:\n") - for k, v in sorted(args_dict.items()): - args_file.write(" %s: %s\n" % (str(k), str(v))) - args_file.close() diff --git a/inference_cli.py b/inference_cli.py deleted file mode 100644 index bd987aaf8e27e79a1f0050a401583b2fbeac8679..0000000000000000000000000000000000000000 --- a/inference_cli.py +++ /dev/null @@ -1,55 +0,0 @@ -import argparse -import os -import soundfile as sf - -from tools.i18n.i18n import I18nAuto -from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav - -i18n = I18nAuto() - -def synthesize(GPT_model_path, SoVITS_model_path, ref_audio_path, ref_text_path, ref_language, target_text_path, target_language, output_path): - # Read reference text - with open(ref_text_path, 'r', encoding='utf-8') as file: - ref_text = file.read() - - # Read target text - with open(target_text_path, 'r', encoding='utf-8') as file: - target_text = file.read() - - # Change model weights - change_gpt_weights(gpt_path=GPT_model_path) - change_sovits_weights(sovits_path=SoVITS_model_path) - - # Synthesize audio - synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path, - prompt_text=ref_text, - prompt_language=i18n(ref_language), - text=target_text, - 
text_language=i18n(target_language), top_p=1, temperature=1) - - result_list = list(synthesis_result) - - if result_list: - last_sampling_rate, last_audio_data = result_list[-1] - output_wav_path = os.path.join(output_path, "output.wav") - sf.write(output_wav_path, last_audio_data, last_sampling_rate) - print(f"Audio saved to {output_wav_path}") - -def main(): - parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool") - parser.add_argument('--gpt_model', required=True, help="Path to the GPT model file") - parser.add_argument('--sovits_model', required=True, help="Path to the SoVITS model file") - parser.add_argument('--ref_audio', required=True, help="Path to the reference audio file") - parser.add_argument('--ref_text', required=True, help="Path to the reference text file") - parser.add_argument('--ref_language', required=True, choices=["中文", "英文", "日文"], help="Language of the reference audio") - parser.add_argument('--target_text', required=True, help="Path to the target text file") - parser.add_argument('--target_language', required=True, choices=["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"], help="Language of the target text") - parser.add_argument('--output_path', required=True, help="Path to the output directory") - - args = parser.parse_args() - - synthesize(args.gpt_model, args.sovits_model, args.ref_audio, args.ref_text, args.ref_language, args.target_text, args.target_language, args.output_path) - -if __name__ == '__main__': - main() - diff --git a/inference_gui.py b/inference_gui.py deleted file mode 100644 index 2059155da8115b7e5c80eea42b2cc0859c7e15ce..0000000000000000000000000000000000000000 --- a/inference_gui.py +++ /dev/null @@ -1,310 +0,0 @@ -import os -import sys -from PyQt5.QtCore import QEvent -from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushButton, QTextEdit -from PyQt5.QtWidgets import QGridLayout, QVBoxLayout, QWidget, QFileDialog, QStatusBar, QComboBox -import soundfile as sf - -from tools.i18n.i18n import I18nAuto -i18n = I18nAuto() - -from inference_webui import gpt_path, sovits_path, change_gpt_weights, change_sovits_weights, get_tts_wav - - -class GPTSoVITSGUI(QMainWindow): - GPT_Path = gpt_path - SoVITS_Path = sovits_path - - def __init__(self): - super().__init__() - - self.setWindowTitle('GPT-SoVITS GUI') - self.setGeometry(800, 450, 950, 850) - - self.setStyleSheet(""" - QWidget { - background-color: #a3d3b1; - } - - QTabWidget::pane { - background-color: #a3d3b1; - } - - QTabWidget::tab-bar { - alignment: left; - } - - QTabBar::tab { - background: #8da4bf; - color: #ffffff; - padding: 8px; - } - - QTabBar::tab:selected { - background: #2a3f54; - } - - QLabel { - color: #000000; - } - - QPushButton { - background-color: #4CAF50; - color: white; - padding: 8px; - border: 1px solid #4CAF50; - border-radius: 4px; - } - - QPushButton:hover { - background-color: #45a049; - border: 1px solid #45a049; - box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.1); - } - """) - - license_text = ( - "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. " - "如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 
详见根目录LICENSE.") - license_label = QLabel(license_text) - license_label.setWordWrap(True) - - self.GPT_model_label = QLabel("选择GPT模型:") - self.GPT_model_input = QLineEdit() - self.GPT_model_input.setPlaceholderText("拖拽或选择文件") - self.GPT_model_input.setText(self.GPT_Path) - self.GPT_model_input.setReadOnly(True) - self.GPT_model_button = QPushButton("选择GPT模型文件") - self.GPT_model_button.clicked.connect(self.select_GPT_model) - - self.SoVITS_model_label = QLabel("选择SoVITS模型:") - self.SoVITS_model_input = QLineEdit() - self.SoVITS_model_input.setPlaceholderText("拖拽或选择文件") - self.SoVITS_model_input.setText(self.SoVITS_Path) - self.SoVITS_model_input.setReadOnly(True) - self.SoVITS_model_button = QPushButton("选择SoVITS模型文件") - self.SoVITS_model_button.clicked.connect(self.select_SoVITS_model) - - self.ref_audio_label = QLabel("上传参考音频:") - self.ref_audio_input = QLineEdit() - self.ref_audio_input.setPlaceholderText("拖拽或选择文件") - self.ref_audio_input.setReadOnly(True) - self.ref_audio_button = QPushButton("选择音频文件") - self.ref_audio_button.clicked.connect(self.select_ref_audio) - - self.ref_text_label = QLabel("参考音频文本:") - self.ref_text_input = QLineEdit() - self.ref_text_input.setPlaceholderText("直接输入文字或上传文本") - self.ref_text_button = QPushButton("上传文本") - self.ref_text_button.clicked.connect(self.upload_ref_text) - - self.ref_language_label = QLabel("参考音频语言:") - self.ref_language_combobox = QComboBox() - self.ref_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"]) - self.ref_language_combobox.setCurrentText("多语种混合") - - self.target_text_label = QLabel("合成目标文本:") - self.target_text_input = QLineEdit() - self.target_text_input.setPlaceholderText("直接输入文字或上传文本") - self.target_text_button = QPushButton("上传文本") - self.target_text_button.clicked.connect(self.upload_target_text) - - self.target_language_label = QLabel("合成音频语言:") - self.target_language_combobox = QComboBox() - self.target_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"]) - self.target_language_combobox.setCurrentText("多语种混合") - - self.output_label = QLabel("输出音频路径:") - self.output_input = QLineEdit() - self.output_input.setPlaceholderText("拖拽或选择文件") - self.output_input.setReadOnly(True) - self.output_button = QPushButton("选择文件夹") - self.output_button.clicked.connect(self.select_output_path) - - self.output_text = QTextEdit() - self.output_text.setReadOnly(True) - - self.add_drag_drop_events([ - self.GPT_model_input, - self.SoVITS_model_input, - self.ref_audio_input, - self.ref_text_input, - self.target_text_input, - self.output_input, - ]) - - self.synthesize_button = QPushButton("合成") - self.synthesize_button.clicked.connect(self.synthesize) - - self.clear_output_button = QPushButton("清空输出") - self.clear_output_button.clicked.connect(self.clear_output) - - self.status_bar = QStatusBar() - - main_layout = QVBoxLayout() - - input_layout = QGridLayout(self) - input_layout.setSpacing(10) - - input_layout.addWidget(license_label, 0, 0, 1, 3) - - input_layout.addWidget(self.GPT_model_label, 1, 0) - input_layout.addWidget(self.GPT_model_input, 2, 0, 1, 2) - input_layout.addWidget(self.GPT_model_button, 2, 2) - - input_layout.addWidget(self.SoVITS_model_label, 3, 0) - input_layout.addWidget(self.SoVITS_model_input, 4, 0, 1, 2) - input_layout.addWidget(self.SoVITS_model_button, 4, 2) - - input_layout.addWidget(self.ref_audio_label, 5, 0) - input_layout.addWidget(self.ref_audio_input, 6, 0, 1, 2) - input_layout.addWidget(self.ref_audio_button, 6, 2) - - input_layout.addWidget(self.ref_language_label, 7, 0) - 
input_layout.addWidget(self.ref_language_combobox, 8, 0, 1, 1) - input_layout.addWidget(self.ref_text_label, 9, 0) - input_layout.addWidget(self.ref_text_input, 10, 0, 1, 2) - input_layout.addWidget(self.ref_text_button, 10, 2) - - input_layout.addWidget(self.target_language_label, 11, 0) - input_layout.addWidget(self.target_language_combobox, 12, 0, 1, 1) - input_layout.addWidget(self.target_text_label, 13, 0) - input_layout.addWidget(self.target_text_input, 14, 0, 1, 2) - input_layout.addWidget(self.target_text_button, 14, 2) - - input_layout.addWidget(self.output_label, 15, 0) - input_layout.addWidget(self.output_input, 16, 0, 1, 2) - input_layout.addWidget(self.output_button, 16, 2) - - main_layout.addLayout(input_layout) - - output_layout = QVBoxLayout() - output_layout.addWidget(self.output_text) - main_layout.addLayout(output_layout) - - main_layout.addWidget(self.synthesize_button) - - main_layout.addWidget(self.clear_output_button) - - main_layout.addWidget(self.status_bar) - - self.central_widget = QWidget() - self.central_widget.setLayout(main_layout) - self.setCentralWidget(self.central_widget) - - def dragEnterEvent(self, event): - if event.mimeData().hasUrls(): - event.acceptProposedAction() - - def dropEvent(self, event): - if event.mimeData().hasUrls(): - file_paths = [url.toLocalFile() for url in event.mimeData().urls()] - if len(file_paths) == 1: - self.update_ref_audio(file_paths[0]) - else: - self.update_ref_audio(", ".join(file_paths)) - - def add_drag_drop_events(self, widgets): - for widget in widgets: - widget.setAcceptDrops(True) - widget.installEventFilter(self) - - def eventFilter(self, obj, event): - if event.type() in (QEvent.DragEnter, QEvent.Drop): - mime_data = event.mimeData() - if mime_data.hasUrls(): - event.acceptProposedAction() - - return super().eventFilter(obj, event) - - def select_GPT_model(self): - file_path, _ = QFileDialog.getOpenFileName(self, "选择GPT模型文件", "", "GPT Files (*.ckpt)") - if file_path: - self.GPT_model_input.setText(file_path) - - def select_SoVITS_model(self): - file_path, _ = QFileDialog.getOpenFileName(self, "选择SoVITS模型文件", "", "SoVITS Files (*.pth)") - if file_path: - self.SoVITS_model_input.setText(file_path) - - def select_ref_audio(self): - file_path, _ = QFileDialog.getOpenFileName(self, "选择参考音频文件", "", "Audio Files (*.wav *.mp3)") - if file_path: - self.update_ref_audio(file_path) - - def upload_ref_text(self): - file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") - if file_path: - with open(file_path, 'r', encoding='utf-8') as file: - content = file.read() - self.ref_text_input.setText(content) - - def upload_target_text(self): - file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") - if file_path: - with open(file_path, 'r', encoding='utf-8') as file: - content = file.read() - self.target_text_input.setText(content) - - def select_output_path(self): - options = QFileDialog.Options() - options |= QFileDialog.DontUseNativeDialog - options |= QFileDialog.ShowDirsOnly - - folder_dialog = QFileDialog() - folder_dialog.setOptions(options) - folder_dialog.setFileMode(QFileDialog.Directory) - - if folder_dialog.exec_(): - folder_path = folder_dialog.selectedFiles()[0] - self.output_input.setText(folder_path) - - def update_ref_audio(self, file_path): - self.ref_audio_input.setText(file_path) - - def clear_output(self): - self.output_text.clear() - - def synthesize(self): - GPT_model_path = self.GPT_model_input.text() - SoVITS_model_path = 
self.SoVITS_model_input.text() - ref_audio_path = self.ref_audio_input.text() - language_combobox = self.ref_language_combobox.currentText() - language_combobox = i18n(language_combobox) - ref_text = self.ref_text_input.text() - target_language_combobox = self.target_language_combobox.currentText() - target_language_combobox = i18n(target_language_combobox) - target_text = self.target_text_input.text() - output_path = self.output_input.text() - - if GPT_model_path != self.GPT_Path: - change_gpt_weights(gpt_path=GPT_model_path) - self.GPT_Path = GPT_model_path - if SoVITS_model_path != self.SoVITS_Path: - change_sovits_weights(sovits_path=SoVITS_model_path) - self.SoVITS_Path = SoVITS_model_path - - synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path, - prompt_text=ref_text, - prompt_language=language_combobox, - text=target_text, - text_language=target_language_combobox) - - result_list = list(synthesis_result) - - if result_list: - last_sampling_rate, last_audio_data = result_list[-1] - output_wav_path = os.path.join(output_path, "output.wav") - sf.write(output_wav_path, last_audio_data, last_sampling_rate) - - result = "Audio saved to " + output_wav_path - - self.status_bar.showMessage("合成完成!输出路径:" + output_wav_path, 5000) - self.output_text.append("处理结果:\n" + result) - - -if __name__ == '__main__': - app = QApplication(sys.argv) - mainWin = GPTSoVITSGUI() - mainWin.show() - sys.exit(app.exec_()) \ No newline at end of file diff --git a/onnx_export.py b/onnx_export.py deleted file mode 100644 index ab457d755c075e54d903d94f03ac5620a739fe9c..0000000000000000000000000000000000000000 --- a/onnx_export.py +++ /dev/null @@ -1,334 +0,0 @@ -from module.models_onnx import SynthesizerTrn, symbols -from AR.models.t2s_lightning_module_onnx import Text2SemanticLightningModule -import torch -import torchaudio -from torch import nn -from feature_extractor import cnhubert -cnhubert_base_path = "pretrained_models/chinese-hubert-base" -cnhubert.cnhubert_base_path=cnhubert_base_path -ssl_model = cnhubert.get_model() -from text import cleaned_text_to_sequence -import soundfile -from tools.my_utils import load_audio -import os -import json - -def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): - hann_window = torch.hann_window(win_size).to( - dtype=y.dtype, device=y.device - ) - y = torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), - mode="reflect", - ) - y = y.squeeze(1) - spec = torch.stft( - y, - n_fft, - hop_length=hop_size, - win_length=win_size, - window=hann_window, - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=False, - ) - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - return spec - - -class DictToAttrRecursive(dict): - def __init__(self, input_dict): - super().__init__(input_dict) - for key, value in input_dict.items(): - if isinstance(value, dict): - value = DictToAttrRecursive(value) - self[key] = value - setattr(self, key, value) - - def __getattr__(self, item): - try: - return self[item] - except KeyError: - raise AttributeError(f"Attribute {item} not found") - - def __setattr__(self, key, value): - if isinstance(value, dict): - value = DictToAttrRecursive(value) - super(DictToAttrRecursive, self).__setitem__(key, value) - super().__setattr__(key, value) - - def __delattr__(self, item): - try: - del self[item] - except KeyError: - raise AttributeError(f"Attribute {item} not found") - - -class T2SEncoder(nn.Module): - def __init__(self, t2s, vits): - 
super().__init__() - self.encoder = t2s.onnx_encoder - self.vits = vits - - def forward(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content): - codes = self.vits.extract_latent(ssl_content) - prompt_semantic = codes[0, 0] - bert = torch.cat([ref_bert.transpose(0, 1), text_bert.transpose(0, 1)], 1) - all_phoneme_ids = torch.cat([ref_seq, text_seq], 1) - bert = bert.unsqueeze(0) - prompt = prompt_semantic.unsqueeze(0) - return self.encoder(all_phoneme_ids, bert), prompt - - -class T2SModel(nn.Module): - def __init__(self, t2s_path, vits_model): - super().__init__() - dict_s1 = torch.load(t2s_path, map_location="cpu") - self.config = dict_s1["config"] - self.t2s_model = Text2SemanticLightningModule(self.config, "ojbk", is_train=False) - self.t2s_model.load_state_dict(dict_s1["weight"]) - self.t2s_model.eval() - self.vits_model = vits_model.vq_model - self.hz = 50 - self.max_sec = self.config["data"]["max_sec"] - self.t2s_model.model.top_k = torch.LongTensor([self.config["inference"]["top_k"]]) - self.t2s_model.model.early_stop_num = torch.LongTensor([self.hz * self.max_sec]) - self.t2s_model = self.t2s_model.model - self.t2s_model.init_onnx() - self.onnx_encoder = T2SEncoder(self.t2s_model, self.vits_model) - self.first_stage_decoder = self.t2s_model.first_stage_decoder - self.stage_decoder = self.t2s_model.stage_decoder - #self.t2s_model = torch.jit.script(self.t2s_model) - - def forward(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content): - early_stop_num = self.t2s_model.early_stop_num - - #[1,N] [1,N] [N, 1024] [N, 1024] [1, 768, N] - x, prompts = self.onnx_encoder(ref_seq, text_seq, ref_bert, text_bert, ssl_content) - - prefix_len = prompts.shape[1] - - #[1,N,512] [1,N] - y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts) - - stop = False - for idx in range(1, 1500): - #[1, N] [N_layer, N, 1, 512] [N_layer, N, 1, 512] [1, N, 512] [1] [1, N, 512] [1, N] - enco = self.stage_decoder(y, k, v, y_emb, x_example) - y, k, v, y_emb, logits, samples = enco - if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num: - stop = True - if torch.argmax(logits, dim=-1)[0] == self.t2s_model.EOS or samples[0, 0] == self.t2s_model.EOS: - stop = True - if stop: - break - y[0, -1] = 0 - - return y[:, -idx:].unsqueeze(0) - - def export(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content, project_name, dynamo=False): - #self.onnx_encoder = torch.jit.script(self.onnx_encoder) - if dynamo: - export_options = torch.onnx.ExportOptions(dynamic_shapes=True) - onnx_encoder_export_output = torch.onnx.dynamo_export( - self.onnx_encoder, - (ref_seq, text_seq, ref_bert, text_bert, ssl_content), - export_options=export_options - ) - onnx_encoder_export_output.save(f"onnx/{project_name}/{project_name}_t2s_encoder.onnx") - return - - torch.onnx.export( - self.onnx_encoder, - (ref_seq, text_seq, ref_bert, text_bert, ssl_content), - f"onnx/{project_name}/{project_name}_t2s_encoder.onnx", - input_names=["ref_seq", "text_seq", "ref_bert", "text_bert", "ssl_content"], - output_names=["x", "prompts"], - dynamic_axes={ - "ref_seq": {1 : "ref_length"}, - "text_seq": {1 : "text_length"}, - "ref_bert": {0 : "ref_length"}, - "text_bert": {0 : "text_length"}, - "ssl_content": {2 : "ssl_length"}, - }, - opset_version=16 - ) - x, prompts = self.onnx_encoder(ref_seq, text_seq, ref_bert, text_bert, ssl_content) - - torch.onnx.export( - self.first_stage_decoder, - (x, prompts), - f"onnx/{project_name}/{project_name}_t2s_fsdec.onnx", - input_names=["x", "prompts"], - output_names=["y", "k", "v", 
"y_emb", "x_example"], - dynamic_axes={ - "x": {1 : "x_length"}, - "prompts": {1 : "prompts_length"}, - }, - verbose=False, - opset_version=16 - ) - y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts) - - torch.onnx.export( - self.stage_decoder, - (y, k, v, y_emb, x_example), - f"onnx/{project_name}/{project_name}_t2s_sdec.onnx", - input_names=["iy", "ik", "iv", "iy_emb", "ix_example"], - output_names=["y", "k", "v", "y_emb", "logits", "samples"], - dynamic_axes={ - "iy": {1 : "iy_length"}, - "ik": {1 : "ik_length"}, - "iv": {1 : "iv_length"}, - "iy_emb": {1 : "iy_emb_length"}, - "ix_example": {1 : "ix_example_length"}, - }, - verbose=False, - opset_version=16 - ) - - -class VitsModel(nn.Module): - def __init__(self, vits_path): - super().__init__() - dict_s2 = torch.load(vits_path,map_location="cpu") - self.hps = dict_s2["config"] - self.hps = DictToAttrRecursive(self.hps) - self.hps.model.semantic_frame_rate = "25hz" - self.vq_model = SynthesizerTrn( - self.hps.data.filter_length // 2 + 1, - self.hps.train.segment_size // self.hps.data.hop_length, - n_speakers=self.hps.data.n_speakers, - **self.hps.model - ) - self.vq_model.eval() - self.vq_model.load_state_dict(dict_s2["weight"], strict=False) - - def forward(self, text_seq, pred_semantic, ref_audio): - refer = spectrogram_torch( - ref_audio, - self.hps.data.filter_length, - self.hps.data.sampling_rate, - self.hps.data.hop_length, - self.hps.data.win_length, - center=False - ) - return self.vq_model(pred_semantic, text_seq, refer)[0, 0] - - -class GptSoVits(nn.Module): - def __init__(self, vits, t2s): - super().__init__() - self.vits = vits - self.t2s = t2s - - def forward(self, ref_seq, text_seq, ref_bert, text_bert, ref_audio, ssl_content, debug=False): - pred_semantic = self.t2s(ref_seq, text_seq, ref_bert, text_bert, ssl_content) - audio = self.vits(text_seq, pred_semantic, ref_audio) - if debug: - import onnxruntime - sess = onnxruntime.InferenceSession("onnx/koharu/koharu_vits.onnx", providers=["CPU"]) - audio1 = sess.run(None, { - "text_seq" : text_seq.detach().cpu().numpy(), - "pred_semantic" : pred_semantic.detach().cpu().numpy(), - "ref_audio" : ref_audio.detach().cpu().numpy() - }) - return audio, audio1 - return audio - - def export(self, ref_seq, text_seq, ref_bert, text_bert, ref_audio, ssl_content, project_name): - self.t2s.export(ref_seq, text_seq, ref_bert, text_bert, ssl_content, project_name) - pred_semantic = self.t2s(ref_seq, text_seq, ref_bert, text_bert, ssl_content) - torch.onnx.export( - self.vits, - (text_seq, pred_semantic, ref_audio), - f"onnx/{project_name}/{project_name}_vits.onnx", - input_names=["text_seq", "pred_semantic", "ref_audio"], - output_names=["audio"], - dynamic_axes={ - "text_seq": {1 : "text_length"}, - "pred_semantic": {2 : "pred_length"}, - "ref_audio": {1 : "audio_length"}, - }, - opset_version=17, - verbose=False - ) - - -class SSLModel(nn.Module): - def __init__(self): - super().__init__() - self.ssl = ssl_model - - def forward(self, ref_audio_16k): - return self.ssl.model(ref_audio_16k)["last_hidden_state"].transpose(1, 2) - - -def export(vits_path, gpt_path, project_name): - vits = VitsModel(vits_path) - gpt = T2SModel(gpt_path, vits) - gpt_sovits = GptSoVits(vits, gpt) - ssl = SSLModel() - ref_seq = torch.LongTensor([cleaned_text_to_sequence(["n", "i2", "h", "ao3", ",", "w", "o3", "sh", "i4", "b", "ai2", "y", "e4"])]) - text_seq = torch.LongTensor([cleaned_text_to_sequence(["w", "o3", "sh", "i4", "b", "ai2", "y", "e4", "w", "o3", "sh", "i4", "b", "ai2", "y", "e4", "w", 
"o3", "sh", "i4", "b", "ai2", "y", "e4"])]) - ref_bert = torch.randn((ref_seq.shape[1], 1024)).float() - text_bert = torch.randn((text_seq.shape[1], 1024)).float() - ref_audio = torch.randn((1, 48000 * 5)).float() - # ref_audio = torch.tensor([load_audio("rec.wav", 48000)]).float() - ref_audio_16k = torchaudio.functional.resample(ref_audio,48000,16000).float() - ref_audio_sr = torchaudio.functional.resample(ref_audio,48000,vits.hps.data.sampling_rate).float() - - try: - os.mkdir(f"onnx/{project_name}") - except: - pass - - ssl_content = ssl(ref_audio_16k).float() - - debug = False - - if debug: - a, b = gpt_sovits(ref_seq, text_seq, ref_bert, text_bert, ref_audio_sr, ssl_content, debug=debug) - soundfile.write("out1.wav", a.cpu().detach().numpy(), vits.hps.data.sampling_rate) - soundfile.write("out2.wav", b[0], vits.hps.data.sampling_rate) - return - - a = gpt_sovits(ref_seq, text_seq, ref_bert, text_bert, ref_audio_sr, ssl_content).detach().cpu().numpy() - - soundfile.write("out.wav", a, vits.hps.data.sampling_rate) - - gpt_sovits.export(ref_seq, text_seq, ref_bert, text_bert, ref_audio_sr, ssl_content, project_name) - - MoeVSConf = { - "Folder" : f"{project_name}", - "Name" : f"{project_name}", - "Type" : "GPT-SoVits", - "Rate" : vits.hps.data.sampling_rate, - "NumLayers": gpt.t2s_model.num_layers, - "EmbeddingDim": gpt.t2s_model.embedding_dim, - "Dict": "BasicDict", - "BertPath": "chinese-roberta-wwm-ext-large", - "Symbol": symbols, - "AddBlank": False - } - - MoeVSConfJson = json.dumps(MoeVSConf) - with open(f"onnx/{project_name}.json", 'w') as MoeVsConfFile: - json.dump(MoeVSConf, MoeVsConfFile, indent = 4) - - -if __name__ == "__main__": - try: - os.mkdir("onnx") - except: - pass - - gpt_path = "GPT_weights/nahida-e25.ckpt" - vits_path = "SoVITS_weights/nahida_e30_s3930.pth" - exp_path = "nahida" - export(vits_path, gpt_path, exp_path) - - # soundfile.write("out.wav", a, vits.hps.data.sampling_rate) \ No newline at end of file diff --git a/prepare_datasets/1-get-text.py b/prepare_datasets/1-get-text.py deleted file mode 100644 index 7af6c100eddf7dd2daafc15fe966e2843c55cb6a..0000000000000000000000000000000000000000 --- a/prepare_datasets/1-get-text.py +++ /dev/null @@ -1,146 +0,0 @@ -# -*- coding: utf-8 -*- - -import os - -inp_text = os.environ.get("inp_text") -inp_wav_dir = os.environ.get("inp_wav_dir") -exp_name = os.environ.get("exp_name") -i_part = os.environ.get("i_part") -all_parts = os.environ.get("all_parts") -os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("_CUDA_VISIBLE_DEVICES") -opt_dir = os.environ.get("opt_dir") -bert_pretrained_dir = os.environ.get("bert_pretrained_dir") -import torch -is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() -version = os.environ.get('version', None) -import sys, numpy as np, traceback, pdb -import os.path -from glob import glob -from tqdm import tqdm -from text.cleaner import clean_text -from transformers import AutoModelForMaskedLM, AutoTokenizer -import numpy as np -from tools.my_utils import clean_path - -# inp_text=sys.argv[1] -# inp_wav_dir=sys.argv[2] -# exp_name=sys.argv[3] -# i_part=sys.argv[4] -# all_parts=sys.argv[5] -# os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6]#i_gpu -# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name -# bert_pretrained_dir="/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large" - -from time import time as ttime -import shutil - - -def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path - 
dir=os.path.dirname(path) - name=os.path.basename(path) - # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part) - tmp_path="%s%s.pth"%(ttime(),i_part) - torch.save(fea,tmp_path) - shutil.move(tmp_path,"%s/%s"%(dir,name)) - - -txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part) -if os.path.exists(txt_path) == False: - bert_dir = "%s/3-bert" % (opt_dir) - os.makedirs(opt_dir, exist_ok=True) - os.makedirs(bert_dir, exist_ok=True) - if torch.cuda.is_available(): - device = "cuda:0" - # elif torch.backends.mps.is_available(): - # device = "mps" - else: - device = "cpu" - if os.path.exists(bert_pretrained_dir):... - else:raise FileNotFoundError(bert_pretrained_dir) - tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir) - bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir) - if is_half == True: - bert_model = bert_model.half().to(device) - else: - bert_model = bert_model.to(device) - - def get_bert_feature(text, word2ph): - with torch.no_grad(): - inputs = tokenizer(text, return_tensors="pt") - for i in inputs: - inputs[i] = inputs[i].to(device) - res = bert_model(**inputs, output_hidden_states=True) - res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1] - - assert len(word2ph) == len(text) - phone_level_feature = [] - for i in range(len(word2ph)): - repeat_feature = res[i].repeat(word2ph[i], 1) - phone_level_feature.append(repeat_feature) - - phone_level_feature = torch.cat(phone_level_feature, dim=0) - - return phone_level_feature.T - - def process(data, res): - for name, text, lan in data: - try: - name=clean_path(name) - name = os.path.basename(name) - print(name) - phones, word2ph, norm_text = clean_text( - text.replace("%", "-").replace("¥", ","), lan, version - ) - path_bert = "%s/%s.pt" % (bert_dir, name) - if os.path.exists(path_bert) == False and lan == "zh": - bert_feature = get_bert_feature(norm_text, word2ph) - assert bert_feature.shape[-1] == len(phones) - # torch.save(bert_feature, path_bert) - my_save(bert_feature, path_bert) - phones = " ".join(phones) - # res.append([name,phones]) - res.append([name, phones, word2ph, norm_text]) - except: - print(name, text, traceback.format_exc()) - - todo = [] - res = [] - with open(inp_text, "r", encoding="utf8") as f: - lines = f.read().strip("\n").split("\n") - - language_v1_to_language_v2 = { - "ZH": "zh", - "zh": "zh", - "JP": "ja", - "jp": "ja", - "JA": "ja", - "ja": "ja", - "EN": "en", - "en": "en", - "En": "en", - "KO": "ko", - "Ko": "ko", - "ko": "ko", - "yue": "yue", - "YUE": "yue", - "Yue": "yue", - } - for line in lines[int(i_part) :: int(all_parts)]: - try: - wav_name, spk_name, language, text = line.split("|") - # todo.append([name,text,"zh"]) - if language in language_v1_to_language_v2.keys(): - todo.append( - [wav_name, text, language_v1_to_language_v2.get(language, language)] - ) - else: - print(f"\033[33m[Waring] The {language = } of {wav_name} is not supported for training.\033[0m") - except: - print(line, traceback.format_exc()) - - process(todo, res) - opt = [] - for name, phones, word2ph, norm_text in res: - opt.append("%s\t%s\t%s\t%s" % (name, phones, word2ph, norm_text)) - with open(txt_path, "w", encoding="utf8") as f: - f.write("\n".join(opt) + "\n") diff --git a/prepare_datasets/2-get-hubert-wav32k.py b/prepare_datasets/2-get-hubert-wav32k.py deleted file mode 100644 index 82f3c69d08672d3b18589c51bb8a7a0448707116..0000000000000000000000000000000000000000 --- a/prepare_datasets/2-get-hubert-wav32k.py +++ /dev/null @@ -1,122 +0,0 @@ -# -*- coding: utf-8 -*- - -import sys,os -inp_text= 
os.environ.get("inp_text") -inp_wav_dir= os.environ.get("inp_wav_dir") -exp_name= os.environ.get("exp_name") -i_part= os.environ.get("i_part") -all_parts= os.environ.get("all_parts") -os.environ["CUDA_VISIBLE_DEVICES"]= os.environ.get("_CUDA_VISIBLE_DEVICES") -from feature_extractor import cnhubert -opt_dir= os.environ.get("opt_dir") -cnhubert.cnhubert_base_path= os.environ.get("cnhubert_base_dir") -import torch -is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() - -import pdb,traceback,numpy as np,logging -from scipy.io import wavfile -import librosa -now_dir = os.getcwd() -sys.path.append(now_dir) -from tools.my_utils import load_audio,clean_path - -# from config import cnhubert_base_path -# cnhubert.cnhubert_base_path=cnhubert_base_path -# inp_text=sys.argv[1] -# inp_wav_dir=sys.argv[2] -# exp_name=sys.argv[3] -# i_part=sys.argv[4] -# all_parts=sys.argv[5] -# os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6] -# cnhubert.cnhubert_base_path=sys.argv[7] -# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name - -from time import time as ttime -import shutil -def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path - dir=os.path.dirname(path) - name=os.path.basename(path) - # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part) - tmp_path="%s%s.pth"%(ttime(),i_part) - torch.save(fea,tmp_path) - shutil.move(tmp_path,"%s/%s"%(dir,name)) - -hubert_dir="%s/4-cnhubert"%(opt_dir) -wav32dir="%s/5-wav32k"%(opt_dir) -os.makedirs(opt_dir,exist_ok=True) -os.makedirs(hubert_dir,exist_ok=True) -os.makedirs(wav32dir,exist_ok=True) - -maxx=0.95 -alpha=0.5 -if torch.cuda.is_available(): - device = "cuda:0" -# elif torch.backends.mps.is_available(): -# device = "mps" -else: - device = "cpu" -model=cnhubert.get_model() -# is_half=False -if(is_half==True): - model=model.half().to(device) -else: - model = model.to(device) - -nan_fails=[] -def name2go(wav_name,wav_path): - hubert_path="%s/%s.pt"%(hubert_dir,wav_name) - if(os.path.exists(hubert_path)):return - tmp_audio = load_audio(wav_path, 32000) - tmp_max = np.abs(tmp_audio).max() - if tmp_max > 2.2: - print("%s-filtered,%s" % (wav_name, tmp_max)) - return - tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha*32768)) + ((1 - alpha)*32768) * tmp_audio - tmp_audio32b = (tmp_audio / tmp_max * (maxx * alpha*1145.14)) + ((1 - alpha)*1145.14) * tmp_audio - tmp_audio = librosa.resample( - tmp_audio32b, orig_sr=32000, target_sr=16000 - )#不是重采样问题 - tensor_wav16 = torch.from_numpy(tmp_audio) - if (is_half == True): - tensor_wav16=tensor_wav16.half().to(device) - else: - tensor_wav16 = tensor_wav16.to(device) - ssl=model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1,2).cpu()#torch.Size([1, 768, 215]) - if np.isnan(ssl.detach().numpy()).sum()!= 0: - nan_fails.append((wav_name,wav_path)) - print("nan filtered:%s"%wav_name) - return - wavfile.write( - "%s/%s"%(wav32dir,wav_name), - 32000, - tmp_audio32.astype("int16"), - ) - my_save(ssl,hubert_path) - -with open(inp_text,"r",encoding="utf8")as f: - lines=f.read().strip("\n").split("\n") - -for line in lines[int(i_part)::int(all_parts)]: - try: - # wav_name,text=line.split("\t") - wav_name, spk_name, language, text = line.split("|") - wav_name=clean_path(wav_name) - if (inp_wav_dir != "" and inp_wav_dir != None): - wav_name = os.path.basename(wav_name) - wav_path = "%s/%s"%(inp_wav_dir, wav_name) - - else: - wav_path=wav_name - wav_name = os.path.basename(wav_name) - name2go(wav_name,wav_path) - except: - print(line,traceback.format_exc()) - 
-if(len(nan_fails)>0 and is_half==True): - is_half=False - model=model.float() - for wav in nan_fails: - try: - name2go(wav[0],wav[1]) - except: - print(wav_name,traceback.format_exc()) diff --git a/prepare_datasets/3-get-semantic.py b/prepare_datasets/3-get-semantic.py deleted file mode 100644 index bbf7688b40ff9cb6148cde0be71129775b9287b5..0000000000000000000000000000000000000000 --- a/prepare_datasets/3-get-semantic.py +++ /dev/null @@ -1,101 +0,0 @@ -import os - -inp_text = os.environ.get("inp_text") -exp_name = os.environ.get("exp_name") -i_part = os.environ.get("i_part") -all_parts = os.environ.get("all_parts") -os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("_CUDA_VISIBLE_DEVICES") -opt_dir = os.environ.get("opt_dir") -pretrained_s2G = os.environ.get("pretrained_s2G") -s2config_path = os.environ.get("s2config_path") -version=os.environ.get("version","v2") -import torch -is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available() -import math, traceback -import multiprocessing -import sys, pdb - -now_dir = os.getcwd() -sys.path.append(now_dir) -from random import shuffle -import torch.multiprocessing as mp -from glob import glob -from tqdm import tqdm -import logging, librosa, utils -from module.models import SynthesizerTrn -from tools.my_utils import clean_path -logging.getLogger("numba").setLevel(logging.WARNING) -# from config import pretrained_s2G - -# inp_text=sys.argv[1] -# exp_name=sys.argv[2] -# i_part=sys.argv[3] -# all_parts=sys.argv[4] -# os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[5] -# opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name - -if os.path.exists(pretrained_s2G):... -else:raise FileNotFoundError(pretrained_s2G) - -hubert_dir = "%s/4-cnhubert" % (opt_dir) -semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part) -if os.path.exists(semantic_path) == False: - os.makedirs(opt_dir, exist_ok=True) - - if torch.cuda.is_available(): - device = "cuda" - # elif torch.backends.mps.is_available(): - # device = "mps" - else: - device = "cpu" - hps = utils.get_hparams_from_file(s2config_path) - vq_model = SynthesizerTrn( - hps.data.filter_length // 2 + 1, - hps.train.segment_size // hps.data.hop_length, - n_speakers=hps.data.n_speakers, - version=version, - **hps.model - ) - if is_half == True: - vq_model = vq_model.half().to(device) - else: - vq_model = vq_model.to(device) - vq_model.eval() - # utils.load_checkpoint(utils.latest_checkpoint_path(hps.s2_ckpt_dir, "G_*.pth"), vq_model, None, True) - # utils.load_checkpoint(pretrained_s2G, vq_model, None, True) - print( - vq_model.load_state_dict( - torch.load(pretrained_s2G, map_location="cpu")["weight"], strict=False - ) - ) - - def name2go(wav_name, lines): - hubert_path = "%s/%s.pt" % (hubert_dir, wav_name) - if os.path.exists(hubert_path) == False: - return - ssl_content = torch.load(hubert_path, map_location="cpu") - if is_half == True: - ssl_content = ssl_content.half().to(device) - else: - ssl_content = ssl_content.to(device) - codes = vq_model.extract_latent(ssl_content) - semantic = " ".join([str(i) for i in codes[0, 0, :].tolist()]) - lines.append("%s\t%s" % (wav_name, semantic)) - - with open(inp_text, "r", encoding="utf8") as f: - lines = f.read().strip("\n").split("\n") - - lines1 = [] - for line in lines[int(i_part) :: int(all_parts)]: - # print(line) - try: - # wav_name,text=line.split("\t") - wav_name, spk_name, language, text = line.split("|") - wav_name=clean_path(wav_name) - wav_name = os.path.basename(wav_name) - # name2go(name,lines1) - name2go(wav_name, 
lines1) - except: - print(line, traceback.format_exc()) - with open(semantic_path, "w", encoding="utf8") as f: - f.write("\n".join(lines1)) diff --git a/s1_train.py b/s1_train.py deleted file mode 100644 index 898ca549845f81ef4171c38a38f465402d95f18b..0000000000000000000000000000000000000000 --- a/s1_train.py +++ /dev/null @@ -1,183 +0,0 @@ -# modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/train_t2s.py -import os -import pdb - -if "_CUDA_VISIBLE_DEVICES" in os.environ: - os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"] -import argparse -import logging -from pathlib import Path - -import torch, platform -from pytorch_lightning import seed_everything -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import ModelCheckpoint -from pytorch_lightning.loggers import TensorBoardLogger # WandbLogger -from pytorch_lightning.strategies import DDPStrategy -from AR.data.data_module import Text2SemanticDataModule -from AR.models.t2s_lightning_module import Text2SemanticLightningModule -from AR.utils.io import load_yaml_config - -logging.getLogger("numba").setLevel(logging.WARNING) -logging.getLogger("matplotlib").setLevel(logging.WARNING) -torch.set_float32_matmul_precision("high") -from AR.utils import get_newest_ckpt - -from collections import OrderedDict -from time import time as ttime -import shutil -def my_save(fea,path):#####fix issue: torch.save doesn't support chinese path - dir=os.path.dirname(path) - name=os.path.basename(path) - tmp_path="%s.pth"%(ttime()) - torch.save(fea,tmp_path) - shutil.move(tmp_path,"%s/%s"%(dir,name)) - - -class my_model_ckpt(ModelCheckpoint): - def __init__( - self, - config, - if_save_latest, - if_save_every_weights, - half_weights_save_dir, - exp_name, - **kwargs - ): - super().__init__(**kwargs) - self.if_save_latest = if_save_latest - self.if_save_every_weights = if_save_every_weights - self.half_weights_save_dir = half_weights_save_dir - self.exp_name = exp_name - self.config = config - - def on_train_epoch_end(self, trainer, pl_module): - # if not self._should_skip_saving_checkpoint(trainer) and self._should_save_on_train_epoch_end(trainer): - if self._should_save_on_train_epoch_end(trainer): - monitor_candidates = self._monitor_candidates(trainer) - if ( - self._every_n_epochs >= 1 - and (trainer.current_epoch + 1) % self._every_n_epochs == 0 - ): - if ( - self.if_save_latest == True - ): ####如果设置只保存最后一个ckpt,在保存下一个ckpt后要清理掉之前的所有ckpt - to_clean = list(os.listdir(self.dirpath)) - self._save_topk_checkpoint(trainer, monitor_candidates) - if self.if_save_latest == True: - for name in to_clean: - try: - os.remove("%s/%s" % (self.dirpath, name)) - except: - pass - if self.if_save_every_weights == True: - to_save_od = OrderedDict() - to_save_od["weight"] = OrderedDict() - dictt = trainer.strategy._lightning_module.state_dict() - for key in dictt: - to_save_od["weight"][key] = dictt[key].half() - to_save_od["config"] = self.config - to_save_od["info"] = "GPT-e%s" % (trainer.current_epoch + 1) - # torch.save( - # print(os.environ) - if(os.environ.get("LOCAL_RANK","0")=="0"): - my_save( - to_save_od, - "%s/%s-e%s.ckpt" - % ( - self.half_weights_save_dir, - self.exp_name, - trainer.current_epoch + 1, - ), - ) - self._save_last_checkpoint(trainer, monitor_candidates) - - -def main(args): - config = load_yaml_config(args.config_file) - - output_dir = Path(config["output_dir"]) - output_dir.mkdir(parents=True, exist_ok=True) - - ckpt_dir = output_dir / "ckpt" - ckpt_dir.mkdir(parents=True, 
exist_ok=True) - - seed_everything(config["train"]["seed"], workers=True) - ckpt_callback: ModelCheckpoint = my_model_ckpt( - config=config, - if_save_latest=config["train"]["if_save_latest"], - if_save_every_weights=config["train"]["if_save_every_weights"], - half_weights_save_dir=config["train"]["half_weights_save_dir"], - exp_name=config["train"]["exp_name"], - save_top_k=-1, - monitor="top_3_acc", - mode="max", - save_on_train_epoch_end=True, - every_n_epochs=config["train"]["save_every_n_epoch"], - dirpath=ckpt_dir, - ) - logger = TensorBoardLogger(name=output_dir.stem, save_dir=output_dir) - os.environ["MASTER_ADDR"]="localhost" - trainer: Trainer = Trainer( - max_epochs=config["train"]["epochs"], - accelerator="gpu" if torch.cuda.is_available() else "cpu", - # val_check_interval=9999999999999999999999,###不要验证 - # check_val_every_n_epoch=None, - limit_val_batches=0, - devices=-1 if torch.cuda.is_available() else 1, - benchmark=False, - fast_dev_run=False, - strategy = DDPStrategy( - process_group_backend="nccl" if platform.system() != "Windows" else "gloo" - ) if torch.cuda.is_available() else "auto", - precision=config["train"]["precision"], - logger=logger, - num_sanity_val_steps=0, - callbacks=[ckpt_callback], - use_distributed_sampler=False, # 非常简单的修改,但解决了采用自定义的 bucket_sampler 下训练步数不一致的问题! - ) - - model: Text2SemanticLightningModule = Text2SemanticLightningModule( - config, output_dir - ) - - data_module: Text2SemanticDataModule = Text2SemanticDataModule( - config, - train_semantic_path=config["train_semantic_path"], - train_phoneme_path=config["train_phoneme_path"], - # dev_semantic_path=args.dev_semantic_path, - # dev_phoneme_path=args.dev_phoneme_path - ) - - try: - # 使用正则表达式匹配文件名中的数字部分,并按数字大小进行排序 - newest_ckpt_name = get_newest_ckpt(os.listdir(ckpt_dir)) - ckpt_path = ckpt_dir / newest_ckpt_name - except Exception: - ckpt_path = None - print("ckpt_path:", ckpt_path) - trainer.fit(model, data_module, ckpt_path=ckpt_path) - - -# srun --gpus-per-node=1 --ntasks-per-node=1 python train.py --path-to-configuration configurations/default.yaml -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-c", - "--config_file", - type=str, - default="configs/s1longer.yaml", - help="path of config file", - ) - # args for dataset - # parser.add_argument('--train_semantic_path',type=str,default='/data/docker/liujing04/gpt-vits/fine_tune_dataset/xuangou/6-name2semantic.tsv') - # parser.add_argument('--train_phoneme_path', type=str, default='/data/docker/liujing04/gpt-vits/fine_tune_dataset/xuangou/2-name2text.txt') - - # parser.add_argument('--dev_semantic_path', type=str, default='dump_mix/semantic_dev.tsv') - # parser.add_argument('--dev_phoneme_path', type=str, default='dump_mix/phoneme_dev.npy') - # parser.add_argument('--output_dir',type=str,default='/data/docker/liujing04/gpt-vits/fine_tune_dataset/xuangou/logs_s1',help='directory to save the results') - # parser.add_argument('--output_dir',type=str,default='/liujing04/gpt_logs/s1/xuangou_ft',help='directory to save the results') - - args = parser.parse_args() - logging.info(str(args)) - main(args) diff --git a/s2_train.py b/s2_train.py deleted file mode 100644 index f5de615fcdc93ab6f1a9ee91cb0320b404cfec0b..0000000000000000000000000000000000000000 --- a/s2_train.py +++ /dev/null @@ -1,601 +0,0 @@ -import warnings -warnings.filterwarnings("ignore") -import utils, os -hps = utils.get_hparams(stage=2) -os.environ["CUDA_VISIBLE_DEVICES"] = hps.train.gpu_numbers.replace("-", ",") -import torch -from 
torch.nn import functional as F -from torch.utils.data import DataLoader -from torch.utils.tensorboard import SummaryWriter -import torch.multiprocessing as mp -import torch.distributed as dist, traceback -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.cuda.amp import autocast, GradScaler -from tqdm import tqdm -import logging, traceback - -logging.getLogger("matplotlib").setLevel(logging.INFO) -logging.getLogger("h5py").setLevel(logging.INFO) -logging.getLogger("numba").setLevel(logging.INFO) -from random import randint -from module import commons - -from module.data_utils import ( - TextAudioSpeakerLoader, - TextAudioSpeakerCollate, - DistributedBucketSampler, -) -from module.models import ( - SynthesizerTrn, - MultiPeriodDiscriminator, -) -from module.losses import generator_loss, discriminator_loss, feature_loss, kl_loss -from module.mel_processing import mel_spectrogram_torch, spec_to_mel_torch -from process_ckpt import savee - -torch.backends.cudnn.benchmark = False -torch.backends.cudnn.deterministic = False -###反正A100fp32更快,那试试tf32吧 -torch.backends.cuda.matmul.allow_tf32 = True -torch.backends.cudnn.allow_tf32 = True -torch.set_float32_matmul_precision("medium") # 最低精度但最快(也就快一丁点),对于结果造成不了影响 -# from config import pretrained_s2G,pretrained_s2D -global_step = 0 - -device = "cpu" # cuda以外的设备,等mps优化后加入 - - -def main(): - - if torch.cuda.is_available(): - n_gpus = torch.cuda.device_count() - else: - n_gpus = 1 - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = str(randint(20000, 55555)) - - mp.spawn( - run, - nprocs=n_gpus, - args=( - n_gpus, - hps, - ), - ) - - -def run(rank, n_gpus, hps): - global global_step - if rank == 0: - logger = utils.get_logger(hps.data.exp_dir) - logger.info(hps) - # utils.check_git_hash(hps.s2_ckpt_dir) - writer = SummaryWriter(log_dir=hps.s2_ckpt_dir) - writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval")) - - dist.init_process_group( - backend = "gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl", - init_method="env://", - world_size=n_gpus, - rank=rank, - ) - torch.manual_seed(hps.train.seed) - if torch.cuda.is_available(): - torch.cuda.set_device(rank) - - train_dataset = TextAudioSpeakerLoader(hps.data) ######## - train_sampler = DistributedBucketSampler( - train_dataset, - hps.train.batch_size, - [ - 32, - 300, - 400, - 500, - 600, - 700, - 800, - 900, - 1000, - 1100, - 1200, - 1300, - 1400, - 1500, - 1600, - 1700, - 1800, - 1900, - ], - num_replicas=n_gpus, - rank=rank, - shuffle=True, - ) - collate_fn = TextAudioSpeakerCollate() - train_loader = DataLoader( - train_dataset, - num_workers=6, - shuffle=False, - pin_memory=True, - collate_fn=collate_fn, - batch_sampler=train_sampler, - persistent_workers=True, - prefetch_factor=4, - ) - # if rank == 0: - # eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data, val=True) - # eval_loader = DataLoader(eval_dataset, num_workers=0, shuffle=False, - # batch_size=1, pin_memory=True, - # drop_last=False, collate_fn=collate_fn) - - net_g = SynthesizerTrn( - hps.data.filter_length // 2 + 1, - hps.train.segment_size // hps.data.hop_length, - n_speakers=hps.data.n_speakers, - **hps.model, - ).cuda(rank) if torch.cuda.is_available() else SynthesizerTrn( - hps.data.filter_length // 2 + 1, - hps.train.segment_size // hps.data.hop_length, - n_speakers=hps.data.n_speakers, - **hps.model, - ).to(device) - - net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) if torch.cuda.is_available() else 
MultiPeriodDiscriminator(hps.model.use_spectral_norm).to(device) - for name, param in net_g.named_parameters(): - if not param.requires_grad: - print(name, "not requires_grad") - - te_p = list(map(id, net_g.enc_p.text_embedding.parameters())) - et_p = list(map(id, net_g.enc_p.encoder_text.parameters())) - mrte_p = list(map(id, net_g.enc_p.mrte.parameters())) - base_params = filter( - lambda p: id(p) not in te_p + et_p + mrte_p and p.requires_grad, - net_g.parameters(), - ) - - # te_p=net_g.enc_p.text_embedding.parameters() - # et_p=net_g.enc_p.encoder_text.parameters() - # mrte_p=net_g.enc_p.mrte.parameters() - - optim_g = torch.optim.AdamW( - # filter(lambda p: p.requires_grad, net_g.parameters()),###默认所有层lr一致 - [ - {"params": base_params, "lr": hps.train.learning_rate}, - { - "params": net_g.enc_p.text_embedding.parameters(), - "lr": hps.train.learning_rate * hps.train.text_low_lr_rate, - }, - { - "params": net_g.enc_p.encoder_text.parameters(), - "lr": hps.train.learning_rate * hps.train.text_low_lr_rate, - }, - { - "params": net_g.enc_p.mrte.parameters(), - "lr": hps.train.learning_rate * hps.train.text_low_lr_rate, - }, - ], - hps.train.learning_rate, - betas=hps.train.betas, - eps=hps.train.eps, - ) - optim_d = torch.optim.AdamW( - net_d.parameters(), - hps.train.learning_rate, - betas=hps.train.betas, - eps=hps.train.eps, - ) - if torch.cuda.is_available(): - net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True) - net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True) - else: - net_g = net_g.to(device) - net_d = net_d.to(device) - - try: # 如果能加载自动resume - _, _, _, epoch_str = utils.load_checkpoint( - utils.latest_checkpoint_path("%s/logs_s2" % hps.data.exp_dir, "D_*.pth"), - net_d, - optim_d, - ) # D多半加载没事 - if rank == 0: - logger.info("loaded D") - # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) - _, _, _, epoch_str = utils.load_checkpoint( - utils.latest_checkpoint_path("%s/logs_s2" % hps.data.exp_dir, "G_*.pth"), - net_g, - optim_g, - ) - global_step = (epoch_str - 1) * len(train_loader) - # epoch_str = 1 - # global_step = 0 - except: # 如果首次不能加载,加载pretrain - # traceback.print_exc() - epoch_str = 1 - global_step = 0 - if hps.train.pretrained_s2G != ""and hps.train.pretrained_s2G != None and os.path.exists(hps.train.pretrained_s2G): - if rank == 0: - logger.info("loaded pretrained %s" % hps.train.pretrained_s2G) - print( - net_g.module.load_state_dict( - torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], - strict=False, - ) if torch.cuda.is_available() else net_g.load_state_dict( - torch.load(hps.train.pretrained_s2G, map_location="cpu")["weight"], - strict=False, - ) - ) ##测试不加载优化器 - if hps.train.pretrained_s2D != ""and hps.train.pretrained_s2D != None and os.path.exists(hps.train.pretrained_s2D): - if rank == 0: - logger.info("loaded pretrained %s" % hps.train.pretrained_s2D) - print( - net_d.module.load_state_dict( - torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"] - ) if torch.cuda.is_available() else net_d.load_state_dict( - torch.load(hps.train.pretrained_s2D, map_location="cpu")["weight"] - ) - ) - - # scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) - # scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) - - scheduler_g = torch.optim.lr_scheduler.ExponentialLR( - optim_g, gamma=hps.train.lr_decay, last_epoch=-1 
- ) - scheduler_d = torch.optim.lr_scheduler.ExponentialLR( - optim_d, gamma=hps.train.lr_decay, last_epoch=-1 - ) - for _ in range(epoch_str): - scheduler_g.step() - scheduler_d.step() - - scaler = GradScaler(enabled=hps.train.fp16_run) - - for epoch in range(epoch_str, hps.train.epochs + 1): - if rank == 0: - train_and_evaluate( - rank, - epoch, - hps, - [net_g, net_d], - [optim_g, optim_d], - [scheduler_g, scheduler_d], - scaler, - # [train_loader, eval_loader], logger, [writer, writer_eval]) - [train_loader, None], - logger, - [writer, writer_eval], - ) - else: - train_and_evaluate( - rank, - epoch, - hps, - [net_g, net_d], - [optim_g, optim_d], - [scheduler_g, scheduler_d], - scaler, - [train_loader, None], - None, - None, - ) - scheduler_g.step() - scheduler_d.step() - - -def train_and_evaluate( - rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers -): - net_g, net_d = nets - optim_g, optim_d = optims - # scheduler_g, scheduler_d = schedulers - train_loader, eval_loader = loaders - if writers is not None: - writer, writer_eval = writers - - train_loader.batch_sampler.set_epoch(epoch) - global global_step - - net_g.train() - net_d.train() - for batch_idx, ( - ssl, - ssl_lengths, - spec, - spec_lengths, - y, - y_lengths, - text, - text_lengths, - ) in enumerate(tqdm(train_loader)): - if torch.cuda.is_available(): - spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda( - rank, non_blocking=True - ) - y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda( - rank, non_blocking=True - ) - ssl = ssl.cuda(rank, non_blocking=True) - ssl.requires_grad = False - # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True) - text, text_lengths = text.cuda(rank, non_blocking=True), text_lengths.cuda( - rank, non_blocking=True - ) - else: - spec, spec_lengths = spec.to(device), spec_lengths.to(device) - y, y_lengths = y.to(device), y_lengths.to(device) - ssl = ssl.to(device) - ssl.requires_grad = False - # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True) - text, text_lengths = text.to(device), text_lengths.to(device) - - with autocast(enabled=hps.train.fp16_run): - ( - y_hat, - kl_ssl, - ids_slice, - x_mask, - z_mask, - (z, z_p, m_p, logs_p, m_q, logs_q), - stats_ssl, - ) = net_g(ssl, spec, spec_lengths, text, text_lengths) - - mel = spec_to_mel_torch( - spec, - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.mel_fmin, - hps.data.mel_fmax, - ) - y_mel = commons.slice_segments( - mel, ids_slice, hps.train.segment_size // hps.data.hop_length - ) - y_hat_mel = mel_spectrogram_torch( - y_hat.squeeze(1), - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.hop_length, - hps.data.win_length, - hps.data.mel_fmin, - hps.data.mel_fmax, - ) - - y = commons.slice_segments( - y, ids_slice * hps.data.hop_length, hps.train.segment_size - ) # slice - - # Discriminator - y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) - with autocast(enabled=False): - loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( - y_d_hat_r, y_d_hat_g - ) - loss_disc_all = loss_disc - optim_d.zero_grad() - scaler.scale(loss_disc_all).backward() - scaler.unscale_(optim_d) - grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) - scaler.step(optim_d) - - with autocast(enabled=hps.train.fp16_run): - # Generator - y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) - with autocast(enabled=False): - loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel - loss_kl = kl_loss(z_p, 
logs_q, m_p, logs_p, z_mask) * hps.train.c_kl - - loss_fm = feature_loss(fmap_r, fmap_g) - loss_gen, losses_gen = generator_loss(y_d_hat_g) - loss_gen_all = loss_gen + loss_fm + loss_mel + kl_ssl * 1 + loss_kl - - optim_g.zero_grad() - scaler.scale(loss_gen_all).backward() - scaler.unscale_(optim_g) - grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) - scaler.step(optim_g) - scaler.update() - - if rank == 0: - if global_step % hps.train.log_interval == 0: - lr = optim_g.param_groups[0]["lr"] - losses = [loss_disc, loss_gen, loss_fm, loss_mel, kl_ssl, loss_kl] - logger.info( - "Train Epoch: {} [{:.0f}%]".format( - epoch, 100.0 * batch_idx / len(train_loader) - ) - ) - logger.info([x.item() for x in losses] + [global_step, lr]) - - scalar_dict = { - "loss/g/total": loss_gen_all, - "loss/d/total": loss_disc_all, - "learning_rate": lr, - "grad_norm_d": grad_norm_d, - "grad_norm_g": grad_norm_g, - } - scalar_dict.update( - { - "loss/g/fm": loss_fm, - "loss/g/mel": loss_mel, - "loss/g/kl_ssl": kl_ssl, - "loss/g/kl": loss_kl, - } - ) - - # scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) - # scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) - # scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) - image_dict = { - "slice/mel_org": utils.plot_spectrogram_to_numpy( - y_mel[0].data.cpu().numpy() - ), - "slice/mel_gen": utils.plot_spectrogram_to_numpy( - y_hat_mel[0].data.cpu().numpy() - ), - "all/mel": utils.plot_spectrogram_to_numpy( - mel[0].data.cpu().numpy() - ), - "all/stats_ssl": utils.plot_spectrogram_to_numpy( - stats_ssl[0].data.cpu().numpy() - ), - } - utils.summarize( - writer=writer, - global_step=global_step, - images=image_dict, - scalars=scalar_dict, - ) - global_step += 1 - if epoch % hps.train.save_every_epoch == 0 and rank == 0: - if hps.train.if_save_latest == 0: - utils.save_checkpoint( - net_g, - optim_g, - hps.train.learning_rate, - epoch, - os.path.join( - "%s/logs_s2" % hps.data.exp_dir, "G_{}.pth".format(global_step) - ), - ) - utils.save_checkpoint( - net_d, - optim_d, - hps.train.learning_rate, - epoch, - os.path.join( - "%s/logs_s2" % hps.data.exp_dir, "D_{}.pth".format(global_step) - ), - ) - else: - utils.save_checkpoint( - net_g, - optim_g, - hps.train.learning_rate, - epoch, - os.path.join( - "%s/logs_s2" % hps.data.exp_dir, "G_{}.pth".format(233333333333) - ), - ) - utils.save_checkpoint( - net_d, - optim_d, - hps.train.learning_rate, - epoch, - os.path.join( - "%s/logs_s2" % hps.data.exp_dir, "D_{}.pth".format(233333333333) - ), - ) - if rank == 0 and hps.train.if_save_every_weights == True: - if hasattr(net_g, "module"): - ckpt = net_g.module.state_dict() - else: - ckpt = net_g.state_dict() - logger.info( - "saving ckpt %s_e%s:%s" - % ( - hps.name, - epoch, - savee( - ckpt, - hps.name + "_e%s_s%s" % (epoch, global_step), - epoch, - global_step, - hps, - ), - ) - ) - - if rank == 0: - logger.info("====> Epoch: {}".format(epoch)) - - -def evaluate(hps, generator, eval_loader, writer_eval): - generator.eval() - image_dict = {} - audio_dict = {} - print("Evaluating ...") - with torch.no_grad(): - for batch_idx, ( - ssl, - ssl_lengths, - spec, - spec_lengths, - y, - y_lengths, - text, - text_lengths, - ) in enumerate(eval_loader): - print(111) - if torch.cuda.is_available(): - spec, spec_lengths = spec.cuda(), spec_lengths.cuda() - y, y_lengths = y.cuda(), y_lengths.cuda() - ssl = ssl.cuda() - text, text_lengths = text.cuda(), text_lengths.cuda() - else: 
- spec, spec_lengths = spec.to(device), spec_lengths.to(device) - y, y_lengths = y.to(device), y_lengths.to(device) - ssl = ssl.to(device) - text, text_lengths = text.to(device), text_lengths.to(device) - for test in [0, 1]: - y_hat, mask, *_ = generator.module.infer( - ssl, spec, spec_lengths, text, text_lengths, test=test - ) if torch.cuda.is_available() else generator.infer( - ssl, spec, spec_lengths, text, text_lengths, test=test - ) - y_hat_lengths = mask.sum([1, 2]).long() * hps.data.hop_length - - mel = spec_to_mel_torch( - spec, - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.mel_fmin, - hps.data.mel_fmax, - ) - y_hat_mel = mel_spectrogram_torch( - y_hat.squeeze(1).float(), - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.hop_length, - hps.data.win_length, - hps.data.mel_fmin, - hps.data.mel_fmax, - ) - image_dict.update( - { - f"gen/mel_{batch_idx}_{test}": utils.plot_spectrogram_to_numpy( - y_hat_mel[0].cpu().numpy() - ) - } - ) - audio_dict.update( - {f"gen/audio_{batch_idx}_{test}": y_hat[0, :, : y_hat_lengths[0]]} - ) - image_dict.update( - { - f"gt/mel_{batch_idx}": utils.plot_spectrogram_to_numpy( - mel[0].cpu().numpy() - ) - } - ) - audio_dict.update({f"gt/audio_{batch_idx}": y[0, :, : y_lengths[0]]}) - - # y_hat, mask, *_ = generator.module.infer(ssl, spec_lengths, speakers, y=None) - # audio_dict.update({ - # f"gen/audio_{batch_idx}_style_pred": y_hat[0, :, :] - # }) - - utils.summarize( - writer=writer_eval, - global_step=global_step, - images=image_dict, - audios=audio_dict, - audio_sampling_rate=hps.data.sampling_rate, - ) - generator.train() - - -if __name__ == "__main__": - main() diff --git a/tools/__init__.py b/tools/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/tools/asr/config.py b/tools/asr/config.py deleted file mode 100644 index 4b0d37ae64a583cf80b5c244339c3f20b27d1acd..0000000000000000000000000000000000000000 --- a/tools/asr/config.py +++ /dev/null @@ -1,33 +0,0 @@ -import os - -def check_fw_local_models(): - ''' - 启动时检查本地是否有 Faster Whisper 模型. 
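Check at startup whether Faster Whisper models are already present locally under tools/asr/models.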
- ''' - model_size_list = [ - "tiny", "tiny.en", - "base", "base.en", - "small", "small.en", - "medium", "medium.en", - "large", "large-v1", - "large-v2", "large-v3"] - for i, size in enumerate(model_size_list): - if os.path.exists(f'tools/asr/models/faster-whisper-{size}'): - model_size_list[i] = size + '-local' - return model_size_list - -asr_dict = { - "达摩 ASR (中文)": { - 'lang': ['zh','yue'], - 'size': ['large'], - 'path': 'funasr_asr.py', - 'precision': ['float32'] - }, - "Faster Whisper (多语种)": { - 'lang': ['auto', 'zh', 'en', 'ja', 'ko', 'yue'], - 'size': check_fw_local_models(), - 'path': 'fasterwhisper_asr.py', - 'precision': ['float32', 'float16', 'int8'] - }, -} - diff --git a/tools/asr/fasterwhisper_asr.py b/tools/asr/fasterwhisper_asr.py deleted file mode 100644 index da8eadfb10c3fe6e917c25c018703d460aae1564..0000000000000000000000000000000000000000 --- a/tools/asr/fasterwhisper_asr.py +++ /dev/null @@ -1,114 +0,0 @@ -import argparse -import os -import traceback - -os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" -os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" - -import torch -from faster_whisper import WhisperModel -from tqdm import tqdm - -from tools.asr.config import check_fw_local_models - -language_code_list = [ - "af", "am", "ar", "as", "az", - "ba", "be", "bg", "bn", "bo", - "br", "bs", "ca", "cs", "cy", - "da", "de", "el", "en", "es", - "et", "eu", "fa", "fi", "fo", - "fr", "gl", "gu", "ha", "haw", - "he", "hi", "hr", "ht", "hu", - "hy", "id", "is", "it", "ja", - "jw", "ka", "kk", "km", "kn", - "ko", "la", "lb", "ln", "lo", - "lt", "lv", "mg", "mi", "mk", - "ml", "mn", "mr", "ms", "mt", - "my", "ne", "nl", "nn", "no", - "oc", "pa", "pl", "ps", "pt", - "ro", "ru", "sa", "sd", "si", - "sk", "sl", "sn", "so", "sq", - "sr", "su", "sv", "sw", "ta", - "te", "tg", "th", "tk", "tl", - "tr", "tt", "uk", "ur", "uz", - "vi", "yi", "yo", "zh", "yue", - "auto"] - -def execute_asr(input_folder, output_folder, model_size, language, precision): - if '-local' in model_size: - model_size = model_size[:-6] - model_path = f'tools/asr/models/faster-whisper-{model_size}' - else: - model_path = model_size - if language == 'auto': - language = None #不设置语种由模型自动输出概率最高的语种 - print("loading faster whisper model:",model_size,model_path) - device = 'cuda' if torch.cuda.is_available() else 'cpu' - try: - model = WhisperModel(model_path, device=device, compute_type=precision) - except: - return print(traceback.format_exc()) - - input_file_names = os.listdir(input_folder) - input_file_names.sort() - - output = [] - output_file_name = os.path.basename(input_folder) - - for file_name in tqdm(input_file_names): - try: - file_path = os.path.join(input_folder, file_name) - segments, info = model.transcribe( - audio = file_path, - beam_size = 5, - vad_filter = True, - vad_parameters = dict(min_silence_duration_ms=700), - language = language) - text = '' - - if info.language == "zh": - print("检测为中文文本, 转 FunASR 处理") - if("only_asr"not in globals()): - from tools.asr.funasr_asr import \ - only_asr # #如果用英文就不需要导入下载模型 - text = only_asr(file_path) - - if text == '': - for segment in segments: - text += segment.text - output.append(f"{file_path}|{output_file_name}|{info.language.upper()}|{text}") - except: - print(traceback.format_exc()) - - output_folder = output_folder or "output/asr_opt" - os.makedirs(output_folder, exist_ok=True) - output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') - - with open(output_file_path, "w", encoding="utf-8") as f: - f.write("\n".join(output)) - print(f"ASR 
任务完成->标注文件路径: {output_file_path}\n") - return output_file_path - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input_folder", type=str, required=True, - help="Path to the folder containing WAV files.") - parser.add_argument("-o", "--output_folder", type=str, required=True, - help="Output folder to store transcriptions.") - parser.add_argument("-s", "--model_size", type=str, default='large-v3', - choices=check_fw_local_models(), - help="Model Size of Faster Whisper") - parser.add_argument("-l", "--language", type=str, default='ja', - choices=language_code_list, - help="Language of the audio files.") - parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32','int8'], - help="fp16, int8 or fp32") - - cmd = parser.parse_args() - output_file_path = execute_asr( - input_folder = cmd.input_folder, - output_folder = cmd.output_folder, - model_size = cmd.model_size, - language = cmd.language, - precision = cmd.precision, - ) diff --git a/tools/asr/funasr_asr.py b/tools/asr/funasr_asr.py deleted file mode 100644 index 11209ada3de924ca9afc2f84974b620ae0947adf..0000000000000000000000000000000000000000 --- a/tools/asr/funasr_asr.py +++ /dev/null @@ -1,91 +0,0 @@ -# -*- coding:utf-8 -*- - -import argparse -import os -import traceback -from tqdm import tqdm -# from funasr.utils import version_checker -# version_checker.check_for_update = lambda: None -from funasr import AutoModel - - -def only_asr(input_file): - try: - text = model.generate(input=input_file)[0]["text"] - except: - text = '' - print(traceback.format_exc()) - return text - -def execute_asr(input_folder, output_folder, model_size, language): - input_file_names = os.listdir(input_folder) - input_file_names.sort() - - output = [] - output_file_name = os.path.basename(input_folder) - - for file_name in tqdm(input_file_names): - try: - print(file_name) - file_path = os.path.join(input_folder, file_name) - text = model.generate(input=file_path)[0]["text"] - output.append(f"{file_path}|{output_file_name}|{language.upper()}|{text}") - except: - print(traceback.format_exc()) - - output_folder = output_folder or "output/asr_opt" - os.makedirs(output_folder, exist_ok=True) - output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') - - with open(output_file_path, "w", encoding="utf-8") as f: - f.write("\n".join(output)) - print(f"ASR 任务完成->标注文件路径: {output_file_path}\n") - return output_file_path - - -parser = argparse.ArgumentParser() -parser.add_argument("-i", "--input_folder", type=str, required=True, - help="Path to the folder containing WAV files.") -parser.add_argument("-o", "--output_folder", type=str, required=True, - help="Output folder to store transcriptions.") -parser.add_argument("-s", "--model_size", type=str, default='large', - help="Model Size of FunASR is Large") -parser.add_argument("-l", "--language", type=str, default='zh', choices=['zh','yue','auto'], - help="Language of the audio files.") -parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], - help="fp16 or fp32")#还没接入 - -cmd = parser.parse_args() - -path_vad = 'tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch' -path_punc = 'tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch' -path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch" -path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" 
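# The ASR / VAD / punctuation model paths above prefer a local copy under tools/asr/models
# and fall back to the corresponding ModelScope ("iic/...") model ids when none is found.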
-vad_model_revision=punc_model_revision="v2.0.4" - -if(cmd.language=="zh"): - path_asr = 'tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' - path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" - model_revision="v2.0.4" -else: - path_asr = 'tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online' - path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online" - model_revision="master" - path_vad=path_punc=vad_model_revision=punc_model_revision=None###友情提示:粤语带VAD识别可能会有少量shape不对报错的,但是不带VAD可以.不带vad只能分阶段单独加标点。不过标点模型对粤语效果真的不行… - -model = AutoModel( - model=path_asr, - model_revision=model_revision, - vad_model=path_vad, - vad_model_revision=vad_model_revision, - punc_model=path_punc, - punc_model_revision=punc_model_revision, -) - -if __name__ == '__main__': - execute_asr( - input_folder = cmd.input_folder, - output_folder = cmd.output_folder, - model_size = cmd.model_size, - language = cmd.language, - ) diff --git a/tools/asr/models/.gitignore b/tools/asr/models/.gitignore deleted file mode 100644 index c96a04f008ee21e260b28f7701595ed59e2839e3..0000000000000000000000000000000000000000 --- a/tools/asr/models/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore \ No newline at end of file diff --git a/tools/cmd-denoise.py b/tools/cmd-denoise.py deleted file mode 100644 index 1fdcab6dc1c8a3727d69faa96349b889b0d76d6d..0000000000000000000000000000000000000000 --- a/tools/cmd-denoise.py +++ /dev/null @@ -1,33 +0,0 @@ -import os,argparse -import traceback - -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks -from tqdm import tqdm - -path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k' -path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k" -ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise) -def execute_denoise(input_folder,output_folder): - os.makedirs(output_folder,exist_ok=True) - # print(input_folder) - # print(list(os.listdir(input_folder).sort())) - for name in tqdm(os.listdir(input_folder)): - try: - ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name)) - except: - traceback.print_exc() - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input_folder", type=str, required=True, - help="Path to the folder containing WAV files.") - parser.add_argument("-o", "--output_folder", type=str, required=True, - help="Output folder to store transcriptions.") - parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], - help="fp16 or fp32")#还没接入 - cmd = parser.parse_args() - execute_denoise( - input_folder = cmd.input_folder, - output_folder = cmd.output_folder, - ) \ No newline at end of file diff --git a/tools/denoise-model/.gitignore b/tools/denoise-model/.gitignore deleted file mode 100644 index d6b7ef32c8478a48c3994dcadc86837f4371184d..0000000000000000000000000000000000000000 --- a/tools/denoise-model/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/tools/slice_audio.py b/tools/slice_audio.py deleted file mode 100644 index 8a06292d993825ca49d57f1274865c029c0b2bb4..0000000000000000000000000000000000000000 --- a/tools/slice_audio.py +++ /dev/null @@ -1,48 +0,0 @@ -import os,sys,numpy as np -import traceback -from scipy.io import wavfile -# 
parent_directory = os.path.dirname(os.path.abspath(__file__)) -# sys.path.append(parent_directory) -from tools.my_utils import load_audio -from slicer2 import Slicer - -def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part): - os.makedirs(opt_root,exist_ok=True) - if os.path.isfile(inp): - input=[inp] - elif os.path.isdir(inp): - input=[os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))] - else: - return "输入路径存在但既不是文件也不是文件夹" - slicer = Slicer( - sr=32000, # 长音频采样率 - threshold= int(threshold), # 音量小于这个值视作静音的备选切割点 - min_length= int(min_length), # 每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值 - min_interval= int(min_interval), # 最短切割间隔 - hop_size= int(hop_size), # 怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好) - max_sil_kept= int(max_sil_kept), # 切完后静音最多留多长 - ) - _max=float(_max) - alpha=float(alpha) - for inp_path in input[int(i_part)::int(all_part)]: - # print(inp_path) - try: - name = os.path.basename(inp_path) - audio = load_audio(inp_path, 32000) - # print(audio.shape) - for chunk, start, end in slicer.slice(audio): # start和end是帧数 - tmp_max = np.abs(chunk).max() - if(tmp_max>1):chunk/=tmp_max - chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk - wavfile.write( - "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end), - 32000, - # chunk.astype(np.float32), - (chunk * 32767).astype(np.int16), - ) - except: - print(inp_path,"->fail->",traceback.format_exc()) - return "执行完毕,请检查输出文件" - -print(slice(*sys.argv[1:])) - diff --git a/tools/slicer2.py b/tools/slicer2.py deleted file mode 100644 index ba6794b6335fc50a494ba1b1cfb375536ab7a1aa..0000000000000000000000000000000000000000 --- a/tools/slicer2.py +++ /dev/null @@ -1,261 +0,0 @@ -import numpy as np - - -# This function is obtained from librosa. -def get_rms( - y, - frame_length=2048, - hop_length=512, - pad_mode="constant", -): - padding = (int(frame_length // 2), int(frame_length // 2)) - y = np.pad(y, padding, mode=pad_mode) - - axis = -1 - # put our new within-frame axis at the end for now - out_strides = y.strides + tuple([y.strides[axis]]) - # Reduce the shape on the framing axis - x_shape_trimmed = list(y.shape) - x_shape_trimmed[axis] -= frame_length - 1 - out_shape = tuple(x_shape_trimmed) + tuple([frame_length]) - xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides) - if axis < 0: - target_axis = axis - 1 - else: - target_axis = axis + 1 - xw = np.moveaxis(xw, -1, target_axis) - # Downsample along the target axis - slices = [slice(None)] * xw.ndim - slices[axis] = slice(0, None, hop_length) - x = xw[tuple(slices)] - - # Calculate power - power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True) - - return np.sqrt(power) - - -class Slicer: - def __init__( - self, - sr: int, - threshold: float = -40.0, - min_length: int = 5000, - min_interval: int = 300, - hop_size: int = 20, - max_sil_kept: int = 5000, - ): - if not min_length >= min_interval >= hop_size: - raise ValueError( - "The following condition must be satisfied: min_length >= min_interval >= hop_size" - ) - if not max_sil_kept >= hop_size: - raise ValueError( - "The following condition must be satisfied: max_sil_kept >= hop_size" - ) - min_interval = sr * min_interval / 1000 - self.threshold = 10 ** (threshold / 20.0) - self.hop_size = round(sr * hop_size / 1000) - self.win_size = min(round(min_interval), 4 * self.hop_size) - self.min_length = round(sr * min_length / 1000 / self.hop_size) - self.min_interval = round(min_interval / self.hop_size) - self.max_sil_kept = round(sr * max_sil_kept / 1000 / 
self.hop_size) - - def _apply_slice(self, waveform, begin, end): - if len(waveform.shape) > 1: - return waveform[ - :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size) - ] - else: - return waveform[ - begin * self.hop_size : min(waveform.shape[0], end * self.hop_size) - ] - - # @timeit - def slice(self, waveform): - if len(waveform.shape) > 1: - samples = waveform.mean(axis=0) - else: - samples = waveform - if samples.shape[0] <= self.min_length: - return [waveform] - rms_list = get_rms( - y=samples, frame_length=self.win_size, hop_length=self.hop_size - ).squeeze(0) - sil_tags = [] - silence_start = None - clip_start = 0 - for i, rms in enumerate(rms_list): - # Keep looping while frame is silent. - if rms < self.threshold: - # Record start of silent frames. - if silence_start is None: - silence_start = i - continue - # Keep looping while frame is not silent and silence start has not been recorded. - if silence_start is None: - continue - # Clear recorded silence start if interval is not enough or clip is too short - is_leading_silence = silence_start == 0 and i > self.max_sil_kept - need_slice_middle = ( - i - silence_start >= self.min_interval - and i - clip_start >= self.min_length - ) - if not is_leading_silence and not need_slice_middle: - silence_start = None - continue - # Need slicing. Record the range of silent frames to be removed. - if i - silence_start <= self.max_sil_kept: - pos = rms_list[silence_start : i + 1].argmin() + silence_start - if silence_start == 0: - sil_tags.append((0, pos)) - else: - sil_tags.append((pos, pos)) - clip_start = pos - elif i - silence_start <= self.max_sil_kept * 2: - pos = rms_list[ - i - self.max_sil_kept : silence_start + self.max_sil_kept + 1 - ].argmin() - pos += i - self.max_sil_kept - pos_l = ( - rms_list[ - silence_start : silence_start + self.max_sil_kept + 1 - ].argmin() - + silence_start - ) - pos_r = ( - rms_list[i - self.max_sil_kept : i + 1].argmin() - + i - - self.max_sil_kept - ) - if silence_start == 0: - sil_tags.append((0, pos_r)) - clip_start = pos_r - else: - sil_tags.append((min(pos_l, pos), max(pos_r, pos))) - clip_start = max(pos_r, pos) - else: - pos_l = ( - rms_list[ - silence_start : silence_start + self.max_sil_kept + 1 - ].argmin() - + silence_start - ) - pos_r = ( - rms_list[i - self.max_sil_kept : i + 1].argmin() - + i - - self.max_sil_kept - ) - if silence_start == 0: - sil_tags.append((0, pos_r)) - else: - sil_tags.append((pos_l, pos_r)) - clip_start = pos_r - silence_start = None - # Deal with trailing silence. - total_frames = rms_list.shape[0] - if ( - silence_start is not None - and total_frames - silence_start >= self.min_interval - ): - silence_end = min(total_frames, silence_start + self.max_sil_kept) - pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start - sil_tags.append((pos, total_frames + 1)) - # Apply and return slices. 
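        # Each returned element is a triple [audio_chunk, start_sample, end_sample];
        # sil_tags holds hop-sized frame indices, so they are converted to sample
        # positions below by multiplying with self.hop_size.
        # (The comment below, "####音频+起始时间+终止时间", means "audio + start time + end time".)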
- ####音频+起始时间+终止时间 - if len(sil_tags) == 0: - return [[waveform,0,int(total_frames*self.hop_size)]] - else: - chunks = [] - if sil_tags[0][0] > 0: - chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]),0,int(sil_tags[0][0]*self.hop_size)]) - for i in range(len(sil_tags) - 1): - chunks.append( - [self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]),int(sil_tags[i][1]*self.hop_size),int(sil_tags[i + 1][0]*self.hop_size)] - ) - if sil_tags[-1][1] < total_frames: - chunks.append( - [self._apply_slice(waveform, sil_tags[-1][1], total_frames),int(sil_tags[-1][1]*self.hop_size),int(total_frames*self.hop_size)] - ) - return chunks - - -def main(): - import os.path - from argparse import ArgumentParser - - import librosa - import soundfile - - parser = ArgumentParser() - parser.add_argument("audio", type=str, help="The audio to be sliced") - parser.add_argument( - "--out", type=str, help="Output directory of the sliced audio clips" - ) - parser.add_argument( - "--db_thresh", - type=float, - required=False, - default=-40, - help="The dB threshold for silence detection", - ) - parser.add_argument( - "--min_length", - type=int, - required=False, - default=5000, - help="The minimum milliseconds required for each sliced audio clip", - ) - parser.add_argument( - "--min_interval", - type=int, - required=False, - default=300, - help="The minimum milliseconds for a silence part to be sliced", - ) - parser.add_argument( - "--hop_size", - type=int, - required=False, - default=10, - help="Frame length in milliseconds", - ) - parser.add_argument( - "--max_sil_kept", - type=int, - required=False, - default=500, - help="The maximum silence length kept around the sliced clip, presented in milliseconds", - ) - args = parser.parse_args() - out = args.out - if out is None: - out = os.path.dirname(os.path.abspath(args.audio)) - audio, sr = librosa.load(args.audio, sr=None, mono=False) - slicer = Slicer( - sr=sr, - threshold=args.db_thresh, - min_length=args.min_length, - min_interval=args.min_interval, - hop_size=args.hop_size, - max_sil_kept=args.max_sil_kept, - ) - chunks = slicer.slice(audio) - if not os.path.exists(out): - os.makedirs(out) - for i, chunk in enumerate(chunks): - if len(chunk.shape) > 1: - chunk = chunk.T - soundfile.write( - os.path.join( - out, - f"%s_%d.wav" - % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), - ), - chunk, - sr, - ) - - -if __name__ == "__main__": - main() diff --git a/tools/subfix_webui.py b/tools/subfix_webui.py deleted file mode 100644 index d6624d03601bbfd6b1c4b2c3627b777c6e59cf27..0000000000000000000000000000000000000000 --- a/tools/subfix_webui.py +++ /dev/null @@ -1,498 +0,0 @@ -import argparse,os -import copy -import json -import os -import uuid - -import librosa -import gradio as gr -import numpy as np -import soundfile - -g_json_key_text = "" -g_json_key_path = "" -g_load_file = "" -g_load_format = "" - -g_max_json_index = 0 -g_index = 0 -g_batch = 10 -g_text_list = [] -g_audio_list = [] -g_checkbox_list = [] -g_data_json = [] - - -def reload_data(index, batch): - global g_index - g_index = index - global g_batch - g_batch = batch - datas = g_data_json[index:index+batch] - output = [] - for d in datas: - output.append( - { - g_json_key_text: d[g_json_key_text], - g_json_key_path: d[g_json_key_path] - } - ) - return output - - -def b_change_index(index, batch): - global g_index, g_batch - g_index, g_batch = index, batch - datas = reload_data(index, batch) - output = [] - for i , _ in enumerate(datas): - output.append( - # gr.Textbox( - # 
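            # The dicts built here are the raw form of gr.update() in Gradio 3.x:
            # returning {"__type__": "update", ...} updates the matching output
            # component's label/value in place instead of recreating the component
            # (the commented-out gr.Textbox(...) lines show the equivalent component-returning style).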
label=f"Text {i+index}", - # value=_[g_json_key_text]#text - # ) - { - "__type__":"update", - "label":f"Text {i+index}", - "value":_[g_json_key_text] - } - ) - for _ in range(g_batch - len(datas)): - output.append( - # gr.Textbox( - # label=f"Text", - # value="" - # ) - { - "__type__": "update", - "label": f"Text", - "value": "" - } - ) - for _ in datas: - output.append(_[g_json_key_path]) - for _ in range(g_batch - len(datas)): - output.append(None) - for _ in range(g_batch): - output.append(False) - return output - - -def b_next_index(index, batch): - b_save_file() - if (index + batch) <= g_max_json_index: - return index + batch , *b_change_index(index + batch, batch) - else: - return index, *b_change_index(index, batch) - - -def b_previous_index(index, batch): - b_save_file() - if (index - batch) >= 0: - return index - batch , *b_change_index(index - batch, batch) - else: - return 0, *b_change_index(0, batch) - - -def b_submit_change(*text_list): - global g_data_json - change = False - for i, new_text in enumerate(text_list): - if g_index + i <= g_max_json_index: - new_text = new_text.strip()+' ' - if (g_data_json[g_index + i][g_json_key_text] != new_text): - g_data_json[g_index + i][g_json_key_text] = new_text - change = True - if change: - b_save_file() - return g_index, *b_change_index(g_index, g_batch) - - -def b_delete_audio(*checkbox_list): - global g_data_json, g_index, g_max_json_index - b_save_file() - change = False - for i, checkbox in reversed(list(enumerate(checkbox_list))): - if g_index + i < len(g_data_json): - if (checkbox == True): - g_data_json.pop(g_index + i) - change = True - - g_max_json_index = len(g_data_json)-1 - if g_index > g_max_json_index: - g_index = g_max_json_index - g_index = g_index if g_index >= 0 else 0 - if change: - b_save_file() - # return gr.Slider(value=g_index, maximum=(g_max_json_index if g_max_json_index>=0 else 0)), *b_change_index(g_index, g_batch) - return {"value":g_index,"__type__":"update","maximum":(g_max_json_index if g_max_json_index>=0 else 0)},*b_change_index(g_index, g_batch) - - -def b_invert_selection(*checkbox_list): - new_list = [not item if item is True else True for item in checkbox_list] - return new_list - - -def get_next_path(filename): - base_dir = os.path.dirname(filename) - base_name = os.path.splitext(os.path.basename(filename))[0] - for i in range(100): - new_path = os.path.join(base_dir, f"{base_name}_{str(i).zfill(2)}.wav") - if not os.path.exists(new_path) : - return new_path - return os.path.join(base_dir, f'{str(uuid.uuid4())}.wav') - - -def b_audio_split(audio_breakpoint, *checkbox_list): - global g_data_json , g_max_json_index - checked_index = [] - for i, checkbox in enumerate(checkbox_list): - if (checkbox == True and g_index+i < len(g_data_json)): - checked_index.append(g_index + i) - if len(checked_index) == 1 : - index = checked_index[0] - audio_json = copy.deepcopy(g_data_json[index]) - path = audio_json[g_json_key_path] - data, sample_rate = librosa.load(path, sr=None, mono=True) - audio_maxframe = len(data) - break_frame = int(audio_breakpoint * sample_rate) - - if (break_frame >= 1 and break_frame < audio_maxframe): - audio_first = data[0:break_frame] - audio_second = data[break_frame:] - nextpath = get_next_path(path) - soundfile.write(nextpath, audio_second, sample_rate) - soundfile.write(path, audio_first, sample_rate) - g_data_json.insert(index + 1, audio_json) - g_data_json[index + 1][g_json_key_path] = nextpath - b_save_file() - - g_max_json_index = len(g_data_json) - 1 - # return 
gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch) - return {"value":g_index,"maximum":g_max_json_index,"__type__":"update"}, *b_change_index(g_index, g_batch) - -def b_merge_audio(interval_r, *checkbox_list): - global g_data_json , g_max_json_index - b_save_file() - checked_index = [] - audios_path = [] - audios_text = [] - for i, checkbox in enumerate(checkbox_list): - if (checkbox == True and g_index+i < len(g_data_json)): - checked_index.append(g_index + i) - - if (len(checked_index)>1): - for i in checked_index: - audios_path.append(g_data_json[i][g_json_key_path]) - audios_text.append(g_data_json[i][g_json_key_text]) - for i in reversed(checked_index[1:]): - g_data_json.pop(i) - - base_index = checked_index[0] - base_path = audios_path[0] - g_data_json[base_index][g_json_key_text] = "".join(audios_text) - - audio_list = [] - l_sample_rate = None - for i, path in enumerate(audios_path): - data, sample_rate = librosa.load(path, sr=l_sample_rate, mono=True) - l_sample_rate = sample_rate - if (i > 0): - silence = np.zeros(int(l_sample_rate * interval_r)) - audio_list.append(silence) - - audio_list.append(data) - - audio_concat = np.concatenate(audio_list) - - soundfile.write(base_path, audio_concat, l_sample_rate) - - b_save_file() - - g_max_json_index = len(g_data_json) - 1 - - # return gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch) - return {"value":g_index,"maximum":g_max_json_index,"__type__":"update"}, *b_change_index(g_index, g_batch) - - -def b_save_json(): - with open(g_load_file,'w', encoding="utf-8") as file: - for data in g_data_json: - file.write(f'{json.dumps(data, ensure_ascii = False)}\n') - - -def b_save_list(): - with open(g_load_file,'w', encoding="utf-8") as file: - for data in g_data_json: - wav_path = data["wav_path"] - speaker_name = data["speaker_name"] - language = data["language"] - text = data["text"] - file.write(f"{wav_path}|{speaker_name}|{language}|{text}".strip()+'\n') - - -def b_load_json(): - global g_data_json, g_max_json_index - with open(g_load_file, 'r', encoding="utf-8") as file: - g_data_json = file.readlines() - g_data_json = [json.loads(line) for line in g_data_json] - g_max_json_index = len(g_data_json) - 1 - - -def b_load_list(): - global g_data_json, g_max_json_index - with open(g_load_file, 'r', encoding="utf-8") as source: - data_list = source.readlines() - for _ in data_list: - data = _.split('|') - if (len(data) == 4): - wav_path, speaker_name, language, text = data - g_data_json.append( - { - 'wav_path':wav_path, - 'speaker_name':speaker_name, - 'language':language, - 'text':text.strip() - } - ) - else: - print("error line:", data) - g_max_json_index = len(g_data_json) - 1 - - -def b_save_file(): - if g_load_format == "json": - b_save_json() - elif g_load_format == "list": - b_save_list() - - -def b_load_file(): - if g_load_format == "json": - b_load_json() - elif g_load_format == "list": - b_load_list() - - -def set_global(load_json, load_list, json_key_text, json_key_path, batch): - global g_json_key_text, g_json_key_path, g_load_file, g_load_format, g_batch - - g_batch = int(batch) - - if (load_json != "None"): - g_load_format = "json" - g_load_file = load_json - elif (load_list != "None"): - g_load_format = "list" - g_load_file = load_list - else: - g_load_format = "list" - g_load_file = "demo.list" - - g_json_key_text = json_key_text - g_json_key_path = json_key_path - - b_load_file() - - -if __name__ == "__main__": - parser = 
argparse.ArgumentParser(description='Process some integers.') - parser.add_argument('--load_json', default="None", help='source file, like demo.json') - parser.add_argument('--is_share', default="False", help='whether webui is_share=True') - parser.add_argument('--load_list', default="None", help='source file, like demo.list') - parser.add_argument('--webui_port_subfix', default=9871, help='source file, like demo.list') - parser.add_argument('--json_key_text', default="text", help='the text key name in json, Default: text') - parser.add_argument('--json_key_path', default="wav_path", help='the path key name in json, Default: wav_path') - parser.add_argument('--g_batch', default=10, help='max number g_batch wav to display, Default: 10') - - args = parser.parse_args() - - set_global(args.load_json, args.load_list, args.json_key_text, args.json_key_path, args.g_batch) - - with gr.Blocks() as demo: - - with gr.Row(): - btn_change_index = gr.Button("Change Index") - btn_submit_change = gr.Button("Submit Text") - btn_merge_audio = gr.Button("Merge Audio") - btn_delete_audio = gr.Button("Delete Audio") - btn_previous_index = gr.Button("Previous Index") - btn_next_index = gr.Button("Next Index") - - with gr.Row(): - index_slider = gr.Slider( - minimum=0, maximum=g_max_json_index, value=g_index, step=1, label="Index", scale=3 - ) - splitpoint_slider = gr.Slider( - minimum=0, maximum=120.0, value=0, step=0.1, label="Audio Split Point(s)", scale=3 - ) - btn_audio_split = gr.Button("Split Audio", scale=1) - btn_save_json = gr.Button("Save File", visible=True, scale=1) - btn_invert_selection = gr.Button("Invert Selection", scale=1) - - with gr.Row(): - with gr.Column(): - for _ in range(0,g_batch): - with gr.Row(): - text = gr.Textbox( - label = "Text", - visible = True, - scale=5 - ) - audio_output = gr.Audio( - label="Output Audio", - visible = True, - scale=5 - ) - audio_check = gr.Checkbox( - label="Yes", - show_label = True, - info = "Choose Audio", - scale=1 - ) - g_text_list.append(text) - g_audio_list.append(audio_output) - g_checkbox_list.append(audio_check) - - - - with gr.Row(): - batchsize_slider = gr.Slider( - minimum=1, maximum=g_batch, value=g_batch, step=1, label="Batch Size", scale=3, interactive=False - ) - interval_slider = gr.Slider( - minimum=0, maximum=2, value=0, step=0.01, label="Interval", scale=3 - ) - btn_theme_dark = gr.Button("Light Theme", link="?__theme=light", scale=1) - btn_theme_light = gr.Button("Dark Theme", link="?__theme=dark", scale=1) - - btn_change_index.click( - b_change_index, - inputs=[ - index_slider, - batchsize_slider, - ], - outputs=[ - *g_text_list, - *g_audio_list, - *g_checkbox_list - ], - ) - - - btn_submit_change.click( - b_submit_change, - inputs=[ - *g_text_list, - ], - outputs=[ - index_slider, - *g_text_list, - *g_audio_list, - *g_checkbox_list - ], - ) - - btn_previous_index.click( - b_previous_index, - inputs=[ - index_slider, - batchsize_slider, - ], - outputs=[ - index_slider, - *g_text_list, - *g_audio_list, - *g_checkbox_list - ], - ) - - btn_next_index.click( - b_next_index, - inputs=[ - index_slider, - batchsize_slider, - ], - outputs=[ - index_slider, - *g_text_list, - *g_audio_list, - *g_checkbox_list - ], - ) - - btn_delete_audio.click( - b_delete_audio, - inputs=[ - *g_checkbox_list - ], - outputs=[ - index_slider, - *g_text_list, - *g_audio_list, - *g_checkbox_list - ] - ) - - btn_merge_audio.click( - b_merge_audio, - inputs=[ - interval_slider, - *g_checkbox_list - ], - outputs=[ - index_slider, - *g_text_list, - *g_audio_list, - 
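                # (Wiring note: every handler returns a slider/index update followed by
                #  g_batch textbox updates, g_batch audio paths and g_batch checkbox
                #  values, so each outputs list is flattened in exactly that order.)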
*g_checkbox_list - ] - ) - - btn_audio_split.click( - b_audio_split, - inputs=[ - splitpoint_slider, - *g_checkbox_list - ], - outputs=[ - index_slider, - *g_text_list, - *g_audio_list, - *g_checkbox_list - ] - ) - - btn_invert_selection.click( - b_invert_selection, - inputs=[ - *g_checkbox_list - ], - outputs=[ - *g_checkbox_list - ] - ) - - btn_save_json.click( - b_save_file - ) - - demo.load( - b_change_index, - inputs=[ - index_slider, - batchsize_slider, - ], - outputs=[ - *g_text_list, - *g_audio_list, - *g_checkbox_list - ], - ) - - demo.launch( - server_name="0.0.0.0", - inbrowser=True, - quiet=True, - share=eval(args.is_share), - server_port=int(args.webui_port_subfix) - ) \ No newline at end of file diff --git a/tools/uvr5/bs_roformer/__init__.py b/tools/uvr5/bs_roformer/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/tools/uvr5/bs_roformer/attend.py b/tools/uvr5/bs_roformer/attend.py deleted file mode 100644 index 34476c181629652e10ca866679abbbe4868927e6..0000000000000000000000000000000000000000 --- a/tools/uvr5/bs_roformer/attend.py +++ /dev/null @@ -1,120 +0,0 @@ -from functools import wraps -from packaging import version -from collections import namedtuple - -import torch -from torch import nn, einsum -import torch.nn.functional as F - -from einops import rearrange, reduce - -# constants - -FlashAttentionConfig = namedtuple('FlashAttentionConfig', ['enable_flash', 'enable_math', 'enable_mem_efficient']) - -# helpers - -def exists(val): - return val is not None - -def default(v, d): - return v if exists(v) else d - -def once(fn): - called = False - @wraps(fn) - def inner(x): - nonlocal called - if called: - return - called = True - return fn(x) - return inner - -print_once = once(print) - -# main class - -class Attend(nn.Module): - def __init__( - self, - dropout = 0., - flash = False, - scale = None - ): - super().__init__() - self.scale = scale - self.dropout = dropout - self.attn_dropout = nn.Dropout(dropout) - - self.flash = flash - assert not (flash and version.parse(torch.__version__) < version.parse('2.0.0')), 'in order to use flash attention, you must be using pytorch 2.0 or above' - - # determine efficient attention configs for cuda and cpu - - self.cpu_config = FlashAttentionConfig(True, True, True) - self.cuda_config = None - - if not torch.cuda.is_available() or not flash: - return - - device_properties = torch.cuda.get_device_properties(torch.device('cuda')) - - if device_properties.major == 8 and device_properties.minor == 0: - print_once('A100 GPU detected, using flash attention if input tensor is on cuda') - self.cuda_config = FlashAttentionConfig(True, False, False) - else: - print_once('Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda') - self.cuda_config = FlashAttentionConfig(False, True, True) - - def flash_attn(self, q, k, v): - _, heads, q_len, _, k_len, is_cuda, device = *q.shape, k.shape[-2], q.is_cuda, q.device - - if exists(self.scale): - default_scale = q.shape[-1] ** -0.5 - q = q * (self.scale / default_scale) - - # Check if there is a compatible device for flash attention - - config = self.cuda_config if is_cuda else self.cpu_config - - # pytorch 2.0 flash attn: q, k, v, mask, dropout, softmax_scale - - with torch.backends.cuda.sdp_kernel(**config._asdict()): - out = F.scaled_dot_product_attention( - q, k, v, - dropout_p = self.dropout if self.training else 0. 
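                # Note: a custom scale is already folded into q above (q *= self.scale / default_scale),
                # since the kernel applies the default 1/sqrt(d) scaling internally; the surrounding
                # sdp_kernel context restricts execution to the flash / math / memory-efficient
                # backends enabled in the chosen FlashAttentionConfig.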
- ) - - return out - - def forward(self, q, k, v): - """ - einstein notation - b - batch - h - heads - n, i, j - sequence length (base sequence length, source, target) - d - feature dimension - """ - - q_len, k_len, device = q.shape[-2], k.shape[-2], q.device - - scale = default(self.scale, q.shape[-1] ** -0.5) - - if self.flash: - return self.flash_attn(q, k, v) - - # similarity - - sim = einsum(f"b h i d, b h j d -> b h i j", q, k) * scale - - # attention - - attn = sim.softmax(dim=-1) - attn = self.attn_dropout(attn) - - # aggregate values - - out = einsum(f"b h i j, b h j d -> b h i d", attn, v) - - return out diff --git a/tools/uvr5/bs_roformer/bs_roformer.py b/tools/uvr5/bs_roformer/bs_roformer.py deleted file mode 100644 index 88af3caa06369f2b815fd6cea532f8ba6e974aa2..0000000000000000000000000000000000000000 --- a/tools/uvr5/bs_roformer/bs_roformer.py +++ /dev/null @@ -1,583 +0,0 @@ -from functools import partial - -import torch -from torch import nn, einsum, Tensor -from torch.nn import Module, ModuleList -import torch.nn.functional as F - -from bs_roformer.attend import Attend - -from typing import Tuple, Optional, List, Callable -# from beartype.typing import Tuple, Optional, List, Callable -# from beartype import beartype - -from rotary_embedding_torch import RotaryEmbedding - -from einops import rearrange, pack, unpack -from einops.layers.torch import Rearrange - -# helper functions - -def exists(val): - return val is not None - - -def default(v, d): - return v if exists(v) else d - - -def pack_one(t, pattern): - return pack([t], pattern) - - -def unpack_one(t, ps, pattern): - return unpack(t, ps, pattern)[0] - - -# norm - -def l2norm(t): - return F.normalize(t, dim = -1, p = 2) - - -class RMSNorm(Module): - def __init__(self, dim): - super().__init__() - self.scale = dim ** 0.5 - self.gamma = nn.Parameter(torch.ones(dim)) - - def forward(self, x): - return F.normalize(x, dim=-1) * self.scale * self.gamma - - -# attention - -class FeedForward(Module): - def __init__( - self, - dim, - mult=4, - dropout=0. - ): - super().__init__() - dim_inner = int(dim * mult) - self.net = nn.Sequential( - RMSNorm(dim), - nn.Linear(dim, dim_inner), - nn.GELU(), - nn.Dropout(dropout), - nn.Linear(dim_inner, dim), - nn.Dropout(dropout) - ) - - def forward(self, x): - return self.net(x) - - -class Attention(Module): - def __init__( - self, - dim, - heads=8, - dim_head=64, - dropout=0., - rotary_embed=None, - flash=True - ): - super().__init__() - self.heads = heads - self.scale = dim_head ** -0.5 - dim_inner = heads * dim_head - - self.rotary_embed = rotary_embed - - self.attend = Attend(flash=flash, dropout=dropout) - - self.norm = RMSNorm(dim) - self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False) - - self.to_gates = nn.Linear(dim, heads) - - self.to_out = nn.Sequential( - nn.Linear(dim_inner, dim, bias=False), - nn.Dropout(dropout) - ) - - def forward(self, x): - x = self.norm(x) - - q, k, v = rearrange(self.to_qkv(x), 'b n (qkv h d) -> qkv b h n d', qkv=3, h=self.heads) - - if exists(self.rotary_embed): - q = self.rotary_embed.rotate_queries_or_keys(q) - k = self.rotary_embed.rotate_queries_or_keys(k) - - out = self.attend(q, k, v) - - gates = self.to_gates(x) - out = out * rearrange(gates, 'b n h -> b h n 1').sigmoid() - - out = rearrange(out, 'b h n d -> b n (h d)') - return self.to_out(out) - - -class LinearAttention(Module): - """ - this flavor of linear attention proposed in https://arxiv.org/abs/2106.09681 by El-Nouby et al. 
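    Attention here is taken over the feature dimension rather than the sequence
    dimension (a d x d similarity map), so the cost grows linearly with sequence
    length; queries and keys are l2-normalised and rescaled by a learned per-head
    temperature in forward().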
- """ - - # @beartype - def __init__( - self, - *, - dim, - dim_head=32, - heads=8, - scale=8, - flash=False, - dropout=0. - ): - super().__init__() - dim_inner = dim_head * heads - self.norm = RMSNorm(dim) - - self.to_qkv = nn.Sequential( - nn.Linear(dim, dim_inner * 3, bias=False), - Rearrange('b n (qkv h d) -> qkv b h d n', qkv=3, h=heads) - ) - - self.temperature = nn.Parameter(torch.ones(heads, 1, 1)) - - self.attend = Attend( - scale=scale, - dropout=dropout, - flash=flash - ) - - self.to_out = nn.Sequential( - Rearrange('b h d n -> b n (h d)'), - nn.Linear(dim_inner, dim, bias=False) - ) - - def forward( - self, - x - ): - x = self.norm(x) - - q, k, v = self.to_qkv(x) - - q, k = map(l2norm, (q, k)) - q = q * self.temperature.exp() - - out = self.attend(q, k, v) - - return self.to_out(out) - - -class Transformer(Module): - def __init__( - self, - *, - dim, - depth, - dim_head=64, - heads=8, - attn_dropout=0., - ff_dropout=0., - ff_mult=4, - norm_output=True, - rotary_embed=None, - flash_attn=True, - linear_attn=False - ): - super().__init__() - self.layers = ModuleList([]) - - for _ in range(depth): - if linear_attn: - attn = LinearAttention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout, flash=flash_attn) - else: - attn = Attention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout, - rotary_embed=rotary_embed, flash=flash_attn) - - self.layers.append(ModuleList([ - attn, - FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout) - ])) - - self.norm = RMSNorm(dim) if norm_output else nn.Identity() - - def forward(self, x): - - for attn, ff in self.layers: - x = attn(x) + x - x = ff(x) + x - - return self.norm(x) - - -# bandsplit module - -class BandSplit(Module): - # @beartype - def __init__( - self, - dim, - dim_inputs: Tuple[int, ...] 
- ): - super().__init__() - self.dim_inputs = dim_inputs - self.to_features = ModuleList([]) - - for dim_in in dim_inputs: - net = nn.Sequential( - RMSNorm(dim_in), - nn.Linear(dim_in, dim) - ) - - self.to_features.append(net) - - def forward(self, x): - x = x.split(self.dim_inputs, dim=-1) - - outs = [] - for split_input, to_feature in zip(x, self.to_features): - split_output = to_feature(split_input) - outs.append(split_output) - - return torch.stack(outs, dim=-2) - - -def MLP( - dim_in, - dim_out, - dim_hidden=None, - depth=1, - activation=nn.Tanh -): - dim_hidden = default(dim_hidden, dim_in) - - net = [] - dims = (dim_in, *((dim_hidden,) * (depth - 1)), dim_out) - - for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])): - is_last = ind == (len(dims) - 2) - - net.append(nn.Linear(layer_dim_in, layer_dim_out)) - - if is_last: - continue - - net.append(activation()) - - return nn.Sequential(*net) - - -class MaskEstimator(Module): - # @beartype - def __init__( - self, - dim, - dim_inputs: Tuple[int, ...], - depth, - mlp_expansion_factor=4 - ): - super().__init__() - self.dim_inputs = dim_inputs - self.to_freqs = ModuleList([]) - dim_hidden = dim * mlp_expansion_factor - - for dim_in in dim_inputs: - net = [] - - mlp = nn.Sequential( - MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), - nn.GLU(dim=-1) - ) - - self.to_freqs.append(mlp) - - def forward(self, x): - x = x.unbind(dim=-2) - - outs = [] - - for band_features, mlp in zip(x, self.to_freqs): - freq_out = mlp(band_features) - outs.append(freq_out) - - return torch.cat(outs, dim=-1) - - -# main class - -DEFAULT_FREQS_PER_BANDS = ( - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 12, 12, 12, 12, 12, 12, 12, 12, - 24, 24, 24, 24, 24, 24, 24, 24, - 48, 48, 48, 48, 48, 48, 48, 48, - 128, 129, -) - - -class BSRoformer(Module): - - # @beartype - def __init__( - self, - dim, - *, - depth, - stereo=False, - num_stems=1, - time_transformer_depth=2, - freq_transformer_depth=2, - linear_transformer_depth=0, - freqs_per_bands: Tuple[int, ...] = DEFAULT_FREQS_PER_BANDS, - # in the paper, they divide into ~60 bands, test with 1 for starters - dim_head=64, - heads=8, - attn_dropout=0., - ff_dropout=0., - flash_attn=True, - dim_freqs_in=1025, - stft_n_fft=2048, - stft_hop_length=512, - # 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction - stft_win_length=2048, - stft_normalized=False, - stft_window_fn: Optional[Callable] = None, - mask_estimator_depth=2, - multi_stft_resolution_loss_weight=1., - multi_stft_resolutions_window_sizes: Tuple[int, ...] 
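            # window sizes for the auxiliary multi-resolution STFT loss in forward();
            # each resolution uses n_fft = max(window_size, stft_n_fft) together with
            # the shared multi_stft_hop_size below.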
= (4096, 2048, 1024, 512, 256), - multi_stft_hop_size=147, - multi_stft_normalized=False, - multi_stft_window_fn: Callable = torch.hann_window - ): - super().__init__() - - self.stereo = stereo - self.audio_channels = 2 if stereo else 1 - self.num_stems = num_stems - - self.layers = ModuleList([]) - - transformer_kwargs = dict( - dim=dim, - heads=heads, - dim_head=dim_head, - attn_dropout=attn_dropout, - ff_dropout=ff_dropout, - flash_attn=flash_attn, - norm_output=False - ) - - time_rotary_embed = RotaryEmbedding(dim=dim_head) - freq_rotary_embed = RotaryEmbedding(dim=dim_head) - - for _ in range(depth): - tran_modules = [] - if linear_transformer_depth > 0: - tran_modules.append(Transformer(depth=linear_transformer_depth, linear_attn=True, **transformer_kwargs)) - tran_modules.append( - Transformer(depth=time_transformer_depth, rotary_embed=time_rotary_embed, **transformer_kwargs) - ) - tran_modules.append( - Transformer(depth=freq_transformer_depth, rotary_embed=freq_rotary_embed, **transformer_kwargs) - ) - self.layers.append(nn.ModuleList(tran_modules)) - - self.final_norm = RMSNorm(dim) - - self.stft_kwargs = dict( - n_fft=stft_n_fft, - hop_length=stft_hop_length, - win_length=stft_win_length, - normalized=stft_normalized - ) - - self.stft_window_fn = partial(default(stft_window_fn, torch.hann_window), stft_win_length) - - freqs = torch.stft(torch.randn(1, 4096), **self.stft_kwargs, return_complex=True).shape[1] - - assert len(freqs_per_bands) > 1 - assert sum( - freqs_per_bands) == freqs, f'the number of freqs in the bands must equal {freqs} based on the STFT settings, but got {sum(freqs_per_bands)}' - - freqs_per_bands_with_complex = tuple(2 * f * self.audio_channels for f in freqs_per_bands) - - self.band_split = BandSplit( - dim=dim, - dim_inputs=freqs_per_bands_with_complex - ) - - self.mask_estimators = nn.ModuleList([]) - - for _ in range(num_stems): - mask_estimator = MaskEstimator( - dim=dim, - dim_inputs=freqs_per_bands_with_complex, - depth=mask_estimator_depth - ) - - self.mask_estimators.append(mask_estimator) - - # for the multi-resolution stft loss - - self.multi_stft_resolution_loss_weight = multi_stft_resolution_loss_weight - self.multi_stft_resolutions_window_sizes = multi_stft_resolutions_window_sizes - self.multi_stft_n_fft = stft_n_fft - self.multi_stft_window_fn = multi_stft_window_fn - - self.multi_stft_kwargs = dict( - hop_length=multi_stft_hop_size, - normalized=multi_stft_normalized - ) - - def forward( - self, - raw_audio, - target=None, - return_loss_breakdown=False - ): - """ - einops - - b - batch - f - freq - t - time - s - audio channel (1 for mono, 2 for stereo) - n - number of 'stems' - c - complex (2) - d - feature dimension - """ - - device = raw_audio.device - - if raw_audio.ndim == 2: - raw_audio = rearrange(raw_audio, 'b t -> b 1 t') - - channels = raw_audio.shape[1] - assert (not self.stereo and channels == 1) or ( - self.stereo and channels == 2), 'stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). 
also need to be False if mono (channel dimension of 1)' - - # to stft - - raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, '* t') - - stft_window = self.stft_window_fn(device=device) - - stft_repr = torch.stft(raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True) - stft_repr = torch.view_as_real(stft_repr) - - stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, '* f t c') - stft_repr = rearrange(stft_repr, - 'b s f t c -> b (f s) t c') # merge stereo / mono into the frequency, with frequency leading dimension, for band splitting - - x = rearrange(stft_repr, 'b f t c -> b t (f c)') - # print("460:", x.dtype)#fp32 - x = self.band_split(x) - - # axial / hierarchical attention - - # print("487:",x.dtype)#fp16 - for transformer_block in self.layers: - - if len(transformer_block) == 3: - linear_transformer, time_transformer, freq_transformer = transformer_block - - x, ft_ps = pack([x], 'b * d') - # print("494:", x.dtype)#fp16 - x = linear_transformer(x) - # print("496:", x.dtype)#fp16 - x, = unpack(x, ft_ps, 'b * d') - else: - time_transformer, freq_transformer = transformer_block - - # print("501:", x.dtype)#fp16 - x = rearrange(x, 'b t f d -> b f t d') - x, ps = pack([x], '* t d') - - x = time_transformer(x) - # print("505:", x.dtype)#fp16 - x, = unpack(x, ps, '* t d') - x = rearrange(x, 'b f t d -> b t f d') - x, ps = pack([x], '* f d') - - x = freq_transformer(x) - - x, = unpack(x, ps, '* f d') - - # print("515:", x.dtype)######fp16 - x = self.final_norm(x) - - num_stems = len(self.mask_estimators) - # print("519:", x.dtype)#fp32 - mask = torch.stack([fn(x) for fn in self.mask_estimators], dim=1) - mask = rearrange(mask, 'b n t (f c) -> b n f t c', c=2) - - # modulate frequency representation - - stft_repr = rearrange(stft_repr, 'b f t c -> b 1 f t c') - - # complex number multiplication - - stft_repr = torch.view_as_complex(stft_repr) - mask = torch.view_as_complex(mask) - - stft_repr = stft_repr * mask - - # istft - - stft_repr = rearrange(stft_repr, 'b n (f s) t -> (b n s) f t', s=self.audio_channels) - - recon_audio = torch.istft(stft_repr, **self.stft_kwargs, window=stft_window, return_complex=False) - - recon_audio = rearrange(recon_audio, '(b n s) t -> b n s t', s=self.audio_channels, n=num_stems) - - if num_stems == 1: - recon_audio = rearrange(recon_audio, 'b 1 s t -> b s t') - - # if a target is passed in, calculate loss for learning - - if not exists(target): - return recon_audio - - if self.num_stems > 1: - assert target.ndim == 4 and target.shape[1] == self.num_stems - - if target.ndim == 2: - target = rearrange(target, '... t -> ... 1 t') - - target = target[..., :recon_audio.shape[-1]] # protect against lost length on istft - - loss = F.l1_loss(recon_audio, target) - - multi_stft_resolution_loss = 0. - - for window_size in self.multi_stft_resolutions_window_sizes: - res_stft_kwargs = dict( - n_fft=max(window_size, self.multi_stft_n_fft), # not sure what n_fft is across multi resolution stft - win_length=window_size, - return_complex=True, - window=self.multi_stft_window_fn(window_size, device=device), - **self.multi_stft_kwargs, - ) - - recon_Y = torch.stft(rearrange(recon_audio, '... s t -> (... s) t'), **res_stft_kwargs) - target_Y = torch.stft(rearrange(target, '... s t -> (... 
s) t'), **res_stft_kwargs) - - multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(recon_Y, target_Y) - - weighted_multi_resolution_loss = multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight - - total_loss = loss + weighted_multi_resolution_loss - - if not return_loss_breakdown: - return total_loss - - return total_loss, (loss, multi_stft_resolution_loss) \ No newline at end of file diff --git a/tools/uvr5/bsroformer.py b/tools/uvr5/bsroformer.py deleted file mode 100644 index d162032686d8b9478010bd4e0bd154f56200069e..0000000000000000000000000000000000000000 --- a/tools/uvr5/bsroformer.py +++ /dev/null @@ -1,216 +0,0 @@ -# This code is modified from https://github.com/ZFTurbo/ -import pdb - -import librosa -from tqdm import tqdm -import os -import torch -import numpy as np -import soundfile as sf -import torch.nn as nn - -import warnings -warnings.filterwarnings("ignore") -from bs_roformer.bs_roformer import BSRoformer - -class BsRoformer_Loader: - def get_model_from_config(self): - config = { - "attn_dropout": 0.1, - "depth": 12, - "dim": 512, - "dim_freqs_in": 1025, - "dim_head": 64, - "ff_dropout": 0.1, - "flash_attn": True, - "freq_transformer_depth": 1, - "freqs_per_bands":(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 12, 12, 24, 24, 24, 24, 24, 24, 24, 24, 48, 48, 48, 48, 48, 48, 48, 48, 128, 129), - "heads": 8, - "linear_transformer_depth": 0, - "mask_estimator_depth": 2, - "multi_stft_hop_size": 147, - "multi_stft_normalized": False, - "multi_stft_resolution_loss_weight": 1.0, - "multi_stft_resolutions_window_sizes":(4096, 2048, 1024, 512, 256), - "num_stems": 1, - "stereo": True, - "stft_hop_length": 441, - "stft_n_fft": 2048, - "stft_normalized": False, - "stft_win_length": 2048, - "time_transformer_depth": 1, - - } - - - model = BSRoformer( - **dict(config) - ) - - return model - - - def demix_track(self, model, mix, device): - C = 352800 - # num_overlap - N = 1 - fade_size = C // 10 - step = int(C // N) - border = C - step - batch_size = 4 - - length_init = mix.shape[-1] - - progress_bar = tqdm(total=length_init // step + 1) - progress_bar.set_description("Processing") - - # Do pad from the beginning and end to account floating window results better - if length_init > 2 * border and (border > 0): - mix = nn.functional.pad(mix, (border, border), mode='reflect') - - # Prepare windows arrays (do 1 time for speed up). 
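        # Chunking scheme: the mix is processed in windows of C = 352800 samples
        # (8 s at 44.1 kHz), stepped by C // N where N is the number of overlapping
        # passes (N = 1 here, i.e. no overlap); fade-in/fade-out ramps of
        # fade_size = C // 10 samples are applied at chunk edges and the accumulated
        # result is divided by `counter` at the end.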
This trick repairs click problems on the edges of segment - window_size = C - fadein = torch.linspace(0, 1, fade_size) - fadeout = torch.linspace(1, 0, fade_size) - window_start = torch.ones(window_size) - window_middle = torch.ones(window_size) - window_finish = torch.ones(window_size) - window_start[-fade_size:] *= fadeout # First audio chunk, no fadein - window_finish[:fade_size] *= fadein # Last audio chunk, no fadeout - window_middle[-fade_size:] *= fadeout - window_middle[:fade_size] *= fadein - - with torch.amp.autocast('cuda'): - with torch.inference_mode(): - req_shape = (1, ) + tuple(mix.shape) - - result = torch.zeros(req_shape, dtype=torch.float32) - counter = torch.zeros(req_shape, dtype=torch.float32) - i = 0 - batch_data = [] - batch_locations = [] - while i < mix.shape[1]: - part = mix[:, i:i + C].to(device) - length = part.shape[-1] - if length < C: - if length > C // 2 + 1: - part = nn.functional.pad(input=part, pad=(0, C - length), mode='reflect') - else: - part = nn.functional.pad(input=part, pad=(0, C - length, 0, 0), mode='constant', value=0) - if(self.is_half==True): - part=part.half() - batch_data.append(part) - batch_locations.append((i, length)) - i += step - progress_bar.update(1) - - if len(batch_data) >= batch_size or (i >= mix.shape[1]): - arr = torch.stack(batch_data, dim=0) - # print(23333333,arr.dtype) - x = model(arr) - - window = window_middle - if i - step == 0: # First audio chunk, no fadein - window = window_start - elif i >= mix.shape[1]: # Last audio chunk, no fadeout - window = window_finish - - for j in range(len(batch_locations)): - start, l = batch_locations[j] - result[..., start:start+l] += x[j][..., :l].cpu() * window[..., :l] - counter[..., start:start+l] += window[..., :l] - - batch_data = [] - batch_locations = [] - - estimated_sources = result / counter - estimated_sources = estimated_sources.cpu().numpy() - np.nan_to_num(estimated_sources, copy=False, nan=0.0) - - if length_init > 2 * border and (border > 0): - # Remove pad - estimated_sources = estimated_sources[..., border:-border] - - progress_bar.close() - - return {k: v for k, v in zip(['vocals', 'other'], estimated_sources)} - - - def run_folder(self,input, vocal_root, others_root, format): - # start_time = time.time() - self.model.eval() - path = input - - if not os.path.isdir(vocal_root): - os.mkdir(vocal_root) - - if not os.path.isdir(others_root): - os.mkdir(others_root) - - try: - mix, sr = librosa.load(path, sr=44100, mono=False) - except Exception as e: - print('Can read track: {}'.format(path)) - print('Error message: {}'.format(str(e))) - return - - # Convert mono to stereo if needed - if len(mix.shape) == 1: - mix = np.stack([mix, mix], axis=0) - - mix_orig = mix.copy() - - mixture = torch.tensor(mix, dtype=torch.float32) - res = self.demix_track(self.model, mixture, self.device) - - estimates = res['vocals'].T - - if format in ["wav", "flac"]: - sf.write("{}/{}_{}.{}".format(vocal_root, os.path.basename(path)[:-4], 'vocals', format), estimates, sr) - sf.write("{}/{}_{}.{}".format(others_root, os.path.basename(path)[:-4], 'instrumental', format), mix_orig.T - estimates, sr) - else: - path_vocal = "%s/%s_vocals.wav" % (vocal_root, os.path.basename(path)[:-4]) - path_other = "%s/%s_instrumental.wav" % (others_root, os.path.basename(path)[:-4]) - sf.write(path_vocal, estimates, sr) - sf.write(path_other, mix_orig.T - estimates, sr) - opt_path_vocal = path_vocal[:-4] + ".%s" % format - opt_path_other = path_other[:-4] + ".%s" % format - if os.path.exists(path_vocal): - 
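                # For formats other than wav/flac the stems are first written as wav,
                # then transcoded with ffmpeg (-q:a 2) and the temporary wav is removed
                # once the converted file exists. The shell command is built with single
                # quotes, so a path containing a single quote would break it.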
os.system( - "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_vocal, opt_path_vocal) - ) - if os.path.exists(opt_path_vocal): - try: - os.remove(path_vocal) - except: - pass - if os.path.exists(path_other): - os.system( - "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_other, opt_path_other) - ) - if os.path.exists(opt_path_other): - try: - os.remove(path_other) - except: - pass - - # print("Elapsed time: {:.2f} sec".format(time.time() - start_time)) - - - def __init__(self, model_path, device,is_half): - self.device = device - self.extract_instrumental=True - - model = self.get_model_from_config() - state_dict = torch.load(model_path,map_location="cpu") - model.load_state_dict(state_dict) - self.is_half=is_half - if(is_half==False): - self.model = model.to(device) - else: - self.model = model.half().to(device) - - - def _path_audio_(self, input, others_root, vocal_root, format, is_hp3=False): - self.run_folder(input, vocal_root, others_root, format) - diff --git a/tools/uvr5/lib/lib_v5/dataset.py b/tools/uvr5/lib/lib_v5/dataset.py deleted file mode 100644 index cfd01a174978d97180a897e40cb59ecadec1d12e..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/dataset.py +++ /dev/null @@ -1,183 +0,0 @@ -import os -import random - -import numpy as np -import torch -import torch.utils.data -from tqdm import tqdm - -from . import spec_utils - - -class VocalRemoverValidationSet(torch.utils.data.Dataset): - def __init__(self, patch_list): - self.patch_list = patch_list - - def __len__(self): - return len(self.patch_list) - - def __getitem__(self, idx): - path = self.patch_list[idx] - data = np.load(path) - - X, y = data["X"], data["y"] - - X_mag = np.abs(X) - y_mag = np.abs(y) - - return X_mag, y_mag - - -def make_pair(mix_dir, inst_dir): - input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"] - - X_list = sorted( - [ - os.path.join(mix_dir, fname) - for fname in os.listdir(mix_dir) - if os.path.splitext(fname)[1] in input_exts - ] - ) - y_list = sorted( - [ - os.path.join(inst_dir, fname) - for fname in os.listdir(inst_dir) - if os.path.splitext(fname)[1] in input_exts - ] - ) - - filelist = list(zip(X_list, y_list)) - - return filelist - - -def train_val_split(dataset_dir, split_mode, val_rate, val_filelist): - if split_mode == "random": - filelist = make_pair( - os.path.join(dataset_dir, "mixtures"), - os.path.join(dataset_dir, "instruments"), - ) - - random.shuffle(filelist) - - if len(val_filelist) == 0: - val_size = int(len(filelist) * val_rate) - train_filelist = filelist[:-val_size] - val_filelist = filelist[-val_size:] - else: - train_filelist = [ - pair for pair in filelist if list(pair) not in val_filelist - ] - elif split_mode == "subdirs": - if len(val_filelist) != 0: - raise ValueError( - "The `val_filelist` option is not available in `subdirs` mode" - ) - - train_filelist = make_pair( - os.path.join(dataset_dir, "training/mixtures"), - os.path.join(dataset_dir, "training/instruments"), - ) - - val_filelist = make_pair( - os.path.join(dataset_dir, "validation/mixtures"), - os.path.join(dataset_dir, "validation/instruments"), - ) - - return train_filelist, val_filelist - - -def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha): - perm = np.random.permutation(len(X)) - for i, idx in enumerate(tqdm(perm)): - if np.random.uniform() < reduction_rate: - y[idx] = spec_utils.reduce_vocal_aggressively( - X[idx], y[idx], reduction_mask - ) - - if np.random.uniform() < 0.5: - # swap channel - X[idx] = X[idx, ::-1] - y[idx] = y[idx, ::-1] - if np.random.uniform() < 
0.02: - # mono - X[idx] = X[idx].mean(axis=0, keepdims=True) - y[idx] = y[idx].mean(axis=0, keepdims=True) - if np.random.uniform() < 0.02: - # inst - X[idx] = y[idx] - - if np.random.uniform() < mixup_rate and i < len(perm) - 1: - lam = np.random.beta(mixup_alpha, mixup_alpha) - X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]] - y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]] - - return X, y - - -def make_padding(width, cropsize, offset): - left = offset - roi_size = cropsize - left * 2 - if roi_size == 0: - roi_size = cropsize - right = roi_size - (width % roi_size) + left - - return left, right, roi_size - - -def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset): - len_dataset = patches * len(filelist) - - X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) - y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) - - for i, (X_path, y_path) in enumerate(tqdm(filelist)): - X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft) - coef = np.max([np.abs(X).max(), np.abs(y).max()]) - X, y = X / coef, y / coef - - l, r, roi_size = make_padding(X.shape[2], cropsize, offset) - X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant") - y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant") - - starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches) - ends = starts + cropsize - for j in range(patches): - idx = i * patches + j - X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]] - y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]] - - return X_dataset, y_dataset - - -def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset): - patch_list = [] - patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format( - cropsize, sr, hop_length, n_fft, offset - ) - os.makedirs(patch_dir, exist_ok=True) - - for i, (X_path, y_path) in enumerate(tqdm(filelist)): - basename = os.path.splitext(os.path.basename(X_path))[0] - - X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft) - coef = np.max([np.abs(X).max(), np.abs(y).max()]) - X, y = X / coef, y / coef - - l, r, roi_size = make_padding(X.shape[2], cropsize, offset) - X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant") - y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant") - - len_dataset = int(np.ceil(X.shape[2] / roi_size)) - for j in range(len_dataset): - outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j)) - start = j * roi_size - if not os.path.exists(outpath): - np.savez( - outpath, - X=X_pad[:, :, start : start + cropsize], - y=y_pad[:, :, start : start + cropsize], - ) - patch_list.append(outpath) - - return VocalRemoverValidationSet(patch_list) diff --git a/tools/uvr5/lib/lib_v5/layers.py b/tools/uvr5/lib/lib_v5/layers.py deleted file mode 100644 index 4fc1b5cb85a3327f60cbb9f5deffbeeaaac516ad..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/layers.py +++ /dev/null @@ -1,118 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . 
import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/tools/uvr5/lib/lib_v5/layers_123812KB.py b/tools/uvr5/lib/lib_v5/layers_123812KB.py deleted file mode 100644 index 4fc1b5cb85a3327f60cbb9f5deffbeeaaac516ad..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/layers_123812KB.py +++ /dev/null @@ -1,118 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . 
import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/tools/uvr5/lib/lib_v5/layers_123821KB.py b/tools/uvr5/lib/lib_v5/layers_123821KB.py deleted file mode 100644 index 4fc1b5cb85a3327f60cbb9f5deffbeeaaac516ad..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/layers_123821KB.py +++ /dev/null @@ -1,118 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . 
import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/tools/uvr5/lib/lib_v5/layers_33966KB.py b/tools/uvr5/lib/lib_v5/layers_33966KB.py deleted file mode 100644 index 9b127bc6427f5c60c8cf85603a3d8a093c3501c4..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/layers_33966KB.py +++ /dev/null @@ -1,126 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . 
import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - feat6 = self.conv6(x) - feat7 = self.conv7(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/tools/uvr5/lib/lib_v5/layers_537227KB.py b/tools/uvr5/lib/lib_v5/layers_537227KB.py deleted file mode 100644 index 9b127bc6427f5c60c8cf85603a3d8a093c3501c4..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/layers_537227KB.py +++ /dev/null @@ -1,126 +0,0 @@ -import torch -import torch.nn.functional as 
F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - feat6 = self.conv6(x) - feat7 = self.conv7(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/tools/uvr5/lib/lib_v5/layers_537238KB.py b/tools/uvr5/lib/lib_v5/layers_537238KB.py deleted file mode 100644 index 9b127bc6427f5c60c8cf85603a3d8a093c3501c4..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/layers_537238KB.py +++ /dev/null @@ -1,126 +0,0 @@ -import 
torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - feat6 = self.conv6(x) - feat7 = self.conv7(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/tools/uvr5/lib/lib_v5/layers_new.py b/tools/uvr5/lib/lib_v5/layers_new.py deleted file mode 100644 index 44153b6a23399c6938affc61c71919eaa172bcee..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/layers_new.py +++ /dev/null @@ 
-1,125 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) - - def __call__(self, x): - h = self.conv1(x) - h = self.conv2(h) - - return h - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - - h = self.conv1(x) - # h = self.conv2(h) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) - self.conv3 = Conv2DBNActiv( - nin, nout, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = Conv2DBNActiv( - nin, nout, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = Conv2DBNActiv( - nin, nout, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) - out = self.bottleneck(out) - - if self.dropout is not None: - out = self.dropout(out) - - return out - - -class LSTMModule(nn.Module): - def __init__(self, nin_conv, nin_lstm, nout_lstm): - super(LSTMModule, self).__init__() - self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) - self.lstm = nn.LSTM( - input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True - ) - self.dense = nn.Sequential( - nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU() - ) - - def forward(self, x): - N, _, nbins, nframes = x.size() - h = self.conv(x)[:, 0] # N, nbins, nframes - h = h.permute(2, 0, 1) # nframes, N, nbins - h, _ = self.lstm(h) - h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins - h = h.reshape(nframes, N, 1, nbins) - h = h.permute(1, 2, 3, 0) - - return h diff --git a/tools/uvr5/lib/lib_v5/model_param_init.py b/tools/uvr5/lib/lib_v5/model_param_init.py deleted file mode 100644 index 
b995c0bfb1194746187692e2ab1c2a6dbaaaec6c..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/model_param_init.py +++ /dev/null @@ -1,69 +0,0 @@ -import json -import os -import pathlib - -default_param = {} -default_param["bins"] = 768 -default_param["unstable_bins"] = 9 # training only -default_param["reduction_bins"] = 762 # training only -default_param["sr"] = 44100 -default_param["pre_filter_start"] = 757 -default_param["pre_filter_stop"] = 768 -default_param["band"] = {} - - -default_param["band"][1] = { - "sr": 11025, - "hl": 128, - "n_fft": 960, - "crop_start": 0, - "crop_stop": 245, - "lpf_start": 61, # inference only - "res_type": "polyphase", -} - -default_param["band"][2] = { - "sr": 44100, - "hl": 512, - "n_fft": 1536, - "crop_start": 24, - "crop_stop": 547, - "hpf_start": 81, # inference only - "res_type": "sinc_best", -} - - -def int_keys(d): - r = {} - for k, v in d: - if k.isdigit(): - k = int(k) - r[k] = v - return r - - -class ModelParameters(object): - def __init__(self, config_path=""): - if ".pth" == pathlib.Path(config_path).suffix: - import zipfile - - with zipfile.ZipFile(config_path, "r") as zip: - self.param = json.loads( - zip.read("param.json"), object_pairs_hook=int_keys - ) - elif ".json" == pathlib.Path(config_path).suffix: - with open(config_path, "r") as f: - self.param = json.loads(f.read(), object_pairs_hook=int_keys) - else: - self.param = default_param - - for k in [ - "mid_side", - "mid_side_b", - "mid_side_b2", - "stereo_w", - "stereo_n", - "reverse", - ]: - if not k in self.param: - self.param[k] = False diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json deleted file mode 100644 index 72cb4499867ad2827185e85687f06fb73d33eced..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 1024, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 16000, - "hl": 512, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 1024, - "hpf_start": -1, - "res_type": "sinc_best" - } - }, - "sr": 16000, - "pre_filter_start": 1023, - "pre_filter_stop": 1024 -} \ No newline at end of file diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json deleted file mode 100644 index 3c00ecf0a105e55a6a86a3c32db301a2635b5b41..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 1024, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 32000, - "hl": 512, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 1024, - "hpf_start": -1, - "res_type": "kaiser_fast" - } - }, - "sr": 32000, - "pre_filter_start": 1000, - "pre_filter_stop": 1021 -} \ No newline at end of file diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json deleted file mode 100644 index 55666ac9a8d0547751fb4b4d3bffb1ee2c956913..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 1024, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 33075, - "hl": 384, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 1024, - "hpf_start": -1, - "res_type": "sinc_best" - } - }, - "sr": 33075, - "pre_filter_start": 1000, - "pre_filter_stop": 
1021 -} \ No newline at end of file diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json deleted file mode 100644 index 665abe20eb3cc39fe0f8493dad8f25f6ef634a14..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 1024, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 44100, - "hl": 1024, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 1024, - "hpf_start": -1, - "res_type": "sinc_best" - } - }, - "sr": 44100, - "pre_filter_start": 1023, - "pre_filter_stop": 1024 -} \ No newline at end of file diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json deleted file mode 100644 index 0e8b16f89b0231d06eabe8d2f7c2670c7caa2272..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 256, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 44100, - "hl": 256, - "n_fft": 512, - "crop_start": 0, - "crop_stop": 256, - "hpf_start": -1, - "res_type": "sinc_best" - } - }, - "sr": 44100, - "pre_filter_start": 256, - "pre_filter_stop": 256 -} \ No newline at end of file diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json deleted file mode 100644 index 3b38fcaf60ba204e03a47f5bd3f5bcfe75e1983a..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 1024, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 44100, - "hl": 512, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 1024, - "hpf_start": -1, - "res_type": "sinc_best" - } - }, - "sr": 44100, - "pre_filter_start": 1023, - "pre_filter_stop": 1024 -} \ No newline at end of file diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json deleted file mode 100644 index 630df3524e340f43a1ddb7b33ff02cc91fc1cb47..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 1024, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 44100, - "hl": 512, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 700, - "hpf_start": -1, - "res_type": "sinc_best" - } - }, - "sr": 44100, - "pre_filter_start": 1023, - "pre_filter_stop": 700 -} \ No newline at end of file diff --git a/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json b/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json deleted file mode 100644 index ab9cf1150a818eb6252105408311be0a40d423b3..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 705, - "band": { - "1": { - "sr": 6000, - "hl": 66, - "n_fft": 512, - "crop_start": 0, - "crop_stop": 240, - "lpf_start": 60, - "lpf_stop": 118, - "res_type": "sinc_fastest" - }, - "2": { - "sr": 32000, - "hl": 352, - "n_fft": 1024, - "crop_start": 22, - "crop_stop": 505, - "hpf_start": 44, - "hpf_stop": 23, - "res_type": "sinc_medium" - } - }, - "sr": 32000, - "pre_filter_start": 710, - "pre_filter_stop": 731 -} diff --git 
a/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json b/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json deleted file mode 100644 index 7faa216d7b49aeece24123dbdd868847a1dbc03c..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "bins": 512, - "unstable_bins": 7, - "reduction_bins": 510, - "band": { - "1": { - "sr": 11025, - "hl": 160, - "n_fft": 768, - "crop_start": 0, - "crop_stop": 192, - "lpf_start": 41, - "lpf_stop": 139, - "res_type": "sinc_fastest" - }, - "2": { - "sr": 44100, - "hl": 640, - "n_fft": 1024, - "crop_start": 10, - "crop_stop": 320, - "hpf_start": 47, - "hpf_stop": 15, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 510, - "pre_filter_stop": 512 -} diff --git a/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json b/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json deleted file mode 100644 index 7e78175052b09cb1a32345e54006475992712f9a..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 705, - "band": { - "1": { - "sr": 6000, - "hl": 66, - "n_fft": 512, - "crop_start": 0, - "crop_stop": 240, - "lpf_start": 60, - "lpf_stop": 240, - "res_type": "sinc_fastest" - }, - "2": { - "sr": 48000, - "hl": 528, - "n_fft": 1536, - "crop_start": 22, - "crop_stop": 505, - "hpf_start": 82, - "hpf_stop": 22, - "res_type": "sinc_medium" - } - }, - "sr": 48000, - "pre_filter_start": 710, - "pre_filter_stop": 731 -} \ No newline at end of file diff --git a/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json b/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json deleted file mode 100644 index d881d767ff83fbac0e18dfe2587ef16925b29b3c..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "bins": 768, - "unstable_bins": 5, - "reduction_bins": 733, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 768, - "crop_start": 0, - "crop_stop": 278, - "lpf_start": 28, - "lpf_stop": 140, - "res_type": "polyphase" - }, - "2": { - "sr": 22050, - "hl": 256, - "n_fft": 768, - "crop_start": 14, - "crop_stop": 322, - "hpf_start": 70, - "hpf_stop": 14, - "lpf_start": 283, - "lpf_stop": 314, - "res_type": "polyphase" - }, - "3": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 131, - "crop_stop": 313, - "hpf_start": 154, - "hpf_stop": 141, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 757, - "pre_filter_stop": 768 -} diff --git a/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json b/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json deleted file mode 100644 index 77ec198573b19f36519a028a509767d30764c0e2..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "mid_side": true, - "bins": 768, - "unstable_bins": 5, - "reduction_bins": 733, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 768, - "crop_start": 0, - "crop_stop": 278, - "lpf_start": 28, - "lpf_stop": 140, - "res_type": "polyphase" - }, - "2": { - "sr": 22050, - "hl": 256, - "n_fft": 768, - "crop_start": 14, - "crop_stop": 322, - "hpf_start": 70, - "hpf_stop": 14, - "lpf_start": 283, - "lpf_stop": 314, - "res_type": "polyphase" - }, - "3": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 131, - "crop_stop": 313, - "hpf_start": 154, - "hpf_stop": 141, - 
"res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 757, - "pre_filter_stop": 768 -} diff --git a/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json b/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json deleted file mode 100644 index 85ee8a7d44541c9176e85ea3dce8728d34990938..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "mid_side_b2": true, - "bins": 640, - "unstable_bins": 7, - "reduction_bins": 565, - "band": { - "1": { - "sr": 11025, - "hl": 108, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 187, - "lpf_start": 92, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "2": { - "sr": 22050, - "hl": 216, - "n_fft": 768, - "crop_start": 0, - "crop_stop": 212, - "hpf_start": 68, - "hpf_stop": 34, - "lpf_start": 174, - "lpf_stop": 209, - "res_type": "polyphase" - }, - "3": { - "sr": 44100, - "hl": 432, - "n_fft": 640, - "crop_start": 66, - "crop_stop": 307, - "hpf_start": 86, - "hpf_stop": 72, - "res_type": "kaiser_fast" - } - }, - "sr": 44100, - "pre_filter_start": 639, - "pre_filter_stop": 640 -} diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json b/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json deleted file mode 100644 index df123754204372aa50d464fbe9102a401f48cc73..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 668, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 186, - "lpf_start": 37, - "lpf_stop": 73, - "res_type": "polyphase" - }, - "2": { - "sr": 11025, - "hl": 128, - "n_fft": 512, - "crop_start": 4, - "crop_stop": 185, - "hpf_start": 36, - "hpf_stop": 18, - "lpf_start": 93, - "lpf_stop": 185, - "res_type": "polyphase" - }, - "3": { - "sr": 22050, - "hl": 256, - "n_fft": 512, - "crop_start": 46, - "crop_stop": 186, - "hpf_start": 93, - "hpf_stop": 46, - "lpf_start": 164, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 121, - "crop_stop": 382, - "hpf_start": 138, - "hpf_stop": 123, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 740, - "pre_filter_stop": 768 -} diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json deleted file mode 100644 index e91b699eb63d3382c3b9e9edf46d40ed91d6122b..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "bins": 768, - "unstable_bins": 7, - "mid_side": true, - "reduction_bins": 668, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 186, - "lpf_start": 37, - "lpf_stop": 73, - "res_type": "polyphase" - }, - "2": { - "sr": 11025, - "hl": 128, - "n_fft": 512, - "crop_start": 4, - "crop_stop": 185, - "hpf_start": 36, - "hpf_stop": 18, - "lpf_start": 93, - "lpf_stop": 185, - "res_type": "polyphase" - }, - "3": { - "sr": 22050, - "hl": 256, - "n_fft": 512, - "crop_start": 46, - "crop_stop": 186, - "hpf_start": 93, - "hpf_stop": 46, - "lpf_start": 164, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 121, - "crop_stop": 382, - "hpf_start": 138, - "hpf_stop": 123, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 740, - "pre_filter_stop": 
768 -} diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json deleted file mode 100644 index f852f280ec9d98fc1b65cec688290eaafec61b84..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "mid_side_b": true, - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 668, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 186, - "lpf_start": 37, - "lpf_stop": 73, - "res_type": "polyphase" - }, - "2": { - "sr": 11025, - "hl": 128, - "n_fft": 512, - "crop_start": 4, - "crop_stop": 185, - "hpf_start": 36, - "hpf_stop": 18, - "lpf_start": 93, - "lpf_stop": 185, - "res_type": "polyphase" - }, - "3": { - "sr": 22050, - "hl": 256, - "n_fft": 512, - "crop_start": 46, - "crop_stop": 186, - "hpf_start": 93, - "hpf_stop": 46, - "lpf_start": 164, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 121, - "crop_stop": 382, - "hpf_start": 138, - "hpf_stop": 123, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 740, - "pre_filter_stop": 768 -} \ No newline at end of file diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json deleted file mode 100644 index f852f280ec9d98fc1b65cec688290eaafec61b84..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "mid_side_b": true, - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 668, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 186, - "lpf_start": 37, - "lpf_stop": 73, - "res_type": "polyphase" - }, - "2": { - "sr": 11025, - "hl": 128, - "n_fft": 512, - "crop_start": 4, - "crop_stop": 185, - "hpf_start": 36, - "hpf_stop": 18, - "lpf_start": 93, - "lpf_stop": 185, - "res_type": "polyphase" - }, - "3": { - "sr": 22050, - "hl": 256, - "n_fft": 512, - "crop_start": 46, - "crop_stop": 186, - "hpf_start": 93, - "hpf_stop": 46, - "lpf_start": 164, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 121, - "crop_stop": 382, - "hpf_start": 138, - "hpf_stop": 123, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 740, - "pre_filter_stop": 768 -} \ No newline at end of file diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json deleted file mode 100644 index 7a07d5541bd83dc1caa20b531c3b43a2ffccac88..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "reverse": true, - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 668, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 186, - "lpf_start": 37, - "lpf_stop": 73, - "res_type": "polyphase" - }, - "2": { - "sr": 11025, - "hl": 128, - "n_fft": 512, - "crop_start": 4, - "crop_stop": 185, - "hpf_start": 36, - "hpf_stop": 18, - "lpf_start": 93, - "lpf_stop": 185, - "res_type": "polyphase" - }, - "3": { - "sr": 22050, - "hl": 256, - "n_fft": 512, - "crop_start": 46, - "crop_stop": 186, - "hpf_start": 93, - "hpf_stop": 46, - "lpf_start": 164, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 
512, - "n_fft": 768, - "crop_start": 121, - "crop_stop": 382, - "hpf_start": 138, - "hpf_stop": 123, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 740, - "pre_filter_stop": 768 -} \ No newline at end of file diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json deleted file mode 100644 index ba0cf342106de793e6ec3e876854c7fd451fbf76..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "stereo_w": true, - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 668, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 186, - "lpf_start": 37, - "lpf_stop": 73, - "res_type": "polyphase" - }, - "2": { - "sr": 11025, - "hl": 128, - "n_fft": 512, - "crop_start": 4, - "crop_stop": 185, - "hpf_start": 36, - "hpf_stop": 18, - "lpf_start": 93, - "lpf_stop": 185, - "res_type": "polyphase" - }, - "3": { - "sr": 22050, - "hl": 256, - "n_fft": 512, - "crop_start": 46, - "crop_stop": 186, - "hpf_start": 93, - "hpf_stop": 46, - "lpf_start": 164, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 121, - "crop_stop": 382, - "hpf_start": 138, - "hpf_stop": 123, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 740, - "pre_filter_stop": 768 -} \ No newline at end of file diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json b/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json deleted file mode 100644 index 33281a0cf9916fc33558ddfda7a0287a2547faf4..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "bins": 672, - "unstable_bins": 8, - "reduction_bins": 637, - "band": { - "1": { - "sr": 7350, - "hl": 80, - "n_fft": 640, - "crop_start": 0, - "crop_stop": 85, - "lpf_start": 25, - "lpf_stop": 53, - "res_type": "polyphase" - }, - "2": { - "sr": 7350, - "hl": 80, - "n_fft": 320, - "crop_start": 4, - "crop_stop": 87, - "hpf_start": 25, - "hpf_stop": 12, - "lpf_start": 31, - "lpf_stop": 62, - "res_type": "polyphase" - }, - "3": { - "sr": 14700, - "hl": 160, - "n_fft": 512, - "crop_start": 17, - "crop_stop": 216, - "hpf_start": 48, - "hpf_stop": 24, - "lpf_start": 139, - "lpf_stop": 210, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 480, - "n_fft": 960, - "crop_start": 78, - "crop_stop": 383, - "hpf_start": 130, - "hpf_stop": 86, - "res_type": "kaiser_fast" - } - }, - "sr": 44100, - "pre_filter_start": 668, - "pre_filter_stop": 672 -} \ No newline at end of file diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json b/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json deleted file mode 100644 index 2e5c770fe188779bf6b0873190b7a324d6a867b2..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "bins": 672, - "unstable_bins": 8, - "reduction_bins": 637, - "band": { - "1": { - "sr": 7350, - "hl": 80, - "n_fft": 640, - "crop_start": 0, - "crop_stop": 85, - "lpf_start": 25, - "lpf_stop": 53, - "res_type": "polyphase" - }, - "2": { - "sr": 7350, - "hl": 80, - "n_fft": 320, - "crop_start": 4, - "crop_stop": 87, - "hpf_start": 25, - "hpf_stop": 12, - "lpf_start": 31, - "lpf_stop": 62, - "res_type": "polyphase" - }, - "3": { - "sr": 14700, - "hl": 160, - "n_fft": 512, - "crop_start": 17, - "crop_stop": 216, - "hpf_start": 48, - 
"hpf_stop": 24, - "lpf_start": 139, - "lpf_stop": 210, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 480, - "n_fft": 960, - "crop_start": 78, - "crop_stop": 383, - "hpf_start": 130, - "hpf_stop": 86, - "convert_channels": "stereo_n", - "res_type": "kaiser_fast" - } - }, - "sr": 44100, - "pre_filter_start": 668, - "pre_filter_stop": 672 -} \ No newline at end of file diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json b/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json deleted file mode 100644 index 2a73bc97ac545145a75bdca7addc5d59f5b8574b..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "bins": 672, - "unstable_bins": 8, - "reduction_bins": 530, - "band": { - "1": { - "sr": 7350, - "hl": 80, - "n_fft": 640, - "crop_start": 0, - "crop_stop": 85, - "lpf_start": 25, - "lpf_stop": 53, - "res_type": "polyphase" - }, - "2": { - "sr": 7350, - "hl": 80, - "n_fft": 320, - "crop_start": 4, - "crop_stop": 87, - "hpf_start": 25, - "hpf_stop": 12, - "lpf_start": 31, - "lpf_stop": 62, - "res_type": "polyphase" - }, - "3": { - "sr": 14700, - "hl": 160, - "n_fft": 512, - "crop_start": 17, - "crop_stop": 216, - "hpf_start": 48, - "hpf_stop": 24, - "lpf_start": 139, - "lpf_stop": 210, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 480, - "n_fft": 960, - "crop_start": 78, - "crop_stop": 383, - "hpf_start": 130, - "hpf_stop": 86, - "res_type": "kaiser_fast" - } - }, - "sr": 44100, - "pre_filter_start": 668, - "pre_filter_stop": 672 -} \ No newline at end of file diff --git a/tools/uvr5/lib/lib_v5/modelparams/ensemble.json b/tools/uvr5/lib/lib_v5/modelparams/ensemble.json deleted file mode 100644 index ee69beb46fc82f34619c5e48761e329fcabbbd00..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/modelparams/ensemble.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "mid_side_b2": true, - "bins": 1280, - "unstable_bins": 7, - "reduction_bins": 565, - "band": { - "1": { - "sr": 11025, - "hl": 108, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 374, - "lpf_start": 92, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "2": { - "sr": 22050, - "hl": 216, - "n_fft": 1536, - "crop_start": 0, - "crop_stop": 424, - "hpf_start": 68, - "hpf_stop": 34, - "lpf_start": 348, - "lpf_stop": 418, - "res_type": "polyphase" - }, - "3": { - "sr": 44100, - "hl": 432, - "n_fft": 1280, - "crop_start": 132, - "crop_stop": 614, - "hpf_start": 172, - "hpf_stop": 144, - "res_type": "polyphase" - } - }, - "sr": 44100, - "pre_filter_start": 1280, - "pre_filter_stop": 1280 -} \ No newline at end of file diff --git a/tools/uvr5/lib/lib_v5/nets.py b/tools/uvr5/lib/lib_v5/nets.py deleted file mode 100644 index 5da3948c2f2e9edcc3cdac49bdf9f738e403de40..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/nets.py +++ /dev/null @@ -1,123 +0,0 @@ -import layers -import torch -import torch.nn.functional as F -from torch import nn - -from . 
import spec_utils - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 16) - self.stg1_high_band_net = BaseASPPNet(2, 16) - - self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(8, 16) - - self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(16, 32) - - self.out = nn.Conv2d(32, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/tools/uvr5/lib/lib_v5/nets_123812KB.py b/tools/uvr5/lib/lib_v5/nets_123812KB.py deleted file mode 100644 index 167d4cb2198863cf43e93440f7e63c5342fc7605..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/nets_123812KB.py +++ /dev/null @@ -1,122 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . 
import layers_123821KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 32) - self.stg1_high_band_net = BaseASPPNet(2, 32) - - self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(16, 32) - - self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(32, 64) - - self.out = nn.Conv2d(64, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/tools/uvr5/lib/lib_v5/nets_123821KB.py b/tools/uvr5/lib/lib_v5/nets_123821KB.py deleted file mode 100644 index 167d4cb2198863cf43e93440f7e63c5342fc7605..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/nets_123821KB.py +++ /dev/null @@ -1,122 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . 
import layers_123821KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 32) - self.stg1_high_band_net = BaseASPPNet(2, 32) - - self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(16, 32) - - self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(32, 64) - - self.out = nn.Conv2d(64, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/tools/uvr5/lib/lib_v5/nets_33966KB.py b/tools/uvr5/lib/lib_v5/nets_33966KB.py deleted file mode 100644 index 73a5b836177b706c306e27875f8391c1aed4b948..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/nets_33966KB.py +++ /dev/null @@ -1,122 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . 
import layers_33966KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16, 32)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 16) - self.stg1_high_band_net = BaseASPPNet(2, 16) - - self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(8, 16) - - self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(16, 32) - - self.out = nn.Conv2d(32, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/tools/uvr5/lib/lib_v5/nets_537227KB.py b/tools/uvr5/lib/lib_v5/nets_537227KB.py deleted file mode 100644 index 823b44fb64898e8dcbb12180ba45d1718f9b03f7..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/nets_537227KB.py +++ /dev/null @@ -1,123 +0,0 @@ -import numpy as np -import torch -import torch.nn.functional as F -from torch import nn - -from . 
import layers_537238KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 64) - self.stg1_high_band_net = BaseASPPNet(2, 64) - - self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(32, 64) - - self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(64, 128) - - self.out = nn.Conv2d(128, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/tools/uvr5/lib/lib_v5/nets_537238KB.py b/tools/uvr5/lib/lib_v5/nets_537238KB.py deleted file mode 100644 index 823b44fb64898e8dcbb12180ba45d1718f9b03f7..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/nets_537238KB.py +++ /dev/null @@ -1,123 +0,0 @@ -import numpy as np -import torch -import torch.nn.functional as F -from torch import nn - -from . 
import layers_537238KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 64) - self.stg1_high_band_net = BaseASPPNet(2, 64) - - self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(32, 64) - - self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(64, 128) - - self.out = nn.Conv2d(128, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/tools/uvr5/lib/lib_v5/nets_61968KB.py b/tools/uvr5/lib/lib_v5/nets_61968KB.py deleted file mode 100644 index 167d4cb2198863cf43e93440f7e63c5342fc7605..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/nets_61968KB.py +++ /dev/null @@ -1,122 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . 
import layers_123821KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 32) - self.stg1_high_band_net = BaseASPPNet(2, 32) - - self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(16, 32) - - self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(32, 64) - - self.out = nn.Conv2d(64, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/tools/uvr5/lib/lib_v5/nets_new.py b/tools/uvr5/lib/lib_v5/nets_new.py deleted file mode 100644 index 1c0f4fa96d921e979fe31bd4151701b7783fbcea..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/nets_new.py +++ /dev/null @@ -1,133 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . 
import layers_new - - -class BaseNet(nn.Module): - def __init__( - self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6)) - ): - super(BaseNet, self).__init__() - self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1) - self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1) - self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1) - self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1) - self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1) - - self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) - - self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) - self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) - self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) - self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm) - self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) - - def __call__(self, x): - e1 = self.enc1(x) - e2 = self.enc2(e1) - e3 = self.enc3(e2) - e4 = self.enc4(e3) - e5 = self.enc5(e4) - - h = self.aspp(e5) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = torch.cat([h, self.lstm_dec2(h)], dim=1) - h = self.dec1(h, e1) - - return h - - -class CascadedNet(nn.Module): - def __init__(self, n_fft, nout=32, nout_lstm=128): - super(CascadedNet, self).__init__() - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - self.nin_lstm = self.max_bin // 2 - self.offset = 64 - - self.stg1_low_band_net = nn.Sequential( - BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm), - layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), - ) - - self.stg1_high_band_net = BaseNet( - 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2 - ) - - self.stg2_low_band_net = nn.Sequential( - BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), - layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), - ) - self.stg2_high_band_net = BaseNet( - nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2 - ) - - self.stg3_full_band_net = BaseNet( - 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm - ) - - self.out = nn.Conv2d(nout, 2, 1, bias=False) - self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) - - def forward(self, x): - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - l1_in = x[:, :, :bandw] - h1_in = x[:, :, bandw:] - l1 = self.stg1_low_band_net(l1_in) - h1 = self.stg1_high_band_net(h1_in) - aux1 = torch.cat([l1, h1], dim=2) - - l2_in = torch.cat([l1_in, l1], dim=1) - h2_in = torch.cat([h1_in, h1], dim=1) - l2 = self.stg2_low_band_net(l2_in) - h2 = self.stg2_high_band_net(h2_in) - aux2 = torch.cat([l2, h2], dim=2) - - f3_in = torch.cat([x, aux1, aux2], dim=1) - f3 = self.stg3_full_band_net(f3_in) - - mask = torch.sigmoid(self.out(f3)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux = torch.cat([aux1, aux2], dim=1) - aux = torch.sigmoid(self.aux_out(aux)) - aux = F.pad( - input=aux, - pad=(0, 0, 0, self.output_bin - aux.size()[2]), - mode="replicate", - ) - return mask, aux - else: - return mask - - def predict_mask(self, x): - mask = self.forward(x) - - if self.offset > 0: - mask = mask[:, :, :, self.offset : -self.offset] - assert mask.size()[3] > 0 - - return mask - - def predict(self, x, aggressiveness=None): - mask = self.forward(x) - pred_mag = x * mask - - if self.offset > 0: - pred_mag = pred_mag[:, :, :, self.offset : -self.offset] - assert pred_mag.size()[3] > 0 - - return pred_mag diff --git 
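For orientation, the cascaded band-split design shared by the deleted nets_*.py variants can be exercised in isolation. The following is a minimal sketch under assumptions: it presumes the deleted nets_new module (and its companion layers_new) is still importable under the old tools.uvr5.lib.lib_v5 path, and the tensor sizes are illustrative.

import torch
from tools.uvr5.lib.lib_v5.nets_new import CascadedNet

n_fft = 2048
model = CascadedNet(n_fft, nout=32, nout_lstm=128).eval()

# (batch, stereo channels, frequency bins, time frames); forward() consumes the
# first n_fft // 2 bins and pads the sigmoid mask back to n_fft // 2 + 1 bins.
mag = torch.rand(1, 2, n_fft // 2 + 1, 256)
with torch.no_grad():
    mask = model(mag)
print(mask.shape)  # expected: torch.Size([1, 2, 1025, 256])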
a/tools/uvr5/lib/lib_v5/spec_utils.py b/tools/uvr5/lib/lib_v5/spec_utils.py deleted file mode 100644 index da072e4b2dd59b5382d3ebde818df286f9153f38..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/lib_v5/spec_utils.py +++ /dev/null @@ -1,676 +0,0 @@ -import hashlib -import json -import math -import os - -import librosa -import numpy as np -import soundfile as sf -from tqdm import tqdm - - -def crop_center(h1, h2): - h1_shape = h1.size() - h2_shape = h2.size() - - if h1_shape[3] == h2_shape[3]: - return h1 - elif h1_shape[3] < h2_shape[3]: - raise ValueError("h1_shape[3] must be greater than h2_shape[3]") - - # s_freq = (h2_shape[2] - h1_shape[2]) // 2 - # e_freq = s_freq + h1_shape[2] - s_time = (h1_shape[3] - h2_shape[3]) // 2 - e_time = s_time + h2_shape[3] - h1 = h1[:, :, :, s_time:e_time] - - return h1 - - -def wave_to_spectrogram( - wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False -): - if reverse: - wave_left = np.flip(np.asfortranarray(wave[0])) - wave_right = np.flip(np.asfortranarray(wave[1])) - elif mid_side: - wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) - elif mid_side_b2: - wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) - else: - wave_left = np.asfortranarray(wave[0]) - wave_right = np.asfortranarray(wave[1]) - - spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length) - spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length) - - spec = np.asfortranarray([spec_left, spec_right]) - - return spec - - -def wave_to_spectrogram_mt( - wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False -): - import threading - - if reverse: - wave_left = np.flip(np.asfortranarray(wave[0])) - wave_right = np.flip(np.asfortranarray(wave[1])) - elif mid_side: - wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) - elif mid_side_b2: - wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) - else: - wave_left = np.asfortranarray(wave[0]) - wave_right = np.asfortranarray(wave[1]) - - def run_thread(**kwargs): - global spec_left - spec_left = librosa.stft(**kwargs) - - thread = threading.Thread( - target=run_thread, - kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length}, - ) - thread.start() - spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length) - thread.join() - - spec = np.asfortranarray([spec_left, spec_right]) - - return spec - - -def combine_spectrograms(specs, mp): - l = min([specs[i].shape[2] for i in specs]) - spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64) - offset = 0 - bands_n = len(mp.param["band"]) - - for d in range(1, bands_n + 1): - h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"] - spec_c[:, offset : offset + h, :l] = specs[d][ - :, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l - ] - offset += h - - if offset > mp.param["bins"]: - raise ValueError("Too much bins") - - # lowpass fiter - if ( - mp.param["pre_filter_start"] > 0 - ): # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']: - if bands_n == 1: - spec_c = fft_lp_filter( - spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"] - ) - else: - gp = 1 - for b in range( - mp.param["pre_filter_start"] + 
1, mp.param["pre_filter_stop"] - ): - g = math.pow( - 10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0 - ) - gp = g - spec_c[:, b, :] *= g - - return np.asfortranarray(spec_c) - - -def spectrogram_to_image(spec, mode="magnitude"): - if mode == "magnitude": - if np.iscomplexobj(spec): - y = np.abs(spec) - else: - y = spec - y = np.log10(y**2 + 1e-8) - elif mode == "phase": - if np.iscomplexobj(spec): - y = np.angle(spec) - else: - y = spec - - y -= y.min() - y *= 255 / y.max() - img = np.uint8(y) - - if y.ndim == 3: - img = img.transpose(1, 2, 0) - img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2) - - return img - - -def reduce_vocal_aggressively(X, y, softmask): - v = X - y - y_mag_tmp = np.abs(y) - v_mag_tmp = np.abs(v) - - v_mask = v_mag_tmp > y_mag_tmp - y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf) - - return y_mag * np.exp(1.0j * np.angle(y)) - - -def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32): - if min_range < fade_size * 2: - raise ValueError("min_range must be >= fade_area * 2") - - mag = mag.copy() - - idx = np.where(ref.mean(axis=(0, 1)) < thres)[0] - starts = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0]) - ends = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1]) - uninformative = np.where(ends - starts > min_range)[0] - if len(uninformative) > 0: - starts = starts[uninformative] - ends = ends[uninformative] - old_e = None - for s, e in zip(starts, ends): - if old_e is not None and s - old_e < fade_size: - s = old_e - fade_size * 2 - - if s != 0: - weight = np.linspace(0, 1, fade_size) - mag[:, :, s : s + fade_size] += weight * ref[:, :, s : s + fade_size] - else: - s -= fade_size - - if e != mag.shape[2]: - weight = np.linspace(1, 0, fade_size) - mag[:, :, e - fade_size : e] += weight * ref[:, :, e - fade_size : e] - else: - e += fade_size - - mag[:, :, s + fade_size : e - fade_size] += ref[ - :, :, s + fade_size : e - fade_size - ] - old_e = e - - return mag - - -def align_wave_head_and_tail(a, b): - l = min([a[0].size, b[0].size]) - - return a[:l, :l], b[:l, :l] - - -def cache_or_load(mix_path, inst_path, mp): - mix_basename = os.path.splitext(os.path.basename(mix_path))[0] - inst_basename = os.path.splitext(os.path.basename(inst_path))[0] - - cache_dir = "mph{}".format( - hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest() - ) - mix_cache_dir = os.path.join("cache", cache_dir) - inst_cache_dir = os.path.join("cache", cache_dir) - - os.makedirs(mix_cache_dir, exist_ok=True) - os.makedirs(inst_cache_dir, exist_ok=True) - - mix_cache_path = os.path.join(mix_cache_dir, mix_basename + ".npy") - inst_cache_path = os.path.join(inst_cache_dir, inst_basename + ".npy") - - if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path): - X_spec_m = np.load(mix_cache_path) - y_spec_m = np.load(inst_cache_path) - else: - X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} - - for d in range(len(mp.param["band"]), 0, -1): - bp = mp.param["band"][d] - - if d == len(mp.param["band"]): # high-end band - X_wave[d], _ = librosa.load( - mix_path, - sr = bp["sr"], - mono = False, - dtype = np.float32, - res_type = bp["res_type"] - ) - y_wave[d], _ = librosa.load( - inst_path, - sr = bp["sr"], - mono = False, - dtype = np.float32, - res_type = bp["res_type"], - ) - else: # lower bands - X_wave[d] = librosa.resample( - X_wave[d + 1], - orig_sr = mp.param["band"][d + 1]["sr"], - target_sr = bp["sr"], - res_type = bp["res_type"], - ) - y_wave[d] = librosa.resample( - 
y_wave[d + 1], - orig_sr = mp.param["band"][d + 1]["sr"], - target_sr = bp["sr"], - res_type = bp["res_type"], - ) - - X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d]) - - X_spec_s[d] = wave_to_spectrogram( - X_wave[d], - bp["hl"], - bp["n_fft"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ) - y_spec_s[d] = wave_to_spectrogram( - y_wave[d], - bp["hl"], - bp["n_fft"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ) - - del X_wave, y_wave - - X_spec_m = combine_spectrograms(X_spec_s, mp) - y_spec_m = combine_spectrograms(y_spec_s, mp) - - if X_spec_m.shape != y_spec_m.shape: - raise ValueError("The combined spectrograms are different: " + mix_path) - - _, ext = os.path.splitext(mix_path) - - np.save(mix_cache_path, X_spec_m) - np.save(inst_cache_path, y_spec_m) - - return X_spec_m, y_spec_m - - -def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse): - spec_left = np.asfortranarray(spec[0]) - spec_right = np.asfortranarray(spec[1]) - - wave_left = librosa.istft(spec_left, hop_length=hop_length) - wave_right = librosa.istft(spec_right, hop_length=hop_length) - - if reverse: - return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) - elif mid_side: - return np.asfortranarray( - [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)] - ) - elif mid_side_b2: - return np.asfortranarray( - [ - np.add(wave_right / 1.25, 0.4 * wave_left), - np.subtract(wave_left / 1.25, 0.4 * wave_right), - ] - ) - else: - return np.asfortranarray([wave_left, wave_right]) - - -def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2): - import threading - - spec_left = np.asfortranarray(spec[0]) - spec_right = np.asfortranarray(spec[1]) - - def run_thread(**kwargs): - global wave_left - wave_left = librosa.istft(**kwargs) - - thread = threading.Thread( - target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length} - ) - thread.start() - wave_right = librosa.istft(spec_right, hop_length=hop_length) - thread.join() - - if reverse: - return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) - elif mid_side: - return np.asfortranarray( - [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)] - ) - elif mid_side_b2: - return np.asfortranarray( - [ - np.add(wave_right / 1.25, 0.4 * wave_left), - np.subtract(wave_left / 1.25, 0.4 * wave_right), - ] - ) - else: - return np.asfortranarray([wave_left, wave_right]) - - -def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None): - wave_band = {} - bands_n = len(mp.param["band"]) - offset = 0 - - for d in range(1, bands_n + 1): - bp = mp.param["band"][d] - spec_s = np.ndarray( - shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex - ) - h = bp["crop_stop"] - bp["crop_start"] - spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[ - :, offset : offset + h, : - ] - - offset += h - if d == bands_n: # higher - if extra_bins_h: # if --high_end_process bypass - max_bin = bp["n_fft"] // 2 - spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[ - :, :extra_bins_h, : - ] - if bp["hpf_start"] > 0: - spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) - if bands_n == 1: - wave = spectrogram_to_wave( - spec_s, - bp["hl"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ) - else: - wave = np.add( - wave, - spectrogram_to_wave( - spec_s, - bp["hl"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - 
mp.param["reverse"], - ), - ) - else: - sr = mp.param["band"][d + 1]["sr"] - if d == 1: # lower - spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) - wave = librosa.resample( - spectrogram_to_wave( - spec_s, - bp["hl"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ), - orig_sr = bp["sr"], - target_sr = sr, - res_type = "sinc_fastest", - ) - else: # mid - spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) - spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) - wave2 = np.add( - wave, - spectrogram_to_wave( - spec_s, - bp["hl"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ), - ) - # wave = librosa.core.resample(wave2, orig_sr=bp['sr'], target_sr=sr, res_type="sinc_fastest") - wave = librosa.core.resample(wave2, orig_sr=bp["sr"], target_sr=sr, res_type="scipy") - - return wave.T - - -def fft_lp_filter(spec, bin_start, bin_stop): - g = 1.0 - for b in range(bin_start, bin_stop): - g -= 1 / (bin_stop - bin_start) - spec[:, b, :] = g * spec[:, b, :] - - spec[:, bin_stop:, :] *= 0 - - return spec - - -def fft_hp_filter(spec, bin_start, bin_stop): - g = 1.0 - for b in range(bin_start, bin_stop, -1): - g -= 1 / (bin_start - bin_stop) - spec[:, b, :] = g * spec[:, b, :] - - spec[:, 0 : bin_stop + 1, :] *= 0 - - return spec - - -def mirroring(a, spec_m, input_high_end, mp): - if "mirroring" == a: - mirror = np.flip( - np.abs( - spec_m[ - :, - mp.param["pre_filter_start"] - - 10 - - input_high_end.shape[1] : mp.param["pre_filter_start"] - - 10, - :, - ] - ), - 1, - ) - mirror = mirror * np.exp(1.0j * np.angle(input_high_end)) - - return np.where( - np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror - ) - - if "mirroring2" == a: - mirror = np.flip( - np.abs( - spec_m[ - :, - mp.param["pre_filter_start"] - - 10 - - input_high_end.shape[1] : mp.param["pre_filter_start"] - - 10, - :, - ] - ), - 1, - ) - mi = np.multiply(mirror, input_high_end * 1.7) - - return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi) - - -def ensembling(a, specs): - for i in range(1, len(specs)): - if i == 1: - spec = specs[0] - - ln = min([spec.shape[2], specs[i].shape[2]]) - spec = spec[:, :, :ln] - specs[i] = specs[i][:, :, :ln] - - if "min_mag" == a: - spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec) - if "max_mag" == a: - spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec) - - return spec - - -def stft(wave, nfft, hl): - wave_left = np.asfortranarray(wave[0]) - wave_right = np.asfortranarray(wave[1]) - spec_left = librosa.stft(wave_left, n_fft=nfft, hop_length=hl) - spec_right = librosa.stft(wave_right, n_fft=nfft, hop_length=hl) - spec = np.asfortranarray([spec_left, spec_right]) - - return spec - - -def istft(spec, hl): - spec_left = np.asfortranarray(spec[0]) - spec_right = np.asfortranarray(spec[1]) - - wave_left = librosa.istft(spec_left, hop_length=hl) - wave_right = librosa.istft(spec_right, hop_length=hl) - wave = np.asfortranarray([wave_left, wave_right]) - - -if __name__ == "__main__": - import argparse - import sys - import time - - import cv2 - from model_param_init import ModelParameters - - p = argparse.ArgumentParser() - p.add_argument( - "--algorithm", - "-a", - type=str, - choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"], - default="min_mag", - ) - p.add_argument( - "--model_params", - "-m", - type=str, - default=os.path.join("modelparams", "1band_sr44100_hl512.json"), - ) - p.add_argument("--output_name", "-o", type=str, 
default="output") - p.add_argument("--vocals_only", "-v", action="store_true") - p.add_argument("input", nargs="+") - args = p.parse_args() - - start_time = time.time() - - if args.algorithm.startswith("invert") and len(args.input) != 2: - raise ValueError("There should be two input files.") - - if not args.algorithm.startswith("invert") and len(args.input) < 2: - raise ValueError("There must be at least two input files.") - - wave, specs = {}, {} - mp = ModelParameters(args.model_params) - - for i in range(len(args.input)): - spec = {} - - for d in range(len(mp.param["band"]), 0, -1): - bp = mp.param["band"][d] - - if d == len(mp.param["band"]): # high-end band - wave[d], _ = librosa.load( - args.input[i], - sr = bp["sr"], - mono = False, - dtype = np.float32, - res_type = bp["res_type"], - ) - - if len(wave[d].shape) == 1: # mono to stereo - wave[d] = np.array([wave[d], wave[d]]) - else: # lower bands - wave[d] = librosa.resample( - wave[d + 1], - orig_sr = mp.param["band"][d + 1]["sr"], - target_sr = bp["sr"], - res_type = bp["res_type"], - ) - - spec[d] = wave_to_spectrogram( - wave[d], - bp["hl"], - bp["n_fft"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ) - - specs[i] = combine_spectrograms(spec, mp) - - del wave - - if args.algorithm == "deep": - d_spec = np.where(np.abs(specs[0]) <= np.abs(spec[1]), specs[0], spec[1]) - v_spec = d_spec - specs[1] - sf.write( - os.path.join("{}.wav".format(args.output_name)), - cmb_spectrogram_to_wave(v_spec, mp), - mp.param["sr"], - ) - - if args.algorithm.startswith("invert"): - ln = min([specs[0].shape[2], specs[1].shape[2]]) - specs[0] = specs[0][:, :, :ln] - specs[1] = specs[1][:, :, :ln] - - if "invert_p" == args.algorithm: - X_mag = np.abs(specs[0]) - y_mag = np.abs(specs[1]) - max_mag = np.where(X_mag >= y_mag, X_mag, y_mag) - v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0])) - else: - specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2) - v_spec = specs[0] - specs[1] - - if not args.vocals_only: - X_mag = np.abs(specs[0]) - y_mag = np.abs(specs[1]) - v_mag = np.abs(v_spec) - - X_image = spectrogram_to_image(X_mag) - y_image = spectrogram_to_image(y_mag) - v_image = spectrogram_to_image(v_mag) - - cv2.imwrite("{}_X.png".format(args.output_name), X_image) - cv2.imwrite("{}_y.png".format(args.output_name), y_image) - cv2.imwrite("{}_v.png".format(args.output_name), v_image) - - sf.write( - "{}_X.wav".format(args.output_name), - cmb_spectrogram_to_wave(specs[0], mp), - mp.param["sr"], - ) - sf.write( - "{}_y.wav".format(args.output_name), - cmb_spectrogram_to_wave(specs[1], mp), - mp.param["sr"], - ) - - sf.write( - "{}_v.wav".format(args.output_name), - cmb_spectrogram_to_wave(v_spec, mp), - mp.param["sr"], - ) - else: - if not args.algorithm == "deep": - sf.write( - os.path.join("ensembled", "{}.wav".format(args.output_name)), - cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp), - mp.param["sr"], - ) - - if args.algorithm == "align": - trackalignment = [ - { - "file1": '"{}"'.format(args.input[0]), - "file2": '"{}"'.format(args.input[1]), - } - ] - - for i, e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."): - os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}") - - # print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1)) diff --git a/tools/uvr5/lib/name_params.json b/tools/uvr5/lib/name_params.json deleted file mode 100644 index 4e5ee7bec45de4740f8402c42537c9a98681c95e..0000000000000000000000000000000000000000 --- 
a/tools/uvr5/lib/name_params.json +++ /dev/null @@ -1,263 +0,0 @@ -{ - "equivalent" : [ - { - "model_hash_name" : [ - { - "hash_name": "47939caf0cfe52a0e81442b85b971dfd", - "model_params": "lib/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100" - }, - { - "hash_name": "4e4ecb9764c50a8c414fee6e10395bbe", - "model_params": "lib/lib_v5/modelparams/4band_v2.json", - "param_name": "4band_v2" - }, - { - "hash_name": "ca106edd563e034bde0bdec4bb7a4b36", - "model_params": "lib/lib_v5/modelparams/4band_v2.json", - "param_name": "4band_v2" - }, - { - "hash_name": "e60a1e84803ce4efc0a6551206cc4b71", - "model_params": "lib/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100" - }, - { - "hash_name": "a82f14e75892e55e994376edbf0c8435", - "model_params": "lib/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100" - }, - { - "hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06", - "model_params": "lib/lib_v5/modelparams/4band_v2_sn.json", - "param_name": "4band_v2_sn" - }, - { - "hash_name": "08611fb99bd59eaa79ad27c58d137727", - "model_params": "lib/lib_v5/modelparams/4band_v2_sn.json", - "param_name": "4band_v2_sn" - }, - { - "hash_name": "5c7bbca45a187e81abbbd351606164e5", - "model_params": "lib/lib_v5/modelparams/3band_44100_msb2.json", - "param_name": "3band_44100_msb2" - }, - { - "hash_name": "d6b2cb685a058a091e5e7098192d3233", - "model_params": "lib/lib_v5/modelparams/3band_44100_msb2.json", - "param_name": "3band_44100_msb2" - }, - { - "hash_name": "c1b9f38170a7c90e96f027992eb7c62b", - "model_params": "lib/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100" - }, - { - "hash_name": "c3448ec923fa0edf3d03a19e633faa53", - "model_params": "lib/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100" - }, - { - "hash_name": "68aa2c8093d0080704b200d140f59e54", - "model_params": "lib/lib_v5/modelparams/3band_44100.json", - "param_name": "3band_44100" - }, - { - "hash_name": "fdc83be5b798e4bd29fe00fe6600e147", - "model_params": "lib/lib_v5/modelparams/3band_44100_mid.json", - "param_name": "3band_44100_mid.json" - }, - { - "hash_name": "2ce34bc92fd57f55db16b7a4def3d745", - "model_params": "lib/lib_v5/modelparams/3band_44100_mid.json", - "param_name": "3band_44100_mid.json" - }, - { - "hash_name": "52fdca89576f06cf4340b74a4730ee5f", - "model_params": "lib/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100.json" - }, - { - "hash_name": "41191165b05d38fc77f072fa9e8e8a30", - "model_params": "lib/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100.json" - }, - { - "hash_name": "89e83b511ad474592689e562d5b1f80e", - "model_params": "lib/lib_v5/modelparams/2band_32000.json", - "param_name": "2band_32000.json" - }, - { - "hash_name": "0b954da81d453b716b114d6d7c95177f", - "model_params": "lib/lib_v5/modelparams/2band_32000.json", - "param_name": "2band_32000.json" - } - - ], - "v4 Models": [ - { - "hash_name": "6a00461c51c2920fd68937d4609ed6c8", - "model_params": "lib/lib_v5/modelparams/1band_sr16000_hl512.json", - "param_name": "1band_sr16000_hl512" - }, - { - "hash_name": "0ab504864d20f1bd378fe9c81ef37140", - "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json", - "param_name": "1band_sr32000_hl512" - }, - { - "hash_name": "7dd21065bf91c10f7fccb57d7d83b07f", - "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json", - "param_name": "1band_sr32000_hl512" - }, - { - "hash_name": "80ab74d65e515caa3622728d2de07d23", - "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json", - "param_name": 
"1band_sr32000_hl512" - }, - { - "hash_name": "edc115e7fc523245062200c00caa847f", - "model_params": "lib/lib_v5/modelparams/1band_sr33075_hl384.json", - "param_name": "1band_sr33075_hl384" - }, - { - "hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7", - "model_params": "lib/lib_v5/modelparams/1band_sr33075_hl384.json", - "param_name": "1band_sr33075_hl384" - }, - { - "hash_name": "b58090534c52cbc3e9b5104bad666ef2", - "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl512.json", - "param_name": "1band_sr44100_hl512" - }, - { - "hash_name": "0cdab9947f1b0928705f518f3c78ea8f", - "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl512.json", - "param_name": "1band_sr44100_hl512" - }, - { - "hash_name": "ae702fed0238afb5346db8356fe25f13", - "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl1024.json", - "param_name": "1band_sr44100_hl1024" - } - ] - } - ], - "User Models" : [ - { - "1 Band": [ - { - "hash_name": "1band_sr16000_hl512", - "model_params": "lib/lib_v5/modelparams/1band_sr16000_hl512.json", - "param_name": "1band_sr16000_hl512" - }, - { - "hash_name": "1band_sr32000_hl512", - "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json", - "param_name": "1band_sr16000_hl512" - }, - { - "hash_name": "1band_sr33075_hl384", - "model_params": "lib/lib_v5/modelparams/1band_sr33075_hl384.json", - "param_name": "1band_sr33075_hl384" - }, - { - "hash_name": "1band_sr44100_hl256", - "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl256.json", - "param_name": "1band_sr44100_hl256" - }, - { - "hash_name": "1band_sr44100_hl512", - "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl512.json", - "param_name": "1band_sr44100_hl512" - }, - { - "hash_name": "1band_sr44100_hl1024", - "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl1024.json", - "param_name": "1band_sr44100_hl1024" - } - ], - "2 Band": [ - { - "hash_name": "2band_44100_lofi", - "model_params": "lib/lib_v5/modelparams/2band_44100_lofi.json", - "param_name": "2band_44100_lofi" - }, - { - "hash_name": "2band_32000", - "model_params": "lib/lib_v5/modelparams/2band_32000.json", - "param_name": "2band_32000" - }, - { - "hash_name": "2band_48000", - "model_params": "lib/lib_v5/modelparams/2band_48000.json", - "param_name": "2band_48000" - } - ], - "3 Band": [ - { - "hash_name": "3band_44100", - "model_params": "lib/lib_v5/modelparams/3band_44100.json", - "param_name": "3band_44100" - }, - { - "hash_name": "3band_44100_mid", - "model_params": "lib/lib_v5/modelparams/3band_44100_mid.json", - "param_name": "3band_44100_mid" - }, - { - "hash_name": "3band_44100_msb2", - "model_params": "lib/lib_v5/modelparams/3band_44100_msb2.json", - "param_name": "3band_44100_msb2" - } - ], - "4 Band": [ - { - "hash_name": "4band_44100", - "model_params": "lib/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100" - }, - { - "hash_name": "4band_44100_mid", - "model_params": "lib/lib_v5/modelparams/4band_44100_mid.json", - "param_name": "4band_44100_mid" - }, - { - "hash_name": "4band_44100_msb", - "model_params": "lib/lib_v5/modelparams/4band_44100_msb.json", - "param_name": "4band_44100_msb" - }, - { - "hash_name": "4band_44100_msb2", - "model_params": "lib/lib_v5/modelparams/4band_44100_msb2.json", - "param_name": "4band_44100_msb2" - }, - { - "hash_name": "4band_44100_reverse", - "model_params": "lib/lib_v5/modelparams/4band_44100_reverse.json", - "param_name": "4band_44100_reverse" - }, - { - "hash_name": "4band_44100_sw", - "model_params": "lib/lib_v5/modelparams/4band_44100_sw.json", - "param_name": 
"4band_44100_sw" - }, - { - "hash_name": "4band_v2", - "model_params": "lib/lib_v5/modelparams/4band_v2.json", - "param_name": "4band_v2" - }, - { - "hash_name": "4band_v2_sn", - "model_params": "lib/lib_v5/modelparams/4band_v2_sn.json", - "param_name": "4band_v2_sn" - }, - { - "hash_name": "tmodelparam", - "model_params": "lib/lib_v5/modelparams/tmodelparam.json", - "param_name": "User Model Param Set" - } - ] - } - ] -} \ No newline at end of file diff --git a/tools/uvr5/lib/utils.py b/tools/uvr5/lib/utils.py deleted file mode 100644 index 5e8cd22fad3d26d89a3c9c09e9c569eae73d7275..0000000000000000000000000000000000000000 --- a/tools/uvr5/lib/utils.py +++ /dev/null @@ -1,121 +0,0 @@ -import json - -import numpy as np -import torch -from tqdm import tqdm - - -def load_data(file_name: str = "./lib/name_params.json") -> dict: - with open(file_name, "r") as f: - data = json.load(f) - - return data - - -def make_padding(width, cropsize, offset): - left = offset - roi_size = cropsize - left * 2 - if roi_size == 0: - roi_size = cropsize - right = roi_size - (width % roi_size) + left - - return left, right, roi_size - - -def inference(X_spec, device, model, aggressiveness, data): - """ - data : dic configs - """ - - def _execute( - X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True - ): - model.eval() - with torch.no_grad(): - preds = [] - - iterations = [n_window] - - total_iterations = sum(iterations) - for i in tqdm(range(n_window)): - start = i * roi_size - X_mag_window = X_mag_pad[ - None, :, :, start : start + data["window_size"] - ] - X_mag_window = torch.from_numpy(X_mag_window) - if is_half: - X_mag_window = X_mag_window.half() - X_mag_window = X_mag_window.to(device) - - pred = model.predict(X_mag_window, aggressiveness) - - pred = pred.detach().cpu().numpy() - preds.append(pred[0]) - - pred = np.concatenate(preds, axis=2) - return pred - - def preprocess(X_spec): - X_mag = np.abs(X_spec) - X_phase = np.angle(X_spec) - - return X_mag, X_phase - - X_mag, X_phase = preprocess(X_spec) - - coef = X_mag.max() - X_mag_pre = X_mag / coef - - n_frame = X_mag_pre.shape[2] - pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset) - n_window = int(np.ceil(n_frame / roi_size)) - - X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") - - if list(model.state_dict().values())[0].dtype == torch.float16: - is_half = True - else: - is_half = False - pred = _execute( - X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half - ) - pred = pred[:, :, :n_frame] - - if data["tta"]: - pad_l += roi_size // 2 - pad_r += roi_size // 2 - n_window += 1 - - X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") - - pred_tta = _execute( - X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half - ) - pred_tta = pred_tta[:, :, roi_size // 2 :] - pred_tta = pred_tta[:, :, :n_frame] - - return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase) - else: - return pred * coef, X_mag, np.exp(1.0j * X_phase) - - -def _get_name_params(model_path, model_hash): - data = load_data() - flag = False - ModelName = model_path - for type in list(data): - for model in list(data[type][0]): - for i in range(len(data[type][0][model])): - if str(data[type][0][model][i]["hash_name"]) == model_hash: - flag = True - elif str(data[type][0][model][i]["hash_name"]) in ModelName: - flag = True - - if flag: - model_params_auto = data[type][0][model][i]["model_params"] - param_name_auto = 
data[type][0][model][i]["param_name"] - if type == "equivalent": - return param_name_auto, model_params_auto - else: - flag = False - return param_name_auto, model_params_auto diff --git a/tools/uvr5/mdxnet.py b/tools/uvr5/mdxnet.py deleted file mode 100644 index 372db25b2e169e1821696608676838b3d3207e2e..0000000000000000000000000000000000000000 --- a/tools/uvr5/mdxnet.py +++ /dev/null @@ -1,256 +0,0 @@ -import os -import logging - -logger = logging.getLogger(__name__) - -import librosa -import numpy as np -import soundfile as sf -import torch -from tqdm import tqdm - -cpu = torch.device("cpu") - - -class ConvTDFNetTrim: - def __init__( - self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024 - ): - super(ConvTDFNetTrim, self).__init__() - - self.dim_f = dim_f - self.dim_t = 2**dim_t - self.n_fft = n_fft - self.hop = hop - self.n_bins = self.n_fft // 2 + 1 - self.chunk_size = hop * (self.dim_t - 1) - self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to( - device - ) - self.target_name = target_name - self.blender = "blender" in model_name - - self.dim_c = 4 - out_c = self.dim_c * 4 if target_name == "*" else self.dim_c - self.freq_pad = torch.zeros( - [1, out_c, self.n_bins - self.dim_f, self.dim_t] - ).to(device) - - self.n = L // 2 - - def stft(self, x): - x = x.reshape([-1, self.chunk_size]) - x = torch.stft( - x, - n_fft=self.n_fft, - hop_length=self.hop, - window=self.window, - center=True, - return_complex=True, - ) - x = torch.view_as_real(x) - x = x.permute([0, 3, 1, 2]) - x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape( - [-1, self.dim_c, self.n_bins, self.dim_t] - ) - return x[:, :, : self.dim_f] - - def istft(self, x, freq_pad=None): - freq_pad = ( - self.freq_pad.repeat([x.shape[0], 1, 1, 1]) - if freq_pad is None - else freq_pad - ) - x = torch.cat([x, freq_pad], -2) - c = 4 * 2 if self.target_name == "*" else 2 - x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape( - [-1, 2, self.n_bins, self.dim_t] - ) - x = x.permute([0, 2, 3, 1]) - x = x.contiguous() - x = torch.view_as_complex(x) - x = torch.istft( - x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True - ) - return x.reshape([-1, c, self.chunk_size]) - - -def get_models(device, dim_f, dim_t, n_fft): - return ConvTDFNetTrim( - device=device, - model_name="Conv-TDF", - target_name="vocals", - L=11, - dim_f=dim_f, - dim_t=dim_t, - n_fft=n_fft, - ) - - -class Predictor: - def __init__(self, args): - import onnxruntime as ort - - logger.info(ort.get_available_providers()) - self.args = args - self.model_ = get_models( - device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft - ) - self.model = ort.InferenceSession( - os.path.join(args.onnx, self.model_.target_name + ".onnx"), - providers=[ - "CUDAExecutionProvider", - "DmlExecutionProvider", - "CPUExecutionProvider", - ], - ) - logger.info("ONNX load done") - - def demix(self, mix): - samples = mix.shape[-1] - margin = self.args.margin - chunk_size = self.args.chunks * 44100 - assert not margin == 0, "margin cannot be zero!" 
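# Chunking scheme used below: the mix is cut into windows of
# args.chunks * 44100 samples, each extended by `margin` samples of context on
# both sides (no left context for the first window, clipped at the end of the
# file), and stored in segmented_mix keyed by the window's nominal start
# offset. demix_base() trims those margins again before concatenating the
# per-chunk separations.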
- if margin > chunk_size: - margin = chunk_size - - segmented_mix = {} - - if self.args.chunks == 0 or samples < chunk_size: - chunk_size = samples - - counter = -1 - for skip in range(0, samples, chunk_size): - counter += 1 - - s_margin = 0 if counter == 0 else margin - end = min(skip + chunk_size + margin, samples) - - start = skip - s_margin - - segmented_mix[skip] = mix[:, start:end].copy() - if end == samples: - break - - sources = self.demix_base(segmented_mix, margin_size=margin) - """ - mix:(2,big_sample) - segmented_mix:offset->(2,small_sample) - sources:(1,2,big_sample) - """ - return sources - - def demix_base(self, mixes, margin_size): - chunked_sources = [] - progress_bar = tqdm(total=len(mixes)) - progress_bar.set_description("Processing") - for mix in mixes: - cmix = mixes[mix] - sources = [] - n_sample = cmix.shape[1] - model = self.model_ - trim = model.n_fft // 2 - gen_size = model.chunk_size - 2 * trim - pad = gen_size - n_sample % gen_size - mix_p = np.concatenate( - (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1 - ) - mix_waves = [] - i = 0 - while i < n_sample + pad: - waves = np.array(mix_p[:, i : i + model.chunk_size]) - mix_waves.append(waves) - i += gen_size - mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(cpu) - with torch.no_grad(): - _ort = self.model - spek = model.stft(mix_waves) - if self.args.denoise: - spec_pred = ( - -_ort.run(None, {"input": -spek.cpu().numpy()})[0] * 0.5 - + _ort.run(None, {"input": spek.cpu().numpy()})[0] * 0.5 - ) - tar_waves = model.istft(torch.tensor(spec_pred)) - else: - tar_waves = model.istft( - torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0]) - ) - tar_signal = ( - tar_waves[:, :, trim:-trim] - .transpose(0, 1) - .reshape(2, -1) - .numpy()[:, :-pad] - ) - - start = 0 if mix == 0 else margin_size - end = None if mix == list(mixes.keys())[::-1][0] else -margin_size - if margin_size == 0: - end = None - sources.append(tar_signal[:, start:end]) - - progress_bar.update(1) - - chunked_sources.append(sources) - _sources = np.concatenate(chunked_sources, axis=-1) - # del self.model - progress_bar.close() - return _sources - - def prediction(self, m, vocal_root, others_root, format): - os.makedirs(vocal_root, exist_ok=True) - os.makedirs(others_root, exist_ok=True) - basename = os.path.basename(m) - mix, rate = librosa.load(m, mono=False, sr=44100) - if mix.ndim == 1: - mix = np.asfortranarray([mix, mix]) - mix = mix.T - sources = self.demix(mix.T) - opt = sources[0].T - if format in ["wav", "flac"]: - sf.write( - "%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate - ) - sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate) - else: - path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename) - path_other = "%s/%s_others.wav" % (others_root, basename) - sf.write(path_vocal, mix - opt, rate) - sf.write(path_other, opt, rate) - opt_path_vocal = path_vocal[:-4] + ".%s" % format - opt_path_other = path_other[:-4] + ".%s" % format - if os.path.exists(path_vocal): - os.system( - "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_vocal, opt_path_vocal) - ) - if os.path.exists(opt_path_vocal): - try: - os.remove(path_vocal) - except: - pass - if os.path.exists(path_other): - os.system( - "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_other, opt_path_other) - ) - if os.path.exists(opt_path_other): - try: - os.remove(path_other) - except: - pass - - -class MDXNetDereverb: - def __init__(self, chunks): - self.onnx = 
"%s/uvr5_weights/onnx_dereverb_By_FoxJoy"%os.path.dirname(os.path.abspath(__file__)) - self.shifts = 10 # 'Predict with randomised equivariant stabilisation' - self.mixing = "min_mag" # ['default','min_mag','max_mag'] - self.chunks = chunks - self.margin = 44100 - self.dim_t = 9 - self.dim_f = 3072 - self.n_fft = 6144 - self.denoise = True - self.pred = Predictor(self) - self.device = cpu - - def _path_audio_(self, input, others_root, vocal_root, format, is_hp3=False): - self.pred.prediction(input, vocal_root, others_root, format) diff --git a/tools/uvr5/uvr5_weights/.gitignore b/tools/uvr5/uvr5_weights/.gitignore deleted file mode 100644 index d6b7ef32c8478a48c3994dcadc86837f4371184d..0000000000000000000000000000000000000000 --- a/tools/uvr5/uvr5_weights/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/tools/uvr5/vr.py b/tools/uvr5/vr.py deleted file mode 100644 index 640392a4723ac94d076aa4765546bcac240d2717..0000000000000000000000000000000000000000 --- a/tools/uvr5/vr.py +++ /dev/null @@ -1,370 +0,0 @@ -import os,sys -parent_directory = os.path.dirname(os.path.abspath(__file__)) -import logging,pdb -logger = logging.getLogger(__name__) - -import librosa -import numpy as np -import soundfile as sf -import torch -from lib.lib_v5 import nets_61968KB as Nets -from lib.lib_v5 import spec_utils -from lib.lib_v5.model_param_init import ModelParameters -from lib.lib_v5.nets_new import CascadedNet -from lib.utils import inference - - -class AudioPre: - def __init__(self, agg, model_path, device, is_half, tta=False): - self.model_path = model_path - self.device = device - self.data = { - # Processing Options - "postprocess": False, - "tta": tta, - # Constants - "window_size": 512, - "agg": agg, - "high_end_process": "mirroring", - } - mp = ModelParameters("%s/lib/lib_v5/modelparams/4band_v2.json"%parent_directory) - model = Nets.CascadedASPPNet(mp.param["bins"] * 2) - cpk = torch.load(model_path, map_location="cpu") - model.load_state_dict(cpk) - model.eval() - if is_half: - model = model.half().to(device) - else: - model = model.to(device) - - self.mp = mp - self.model = model - - def _path_audio_( - self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False - ): - if ins_root is None and vocal_root is None: - return "No save root." 
- name = os.path.basename(music_file) - if ins_root is not None: - os.makedirs(ins_root, exist_ok=True) - if vocal_root is not None: - os.makedirs(vocal_root, exist_ok=True) - X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} - bands_n = len(self.mp.param["band"]) - # print(bands_n) - for d in range(bands_n, 0, -1): - bp = self.mp.param["band"][d] - if d == bands_n: # high-end band - ( - X_wave[d], - _, - ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 - music_file, - sr = bp["sr"], - mono = False, - dtype = np.float32, - res_type = bp["res_type"], - ) - if X_wave[d].ndim == 1: - X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) - else: # lower bands - X_wave[d] = librosa.core.resample( - X_wave[d + 1], - orig_sr = self.mp.param["band"][d + 1]["sr"], - target_sr = bp["sr"], - res_type = bp["res_type"], - ) - # Stft of wave source - X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( - X_wave[d], - bp["hl"], - bp["n_fft"], - self.mp.param["mid_side"], - self.mp.param["mid_side_b2"], - self.mp.param["reverse"], - ) - # pdb.set_trace() - if d == bands_n and self.data["high_end_process"] != "none": - input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( - self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] - ) - input_high_end = X_spec_s[d][ - :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : - ] - - X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) - aggresive_set = float(self.data["agg"] / 100) - aggressiveness = { - "value": aggresive_set, - "split_bin": self.mp.param["band"][1]["crop_stop"], - } - with torch.no_grad(): - pred, X_mag, X_phase = inference( - X_spec_m, self.device, self.model, aggressiveness, self.data - ) - # Postprocess - if self.data["postprocess"]: - pred_inv = np.clip(X_mag - pred, 0, np.inf) - pred = spec_utils.mask_silence(pred, pred_inv) - y_spec_m = pred * X_phase - v_spec_m = X_spec_m - y_spec_m - - if is_hp3 == True: - ins_root,vocal_root = vocal_root,ins_root - - if ins_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], y_spec_m, input_high_end, self.mp - ) - wav_instrument = spec_utils.cmb_spectrogram_to_wave( - y_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) - logger.info("%s instruments done" % name) - if is_hp3 == True: - head = "vocal_" - else: - head = "instrument_" - if format in ["wav", "flac"]: - sf.write( - os.path.join( - ins_root, - head + "{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) # - else: - path = os.path.join( - ins_root, head + "{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass - if vocal_root is not None: - if is_hp3 == True: - head = "instrument_" - else: - head = "vocal_" - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], v_spec_m, input_high_end, self.mp - ) - wav_vocals = spec_utils.cmb_spectrogram_to_wave( - v_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_vocals = 
spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) - logger.info("%s vocals done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( - vocal_root, - head + "{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - else: - path = os.path.join( - vocal_root, head + "{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass - - -class AudioPreDeEcho: - def __init__(self, agg, model_path, device, is_half, tta=False): - self.model_path = model_path - self.device = device - self.data = { - # Processing Options - "postprocess": False, - "tta": tta, - # Constants - "window_size": 512, - "agg": agg, - "high_end_process": "mirroring", - } - mp = ModelParameters("%s/lib/lib_v5/modelparams/4band_v3.json"%parent_directory) - nout = 64 if "DeReverb" in model_path else 48 - model = CascadedNet(mp.param["bins"] * 2, nout) - cpk = torch.load(model_path, map_location="cpu") - model.load_state_dict(cpk) - model.eval() - if is_half: - model = model.half().to(device) - else: - model = model.to(device) - - self.mp = mp - self.model = model - - def _path_audio_( - self, music_file, vocal_root=None, ins_root=None, format="flac", is_hp3=False - ): # 3个VR模型vocal和ins是反的 - if ins_root is None and vocal_root is None: - return "No save root." - name = os.path.basename(music_file) - if ins_root is not None: - os.makedirs(ins_root, exist_ok=True) - if vocal_root is not None: - os.makedirs(vocal_root, exist_ok=True) - X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} - bands_n = len(self.mp.param["band"]) - # print(bands_n) - for d in range(bands_n, 0, -1): - bp = self.mp.param["band"][d] - if d == bands_n: # high-end band - ( - X_wave[d], - _, - ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 - music_file, - sr = bp["sr"], - mono = False, - dtype = np.float32, - res_type = bp["res_type"], - ) - if X_wave[d].ndim == 1: - X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) - else: # lower bands - X_wave[d] = librosa.core.resample( - X_wave[d + 1], - orig_sr = self.mp.param["band"][d + 1]["sr"], - target_sr = bp["sr"], - res_type = bp["res_type"], - ) - # Stft of wave source - X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( - X_wave[d], - bp["hl"], - bp["n_fft"], - self.mp.param["mid_side"], - self.mp.param["mid_side_b2"], - self.mp.param["reverse"], - ) - # pdb.set_trace() - if d == bands_n and self.data["high_end_process"] != "none": - input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( - self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] - ) - input_high_end = X_spec_s[d][ - :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : - ] - - X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) - aggresive_set = float(self.data["agg"] / 100) - aggressiveness = { - "value": aggresive_set, - "split_bin": self.mp.param["band"][1]["crop_stop"], - } - with torch.no_grad(): - pred, X_mag, X_phase = inference( - X_spec_m, self.device, self.model, aggressiveness, self.data - ) - # Postprocess - if self.data["postprocess"]: - pred_inv = np.clip(X_mag - pred, 0, np.inf) - pred = spec_utils.mask_silence(pred, pred_inv) - y_spec_m = pred * X_phase - v_spec_m = X_spec_m - 
y_spec_m - - if ins_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], y_spec_m, input_high_end, self.mp - ) - wav_instrument = spec_utils.cmb_spectrogram_to_wave( - y_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) - logger.info("%s instruments done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( - ins_root, - "vocal_{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) # - else: - path = os.path.join( - ins_root, "vocal_{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass - if vocal_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], v_spec_m, input_high_end, self.mp - ) - wav_vocals = spec_utils.cmb_spectrogram_to_wave( - v_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) - logger.info("%s vocals done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( - vocal_root, - "instrument_{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - else: - path = os.path.join( - vocal_root, "instrument_{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass diff --git a/tools/uvr5/webui.py b/tools/uvr5/webui.py deleted file mode 100644 index 60dfdaa7979d3472454611b63619cc9096c9e630..0000000000000000000000000000000000000000 --- a/tools/uvr5/webui.py +++ /dev/null @@ -1,190 +0,0 @@ -import os -import traceback,gradio as gr -import logging -from tools.i18n.i18n import I18nAuto -from tools.my_utils import clean_path -i18n = I18nAuto() - -logger = logging.getLogger(__name__) -import librosa,ffmpeg -import soundfile as sf -import torch -import sys -from mdxnet import MDXNetDereverb -from vr import AudioPre, AudioPreDeEcho -from bsroformer import BsRoformer_Loader - -weight_uvr5_root = "tools/uvr5/uvr5_weights" -uvr5_names = [] -for name in os.listdir(weight_uvr5_root): - if name.endswith(".pth") or name.endswith(".ckpt") or "onnx" in name: - uvr5_names.append(name.replace(".pth", "").replace(".ckpt", "")) - -device=sys.argv[1] -is_half=eval(sys.argv[2]) -webui_port_uvr5=int(sys.argv[3]) -is_share=eval(sys.argv[4]) - -def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): - infos = [] - try: - inp_root = clean_path(inp_root) - save_root_vocal = clean_path(save_root_vocal) - save_root_ins = clean_path(save_root_ins) - is_hp3 = "HP3" in model_name - if model_name == "onnx_dereverb_By_FoxJoy": - pre_fun = MDXNetDereverb(15) - elif model_name == "Bs_Roformer" or "bs_roformer" in model_name.lower(): - 
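# Model dispatch: onnx_dereverb_By_FoxJoy is handled by MDXNetDereverb,
# BS-RoFormer checkpoints (.ckpt) by BsRoformer_Loader, and every other .pth
# weight falls through to the VR wrappers (AudioPreDeEcho for DeEcho/DeReverb
# models, AudioPre for the rest).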
func = BsRoformer_Loader - pre_fun = func( - model_path = os.path.join(weight_uvr5_root, model_name + ".ckpt"), - device = device, - is_half=is_half - ) - else: - func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho - pre_fun = func( - agg=int(agg), - model_path=os.path.join(weight_uvr5_root, model_name + ".pth"), - device=device, - is_half=is_half, - ) - if inp_root != "": - paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] - else: - paths = [path.name for path in paths] - for path in paths: - inp_path = os.path.join(inp_root, path) - if(os.path.isfile(inp_path)==False):continue - need_reformat = 1 - done = 0 - try: - info = ffmpeg.probe(inp_path, cmd="ffprobe") - if ( - info["streams"][0]["channels"] == 2 - and info["streams"][0]["sample_rate"] == "44100" - ): - need_reformat = 0 - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0,is_hp3 - ) - done = 1 - except: - need_reformat = 1 - traceback.print_exc() - if need_reformat == 1: - tmp_path = "%s/%s.reformatted.wav" % ( - os.path.join(os.environ["TEMP"]), - os.path.basename(inp_path), - ) - os.system( - f'ffmpeg -i "{inp_path}" -vn -acodec pcm_s16le -ac 2 -ar 44100 "{tmp_path}" -y' - ) - inp_path = tmp_path - try: - if done == 0: - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0,is_hp3 - ) - infos.append("%s->Success" % (os.path.basename(inp_path))) - yield "\n".join(infos) - except: - infos.append( - "%s->%s" % (os.path.basename(inp_path), traceback.format_exc()) - ) - yield "\n".join(infos) - except: - infos.append(traceback.format_exc()) - yield "\n".join(infos) - finally: - try: - if model_name == "onnx_dereverb_By_FoxJoy": - del pre_fun.pred.model - del pre_fun.pred.model_ - else: - del pre_fun.model - del pre_fun - except: - traceback.print_exc() - print("clean_empty_cache") - if torch.cuda.is_available(): - torch.cuda.empty_cache() - yield "\n".join(infos) - -with gr.Blocks(title="UVR5 WebUI") as app: - gr.Markdown( - value= - i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") - ) - with gr.Tabs(): - with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")): - with gr.Group(): - gr.Markdown( - value=i18n("人声伴奏分离批量处理, 使用UVR5模型。") + "
" + \ - i18n("合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。")+ "
" + \ - i18n("模型分为三类:") + "
" + \ - i18n("1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;") + "
" + \ - i18n("2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;") + "
" + \ - i18n("3、去混响、去延迟模型(by FoxJoy):") + "
  " + \ - i18n("(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;") + "
 " + \ - i18n("(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。") + "
" + \ - i18n("去混响/去延迟,附:") + "
" + \ - i18n("1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;") + "
" + \ - i18n("2、MDX-Net-Dereverb模型挺慢的;") + "
" + \ - i18n("3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。") - ) - with gr.Row(): - with gr.Column(): - dir_wav_input = gr.Textbox( - label=i18n("输入待处理音频文件夹路径"), - placeholder="C:\\Users\\Desktop\\todo-songs", - ) - wav_inputs = gr.File( - file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹") - ) - with gr.Column(): - model_choose = gr.Dropdown(label=i18n("模型"), choices=uvr5_names) - agg = gr.Slider( - minimum=0, - maximum=20, - step=1, - label=i18n("人声提取激进程度"), - value=10, - interactive=True, - visible=False, # 先不开放调整 - ) - opt_vocal_root = gr.Textbox( - label=i18n("指定输出主人声文件夹"), value="output/uvr5_opt" - ) - opt_ins_root = gr.Textbox( - label=i18n("指定输出非主人声文件夹"), value="output/uvr5_opt" - ) - format0 = gr.Radio( - label=i18n("导出文件格式"), - choices=["wav", "flac", "mp3", "m4a"], - value="flac", - interactive=True, - ) - but2 = gr.Button(i18n("转换"), variant="primary") - vc_output4 = gr.Textbox(label=i18n("输出信息")) - but2.click( - uvr, - [ - model_choose, - dir_wav_input, - opt_vocal_root, - wav_inputs, - opt_ins_root, - agg, - format0, - ], - [vc_output4], - api_name="uvr_convert", - ) -app.queue(concurrency_count=511, max_size=1022).launch( - server_name="0.0.0.0", - inbrowser=True, - share=is_share, - server_port=webui_port_uvr5, - quiet=True, -)