{
  "seed": 1,
  "precision": "float16",
  "num_workers": 8,
  "resume": false,
  "tb_write_every_n_steps": 100,
  "print_every_n_steps": 400,
  "val_every_n_steps": 1600,
  "lr": 1e-05,
  "batch_size": 100,
  "weight_decay": 0.0,
  "warmup_fraction": 0.1,
  "num_epochs": 10,
  "num_steps": 500000,
  "gradient_accumulation_steps": 24,
  "gradient_clip_val": 1.0,
  "early_stop_step": 3200,
  "early_stop_threshold": -1.0,
  "exp_dir": "/data/scratch/pyp/exp_pyp/VoiceCraft/gigaspeech/tts_enhanced_330M",
  "dataset": "gigaspeech",
  "dataset_dir": "/data/scratch/pyp/datasets/gigaspeech_phn_enc_manifest/xl",
  "pseudo_epoch_size": 3000,
  "phn_folder_name": "phonemes",
  "encodec_folder_name": "encodec_16khz_4codebooks",
  "manifest_name": "manifest_large16khz_lessambi",
  "pad_x": 0,
  "max_num_tokens": 20000,
  "val_max_num_tokens": 6000,
  "num_buckets": 10,
  "dynamic_batching": 1,
  "audio_max_length": 16.0,
  "audio_min_length": 1.0,
  "text_max_length": 400,
  "text_min_length": 10.0,
  "encodec_sr": 50,
  "mask_len_min": 1,
  "mask_len_max": 600,
  "drop_long": 1,
  "eos": 2051,
  "reduced_eog": 1,
  "special_first": 0,
  "n_special": 4,
  "codebook_weight": "[2,1,1,1]",
  "empty_token": 2048,
  "optimizer_name": "AdamW",
  "reduce_lr_start_step": 3000,
  "reduce_lr_start_epoch": 4,
  "clipping_update_period": 1000,
  "max_mask_portion": 0.9,
  "max_n_spans": 3,
  "shuffle_mask_embedding": 0,
  "mask_sample_dist": "poisson1",
  "min_gap": 5,
  "n_codebooks": 4,
  "text_vocab_size": 120,
  "text_pad_token": 120,
  "audio_vocab_size": 2048,
  "eog": 2049,
  "audio_pad_token": 2050,
  "d_model": 1024,
  "audio_embedding_dim": 1024,
  "text_embedding_dropout": 0.0,
  "audio_embedding_dropout": 0.0,
  "text_positional_embedding_dropout": 0.0,
  "audio_positional_embedding_dropout": 0.0,
  "trm_dropout": 0.0,
  "nhead": 16,
  "num_decoder_layers": 24,
  "load_model_from": "./pretrained_models/giga330M.pth"
}