---
# Model configuration: dataset/mel front-end, GPT, VQ-VAE and BigVGAN
# settings plus checkpoint file names.
#
# NOTE(review): this file had been flattened into markdown-table rows
# ("| key: value | |") with all indentation removed. The nesting below was
# reconstructed from the empty-valued parent keys, from keys that would
# otherwise be duplicated in one mapping (e.g. sample_rate, n_fft), and from
# key-name grouping. Confirm section boundaries against the code that loads
# this file -- in particular where `mel`, `condition_module` and `bigvgan`
# end.

dataset:
  bpe_model: bpe.model
  sample_rate: 24000
  squeeze: false
  # `sample_rate` appears again here, so `mel` must be its own mapping.
  mel:
    sample_rate: 24000
    n_fft: 1024
    hop_length: 256
    win_length: 1024
    n_mels: 100
    mel_fmin: 0
    # NOTE(review): assumed to belong to `mel`; could also sit directly
    # under `dataset` -- verify against the loader.
    normalize: false

gpt:
  model_dim: 1280
  max_mel_tokens: 800
  max_text_tokens: 600
  heads: 20
  use_mel_codes_as_input: true
  mel_length_compression: 1024
  layers: 24
  number_text_tokens: 12000
  # start/stop ids (8192, 8193) are the two values directly above
  # vqvae.num_tokens (8192); number_mel_codes (8194) is stop_mel_token + 1.
  number_mel_codes: 8194
  start_mel_token: 8192
  stop_mel_token: 8193
  start_text_token: 0
  stop_text_token: 1
  train_solo_embeddings: false
  condition_type: "conformer_perceiver"
  condition_module:
    output_size: 512
    linear_units: 2048
    attention_heads: 8
    num_blocks: 6
    input_layer: "conv2d2"
    # NOTE(review): assumed to be the last key of `condition_module`.
    perceiver_mult: 2

vqvae:
  # Same value as dataset.mel.n_mels (100).
  channels: 100
  num_tokens: 8192
  hidden_dim: 512
  num_resnet_blocks: 3
  codebook_dim: 512
  num_layers: 2
  positional_dims: 1
  kernel_size: 3
  smooth_l1_loss: true
  use_transposed_convs: false

bigvgan:
  adam_b1: 0.8
  adam_b2: 0.99
  lr_decay: 0.999998
  seed: 1234

  # Quoted on purpose: the value is the string "1", not the integer 1.
  resblock: "1"
  # Product of the rates is 1024, the same number as
  # gpt.mel_length_compression -- presumably related; TODO confirm.
  upsample_rates: [4, 4, 4, 4, 2, 2]
  upsample_kernel_sizes: [8, 8, 4, 4, 4, 4]
  upsample_initial_channel: 1536
  resblock_kernel_sizes: [3, 7, 11]
  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  feat_upsample: false
  speaker_embedding_dim: 512
  cond_d_vector_in_each_upsampling_layer: true

  # Same value as gpt.model_dim (1280).
  gpt_dim: 1280

  activation: "snakebeta"
  snake_logscale: true

  use_cqtd_instead_of_mrd: true
  cqtd_filters: 128
  cqtd_max_filters: 1024
  cqtd_filters_scale: 1
  cqtd_dilations: [1, 2, 4]
  cqtd_hop_lengths: [512, 256, 256]
  cqtd_n_octaves: [9, 9, 9]
  cqtd_bins_per_octaves: [24, 36, 48]

  resolutions: [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]]
  mpd_reshapes: [2, 3, 5, 7, 11]
  use_spectral_norm: false
  discriminator_channel_mult: 1

  use_multiscale_melloss: true
  lambda_melloss: 15

  clip_grad_norm: 1000

  segment_size: 16384
  # The values below repeat dataset.mel (100 mels, n_fft 1024, hop 256,
  # window 1024, 24000 Hz, fmin 0) under different key names.
  num_mels: 100
  num_freq: 1025
  n_fft: 1024
  hop_size: 256
  win_size: 1024

  sampling_rate: 24000

  fmin: 0
  fmax: null
  fmax_for_loss: null
  mel_type: "pytorch"

  num_workers: 2
  # NOTE(review): assumed to be nested under `bigvgan`; verify.
  dist_config:
    dist_backend: "nccl"
    dist_url: "tcp://localhost:54321"
    world_size: 1

# NOTE(review): assumed top-level because they name files for three
# different sections (vqvae/dvae, gpt, bigvgan) -- confirm with the loader.
dvae_checkpoint: dvae.pth
gpt_checkpoint: gpt.pth
bigvgan_checkpoint: bigvgan_generator.pth
# Left unquoted, so it loads as the float 1.5 exactly as written. Quote it
# only if the consumer expects a string.
version: 1.5