mrfakename committed on
Commit
326e057
·
verified ·
1 Parent(s): b4c181c

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -1,35 +1,53 @@
1
  *.7z filter=lfs diff=lfs merge=lfs -text
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
 
4
  *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
  *.ftz filter=lfs diff=lfs merge=lfs -text
7
  *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
  *.model filter=lfs diff=lfs merge=lfs -text
13
  *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
  *.onnx filter=lfs diff=lfs merge=lfs -text
17
  *.ot filter=lfs diff=lfs merge=lfs -text
18
  *.parquet filter=lfs diff=lfs merge=lfs -text
19
  *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
  *.pt filter=lfs diff=lfs merge=lfs -text
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
  *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
1
  *.7z filter=lfs diff=lfs merge=lfs -text
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
  *.bz2 filter=lfs diff=lfs merge=lfs -text
 
6
  *.ftz filter=lfs diff=lfs merge=lfs -text
7
  *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
 
11
  *.model filter=lfs diff=lfs merge=lfs -text
12
  *.msgpack filter=lfs diff=lfs merge=lfs -text
 
 
13
  *.onnx filter=lfs diff=lfs merge=lfs -text
14
  *.ot filter=lfs diff=lfs merge=lfs -text
15
  *.parquet filter=lfs diff=lfs merge=lfs -text
16
  *.pb filter=lfs diff=lfs merge=lfs -text
 
 
17
  *.pt filter=lfs diff=lfs merge=lfs -text
18
  *.pth filter=lfs diff=lfs merge=lfs -text
19
  *.rar filter=lfs diff=lfs merge=lfs -text
 
20
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
22
  *.tflite filter=lfs diff=lfs merge=lfs -text
23
  *.tgz filter=lfs diff=lfs merge=lfs -text
 
24
  *.xz filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *.tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ *.db* filter=lfs diff=lfs merge=lfs -text
29
+ *.ark* filter=lfs diff=lfs merge=lfs -text
30
+ **/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
31
+ **/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
32
+ **/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
33
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
34
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
35
+ *.gguf* filter=lfs diff=lfs merge=lfs -text
36
+ *.ggml filter=lfs diff=lfs merge=lfs -text
37
+ *.llamafile* filter=lfs diff=lfs merge=lfs -text
38
+ *.pt2 filter=lfs diff=lfs merge=lfs -text
39
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
40
+ *.npy filter=lfs diff=lfs merge=lfs -text
41
+ *.npz filter=lfs diff=lfs merge=lfs -text
42
+ *.pickle filter=lfs diff=lfs merge=lfs -text
43
+ *.pkl filter=lfs diff=lfs merge=lfs -text
44
+ *.tar filter=lfs diff=lfs merge=lfs -text
45
+ *.wasm filter=lfs diff=lfs merge=lfs -text
46
  *.zst filter=lfs diff=lfs merge=lfs -text
47
  *tfevents* filter=lfs diff=lfs merge=lfs -text
48
+
49
+ g2p/tokenizer.json filter=lfs diff=lfs merge=lfs -text
50
+
51
+ g2p/tokenizer.json filter=lfs diff=lfs merge=lfs -text
52
+
53
+ g2p/tokenizer.json filter=lfs diff=lfs merge=lfs -text
.msc ADDED
Binary file (1.69 kB). View file
 
.mv ADDED
@@ -0,0 +1 @@
 
 
1
+ Revision:master,CreatedAt:1743818252
README.md CHANGED
@@ -1,6 +1,161 @@
1
  ---
 
 
 
2
  license: apache-2.0
 
3
  ---
4
- # MegaTTS 3 but with support for voice cloning
5
 
6
- All credits to: https://modelscope.cn/models/ACoderPassBy/MegaTTS-SFT
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ language:
3
+ - en
4
+ - zh
5
  license: apache-2.0
6
+ pipeline_tag: text-to-speech
7
  ---
 
8
 
9
+ 非官方实现版本,包括全量参数。
10
+
11
+
12
+ # Model Description
13
+ This is a ModelScope model card for MegaTTS 3 👋
14
+
15
+ - Paper: [MegaTTS 3: Sparse Alignment Enhanced Latent Diffusion Transformer for Zero-Shot Speech Synthesis](https://huggingface.co/papers/2502.18924)
16
+ - Project Page (Audio Samples): <https://sditdemo.github.io/sditdemo/>
17
+ - github: <https://github.com/bytedance/MegaTTS3>
18
+ - [Demo Video](https://github.com/user-attachments/assets/0174c111-f392-4376-a34b-0b5b8164aacc)
19
+
20
+
21
+ ## Installation
22
+
23
+ ```sh
24
+ # Clone the repository
25
+ git clone https://github.com/bytedance/MegaTTS3
26
+ cd MegaTTS3
27
+ ```
28
+
29
+ **Model Download**
30
+
31
+ ```sh
32
+ modelscope download --model ACoderPassBy/MegaTTS-SFT --local_dir ./checkpoints
33
+ ```
34
+
35
+ **Requirements (for Linux)**
36
+
37
+ ```sh
38
+ # Create a python 3.10 conda env (you could also use virtualenv)
39
+ conda create -n megatts3-env python=3.10
40
+ conda activate megatts3-env
41
+ pip install -r requirements.txt
42
+
43
+ # Set the root directory
44
+ export PYTHONPATH="/path/to/MegaTTS3:$PYTHONPATH"
45
+
46
+ # [Optional] Set GPU
47
+ export CUDA_VISIBLE_DEVICES=0
48
+
49
+ # If you encounter bugs with pydantic during inference, check that the installed versions of pydantic and gradio are compatible.
50
+ # [Note] If you encounter bugs related to httpx, check whether your environment variable "no_proxy" contains patterns like "::"
51
+ ```
52
+
53
+ **Requirements (for Windows)**
54
+
55
+ ```sh
56
+ # [The Windows version is currently under testing]
57
+ # Comment out the following dependency in requirements.txt:
58
+ # # WeTextProcessing==1.0.4.1
59
+
60
+ # Create a python 3.10 conda env (you could also use virtualenv)
61
+ conda create -n megatts3-env python=3.10
62
+ conda activate megatts3-env
63
+ pip install -r requirements.txt
64
+ conda install -y -c conda-forge pynini==2.1.5
65
+ pip install WeTextProcessing==1.0.3
66
+
67
+ # [Optional] If you want GPU inference, you may need to install a specific version of PyTorch for your GPU from https://pytorch.org/.
68
+ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
69
+
70
+ # [Note] If you encounter bugs related to `ffprobe` or `ffmpeg`, you can install them via `conda install -c conda-forge ffmpeg`
71
+
72
+ # Set environment variable for root directory
73
+ set PYTHONPATH="C:\path\to\MegaTTS3;%PYTHONPATH%" # Windows
74
+ $env:PYTHONPATH="C:\path\to\MegaTTS3;$env:PYTHONPATH" # Powershell on Windows
75
+ conda env config vars set PYTHONPATH="C:\path\to\MegaTTS3;%PYTHONPATH%" # For conda users
76
+
77
+ # [Optional] Set GPU
78
+ set CUDA_VISIBLE_DEVICES=0 # Windows
79
+ $env:CUDA_VISIBLE_DEVICES=0 # Powershell on Windows
80
+ ```
81
+
82
+ **Requirements (for Docker)**
83
+
84
+ ```sh
85
+ # [The Docker version is currently under testing]
86
+ # ! You should download the pretrained checkpoint before running the following command
87
+ docker build . -t megatts3:latest
88
+
89
+ # For GPU inference
90
+ docker run -it -p 7929:7929 --gpus all -e CUDA_VISIBLE_DEVICES=0 megatts3:latest
91
+ # For CPU inference
92
+ docker run -it -p 7929:7929 megatts3:latest
93
+
94
+ # Visit http://localhost:7929/ for the Gradio web UI.
95
+ ```
96
+
97
+ > \[!TIP]
98
+ > \[IMPORTANT]
99
+ > 非官方版本
100
+
101
+ ## Inference
102
+
103
+ **Command-Line Usage (Standard)**
104
+
105
+ ```bash
106
+ # p_w (intelligibility weight), t_w (similarity weight). Typically, a noisier prompt requires higher p_w and t_w.
107
+ python tts/infer_cli.py --input_wav 'assets/Chinese_prompt.wav' --input_text "另一边的桌上,一位读书人嗤之以鼻道,'佛子三藏,神子燕小鱼是什么样的人物,李家的那个李子夜如何与他们相提并论?'" --output_dir ./gen
108
+
109
+ # As long as audio volume and pronunciation are appropriate, increasing --t_w within reasonable ranges (2.0~5.0)
110
+ # will increase the generated speech's expressiveness and similarity (especially for some emotional cases).
111
+ python tts/infer_cli.py --input_wav 'assets/English_prompt.wav' --input_text 'As his long promised tariff threat turned into reality this week, top human advisers began fielding a wave of calls from business leaders, particularly in the automotive sector, along with lawmakers who were sounding the alarm.' --output_dir ./gen --p_w 2.0 --t_w 3.0
112
+ ```
113
+
114
+ **Command-Line Usage (for TTS with Accents)**
115
+
116
+ ```bash
117
+ # When p_w (intelligibility weight) ≈ 1.0, the generated audio closely retains the speaker’s original accent. As p_w increases, it shifts toward standard pronunciation.
118
+ # t_w (similarity weight) is typically set 0–3 points higher than p_w for optimal results.
119
+ # Useful for accented TTS or solving the accent problems in cross-lingual TTS.
120
+ python tts/infer_cli.py --input_wav 'assets/English_prompt.wav' --input_text '这是一条有口音的音频。' --output_dir ./gen --p_w 1.0 --t_w 3.0
121
+
122
+ python tts/infer_cli.py --input_wav 'assets/English_prompt.wav' --input_text '这条音频的发音标准一些了吗?' --output_dir ./gen --p_w 2.5 --t_w 2.5
123
+ ```
124
+
125
+ **Web UI Usage**
126
+
127
+ ```bash
128
+ # CPU inference is also supported, but it may take about 30 seconds (for 10 inference steps).
129
+ python tts/gradio_api.py
130
+ ```
131
+
132
+ ## Security
133
+
134
+ If you discover a potential security issue in this project, or think you may
135
+ have discovered a security issue, we ask that you notify Bytedance Security via our [security center](https://security.bytedance.com/src) or [sec@bytedance.com](mailto:sec@bytedance.com).
136
+
137
+ Please do **not** create a public issue.
138
+
139
+ ## License
140
+
141
+ This project is licensed under the [Apache-2.0 License](LICENSE).
142
+
143
+ ## BibTeX Entry and Citation Info
144
+
145
+ This repo contains the forced-align version of `Sparse Alignment Enhanced Latent Diffusion Transformer for Zero-Shot Speech Synthesis`, and the WavVAE is mainly based on `Wavtokenizer: an efficient acoustic discrete codec tokenizer for audio language modeling`. Compared to the model described in the paper, the repository includes additional models. These models not only enhance the stability and cloning capabilities of the algorithm but can also be used independently to serve a wider range of scenarios.
146
+
147
+ ```
148
+ @article{jiang2025sparse,
149
+ title={Sparse Alignment Enhanced Latent Diffusion Transformer for Zero-Shot Speech Synthesis},
150
+ author={Jiang, Ziyue and Ren, Yi and Li, Ruiqi and Ji, Shengpeng and Ye, Zhenhui and Zhang, Chen and Jionghao, Bai and Yang, Xiaoda and Zuo, Jialong and Zhang, Yu and others},
151
+ journal={arXiv preprint arXiv:2502.18924},
152
+ year={2025}
153
+ }
154
+
155
+ @article{ji2024wavtokenizer,
156
+ title={Wavtokenizer: an efficient acoustic discrete codec tokenizer for audio language modeling},
157
+ author={Ji, Shengpeng and Jiang, Ziyue and Wang, Wen and Chen, Yifu and Fang, Minghui and Zuo, Jialong and Yang, Qian and Cheng, Xize and Wang, Zehan and Li, Ruiqi and others},
158
+ journal={arXiv preprint arXiv:2408.16532},
159
+ year={2024}
160
+ }
161
+ ```
aligner_lm/config.yaml ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ acous_params:
2
+ - - 480
3
+ - 1200
4
+ - 80
5
+ - - 240
6
+ - 1200
7
+ - 160
8
+ amp: true
9
+ audio_num_mel_bins: 160
10
+ audio_sample_rate: 24000
11
+ base_config:
12
+ - ./base_config.yaml
13
+ c_spk_enc: 512
14
+ char_dict_size: 15000
15
+ conv_use_pos: false
16
+ dec0_dilations:
17
+ - 1
18
+ - 2
19
+ - 4
20
+ - 1
21
+ - 2
22
+ - 4
23
+ - 1
24
+ dec0_kernel_size: 3
25
+ dec_dilations:
26
+ - 1
27
+ - 2
28
+ - 1
29
+ - 2
30
+ - 1
31
+ dec_ffn_kernel_size: 9
32
+ dec_kernel_size: 5
33
+ dec_layers: 4
34
+ dec_post_net_kernel: 3
35
+ decoder_rnn_dim: 0
36
+ decoder_type: conv
37
+ dropout: 0.0
38
+ dur_alpha: 1.0
39
+ dur_context_enc: true
40
+ dur_log: true
41
+ dur_predictor_kernel: 3
42
+ dur_predictor_layers: 2
43
+ dur_use_char: true
44
+ dur_use_spk: true
45
+ enc_dec_norm: ln
46
+ enc_dilations:
47
+ - 1
48
+ - 1
49
+ - 1
50
+ - 1
51
+ enc_ffn_kernel_size: 5
52
+ enc_kernel_size: 5
53
+ enc_layers: 8
54
+ enc_post_net_kernel: 3
55
+ enc_pre_ln: true
56
+ enc_prenet: true
57
+ encoder_K: 8
58
+ encoder_type: rel_fft
59
+ endless_ds: true
60
+ eval_max_batches: 0
61
+ f0_max: 600
62
+ f0_min: 60
63
+ ffn_act: gelu
64
+ ffn_hidden_size: 1024
65
+ fft_size: 1200
66
+ fg_spk_enc_hidden: 256
67
+ fmax: 12000
68
+ fmin: 0
69
+ frames_multiple: 8
70
+ hidden_size: 512
71
+ hop_size: 240
72
+ keep_c0_init: true
73
+ lat_for_dur: false
74
+ latent_dim: 16
75
+ latent_size: 256
76
+ layers_in_block: 2
77
+ ling_label_dict_size:
78
+ - 20
79
+ - 4
80
+ - 5
81
+ - 2
82
+ - 3
83
+ - 3
84
+ - 3
85
+ - 6
86
+ - 15
87
+ ling_labels:
88
+ - tone
89
+ loud_norm: false
90
+ mel_vmax: 0.5
91
+ mel_vmin: -6
92
+ min_frames: 50
93
+ mix_melout_timbre: true
94
+ mix_ph_timbre: false
95
+ mixed_precision: bf16
96
+ model_type: 1
97
+ multistage: false
98
+ no_text_enc: false
99
+ num_ckpt_keep: 5
100
+ num_heads: 2
101
+ num_spk: 50000
102
+ out_wav_norm: true
103
+ pitch_extractor: reaper
104
+ pitch_key: pitch
105
+ pitch_type: frame
106
+ precision: bf16
107
+ ref_mel_bins: 160
108
+ seed: 1234
109
+ split_ref: true
110
+ use_bert_input: false
111
+ use_cfg: true
112
+ use_char: true
113
+ use_cur_global: false
114
+ use_cur_global_dec: true
115
+ use_dur_embed: true
116
+ use_dur_mask_embed: true
117
+ use_ema: false
118
+ use_expand_ph: true
119
+ use_finegrained_spk: false
120
+ use_gt_dur: false
121
+ use_gt_f0: false
122
+ use_mix_spk_embed: false
123
+ use_new_vae: false
124
+ use_ph_level_f0: false
125
+ use_ph_pos_embed: true
126
+ use_pitch_embed: false
127
+ use_pitch_embed_dec: false
128
+ use_pitch_pred: true
129
+ use_pos_embed: true
130
+ use_qk_norm: true
131
+ use_random_spk_embed: false
132
+ use_seq_cfg: true
133
+ use_spk_embed: false
134
+ use_spk_enc: true
135
+ use_spk_id: false
136
+ use_uv: true
137
+ use_vae: true
138
+ use_vpcfm: true
139
+ use_vqvae: true
140
+ use_word_encoder: true
141
+ use_word_input: false
142
+ vae_dur_grad: 0.1
143
+ vae_enc_hidden_size: 384
144
+ vae_stride: 4
145
+ vae_word_conder_layers: 0
146
+ vq_stride: 8
147
+ vqvae_start_steps: 0
148
+ win_size: 1200
149
+ word_dict_size: 10000
150
+ z_channels: 64
151
+ z_clamp: 2.0
aligner_lm/model_only_last.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a00f18ec36f8c1328ddab7a405c8e388790a1c14fdbdd07c546fcacaf5d19296
3
+ size 218434266
config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "name": "this is a file used for counting downloads for models."
3
+ }
configuration.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"framework": "pytorch", "task": "text-to-speech", "allow_remote": true}
diffusion_transformer/config.yaml ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ acous_params:
2
+ - - 480
3
+ - 1200
4
+ - 80
5
+ - - 240
6
+ - 1200
7
+ - 160
8
+ amp: true
9
+ audio_num_mel_bins: 160
10
+ audio_sample_rate: 24000
11
+ base_config:
12
+ - ./base_config.yaml
13
+ c_spk_enc: 512
14
+ char_dict_size: 15000
15
+ conv_use_pos: false
16
+ dec0_dilations:
17
+ - 1
18
+ - 2
19
+ - 4
20
+ - 1
21
+ - 2
22
+ - 4
23
+ - 1
24
+ dec0_kernel_size: 3
25
+ dec_dilations:
26
+ - 1
27
+ - 2
28
+ - 1
29
+ - 2
30
+ - 1
31
+ dec_ffn_kernel_size: 9
32
+ dec_inp_add_noise: false
33
+ dec_kernel_size: 5
34
+ dec_layers: 4
35
+ dec_post_net_kernel: 3
36
+ decoder_rnn_dim: 0
37
+ decoder_type: conv
38
+ dropout: 0.0
39
+ ds_add_pitch_embed: false
40
+ dur_alpha: 1.0
41
+ dur_context_enc: true
42
+ dur_log: true
43
+ dur_predictor_kernel: 3
44
+ dur_predictor_layers: 2
45
+ dur_use_char: true
46
+ dur_use_spk: true
47
+ enc_dec_norm: ln
48
+ enc_dilations:
49
+ - 1
50
+ - 1
51
+ - 1
52
+ - 1
53
+ enc_ffn_kernel_size: 5
54
+ enc_kernel_size: 5
55
+ enc_layers: 8
56
+ enc_post_net_kernel: 3
57
+ enc_pre_ln: true
58
+ enc_prenet: true
59
+ encoder_K: 8
60
+ encoder_type: rel_fft
61
+ f0_max: 600
62
+ f0_min: 60
63
+ ffn_act: gelu
64
+ ffn_hidden_size: 1024
65
+ fft_size: 1200
66
+ fg_spk_enc_hidden: 256
67
+ fmax: 12000
68
+ fmin: 0
69
+ frames_multiple: 8
70
+ gen_dir_name: ''
71
+ hidden_size: 512
72
+ hop_size: 240
73
+ ignore_begin_end_sil: false
74
+ keep_c0_init: true
75
+ kl_min: 0
76
+ kl_start_steps: 1
77
+ latent_dim: 32
78
+ latent_size: 256
79
+ layers_in_block: 2
80
+ ling_label_dict_size:
81
+ - 20
82
+ - 4
83
+ - 5
84
+ - 2
85
+ - 3
86
+ - 3
87
+ - 3
88
+ - 6
89
+ - 15
90
+ ling_labels:
91
+ - tone
92
+ loud_norm: false
93
+ max_input_tokens: 1550
94
+ mel_vmax: 0.5
95
+ mel_vmin: -6
96
+ min_frames: 50
97
+ mix_melout_timbre: true
98
+ mix_ph_timbre: false
99
+ mixed_precision: bf16
100
+ no_text_enc: false
101
+ num_heads: 2
102
+ out_wav_norm: true
103
+ pad_frames: false
104
+
105
+ precision: bf16
106
+ seed: 1234
107
+ use_bert_input: false
108
+ use_cfg: true
109
+ use_char: true
110
+ use_cur_global: false
111
+ use_cur_global_dec: true
112
+ use_dit_1b: false
113
+ use_dur_embed: true
114
+ use_dur_mask_embed: true
115
+ use_ema: false
116
+ use_expand_ph: true
117
+ use_finegrained_spk: false
118
+ use_global_lat: false
119
+ use_gt_dur: false
120
+ use_gt_f0: false
121
+ use_mix_spk_embed: false
122
+ use_new_vae: false
123
+ use_ph_level_f0: false
124
+ use_ph_pos_embed: true
125
+ use_pitch_embed: false
126
+ use_pitch_embed_dec: false
127
+ use_pitch_pred: true
128
+ use_pos_embed: true
129
+ use_qk_norm: true
130
+ use_random_spk_embed: false
131
+ use_seq_cfg: true
132
+ use_spk_embed: false
133
+ use_spk_enc: true
134
+ use_spk_id: false
135
+ use_uv: true
136
+ use_vae: true
137
+ use_vpcfm: true
138
+ use_vqvae: true
139
+ use_word_encoder: true
140
+ use_word_input: false
141
+ vae_dur_grad: 0.1
142
+ vae_enc_hidden_size: 384
143
+ vae_stride: 4
144
+ vae_word_conder_layers: 0
145
+ vq_stride: 8
146
+ vqvae_start_steps: 0
147
+ win_size: 1200
148
+ word_dict_size: 10000
diffusion_transformer/model_only_last.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12233b95be177504551034390cf71aa748f0c66cbe2fd0ce433b9f9686122da9
3
+ size 1836341777
duration_lm/config.yaml ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ acous_params:
2
+ - - 480
3
+ - 1200
4
+ - 80
5
+ - - 240
6
+ - 1200
7
+ - 160
8
+ amp: false
9
+ audio_num_mel_bins: 160
10
+ audio_sample_rate: 24000
11
+ balance_sil: true
12
+ c_spk_enc: 512
13
+ char_dict_size: 15000
14
+ conv_use_pos: false
15
+ dec0_dilations:
16
+ - 1
17
+ - 2
18
+ - 4
19
+ - 1
20
+ - 2
21
+ - 4
22
+ - 1
23
+ dec0_kernel_size: 3
24
+ dec_dilations:
25
+ - 1
26
+ - 2
27
+ - 1
28
+ - 2
29
+ - 1
30
+ dec_ffn_kernel_size: 9
31
+ dec_hidden_size: 2048
32
+ dec_inp_add_noise: false
33
+ dec_kernel_size: 5
34
+ dec_layers: 4
35
+ dec_num_heads: 8
36
+ dec_post_net_kernel: 3
37
+ decoder_rnn_dim: 0
38
+ decoder_type: conv
39
+ dropout: 0.0
40
+ ds_add_pitch_embed: false
41
+ dur_alpha: 1.0
42
+ dur_code_size: 128
43
+ dur_context_enc: true
44
+ dur_log: true
45
+ dur_model_hidden_size: 512
46
+ dur_model_layers: 8
47
+ dur_model_type: ar_mse
48
+ dur_predictor_kernel: 3
49
+ dur_predictor_layers: 2
50
+ dur_txt_hs: 512
51
+ dur_use_char: true
52
+ dur_use_spk: true
53
+ enc_dec_norm: ln
54
+ enc_dilations:
55
+ - 1
56
+ - 1
57
+ - 1
58
+ - 1
59
+ enc_ffn_kernel_size: 3
60
+ enc_hidden_size: 256
61
+ enc_kernel_size: 5
62
+ enc_layers: 4
63
+ enc_post_net_kernel: 3
64
+ enc_pre_ln: true
65
+ enc_prenet: true
66
+ encoder_K: 8
67
+ encoder_type: rel_fft
68
+ f0_max: 600
69
+ f0_min: 60
70
+ ffn_act: gelu
71
+ ffn_hidden_size: 1024
72
+ fft_size: 1200
73
+ fg_spk_enc_hidden: 256
74
+ flatten_dec: true
75
+ fmax: 12000
76
+ fmin: 0
77
+ frames_multiple: 8
78
+ hidden_size: 512
79
+ hop_size: 240
80
+ ignore_begin_end_sil: false
81
+ lat_for_dur: false
82
+ latent_size: 256
83
+ layers_in_block: 2
84
+ ling_label_dict_size:
85
+ - 20
86
+ - 4
87
+ - 5
88
+ - 2
89
+ - 3
90
+ - 3
91
+ - 3
92
+ - 6
93
+ - 15
94
+ ling_labels:
95
+ - tone
96
+ lm_num_layers: 24
97
+ lm_use_enc: true
98
+ loud_norm: false
99
+ max_tokens: 6000
100
+ mel_vmax: 0.5
101
+ mel_vmin: -6
102
+ min_frames: 0
103
+ mix_melout_timbre: true
104
+ mix_ph_timbre: false
105
+ model_type: 1
106
+ multistage: false
107
+ no_text_enc: false
108
+ num_heads: 2
109
+ out_wav_norm: true
110
+ pad_frames: false
111
+
112
+ precision: fp16
113
+ predict_pitch: false
114
+ predictor_dropout: 0.0
115
+ predictor_grad: 1.0
116
+ predictor_hidden: -1
117
+ predictor_kernel: 5
118
+ predictor_layers: 5
119
+ print_nan_grads: true
120
+ ref_mel_bins: 160
121
+ ref_size_max: 2000
122
+ ref_size_min: 1000
123
+ remove_sil: false
124
+ shuffle_ref: false
125
+ split_ref: true
126
+ temperature: 0.8
127
+ tone_percep_ckpt: ''
128
+ train_spk_embed_only: false
129
+ use_bert_input: false
130
+ use_char: true
131
+ use_cur_global: false
132
+ use_cur_global_dec: true
133
+ use_dur_embed: true
134
+ use_dur_mask_embed: true
135
+ use_finegrained_spk: false
136
+ use_global_lat: false
137
+ use_gpt: true
138
+ use_gt_dur: false
139
+ use_gt_f0: false
140
+ use_mix_spk_embed: false
141
+ use_new_vae: false
142
+ use_ph_level_f0: false
143
+ use_ph_pos_embed: true
144
+ use_pitch_embed: false
145
+ use_pitch_embed_dec: false
146
+ use_pitch_pred: true
147
+ use_pos_embed: false
148
+ use_post_ln: false
149
+ use_random_spk_embed: false
150
+ use_rot_embed: true
151
+ use_spk_embed: false
152
+ use_spk_enc: false
153
+ use_spk_id: false
154
+ use_text_postnet: true
155
+ use_uv: true
156
+ use_vae: true
157
+ use_vqvae: true
158
+ use_word_encoder: true
159
+ use_word_input: false
160
+ vae_dur_grad: 0.1
161
+ vae_enc_hidden_size: 384
162
+ vae_word_conder_layers: 0
163
+ vq_stride: 8
164
+ w_nonsil: 10.0
165
+ w_sil: 1.0
166
+ word_dict_size: 10000
167
+ z_channels: 64
168
+ z_clamp: 2.0
duration_lm/model_only_last.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f21f4205c5d3ec4bef69716a85ca3d37f25c35b429bac500477a2085039b43f
3
+ size 267955084
g2p/added_tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
g2p/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./checkpoints/Qwen2-0.5B",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151643,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 896,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 4864,
13
+ "max_position_embeddings": 32768,
14
+ "max_window_layers": 24,
15
+ "model_type": "qwen2",
16
+ "num_attention_heads": 14,
17
+ "num_hidden_layers": 24,
18
+ "num_key_value_heads": 2,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_scaling": null,
21
+ "rope_theta": 1000000.0,
22
+ "sliding_window": null,
23
+ "tie_word_embeddings": true,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.48.3",
26
+ "use_cache": false,
27
+ "use_mrope": false,
28
+ "use_sliding_window": false,
29
+ "vocab_size": 168896
30
+ }
g2p/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.48.3"
6
+ }
g2p/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step95500
g2p/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
g2p/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f9d70d454ee35d023a9a54552716a8ccf2411c967abc6a857160527046f62a2
3
+ size 1018490136
g2p/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
g2p/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18bf578a236efa19e36ee7be04c327ba4abc23aed0213a31d3199a55ea7d2411
3
+ size 14796960
g2p/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
g2p/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
g2p/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
wavvae/config.yaml ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ acous_params:
2
+ - - 480
3
+ - 1200
4
+ - 80
5
+ - - 240
6
+ - 1200
7
+ - 160
8
+ amp: false
9
+ audio_num_mel_bins: 160
10
+ audio_sample_rate: 24000
11
+ c_spk_enc: 512
12
+ char_dict_size: 15000
13
+ conv_use_pos: false
14
+ dec0_dilations:
15
+ - 1
16
+ - 2
17
+ - 4
18
+ - 1
19
+ - 2
20
+ - 4
21
+ - 1
22
+ dec0_kernel_size: 3
23
+ dec_dilations:
24
+ - 1
25
+ - 2
26
+ - 1
27
+ - 2
28
+ - 1
29
+ dec_ffn_kernel_size: 9
30
+ dec_inp_add_noise: false
31
+ dec_kernel_size: 5
32
+ dec_layers: 4
33
+ dec_post_net_kernel: 3
34
+ decoder_rnn_dim: 0
35
+ decoder_type: conv
36
+ dropout: 0.0
37
+ ds_add_pitch_embed: false
38
+ dur_alpha: 1.0
39
+ dur_context_enc: true
40
+ dur_log: true
41
+ dur_predictor_kernel: 3
42
+ dur_predictor_layers: 2
43
+ dur_use_char: true
44
+ dur_use_spk: true
45
+ enc_dec_norm: ln
46
+ enc_dilations:
47
+ - 1
48
+ - 1
49
+ - 1
50
+ - 1
51
+ enc_ffn_kernel_size: 5
52
+ enc_kernel_size: 5
53
+ enc_layers: 8
54
+ enc_post_net_kernel: 3
55
+ enc_pre_ln: true
56
+ enc_prenet: true
57
+ encoder_K: 8
58
+ encoder_type: rel_fft
59
+ f0_max: 600
60
+ f0_min: 60
61
+ ffn_act: gelu
62
+ ffn_hidden_size: 1024
63
+ fft_size: 1200
64
+ fg_spk_enc_hidden: 256
65
+ fmax: 12000
66
+ fmin: 0
67
+ frames_multiple: 8
68
+ hidden_size: 512
69
+ hop_size: 240
70
+ ignore_begin_end_sil: false
71
+ keep_c0_init: true
72
+ kl_min: 0
73
+ kl_start_steps: 1
74
+ lat_for_dur: false
75
+ latent_dim: 16
76
+ latent_size: 256
77
+ layers_in_block: 2
78
+ ling_label_dict_size:
79
+ - 20
80
+ - 4
81
+ - 5
82
+ - 2
83
+ - 3
84
+ - 3
85
+ - 3
86
+ - 6
87
+ - 15
88
+ ling_labels:
89
+ - tone
90
+ load_ckpt: ''
91
+ loud_norm: false
92
+ mel_vmax: 0.5
93
+ mel_vmin: -6
94
+ min_frames: 50
95
+ mixed_precision: bf16
96
+ no_text_enc: false
97
+ nsf_type: none
98
+ num_heads: 2
99
+ out_wav_norm: true
100
+ pad_frames: false
101
+
102
+ precision: fp16
103
+ predict_pitch: false
104
+ resblock: '1'
105
+ resblock_dilation_sizes:
106
+ - - 1
107
+ - 3
108
+ - 5
109
+ - - 1
110
+ - 3
111
+ - 5
112
+ - - 1
113
+ - 3
114
+ - 5
115
+ resblock_kernel_sizes:
116
+ - 3
117
+ - 7
118
+ - 11
119
+ train_spk_embed_only: false
120
+ upsample_initial_channel: 512
121
+ upsample_kernel_sizes:
122
+ - 12
123
+ - 11
124
+ - 8
125
+ - 4
126
+ upsample_rates:
127
+ - 6
128
+ - 5
129
+ - 4
130
+ - 2
131
+ use_bert_input: false
132
+ use_cfg: true
133
+ use_char: true
134
+ use_cur_global: false
135
+ use_cur_global_dec: true
136
+ use_dur_embed: true
137
+ use_dur_mask_embed: true
138
+ use_ema: false
139
+ use_expand_ph: true
140
+ use_finegrained_spk: false
141
+ use_global_lat: false
142
+ use_gt_dur: false
143
+ use_gt_f0: false
144
+ use_mix_spk_embed: false
145
+ use_new_vae: false
146
+ use_ph_level_f0: false
147
+ use_ph_pos_embed: true
148
+ use_pitch_embed: false
149
+ use_pitch_embed_dec: false
150
+ use_pitch_pred: true
151
+ use_pos_embed: true
152
+ use_qk_norm: true
153
+ use_random_spk_embed: false
154
+ use_seq_cfg: true
155
+ use_spk_embed: false
156
+ use_spk_enc: true
157
+ use_spk_id: false
158
+ use_uv: true
159
+ use_vae: true
160
+ use_vpcfm: true
161
+ use_vqvae: true
162
+ use_word_encoder: true
163
+ use_word_input: false
164
+ vae_dur_grad: 0.1
165
+ vae_enc_hidden_size: 384
166
+ vae_stride: 4
167
+ vae_word_conder_layers: 0
168
+ vq_stride: 8
169
+ win_size: 1200
170
+ word_dict_size: 10000
171
+ melgan_config:
172
+ all_noise: false
173
+ backbone_resampling: librosa_kaiser_best
174
+ batch_size: 8
175
+ cond_disc: false
176
+ dim_pitch_condition: 1
177
+ downsamp_factor: 4
178
+ epochs: 1000
179
+ frame_shift: 240
180
+ lambda_feat: 0.0
181
+ lambda_log_pitch: 0.4
182
+ lambda_voiced: 1.0
183
+ load_D: 1
184
+ log_interval: 100
185
+ loss_pitch: 1.0
186
+ loss_speaker: 1.0
187
+ loss_stft: 0.0
188
+ lr: 0.0005
189
+ mode_pitch_condition: singgan_torch
190
+ multi_resolution: 0
191
+ n_layers_D: 4
192
+ n_mel_channels: 160
193
+ n_residual_layers: 4
194
+ n_test_samples: 5
195
+ ndf: 16
196
+ noise_index: 1.0
197
+ nr: 0
198
+ num_D: 3
199
+ num_band: 1
200
+ num_workers: 0
201
+ offset: 0
202
+ pretrain_steps: 0
203
+ res_layers: 1
204
+ run_hdfs: 0
205
+ sampling_rate: 24000
206
+ save_interval: 5000
207
+ seq_len: 100
208
+ single_stft: 0
209
+ sub_dis: 1
210
+ tf: 1
211
+ tf_end_ratio: 0.0
212
+ tf_end_step: 0
213
+ tf_start_ratio: 0.0
214
+ tf_start_step: 0
215
+ up_sample:
216
+ - 5
217
+ - 4
218
+ - 4
219
+ - 3
220
+ use_F_dis: 0
221
+ use_aug_pitch: 0
222
+ use_interpolate: 0
223
+ use_lsgan: 1
224
+ use_mel_loss: 1
225
+ use_melnorm: 0
226
+ use_msg_gan: 0
227
+ use_pitch_condition: false
228
+ use_pitch_prediction: 1
229
+ use_sbd: 0
230
+ use_speaker_prediction: 0
231
+ use_tanh: true
232
+ use_time_loss: 1
wavvae/model_only_last.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f432d0740f5238c573be49097a69ac5271e24d0d310e6ea63023f038e5e408a
3
+ size 1022420843