Text-to-Speech
ONNX
Safetensors
English
Chinese
zhu-han commited on
Commit
7337d84
·
verified ·
1 Parent(s): 6e78681

Upload 7 files

Browse files
zipvoice/zipvoice_base.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model" : {
3
+ "fm_decoder_downsampling_factor" : [1,2,4,2,1],
4
+ "fm_decoder_num_layers" : [2,2,4,4,4],
5
+ "fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
6
+ "fm_decoder_feedforward_dim" : 1536,
7
+ "fm_decoder_num_heads" : 4,
8
+ "fm_decoder_dim" : 512,
9
+ "text_encoder_num_layers" : 4,
10
+ "text_encoder_feedforward_dim" : 512,
11
+ "text_encoder_cnn_module_kernel" : 9,
12
+ "text_encoder_num_heads" : 4,
13
+ "text_encoder_dim" : 192,
14
+ "query_head_dim" : 32,
15
+ "value_head_dim" : 12,
16
+ "pos_head_dim" : 4,
17
+ "pos_dim" : 48,
18
+ "time_embed_dim" : 192,
19
+ "text_embed_dim" : 192,
20
+ "feat_dim": 100
21
+ },
22
+ "feature" : {
23
+ "sampling_rate": 24000,
24
+ "type": "vocos"
25
+ }
26
+ }
zipvoice_dialog/zipvoice_base.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model" : {
3
+ "fm_decoder_downsampling_factor" : [1,2,4,2,1],
4
+ "fm_decoder_num_layers" : [2,2,4,4,4],
5
+ "fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
6
+ "fm_decoder_feedforward_dim" : 1536,
7
+ "fm_decoder_num_heads" : 4,
8
+ "fm_decoder_dim" : 512,
9
+ "text_encoder_num_layers" : 4,
10
+ "text_encoder_feedforward_dim" : 512,
11
+ "text_encoder_cnn_module_kernel" : 9,
12
+ "text_encoder_num_heads" : 4,
13
+ "text_encoder_dim" : 192,
14
+ "query_head_dim" : 32,
15
+ "value_head_dim" : 12,
16
+ "pos_head_dim" : 4,
17
+ "pos_dim" : 48,
18
+ "time_embed_dim" : 192,
19
+ "text_embed_dim" : 192,
20
+ "feat_dim": 100
21
+ },
22
+ "feature" : {
23
+ "sampling_rate": 24000,
24
+ "type": "vocos"
25
+ }
26
+ }
zipvoice_dialog_opendialog/zipvoice_base.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model" : {
3
+ "fm_decoder_downsampling_factor" : [1,2,4,2,1],
4
+ "fm_decoder_num_layers" : [2,2,4,4,4],
5
+ "fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
6
+ "fm_decoder_feedforward_dim" : 1536,
7
+ "fm_decoder_num_heads" : 4,
8
+ "fm_decoder_dim" : 512,
9
+ "text_encoder_num_layers" : 4,
10
+ "text_encoder_feedforward_dim" : 512,
11
+ "text_encoder_cnn_module_kernel" : 9,
12
+ "text_encoder_num_heads" : 4,
13
+ "text_encoder_dim" : 192,
14
+ "query_head_dim" : 32,
15
+ "value_head_dim" : 12,
16
+ "pos_head_dim" : 4,
17
+ "pos_dim" : 48,
18
+ "time_embed_dim" : 192,
19
+ "text_embed_dim" : 192,
20
+ "feat_dim": 100
21
+ },
22
+ "feature" : {
23
+ "sampling_rate": 24000,
24
+ "type": "vocos"
25
+ }
26
+ }
zipvoice_dialog_stereo/zipvoice_base.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model" : {
3
+ "fm_decoder_downsampling_factor" : [1,2,4,2,1],
4
+ "fm_decoder_num_layers" : [2,2,4,4,4],
5
+ "fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
6
+ "fm_decoder_feedforward_dim" : 1536,
7
+ "fm_decoder_num_heads" : 4,
8
+ "fm_decoder_dim" : 512,
9
+ "text_encoder_num_layers" : 4,
10
+ "text_encoder_feedforward_dim" : 512,
11
+ "text_encoder_cnn_module_kernel" : 9,
12
+ "text_encoder_num_heads" : 4,
13
+ "text_encoder_dim" : 192,
14
+ "query_head_dim" : 32,
15
+ "value_head_dim" : 12,
16
+ "pos_head_dim" : 4,
17
+ "pos_dim" : 48,
18
+ "time_embed_dim" : 192,
19
+ "text_embed_dim" : 192,
20
+ "feat_dim": 100
21
+ },
22
+ "feature" : {
23
+ "sampling_rate": 24000,
24
+ "type": "vocos"
25
+ }
26
+ }
zipvoice_distill/zipvoice_base.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model" : {
3
+ "fm_decoder_downsampling_factor" : [1,2,4,2,1],
4
+ "fm_decoder_num_layers" : [2,2,4,4,4],
5
+ "fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
6
+ "fm_decoder_feedforward_dim" : 1536,
7
+ "fm_decoder_num_heads" : 4,
8
+ "fm_decoder_dim" : 512,
9
+ "text_encoder_num_layers" : 4,
10
+ "text_encoder_feedforward_dim" : 512,
11
+ "text_encoder_cnn_module_kernel" : 9,
12
+ "text_encoder_num_heads" : 4,
13
+ "text_encoder_dim" : 192,
14
+ "query_head_dim" : 32,
15
+ "value_head_dim" : 12,
16
+ "pos_head_dim" : 4,
17
+ "pos_dim" : 48,
18
+ "time_embed_dim" : 192,
19
+ "text_embed_dim" : 192,
20
+ "feat_dim": 100
21
+ },
22
+ "feature" : {
23
+ "sampling_rate": 24000,
24
+ "type": "vocos"
25
+ }
26
+ }
zipvoice_distill_libritts/zipvoice_base.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model" : {
3
+ "fm_decoder_downsampling_factor" : [1,2,4,2,1],
4
+ "fm_decoder_num_layers" : [2,2,4,4,4],
5
+ "fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
6
+ "fm_decoder_feedforward_dim" : 1536,
7
+ "fm_decoder_num_heads" : 4,
8
+ "fm_decoder_dim" : 512,
9
+ "text_encoder_num_layers" : 4,
10
+ "text_encoder_feedforward_dim" : 512,
11
+ "text_encoder_cnn_module_kernel" : 9,
12
+ "text_encoder_num_heads" : 4,
13
+ "text_encoder_dim" : 192,
14
+ "query_head_dim" : 32,
15
+ "value_head_dim" : 12,
16
+ "pos_head_dim" : 4,
17
+ "pos_dim" : 48,
18
+ "time_embed_dim" : 192,
19
+ "text_embed_dim" : 192,
20
+ "feat_dim": 100
21
+ },
22
+ "feature" : {
23
+ "sampling_rate": 24000,
24
+ "type": "vocos"
25
+ }
26
+ }
zipvoice_libritts/zipvoice_base.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model" : {
3
+ "fm_decoder_downsampling_factor" : [1,2,4,2,1],
4
+ "fm_decoder_num_layers" : [2,2,4,4,4],
5
+ "fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
6
+ "fm_decoder_feedforward_dim" : 1536,
7
+ "fm_decoder_num_heads" : 4,
8
+ "fm_decoder_dim" : 512,
9
+ "text_encoder_num_layers" : 4,
10
+ "text_encoder_feedforward_dim" : 512,
11
+ "text_encoder_cnn_module_kernel" : 9,
12
+ "text_encoder_num_heads" : 4,
13
+ "text_encoder_dim" : 192,
14
+ "query_head_dim" : 32,
15
+ "value_head_dim" : 12,
16
+ "pos_head_dim" : 4,
17
+ "pos_dim" : 48,
18
+ "time_embed_dim" : 192,
19
+ "text_embed_dim" : 192,
20
+ "feat_dim": 100
21
+ },
22
+ "feature" : {
23
+ "sampling_rate": 24000,
24
+ "type": "vocos"
25
+ }
26
+ }