Upload 7 files
Browse files- zipvoice/zipvoice_base.json +26 -0
- zipvoice_dialog/zipvoice_base.json +26 -0
- zipvoice_dialog_opendialog/zipvoice_base.json +26 -0
- zipvoice_dialog_stereo/zipvoice_base.json +26 -0
- zipvoice_distill/zipvoice_base.json +26 -0
- zipvoice_distill_libritts/zipvoice_base.json +26 -0
- zipvoice_libritts/zipvoice_base.json +26 -0
zipvoice/zipvoice_base.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model" : {
|
3 |
+
"fm_decoder_downsampling_factor" : [1,2,4,2,1],
|
4 |
+
"fm_decoder_num_layers" : [2,2,4,4,4],
|
5 |
+
"fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
|
6 |
+
"fm_decoder_feedforward_dim" : 1536,
|
7 |
+
"fm_decoder_num_heads" : 4,
|
8 |
+
"fm_decoder_dim" : 512,
|
9 |
+
"text_encoder_num_layers" : 4,
|
10 |
+
"text_encoder_feedforward_dim" : 512,
|
11 |
+
"text_encoder_cnn_module_kernel" : 9,
|
12 |
+
"text_encoder_num_heads" : 4,
|
13 |
+
"text_encoder_dim" : 192,
|
14 |
+
"query_head_dim" : 32,
|
15 |
+
"value_head_dim" : 12,
|
16 |
+
"pos_head_dim" : 4,
|
17 |
+
"pos_dim" : 48,
|
18 |
+
"time_embed_dim" : 192,
|
19 |
+
"text_embed_dim" : 192,
|
20 |
+
"feat_dim": 100
|
21 |
+
},
|
22 |
+
"feature" : {
|
23 |
+
"sampling_rate": 24000,
|
24 |
+
"type": "vocos"
|
25 |
+
}
|
26 |
+
}
|
zipvoice_dialog/zipvoice_base.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model" : {
|
3 |
+
"fm_decoder_downsampling_factor" : [1,2,4,2,1],
|
4 |
+
"fm_decoder_num_layers" : [2,2,4,4,4],
|
5 |
+
"fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
|
6 |
+
"fm_decoder_feedforward_dim" : 1536,
|
7 |
+
"fm_decoder_num_heads" : 4,
|
8 |
+
"fm_decoder_dim" : 512,
|
9 |
+
"text_encoder_num_layers" : 4,
|
10 |
+
"text_encoder_feedforward_dim" : 512,
|
11 |
+
"text_encoder_cnn_module_kernel" : 9,
|
12 |
+
"text_encoder_num_heads" : 4,
|
13 |
+
"text_encoder_dim" : 192,
|
14 |
+
"query_head_dim" : 32,
|
15 |
+
"value_head_dim" : 12,
|
16 |
+
"pos_head_dim" : 4,
|
17 |
+
"pos_dim" : 48,
|
18 |
+
"time_embed_dim" : 192,
|
19 |
+
"text_embed_dim" : 192,
|
20 |
+
"feat_dim": 100
|
21 |
+
},
|
22 |
+
"feature" : {
|
23 |
+
"sampling_rate": 24000,
|
24 |
+
"type": "vocos"
|
25 |
+
}
|
26 |
+
}
|
zipvoice_dialog_opendialog/zipvoice_base.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model" : {
|
3 |
+
"fm_decoder_downsampling_factor" : [1,2,4,2,1],
|
4 |
+
"fm_decoder_num_layers" : [2,2,4,4,4],
|
5 |
+
"fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
|
6 |
+
"fm_decoder_feedforward_dim" : 1536,
|
7 |
+
"fm_decoder_num_heads" : 4,
|
8 |
+
"fm_decoder_dim" : 512,
|
9 |
+
"text_encoder_num_layers" : 4,
|
10 |
+
"text_encoder_feedforward_dim" : 512,
|
11 |
+
"text_encoder_cnn_module_kernel" : 9,
|
12 |
+
"text_encoder_num_heads" : 4,
|
13 |
+
"text_encoder_dim" : 192,
|
14 |
+
"query_head_dim" : 32,
|
15 |
+
"value_head_dim" : 12,
|
16 |
+
"pos_head_dim" : 4,
|
17 |
+
"pos_dim" : 48,
|
18 |
+
"time_embed_dim" : 192,
|
19 |
+
"text_embed_dim" : 192,
|
20 |
+
"feat_dim": 100
|
21 |
+
},
|
22 |
+
"feature" : {
|
23 |
+
"sampling_rate": 24000,
|
24 |
+
"type": "vocos"
|
25 |
+
}
|
26 |
+
}
|
zipvoice_dialog_stereo/zipvoice_base.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model" : {
|
3 |
+
"fm_decoder_downsampling_factor" : [1,2,4,2,1],
|
4 |
+
"fm_decoder_num_layers" : [2,2,4,4,4],
|
5 |
+
"fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
|
6 |
+
"fm_decoder_feedforward_dim" : 1536,
|
7 |
+
"fm_decoder_num_heads" : 4,
|
8 |
+
"fm_decoder_dim" : 512,
|
9 |
+
"text_encoder_num_layers" : 4,
|
10 |
+
"text_encoder_feedforward_dim" : 512,
|
11 |
+
"text_encoder_cnn_module_kernel" : 9,
|
12 |
+
"text_encoder_num_heads" : 4,
|
13 |
+
"text_encoder_dim" : 192,
|
14 |
+
"query_head_dim" : 32,
|
15 |
+
"value_head_dim" : 12,
|
16 |
+
"pos_head_dim" : 4,
|
17 |
+
"pos_dim" : 48,
|
18 |
+
"time_embed_dim" : 192,
|
19 |
+
"text_embed_dim" : 192,
|
20 |
+
"feat_dim": 100
|
21 |
+
},
|
22 |
+
"feature" : {
|
23 |
+
"sampling_rate": 24000,
|
24 |
+
"type": "vocos"
|
25 |
+
}
|
26 |
+
}
|
zipvoice_distill/zipvoice_base.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model" : {
|
3 |
+
"fm_decoder_downsampling_factor" : [1,2,4,2,1],
|
4 |
+
"fm_decoder_num_layers" : [2,2,4,4,4],
|
5 |
+
"fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
|
6 |
+
"fm_decoder_feedforward_dim" : 1536,
|
7 |
+
"fm_decoder_num_heads" : 4,
|
8 |
+
"fm_decoder_dim" : 512,
|
9 |
+
"text_encoder_num_layers" : 4,
|
10 |
+
"text_encoder_feedforward_dim" : 512,
|
11 |
+
"text_encoder_cnn_module_kernel" : 9,
|
12 |
+
"text_encoder_num_heads" : 4,
|
13 |
+
"text_encoder_dim" : 192,
|
14 |
+
"query_head_dim" : 32,
|
15 |
+
"value_head_dim" : 12,
|
16 |
+
"pos_head_dim" : 4,
|
17 |
+
"pos_dim" : 48,
|
18 |
+
"time_embed_dim" : 192,
|
19 |
+
"text_embed_dim" : 192,
|
20 |
+
"feat_dim": 100
|
21 |
+
},
|
22 |
+
"feature" : {
|
23 |
+
"sampling_rate": 24000,
|
24 |
+
"type": "vocos"
|
25 |
+
}
|
26 |
+
}
|
zipvoice_distill_libritts/zipvoice_base.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model" : {
|
3 |
+
"fm_decoder_downsampling_factor" : [1,2,4,2,1],
|
4 |
+
"fm_decoder_num_layers" : [2,2,4,4,4],
|
5 |
+
"fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
|
6 |
+
"fm_decoder_feedforward_dim" : 1536,
|
7 |
+
"fm_decoder_num_heads" : 4,
|
8 |
+
"fm_decoder_dim" : 512,
|
9 |
+
"text_encoder_num_layers" : 4,
|
10 |
+
"text_encoder_feedforward_dim" : 512,
|
11 |
+
"text_encoder_cnn_module_kernel" : 9,
|
12 |
+
"text_encoder_num_heads" : 4,
|
13 |
+
"text_encoder_dim" : 192,
|
14 |
+
"query_head_dim" : 32,
|
15 |
+
"value_head_dim" : 12,
|
16 |
+
"pos_head_dim" : 4,
|
17 |
+
"pos_dim" : 48,
|
18 |
+
"time_embed_dim" : 192,
|
19 |
+
"text_embed_dim" : 192,
|
20 |
+
"feat_dim": 100
|
21 |
+
},
|
22 |
+
"feature" : {
|
23 |
+
"sampling_rate": 24000,
|
24 |
+
"type": "vocos"
|
25 |
+
}
|
26 |
+
}
|
zipvoice_libritts/zipvoice_base.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model" : {
|
3 |
+
"fm_decoder_downsampling_factor" : [1,2,4,2,1],
|
4 |
+
"fm_decoder_num_layers" : [2,2,4,4,4],
|
5 |
+
"fm_decoder_cnn_module_kernel" : [31,15,7,15,31],
|
6 |
+
"fm_decoder_feedforward_dim" : 1536,
|
7 |
+
"fm_decoder_num_heads" : 4,
|
8 |
+
"fm_decoder_dim" : 512,
|
9 |
+
"text_encoder_num_layers" : 4,
|
10 |
+
"text_encoder_feedforward_dim" : 512,
|
11 |
+
"text_encoder_cnn_module_kernel" : 9,
|
12 |
+
"text_encoder_num_heads" : 4,
|
13 |
+
"text_encoder_dim" : 192,
|
14 |
+
"query_head_dim" : 32,
|
15 |
+
"value_head_dim" : 12,
|
16 |
+
"pos_head_dim" : 4,
|
17 |
+
"pos_dim" : 48,
|
18 |
+
"time_embed_dim" : 192,
|
19 |
+
"text_embed_dim" : 192,
|
20 |
+
"feat_dim": 100
|
21 |
+
},
|
22 |
+
"feature" : {
|
23 |
+
"sampling_rate": 24000,
|
24 |
+
"type": "vocos"
|
25 |
+
}
|
26 |
+
}
|