Spaces:
Configuration error
Configuration error
Update YAML configs for the 25 Hz recipe
Browse files
examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
CHANGED
|
@@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|
| 18 |
text_encoder_input_size: !ref <text_encoder_input_size>
|
| 19 |
llm_input_size: !ref <llm_input_size>
|
| 20 |
llm_output_size: !ref <llm_output_size>
|
| 21 |
-
text_token_size: 51866
|
| 22 |
speech_token_size: 4096
|
| 23 |
length_normalized_loss: True
|
| 24 |
lsm_weight: 0
|
|
@@ -66,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
|
| 66 |
spk_embed_dim: !ref <spk_embed_dim>
|
| 67 |
output_type: 'mel'
|
| 68 |
vocab_size: 4096
|
| 69 |
-
input_frame_rate: 50
|
| 70 |
only_mask_loss: True
|
| 71 |
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
| 72 |
output_size: 512
|
|
@@ -135,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
|
|
| 135 |
|
| 136 |
# processor functions
|
| 137 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
| 138 |
-
get_tokenizer: !name:whisper.tokenizer.get_tokenizer
|
| 139 |
multilingual: True
|
| 140 |
num_languages: 100
|
| 141 |
language: 'en'
|
|
|
|
| 18 |
text_encoder_input_size: !ref <text_encoder_input_size>
|
| 19 |
llm_input_size: !ref <llm_input_size>
|
| 20 |
llm_output_size: !ref <llm_output_size>
|
| 21 |
+
text_token_size: 51866 # change to 60515 if you want to train with the CosyVoice-300M-25Hz recipe
|
| 22 |
speech_token_size: 4096
|
| 23 |
length_normalized_loss: True
|
| 24 |
lsm_weight: 0
|
|
|
|
| 66 |
spk_embed_dim: !ref <spk_embed_dim>
|
| 67 |
output_type: 'mel'
|
| 68 |
vocab_size: 4096
|
| 69 |
+
input_frame_rate: 50 # change to 25 if you want to train with the CosyVoice-300M-25Hz recipe
|
| 70 |
only_mask_loss: True
|
| 71 |
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
| 72 |
output_size: 512
|
|
|
|
| 135 |
|
| 136 |
# processor functions
|
| 137 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
| 138 |
+
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with the CosyVoice-300M-25Hz recipe
|
| 139 |
multilingual: True
|
| 140 |
num_languages: 100
|
| 141 |
language: 'en'
|
examples/libritts/cosyvoice/conf/cosyvoice.yaml
CHANGED
|
@@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|
| 18 |
text_encoder_input_size: !ref <text_encoder_input_size>
|
| 19 |
llm_input_size: !ref <llm_input_size>
|
| 20 |
llm_output_size: !ref <llm_output_size>
|
| 21 |
-
text_token_size: 51866
|
| 22 |
speech_token_size: 4096
|
| 23 |
length_normalized_loss: True
|
| 24 |
lsm_weight: 0
|
|
@@ -66,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
|
| 66 |
spk_embed_dim: !ref <spk_embed_dim>
|
| 67 |
output_type: 'mel'
|
| 68 |
vocab_size: 4096
|
| 69 |
-
input_frame_rate: 50
|
| 70 |
only_mask_loss: True
|
| 71 |
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
| 72 |
output_size: 512
|
|
@@ -135,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
|
|
| 135 |
|
| 136 |
# processor functions
|
| 137 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
| 138 |
-
get_tokenizer: !name:whisper.tokenizer.get_tokenizer
|
| 139 |
multilingual: True
|
| 140 |
num_languages: 100
|
| 141 |
language: 'en'
|
|
|
|
| 18 |
text_encoder_input_size: !ref <text_encoder_input_size>
|
| 19 |
llm_input_size: !ref <llm_input_size>
|
| 20 |
llm_output_size: !ref <llm_output_size>
|
| 21 |
+
text_token_size: 51866 # change to 60515 if you want to train with the CosyVoice-300M-25Hz recipe
|
| 22 |
speech_token_size: 4096
|
| 23 |
length_normalized_loss: True
|
| 24 |
lsm_weight: 0
|
|
|
|
| 66 |
spk_embed_dim: !ref <spk_embed_dim>
|
| 67 |
output_type: 'mel'
|
| 68 |
vocab_size: 4096
|
| 69 |
+
input_frame_rate: 50 # change to 25 if you want to train with the CosyVoice-300M-25Hz recipe
|
| 70 |
only_mask_loss: True
|
| 71 |
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
| 72 |
output_size: 512
|
|
|
|
| 135 |
|
| 136 |
# processor functions
|
| 137 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
| 138 |
+
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with the CosyVoice-300M-25Hz recipe
|
| 139 |
multilingual: True
|
| 140 |
num_languages: 100
|
| 141 |
language: 'en'
|
examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml
CHANGED
|
@@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|
| 18 |
text_encoder_input_size: !ref <text_encoder_input_size>
|
| 19 |
llm_input_size: !ref <llm_input_size>
|
| 20 |
llm_output_size: !ref <llm_output_size>
|
| 21 |
-
text_token_size: 51866
|
| 22 |
speech_token_size: 4096
|
| 23 |
length_normalized_loss: True
|
| 24 |
lsm_weight: 0
|
|
@@ -54,6 +54,11 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|
| 54 |
pos_enc_layer_type: 'rel_pos_espnet'
|
| 55 |
selfattention_layer_type: 'rel_selfattn'
|
| 56 |
static_chunk_size: 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
| 59 |
input_size: 512
|
|
@@ -61,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
|
| 61 |
spk_embed_dim: !ref <spk_embed_dim>
|
| 62 |
output_type: 'mel'
|
| 63 |
vocab_size: 4096
|
| 64 |
-
input_frame_rate: 50
|
| 65 |
only_mask_loss: True
|
| 66 |
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
| 67 |
output_size: 512
|
|
@@ -130,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
|
|
| 130 |
|
| 131 |
# processor functions
|
| 132 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
| 133 |
-
get_tokenizer: !name:whisper.tokenizer.get_tokenizer
|
| 134 |
multilingual: True
|
| 135 |
num_languages: 100
|
| 136 |
language: 'en'
|
|
|
|
| 18 |
text_encoder_input_size: !ref <text_encoder_input_size>
|
| 19 |
llm_input_size: !ref <llm_input_size>
|
| 20 |
llm_output_size: !ref <llm_output_size>
|
| 21 |
+
text_token_size: 51866 # change to 60515 if you want to train with the CosyVoice-300M-25Hz recipe
|
| 22 |
speech_token_size: 4096
|
| 23 |
length_normalized_loss: True
|
| 24 |
lsm_weight: 0
|
|
|
|
| 54 |
pos_enc_layer_type: 'rel_pos_espnet'
|
| 55 |
selfattention_layer_type: 'rel_selfattn'
|
| 56 |
static_chunk_size: 1
|
| 57 |
+
sampling: !name:cosyvoice.utils.common.ras_sampling
|
| 58 |
+
top_p: 0.8
|
| 59 |
+
top_k: 25
|
| 60 |
+
win_size: 10
|
| 61 |
+
tau_r: 0.1
|
| 62 |
|
| 63 |
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
| 64 |
input_size: 512
|
|
|
|
| 66 |
spk_embed_dim: !ref <spk_embed_dim>
|
| 67 |
output_type: 'mel'
|
| 68 |
vocab_size: 4096
|
| 69 |
+
input_frame_rate: 50 # change to 25 if you want to train with the CosyVoice-300M-25Hz recipe
|
| 70 |
only_mask_loss: True
|
| 71 |
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
| 72 |
output_size: 512
|
|
|
|
| 135 |
|
| 136 |
# processor functions
|
| 137 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
| 138 |
+
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with the CosyVoice-300M-25Hz recipe
|
| 139 |
multilingual: True
|
| 140 |
num_languages: 100
|
| 141 |
language: 'en'
|
examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml
CHANGED
|
@@ -18,7 +18,7 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|
| 18 |
text_encoder_input_size: !ref <text_encoder_input_size>
|
| 19 |
llm_input_size: !ref <llm_input_size>
|
| 20 |
llm_output_size: !ref <llm_output_size>
|
| 21 |
-
text_token_size: 51866
|
| 22 |
speech_token_size: 4096
|
| 23 |
length_normalized_loss: True
|
| 24 |
lsm_weight: 0
|
|
@@ -54,6 +54,11 @@ llm: !new:cosyvoice.llm.llm.TransformerLM
|
|
| 54 |
pos_enc_layer_type: 'rel_pos_espnet'
|
| 55 |
selfattention_layer_type: 'rel_selfattn'
|
| 56 |
static_chunk_size: 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
| 59 |
input_size: 512
|
|
@@ -61,7 +66,7 @@ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
|
| 61 |
spk_embed_dim: !ref <spk_embed_dim>
|
| 62 |
output_type: 'mel'
|
| 63 |
vocab_size: 4096
|
| 64 |
-
input_frame_rate: 50
|
| 65 |
only_mask_loss: True
|
| 66 |
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
| 67 |
output_size: 512
|
|
@@ -130,7 +135,7 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
|
|
| 130 |
|
| 131 |
# processor functions
|
| 132 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
| 133 |
-
get_tokenizer: !name:whisper.tokenizer.get_tokenizer
|
| 134 |
multilingual: True
|
| 135 |
num_languages: 100
|
| 136 |
language: 'en'
|
|
|
|
| 18 |
text_encoder_input_size: !ref <text_encoder_input_size>
|
| 19 |
llm_input_size: !ref <llm_input_size>
|
| 20 |
llm_output_size: !ref <llm_output_size>
|
| 21 |
+
text_token_size: 51866 # change to 60515 if you want to train with the CosyVoice-300M-25Hz recipe
|
| 22 |
speech_token_size: 4096
|
| 23 |
length_normalized_loss: True
|
| 24 |
lsm_weight: 0
|
|
|
|
| 54 |
pos_enc_layer_type: 'rel_pos_espnet'
|
| 55 |
selfattention_layer_type: 'rel_selfattn'
|
| 56 |
static_chunk_size: 1
|
| 57 |
+
sampling: !name:cosyvoice.utils.common.ras_sampling
|
| 58 |
+
top_p: 0.8
|
| 59 |
+
top_k: 25
|
| 60 |
+
win_size: 10
|
| 61 |
+
tau_r: 0.1
|
| 62 |
|
| 63 |
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
|
| 64 |
input_size: 512
|
|
|
|
| 66 |
spk_embed_dim: !ref <spk_embed_dim>
|
| 67 |
output_type: 'mel'
|
| 68 |
vocab_size: 4096
|
| 69 |
+
input_frame_rate: 50 # change to 25 if you want to train with the CosyVoice-300M-25Hz recipe
|
| 70 |
only_mask_loss: True
|
| 71 |
encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
|
| 72 |
output_size: 512
|
|
|
|
| 135 |
|
| 136 |
# processor functions
|
| 137 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
| 138 |
+
get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with the CosyVoice-300M-25Hz recipe
|
| 139 |
multilingual: True
|
| 140 |
num_languages: 100
|
| 141 |
language: 'en'
|