first commit

Browse files

Files changed (7) hide show

README.md +77 -0
asr.ckpt +3 -0
config.json +68 -0
hyperparams.yaml +119 -0
preprocessor_config.json +8 -0
tokenizer.ckpt +0 -0
wav2vec2.ckpt +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,77 @@

+---
+language: "fr"
+thumbnail:
+tags:
+- ASR
+- CTC
+- Attention
+- pytorch
+- speechbrain
+- Transformer
+license: "apache-2.0"
+datasets:
+- commonvoice
+metrics:
+- wer
+- cer
+---
+# wav2vec 2.0 with CTC/Attention trained on CommonVoice French (No LM)
+This repository provides all the necessary tools to perform automatic speech
+recognition from an end-to-end system pretrained on CommonVoice (French Language) within
+SpeechBrain. For a better experience, we encourage you to learn more about
+[SpeechBrain](https://speechbrain.github.io). The performance of the model is as follows:
+| Release | Test CER | Test WER | GPUs |
+|:-------------:|:--------------:|:--------------:| :--------:|
+| 29-04-21 | 6.54 | 13.90 | 2xV100 32GB |
+## Pipeline description
+This ASR system is composed of 2 different but linked blocks:
+1. Tokenizer (unigram) that transforms words into subword units and is trained on
+the train transcriptions (train.tsv) of CommonVoice (FR).
+2. Acoustic model (wav2vec2.0 + CTC/Attention). A pretrained wav2vec 2.0 model ([wav2vec2-large-xlsr-53-french](https://huggingface.co/facebook/wav2vec2-large-xlsr-53-french)) is combined with two DNN layers and finetuned on CommonVoice FR.
+The obtained final acoustic representation is given to the CTC and attention decoders.
+## Intended uses & limitations
+This model has been primarily developed to be run within SpeechBrain as a pretrained ASR model
+for the French language. Thanks to the flexibility of SpeechBrain, any of the 2 blocks
+detailed above can be extracted and connected to your custom pipeline as long as SpeechBrain is
+installed.
+## Install SpeechBrain
+First of all, please install transformers and SpeechBrain with the following command:
+```
+pip install speechbrain transformers
+```
+Please note that we encourage you to read our tutorials and learn more about
+[SpeechBrain](https://speechbrain.github.io).
+### Transcribing your own audio files (in French)
+```python
+from speechbrain.pretrained import EncoderDecoderASR
+asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-wav2vec2-commonvoice-fr", savedir="pretrained_models/asr-wav2vec2-commonvoice-fr")
+asr_model.transcribe_file("example-fr.wav")
+```
+#### Referencing SpeechBrain
+```
+@misc{SB2021,
+    author = {Ravanelli, Mirco and Parcollet, Titouan and Rouhe, Aku and Plantinga, Peter and Rastorgueva, Elena and Lugosch, Loren and Dawalatabad, Nauman and Ju-Chieh, Chou and Heba, Abdel and Grondin, Francois and Aris, William and Liao, Chien-Feng and Cornell, Samuele and Yeh, Sung-Lin and Na, Hwidong and Gao, Yan and Fu, Szu-Wei and Subakan, Cem and De Mori, Renato and Bengio, Yoshua },
+    title = {SpeechBrain},
+    year = {2021},
+    publisher = {GitHub},
+    journal = {GitHub repository},
+    howpublished = {\url{https://github.com/speechbrain/speechbrain}},
+  }
+```

asr.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ee40bc648d23dccd4d6d8cf77eb317aede679218ad192c96ad631921e7561024
+size 60570064

config.json ADDED Viewed

	@@ -0,0 +1,68 @@

+{
+  "activation_dropout": 0.1,
+  "apply_spec_augment": true,
+  "architectures": [
+    "Wav2Vec2ForCTC"
+  ],
+  "attention_dropout": 0.1,
+  "bos_token_id": 1,
+  "conv_bias": true,
+  "conv_dim": [
+    512,
+    512,
+    512,
+    512,
+    512,
+    512,
+    512
+  ],
+  "conv_kernel": [
+    10,
+    3,
+    3,
+    3,
+    3,
+    2,
+    2
+  ],
+  "conv_stride": [
+    5,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2
+  ],
+  "ctc_loss_reduction": "sum",
+  "ctc_zero_infinity": false,
+  "do_stable_layer_norm": true,
+  "eos_token_id": 2,
+  "feat_extract_activation": "gelu",
+  "feat_extract_dropout": 0.0,
+  "feat_extract_norm": "layer",
+  "feat_proj_dropout": 0.1,
+  "final_dropout": 0.1,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.1,
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "layerdrop": 0.1,
+  "mask_feature_length": 10,
+  "mask_feature_prob": 0.0,
+  "mask_time_length": 10,
+  "mask_time_prob": 0.05,
+  "model_type": "wav2vec2",
+  "num_attention_heads": 16,
+  "num_conv_pos_embedding_groups": 16,
+  "num_conv_pos_embeddings": 128,
+  "num_feat_extract_layers": 7,
+  "num_hidden_layers": 24,
+  "pad_token_id": 0,
+  "transformers_version": "4.4.0.dev0",
+  "vocab_size": 49
+}

hyperparams.yaml ADDED Viewed

	@@ -0,0 +1,119 @@

+# ################################
+# Model: wav2vec2 + DNN + CTC/Attention
+# Augmentation: SpecAugment
+# Authors: Titouan Parcollet 2021
+# ################################
+sample_rate: 16000
+wav2vec2_hub: facebook/wav2vec2-large-xlsr-53-french
+# BPE parameters
+token_type: unigram  # ["unigram", "bpe", "char"]
+character_coverage: 1.0
+# Model parameters
+activation: !name:torch.nn.LeakyReLU
+dnn_layers: 2
+dnn_neurons: 1024
+emb_size: 128
+dec_neurons: 1024
+# Outputs
+output_neurons: 500  # Subword vocabulary size; blank = 0, bos = 1, eos = 2 (see the indices below)
+# Decoding parameters
+# Make sure that the bos and eos indices match those of the tokenizer
+blank_index: 0
+bos_index: 1
+eos_index: 2
+min_decode_ratio: 0.0
+max_decode_ratio: 1.0
+beam_size: 80
+eos_threshold: 1.5
+using_max_attn_shift: True
+max_attn_shift: 140
+ctc_weight_decode: 0.0
+temperature: 1.50
+enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN
+    input_shape: [null, null, 1024]
+    activation: !ref <activation>
+    dnn_blocks: !ref <dnn_layers>
+    dnn_neurons: !ref <dnn_neurons>
+wav2vec2: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
+    source: !ref <wav2vec2_hub>
+    output_norm: True
+    freeze: True
+    pretrain: False
+    save_path: model_checkpoints
+emb: !new:speechbrain.nnet.embedding.Embedding
+    num_embeddings: !ref <output_neurons>
+    embedding_dim: !ref <emb_size>
+dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
+    enc_dim: !ref <dnn_neurons>
+    input_size: !ref <emb_size>
+    rnn_type: gru
+    attn_type: location
+    hidden_size: 1024
+    attn_dim: 1024
+    num_layers: 1
+    scaling: 1.0
+    channels: 10
+    kernel_size: 100
+    re_init: True
+    dropout: 0.15
+ctc_lin: !new:speechbrain.nnet.linear.Linear
+    input_size: !ref <dnn_neurons>
+    n_neurons: !ref <output_neurons>
+seq_lin: !new:speechbrain.nnet.linear.Linear
+    input_size: !ref <dec_neurons>
+    n_neurons: !ref <output_neurons>
+log_softmax: !new:speechbrain.nnet.activations.Softmax
+    apply_log: True
+ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
+    blank_index: !ref <blank_index>
+seq_cost: !name:speechbrain.nnet.losses.nll_loss
+    label_smoothing: 0.1
+asr_model: !new:torch.nn.ModuleList
+    - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>]
+tokenizer: !new:sentencepiece.SentencePieceProcessor
+encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
+    wav2vec2: !ref <wav2vec2>
+    enc: !ref <enc>
+decoder: !new:speechbrain.decoders.S2SRNNBeamSearcher
+    embedding: !ref <emb>
+    decoder: !ref <dec>
+    linear: !ref <seq_lin>
+    ctc_linear: !ref <ctc_lin>
+    bos_index: !ref <bos_index>
+    eos_index: !ref <eos_index>
+    blank_index: !ref <blank_index>
+    min_decode_ratio: !ref <min_decode_ratio>
+    max_decode_ratio: !ref <max_decode_ratio>
+    beam_size: !ref <beam_size>
+    eos_threshold: !ref <eos_threshold>
+    using_max_attn_shift: !ref <using_max_attn_shift>
+    max_attn_shift: !ref <max_attn_shift>
+    temperature: !ref <temperature>
+modules:
+    encoder: !ref <encoder>
+    decoder: !ref <decoder>
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+    loadables:
+        wav2vec2: !ref <wav2vec2>
+        asr: !ref <asr_model>
+        tokenizer: !ref <tokenizer>

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "do_normalize": true,
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "return_attention_mask": true,
+  "sampling_rate": 16000
+}

tokenizer.ckpt ADDED Viewed

Binary file (244 kB). View file

wav2vec2.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5675c122faaa76ed0e81e658a98a7bd6e498cd79f2f171b158a6dae10985c49c
+size 1261930757