Update README.md
Browse files
    	
        README.md
    CHANGED
    
    | @@ -21,19 +21,80 @@ This model is a fine-tuned version of [microsoft/speecht5_tts](https://huggingfa | |
| 21 | 
             
            It achieves the following results on the evaluation set:
         | 
| 22 | 
             
            - Loss: 0.4675
         | 
| 23 |  | 
| 24 | 
            -
            ##  | 
|  | |
| 25 |  | 
| 26 | 
            -
             | 
|  | |
| 27 |  | 
| 28 | 
            -
            ## Intended uses & limitations
         | 
| 29 |  | 
| 30 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 31 |  | 
| 32 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
| 33 |  | 
| 34 | 
            -
            More information needed
         | 
| 35 |  | 
| 36 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 37 |  | 
| 38 | 
             
            ### Training hyperparameters
         | 
| 39 |  | 
|  | |
| 21 | 
             
            It achieves the following results on the evaluation set:
         | 
| 22 | 
             
            - Loss: 0.4675
         | 
| 23 |  | 
| 24 | 
            +
            ## How to use/inference
         | 
| 25 | 
            +
            Follow the example below and adapt it to your own needs.
         | 
| 26 |  | 
| 27 | 
            +
            ```
         | 
| 28 | 
            +
            # ft_t5_id_inference.py
         | 
| 29 |  | 
|  | |
| 30 |  | 
| 31 | 
            +
            import sounddevice as sd
         | 
| 32 | 
            +
            import torch
         | 
| 33 | 
            +
            import torchaudio
         | 
| 34 | 
            +
            from datasets import Audio, load_dataset
         | 
| 35 | 
            +
            from transformers import (
         | 
| 36 | 
            +
                SpeechT5ForTextToSpeech,
         | 
| 37 | 
            +
                SpeechT5HifiGan,
         | 
| 38 | 
            +
                SpeechT5Processor,
         | 
| 39 | 
            +
            )
         | 
| 40 | 
            +
            from utils import create_speaker_embedding
         | 
| 41 |  | 
| 42 | 
            +
            # load dataset and pre-trained model
         | 
| 43 | 
            +
            dataset = load_dataset(
         | 
| 44 | 
            +
                "mozilla-foundation/common_voice_16_1", "id", split="test")
         | 
| 45 | 
            +
            model = SpeechT5ForTextToSpeech.from_pretrained(
         | 
| 46 | 
            +
                "Bagus/speecht5_finetuned_commonvoice_id")
         | 
| 47 |  | 
|  | |
| 48 |  | 
| 49 | 
            +
            # load the text processor from the base checkpoint
         | 
| 50 | 
            +
             | 
| 51 | 
            +
            checkpoint = "microsoft/speecht5_tts"
         | 
| 52 | 
            +
            processor = SpeechT5Processor.from_pretrained(checkpoint)
         | 
| 53 | 
            +
             | 
| 54 | 
            +
            sampling_rate = processor.feature_extractor.sampling_rate
         | 
| 55 | 
            +
            dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
         | 
| 56 | 
            +
             | 
| 57 | 
            +
             | 
| 58 | 
            +
            def prepare_dataset(example):
         | 
| 59 | 
            +
                audio = example["audio"]
         | 
| 60 | 
            +
             | 
| 61 | 
            +
                example = processor(
         | 
| 62 | 
            +
                    text=example["sentence"],
         | 
| 63 | 
            +
                    audio_target=audio["array"],
         | 
| 64 | 
            +
                    sampling_rate=audio["sampling_rate"],
         | 
| 65 | 
            +
                    return_attention_mask=False,
         | 
| 66 | 
            +
                )
         | 
| 67 | 
            +
             | 
| 68 | 
            +
                # strip off the batch dimension
         | 
| 69 | 
            +
                example["labels"] = example["labels"][0]
         | 
| 70 | 
            +
             | 
| 71 | 
            +
                # use SpeechBrain to obtain x-vector
         | 
| 72 | 
            +
                example["speaker_embeddings"] = create_speaker_embedding(audio["array"])
         | 
| 73 | 
            +
             | 
| 74 | 
            +
                return example
         | 
| 75 | 
            +
             | 
| 76 | 
            +
             | 
| 77 | 
            +
            # prepare the speaker embeddings from the dataset and text
         | 
| 78 | 
            +
            example = prepare_dataset(dataset[30])
         | 
| 79 | 
            +
            speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
         | 
| 80 | 
            +
             | 
| 81 | 
            +
            # prepare text to be converted to speech
         | 
| 82 | 
            +
            text = "Saya suka baju yang berwarna merah tua."
         | 
| 83 | 
            +
             | 
| 84 | 
            +
            inputs = processor(text=text, return_tensors="pt")
         | 
| 85 | 
            +
             | 
| 86 | 
            +
             | 
| 87 | 
            +
            vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
         | 
| 88 | 
            +
            speech = model.generate_speech(
         | 
| 89 | 
            +
                inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
         | 
| 90 | 
            +
             | 
| 91 | 
            +
            sampling_rate = 16000
         | 
| 92 | 
            +
            sd.play(speech, samplerate=sampling_rate, blocking=True)
         | 
| 93 | 
            +
             | 
| 94 | 
            +
            # save the audio; the signal needs to be a 2D tensor
         | 
| 95 | 
            +
            torchaudio.save("output_t5_ft_cv16_id.wav", speech.unsqueeze(0), 16000)
         | 
| 96 | 
            +
             | 
| 97 | 
            +
            ```
         | 
| 98 |  | 
| 99 | 
             
            ### Training hyperparameters
         | 
| 100 |  | 
