diff --git "a/examples/speech_tokenizer/spiritlm_speech_tokenizer.ipynb" "b/examples/speech_tokenizer/spiritlm_speech_tokenizer.ipynb" new file mode 100644--- /dev/null +++ "b/examples/speech_tokenizer/spiritlm_speech_tokenizer.ipynb" @@ -0,0 +1,563 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2798b8a9-1f91-4d0c-bc48-d6dd7b27613c", + "metadata": {}, + "source": [ + "## Load audio" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "41a0601b-102d-40dc-ac60-823a1cbb07c5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original audio:\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import IPython.display as ipd\n", + "audio = \"../audio/7143-88743-0029.flac\"\n", + "print('Original audio:')\n", + "ipd.display(ipd.Audio(audio))" + ] + }, + { + "cell_type": "markdown", + "id": "098d128f-a120-4c14-819b-8241546c98b9", + "metadata": {}, + "source": [ + "## SpiritLM tokenizers" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e8128d42-4017-46d3-9095-732be0e3fc87", + "metadata": {}, + "outputs": [], + "source": [ + "from spiritlm.speech_tokenizer import spiritlm_base, spiritlm_expressive" + ] + }, + { + "cell_type": "markdown", + "id": "9ea2b4b6-cf4b-46b1-9e6b-a8a296c48400", + "metadata": {}, + "source": [ + "### SpiritLM-BASE tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9d1d8e02-58e7-4ec6-9238-ee3bcad01a5e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/.conda/envs/spiritlm/lib/python3.10/site-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n", + " WeightNorm.apply(module, name, dim)\n" + ] + } + ], + "source": [ + "## Load the 
tokenizer\n", + "tokenizer_base = spiritlm_base()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2d42054d-ad63-4c4e-aa6a-cda8b0321673", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SpiritLM-BASE: Encode audio into units (not deduplicated) \n", + " --------------------\n", + "{'audio': 'examples/audio/7143-88743-0029.flac', 'hubert': '99 49 38 149 149 71 423 427 492 288 315 153 153 389 497 412 247 354 7 96 452 452 176 266 266 77 248 336 336 211 166 65 94 224 224 148 492 191 440 440 41 41 457 79 382 451 332 216 114 340 478 74 79 370 272 370 370 53 477 65 171 60 258 111 111 111 111 338 338 23 23 338 23 338 338 338 7 338 338 149 406 7 361 361 361 99 99 99 99 99 99 99 209 209 209 209 209 479 50 50 7 149 149 35 35 130 130 169 169 72 434 119 272 4 249 245 245 433 159 294 139 359 343 269 302 226 370 216 459 424 424 226 382 7 58 138 428 397 350 350 306 306 306 84 11 171 171 60 314 227 227 355 9 58 138 226 370 272 382 334 330 176 176 307 145 248 493 64 44 388 7 111 111 111 111 23 23 481 149 149 80 70 431 457 79 79 249 249 245 245 245 433 433 316 316 180 458 458 458 86 86 225 103 60 96 119 119 129 356 218 4 259 259 392 490 75 488 166 65 171 60 7 54 54 85 85 361 361'}\n", + "\n", + "SpiritLM-BASE: Encode audio into string (deduplicated and sorted units) \n", + " --------------------\n", + 
"[Hu99][Hu49][Hu38][Hu149][Hu71][Hu423][Hu427][Hu492][Hu288][Hu315][Hu153][Hu389][Hu497][Hu412][Hu247][Hu354][Hu7][Hu96][Hu452][Hu176][Hu266][Hu77][Hu248][Hu336][Hu211][Hu166][Hu65][Hu94][Hu224][Hu148][Hu492][Hu191][Hu440][Hu41][Hu457][Hu79][Hu382][Hu451][Hu332][Hu216][Hu114][Hu340][Hu478][Hu74][Hu79][Hu370][Hu272][Hu370][Hu53][Hu477][Hu65][Hu171][Hu60][Hu258][Hu111][Hu338][Hu23][Hu338][Hu23][Hu338][Hu7][Hu338][Hu149][Hu406][Hu7][Hu361][Hu99][Hu209][Hu479][Hu50][Hu7][Hu149][Hu35][Hu130][Hu169][Hu72][Hu434][Hu119][Hu272][Hu4][Hu249][Hu245][Hu433][Hu159][Hu294][Hu139][Hu359][Hu343][Hu269][Hu302][Hu226][Hu370][Hu216][Hu459][Hu424][Hu226][Hu382][Hu7][Hu58][Hu138][Hu428][Hu397][Hu350][Hu306][Hu84][Hu11][Hu171][Hu60][Hu314][Hu227][Hu355][Hu9][Hu58][Hu138][Hu226][Hu370][Hu272][Hu382][Hu334][Hu330][Hu176][Hu307][Hu145][Hu248][Hu493][Hu64][Hu44][Hu388][Hu7][Hu111][Hu23][Hu481][Hu149][Hu80][Hu70][Hu431][Hu457][Hu79][Hu249][Hu245][Hu433][Hu316][Hu180][Hu458][Hu86][Hu225][Hu103][Hu60][Hu96][Hu119][Hu129][Hu356][Hu218][Hu4][Hu259][Hu392][Hu490][Hu75][Hu488][Hu166][Hu65][Hu171][Hu60][Hu7][Hu54][Hu85][Hu361]\n", + "\n", + "SpiritLM-BASE: Decode back to audio from units (not deduplicated) \n", + " --------------------\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "SpiritLM-BASE: Decode back to audio from string (deduplicated and sorted units) \n", + " --------------------\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "## encode_units\n", + "print('SpiritLM-BASE: Encode audio into units (not deduplicated) \\n', '-'*20)\n", + "units = tokenizer_base.encode_units(audio)\n", + "print(units)\n", + "\n", + "## encode_string\n", + "print('\\nSpiritLM-BASE: Encode audio 
into string (deduplicated and sorted units) \\n', '-'*20)\n", + "string_tokens = tokenizer_base.encode_string(audio)\n", + "print(string_tokens)\n", + "\n", + "## decode from units\n", + "print('\\nSpiritLM-BASE: Decode back to audio from units (not deduplicated) \\n', '-'*20)\n", + "resyn_wav = tokenizer_base.decode(units, speaker_id=2)\n", + "ipd.display(ipd.Audio(resyn_wav, rate=16000))\n", + "\n", + "## decode from string\n", + "print('\\nSpiritLM-BASE: Decode back to audio from string (deduplicated and sorted units) \\n', '-'*20)\n", + "resyn_dedup_wav = tokenizer_base.decode(string_tokens, speaker_id=2)\n", + "ipd.display(ipd.Audio(resyn_dedup_wav, rate=16000))" + ] + }, + { + "cell_type": "markdown", + "id": "9c0af45c-b397-4f41-b2e1-77f9199f5c3d", + "metadata": {}, + "source": [ + "### SpiritLM-EXPRESSIVE Tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e03a3594-f387-4b12-aaab-dd8e56a583de", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " [INFO]: device is not None, use cuda:0\n", + " [INFO] > call by:torchfcpe.tools.spawn_infer_cf_naive_mel_pe_from_pt\n", + " [WARN] args.model.use_harmonic_emb is None; use default False\n", + " [WARN] > call by:torchfcpe.tools.spawn_cf_naive_mel_pe\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/.conda/envs/spiritlm/lib/python3.10/site-packages/torchfcpe/models_infer.py:191: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. 
Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " ckpt = torch.load(pt_path, map_location=torch.device(device))\n", + "Some weights of Wav2Vec2StyleEncoder were not initialized from the model checkpoint at checkpoints/speech_tokenizer/style_encoder_w2v2 and are newly initialized: ['_float_tensor']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "/home/.conda/envs/spiritlm/lib/python3.10/site-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n", + " WeightNorm.apply(module, name, dim)\n" + ] + } + ], + "source": [ + "## Load the tokenizer\n", + "tokenizer_expressive = spiritlm_expressive()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1cacdbc6-8a08-470c-ae67-e7d772535f09", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SpiritLM-EXPRESSIVE: Encode audio into units (not deduplicated) \n", + " --------------------\n", + "{'audio': 'examples/audio/7143-88743-0029.flac', 'hubert': '99 49 38 149 149 71 423 427 492 288 315 153 153 389 497 412 247 354 7 96 452 452 176 266 266 77 248 336 336 211 166 65 94 224 224 148 492 191 440 440 41 41 457 79 382 451 332 216 114 340 478 74 79 370 272 370 370 53 477 65 171 60 258 111 111 111 111 338 338 23 23 338 23 338 338 338 7 338 338 149 406 7 361 361 361 99 99 99 99 99 99 99 209 209 209 209 209 479 50 50 7 149 149 35 35 130 130 169 169 72 434 119 272 4 249 245 245 433 159 294 139 359 343 269 302 226 370 216 459 424 424 226 382 7 58 138 428 397 350 350 
306 306 306 84 11 171 171 60 314 227 227 355 9 58 138 226 370 272 382 334 330 176 176 307 145 248 493 64 44 388 7 111 111 111 111 23 23 481 149 149 80 70 431 457 79 79 249 249 245 245 245 433 433 316 316 180 458 458 458 86 86 225 103 60 96 119 119 129 356 218 4 259 259 392 490 75 488 166 65 171 60 7 54 54 85 85 361 361', 'pitch': '39 39 39 48 56 40 42 39 51 40 43 54 3 35 39 25 58 26 44 40 13 20 46 41 26 40 26 56 41 46 46 41 41 40 40 40 39 39 57 59 59 59 59 59 59 59 59 20 20 20 35 35 13 3 9 6 0 20 57 56 56 56 56 59 44 57 41 59 42 51 59 57 59 59 39 39 46 56 58 41 41 40 39 39 39 59 59 59 15 27 13 55 13 27 35 36 3 53 3 26 43 53 54 39 25 14 41 46 46 46 46 41 41 41', 'style': '71 71 71 71 71 71 71 71 71 83'}\n", + "\n", + "SpiritLM-EXPRESSIVE: Encode audio into string (deduplicated and sorted units) \n", + " --------------------\n", + "[St71][Pi39][Hu99][Hu49][Hu38][Hu149][Hu71][Pi48][Hu423][Hu427][Pi56][Hu492][Hu288][Pi40][Hu315][Hu153][Pi42][Hu389][Pi39][Hu497][Hu412][Pi51][Hu247][Hu354][Pi40][Hu7][Hu96][Pi43][Hu452][Pi54][Hu176][Hu266][Pi3][St71][Hu77][Pi35][Hu248][Hu336][Pi39][Hu211][Pi25][Hu166][Hu65][Pi58][Hu94][Hu224][Pi26][Hu148][Pi44][Hu492][Hu191][Pi40][Hu440][Pi13][Hu41][Pi20][Hu457][Hu79][Pi46][Hu382][Hu451][Pi41][Hu332][Hu216][Pi26][Hu114][Hu340][St71][Pi40][Hu478][Hu74][Pi26][Hu79][Hu370][Pi56][Hu272][Hu370][Pi41][Hu53][Pi46][Hu477][Hu65][Hu171][Hu60][Pi41][Hu258][Hu111][Pi40][Hu338][Hu23][Hu338][Pi39][Hu23][Hu338][St71][Pi57][Hu7][Hu338][Pi59][Hu149][Hu406][Hu7][Hu361][Hu99][Hu209][Pi20][Hu479][Hu50][St71][Pi35][Hu7][Hu149][Hu35][Pi13][Hu130][Pi3][Hu169][Pi9][Hu72][Pi6][Hu434][Hu119][Pi0][Hu272][Hu4][Pi20][Hu249][Hu245][Pi57][Hu433][Pi56][Hu159][Hu294][Hu139][Hu359][Hu343][Hu269][Hu302][St71][Hu226][Pi59][Hu370][Hu216][Pi44][Hu459][Hu424][Pi57][Hu226][Pi41][Hu382][Hu7][Pi59][Hu58][Hu138][Pi42][Hu428][Hu397][Pi51][Hu350][Pi59][Hu306][Pi57][Hu84][Pi59][Hu11][Hu171][Hu60][Pi39][Hu314][Hu227][St71][Hu355][Pi46][Hu9][Hu58][Pi56][Hu138][Hu226][Pi58][Hu370][Hu272]
[Pi41][Hu382][Hu334][Hu330][Hu176][Pi40][Hu307][Pi39][Hu145][Hu248][Hu493][Hu64][Hu44][Hu388][Pi59][Hu7][Hu111][St71][Hu23][Pi15][Hu481][Pi27][Hu149][Pi13][Hu80][Hu70][Pi55][Hu431][Hu457][Pi13][Hu79][Pi27][Hu249][Pi35][Hu245][Pi36][Hu433][Pi3][Hu316][Pi53][Hu180][Pi3][Hu458][Pi26][Hu86][St71][Pi43][Hu225][Pi53][Hu103][Hu60][Pi54][Hu96][Hu119][Pi39][Hu129][Pi25][Hu356][Hu218][Pi14][Hu4][Hu259][Pi41][Hu392][Pi46][Hu490][Hu75][Hu488][Hu166][Hu65][Hu171][Hu60][Hu7][Pi41][Hu54][Hu85][St83][Hu361]\n", + "\n", + "SpiritLM-EXPRESSIVE: Decode back to audio from units (not deduplicated) \n", + " --------------------\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "SpiritLM-EXPRESSIVE: Decode back to audio from string (deduplicated and sorted units) \n", + " --------------------\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "## encode_units\n", + "print('SpiritLM-EXPRESSIVE: Encode audio into units (not deduplicated) \\n', '-'*20)\n", + "units = tokenizer_expressive.encode_units(audio)\n", + "print(units)\n", + "\n", + "## encode_string\n", + "print('\\nSpiritLM-EXPRESSIVE: Encode audio into string (deduplicated and sorted units) \\n', '-'*20)\n", + "string_tokens = tokenizer_expressive.encode_string(audio)\n", + "print(string_tokens)\n", + "\n", + "## decode from units\n", + "print('\\nSpiritLM-EXPRESSIVE: Decode back to audio from units (not deduplicated) \\n', '-'*20)\n", + "resyn_wav = tokenizer_expressive.decode(units, speaker_id=2)\n", + "ipd.display(ipd.Audio(resyn_wav, rate=16000))\n", + "\n", + "## decode from string\n", + "print('\\nSpiritLM-EXPRESSIVE: Decode back to audio from string (deduplicated and sorted units) \\n', '-'*20)\n", + "resyn_dedup_wav 
= tokenizer_expressive.decode(string_tokens, speaker_id=2)\n", + "ipd.display(ipd.Audio(resyn_dedup_wav, rate=16000))" + ] + }, + { + "cell_type": "markdown", + "id": "e46f7f7c-d26b-4ead-8d1b-fb013c1dd9d1", + "metadata": {}, + "source": [ + "## Load and test each component" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d48c0595-0d25-4eab-8958-d2f30f940d88", + "metadata": {}, + "outputs": [], + "source": [ + "from spiritlm.speech_tokenizer.hubert import spiritlm_hubert\n", + "from spiritlm.speech_tokenizer.hifigan import spiritlm_base_hifigan, spiritlm_expressive_hifigan_w2v2\n", + "from spiritlm.speech_tokenizer.f0 import spiritlm_expressive_f0\n", + "from spiritlm.speech_tokenizer.style_encoder import spiritlm_expressive_style_encoder_w2v2" + ] + }, + { + "cell_type": "markdown", + "id": "a686cd17-c669-4860-8835-7e5406f0b0d9", + "metadata": {}, + "source": [ + "### HuBERT Tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f5a5ae21-028e-4c9f-a089-47f16b1ed1a3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hubert_tokenizer(audio):\n", + " tensor([ 99, 49, 38, 149, 149, 71, 423, 427, 492, 288, 315, 153, 153, 389,\n", + " 497, 412, 247, 354, 7, 96, 452, 452, 176, 266, 266, 77, 248, 336,\n", + " 336, 211, 166, 65, 94, 224, 224, 148, 492, 191, 440, 440, 41, 41,\n", + " 457, 79, 382, 451, 332, 216, 114, 340, 478, 74, 79, 370, 272, 370,\n", + " 370, 53, 477, 65, 171, 60, 258, 111, 111, 111, 111, 338, 338, 23,\n", + " 23, 338, 23, 338, 338, 338, 7, 338, 338, 149, 406, 7, 361, 361,\n", + " 361, 99, 99, 99, 99, 99, 99, 99, 209, 209, 209, 209, 209, 479,\n", + " 50, 50, 7, 149, 149, 35, 35, 130, 130, 169, 169, 72, 434, 119,\n", + " 272, 4, 249, 245, 245, 433, 159, 294, 139, 359, 343, 269, 302, 226,\n", + " 370, 216, 459, 424, 424, 226, 382, 7, 58, 138, 428, 397, 350, 350,\n", + " 306, 306, 306, 84, 11, 171, 171, 60, 314, 227, 227, 355, 9, 58,\n", + " 138, 226, 370, 272, 382, 
334, 330, 176, 176, 307, 145, 248, 493, 64,\n", + " 44, 388, 7, 111, 111, 111, 111, 23, 23, 481, 149, 149, 80, 70,\n", + " 431, 457, 79, 79, 249, 249, 245, 245, 245, 433, 433, 316, 316, 180,\n", + " 458, 458, 458, 86, 86, 225, 103, 60, 96, 119, 119, 129, 356, 218,\n", + " 4, 259, 259, 392, 490, 75, 488, 166, 65, 171, 60, 7, 54, 54,\n", + " 85, 85, 361, 361], device='cuda:0')\n" + ] + } + ], + "source": [ + "hubert_tokenizer = spiritlm_hubert()\n", + "print(\"hubert_tokenizer(audio):\\n\", hubert_tokenizer(audio))" + ] + }, + { + "cell_type": "markdown", + "id": "86a6d8f1-d6c8-43c9-9a1c-eb9bbcad3791", + "metadata": {}, + "source": [ + "### Pitch Tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "21b03535-37a4-4ce5-8045-b309c2ebe51a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " [INFO]: device is not None, use cuda:0\n", + " [INFO] > call by:torchfcpe.tools.spawn_infer_cf_naive_mel_pe_from_pt\n", + " [WARN] args.model.use_harmonic_emb is None; use default False\n", + " [WARN] > call by:torchfcpe.tools.spawn_cf_naive_mel_pe\n", + "f0_tokenizer(audio):\n", + " tensor([39, 39, 39, 48, 56, 40, 42, 39, 51, 40, 43, 54, 3, 35, 39, 25, 58, 26,\n", + " 44, 40, 13, 20, 46, 41, 26, 40, 26, 56, 41, 46, 46, 41, 41, 40, 40, 40,\n", + " 39, 39, 57, 59, 59, 59, 59, 59, 59, 59, 59, 20, 20, 20, 35, 35, 13, 3,\n", + " 9, 6, 0, 20, 57, 56, 56, 56, 56, 59, 44, 57, 41, 59, 42, 51, 59, 57,\n", + " 59, 59, 39, 39, 46, 56, 58, 41, 41, 40, 39, 39, 39, 59, 59, 59, 15, 27,\n", + " 13, 55, 13, 27, 35, 36, 3, 53, 3, 26, 43, 53, 54, 39, 25, 14, 41, 46,\n", + " 46, 46, 46, 41, 41, 41], device='cuda:0')\n" + ] + } + ], + "source": [ + "f0_tokenizer = spiritlm_expressive_f0()\n", + "print(\"f0_tokenizer(audio):\\n\", f0_tokenizer(audio))" + ] + }, + { + "cell_type": "markdown", + "id": "33e2ef11-82de-42cb-be3d-16d32b5b8510", + "metadata": {}, + "source": [ + "### Style Tokenizer" + ] + }, + { + "cell_type": "code", + 
"execution_count": 10, + "id": "0eb0fef0-5140-4829-84b3-624e7baeecdd", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of Wav2Vec2StyleEncoder were not initialized from the model checkpoint at checkpoints/speech_tokenizer/style_encoder_w2v2 and are newly initialized: ['_float_tensor']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "style_tokenizer(audio):\n", + " tensor([71, 71, 71, 71, 71, 71, 71, 71, 71, 83], device='cuda:0')\n" + ] + } + ], + "source": [ + "style_tokenizer = spiritlm_expressive_style_encoder_w2v2()\n", + "print(\"style_tokenizer(audio):\\n\", style_tokenizer(audio))" + ] + }, + { + "cell_type": "markdown", + "id": "8f00182f-3e7c-4ea1-b0db-3e1bf386dae5", + "metadata": {}, + "source": [ + "### HiFi-GAN Vocoders" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "422f5566-2a08-46b7-84ba-ae43c5d9cd0f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Base vocoder\n", + "base_vocoder = spiritlm_base_hifigan()\n", + "wav = base_vocoder(hubert_tokenizer(audio), speaker_id=1).cpu().numpy()\n", + "ipd.Audio(wav, rate=16000)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "f82246da-0396-45c0-ab44-f83f8231ca75", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Expressive vocoder\n", + "expressive_vocoder = spiritlm_expressive_hifigan_w2v2()\n", + "wav_ex = expressive_vocoder(\n", + " code=hubert_tokenizer(audio),\n", + " 
f0_code=f0_tokenizer(audio),\n", + " style_code=style_tokenizer(audio),\n", + " dur_pred=False,\n", + " speaker_id=1,\n", + " not_dedup_code=True,\n", + " ).cpu().numpy()\n", + "ipd.Audio(wav_ex, rate=16000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "979c6080-25e1-4516-9482-67b364687a88", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}