Commit d161181
Parent: 7efb86f

Rename Sliders and refactor text input

Files changed:
- Dataset/dataset.yaml +6 -6
- app.py +19 -9
- pvq_manipulation/models/vits.py +0 -5
Dataset/dataset.yaml
CHANGED
@@ -1,4 +1,10 @@
 dataset:
+  '1422_149735_000006_000000':
+    speaker_id: '1422'
+    example_id: '1422_149735_000006_000000'
+  '1034_121119_000028_000001':
+    speaker_id: '1034'
+    'example_id': '1034_121119_000028_000001'
   '7190_90542_000054_000000':
     speaker_id: '7190'
     example_id: '7190_90542_000054_000000'
@@ -29,9 +35,6 @@ dataset:
   '8758_296465_000020_000000':
     speaker_id: '8758'
     example_id: '8758_296465_000020_000000'
-  '1034_121119_000028_000001':
-    speaker_id: '1034'
-    'example_id': '1034_121119_000028_000001'
   '4957_30119_000070_000001':
     speaker_id: '4957'
     example_id: '4957_30119_000070_000001'
@@ -56,9 +59,6 @@ dataset:
   '5012_80192_000020_000003':
     speaker_id: '5012'
    example_id: '5012_80192_000020_000003'
-  '1422_149735_000006_000000':
-    speaker_id: '1422'
-    example_id: '1422_149735_000006_000000'
   '14_212_000019_000000':
     speaker_id: '14'
     example_id: '14_212_000019_000000'
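Note on how this file is consumed: app.py reads it into dataset_dict and builds the speaker Dropdown from the keys in file order, so moving '1422_149735_000006_000000' to the top makes it the first choice and matches the Dropdown's new default value. Below is a minimal sketch of that loading path; the open() call and file path are assumptions, since the diff only shows the lookup dataset_dict['dataset'][example_id]['speaker_id']. (The quoted 'example_id' key on the '1034' entry parses identically to the unquoted key in YAML.)

import yaml

# Assumed loading path; app.py shows only the lookups, not the open() call.
with open('./Dataset/dataset.yaml') as f:
    dataset_dict = yaml.safe_load(f)  # PyYAML keeps document order in the resulting dict

# Mirrors the Dropdown construction in app.py: keys in file order, '1422...' first.
choices = [(str(idx), example_id)
           for idx, example_id in enumerate(dataset_dict['dataset'].keys())]
speaker_id = dataset_dict['dataset']['1422_149735_000006_000000']['speaker_id']  # -> '1422'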
app.py
CHANGED
@@ -25,6 +25,7 @@ cached_loaded_example = None
 cached_labels = None
 cached_d_vector = None
 cached_unmanipulated = None
+cached_transcription = None
 
 # path to stats
 stats_path = Path('./Dataset/Embeddings/')
@@ -48,7 +49,7 @@ hubert_model = HubertExtractor(
     layer=SID_LARGE_LAYER,
     model_name="HUBERT_LARGE",
     backend="torchaudio",
-    device=device,
+    device=device,
     # storage_dir= # target storage dir hubert model
 )
 
@@ -166,8 +167,7 @@ def delete_cache():
 
 
 def update_manipulation(manipulation_idx, example_id, transcription, manipulation_fkt):
-    global cached_example_id, cached_loaded_example, cached_labels, cached_d_vector, example_database, cached_unmanipulated
-
+    global cached_example_id, cached_loaded_example, cached_labels, cached_d_vector, example_database, cached_unmanipulated, cached_transcription
     speaker_id = dataset_dict['dataset'][example_id]['speaker_id']
 
     example = {
@@ -189,6 +189,14 @@ def update_manipulation(manipulation_idx, example_id, transcription, manipulation_fkt):
                 'text': transcription,
                 'd_vector': cached_d_vector.detach().numpy(),
             })
+    cached_transcription = transcription
+    if cached_loaded_example != example or transcription != cached_transcription:
+        with torch.no_grad():
+            cached_unmanipulated = tts_model.synthesize_from_example({
+                'text': transcription,
+                'd_vector': cached_d_vector.detach().numpy(),
+            })
+        cached_transcription = transcription
 
     with torch.no_grad():
         wav_manipulated = get_manipulation(
@@ -214,18 +222,20 @@ demo = gr.Interface(
             value=2, type="value"
         ),
         gr.Dropdown(
-            …
-            …
+            label="Speaker",
+            choices=[(str(idx), example_id) for idx, example_id in enumerate(dataset_dict['dataset'].keys())],
+            value="1422_149735_000006_000000",
+            type="value"
         ),
         gr.Textbox(
-            …
+            label="Text Input",
+            value="Department of Communications Engineering Paderborn University.",
             placeholder='Type something'
         ),
-        gr.Slider(label="Manipulation …
+        gr.Slider(label="Manipulation Intensity", minimum=-1.0, maximum=2.0, value=1.0, step=0.1),
     ],
-    outputs=[gr.Audio(label="original utterance"), gr.Audio(label="manipulated utterance")],
+    outputs=[gr.Audio(label="original synthesized utterance"), gr.Audio(label="manipulated synthesized utterance")],
 )
-
 if __name__ == "__main__":
     demo.launch(share=True)
 
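Review note on the new cache check: cached_transcription = transcription is assigned on the line directly above the if, so the clause transcription != cached_transcription compares the value against itself and is always False; only cached_loaded_example != example can trigger re-synthesis. A sketch of the presumably intended ordering (compare first, update the cache only after re-synthesizing); the intent itself is an assumption, all names are taken from the diff:

# Sketch, assuming the goal is to re-synthesize when the example or the text changed.
if cached_loaded_example != example or transcription != cached_transcription:
    with torch.no_grad():
        cached_unmanipulated = tts_model.synthesize_from_example({
            'text': transcription,
            'd_vector': cached_d_vector.detach().numpy(),
        })
    cached_transcription = transcription  # update the cache only after comparing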
pvq_manipulation/models/vits.py
CHANGED
@@ -246,11 +246,8 @@ class Vits_NT(Vits):
             y_mask=y_mask
         )
 
-        import time
-        start = time.time()
         if not torch.cuda.is_available():
            num_chunks = min(os.cpu_count() or 2, z.shape[-1])
-            print(num_chunks, 'num chunks')
             chunk_size = z.shape[-1] // num_chunks
             z_chunks = torch.split(z, chunk_size, dim=-1)
 
@@ -271,8 +268,6 @@ class Vits_NT(Vits):
             (z * y_mask)[:, :, : self.max_inference_len],
             g=speaker_embedding_man[:, :, None] if self.config.gan_speaker_conditioning else None
         )
-
-        print(time.time() - start)
         return o
 
     def forward(self, x, x_lengths, y, y_lengths, aux_input, inference=False):
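The removed import time / start / print lines were ad-hoc timing around the CPU fallback, which splits the latent sequence z into roughly one chunk per CPU core before decoding. A self-contained sketch of that chunking follows; the decode callable is hypothetical, standing in for the waveform generator, since the per-chunk decode loop itself is not shown in the diff:

import os
import torch

def decode_in_chunks(z: torch.Tensor, decode) -> torch.Tensor:
    # Mirrors the committed chunking: at most one chunk per CPU core,
    # and never more chunks than time steps.
    num_chunks = min(os.cpu_count() or 2, z.shape[-1])
    chunk_size = z.shape[-1] // num_chunks
    # torch.split yields ceil(T / chunk_size) pieces, so a length that is
    # not divisible by chunk_size produces one shorter trailing chunk.
    z_chunks = torch.split(z, chunk_size, dim=-1)
    with torch.no_grad():
        return torch.cat([decode(chunk) for chunk in z_chunks], dim=-1)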