Hugo Flores Garcia committed · commit 3346920 · parent 3445a71

more sampling fixes
Browse files:
- sample.py +70 -0
- scripts/{utils/vamp_folder.py → exp/experiment.py} +6 -7
- scripts/utils/parallel-gpu.sh +0 -23
- vampnet/modules/transformer.py +23 -34
sample.py ADDED

@@ -0,0 +1,70 @@
+import yaml
+import argbind
+
+import audiotools as at
+
+from vampnet.interface import Interface
+import logging
+
+logger = logging.getLogger()
+logger.setLevel(logging.DEBUG)
+
+Interface = argbind.bind(Interface)
+
+with open("conf/interface/spotdl.yml") as f:
+    conf = yaml.safe_load(f)
+
+
+with argbind.scope(conf):
+    interface = Interface()
+    interface.to("cuda")
+
+loader = at.data.datasets.AudioLoader(sources=[
+    "input.wav",
+])
+
+dataset = at.data.datasets.AudioDataset(
+    loader,
+    sample_rate=interface.codec.sample_rate,
+    duration=interface.coarse.chunk_size_s,
+    n_examples=200,
+    without_replacement=True,
+)
+
+import numpy as np
+def load_random_audio():
+    index = np.random.randint(0, len(dataset))
+    sig = dataset[index]["signal"]
+    sig = interface.preprocess(sig)
+
+    return sig
+
+
+sig = load_random_audio()
+z = interface.encode(sig)
+
+sig.write('input.wav')
+
+from vampnet import mask as pmask
+
+# build the mask
+mask = pmask.linear_random(z, 1.0)
+
+print("coarse")
+zv, mask_z = interface.coarse_vamp(
+    z,
+    mask=mask,
+    sampling_steps=36,
+    temperature=8.0,
+    return_mask=True,
+    typical_filtering=False,
+    # typical_mass=data[typical_mass],
+    # typical_min_tokens=data[typical_min_tokens],
+    gen_fn=interface.coarse.generate,
+)
+
+print("coarse2fine")
+zv = interface.coarse_to_fine(zv, temperature=0.8)
+
+sig = interface.to_signal(zv).cpu()
+sig.write('output-t=8.wav')
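Note on the masking step: pmask.linear_random(z, 1.0) is what drives the vamp above. A minimal sketch of the semantics assumed here (per-position random masking at rate r; the real helper lives in vampnet.mask and may differ in detail):

    import torch

    def linear_random_sketch(z: torch.Tensor, r: float) -> torch.Tensor:
        # z: (batch, n_codebooks, seq_len) codec tokens
        # returns a mask of the same shape; 1 = regenerate, 0 = keep
        return (torch.rand(z.shape) < r).int()

    z = torch.randint(0, 1024, (1, 4, 100))
    mask = linear_random_sketch(z, 1.0)
    assert bool(mask.all())  # r=1.0 masks every token: a full resynthesis

With r=1.0 every coarse token is regenerated, so the script resynthesizes the whole clip; temperature=8.0 over 36 steps favors diverse samples in the coarse pass, and the lower-temperature (0.8) coarse-to-fine pass then fills in detail.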
scripts/{utils/vamp_folder.py → exp/experiment.py} RENAMED

@@ -119,13 +119,15 @@ def beat_mask(ctx_time):
     def wrapper(sig, interface):
         beat_mask = interface.make_beat_mask(
             sig,
-            before_beat_s=ctx_time,
-            after_beat_s=ctx_time,
+            before_beat_s=ctx_time/2,
+            after_beat_s=ctx_time/2,
             invert=True
         )
+
         z = interface.encode(sig)
+
         zv = interface.coarse_vamp(
-            z, beat_mask
+            z, beat_mask
         )

         zv = interface.coarse_to_fine(zv)

@@ -185,9 +187,6 @@ EXP_REGISTRY["sampling-steps"] = {


 EXP_REGISTRY["musical-sampling"] = {
-    "baseline": baseline,
-    "codec": reconstructed,
-    **{f"downsample_{x}x": CoarseCond(4, downsample_factor=x) for x in [16, 32]},
     **{f"beat_mask_{t}": beat_mask(t) for t in [0.075]},
     **{f"inpaint_{t}": inpaint(t) for t in [0.5, 1.0,]},  # multiply these by 2 (they go left and right)
 }

@@ -195,7 +194,7 @@ EXP_REGISTRY["musical-sampling"] = {
 @argbind.bind(without_prefix=True)
 def main(
     sources=[
-        "/media/CHONK/hugo/spotdl/
+        "/media/CHONK/hugo/spotdl/val",
     ],
     output_dir: str = "./samples",
     max_excerpts: int = 2000,
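Two things change under the rename. The musical-sampling registry drops the baseline, codec, and downsample conditions, leaving only the beat-mask and inpaint experiments; and the beat mask now splits ctx_time evenly across the two sides of each beat, so ctx_time budgets the total preserved window per beat rather than the width on each side. A quick arithmetic check under that reading, with the registry's ctx_time of 0.075:

    ctx_time = 0.075
    before_s = after_s = ctx_time / 2
    print(f"preserved per beat: {before_s + after_s:.3f}s")  # 0.075s total per beat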
scripts/utils/parallel-gpu.sh DELETED

@@ -1,23 +0,0 @@
-#!/bin/bash
-
-# Get the command to execute from the user
-command_to_execute="$1"
-
-# Get the maximum number of GPUs to use from the user
-max_gpus="$2"
-
-# Get the number of instances to start per GPU from the user
-instances_per_gpu="$3"
-
-# Set the CUDA_VISIBLE_DEVICES flag for each GPU
-for gpu_id in $(seq 0 $(($max_gpus - 1))); do
-    export CUDA_VISIBLE_DEVICES="$gpu_id"
-    # Start the specified number of instances for this GPU
-    for i in $(seq 1 "$instances_per_gpu"); do
-        # Run the command in the background
-        $command_to_execute &
-    done
-done
-
-# Wait for all instances to finish
-wait
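The deleted launcher fanned one command out across GPUs by exporting CUDA_VISIBLE_DEVICES per worker and backgrounding N instances per device. If the pattern is still needed, a hypothetical Python equivalent (not part of this commit) would be:

    import os
    import shlex
    import subprocess
    import sys

    def launch(command: str, max_gpus: int, instances_per_gpu: int) -> None:
        # pin each worker to one GPU; run several instances per GPU in parallel
        procs = []
        for gpu_id in range(max_gpus):
            env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(gpu_id))
            for _ in range(instances_per_gpu):
                procs.append(subprocess.Popen(shlex.split(command), env=env))
        for p in procs:
            p.wait()  # wait for all instances to finish

    if __name__ == "__main__":
        launch(sys.argv[1], int(sys.argv[2]), int(sys.argv[3]))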
vampnet/modules/transformer.py CHANGED
@@ -581,7 +581,7 @@ class VampNet(at.ml.BaseModel):
         sampling_steps: int = 24,
         start_tokens: Optional[torch.Tensor] = None,
         mask: Optional[torch.Tensor] = None,
-        temperature: Union[float, Tuple[float, float]] =
+        temperature: Union[float, Tuple[float, float]] = 2.5,
         typical_filtering=False,
         typical_mass=0.2,
         typical_min_tokens=1,
@@ -592,15 +592,7 @@ class VampNet(at.ml.BaseModel):
         #####################
         # resolve temperature #
         #####################
-        if isinstance(temperature, float):
-            temperature = torch.tensor(temperature).repeat(sampling_steps)
-        elif isinstance(temperature, tuple):
-            assert len(temperature) == 2
-            l, h = temperature
-            temperature = torch.linspace(l, h, sampling_steps)
-        else:
-            raise TypeError(f"invalid type for temperature")
-
+        assert isinstance(temperature, float)
         logging.debug(f"temperature: {temperature}")
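The per-step temperature schedule is gone: a (low, high) tuple used to be expanded with torch.linspace across the sampling steps, and now only a plain float survives the assert. A small demonstration of the narrowed contract:

    for temperature in (8.0, (0.8, 2.5)):
        try:
            assert isinstance(temperature, float)
            print(f"{temperature}: accepted")
        except AssertionError:
            print(f"{temperature}: rejected; tuple schedules are no longer resolved")

The annealing a schedule used to provide moves into the mask update further down, where the confidence noise is scaled by temperature * (1 - r).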
@@ -642,10 +634,6 @@ class VampNet(at.ml.BaseModel):
         num_mask_tokens_at_start = (z_masked == self.mask_token).sum()
         logging.debug(f"num mask tokens at start: {num_mask_tokens_at_start}")

-        # our r steps
-        r_steps = torch.linspace(1e-10, 1, sampling_steps+1)[1:].to(self.device)
-        logging.debug(f"r steps: {r_steps}")
-
         # how many codebooks are we inferring vs conditioning on?
         n_infer_codebooks = self.n_codebooks - self.n_conditioning_codebooks
         logging.debug(f"n infer codebooks: {n_infer_codebooks}")
@@ -658,11 +646,13 @@ class VampNet(at.ml.BaseModel):
             logging.debug(f"step {i} of {sampling_steps}")

             # our current temperature
-            tmpt = temperature[i]
-            logging.debug(f"temperature: {tmpt}")
+            logging.debug(f"temperature: {temperature}")

             # our current schedule step
-            r = r_steps[i]
+            r = scalar_to_batch_tensor(
+                (i + 1) / sampling_steps,
+                z.shape[0]
+            ).to(z.device)
             logging.debug(f"r: {r}")

             # get latents
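The precomputed r_steps buffer is replaced by an inline computation; the values are identical, but r is now a per-batch tensor. scalar_to_batch_tensor (a vampnet utility; signature assumed from the call site) just repeats one scalar across the batch. A self-contained check of the schedule equivalence, with a stand-in for the helper:

    import torch

    sampling_steps = 4
    old = torch.linspace(1e-10, 1, sampling_steps + 1)[1:]  # the removed buffer
    new = torch.tensor([(i + 1) / sampling_steps for i in range(sampling_steps)])
    print(torch.allclose(old, new))  # True: both sweep r over (0, 1] evenly

    def scalar_to_batch_tensor_sketch(x, batch_size):
        # stand-in: broadcast one scalar to a (batch_size,) tensor
        return torch.full((batch_size,), x)

    print(scalar_to_batch_tensor_sketch(new[0].item(), 3))  # tensor([0.2500, 0.2500, 0.2500])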
@@ -699,11 +689,18 @@ class VampNet(at.ml.BaseModel):
             probs = rearrange(probs, "(b seq) prob -> b seq prob", b=b)
             logging.debug(f"sampled z with shape: {sampled_z.shape}")

+            # get the confidences: which tokens did we sample?
+            selected_probs = (
+                torch.take_along_dim(
+                    probs, sampled_z.long().unsqueeze(-1),
+                    dim=-1
+                ).squeeze(-1)
+            )

             # flatten z_masked and mask, so we can deal with the sampling logic
             # we'll unflatten them at the end of the loop for the next forward pass
             # remove conditioning codebooks, we'll add them back at the end
-            z_masked = codebook_flatten(z_masked[:, self.n_conditioning_codebooks:, :])
+            z_masked = codebook_flatten(z_masked[:, self.n_conditioning_codebooks:, :])

             mask = (z_masked == self.mask_token).int()
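The confidence gather moves up to this point, plausibly so that it indexes probs with the sampled_z that is still aligned with it; in the old location (removed below) it ran after sampled_z had the conditioning codebooks stitched back in. What the gather computes, on a toy tensor:

    import torch

    # batch=1, seq=2, vocab=3
    probs = torch.tensor([[[0.7, 0.2, 0.1],
                           [0.1, 0.8, 0.1]]])
    sampled_z = torch.tensor([[0, 1]])  # the token ids that were sampled
    selected_probs = torch.take_along_dim(
        probs, sampled_z.long().unsqueeze(-1), dim=-1
    ).squeeze(-1)
    print(selected_probs)  # tensor([[0.7000, 0.8000]]): model confidence per position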
@@ -715,15 +712,6 @@ class VampNet(at.ml.BaseModel):
             )
             logging.debug(f"added z back into sampled z with shape: {sampled_z.shape}")

-
-            # get the confidences: which tokens did we sample?
-            selected_probs = (
-                torch.take_along_dim(
-                    probs, sampled_z.long().unsqueeze(-1),
-                    dim=-1
-                ).squeeze(-1)
-            )
-
             # ignore any tokens that weren't masked
             selected_probs = torch.where(
                 mask.bool(), selected_probs, torch.inf
@@ -733,18 +721,19 @@ class VampNet(at.ml.BaseModel):
             num_to_mask = torch.floor(_gamma(r) * num_mask_tokens_at_start).unsqueeze(1).long()
             logging.debug(f"num to mask: {num_to_mask}")

-            num_to_mask = torch.maximum(
-                torch.tensor(1),
-                torch.minimum(
-                    mask.sum(dim=-1, keepdim=True) - 1,
-                    num_to_mask
-                )
-            )
+            if i != (sampling_steps - 1):
+                num_to_mask = torch.maximum(
+                    torch.tensor(1),
+                    torch.minimum(
+                        mask.sum(dim=-1, keepdim=True) - 1,
+                        num_to_mask
+                    )
+                )


             # get our new mask
             mask = mask_by_random_topk(
-                num_to_mask, selected_probs,
+                num_to_mask, selected_probs, temperature * (1-r)
             )

             # update the mask
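Two behaviors land in this hunk. The clamp on num_to_mask now skips the final step: on every other step it is pinned to [1, currently-masked - 1], so each iteration commits at least one token and leaves at least one for later, while the last step is unclamped so the mask can empty out. And the third argument to mask_by_random_topk anneals the confidence noise as temperature * (1 - r), so early steps explore and late steps trust the model's confidences (assuming the MaskGIT-style noisy top-k that the function name suggests). Tracing the noise scale with the values sample.py passes:

    sampling_steps, temperature = 36, 8.0
    for i in range(sampling_steps):
        r = (i + 1) / sampling_steps
        if i in (0, 17, sampling_steps - 1):
            print(f"step {i:2d}: noise scale = {temperature * (1 - r):.2f}")
    # step  0: noise scale = 7.78
    # step 17: noise scale = 4.00
    # step 35: noise scale = 0.00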