Spaces:

descript
/

vampnet

Runtime error

App Files Files Community

Hugo Flores committed on Mar 27, 2023

Commit

5a0a80a

1 Parent(s): 91f8638

beat tracker bugfixes

Browse files

Files changed (4) hide show

requirements.txt +2 -1
vampnet/beats.py +2 -5
vampnet/interface.py +41 -10
vampnet/modules/base.py +1 -2

requirements.txt CHANGED Viewed

@@ -2,7 +2,8 @@ argbind>=0.3.1
 pytorch-ignite
 rich
 audiotools @ git+https://github.com/descriptinc/lyrebird-audiotools.git@hf/backup-info
-lac @ git+https://github.com/descriptinc/lyrebird-audio-codec.git@main
 tqdm
 tensorboard
 google-cloud-logging==2.2.0

 pytorch-ignite
 rich
 audiotools @ git+https://github.com/descriptinc/lyrebird-audiotools.git@hf/backup-info
+lac @ git+https://github.com/descriptinc/lyrebird-audio-codec.git@hf/vampnet-temp
+wavebeat @ git+https://github.com/hugofloresgarcia/wavebeat.git
 tqdm
 tensorboard
 google-cloud-logging==2.2.0

vampnet/beats.py CHANGED Viewed

@@ -200,13 +200,10 @@ class BeatTracker:
 class WaveBeat(BeatTracker):
-    def __init__(self, ckpt_dir: str = "checkpoints/wavebeat", device: str = "cpu"):
         from wavebeat.dstcn import dsTCNModel
-        ckpts = list((ckpt_dir).glob("*.ckpt"))
-        assert len(ckpts) > 0, f"no checkpoints found for wavebeat in  {ckpt_dir}"
-        model = dsTCNModel.load_from_checkpoint(ckpts[-1])
         model.eval()
         self.device = device

 class WaveBeat(BeatTracker):
+    def __init__(self, ckpt_path: str = "checkpoints/wavebeat", device: str = "cpu"):
         from wavebeat.dstcn import dsTCNModel
+        model = dsTCNModel.load_from_checkpoint(ckpt_path)
         model.eval()
         self.device = device

vampnet/interface.py CHANGED Viewed

@@ -3,6 +3,7 @@ from pathlib import Path
 import math
 import torch
 from audiotools import AudioSignal
 import tqdm
@@ -50,7 +51,10 @@ class Interface:
     def s2t(self, seconds: float):
         """seconds to tokens"""
-        return math.ceil(seconds * self.codec.sample_rate / self.codec.hop_length)
     def s2t2s(self, seconds: float):
         """seconds to tokens to seconds"""
@@ -94,11 +98,12 @@ class Interface:
             signal: AudioSignal,
             before_beat_s: float = 0.1,
             after_beat_s: float = 0.1,
-            mask_downbeats: float = 0.1,
-            mask_upbeats: float = 0.1,
             downbeat_downsample_factor: int = None,
             beat_downsample_factor: int = None,
-            invert: bool = False,
     ):
         """make a beat synced mask. that is, make a mask that
         places 1s at and around the beat, and 0s everywhere else.
@@ -112,7 +117,9 @@ class Interface:
         beats_z, downbeats_z = self.s2t(beats), self.s2t(downbeats)
         # remove downbeats from beats
-        beats_z = beats_z[~torch.isin(beats_z, downbeats_z)]
         # make the mask
         seq_len = self.s2t(signal.duration)
@@ -138,16 +145,26 @@ class Interface:
         if mask_upbeats:
             for beat_idx in beats_z:
-                mask[beat_idx - mask_b4:beat_idx + mask_after] = 1
         if mask_downbeats:
             for downbeat_idx in downbeats_z:
-                mask[downbeat_idx - mask_b4:downbeat_idx + mask_after] = 1
         if invert:
             mask = 1 - mask
-        return mask
     def coarse_to_fine(
         self,
@@ -293,6 +310,7 @@ class Interface:
         debug=False,
         swap_prefix_suffix=False,
         ext_mask=None,
         **kwargs
     ):
         z = self.encode(signal)
@@ -319,7 +337,8 @@ class Interface:
         _cz = cz.clone()
         cz_mask = None
-        for _ in range(num_vamps):
             # add noise
             cz_masked, cz_mask = self.coarse.add_noise(
                 _cz, r=1.0-intensity,
@@ -428,8 +447,9 @@ class Interface:
     def variation(
         self,
         signal: AudioSignal,
-        overlap_hop_ratio: float = 1.0, # TODO: should this be fixed to 1.0?  or should we overlap and replace instead of overlap add
         verbose: bool = False,
         **kwargs
     ):
         signal = signal.clone()
@@ -442,6 +462,9 @@ class Interface:
             math.ceil(signal.duration / self.coarse.chunk_size_s)
             * self.coarse.chunk_size_s
         )
         hop_duration = self.coarse.chunk_size_s * overlap_hop_ratio
         original_length = signal.length
@@ -460,10 +483,18 @@ class Interface:
                 signal.samples[i,...], signal.sample_rate
             )
             sig.to(self.device)
             out_z = self.coarse_vamp_v2(
                 sig,
                 num_vamps=1,
                 swap_prefix_suffix=False,
                 **kwargs
             )
             if self.c2f is not None:

 import math
 import torch
+import numpy as np
 from audiotools import AudioSignal
 import tqdm
     def s2t(self, seconds: float):
         """seconds to tokens"""
+        if isinstance(seconds, np.ndarray):
+            return np.ceil(seconds * self.codec.sample_rate / self.codec.hop_length)
+        else:
+            return math.ceil(seconds * self.codec.sample_rate / self.codec.hop_length)
     def s2t2s(self, seconds: float):
         """seconds to tokens to seconds"""
             signal: AudioSignal,
             before_beat_s: float = 0.1,
             after_beat_s: float = 0.1,
+            mask_downbeats: bool = True,
+            mask_upbeats: bool = True,
             downbeat_downsample_factor: int = None,
             beat_downsample_factor: int = None,
+            dropout: float = 0.7,
+            invert: bool = True,
     ):
         """make a beat synced mask. that is, make a mask that
         places 1s at and around the beat, and 0s everywhere else.
         beats_z, downbeats_z = self.s2t(beats), self.s2t(downbeats)
         # remove downbeats from beats
+        beats_z = torch.tensor(beats_z)[~torch.isin(torch.tensor(beats_z), torch.tensor(downbeats_z))]
+        beats_z = beats_z.tolist()
+        downbeats_z = downbeats_z.tolist()
         # make the mask
         seq_len = self.s2t(signal.duration)
         if mask_upbeats:
             for beat_idx in beats_z:
+                _slice = int(beat_idx - mask_b4), int(beat_idx + mask_after)
+                num_steps = mask[_slice[0]:_slice[1]].shape[0]
+                _m = torch.ones(num_steps, device=self.device)
+                _m = torch.nn.functional.dropout(_m, p=dropout)
+                mask[_slice[0]:_slice[1]] = _m
         if mask_downbeats:
             for downbeat_idx in downbeats_z:
+                _slice = int(downbeat_idx - mask_b4), int(downbeat_idx + mask_after)
+                num_steps = mask[_slice[0]:_slice[1]].shape[0]
+                _m = torch.ones(num_steps, device=self.device)
+                _m = torch.nn.functional.dropout(_m, p=dropout)
+                mask[_slice[0]:_slice[1]] = _m
         if invert:
             mask = 1 - mask
+        return mask[None, None, :].bool().long()
     def coarse_to_fine(
         self,
         debug=False,
         swap_prefix_suffix=False,
         ext_mask=None,
+        verbose=False,
         **kwargs
     ):
         z = self.encode(signal)
         _cz = cz.clone()
         cz_mask = None
+        range_fn = tqdm.trange if verbose else range
+        for _ in range_fn(num_vamps):
             # add noise
             cz_masked, cz_mask = self.coarse.add_noise(
                 _cz, r=1.0-intensity,
     def variation(
         self,
         signal: AudioSignal,
         verbose: bool = False,
+        beat_mask: bool = False,
+        beat_mask_kwargs: dict = {},
         **kwargs
     ):
         signal = signal.clone()
             math.ceil(signal.duration / self.coarse.chunk_size_s)
             * self.coarse.chunk_size_s
         )
+        # eventually we DO want overlap, but we want overlap-replace not
+        # overlap-add
+        overlap_hop_ratio = 1.0
         hop_duration = self.coarse.chunk_size_s * overlap_hop_ratio
         original_length = signal.length
                 signal.samples[i,...], signal.sample_rate
             )
             sig.to(self.device)
+            if beat_mask:
+                ext_mask = self.make_beat_mask(sig, **beat_mask_kwargs)
+            else:
+                ext_mask = None
             out_z = self.coarse_vamp_v2(
                 sig,
                 num_vamps=1,
                 swap_prefix_suffix=False,
+                ext_mask=ext_mask,
+                verbose=verbose,
                 **kwargs
             )
             if self.c2f is not None:

vampnet/modules/base.py CHANGED Viewed

@@ -103,8 +103,7 @@ class VampBase(at.ml.BaseModel):
         # add the external mask if we were given one
         if ext_mask is not None:
             assert ext_mask.ndim == 3, "mask must be (batch, n_codebooks, seq)"
-            assert ext_mask.shape == x.shape, "mask must be same shape as x"
-            mask = (mask + ext_mask).bool().long()
         x = x * (1 - mask) + random_x * mask
         return x, mask

         # add the external mask if we were given one
         if ext_mask is not None:
             assert ext_mask.ndim == 3, "mask must be (batch, n_codebooks, seq)"
+            mask = (mask * ext_mask).bool().long()
         x = x * (1 - mask) + random_x * mask
         return x, mask