Spaces:
Runtime error
Runtime error
Hugo Flores Garcia
committed on
Commit
·
5a343f4
1
Parent(s):
f4c9665
the refactor begins
Browse files- Dockerfile +0 -39
- README.md +0 -21
- conf/{vampnet-c2f.yml → c2f.yml} +0 -0
- conf/interface/interface-c2f-exp.yml +0 -5
- conf/interface/{interface-jazzpop.yml → jazzpop.yml} +0 -0
- conf/interface/{interface-maestro.yml → maestro.yml} +0 -0
- conf/interface/{interface-spotdl.yml → spotdl.yml} +0 -0
- conf/lora/birds.yml +10 -0
- conf/lora/birdss.yml +12 -0
- conf/lora/constructions.yml +2 -2
- conf/lora/lora-is-this-charlie-parker.yml +2 -2
- conf/lora/lora.yml +1 -1
- conf/lora/underworld.yml +10 -0
- conf/vampnet-groovemidi.yml +0 -54
- conf/vampnet-maestro.yml +0 -21
- demo.py +22 -10
- docker-compose.yml +0 -92
- requirements.txt +0 -31
- setup.py +3 -2
- vampnet/interface.py +25 -2
- vampnet/modules/base.py +1 -2
Dockerfile
DELETED
|
@@ -1,39 +0,0 @@
|
|
| 1 |
-
FROM us.gcr.io/lyrebird-research/research-image/audio
|
| 2 |
-
|
| 3 |
-
COPY requirements.txt requirements.txt
|
| 4 |
-
ARG GITHUB_TOKEN
|
| 5 |
-
RUN echo machine github.com login ${GITHUB_TOKEN} > ~/.netrc
|
| 6 |
-
|
| 7 |
-
COPY env/alias.sh /alias.sh
|
| 8 |
-
COPY env/entry_script.sh /entry_script.sh
|
| 9 |
-
RUN cat /alias.sh >> ~/.zshrc
|
| 10 |
-
|
| 11 |
-
# USER researcher
|
| 12 |
-
RUN pip install Cython
|
| 13 |
-
RUN pip install madmom
|
| 14 |
-
RUN pip install --upgrade -r requirements.txt
|
| 15 |
-
RUN pip install --upgrade tensorflow
|
| 16 |
-
RUN pip install --upgrade librosa
|
| 17 |
-
RUN pip install --upgrade numba
|
| 18 |
-
RUN pip install protobuf==3.20
|
| 19 |
-
ENV PYTHONPATH "$PYTHONPATH:/u/home/src"
|
| 20 |
-
ENV NUMBA_CACHE_DIR=/tmp/
|
| 21 |
-
|
| 22 |
-
USER root
|
| 23 |
-
RUN wget https://github.com/jgm/pandoc/releases/download/2.18/pandoc-2.18-1-amd64.deb
|
| 24 |
-
RUN dpkg -i pandoc-2.18-1-amd64.deb
|
| 25 |
-
RUN apt-get update && apt-get install task-spooler
|
| 26 |
-
|
| 27 |
-
RUN head -n -1 /entry_script.sh > /entry_script_jupyter.sh
|
| 28 |
-
RUN head -n -1 /entry_script.sh > /entry_script_tensorboard.sh
|
| 29 |
-
RUN head -n -1 /entry_script.sh > /entry_script_gradio.sh
|
| 30 |
-
|
| 31 |
-
RUN echo \
|
| 32 |
-
'su -p ${USER} -c "source ~/.zshrc && jupyter lab --ip=0.0.0.0"' >> \
|
| 33 |
-
/entry_script_jupyter.sh
|
| 34 |
-
RUN echo \
|
| 35 |
-
'su -p ${USER} -c "source ~/.zshrc && tensorboard --logdir=$TENSORBOARD_PATH --samples_per_plugin audio=500 --bind_all"' >> \
|
| 36 |
-
/entry_script_tensorboard.sh
|
| 37 |
-
RUN echo \
|
| 38 |
-
'su -p ${USER} -c "source ~/.zshrc && python app.py --args.load=conf/app.yml"' >> \
|
| 39 |
-
/entry_script_gradio.sh
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -2,27 +2,6 @@
|
|
| 2 |
|
| 3 |
This repository contains recipes for training generative music models on top of the Lyrebird Audio Codec.
|
| 4 |
|
| 5 |
-
## Install hooks
|
| 6 |
-
|
| 7 |
-
First install the pre-commit util:
|
| 8 |
-
|
| 9 |
-
https://pre-commit.com/#install
|
| 10 |
-
|
| 11 |
-
pip install pre-commit # with pip
|
| 12 |
-
brew install pre-commit # on Mac
|
| 13 |
-
|
| 14 |
-
Then install the git hooks
|
| 15 |
-
|
| 16 |
-
pre-commit install
|
| 17 |
-
# check .pre-commit-config.yaml for details of hooks
|
| 18 |
-
|
| 19 |
-
Upon `git commit`, the pre-commit hooks will run automatically on the staged files (i.e. those added by `git add`)
|
| 20 |
-
|
| 21 |
-
**N.B. By default, pre-commit checks only run on staged files**
|
| 22 |
-
|
| 23 |
-
If you need to run it on all files:
|
| 24 |
-
|
| 25 |
-
pre-commit run --all-files
|
| 26 |
|
| 27 |
## Development
|
| 28 |
### Setting everything up
|
|
|
|
| 2 |
|
| 3 |
This repository contains recipes for training generative music models on top of the Lyrebird Audio Codec.
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
## Development
|
| 7 |
### Setting everything up
|
conf/{vampnet-c2f.yml → c2f.yml}
RENAMED
|
File without changes
|
conf/interface/interface-c2f-exp.yml
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
Interface.coarse_ckpt: /runs/c2f-exp-03.22.23/ckpt/random/epoch=400/vampnet/weights.pth
|
| 2 |
-
Interface.coarse2fine_ckpt: runs/c2f-exp-03.22.23/ckpt/random/epoch=400/vampnet/weights.pth
|
| 3 |
-
Interface.codec_ckpt: /runs/codec-ckpt/codec.pth
|
| 4 |
-
Interface.coarse_chunk_size_s: 5
|
| 5 |
-
Interface.coarse2fine_chunk_size_s: 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
conf/interface/{interface-jazzpop.yml → jazzpop.yml}
RENAMED
|
File without changes
|
conf/interface/{interface-maestro.yml → maestro.yml}
RENAMED
|
File without changes
|
conf/interface/{interface-spotdl.yml → spotdl.yml}
RENAMED
|
File without changes
|
conf/lora/birds.yml
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
$include:
|
| 2 |
+
- conf/lora/lora.yml
|
| 3 |
+
|
| 4 |
+
fine_tune: True
|
| 5 |
+
|
| 6 |
+
train/AudioLoader.sources:
|
| 7 |
+
- /media/CHONK/hugo/spotdl/subsets/birds
|
| 8 |
+
|
| 9 |
+
val/AudioLoader.sources:
|
| 10 |
+
- /media/CHONK/hugo/spotdl/subsets/birds
|
conf/lora/birdss.yml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
$include:
|
| 2 |
+
- conf/lora/lora.yml
|
| 3 |
+
|
| 4 |
+
fine_tune: True
|
| 5 |
+
|
| 6 |
+
train/AudioLoader.sources:
|
| 7 |
+
- /media/CHONK/hugo/spotdl/subsets/birds
|
| 8 |
+
- /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/
|
| 9 |
+
|
| 10 |
+
val/AudioLoader.sources:
|
| 11 |
+
- /media/CHONK/hugo/spotdl/subsets/birds
|
| 12 |
+
- /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/
|
conf/lora/constructions.yml
CHANGED
|
@@ -4,7 +4,7 @@ $include:
|
|
| 4 |
fine_tune: True
|
| 5 |
|
| 6 |
train/AudioLoader.sources:
|
| 7 |
-
- /media/CHONK/hugo/spotdl/subsets/constructions
|
| 8 |
|
| 9 |
val/AudioLoader.sources:
|
| 10 |
-
- /media/CHONK/hugo/spotdl/subsets/constructions
|
|
|
|
| 4 |
fine_tune: True
|
| 5 |
|
| 6 |
train/AudioLoader.sources:
|
| 7 |
+
- /media/CHONK/hugo/spotdl/subsets/constructions/third.mp3
|
| 8 |
|
| 9 |
val/AudioLoader.sources:
|
| 10 |
+
- /media/CHONK/hugo/spotdl/subsets/constructions/third.mp3
|
conf/lora/lora-is-this-charlie-parker.yml
CHANGED
|
@@ -4,7 +4,7 @@ $include:
|
|
| 4 |
fine_tune: True
|
| 5 |
|
| 6 |
train/AudioLoader.sources:
|
| 7 |
-
- /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/
|
| 8 |
|
| 9 |
val/AudioLoader.sources:
|
| 10 |
-
- /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/
|
|
|
|
| 4 |
fine_tune: True
|
| 5 |
|
| 6 |
train/AudioLoader.sources:
|
| 7 |
+
- /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/Charlie Parker - Donna Lee.mp3
|
| 8 |
|
| 9 |
val/AudioLoader.sources:
|
| 10 |
+
- /media/CHONK/hugo/spotdl/subsets/this-is-charlie-parker/Charlie Parker - Donna Lee.mp3
|
conf/lora/lora.yml
CHANGED
|
@@ -8,7 +8,7 @@ train/AudioDataset.n_examples: 10000000
|
|
| 8 |
val/AudioDataset.n_examples: 10
|
| 9 |
|
| 10 |
|
| 11 |
-
NoamScheduler.warmup:
|
| 12 |
|
| 13 |
epoch_length: 100
|
| 14 |
save_audio_epochs: 2
|
|
|
|
| 8 |
val/AudioDataset.n_examples: 10
|
| 9 |
|
| 10 |
|
| 11 |
+
NoamScheduler.warmup: 400
|
| 12 |
|
| 13 |
epoch_length: 100
|
| 14 |
save_audio_epochs: 2
|
conf/lora/underworld.yml
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
$include:
|
| 2 |
+
- conf/lora/lora.yml
|
| 3 |
+
|
| 4 |
+
fine_tune: True
|
| 5 |
+
|
| 6 |
+
train/AudioLoader.sources:
|
| 7 |
+
- /media/CHONK/hugo/spotdl/subsets/underworld.mp3
|
| 8 |
+
|
| 9 |
+
val/AudioLoader.sources:
|
| 10 |
+
- /media/CHONK/hugo/spotdl/subsets/underworld.mp3
|
conf/vampnet-groovemidi.yml
DELETED
|
@@ -1,54 +0,0 @@
|
|
| 1 |
-
$include:
|
| 2 |
-
- conf/vampnet.yml
|
| 3 |
-
|
| 4 |
-
VampNet.embedding_dim: 512
|
| 5 |
-
VampNet.n_layers: 12
|
| 6 |
-
VampNet.n_heads: 8
|
| 7 |
-
|
| 8 |
-
AudioDataset.duration: 12.0
|
| 9 |
-
|
| 10 |
-
train/AudioDataset.n_examples: 10000000
|
| 11 |
-
train/AudioLoader.sources:
|
| 12 |
-
# drummer 1 sessions 1, 2, and 3
|
| 13 |
-
- /data/e-gmd-v1.0.0/drummer1/session1
|
| 14 |
-
- /data/e-gmd-v1.0.0/drummer1/session2
|
| 15 |
-
- /data/e-gmd-v1.0.0/drummer1/session3
|
| 16 |
-
# drummer 3 sessions 1 and 2
|
| 17 |
-
- /data/e-gmd-v1.0.0/drummer3/session1
|
| 18 |
-
- /data/e-gmd-v1.0.0/drummer3/session2
|
| 19 |
-
# drummer 4 session 1
|
| 20 |
-
- /data/e-gmd-v1.0.0/drummer4/session1
|
| 21 |
-
# drummer 5 sessions 1 and 2
|
| 22 |
-
- /data/e-gmd-v1.0.0/drummer5/session1
|
| 23 |
-
- /data/e-gmd-v1.0.0/drummer5/session2
|
| 24 |
-
# drummer 6 session 1, 2, and 3
|
| 25 |
-
- /data/e-gmd-v1.0.0/drummer6/session1
|
| 26 |
-
- /data/e-gmd-v1.0.0/drummer6/session2
|
| 27 |
-
- /data/e-gmd-v1.0.0/drummer6/session3
|
| 28 |
-
# drummer 7 session 1, 2 and 3
|
| 29 |
-
- /data/e-gmd-v1.0.0/drummer7/session1
|
| 30 |
-
- /data/e-gmd-v1.0.0/drummer7/session2
|
| 31 |
-
- /data/e-gmd-v1.0.0/drummer7/session3
|
| 32 |
-
# drummer 8 session 1
|
| 33 |
-
- /data/e-gmd-v1.0.0/drummer8/session1
|
| 34 |
-
# drummer 9 session 1
|
| 35 |
-
- /data/e-gmd-v1.0.0/drummer9/session1
|
| 36 |
-
# drummer 10 session 1
|
| 37 |
-
- /data/e-gmd-v1.0.0/drummer10/session1
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
val/AudioDataset.n_examples: 500
|
| 41 |
-
val/AudioLoader.sources:
|
| 42 |
-
# drummer 1 eval session
|
| 43 |
-
- /data/e-gmd-v1.0.0/drummer1/eval_session
|
| 44 |
-
# drummer 5 eval session
|
| 45 |
-
- /data/e-gmd-v1.0.0/drummer5/eval_session
|
| 46 |
-
# drummer 7 eval session
|
| 47 |
-
- /data/e-gmd-v1.0.0/drummer7/eval_session
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
test/AudioDataset.n_examples: 1000
|
| 52 |
-
test/AudioLoader.sources:
|
| 53 |
-
# drummer 8 eval session
|
| 54 |
-
- /data/e-gmd-v1.0.0/drummer8/eval_session
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
conf/vampnet-maestro.yml
DELETED
|
@@ -1,21 +0,0 @@
|
|
| 1 |
-
$include:
|
| 2 |
-
- conf/vampnet.yml
|
| 3 |
-
|
| 4 |
-
VampNet.embedding_dim: 512
|
| 5 |
-
VampNet.n_layers: 12
|
| 6 |
-
VampNet.n_heads: 8
|
| 7 |
-
|
| 8 |
-
AudioDataset.duration: 12.0
|
| 9 |
-
|
| 10 |
-
train/AudioDataset.n_examples: 10000000
|
| 11 |
-
train/AudioLoader.sources:
|
| 12 |
-
- /data/maestro-reorg/train
|
| 13 |
-
|
| 14 |
-
val/AudioDataset.n_examples: 500
|
| 15 |
-
val/AudioLoader.sources:
|
| 16 |
-
- /data/maestro-reorg/val
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
test/AudioDataset.n_examples: 1000
|
| 20 |
-
test/AudioLoader.sources:
|
| 21 |
-
- /data/maestro-reorg/test
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
demo.py
CHANGED
|
@@ -62,6 +62,7 @@ def load_random_audio():
|
|
| 62 |
def ez_vamp(
|
| 63 |
input_audio, init_temp, final_temp,
|
| 64 |
mask_periodic_amt, mask_periodic_width, num_steps,
|
|
|
|
| 65 |
):
|
| 66 |
print(input_audio)
|
| 67 |
sig = at.AudioSignal(input_audio)
|
|
@@ -74,7 +75,8 @@ def ez_vamp(
|
|
| 74 |
prefix_dur_s=0.0,
|
| 75 |
suffix_dur_s=0.0,
|
| 76 |
num_vamps=1,
|
| 77 |
-
downsample_factor=mask_periodic_amt,
|
|
|
|
| 78 |
periodic_width=mask_periodic_width,
|
| 79 |
periodic_dropout=0.0,
|
| 80 |
periodic_width_dropout=0.0,
|
|
@@ -105,7 +107,7 @@ def vamp(
|
|
| 105 |
num_vamps, mode, use_beats, num_steps, snap_to_beats,
|
| 106 |
beat_unmask_drop, mask_periodic_width,
|
| 107 |
mask_periodic_dropout, mask_periodic_width_dropout,
|
| 108 |
-
n_conditioning_codebooks, use_coarse2fine
|
| 109 |
):
|
| 110 |
# try:
|
| 111 |
print(input_audio)
|
|
@@ -146,6 +148,7 @@ def vamp(
|
|
| 146 |
suffix_dur_s=suffix_s,
|
| 147 |
num_vamps=num_vamps,
|
| 148 |
downsample_factor=mask_periodic_amt,
|
|
|
|
| 149 |
periodic_width=mask_periodic_width,
|
| 150 |
periodic_dropout=mask_periodic_dropout,
|
| 151 |
periodic_width_dropout=mask_periodic_width_dropout,
|
|
@@ -158,7 +161,7 @@ def vamp(
|
|
| 158 |
|
| 159 |
if use_coarse2fine:
|
| 160 |
zv = interface.coarse_to_fine(zv)
|
| 161 |
-
|
| 162 |
|
| 163 |
sig = interface.to_signal(zv).cpu()
|
| 164 |
print("done")
|
|
@@ -166,9 +169,9 @@ def vamp(
|
|
| 166 |
out_dir = OUT_DIR / str(uuid.uuid4())
|
| 167 |
out_dir.mkdir()
|
| 168 |
sig.write(out_dir / "output.wav")
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
return sig.path_to_file,
|
| 172 |
# except Exception as e:
|
| 173 |
# raise gr.Error(f"failed with error: {e}")
|
| 174 |
|
|
@@ -180,7 +183,7 @@ def save_vamp(
|
|
| 180 |
mask_up_chk, up_factor,
|
| 181 |
num_vamps, mode, output_audio, notes, use_beats, num_steps, snap_to_beats,
|
| 182 |
beat_unmask_drop, mask_periodic_width, mask_periodic_dropout, mask_periodic_width_dropout,
|
| 183 |
-
n_conditioning_codebooks, use_coarse2fine
|
| 184 |
):
|
| 185 |
out_dir = OUT_DIR / "saved" / str(uuid.uuid4())
|
| 186 |
out_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -215,6 +218,7 @@ def save_vamp(
|
|
| 215 |
"mask_periodic_width_dropout": mask_periodic_width_dropout,
|
| 216 |
"n_conditioning_codebooks": n_conditioning_codebooks,
|
| 217 |
"use_coarse2fine": use_coarse2fine,
|
|
|
|
| 218 |
}
|
| 219 |
|
| 220 |
# save with yaml
|
|
@@ -333,6 +337,14 @@ with gr.Blocks() as demo:
|
|
| 333 |
precision=0,
|
| 334 |
)
|
| 335 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
mask_periodic_amt = gr.Slider(
|
| 337 |
label="periodic hint (0.0 means no hint, 2 - lots of hints, 8 - a couple of hints, 16 - occasional hint, 32 - very occasional hint, etc)",
|
| 338 |
minimum=0,
|
|
@@ -501,7 +513,7 @@ with gr.Blocks() as demo:
|
|
| 501 |
num_vamps, mode, use_beats, num_steps, snap_to_beats,
|
| 502 |
beat_unmask_drop, mask_periodic_width,
|
| 503 |
mask_periodic_dropout, mask_periodic_width_dropout,
|
| 504 |
-
n_conditioning_codebooks, use_coarse2fine
|
| 505 |
],
|
| 506 |
outputs=[output_audio, audio_mask],
|
| 507 |
api_name="vamp"
|
|
@@ -520,7 +532,7 @@ with gr.Blocks() as demo:
|
|
| 520 |
notes_text, use_beats, num_steps, snap_to_beats,
|
| 521 |
beat_unmask_drop, mask_periodic_width,
|
| 522 |
mask_periodic_dropout, mask_periodic_width_dropout,
|
| 523 |
-
n_conditioning_codebooks, use_coarse2fine
|
| 524 |
],
|
| 525 |
outputs=[thank_you, download_file]
|
| 526 |
)
|
|
@@ -529,7 +541,7 @@ with gr.Blocks() as demo:
|
|
| 529 |
ez_vamp_button.click(
|
| 530 |
fn=ez_vamp,
|
| 531 |
inputs=[input_audio, init_temp, final_temp, mask_periodic_amt,
|
| 532 |
-
mask_periodic_width, num_steps ],
|
| 533 |
outputs=[output_audio],
|
| 534 |
api_name="ez_vamp"
|
| 535 |
)
|
|
|
|
| 62 |
def ez_vamp(
|
| 63 |
input_audio, init_temp, final_temp,
|
| 64 |
mask_periodic_amt, mask_periodic_width, num_steps,
|
| 65 |
+
stretch_factor,
|
| 66 |
):
|
| 67 |
print(input_audio)
|
| 68 |
sig = at.AudioSignal(input_audio)
|
|
|
|
| 75 |
prefix_dur_s=0.0,
|
| 76 |
suffix_dur_s=0.0,
|
| 77 |
num_vamps=1,
|
| 78 |
+
downsample_factor=mask_periodic_amt,
|
| 79 |
+
stretch_factor=stretch_factor,
|
| 80 |
periodic_width=mask_periodic_width,
|
| 81 |
periodic_dropout=0.0,
|
| 82 |
periodic_width_dropout=0.0,
|
|
|
|
| 107 |
num_vamps, mode, use_beats, num_steps, snap_to_beats,
|
| 108 |
beat_unmask_drop, mask_periodic_width,
|
| 109 |
mask_periodic_dropout, mask_periodic_width_dropout,
|
| 110 |
+
n_conditioning_codebooks, use_coarse2fine, stretch_factor,
|
| 111 |
):
|
| 112 |
# try:
|
| 113 |
print(input_audio)
|
|
|
|
| 148 |
suffix_dur_s=suffix_s,
|
| 149 |
num_vamps=num_vamps,
|
| 150 |
downsample_factor=mask_periodic_amt,
|
| 151 |
+
stretch_factor=stretch_factor,
|
| 152 |
periodic_width=mask_periodic_width,
|
| 153 |
periodic_dropout=mask_periodic_dropout,
|
| 154 |
periodic_width_dropout=mask_periodic_width_dropout,
|
|
|
|
| 161 |
|
| 162 |
if use_coarse2fine:
|
| 163 |
zv = interface.coarse_to_fine(zv)
|
| 164 |
+
mask = interface.to_signal(mask_z).cpu()
|
| 165 |
|
| 166 |
sig = interface.to_signal(zv).cpu()
|
| 167 |
print("done")
|
|
|
|
| 169 |
out_dir = OUT_DIR / str(uuid.uuid4())
|
| 170 |
out_dir.mkdir()
|
| 171 |
sig.write(out_dir / "output.wav")
|
| 172 |
+
mask.write(out_dir / "mask.wav")
|
| 173 |
+
return sig.path_to_file, mask.path_to_file
|
| 174 |
+
# return sig.path_to_file, mask_z
|
| 175 |
# except Exception as e:
|
| 176 |
# raise gr.Error(f"failed with error: {e}")
|
| 177 |
|
|
|
|
| 183 |
mask_up_chk, up_factor,
|
| 184 |
num_vamps, mode, output_audio, notes, use_beats, num_steps, snap_to_beats,
|
| 185 |
beat_unmask_drop, mask_periodic_width, mask_periodic_dropout, mask_periodic_width_dropout,
|
| 186 |
+
n_conditioning_codebooks, use_coarse2fine, stretch_factor
|
| 187 |
):
|
| 188 |
out_dir = OUT_DIR / "saved" / str(uuid.uuid4())
|
| 189 |
out_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 218 |
"mask_periodic_width_dropout": mask_periodic_width_dropout,
|
| 219 |
"n_conditioning_codebooks": n_conditioning_codebooks,
|
| 220 |
"use_coarse2fine": use_coarse2fine,
|
| 221 |
+
"stretch_factor": stretch_factor,
|
| 222 |
}
|
| 223 |
|
| 224 |
# save with yaml
|
|
|
|
| 337 |
precision=0,
|
| 338 |
)
|
| 339 |
|
| 340 |
+
stretch_factor = gr.Slider(
|
| 341 |
+
label="time stretch factor",
|
| 342 |
+
minimum=0,
|
| 343 |
+
maximum=64,
|
| 344 |
+
step=1,
|
| 345 |
+
value=1,
|
| 346 |
+
)
|
| 347 |
+
|
| 348 |
mask_periodic_amt = gr.Slider(
|
| 349 |
label="periodic hint (0.0 means no hint, 2 - lots of hints, 8 - a couple of hints, 16 - occasional hint, 32 - very occasional hint, etc)",
|
| 350 |
minimum=0,
|
|
|
|
| 513 |
num_vamps, mode, use_beats, num_steps, snap_to_beats,
|
| 514 |
beat_unmask_drop, mask_periodic_width,
|
| 515 |
mask_periodic_dropout, mask_periodic_width_dropout,
|
| 516 |
+
n_conditioning_codebooks, use_coarse2fine, stretch_factor
|
| 517 |
],
|
| 518 |
outputs=[output_audio, audio_mask],
|
| 519 |
api_name="vamp"
|
|
|
|
| 532 |
notes_text, use_beats, num_steps, snap_to_beats,
|
| 533 |
beat_unmask_drop, mask_periodic_width,
|
| 534 |
mask_periodic_dropout, mask_periodic_width_dropout,
|
| 535 |
+
n_conditioning_codebooks, use_coarse2fine, stretch_factor
|
| 536 |
],
|
| 537 |
outputs=[thank_you, download_file]
|
| 538 |
)
|
|
|
|
| 541 |
ez_vamp_button.click(
|
| 542 |
fn=ez_vamp,
|
| 543 |
inputs=[input_audio, init_temp, final_temp, mask_periodic_amt,
|
| 544 |
+
mask_periodic_width, num_steps, stretch_factor ],
|
| 545 |
outputs=[output_audio],
|
| 546 |
api_name="ez_vamp"
|
| 547 |
)
|
docker-compose.yml
DELETED
|
@@ -1,92 +0,0 @@
|
|
| 1 |
-
|
| 2 |
-
version: "3.5"
|
| 3 |
-
services:
|
| 4 |
-
tensorrt:
|
| 5 |
-
build:
|
| 6 |
-
context: .
|
| 7 |
-
dockerfile: ./deployment_build/dockerfile
|
| 8 |
-
args:
|
| 9 |
-
GITHUB_TOKEN: ${GITHUB_TOKEN}
|
| 10 |
-
profiles:
|
| 11 |
-
- tensorrt
|
| 12 |
-
volumes:
|
| 13 |
-
- ./:/u/home/src
|
| 14 |
-
- ~/.config/gcloud:/root/.config/gcloud
|
| 15 |
-
deploy:
|
| 16 |
-
resources:
|
| 17 |
-
limits:
|
| 18 |
-
# match production limits
|
| 19 |
-
cpus: '7'
|
| 20 |
-
memory: 25000M
|
| 21 |
-
reservations:
|
| 22 |
-
devices:
|
| 23 |
-
- driver: nvidia
|
| 24 |
-
count: 1
|
| 25 |
-
capabilities: [gpu]
|
| 26 |
-
working_dir: /u/home/src
|
| 27 |
-
entrypoint:
|
| 28 |
-
- python
|
| 29 |
-
- -m
|
| 30 |
-
- wav2wav.converter
|
| 31 |
-
base:
|
| 32 |
-
build:
|
| 33 |
-
context: .
|
| 34 |
-
dockerfile: ./Dockerfile
|
| 35 |
-
args:
|
| 36 |
-
GITHUB_TOKEN: ${GITHUB_TOKEN}
|
| 37 |
-
volumes:
|
| 38 |
-
- .:/u/home/src
|
| 39 |
-
- ~/.wav2wav:/u/home/.wav2wav
|
| 40 |
-
- ${PATH_TO_DATA}:/data
|
| 41 |
-
- ${PATH_TO_RUNS}:/runs
|
| 42 |
-
- ~/.config/gcloud:/u/home/.config/gcloud
|
| 43 |
-
- ~/.zsh_history:/u/home/.zsh_history
|
| 44 |
-
environment:
|
| 45 |
-
- GITHUB_TOKEN
|
| 46 |
-
- DISCOURSE_API_USERNAME
|
| 47 |
-
- DISCOURSE_SERVER
|
| 48 |
-
- DISCOURSE_API_KEY
|
| 49 |
-
- HOST_USER_ID
|
| 50 |
-
- HOST_USER_GID
|
| 51 |
-
- JUPYTER_TOKEN
|
| 52 |
-
- PATH_TO_DATA=/data
|
| 53 |
-
- PATH_TO_RUNS=/runs
|
| 54 |
-
- TENSORBOARD_PATH
|
| 55 |
-
- MPLCONFIGDIR=/u/home/.mplconfig
|
| 56 |
-
shm_size: 32G
|
| 57 |
-
working_dir: /u/home/src
|
| 58 |
-
deploy:
|
| 59 |
-
resources:
|
| 60 |
-
reservations:
|
| 61 |
-
devices:
|
| 62 |
-
- driver: nvidia
|
| 63 |
-
capabilities: [gpu]
|
| 64 |
-
dev:
|
| 65 |
-
extends: base
|
| 66 |
-
profiles:
|
| 67 |
-
- interactive
|
| 68 |
-
stdin_open: true
|
| 69 |
-
tty: true
|
| 70 |
-
ports:
|
| 71 |
-
- 7860:7860
|
| 72 |
-
jupyter:
|
| 73 |
-
extends: base
|
| 74 |
-
ports:
|
| 75 |
-
- ${JUPYTER_PORT}:8888
|
| 76 |
-
entrypoint:
|
| 77 |
-
- /bin/bash
|
| 78 |
-
- /entry_script_jupyter.sh
|
| 79 |
-
tensorboard:
|
| 80 |
-
extends: base
|
| 81 |
-
ports:
|
| 82 |
-
- ${TENSORBOARD_PORT}:6006
|
| 83 |
-
entrypoint:
|
| 84 |
-
- /bin/bash
|
| 85 |
-
- /entry_script_tensorboard.sh
|
| 86 |
-
gradio:
|
| 87 |
-
extends: base
|
| 88 |
-
ports:
|
| 89 |
-
- 7860:7860
|
| 90 |
-
entrypoint:
|
| 91 |
-
- /bin/bash
|
| 92 |
-
- /entry_script_gradio.sh
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
DELETED
|
@@ -1,31 +0,0 @@
|
|
| 1 |
-
argbind>=0.3.1
|
| 2 |
-
pytorch-ignite
|
| 3 |
-
rich
|
| 4 |
-
audiotools @ git+https://github.com/descriptinc/lyrebird-audiotools.git@hf/backup-info
|
| 5 |
-
lac @ git+https://github.com/descriptinc/lyrebird-audio-codec.git@hf/vampnet-temp
|
| 6 |
-
torch==1.13.1
|
| 7 |
-
torchaudio==0.13.1
|
| 8 |
-
tqdm
|
| 9 |
-
tensorboard
|
| 10 |
-
google-cloud-logging==2.2.0
|
| 11 |
-
pytest
|
| 12 |
-
pytest-cov
|
| 13 |
-
pynvml
|
| 14 |
-
psutil
|
| 15 |
-
pandas
|
| 16 |
-
onnx
|
| 17 |
-
onnx-simplifier
|
| 18 |
-
seaborn
|
| 19 |
-
jupyterlab
|
| 20 |
-
jupyterlab-link-share
|
| 21 |
-
pandas
|
| 22 |
-
watchdog
|
| 23 |
-
pesq
|
| 24 |
-
tabulate
|
| 25 |
-
torchmetrics
|
| 26 |
-
codebraid==0.5.0
|
| 27 |
-
jupyter-client==6.1.12
|
| 28 |
-
tensorboardX
|
| 29 |
-
gradio
|
| 30 |
-
einops
|
| 31 |
-
frechet_audio_distance
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
setup.py
CHANGED
|
@@ -32,12 +32,13 @@ setup(
|
|
| 32 |
"rich",
|
| 33 |
"audiotools @ git+https://github.com/hugofloresgarcia/audiotools.git",
|
| 34 |
"lac @ git+https://github.com/hugofloresgarcia/lac.git",
|
| 35 |
-
"wavebeat @ git+https://github.com/hugofloresgarcia/wavebeat.git",
|
| 36 |
"torch==2.0",
|
| 37 |
"tqdm",
|
| 38 |
"tensorboard",
|
| 39 |
"google-cloud-logging==2.2.0",
|
| 40 |
"einops",
|
| 41 |
-
"frechet_audio_distance"
|
|
|
|
| 42 |
],
|
| 43 |
)
|
|
|
|
| 32 |
"rich",
|
| 33 |
"audiotools @ git+https://github.com/hugofloresgarcia/audiotools.git",
|
| 34 |
"lac @ git+https://github.com/hugofloresgarcia/lac.git",
|
| 35 |
+
# "wavebeat @ git+https://github.com/hugofloresgarcia/wavebeat.git",
|
| 36 |
"torch==2.0",
|
| 37 |
"tqdm",
|
| 38 |
"tensorboard",
|
| 39 |
"google-cloud-logging==2.2.0",
|
| 40 |
"einops",
|
| 41 |
+
# "frechet_audio_distance",
|
| 42 |
+
"gradio"
|
| 43 |
],
|
| 44 |
)
|
vampnet/interface.py
CHANGED
|
@@ -249,6 +249,7 @@ class Interface(torch.nn.Module):
|
|
| 249 |
suffix_dur_s: float = 0.0,
|
| 250 |
num_vamps: int = 1,
|
| 251 |
downsample_factor: int = None,
|
|
|
|
| 252 |
periodic_width: int = 1,
|
| 253 |
periodic_dropout=0.0,
|
| 254 |
periodic_width_dropout=0.0,
|
|
@@ -269,11 +270,33 @@ class Interface(torch.nn.Module):
|
|
| 269 |
n_prefix = self.s2t(prefix_dur_s)
|
| 270 |
n_suffix = self.s2t(suffix_dur_s)
|
| 271 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
assert cz.shape[-1] <= self.s2t(self.coarse.chunk_size_s), f"the sequence of tokens provided must match the one specified in the coarse chunk size, but got {cz.shape[-1]} and {self.s2t(self.coarse.chunk_size_s)}"
|
| 273 |
assert n_prefix + n_suffix < c_seq_len, "prefix and suffix must be smaller than the chunk size"
|
| 274 |
|
| 275 |
if swap_prefix_suffix:
|
| 276 |
-
# swap the prefix and suffix
|
| 277 |
assert n_prefix == n_suffix, "prefix and suffix must be the same size for now"
|
| 278 |
cz[:, :, :n_prefix], cz[:, :, c_seq_len-n_suffix:] = cz[:, :, c_seq_len-n_suffix:], cz[:, :, :n_prefix].clone()
|
| 279 |
|
|
@@ -295,7 +318,7 @@ class Interface(torch.nn.Module):
|
|
| 295 |
downsample_factor=downsample_factor,
|
| 296 |
periodic_width=periodic_width,
|
| 297 |
periodic_dropout=periodic_dropout,
|
| 298 |
-
add_random_periodic_offset=
|
| 299 |
periodic_width_dropout=periodic_width_dropout,
|
| 300 |
mask=cz_mask,
|
| 301 |
ext_mask=ext_mask,
|
|
|
|
| 249 |
suffix_dur_s: float = 0.0,
|
| 250 |
num_vamps: int = 1,
|
| 251 |
downsample_factor: int = None,
|
| 252 |
+
stretch_factor: int = None,
|
| 253 |
periodic_width: int = 1,
|
| 254 |
periodic_dropout=0.0,
|
| 255 |
periodic_width_dropout=0.0,
|
|
|
|
| 270 |
n_prefix = self.s2t(prefix_dur_s)
|
| 271 |
n_suffix = self.s2t(suffix_dur_s)
|
| 272 |
|
| 273 |
+
|
| 274 |
+
# hmm, should be a better way to do this? think we just need a mask builder class
|
| 275 |
+
add_random_periodic_offset = True
|
| 276 |
+
|
| 277 |
+
if stretch_factor is not None and stretch_factor > 1:
|
| 278 |
+
print(f"stretching by {stretch_factor}")
|
| 279 |
+
assert stretch_factor >= 1, "stretch factor must be >= 1"
|
| 280 |
+
cz = cz.repeat_interleave(stretch_factor, dim=-1)
|
| 281 |
+
|
| 282 |
+
# the downsample factor is now relative to the stretched sequence
|
| 283 |
+
assert downsample_factor is None or downsample_factor <= 2, "downsample_factor must be None when stretch_factor is not None"
|
| 284 |
+
|
| 285 |
+
downsample_factor = stretch_factor
|
| 286 |
+
add_random_periodic_offset = False
|
| 287 |
+
|
| 288 |
+
assert n_prefix == 0 and n_suffix == 0, "prefix and suffix must be 0 when stretch_factor is not None"
|
| 289 |
+
assert ext_mask is None, "ext_mask must be None when stretch_factor is not None"
|
| 290 |
+
|
| 291 |
+
# trim cz to the original length
|
| 292 |
+
cz = cz[:, :, :c_seq_len]
|
| 293 |
+
|
| 294 |
+
|
| 295 |
assert cz.shape[-1] <= self.s2t(self.coarse.chunk_size_s), f"the sequence of tokens provided must match the one specified in the coarse chunk size, but got {cz.shape[-1]} and {self.s2t(self.coarse.chunk_size_s)}"
|
| 296 |
assert n_prefix + n_suffix < c_seq_len, "prefix and suffix must be smaller than the chunk size"
|
| 297 |
|
| 298 |
if swap_prefix_suffix:
|
| 299 |
+
# swap the prefix and suffix
|
| 300 |
assert n_prefix == n_suffix, "prefix and suffix must be the same size for now"
|
| 301 |
cz[:, :, :n_prefix], cz[:, :, c_seq_len-n_suffix:] = cz[:, :, c_seq_len-n_suffix:], cz[:, :, :n_prefix].clone()
|
| 302 |
|
|
|
|
| 318 |
downsample_factor=downsample_factor,
|
| 319 |
periodic_width=periodic_width,
|
| 320 |
periodic_dropout=periodic_dropout,
|
| 321 |
+
add_random_periodic_offset=add_random_periodic_offset,
|
| 322 |
periodic_width_dropout=periodic_width_dropout,
|
| 323 |
mask=cz_mask,
|
| 324 |
ext_mask=ext_mask,
|
vampnet/modules/base.py
CHANGED
|
@@ -71,7 +71,7 @@ class VampBase(at.ml.BaseModel):
|
|
| 71 |
probs[i, :, -n:] = 0.0
|
| 72 |
|
| 73 |
# if we have a downsample factor, set the mask prob to 0
|
| 74 |
-
if downsample_factor is not None:
|
| 75 |
if not isinstance(downsample_factor, torch.Tensor):
|
| 76 |
downsample_factor = scalar_to_batch_tensor(downsample_factor, x.shape[0])
|
| 77 |
for i, factor in enumerate(downsample_factor):
|
|
@@ -200,7 +200,6 @@ class VampBase(at.ml.BaseModel):
|
|
| 200 |
# find where the mask token is and replace it with silence in the audio
|
| 201 |
for tstep in range(z.shape[-1]):
|
| 202 |
if torch.any(z[:, :, tstep] == self.mask_token):
|
| 203 |
-
print("mask token found at step", tstep)
|
| 204 |
sample_idx_0 = tstep * codec.hop_length
|
| 205 |
sample_idx_1 = sample_idx_0 + codec.hop_length
|
| 206 |
signal.samples[:, :, sample_idx_0:sample_idx_1] = 0.0
|
|
|
|
| 71 |
probs[i, :, -n:] = 0.0
|
| 72 |
|
| 73 |
# if we have a downsample factor, set the mask prob to 0
|
| 74 |
+
if downsample_factor is not None and downsample_factor > 0:
|
| 75 |
if not isinstance(downsample_factor, torch.Tensor):
|
| 76 |
downsample_factor = scalar_to_batch_tensor(downsample_factor, x.shape[0])
|
| 77 |
for i, factor in enumerate(downsample_factor):
|
|
|
|
| 200 |
# find where the mask token is and replace it with silence in the audio
|
| 201 |
for tstep in range(z.shape[-1]):
|
| 202 |
if torch.any(z[:, :, tstep] == self.mask_token):
|
|
|
|
| 203 |
sample_idx_0 = tstep * codec.hop_length
|
| 204 |
sample_idx_1 = sample_idx_0 + codec.hop_length
|
| 205 |
signal.samples[:, :, sample_idx_0:sample_idx_1] = 0.0
|