feat: add lofi-bytes-api and gradio app
- app.py +157 -0
- model/loss.py +46 -0
- model/music_transformer.py +200 -0
- model/positional_encoding.py +23 -0
- model/rpr.py +464 -0
- processor.py +266 -0
- requirements.txt +6 -0
- uploaded_midis/am_i_blue_jazz.mid +0 -0
- uploaded_midis/ghibli_castle_in_the_sky.mid +0 -0
- utilities/argument_funcs.py +228 -0
- utilities/constants.py +28 -0
- utilities/device.py +73 -0
- utilities/lr_scheduling.py +65 -0
- utilities/run_model.py +95 -0
app.py
ADDED
@@ -0,0 +1,157 @@
import random

import gradio as gr
import torch
from huggingface_hub import hf_hub_download

from model.music_transformer import MusicTransformer
from processor import decode_midi, encode_midi
from utilities.constants import TOKEN_END, TOKEN_PAD, TORCH_LABEL_TYPE
from utilities.device import get_device, use_cuda

REPO_ID = "Launchpad/lofi-bytes"
FILENAME = "weights_maestro_finetuned.pickle"

SEQUENCE_START = 0
OUTPUT_PATH = "./output_midi"
RPR = True
# TARGET_SEQ_LENGTH = 1023
TARGET_SEQ_LENGTH = 512
NUM_PRIME = 65
MAX_SEQUENCE = 2048
N_LAYERS = 6
NUM_HEADS = 8
D_MODEL = 512
DIM_FEEDFORWARD = 1024
BEAM = 0
FORCE_CPU = False
ALLOWED_EXTENSIONS = {'mid'}
UPLOAD_FOLDER = './uploaded_midis'

generated_midi = None

use_cuda(True)

model = MusicTransformer(
    n_layers=N_LAYERS,
    num_heads=NUM_HEADS,
    d_model=D_MODEL,
    dim_feedforward=DIM_FEEDFORWARD,
    max_sequence=MAX_SEQUENCE,
    rpr=RPR
).to(get_device())

state_dict = torch.load(
    hf_hub_download(repo_id=REPO_ID, filename=FILENAME),
    map_location=get_device()
)

model.load_state_dict(state_dict)

def generate(input_midi):

    raw_mid = encode_midi(input_midi)
    if(len(raw_mid) == 0):
        return

    primer, _ = process_midi(raw_mid, NUM_PRIME, random_seq=False)
    primer = torch.tensor(primer, dtype=TORCH_LABEL_TYPE, device=get_device())

    # saves a pretty_midi at file_path
    # decode_midi(primer[:NUM_PRIME].cpu().numpy(), file_path=f_path)
    decode_midi(primer[:NUM_PRIME].cpu().numpy())

    # GENERATION
    model.eval()
    with torch.set_grad_enabled(False):

        # NOTE: model.generate() returns a MIDI stored as an ARRAY given a primer
        beam_seq = model.generate(primer[:NUM_PRIME], TARGET_SEQ_LENGTH, beam=BEAM)

        file_path = "output.mid"

        # NOTE: function decode_midi() returns an actual MIDI of class pretty_midi.PrettyMIDI
        decoded_midi = decode_midi(beam_seq[0].cpu().numpy(), file_path=file_path)

        # THIS SHOULD BE EITHER decoded_midi OR beam_seq
        # TODO: decoded_midi is actual pretty_midi MIDI file, beam_seq is just an array representing a MIDI
        # decoded_midi stores more information about instruments and stuff
        return file_path

def process_midi(raw_mid, max_seq, random_seq):
    """
    ----------
    Author: Damon Gwinn
    ----------
    Takes in pre-processed raw midi and returns the input and target. Can use a random sequence or
    go from the start based on random_seq.
    ----------
    """

    x = torch.full((max_seq, ), TOKEN_PAD, dtype=TORCH_LABEL_TYPE, device=get_device())
    tgt = torch.full((max_seq, ), TOKEN_PAD, dtype=TORCH_LABEL_TYPE, device=get_device())

    raw_len = len(raw_mid)
    full_seq = max_seq + 1  # Performing seq2seq

    if(raw_len == 0):
        return x, tgt

    if(raw_len < full_seq):
        x[:raw_len] = raw_mid
        tgt[:raw_len-1] = raw_mid[1:]
        tgt[raw_len] = TOKEN_END
    else:
        # Randomly selecting a range
        if(random_seq):
            end_range = raw_len - full_seq
            start = random.randint(SEQUENCE_START, end_range)

        # Always taking from the start to as far as we can
        else:
            start = SEQUENCE_START

        end = start + full_seq

        data = raw_mid[start:end]

        x = data[:max_seq]
        tgt = data[1:full_seq]

    return x, tgt


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Image(
                "https://www.ocf.berkeley.edu/~launchpad/media/uploads/project_logos/410912267_278779401866686_2517511436172822307_n_0iVwDxI.png",
                elem_id="logo-img",
                show_label=False,
                show_share_button=False,
                show_download_button=False,
                show_fullscreen_button=False,
            )

        with gr.Column(scale=3):
            gr.Markdown("""lofi-bytes is a [Launchpad](https://launchpad.studentorg.berkeley.edu/) project (Spring 2023) that generates lofi tracks from input MIDI samples using a MusicTransformer model.
            <br/><br/>
            **Model**: [lofi-bytes](https://huggingface.co/Launchpad/lofi-bytes)
            <br/>
            **Project Leader**: Alicia Wang
            <br/>
            **Members**: Alena Chao, Eric Liu, Zane Mogannam, Chloe Wong, Iris Zhou
            <br/>
            **Advisors**: Vincent Lim, Winston Liu
            <br/>
            """
            )
    gr.Interface(
        fn=generate,
        inputs=gr.File(),
        outputs=gr.File(),
        examples=["uploaded_midis/ghibli_castle_in_the_sky.mid", "uploaded_midis/am_i_blue_jazz.mid"]
    )

if __name__ == '__main__':
    demo.launch(share=True)
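
For a quick smoke test outside the Gradio UI, the generate function above can also be called directly with a path to one of the bundled example MIDIs. A minimal sketch (assuming the packages in requirements.txt are installed and the weight download from the Hub succeeds; the returned path mirrors the hard-coded "output.mid" above):

# Local smoke test for the pipeline: encode_midi -> process_midi -> model.generate -> decode_midi.
from app import generate

out_path = generate("uploaded_midis/ghibli_castle_in_the_sky.mid")  # writes and returns "output.mid"
print("generated file:", out_path)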
model/loss.py
ADDED
@@ -0,0 +1,46 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.loss import _Loss

# Borrowed from https://github.com/jason9693/MusicTransformer-pytorch/blob/5f183374833ff6b7e17f3a24e3594dedd93a5fe5/custom/criterion.py#L28
class SmoothCrossEntropyLoss(_Loss):
    """
    https://arxiv.org/abs/1512.00567
    """
    __constants__ = ['label_smoothing', 'vocab_size', 'ignore_index', 'reduction']

    def __init__(self, label_smoothing, vocab_size, ignore_index=-100, reduction='mean', is_logits=True):
        assert 0.0 <= label_smoothing <= 1.0
        super().__init__(reduction=reduction)

        self.label_smoothing = label_smoothing
        self.vocab_size = vocab_size
        self.ignore_index = ignore_index
        self.input_is_logits = is_logits

    def forward(self, input, target):
        """
        Args:
            input: [B * T, V]
            target: [B * T]
        Returns:
            cross entropy: [1]
        """
        mask = (target == self.ignore_index).unsqueeze(-1)
        q = F.one_hot(target.long(), self.vocab_size).type(torch.float32)
        u = 1.0 / self.vocab_size
        q_prime = (1.0 - self.label_smoothing) * q + self.label_smoothing * u
        q_prime = q_prime.masked_fill(mask, 0)

        ce = self.cross_entropy_with_logits(q_prime, input)
        if self.reduction == 'mean':
            lengths = torch.sum(target != self.ignore_index)
            return ce.sum() / lengths
        elif self.reduction == 'sum':
            return ce.sum()
        else:
            raise NotImplementedError

    def cross_entropy_with_logits(self, p, q):
        return -torch.sum(p * (q - q.logsumexp(dim=-1, keepdim=True)), dim=-1)
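
A minimal usage sketch for SmoothCrossEntropyLoss (shapes and the vocabulary size below are illustrative, not taken from this repo's constants):

import torch
from model.loss import SmoothCrossEntropyLoss

VOCAB = 390                                    # illustrative event-vocabulary size
criterion = SmoothCrossEntropyLoss(label_smoothing=0.1, vocab_size=VOCAB, ignore_index=VOCAB - 1)

logits = torch.randn(8, VOCAB)                 # [B * T, V] unnormalized scores
targets = torch.randint(0, VOCAB - 1, (8,))    # [B * T] token ids (none equal ignore_index here)
loss = criterion(logits, targets)              # scalar: mean smoothed cross entropy over non-ignored positions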
model/music_transformer.py
ADDED
@@ -0,0 +1,200 @@
import torch
import torch.nn as nn
from torch.nn.modules.normalization import LayerNorm
import random

from utilities.constants import *
from utilities.device import get_device

from .positional_encoding import PositionalEncoding
from .rpr import TransformerEncoderRPR, TransformerEncoderLayerRPR


# MusicTransformer
class MusicTransformer(nn.Module):
    """
    ----------
    Author: Damon Gwinn
    ----------
    Music Transformer reproduction from https://arxiv.org/abs/1809.04281. Arguments allow for
    tweaking the transformer architecture (https://arxiv.org/abs/1706.03762) and the rpr argument
    toggles Relative Position Representations (RPR - https://arxiv.org/abs/1803.02155).

    Supports training and generation using Pytorch's nn.Transformer class with dummy decoder to
    make a decoder-only transformer architecture

    For RPR support, there is modified Pytorch 1.2.0 code in rpr.py. Modified source will be
    kept up to date with Pytorch revisions only as necessary.
    ----------
    """

    def __init__(self, n_layers=6, num_heads=8, d_model=512, dim_feedforward=1024,
                 dropout=0.1, max_sequence=2048, rpr=False):
        super(MusicTransformer, self).__init__()

        self.dummy = DummyDecoder()

        self.nlayers = n_layers
        self.nhead = num_heads
        self.d_model = d_model
        self.d_ff = dim_feedforward
        self.dropout = dropout
        self.max_seq = max_sequence
        self.rpr = rpr

        # Input embedding
        self.embedding = nn.Embedding(VOCAB_SIZE, self.d_model)

        # Positional encoding
        self.positional_encoding = PositionalEncoding(self.d_model, self.dropout, self.max_seq)

        # Base transformer
        if(not self.rpr):
            # To make a decoder-only transformer we need to use masked encoder layers
            # Dummy decoder to essentially just return the encoder output
            self.transformer = nn.Transformer(
                d_model=self.d_model, nhead=self.nhead, num_encoder_layers=self.nlayers,
                num_decoder_layers=0, dropout=self.dropout, # activation=self.ff_activ,
                dim_feedforward=self.d_ff, custom_decoder=self.dummy
            )
        # RPR Transformer
        else:
            encoder_norm = LayerNorm(self.d_model)
            encoder_layer = TransformerEncoderLayerRPR(self.d_model, self.nhead, self.d_ff, self.dropout, er_len=self.max_seq)
            encoder = TransformerEncoderRPR(encoder_layer, self.nlayers, encoder_norm)
            self.transformer = nn.Transformer(
                d_model=self.d_model, nhead=self.nhead, num_encoder_layers=self.nlayers,
                num_decoder_layers=0, dropout=self.dropout, # activation=self.ff_activ,
                dim_feedforward=self.d_ff, custom_decoder=self.dummy, custom_encoder=encoder
            )

        # Final output is a softmaxed linear layer
        self.Wout = nn.Linear(self.d_model, VOCAB_SIZE)
        self.softmax = nn.Softmax(dim=-1)

    # forward
    def forward(self, x, mask=True):
        """
        ----------
        Author: Damon Gwinn
        ----------
        Takes an input sequence and outputs predictions using a sequence to sequence method.

        A prediction at one index is the "next" prediction given all information seen previously.
        ----------
        """

        if(mask is True):
            mask = self.transformer.generate_square_subsequent_mask(x.shape[1]).to(get_device())
        else:
            mask = None

        x = self.embedding(x)

        # Input shape is (max_seq, batch_size, d_model)
        x = x.permute(1, 0, 2)

        x = self.positional_encoding(x)

        # Since there are no true decoder layers, the tgt is unused
        # Pytorch wants src and tgt to have some equal dims however
        x_out = self.transformer(src=x, tgt=x, src_mask=mask)

        # Back to (batch_size, max_seq, d_model)
        x_out = x_out.permute(1, 0, 2)

        y = self.Wout(x_out)
        # y = self.softmax(y)

        del mask

        # They are trained to predict the next note in sequence (we don't need the last one)
        return y

    # generate
    def generate(self, primer=None, target_seq_length=1024, beam=0, beam_chance=1.0):
        """
        ----------
        Author: Damon Gwinn
        ----------
        Generates midi given a primer sample. Music can be generated using a probability distribution over
        the softmax probabilities (recommended) or by using a beam search.
        ----------
        """

        assert (not self.training), "Cannot generate while in training mode"

        print("Generating sequence of max length:", target_seq_length)

        gen_seq = torch.full((1, target_seq_length), TOKEN_PAD, dtype=TORCH_LABEL_TYPE, device=get_device())

        num_primer = len(primer)
        gen_seq[..., :num_primer] = primer.type(TORCH_LABEL_TYPE).to(get_device())


        # print("primer:",primer)
        # print(gen_seq)
        cur_i = num_primer
        while(cur_i < target_seq_length):
            # gen_seq_batch = gen_seq.clone()
            y = self.softmax(self.forward(gen_seq[..., :cur_i]))[..., :TOKEN_END]
            token_probs = y[:, cur_i-1, :]

            if(beam == 0):
                beam_ran = 2.0
            else:
                beam_ran = random.uniform(0, 1)

            if(beam_ran <= beam_chance):
                token_probs = token_probs.flatten()
                top_res, top_i = torch.topk(token_probs, beam)

                beam_rows = top_i // VOCAB_SIZE
                beam_cols = top_i % VOCAB_SIZE

                gen_seq = gen_seq[beam_rows, :]
                gen_seq[..., cur_i] = beam_cols

            else:
                distrib = torch.distributions.categorical.Categorical(probs=token_probs)
                next_token = distrib.sample()
                # print("next token:",next_token)
                gen_seq[:, cur_i] = next_token


                # Let the transformer decide to end if it wants to
                if(next_token == TOKEN_END):
                    print("Model called end of sequence at:", cur_i, "/", target_seq_length)
                    break

            cur_i += 1
            if(cur_i % 50 == 0):
                print(cur_i, "/", target_seq_length)

        return gen_seq[:, :cur_i]

# Used as a dummy to nn.Transformer
# DummyDecoder
class DummyDecoder(nn.Module):
    """
    ----------
    Author: Damon Gwinn
    ----------
    A dummy decoder that returns its input. Used to make the Pytorch transformer into a decoder-only
    architecture (stacked encoders with dummy decoder fits the bill)
    ----------
    """

    def __init__(self):
        super(DummyDecoder, self).__init__()

    def forward(self, tgt, memory, tgt_mask, memory_mask, tgt_key_padding_mask, memory_key_padding_mask, **kwargs):
        """
        ----------
        Author: Damon Gwinn
        ----------
        Returns the input (memory)
        ----------
        """

        return memory
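
To exercise the decoder-only generation loop in isolation, a small sketch with untrained weights (output is noise; it assumes TORCH_LABEL_TYPE is an integer dtype as defined in utilities/constants.py, and mirrors how app.py calls generate):

# Sampling from a freshly initialized MusicTransformer; sizes are illustrative.
import torch
from model.music_transformer import MusicTransformer
from utilities.constants import TORCH_LABEL_TYPE
from utilities.device import get_device

model = MusicTransformer(n_layers=2, num_heads=4, d_model=128, dim_feedforward=256,
                         max_sequence=512, rpr=True).to(get_device())
model.eval()

primer = torch.randint(0, 100, (65,), dtype=TORCH_LABEL_TYPE, device=get_device())
with torch.no_grad():
    seq = model.generate(primer, target_seq_length=128, beam=0)  # (1, <=128) tensor of event ids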
model/positional_encoding.py
ADDED
@@ -0,0 +1,23 @@
import torch
import torch.nn as nn
import math

# PositionalEncoding
# Taken from https://pytorch.org/tutorials/beginner/transformer_tutorial.html
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
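
The module adds the sinusoid along the first (sequence) dimension, matching the (seq_len, batch, d_model) layout used in MusicTransformer.forward. A small shape check (illustrative only):

import torch
from model.positional_encoding import PositionalEncoding

pe = PositionalEncoding(d_model=512, dropout=0.0, max_len=2048)
x = torch.zeros(16, 2, 512)   # (seq_len, batch, d_model)
y = pe(x)                     # same shape; row t receives the sinusoid for position t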
model/rpr.py
ADDED
@@ -0,0 +1,464 @@
import warnings

import torch
import torch.nn as nn

from torch.nn import functional as F
from torch.nn.parameter import Parameter
from torch.nn import Module
from torch.nn.modules.transformer import _get_clones
from torch.nn.modules.linear import Linear
from torch.nn.modules.dropout import Dropout
from torch.nn.modules.normalization import LayerNorm
from torch.nn.init import *

from torch.nn.functional import linear, softmax, dropout

# TransformerEncoderRPR
class TransformerEncoderRPR(Module):
    """
    ----------
    Author: Pytorch
    ----------
    For Relative Position Representation support (https://arxiv.org/abs/1803.02155)
    https://pytorch.org/docs/1.2.0/_modules/torch/nn/modules/transformer.html#TransformerEncoder

    No modification. Copied here to ensure continued compatibility with other edits.
    ----------
    """

    def __init__(self, encoder_layer, num_layers, norm=None):
        super(TransformerEncoderRPR, self).__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src, mask=None, src_key_padding_mask=None, **kwargs):

        output = src

        for i in range(self.num_layers):
            output = self.layers[i](output, src_mask=mask,
                                    src_key_padding_mask=src_key_padding_mask)

        if self.norm:
            output = self.norm(output)

        return output

# TransformerEncoderLayerRPR
class TransformerEncoderLayerRPR(Module):
    """
    ----------
    Author: Pytorch
    Modified: Damon Gwinn
    ----------
    For Relative Position Representation support (https://arxiv.org/abs/1803.02155)
    https://pytorch.org/docs/1.2.0/_modules/torch/nn/modules/transformer.html#TransformerEncoderLayer

    Modification to create and call custom MultiheadAttentionRPR
    ----------
    """

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, er_len=None):
        super(TransformerEncoderLayerRPR, self).__init__()
        self.self_attn = MultiheadAttentionRPR(d_model, nhead, dropout=dropout, er_len=er_len)
        # Implementation of Feedforward model
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        src2 = self.self_attn(src, src, src, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(F.relu(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

# MultiheadAttentionRPR
class MultiheadAttentionRPR(Module):
    """
    ----------
    Author: Pytorch
    Modified: Damon Gwinn
    ----------
    For Relative Position Representation support (https://arxiv.org/abs/1803.02155)
    https://pytorch.org/docs/1.2.0/_modules/torch/nn/modules/activation.html#MultiheadAttention

    Modification to add RPR embedding Er and call custom multi_head_attention_forward_rpr
    ----------
    """

    def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None, er_len=None):
        super(MultiheadAttentionRPR, self).__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))

        if self._qkv_same_embed_dim is False:
            self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
            self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
            self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))

        if bias:
            self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
        else:
            self.register_parameter('in_proj_bias', None)
        self.out_proj = Linear(embed_dim, embed_dim, bias=bias)

        if add_bias_kv:
            self.bias_k = Parameter(torch.empty(1, 1, embed_dim))
            self.bias_v = Parameter(torch.empty(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        # Adding RPR embedding matrix
        if(er_len is not None):
            self.Er = Parameter(torch.rand((er_len, self.head_dim), dtype=torch.float32))
        else:
            self.Er = None

        self._reset_parameters()

    def _reset_parameters(self):
        if self._qkv_same_embed_dim:
            xavier_uniform_(self.in_proj_weight)
        else:
            xavier_uniform_(self.q_proj_weight)
            xavier_uniform_(self.k_proj_weight)
            xavier_uniform_(self.v_proj_weight)

        if self.in_proj_bias is not None:
            constant_(self.in_proj_bias, 0.)
            constant_(self.out_proj.bias, 0.)
        if self.bias_k is not None:
            xavier_normal_(self.bias_k)
        if self.bias_v is not None:
            xavier_normal_(self.bias_v)

    def forward(self, query, key, value, key_padding_mask=None,
                need_weights=True, attn_mask=None):

        if hasattr(self, '_qkv_same_embed_dim') and self._qkv_same_embed_dim is False:
            # return F.multi_head_attention_forward(
            #     query, key, value, self.embed_dim, self.num_heads,
            #     self.in_proj_weight, self.in_proj_bias,
            #     self.bias_k, self.bias_v, self.add_zero_attn,
            #     self.dropout, self.out_proj.weight, self.out_proj.bias,
            #     training=self.training,
            #     key_padding_mask=key_padding_mask, need_weights=need_weights,
            #     attn_mask=attn_mask, use_separate_proj_weight=True,
            #     q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
            #     v_proj_weight=self.v_proj_weight)

            return multi_head_attention_forward_rpr(
                query, key, value, self.embed_dim, self.num_heads,
                self.in_proj_weight, self.in_proj_bias,
                self.bias_k, self.bias_v, self.add_zero_attn,
                self.dropout, self.out_proj.weight, self.out_proj.bias,
                training=self.training,
                key_padding_mask=key_padding_mask, need_weights=need_weights,
                attn_mask=attn_mask, use_separate_proj_weight=True,
                q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
                v_proj_weight=self.v_proj_weight, rpr_mat=self.Er)
        else:
            if not hasattr(self, '_qkv_same_embed_dim'):
                warnings.warn('A new version of MultiheadAttention module has been implemented. \
                    Please re-train your model with the new module',
                              UserWarning)

            # return F.multi_head_attention_forward(
            #     query, key, value, self.embed_dim, self.num_heads,
            #     self.in_proj_weight, self.in_proj_bias,
            #     self.bias_k, self.bias_v, self.add_zero_attn,
            #     self.dropout, self.out_proj.weight, self.out_proj.bias,
            #     training=self.training,
            #     key_padding_mask=key_padding_mask, need_weights=need_weights,
            #     attn_mask=attn_mask)

            return multi_head_attention_forward_rpr(
                query, key, value, self.embed_dim, self.num_heads,
                self.in_proj_weight, self.in_proj_bias,
                self.bias_k, self.bias_v, self.add_zero_attn,
                self.dropout, self.out_proj.weight, self.out_proj.bias,
                training=self.training,
                key_padding_mask=key_padding_mask, need_weights=need_weights,
                attn_mask=attn_mask, rpr_mat=self.Er)

# multi_head_attention_forward_rpr
def multi_head_attention_forward_rpr(query,                # type: Tensor
                                     key,                  # type: Tensor
                                     value,                # type: Tensor
                                     embed_dim_to_check,   # type: int
                                     num_heads,            # type: int
                                     in_proj_weight,       # type: Tensor
                                     in_proj_bias,         # type: Tensor
                                     bias_k,               # type: Optional[Tensor]
                                     bias_v,               # type: Optional[Tensor]
                                     add_zero_attn,        # type: bool
                                     dropout_p,            # type: float
                                     out_proj_weight,      # type: Tensor
                                     out_proj_bias,        # type: Tensor
                                     training=True,        # type: bool
                                     key_padding_mask=None,  # type: Optional[Tensor]
                                     need_weights=True,    # type: bool
                                     attn_mask=None,       # type: Optional[Tensor]
                                     use_separate_proj_weight=False,  # type: bool
                                     q_proj_weight=None,   # type: Optional[Tensor]
                                     k_proj_weight=None,   # type: Optional[Tensor]
                                     v_proj_weight=None,   # type: Optional[Tensor]
                                     static_k=None,        # type: Optional[Tensor]
                                     static_v=None,        # type: Optional[Tensor]
                                     rpr_mat=None
                                     ):
    """
    ----------
    Author: Pytorch
    Modified: Damon Gwinn
    ----------
    For Relative Position Representation support (https://arxiv.org/abs/1803.02155)
    https://pytorch.org/docs/1.2.0/_modules/torch/nn/functional.html

    Modification to take RPR embedding matrix and perform skew optimized RPR (https://arxiv.org/abs/1809.04281)
    ----------
    """

    # type: (...) -> Tuple[Tensor, Optional[Tensor]]

    qkv_same = torch.equal(query, key) and torch.equal(key, value)
    kv_same = torch.equal(key, value)

    tgt_len, bsz, embed_dim = query.size()
    assert embed_dim == embed_dim_to_check
    assert list(query.size()) == [tgt_len, bsz, embed_dim]
    assert key.size() == value.size()

    head_dim = embed_dim // num_heads
    assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
    scaling = float(head_dim) ** -0.5

    if use_separate_proj_weight is not True:
        if qkv_same:
            # self-attention
            q, k, v = linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1)

        elif kv_same:
            # encoder-decoder attention
            # This is inline in_proj function with in_proj_weight and in_proj_bias
            _b = in_proj_bias
            _start = 0
            _end = embed_dim
            _w = in_proj_weight[_start:_end, :]
            if _b is not None:
                _b = _b[_start:_end]
            q = linear(query, _w, _b)

            if key is None:
                assert value is None
                k = None
                v = None
            else:

                # This is inline in_proj function with in_proj_weight and in_proj_bias
                _b = in_proj_bias
                _start = embed_dim
                _end = None
                _w = in_proj_weight[_start:, :]
                if _b is not None:
                    _b = _b[_start:]
                k, v = linear(key, _w, _b).chunk(2, dim=-1)

        else:
            # This is inline in_proj function with in_proj_weight and in_proj_bias
            _b = in_proj_bias
            _start = 0
            _end = embed_dim
            _w = in_proj_weight[_start:_end, :]
            if _b is not None:
                _b = _b[_start:_end]
            q = linear(query, _w, _b)

            # This is inline in_proj function with in_proj_weight and in_proj_bias
            _b = in_proj_bias
            _start = embed_dim
            _end = embed_dim * 2
            _w = in_proj_weight[_start:_end, :]
            if _b is not None:
                _b = _b[_start:_end]
            k = linear(key, _w, _b)

            # This is inline in_proj function with in_proj_weight and in_proj_bias
            _b = in_proj_bias
            _start = embed_dim * 2
            _end = None
            _w = in_proj_weight[_start:, :]
            if _b is not None:
                _b = _b[_start:]
            v = linear(value, _w, _b)
    else:
        q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight)
        len1, len2 = q_proj_weight_non_opt.size()
        assert len1 == embed_dim and len2 == query.size(-1)

        k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight)
        len1, len2 = k_proj_weight_non_opt.size()
        assert len1 == embed_dim and len2 == key.size(-1)

        v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight)
        len1, len2 = v_proj_weight_non_opt.size()
        assert len1 == embed_dim and len2 == value.size(-1)

        if in_proj_bias is not None:
            q = linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim])
            k = linear(key, k_proj_weight_non_opt, in_proj_bias[embed_dim:(embed_dim * 2)])
            v = linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2):])
        else:
            q = linear(query, q_proj_weight_non_opt, in_proj_bias)
            k = linear(key, k_proj_weight_non_opt, in_proj_bias)
            v = linear(value, v_proj_weight_non_opt, in_proj_bias)
    q = q * scaling

    if bias_k is not None and bias_v is not None:
        if static_k is None and static_v is None:
            k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
            v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
            if attn_mask is not None:
                attn_mask = torch.cat([attn_mask,
                                       torch.zeros((attn_mask.size(0), 1),
                                                   dtype=attn_mask.dtype,
                                                   device=attn_mask.device)], dim=1)
            if key_padding_mask is not None:
                key_padding_mask = torch.cat(
                    [key_padding_mask, torch.zeros((key_padding_mask.size(0), 1),
                                                   dtype=key_padding_mask.dtype,
                                                   device=key_padding_mask.device)], dim=1)
        else:
            assert static_k is None, "bias cannot be added to static key."
            assert static_v is None, "bias cannot be added to static value."
    else:
        assert bias_k is None
        assert bias_v is None

    q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
    if k is not None:
        k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
    if v is not None:
        v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)

    if static_k is not None:
        assert static_k.size(0) == bsz * num_heads
        assert static_k.size(2) == head_dim
        k = static_k

    if static_v is not None:
        assert static_v.size(0) == bsz * num_heads
        assert static_v.size(2) == head_dim
        v = static_v

    src_len = k.size(1)

    if key_padding_mask is not None:
        assert key_padding_mask.size(0) == bsz
        assert key_padding_mask.size(1) == src_len

    if add_zero_attn:
        src_len += 1
        k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)
        v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)
        if attn_mask is not None:
            attn_mask = torch.cat([attn_mask, torch.zeros((attn_mask.size(0), 1),
                                                          dtype=attn_mask.dtype,
                                                          device=attn_mask.device)], dim=1)
        if key_padding_mask is not None:
            key_padding_mask = torch.cat(
                [key_padding_mask, torch.zeros((key_padding_mask.size(0), 1),
                                               dtype=key_padding_mask.dtype,
                                               device=key_padding_mask.device)], dim=1)

    attn_output_weights = torch.bmm(q, k.transpose(1, 2))
    assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]

    ######### ADDITION OF RPR ###########
    if(rpr_mat is not None):
        rpr_mat = _get_valid_embedding(rpr_mat, q.shape[1], k.shape[1])
        qe = torch.einsum("hld,md->hlm", q, rpr_mat)
        srel = _skew(qe)

        attn_output_weights += srel

    if attn_mask is not None:
        attn_mask = attn_mask.unsqueeze(0)
        attn_output_weights += attn_mask

    if key_padding_mask is not None:
        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
        attn_output_weights = attn_output_weights.masked_fill(
            key_padding_mask.unsqueeze(1).unsqueeze(2),
            float('-inf'),
        )
        attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)

    attn_output_weights = softmax(
        attn_output_weights, dim=-1)

    attn_output_weights = dropout(attn_output_weights, p=dropout_p, training=training)

    attn_output = torch.bmm(attn_output_weights, v)
    assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
    attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)

    if need_weights:
        # average attention weights over heads
        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
        return attn_output, attn_output_weights.sum(dim=1) / num_heads
    else:
        return attn_output, None

def _get_valid_embedding(Er, len_q, len_k):
    """
    ----------
    Author: Damon Gwinn
    ----------
    Gets valid embeddings based on max length of RPR attention
    ----------
    """

    len_e = Er.shape[0]
    start = max(0, len_e - len_q)
    return Er[start:, :]

def _skew(qe):
    """
    ----------
    Author: Damon Gwinn
    ----------
    Performs the skew optimized RPR computation (https://arxiv.org/abs/1809.04281)
    ----------
    """

    sz = qe.shape[1]
    mask = (torch.triu(torch.ones(sz, sz).to(qe.device)) == 1).float().flip(0)

    qe = mask * qe
    qe = F.pad(qe, (1, 0, 0, 0, 0, 0))
    qe = torch.reshape(qe, (qe.shape[0], qe.shape[2], qe.shape[1]))

    srel = qe[:, 1:, :]
    return srel
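
The relative-position term follows the skew trick from the Music Transformer paper: the per-head queries are multiplied against the relative embeddings Er, then _skew realigns the result so it has the same shape as QK^T and can be added directly to the attention logits. A small shape sketch of the two helpers above (sizes are illustrative):

import torch
from model.rpr import _get_valid_embedding, _skew

heads, len_q, head_dim = 8, 16, 64
Er = torch.rand(32, head_dim)                   # one learned embedding per relative position (er_len = 32)
q = torch.rand(heads, len_q, head_dim)          # per-head queries, already scaled

er = _get_valid_embedding(Er, len_q, len_q)     # keep only the last len_q relative positions
qe = torch.einsum("hld,md->hlm", q, er)         # (heads, len_q, len_q) query-times-Er scores
srel = _skew(qe)                                # same shape as QK^T, ready to add to the logits
assert srel.shape == (heads, len_q, len_q)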
processor.py
ADDED
@@ -0,0 +1,266 @@
import pretty_midi


RANGE_NOTE_ON = 128
RANGE_NOTE_OFF = 128
RANGE_VEL = 32
RANGE_TIME_SHIFT = 100

START_IDX = {
    'note_on': 0,
    'note_off': RANGE_NOTE_ON,
    'time_shift': RANGE_NOTE_ON + RANGE_NOTE_OFF,
    'velocity': RANGE_NOTE_ON + RANGE_NOTE_OFF + RANGE_TIME_SHIFT
}


class SustainAdapter:
    def __init__(self, time, type):
        self.start = time
        self.type = type


class SustainDownManager:
    def __init__(self, start, end):
        self.start = start
        self.end = end
        self.managed_notes = []
        self._note_dict = {}  # key: pitch, value: note.start

    def add_managed_note(self, note: pretty_midi.Note):
        self.managed_notes.append(note)

    def transposition_notes(self):
        for note in reversed(self.managed_notes):
            try:
                note.end = self._note_dict[note.pitch]
            except KeyError:
                note.end = max(self.end, note.end)
            self._note_dict[note.pitch] = note.start


# Divided note by note_on, note_off
class SplitNote:
    def __init__(self, type, time, value, velocity):
        ## type: note_on, note_off
        self.type = type
        self.time = time
        self.velocity = velocity
        self.value = value

    def __repr__(self):
        return '<[SNote] time: {} type: {}, value: {}, velocity: {}>'\
            .format(self.time, self.type, self.value, self.velocity)


class Event:
    def __init__(self, event_type, value):
        self.type = event_type
        self.value = value

    def __repr__(self):
        return '<Event type: {}, value: {}>'.format(self.type, self.value)

    def to_int(self):
        return START_IDX[self.type] + self.value

    @staticmethod
    def from_int(int_value):
        info = Event._type_check(int_value)
        return Event(info['type'], info['value'])

    @staticmethod
    def _type_check(int_value):
        range_note_on = range(0, RANGE_NOTE_ON)
        range_note_off = range(RANGE_NOTE_ON, RANGE_NOTE_ON+RANGE_NOTE_OFF)
        range_time_shift = range(RANGE_NOTE_ON+RANGE_NOTE_OFF, RANGE_NOTE_ON+RANGE_NOTE_OFF+RANGE_TIME_SHIFT)

        valid_value = int_value

        if int_value in range_note_on:
            return {'type': 'note_on', 'value': valid_value}
        elif int_value in range_note_off:
            valid_value -= RANGE_NOTE_ON
            return {'type': 'note_off', 'value': valid_value}
        elif int_value in range_time_shift:
            valid_value -= (RANGE_NOTE_ON + RANGE_NOTE_OFF)
            return {'type': 'time_shift', 'value': valid_value}
        else:
            valid_value -= (RANGE_NOTE_ON + RANGE_NOTE_OFF + RANGE_TIME_SHIFT)
            return {'type': 'velocity', 'value': valid_value}


def _divide_note(notes):
    result_array = []
    notes.sort(key=lambda x: x.start)

    for note in notes:
        on = SplitNote('note_on', note.start, note.pitch, note.velocity)
        off = SplitNote('note_off', note.end, note.pitch, None)
        result_array += [on, off]
    return result_array


def _merge_note(snote_sequence):
    note_on_dict = {}
    result_array = []

    for snote in snote_sequence:
        # print(note_on_dict)
        if snote.type == 'note_on':
            note_on_dict[snote.value] = snote
        elif snote.type == 'note_off':
            try:
                on = note_on_dict[snote.value]
                off = snote
                if off.time - on.time == 0:
                    continue
                result = pretty_midi.Note(on.velocity, snote.value, on.time, off.time)
                result_array.append(result)
            except:
                print('info removed pitch: {}'.format(snote.value))
    return result_array


def _snote2events(snote: SplitNote, prev_vel: int):
    result = []
    if snote.velocity is not None:
        modified_velocity = snote.velocity // 4
        if prev_vel != modified_velocity:
            result.append(Event(event_type='velocity', value=modified_velocity))
    result.append(Event(event_type=snote.type, value=snote.value))
    return result


def _event_seq2snote_seq(event_sequence):
    timeline = 0
    velocity = 0
    snote_seq = []

    for event in event_sequence:
        if event.type == 'time_shift':
            timeline += ((event.value+1) / 100)
        if event.type == 'velocity':
            velocity = event.value * 4
        else:
            snote = SplitNote(event.type, timeline, event.value, velocity)
            snote_seq.append(snote)
    return snote_seq


def _make_time_sift_events(prev_time, post_time):
    time_interval = int(round((post_time - prev_time) * 100))
    results = []
    while time_interval >= RANGE_TIME_SHIFT:
        results.append(Event(event_type='time_shift', value=RANGE_TIME_SHIFT-1))
        time_interval -= RANGE_TIME_SHIFT
    if time_interval == 0:
        return results
    else:
        return results + [Event(event_type='time_shift', value=time_interval-1)]


def _control_preprocess(ctrl_changes):
    sustains = []

    manager = None
    for ctrl in ctrl_changes:
        if ctrl.value >= 64 and manager is None:
            # sustain down
            manager = SustainDownManager(start=ctrl.time, end=None)
        elif ctrl.value < 64 and manager is not None:
            # sustain up
            manager.end = ctrl.time
            sustains.append(manager)
            manager = None
        elif ctrl.value < 64 and len(sustains) > 0:
            sustains[-1].end = ctrl.time
    return sustains


def _note_preprocess(susteins, notes):
    note_stream = []

    if susteins:  # if the midi file has sustain controls
        for sustain in susteins:
            for note_idx, note in enumerate(notes):
                if note.start < sustain.start:
                    note_stream.append(note)
                elif note.start > sustain.end:
                    notes = notes[note_idx:]
                    sustain.transposition_notes()
                    break
                else:
                    sustain.add_managed_note(note)

        for sustain in susteins:
            note_stream += sustain.managed_notes

    else:  # else, just push everything into note stream
        for note_idx, note in enumerate(notes):
            note_stream.append(note)

    note_stream.sort(key=lambda x: x.start)
    return note_stream


def encode_midi(file_path):
    events = []
    notes = []
    mid = pretty_midi.PrettyMIDI(midi_file=file_path)

    for inst in mid.instruments:
        inst_notes = inst.notes
        # ctrl.number is the number of sustain control. If you want to know about the number type of control,
        # see https://www.midi.org/specifications-old/item/table-3-control-change-messages-data-bytes-2
        ctrls = _control_preprocess([ctrl for ctrl in inst.control_changes if ctrl.number == 64])
        notes += _note_preprocess(ctrls, inst_notes)

    dnotes = _divide_note(notes)

    # print(dnotes)
    dnotes.sort(key=lambda x: x.time)
    # print('sorted:')
    # print(dnotes)
    cur_time = 0
    cur_vel = 0
    for snote in dnotes:
        events += _make_time_sift_events(prev_time=cur_time, post_time=snote.time)
        events += _snote2events(snote=snote, prev_vel=cur_vel)
        # events += _make_time_sift_events(prev_time=cur_time, post_time=snote.time)

        cur_time = snote.time
        cur_vel = snote.velocity

    return [e.to_int() for e in events]


def decode_midi(idx_array, file_path=None):
    event_sequence = [Event.from_int(idx) for idx in idx_array]
    # print(event_sequence)
    snote_seq = _event_seq2snote_seq(event_sequence)
    note_seq = _merge_note(snote_seq)
    note_seq.sort(key=lambda x: x.start)

    mid = pretty_midi.PrettyMIDI()
    # if you want to change the instrument, see https://www.midi.org/specifications/item/gm-level-1-sound-set
    instument = pretty_midi.Instrument(0, False, "Composed by Super Piano Music Transformer AI")
    instument.notes = note_seq

    mid.instruments.append(instument)
    if file_path is not None:
        mid.write(file_path)
    return mid


if __name__ == '__main__':
    encoded = encode_midi('bin/ADIG04.mid')
    print(encoded)
    decided = decode_midi(encoded, file_path='bin/test.mid')

    ins = pretty_midi.PrettyMIDI('bin/ADIG04.mid')
    print(ins)
    print(ins.instruments[0])
    for i in ins.instruments:
        print(i.control_changes)
        print(i.notes)
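
encode_midi/decode_midi serialize notes into a 388-token performance vocabulary (128 note-on, 128 note-off, 100 time-shift, 32 velocity buckets). A self-contained round trip through the Event index mapping:

from processor import Event, START_IDX

e = Event('time_shift', 24)       # one time-shift event of (24 + 1) / 100 = 0.25 s
idx = e.to_int()                  # START_IDX['time_shift'] + 24 = 280
back = Event.from_int(idx)
assert back.type == 'time_shift' and back.value == 24

print(START_IDX)                  # {'note_on': 0, 'note_off': 128, 'time_shift': 256, 'velocity': 356}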
requirements.txt
ADDED
@@ -0,0 +1,6 @@
gradio
huggingface_hub
pretty_midi
setuptools
spaces
torch
uploaded_midis/am_i_blue_jazz.mid
ADDED
Binary file (21.5 kB)
uploaded_midis/ghibli_castle_in_the_sky.mid
ADDED
Binary file (2.81 kB)
utilities/argument_funcs.py
ADDED
@@ -0,0 +1,228 @@
import argparse

from .constants import SEPERATOR

# parse_train_args
def parse_train_args():
    """
    ----------
    Author: Damon Gwinn
    ----------
    Argparse arguments for training a model
    ----------
    """

    parser = argparse.ArgumentParser()

    parser.add_argument("-input_dir", type=str, default="./dataset/e_piano", help="Folder of preprocessed and pickled midi files")
    parser.add_argument("-output_dir", type=str, default="./saved_models", help="Folder to save model weights. Saves one every epoch")
    parser.add_argument("-weight_modulus", type=int, default=1, help="How often to save epoch weights (ex: value of 10 means save every 10 epochs)")
    parser.add_argument("-print_modulus", type=int, default=1, help="How often to print train results for a batch (batch loss, learn rate, etc.)")

    parser.add_argument("-n_workers", type=int, default=1, help="Number of threads for the dataloader")
    parser.add_argument("--force_cpu", action="store_true", help="Forces model to run on a cpu even when gpu is available")
    parser.add_argument("--no_tensorboard", action="store_true", help="Turns off tensorboard result reporting")

    parser.add_argument("-continue_weights", type=str, default=None, help="Model weights to continue training based on")
    parser.add_argument("-continue_epoch", type=int, default=None, help="Epoch the continue_weights model was at")

    parser.add_argument("-lr", type=float, default=None, help="Constant learn rate. Leave as None for a custom scheduler.")
    parser.add_argument("-ce_smoothing", type=float, default=None, help="Smoothing parameter for smoothed cross entropy loss (defaults to no smoothing)")
    parser.add_argument("-batch_size", type=int, default=2, help="Batch size to use")
    parser.add_argument("-epochs", type=int, default=100, help="Number of epochs to use")

    parser.add_argument("--rpr", action="store_true", help="Use a modified Transformer for Relative Position Representations")
    parser.add_argument("-max_sequence", type=int, default=2048, help="Maximum midi sequence to consider")
    parser.add_argument("-n_layers", type=int, default=6, help="Number of decoder layers to use")
    parser.add_argument("-num_heads", type=int, default=8, help="Number of heads to use for multi-head attention")
    parser.add_argument("-d_model", type=int, default=512, help="Dimension of the model (output dim of embedding layers, etc.)")

    parser.add_argument("-dim_feedforward", type=int, default=1024, help="Dimension of the feedforward layer")

    parser.add_argument("-dropout", type=float, default=0.1, help="Dropout rate")

    return parser.parse_args()

# print_train_args
def print_train_args(args):
    """
    ----------
    Author: Damon Gwinn
    ----------
    Prints training arguments
    ----------
    """

    print(SEPERATOR)
    print("input_dir:", args.input_dir)
    print("output_dir:", args.output_dir)
    print("weight_modulus:", args.weight_modulus)
    print("print_modulus:", args.print_modulus)
    print("")
    print("n_workers:", args.n_workers)
    print("force_cpu:", args.force_cpu)
    print("tensorboard:", not args.no_tensorboard)
    print("")
    print("continue_weights:", args.continue_weights)
    print("continue_epoch:", args.continue_epoch)
    print("")
    print("lr:", args.lr)
    print("ce_smoothing:", args.ce_smoothing)
    print("batch_size:", args.batch_size)
    print("epochs:", args.epochs)
    print("")
    print("rpr:", args.rpr)
    print("max_sequence:", args.max_sequence)
    print("n_layers:", args.n_layers)
    print("num_heads:", args.num_heads)
    print("d_model:", args.d_model)
|
| 79 |
+
print("")
|
| 80 |
+
print("dim_feedforward:", args.dim_feedforward)
|
| 81 |
+
print("dropout:", args.dropout)
|
| 82 |
+
print(SEPERATOR)
|
| 83 |
+
print("")
|
| 84 |
+
|
| 85 |
+
# parse_eval_args
|
| 86 |
+
def parse_eval_args():
|
| 87 |
+
"""
|
| 88 |
+
----------
|
| 89 |
+
Author: Damon Gwinn
|
| 90 |
+
----------
|
| 91 |
+
Argparse arguments for evaluating a model
|
| 92 |
+
----------
|
| 93 |
+
"""
|
| 94 |
+
|
| 95 |
+
parser = argparse.ArgumentParser()
|
| 96 |
+
|
| 97 |
+
parser.add_argument("-dataset_dir", type=str, default="./dataset/e_piano", help="Folder of preprocessed and pickled midi files")
|
| 98 |
+
parser.add_argument("-model_weights", type=str, default="./saved_models/model.pickle", help="Pickled model weights file saved with torch.save and model.state_dict()")
|
| 99 |
+
parser.add_argument("-n_workers", type=int, default=1, help="Number of threads for the dataloader")
|
| 100 |
+
parser.add_argument("--force_cpu", action="store_true", help="Forces model to run on a cpu even when gpu is available")
|
| 101 |
+
|
| 102 |
+
parser.add_argument("-batch_size", type=int, default=2, help="Batch size to use")
|
| 103 |
+
|
| 104 |
+
parser.add_argument("--rpr", action="store_true", help="Use a modified Transformer for Relative Position Representations")
|
| 105 |
+
parser.add_argument("-max_sequence", type=int, default=2048, help="Maximum midi sequence to consider in the model")
|
| 106 |
+
parser.add_argument("-n_layers", type=int, default=6, help="Number of decoder layers to use")
|
| 107 |
+
parser.add_argument("-num_heads", type=int, default=8, help="Number of heads to use for multi-head attention")
|
| 108 |
+
parser.add_argument("-d_model", type=int, default=512, help="Dimension of the model (output dim of embedding layers, etc.)")
|
| 109 |
+
|
| 110 |
+
parser.add_argument("-dim_feedforward", type=int, default=1024, help="Dimension of the feedforward layer")
|
| 111 |
+
|
| 112 |
+
return parser.parse_args()
|
| 113 |
+
|
| 114 |
+
# print_eval_args
|
| 115 |
+
def print_eval_args(args):
|
| 116 |
+
"""
|
| 117 |
+
----------
|
| 118 |
+
Author: Damon Gwinn
|
| 119 |
+
----------
|
| 120 |
+
Prints evaluation arguments
|
| 121 |
+
----------
|
| 122 |
+
"""
|
| 123 |
+
|
| 124 |
+
print(SEPERATOR)
|
| 125 |
+
print("dataset_dir:", args.dataset_dir)
|
| 126 |
+
print("model_weights:", args.model_weights)
|
| 127 |
+
print("n_workers:", args.n_workers)
|
| 128 |
+
print("force_cpu:", args.force_cpu)
|
| 129 |
+
print("")
|
| 130 |
+
print("batch_size:", args.batch_size)
|
| 131 |
+
print("")
|
| 132 |
+
print("rpr:", args.rpr)
|
| 133 |
+
print("max_sequence:", args.max_sequence)
|
| 134 |
+
print("n_layers:", args.n_layers)
|
| 135 |
+
print("num_heads:", args.num_heads)
|
| 136 |
+
print("d_model:", args.d_model)
|
| 137 |
+
print("")
|
| 138 |
+
print("dim_feedforward:", args.dim_feedforward)
|
| 139 |
+
print(SEPERATOR)
|
| 140 |
+
print("")
|
| 141 |
+
|
| 142 |
+
# parse_generate_args
|
| 143 |
+
def parse_generate_args():
|
| 144 |
+
"""
|
| 145 |
+
----------
|
| 146 |
+
Author: Damon Gwinn
|
| 147 |
+
----------
|
| 148 |
+
Argparse arguments for generation
|
| 149 |
+
----------
|
| 150 |
+
"""
|
| 151 |
+
|
| 152 |
+
parser = argparse.ArgumentParser()
|
| 153 |
+
|
| 154 |
+
parser.add_argument("-midi_root", type=str, default="./dataset/e_piano/", help="Midi file to prime the generator with")
|
| 155 |
+
parser.add_argument("-output_dir", type=str, default="./gen", help="Folder to write generated midi to")
|
| 156 |
+
parser.add_argument("-primer_file", type=str, default=None, help="File path or integer index to the evaluation dataset. Default is to select a random index.")
|
| 157 |
+
parser.add_argument("--force_cpu", action="store_true", help="Forces model to run on a cpu even when gpu is available")
|
| 158 |
+
|
| 159 |
+
parser.add_argument("-target_seq_length", type=int, default=1024, help="Target length you'd like the midi to be")
|
| 160 |
+
parser.add_argument("-num_prime", type=int, default=256, help="Amount of messages to prime the generator with")
|
| 161 |
+
parser.add_argument("-model_weights", type=str, default="./saved_models/model.pickle", help="Pickled model weights file saved with torch.save and model.state_dict()")
|
| 162 |
+
parser.add_argument("-beam", type=int, default=0, help="Beam search k. 0 for random probability sample and 1 for greedy")
|
| 163 |
+
|
| 164 |
+
parser.add_argument("--rpr", action="store_true", help="Use a modified Transformer for Relative Position Representations")
|
| 165 |
+
parser.add_argument("-max_sequence", type=int, default=2048, help="Maximum midi sequence to consider")
|
| 166 |
+
parser.add_argument("-n_layers", type=int, default=6, help="Number of decoder layers to use")
|
| 167 |
+
parser.add_argument("-num_heads", type=int, default=8, help="Number of heads to use for multi-head attention")
|
| 168 |
+
parser.add_argument("-d_model", type=int, default=512, help="Dimension of the model (output dim of embedding layers, etc.)")
|
| 169 |
+
|
| 170 |
+
parser.add_argument("-dim_feedforward", type=int, default=1024, help="Dimension of the feedforward layer")
|
| 171 |
+
|
| 172 |
+
return parser.parse_args()
|
| 173 |
+
|
| 174 |
+
# print_generate_args
|
| 175 |
+
def print_generate_args(args):
|
| 176 |
+
"""
|
| 177 |
+
----------
|
| 178 |
+
Author: Damon Gwinn
|
| 179 |
+
----------
|
| 180 |
+
Prints generation arguments
|
| 181 |
+
----------
|
| 182 |
+
"""
|
| 183 |
+
|
| 184 |
+
print(SEPERATOR)
|
| 185 |
+
print("midi_root:", args.midi_root)
|
| 186 |
+
print("output_dir:", args.output_dir)
|
| 187 |
+
print("primer_file:", args.primer_file)
|
| 188 |
+
print("force_cpu:", args.force_cpu)
|
| 189 |
+
print("")
|
| 190 |
+
print("target_seq_length:", args.target_seq_length)
|
| 191 |
+
print("num_prime:", args.num_prime)
|
| 192 |
+
print("model_weights:", args.model_weights)
|
| 193 |
+
print("beam:", args.beam)
|
| 194 |
+
print("")
|
| 195 |
+
print("rpr:", args.rpr)
|
| 196 |
+
print("max_sequence:", args.max_sequence)
|
| 197 |
+
print("n_layers:", args.n_layers)
|
| 198 |
+
print("num_heads:", args.num_heads)
|
| 199 |
+
print("d_model:", args.d_model)
|
| 200 |
+
print("")
|
| 201 |
+
print("dim_feedforward:", args.dim_feedforward)
|
| 202 |
+
print(SEPERATOR)
|
| 203 |
+
print("")
|
| 204 |
+
|
| 205 |
+
# write_model_params
|
| 206 |
+
def write_model_params(args, output_file):
|
| 207 |
+
"""
|
| 208 |
+
----------
|
| 209 |
+
Author: Damon Gwinn
|
| 210 |
+
----------
|
| 211 |
+
Writes given training parameters to text file
|
| 212 |
+
----------
|
| 213 |
+
"""
|
| 214 |
+
|
| 215 |
+
o_stream = open(output_file, "w")
|
| 216 |
+
|
| 217 |
+
o_stream.write("rpr: " + str(args.rpr) + "\n")
|
| 218 |
+
o_stream.write("lr: " + str(args.lr) + "\n")
|
| 219 |
+
o_stream.write("ce_smoothing: " + str(args.ce_smoothing) + "\n")
|
| 220 |
+
o_stream.write("batch_size: " + str(args.batch_size) + "\n")
|
| 221 |
+
o_stream.write("max_sequence: " + str(args.max_sequence) + "\n")
|
| 222 |
+
o_stream.write("n_layers: " + str(args.n_layers) + "\n")
|
| 223 |
+
o_stream.write("num_heads: " + str(args.num_heads) + "\n")
|
| 224 |
+
o_stream.write("d_model: " + str(args.d_model) + "\n")
|
| 225 |
+
o_stream.write("dim_feedforward: " + str(args.dim_feedforward) + "\n")
|
| 226 |
+
o_stream.write("dropout: " + str(args.dropout) + "\n")
|
| 227 |
+
|
| 228 |
+
o_stream.close()
|
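The parse_*/print_* pairs above are meant to be called from separate entry-point scripts. As a minimal sketch (the train.py filename and the surrounding training internals are assumptions, not part of this commit), a training script would typically wire them together like this:

    # Hypothetical train.py sketch -- not part of this commit.
    from utilities.argument_funcs import parse_train_args, print_train_args, write_model_params

    args = parse_train_args()    # read CLI flags such as -lr, -batch_size, --rpr
    print_train_args(args)       # echo the run configuration to stdout
    write_model_params(args, "./saved_models/model_params.txt")  # persist hyperparameters for later reference
    # ... build the model from args.n_layers, args.num_heads, args.d_model, etc., then train ...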
utilities/constants.py
ADDED
@@ -0,0 +1,28 @@
import torch

from processor import RANGE_NOTE_ON, RANGE_NOTE_OFF, RANGE_VEL, RANGE_TIME_SHIFT

SEPERATOR = "========================="

# Taken from the paper
ADAM_BETA_1 = 0.9
ADAM_BETA_2 = 0.98
ADAM_EPSILON = 10e-9

LR_DEFAULT_START = 1.0
SCHEDULER_WARMUP_STEPS = 4000
# LABEL_SMOOTHING_E = 0.1

# DROPOUT_P = 0.1

TOKEN_END = RANGE_NOTE_ON + RANGE_NOTE_OFF + RANGE_VEL + RANGE_TIME_SHIFT
TOKEN_PAD = TOKEN_END + 1

VOCAB_SIZE = TOKEN_PAD + 1

TORCH_FLOAT = torch.float32
TORCH_INT = torch.int32

TORCH_LABEL_TYPE = torch.long

PREPEND_ZEROS_WIDTH = 4
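The vocabulary constants are derived from the event ranges exported by processor.py. Assuming the usual Magenta-style event split of 128 note-on, 128 note-off, 32 velocity, and 100 time-shift events (these exact numbers are an assumption here; the authoritative values live in processor.py), the derived sizes work out as:

    # Hypothetical check of the derived vocabulary sizes (range values assumed, see processor.py).
    RANGE_NOTE_ON, RANGE_NOTE_OFF, RANGE_VEL, RANGE_TIME_SHIFT = 128, 128, 32, 100

    TOKEN_END = RANGE_NOTE_ON + RANGE_NOTE_OFF + RANGE_VEL + RANGE_TIME_SHIFT   # 388
    TOKEN_PAD = TOKEN_END + 1                                                   # 389
    VOCAB_SIZE = TOKEN_PAD + 1                                                  # 390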
utilities/device.py
ADDED
@@ -0,0 +1,73 @@
# For all things related to devices
#### ONLY USE PROVIDED FUNCTIONS, DO NOT USE GLOBAL CONSTANTS ####

import torch
import os

# change cuda devices to ones that are available after running nvidia-smi.
os.environ["CUDA_VISIBLE_DEVICES"] = '3,4,5'

TORCH_CPU_DEVICE = torch.device("cpu")

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if(torch.cuda.device_count() > 0):
    TORCH_CUDA_DEVICE = torch.device("cuda")

else:
    print("----- WARNING: CUDA devices not detected. This will cause the model to run very slow! -----")
    print("")
    TORCH_CUDA_DEVICE = None

USE_CUDA = True

# use_cuda
def use_cuda(cuda_bool):
    """
    ----------
    Author: Damon Gwinn
    ----------
    Sets whether to use CUDA (if available), or use the CPU (not recommended)
    ----------
    """

    global USE_CUDA
    USE_CUDA = cuda_bool

# get_device
def get_device():
    """
    ----------
    Author: Damon Gwinn
    ----------
    Grabs the default device. Default device is CUDA if available and use_cuda is not False, CPU otherwise.
    ----------
    """

    if((not USE_CUDA) or (TORCH_CUDA_DEVICE is None)):
        return TORCH_CPU_DEVICE
    else:
        return TORCH_CUDA_DEVICE

# cuda_device
def cuda_device():
    """
    ----------
    Author: Damon Gwinn
    ----------
    Grabs the cuda device (may be None if CUDA is not available)
    ----------
    """

    return TORCH_CUDA_DEVICE

# cpu_device
def cpu_device():
    """
    ----------
    Author: Damon Gwinn
    ----------
    Grabs the cpu device
    ----------
    """

    return TORCH_CPU_DEVICE
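Because the module caches the CUDA device at import time and gates it behind the USE_CUDA flag, callers switch devices only through the provided functions. A minimal usage sketch:

    # Minimal usage sketch of the device helpers above.
    import torch
    from utilities.device import use_cuda, get_device

    use_cuda(False)                               # force CPU even if a GPU is visible
    x = torch.zeros(4, device=get_device())       # allocated on the CPU device

    use_cuda(True)                                # prefer CUDA again; falls back to CPU if none was detected at import
    model_device = get_device()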
utilities/lr_scheduling.py
ADDED
@@ -0,0 +1,65 @@
# Library Imports
import math

# Using Adam optimizer with
# Beta_1=0.9, Beta_2=0.98, and Epsilon=10^-9

# Learning rate varies over the course of training:
# lrate = (1/sqrt(d_model)) * min(1/sqrt(step_num), step_num/(warmup_steps*sqrt(warmup_steps)))

# LrStepTracker
class LrStepTracker:
    """
    ----------
    Author: Ryan Marshall
    Modified: Damon Gwinn
    ----------
    Class for custom learn rate scheduler (to be used by torch.optim.lr_scheduler.LambdaLR).

    Learn rate for each step (batch) given the warmup steps is:
        lr = [ 1/sqrt(d_model) ] * min[ 1/sqrt(step) , step * (warmup_steps)^-1.5 ]

    This is from Attention is All you Need (https://arxiv.org/abs/1706.03762)
    ----------
    """

    def __init__(self, model_dim=512, warmup_steps=4000, init_steps=0):
        # Store Values
        self.warmup_steps = warmup_steps
        self.model_dim = model_dim
        self.init_steps = init_steps

        # Begin Calculations
        self.invsqrt_dim = (1 / math.sqrt(model_dim))
        self.invsqrt_warmup = (1 / (warmup_steps * math.sqrt(warmup_steps)))

    # step
    def step(self, step):
        """
        ----------
        Author: Ryan Marshall
        Modified: Damon Gwinn
        ----------
        Method to pass to LambdaLR. Increments the step and computes the new learn rate.
        ----------
        """

        step += self.init_steps
        if(step <= self.warmup_steps):
            return self.invsqrt_dim * self.invsqrt_warmup * step
        else:
            invsqrt_step = (1 / math.sqrt(step))
            return self.invsqrt_dim * invsqrt_step

# get_lr
def get_lr(optimizer):
    """
    ----------
    Author: Damon Gwinn
    ----------
    Hack to get the current learn rate of the model
    ----------
    """

    for param_group in optimizer.param_groups:
        return param_group['lr']
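As the docstring notes, LrStepTracker.step is meant to be handed to torch.optim.lr_scheduler.LambdaLR as the multiplicative factor, with the optimizer's base learning rate set to LR_DEFAULT_START (1.0) so the tracker fully controls the schedule. A minimal wiring sketch, assuming `model` is an already-constructed MusicTransformer:

    # Minimal sketch: hooking LrStepTracker into LambdaLR (model is assumed to exist).
    import torch
    from torch.optim.lr_scheduler import LambdaLR
    from utilities.constants import LR_DEFAULT_START, ADAM_BETA_1, ADAM_BETA_2, ADAM_EPSILON, SCHEDULER_WARMUP_STEPS
    from utilities.lr_scheduling import LrStepTracker

    lr_tracker = LrStepTracker(model_dim=512, warmup_steps=SCHEDULER_WARMUP_STEPS)
    opt = torch.optim.Adam(model.parameters(), lr=LR_DEFAULT_START,
                           betas=(ADAM_BETA_1, ADAM_BETA_2), eps=ADAM_EPSILON)
    lr_scheduler = LambdaLR(opt, lr_tracker.step)   # LambdaLR multiplies the base lr (1.0) by step()'s return value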
utilities/run_model.py
ADDED
@@ -0,0 +1,95 @@
import torch
import time

from .constants import *
from utilities.device import get_device
from .lr_scheduling import get_lr

from dataset.e_piano import compute_epiano_accuracy


# train_epoch
def train_epoch(cur_epoch, model, dataloader, loss, opt, lr_scheduler=None, print_modulus=1):
    """
    ----------
    Author: Damon Gwinn
    ----------
    Trains a single model epoch
    ----------
    """

    out = -1
    model.train()
    for batch_num, batch in enumerate(dataloader):
        time_before = time.time()

        opt.zero_grad()

        x = batch[0].to(get_device())
        tgt = batch[1].to(get_device())

        y = model(x)

        y = y.reshape(y.shape[0] * y.shape[1], -1)
        tgt = tgt.flatten()

        out = loss.forward(y, tgt)

        out.backward()
        opt.step()

        if(lr_scheduler is not None):
            lr_scheduler.step()

        time_after = time.time()
        time_took = time_after - time_before

        if((batch_num+1) % print_modulus == 0):
            print(SEPERATOR)
            print("Epoch", cur_epoch, " Batch", batch_num+1, "/", len(dataloader))
            print("LR:", get_lr(opt))
            print("Train loss:", float(out))
            print("")
            print("Time (s):", time_took)
            print(SEPERATOR)
            print("")

    return

# eval_model
def eval_model(model, dataloader, loss):
    """
    ----------
    Author: Damon Gwinn
    ----------
    Evaluates the model and returns the average loss and accuracy
    ----------
    """

    model.eval()

    avg_acc = -1
    avg_loss = -1
    with torch.set_grad_enabled(False):
        n_test = len(dataloader)
        sum_loss = 0.0
        sum_acc = 0.0
        for batch in dataloader:
            x = batch[0].to(get_device())
            tgt = batch[1].to(get_device())

            y = model(x)

            sum_acc += float(compute_epiano_accuracy(y, tgt))

            y = y.reshape(y.shape[0] * y.shape[1], -1)
            tgt = tgt.flatten()

            out = loss.forward(y, tgt)

            sum_loss += float(out)

        avg_loss = sum_loss / n_test
        avg_acc = sum_acc / n_test

    return avg_loss, avg_acc
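Taken together, these two helpers are typically driven by an outer epoch loop. A minimal sketch, assuming the model, dataloaders, loss, optimizer, and scheduler are built elsewhere (none of that construction is shown in this commit):

    # Minimal sketch of an outer training loop using train_epoch / eval_model.
    # model, train_loader, test_loader, loss_fn, opt, lr_scheduler, and epochs are assumed to exist.
    from utilities.run_model import train_epoch, eval_model

    for epoch in range(1, epochs + 1):
        train_epoch(epoch, model, train_loader, loss_fn, opt, lr_scheduler, print_modulus=1)
        avg_loss, avg_acc = eval_model(model, test_loader, loss_fn)
        print("Epoch", epoch, "eval loss:", avg_loss, "eval acc:", avg_acc)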