from miditok import REMI, TokenizerConfig
from random import shuffle, sample
from pathlib import Path

# Tokenizer configuration.
# Maps beat ranges to a resolution value; the (0, 1) range gets the highest one.
BEAT_RES = {
    (0, 1): 12,
    (1, 2): 4,
    (2, 4): 2,
    (4, 8): 1,
}
TOKENIZER_PARAMS = {
    # Pitch, timing and velocity settings.
    "pitch_range": (21, 108),
    "beat_res": BEAT_RES,
    "num_velocities": 32,
    # Special tokens added to the vocabulary.
    "special_tokens": ["PAD", "BOS", "EOS"],
    # Optional token types that are switched on.
    "use_chords": True,
    "use_rests": True,
    "use_tempos": True,
    "use_time_signatures": True,
    # Program tokens are disabled because we want a single track.
    "use_programs": False,
    "one_token_stream_for_programs": False,
    # Programs 0..127 only; -1 (drums) is deliberately left out.
    "programs": list(range(128)),
    # Tempo settings: number of tempo values and (min_tempo, max_tempo).
    "num_tempos": 32,
    "tempo_range": (40, 250),
}
config = TokenizerConfig(**TOKENIZER_PARAMS)

# Instantiate the REMI tokenizer from the configuration above.
tokenizer = REMI(config)

# Base data directory, and the directory the tokenizer file is saved into.
root_data_dir = Path('/root')
root_save = root_data_dir / 'HuggingFace_Mistral_Transformer_Single_Instrument'

# File name used for the saved tokenizer.
tokenizer_name = "HuggingFace_Mistral_Transformer_Single_Instrument_v4_single_track.json"


# Recursively collect every .mid and .midi file below each data directory.
data_dirs = ["MIDIs"]
midi_paths = []
for dir_name in data_dirs:
    resolved_dir = (root_data_dir / dir_name).resolve()
    # Keep the original ordering: all .mid matches first, then all .midi.
    for pattern in ("**/*.mid", "**/*.midi"):
        midi_paths.extend(resolved_dir.glob(pattern))

print(f"Found {len(midi_paths)} MIDI files")

# Shuffle the list in place. random.shuffle() returns None and lists have no
# .shuffle() method, so the result must not be assigned back to midi_paths.
shuffle(midi_paths)

# We need a subset of files, otherwise training the tokenizer takes too long
percentage_to_select = 0.20
num_files_to_select = int(len(midi_paths) * percentage_to_select)

# Draw the training subset at random, without replacement.
subset_midi_paths = sample(midi_paths, num_files_to_select)
print(f"Found {len(subset_midi_paths)} MIDI files")


# Train the tokenizer on the subset with a vocabulary size of 24000, then
# save it under root_save using tokenizer_name as the file name.
tokenizer.train(
    vocab_size=24000,
    files_paths=subset_midi_paths,
)
tokenizer.save(root_save / tokenizer_name)