"""Train a REMI tokenizer on a random subset of a MIDI collection and save it.

The script gathers every ``.mid`` / ``.midi`` file found under the configured
data directories, draws a random 20% subset (training on the full set takes
too long), trains the tokenizer's vocabulary on that subset, and writes the
result to ``root_save / tokenizer_name``.
"""
from pathlib import Path
from random import sample, shuffle

from miditok import REMI, TokenizerConfig

# Our tokenizer's configuration
BEAT_RES = {(0, 1): 12, (1, 2): 4, (2, 4): 2, (4, 8): 1}
TOKENIZER_PARAMS = {
    "pitch_range": (21, 108),
    "beat_res": BEAT_RES,
    "num_velocities": 32,
    "special_tokens": ["PAD", "BOS", "EOS"],
    "use_chords": True,
    "use_rests": True,
    "use_tempos": True,
    "use_time_signatures": True,
    "use_programs": False,  # We want single track
    "one_token_stream_for_programs": False,  # We want single track
    "programs": list(range(0, 128)),  # -1 drums, skip drums
    "num_tempos": 32,
    "tempo_range": (40, 250),  # (min_tempo, max_tempo)
}
config = TokenizerConfig(**TOKENIZER_PARAMS)

# Creates the tokenizer REMI PLUS
tokenizer = REMI(config)

root_data_dir = Path('/root')
root_save = root_data_dir / 'HuggingFace_Mistral_Transformer_Single_Instrument'
tokenizer_name = "HuggingFace_Mistral_Transformer_Single_Instrument_v4_single_track.json"

data_dirs = ["MIDIs"]

# Collect every MIDI file (both common extensions) below each data directory.
midi_paths = []
for data_dir in data_dirs:
    path = (root_data_dir / data_dir).resolve()
    midi_paths.extend(path.glob("**/*.mid"))
    midi_paths.extend(path.glob("**/*.midi"))

print(f"Found {len(midi_paths)} MIDI files")

# random.shuffle works in place and returns None, so it must be called as a
# function on the list and its result must not be assigned back.
shuffle(midi_paths)

# We need a subset of files otherwise training tokenizer takes too long
percentage_to_select = 0.20
num_files_to_select = int(len(midi_paths) * percentage_to_select)
subset_midi_paths = sample(midi_paths, num_files_to_select)

print(f"Found {len(subset_midi_paths)} MIDI files")

tokenizer.train(
    vocab_size=24000,
    files_paths=subset_midi_paths,
)

# Make sure the output directory exists before writing the tokenizer file.
root_save.mkdir(parents=True, exist_ok=True)
tokenizer.save(root_save / tokenizer_name)