ORI_FOLDER = ''          # source folder of original .abc files
INTERLEAVED_FOLDER = ''  # output folder for the interleaved (rotated) copies
AUGMENTED_FOLDER = ''    # output folder for the key-transposed, reduced copies
EVAL_SPLIT = 0.1         # fraction of pieces held out for evaluation

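# Pipeline overview (as implemented below): every .abc file in ORI_FOLDER is
# cleaned, written once in interleaved form to INTERLEAVED_FOLDER, and written
# once per key (transposed and bar-reduced) to AUGMENTED_FOLDER/<key>/.
# JSONL index files (full, eval, train) are written next to AUGMENTED_FOLDER.
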
import os
import re
import json
import random
from tqdm import tqdm
from abctoolkit.utils import (
    remove_information_field,
    remove_bar_no_annotations,
    Quote_re,
    Barlines,
    extract_metadata_and_parts,
    extract_global_and_local_metadata,
    extract_barline_and_bartext_dict)
from abctoolkit.convert import unidecode_abc_lines
from abctoolkit.rotate import rotate_abc
from abctoolkit.check import check_alignment_unrotated
from abctoolkit.transpose import Key2index, transpose_an_abc_text

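# The keys of Key2index (key names such as 'C') serve double duty below: they
# name the per-key output sub-folders and define the set of transposition
# targets for augmentation.
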
os.makedirs(INTERLEAVED_FOLDER, exist_ok=True)
os.makedirs(AUGMENTED_FOLDER, exist_ok=True)
for key in Key2index.keys():
    # one sub-folder per target key for the transposed copies
    key_folder = os.path.join(AUGMENTED_FOLDER, key)
    os.makedirs(key_folder, exist_ok=True)


def abc_preprocess_pipeline(abc_path):
    """Clean a single ABC file, write its interleaved version, and write one
    transposed, bar-reduced version per key in Key2index.

    Returns (abc_name, ori_key), where ori_key is the original key signature.
    """

    with open(abc_path, 'r', encoding='utf-8') as f:
        abc_lines = f.readlines()

    # drop blank lines
    abc_lines = [line for line in abc_lines if line.strip() != '']

    # transliterate non-ASCII characters to their closest ASCII equivalents
    abc_lines = unidecode_abc_lines(abc_lines)

    # strip information fields that carry no musical content:
    # X: (reference no.), T: (title), C: (composer), W:/w: (lyrics),
    # Z: (transcription) and %%MIDI directives
    abc_lines = remove_information_field(
        abc_lines=abc_lines,
        info_fields=['X:', 'T:', 'C:', 'W:', 'w:', 'Z:', '%%MIDI'])

    # strip bar-number annotations
    abc_lines = remove_bar_no_annotations(abc_lines)

    # remove escaped quotes (\") from music lines
    for i, line in enumerate(abc_lines):
        if re.search(r'^[A-Za-z]:', line) or line.startswith('%'):
            continue  # skip information fields and comment lines
        if r'\"' in line:
            abc_lines[i] = abc_lines[i].replace(r'\"', '')

    # drop quoted annotations that contain barline symbols, since they would
    # confuse bar splitting later on
    for i, line in enumerate(abc_lines):
        quote_contents = re.findall(Quote_re, line)
        for quote_content in quote_contents:
            for barline in Barlines:
                if barline in quote_content:
                    line = line.replace(quote_content, '')
        abc_lines[i] = line

    # verify that every voice contains the same number of bars
    _, bar_no_equal_flag, _ = check_alignment_unrotated(abc_lines)
    if not bar_no_equal_flag:
        raise ValueError(f'{abc_path}: unequal bar number across voices')

    # tidy the remaining quoted annotations: drop empty ones, and for text
    # annotations (starting with ^ or _) collapse runs of a repeated
    # non-alphanumeric character, dropping any annotation still over 40 chars
    for i, line in enumerate(abc_lines):
        quote_matches = re.findall(r'"[^"]*"', line)
        for match in quote_matches:
            if match == '""':
                line = line.replace(match, '')
            elif match[1] in ['^', '_']:
                sub_string = re.sub(r'([^a-zA-Z0-9])\1+', r'\1', match)
                if len(sub_string) <= 40:
                    line = line.replace(match, sub_string)
                else:
                    line = line.replace(match, '')
        abc_lines[i] = line

    abc_name = os.path.splitext(os.path.basename(abc_path))[0]

    # extract metadata and remember the original key ('K:none' is treated as C)
    metadata_lines, part_text_dict = extract_metadata_and_parts(abc_lines)
    global_metadata_dict, local_metadata_dict = extract_global_and_local_metadata(metadata_lines)
    if global_metadata_dict['K'][0] == 'none':
        global_metadata_dict['K'][0] = 'C'
    ori_key = global_metadata_dict['K'][0]

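    # Note: rotate_abc (from abctoolkit.rotate) produces the "interleaved"
    # layout, in which the bars of all voices are grouped together rather than
    # listed voice by voice; the exact line format is defined by abctoolkit.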
    interleaved_abc = rotate_abc(abc_lines)
    interleaved_path = os.path.join(INTERLEAVED_FOLDER, abc_name + '.abc')
    with open(interleaved_path, 'w', encoding='utf-8') as w:
        w.writelines(interleaved_abc)

    # write one transposed, reduced version per target key
    for key in Key2index.keys():
        transposed_abc_text = transpose_an_abc_text(abc_lines, key)
        transposed_abc_lines = transposed_abc_text.split('\n')
        transposed_abc_lines = list(filter(None, transposed_abc_lines))
        transposed_abc_lines = [line + '\n' for line in transposed_abc_lines]

        # rebuild the body bar by bar, keeping only the voices that actually
        # sound in each bar (bars holding nothing but rests Z/z/X/x are dropped)
        metadata_lines, prefix_dict, left_barline_dict, bar_text_dict, right_barline_dict = \
            extract_barline_and_bartext_dict(transposed_abc_lines)
        reduced_abc_lines = metadata_lines
        for i in range(len(bar_text_dict['V:1'])):  # bar count taken from voice 'V:1'
            line = ''
            for symbol in prefix_dict.keys():
                valid_flag = False
                for char in bar_text_dict[symbol][i]:
                    if char.isalpha() and char not in ['Z', 'z', 'X', 'x']:
                        valid_flag = True
                        break
                if valid_flag:
                    if i == 0:
                        # the first bar carries the voice prefix and left barline
                        part_patch = '[' + symbol + ']' + prefix_dict[symbol] + \
                                     left_barline_dict[symbol][0] + \
                                     bar_text_dict[symbol][0] + right_barline_dict[symbol][0]
                    else:
                        part_patch = '[' + symbol + ']' + bar_text_dict[symbol][i] + right_barline_dict[symbol][i]
                    line += part_patch
            line += '\n'
            reduced_abc_lines.append(line)

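        # Illustrative shape of one reduced line (hypothetical bar contents):
        #   [V:1]cdef gabc|[V:2]C4 E4|
        # i.e. each sounding voice contributes [label] + bar text + barline.
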
        reduced_abc_name = abc_name + '_' + key
        reduced_abc_path = os.path.join(AUGMENTED_FOLDER, key, reduced_abc_name + '.abc')
        with open(reduced_abc_path, 'w', encoding='utf-8') as w:
            w.writelines(reduced_abc_lines)

    return abc_name, ori_key


if __name__ == '__main__':

    data = []
    file_list = os.listdir(ORI_FOLDER)
    for file in tqdm(file_list):
        ori_abc_path = os.path.join(ORI_FOLDER, file)
        try:
            abc_name, ori_key = abc_preprocess_pipeline(ori_abc_path)
        except Exception as e:
            print(ori_abc_path, 'failed to pre-process:', e)
            continue

        data.append({
            'path': os.path.join(AUGMENTED_FOLDER, abc_name),
            'key': ori_key
        })

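    # Each record has the form {"path": "<AUGMENTED_FOLDER>/<name>", "key": "<ori_key>"}.
    # The stored path carries neither the per-key sub-folder nor the '_<key>'
    # suffix of the files written above; presumably the index consumer appends
    # those for whichever key it needs.
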
    # shuffle, then split off the first EVAL_SPLIT fraction for evaluation
    random.shuffle(data)
    eval_data = data[:int(EVAL_SPLIT * len(data))]
    train_data = data[int(EVAL_SPLIT * len(data)):]
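    # e.g. with 1000 pieces and EVAL_SPLIT = 0.1, the first 100 shuffled
    # records become eval_data and the remaining 900 become train_data
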
    data_index_path = AUGMENTED_FOLDER + '.jsonl'
    eval_index_path = AUGMENTED_FOLDER + '_eval.jsonl'
    train_index_path = AUGMENTED_FOLDER + '_train.jsonl'

    # write the full, eval, and train indices as JSON Lines
    with open(data_index_path, 'w', encoding='utf-8') as w:
        for d in data:
            w.write(json.dumps(d) + '\n')
    with open(eval_index_path, 'w', encoding='utf-8') as w:
        for d in eval_data:
            w.write(json.dumps(d) + '\n')
    with open(train_index_path, 'w', encoding='utf-8') as w:
        for d in train_data:
            w.write(json.dumps(d) + '\n')
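
# To run this script (the file name data_preprocess.py is hypothetical):
# fill in the folder constants at the top, then
#   python data_preprocess.py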