| import csv |
| import pathlib |
|
|
| import click |
|
|
|
|
def _phonemes_from_dictionary(dictionary_path):
    """Build (vowels, consonants) sets from a tab-separated syllable dictionary.

    Each non-blank line must look like ``syllable<TAB>phonemes`` where
    ``phonemes`` is one or two whitespace-separated symbols. A single symbol
    is recorded as a vowel; with two symbols the first is recorded as a
    consonant and the second as a vowel. 'SP' and 'AP' are always vowels.

    Raises:
        click.ClickException: on a malformed line or more than two phonemes.
    """
    vowels = {'SP', 'AP'}
    consonants = set()
    with open(dictionary_path, 'r', encoding='utf8') as f:
        for line_no, line in enumerate(f, start=1):
            # Blank lines carry no rule; skipping them avoids an unpack crash.
            if not line.strip():
                continue
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) != 2:
                raise click.ClickException(
                    f'Malformed dictionary entry at line {line_no}: '
                    f'expected exactly one tab separator, got {line!r}.'
                )
            phonemes = fields[1].split()
            if not phonemes:
                raise click.ClickException(
                    f'Dictionary entry at line {line_no} has no phonemes.'
                )
            if len(phonemes) > 2:
                raise click.ClickException(
                    'We only support two-phase dictionaries for automatically adding ph_num.'
                )
            if len(phonemes) == 2:
                consonants.add(phonemes[0])
            vowels.add(phonemes[-1])
    return vowels, consonants


def _phonemes_from_lists(vowels_path, consonants_path):
    """Build (vowels, consonants) sets from two whitespace-separated list files.

    'SP' and 'AP' are always added to the vowel set.

    Raises:
        click.ClickException: if a symbol appears in both sets.
    """
    vowels = {'SP', 'AP'}
    consonants = set()
    with open(vowels_path, 'r', encoding='utf8') as f:
        vowels.update(f.read().split())
    with open(consonants_path, 'r', encoding='utf8') as f:
        consonants.update(f.read().split())
    overlapped = vowels & consonants
    if overlapped:
        raise click.ClickException(
            'Vowel set and consonant set overlapped. The following phonemes '
            'appear both as vowels and as consonants:\n'
            f'{sorted(overlapped)}'
        )
    return vowels, consonants


def _count_ph_num(ph_seq, consonants):
    """Return the group sizes obtained by splitting ``ph_seq`` into runs.

    A new group starts at the first phoneme and at every later phoneme that
    is not a consonant; consonants after the first position extend the
    current group. An empty sequence yields an empty list.
    """
    counts = []
    for idx, ph in enumerate(ph_seq):
        if idx == 0 or ph not in consonants:
            counts.append(1)
        else:
            counts[-1] += 1
    return counts


@click.command(help='Add ph_num attribute into transcriptions.csv')
@click.argument('transcription', metavar='TRANSCRIPTIONS')
@click.option('--dictionary', metavar='DICTIONARY')
@click.option('--vowels', metavar='FILE')
@click.option('--consonants', metavar='FILE')
def add_ph_num(
        transcription: str,
        dictionary: str = None,
        vowels: str = None,
        consonants: str = None
):
    """Compute a ``ph_num`` column for every row and rewrite the CSV in place.

    Args:
        transcription: Path to the CSV file; it must have a ``ph_seq`` column
            of whitespace-separated phonemes.
        dictionary: Optional path to a tab-separated syllable dictionary used
            to derive the vowel and consonant sets.
        vowels: Optional path to a whitespace-separated vowel list; used
            together with ``consonants`` when no dictionary is given.
        consonants: Optional path to a whitespace-separated consonant list.

    Raises:
        click.UsageError: if neither a dictionary nor both list files are given.
        click.ClickException: on malformed input files or unknown phonemes.
    """
    # Explicit raises instead of ``assert`` so validation survives ``python -O``.
    if dictionary is None and (vowels is None or consonants is None):
        raise click.UsageError(
            'Either dictionary file or vowels and consonants file should be specified.'
        )

    if dictionary is not None:
        # NOTE(review): dictionary-derived sets are not checked for overlap,
        # only the explicit list files are - confirm this matches the intent.
        vowel_set, consonant_set = _phonemes_from_dictionary(
            pathlib.Path(dictionary).resolve()
        )
    else:
        vowel_set, consonant_set = _phonemes_from_lists(
            pathlib.Path(vowels).resolve(),
            pathlib.Path(consonants).resolve(),
        )

    transcription_path = pathlib.Path(transcription).resolve()
    # newline='' lets the csv module handle line endings itself.
    with open(transcription_path, 'r', encoding='utf8', newline='') as f:
        items = list(csv.DictReader(f))

    if not items:
        # Nothing to annotate. Returning before the file is reopened for
        # writing keeps a header-only file intact instead of truncating it.
        return
    if 'ph_seq' not in items[0]:
        raise click.ClickException(
            f'Column \'ph_seq\' not found in \'{transcription_path}\'.'
        )

    known = vowel_set | consonant_set
    # All rows are validated and annotated before the file is touched.
    for row_no, item in enumerate(items, start=1):
        ph_seq = item['ph_seq'].split()
        for ph in ph_seq:
            if ph not in known:
                name = item.get('name', f'row {row_no}')
                raise click.ClickException(
                    f"Invalid phoneme symbol '{ph}' in '{name}'."
                )
        item['ph_num'] = ' '.join(
            str(count) for count in _count_ph_num(ph_seq, consonant_set)
        )

    with open(transcription_path, 'w', encoding='utf8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=list(items[0]))
        writer.writeheader()
        writer.writerows(items)
|
|
|
|
# Run the click command only when this file is executed as a script.
if __name__ == "__main__":
    add_ph_num()
|
|