# Symbol inventory used to turn text into integer ids: one padding symbol,
# punctuation, ASCII letters, then the characters of ``_letters_ipa``.
# A symbol's position in the concatenated list is its id, so the order of
# these strings must not change.
_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# Flat list of every symbol, in id order.
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)

# Symbol -> id lookup table.  The apostrophe appears twice in
# ``_letters_ipa``; as with a plain index loop, the later position wins, so
# this table has one entry fewer than ``symbols``.
dicts = {symbol: index for index, symbol in enumerate(symbols)}


class TextCleaner:
    """Map the characters of a text to integer symbol ids.

    Lookups go through ``word_index_dictionary``, which is set to the
    module-level ``dicts`` table when the instance is created.
    """

    def __init__(self, dummy=None):
        """Attach the module-level symbol table to this instance.

        Args:
            dummy: Not used by this class.
        """
        self.word_index_dictionary = dicts
        # Print the number of distinct symbols in the table on construction.
        print(len(dicts))

    def __call__(self, text):
        """Return the ids of the characters in *text*, in order.

        Characters that are not in the table are left out of the result.
        For each such character the full input text is printed, so the
        returned list can be shorter than *text*.

        Args:
            text: Iterable of single-character keys, normally a string.

        Returns:
            A list with one id per character that was found in the table.
        """
        lookup = self.word_index_dictionary
        indexes = []
        for char in text:
            try:
                indexes.append(lookup[char])
            except KeyError:
                # Unknown symbol: skip it and show the text it came from.
                print(text)
        return indexes