| from popular_domains import emailDomains |
| import jellyfish |
| from typing import List |
| from concurrent.futures import ThreadPoolExecutor |
| import numpy as np |
|
|
class TrieNode:
    """One node of a character trie.

    A node stores the character it was created for, its child nodes, and
    a flag telling whether a complete word terminates here.
    """

    def __init__(self, char: str):
        # False until a word is recorded as ending at this node.
        self.word_end = False
        # Child nodes; Trie.add keys each child by the character it holds.
        self.children = {}
        # Character this node was created for.
        self.char = char
|
|
class Trie:
    """Prefix tree of strings with insertion and exact-word lookup."""

    def __init__(self):
        # The root holds the empty string and has no children yet.
        self.root = TrieNode('')

    def add(self, word: str):
        """Insert *word*, creating any nodes missing along its path."""
        current = self.root
        for letter in word:
            following = current.children.get(letter)
            if following is None:
                following = TrieNode(letter)
                current.children[letter] = following
            current = following
        # Mark the last node reached so search() can tell words from prefixes.
        current.word_end = True

    def search(self, word: str) -> bool:
        """Return True only if *word* was added as a complete word.

        A string that is merely a prefix of an added word yields False.
        """
        current = self.root
        for letter in word:
            try:
                current = current.children[letter]
            except KeyError:
                # The path breaks off here, so the word was never added.
                return False
        return current.word_end
|
|
def suggest_email_domain(
    domain: str,
    valid_domains: List[str],
    *,
    max_distance: int = 2,
) -> List[str]:
    """Suggest known domains that a possibly mistyped *domain* may stand for.

    Two groups of suggestions are produced and concatenated:

    1. The valid domains whose Damerau-Levenshtein distance to *domain* is
       the smallest one found, provided that distance does not exceed
       *max_distance*. This group is sorted alphabetically.
    2. The valid domains whose Soundex code equals that of *domain* and
       that are not already in the first group, in the order they appear
       in *valid_domains*.

    Args:
        domain: The domain typed by the user.
        valid_domains: Known-good domains to suggest from. Duplicates are
            ignored.
        max_distance: Largest edit distance still treated as a likely typo.

    Returns:
        A plain list of suggestions; empty when nothing matches or when
        *valid_domains* is empty.
    """
    if not valid_domains:
        return []

    # dict.fromkeys drops duplicates while keeping first-seen order.
    candidates = list(dict.fromkeys(valid_domains))

    distance_by_domain = {
        candidate: jellyfish.damerau_levenshtein_distance(domain, candidate)
        for candidate in candidates
    }

    # Only the nearest group is suggested, and only if it is close enough.
    nearest = min(distance_by_domain.values())
    if nearest <= max_distance:
        closest = sorted(
            candidate
            for candidate, distance in distance_by_domain.items()
            if distance == nearest
        )
    else:
        closest = []

    # Set membership keeps the exclusion check O(1) per candidate.
    already_suggested = set(closest)
    target_code = jellyfish.soundex(domain)
    sound_alikes = [
        candidate
        for candidate in candidates
        if candidate not in already_suggested
        and jellyfish.soundex(candidate) == target_code
    ]

    return closest + sound_alikes
|
|
|
|
|
|