-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathsuggestion.py
64 lines (53 loc) · 2.2 KB
/
suggestion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from popular_domains import emailDomains
import jellyfish
from typing import List
from concurrent.futures import ThreadPoolExecutor
import numpy as np
class TrieNode:
def __init__(self, char: str):
self.char = char
self.children = {}
self.word_end = False
class Trie:
def __init__(self):
self.root = TrieNode('')
def add(self, word: str):
node = self.root
for char in word:
if char not in node.children:
node.children[char] = TrieNode(char)
node = node.children[char]
node.word_end = True
def search(self, word: str) -> bool:
node = self.root
for char in word:
if char not in node.children:
return False
node = node.children[char]
return node.word_end
def suggest_email_domain(domain: str, valid_domains: List[str]) -> List[str]:
# Build a trie with valid domains
trie = Trie()
for valid_domain in valid_domains:
trie.add(valid_domain)
# Calculate distances using a faster string distance metric
distances = {}
with ThreadPoolExecutor(max_workers=np.minimum(16, len(valid_domains))) as executor:
for valid_domain, distance in zip(valid_domains, executor.map(lambda x: jellyfish.damerau_levenshtein_distance(domain, x), valid_domains)):
if distance <= 2:
if distance in distances:
if valid_domain not in distances[distance]:
distances[distance].append(valid_domain)
else:
distances[distance] = [valid_domain]
# Choose the most similar domains based on alphabetical order
sorted_domains = np.array([])
if distances:
min_distance = min(distances.keys())
sorted_domains = sorted(distances[min_distance])
sorted_domains = [d for d in sorted_domains if trie.search(d)]
# Check for phonetic similarity using Soundex
soundex_domain = jellyfish.soundex(domain)
phonetically_similar_domains = [d for d in valid_domains if jellyfish.soundex(d) == soundex_domain and d not in sorted_domains]
# Combine and return the results
return sorted_domains + phonetically_similar_domains