Skip to content

Commit 82e3145

Browse files
added error-handler to capture non-strings in input Series
1 parent 4a0b225 commit 82e3145

File tree

2 files changed

+177
-46
lines changed

2 files changed

+177
-46
lines changed

string_grouper/string_grouper.py

+149-39
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,32 @@
4141
# StringGrouper.get_nearest_matches
4242
GROUP_REP_PREFIX: str = 'group_rep_' # used to prefix and name columns of the output of StringGrouper._deduplicate
4343

44+
4445
# High level functions
4546

4647

47-
def compute_pairwise_similarities(string_series_1: pd.Series,
48+
def who(bad_StringGrouper_param, param_1, param_name_1, param_2, param_name_2):
    """Build a descriptive label for the input Series that failed validation.

    Private utility used by the high-level functions (those that call StringGrouper)
    to name the Series argument that caused a StringGrouperNotAllStringsException.

    :param bad_StringGrouper_param: name of the StringGrouper parameter that failed;
        'master' selects param_1, any other value selects param_2.
    :param param_1: object the caller supplied as StringGrouper's master.
    :param param_name_1: name of the caller's own parameter that holds param_1.
    :param param_2: object the caller supplied as StringGrouper's duplicates; may be
        None (or anything without a ``name`` attribute).
    :param param_name_2: name of the caller's own parameter that holds param_2.
    :return: "'<series name>' (<parameter name>)" when the selected Series has a name,
        otherwise just the parameter name.
    """
    if bad_StringGrouper_param == 'master':
        series, param_name = param_1, param_name_1
    else:
        series, param_name = param_2, param_name_2
    # getattr keeps this helper from masking the real error with an AttributeError
    # when the selected argument is None or otherwise has no ``name``.
    series_name = getattr(series, 'name', None)
    # Only None and '' mean "unnamed"; a Series named 0 (e.g. taken from an
    # integer-labelled column) has a real name that is worth reporting.
    if series_name is None or series_name == '':
        return param_name
    return f'\'{series_name}\' ({param_name})'
56+
57+
58+
def add_this_arg(func):
    """Decorator that passes the decorated function a reference to its own wrapper.

    The returned wrapper calls ``func(this, *args, **kwargs)``, where ``this`` is the
    wrapper itself. This lets ``func`` attach attributes (for example ``this.issues``)
    that callers can later read from the public function object.

    The wrapper keeps ``func``'s name, docstring and other metadata, and advertises
    ``func``'s signature without the injected first parameter, so ``help()`` and
    ``inspect.signature`` show the interface callers actually use.

    :param func: function whose first positional parameter receives the wrapper.
    :return: the wrapper function.
    """
    # Local imports keep this block self-contained.
    import functools
    import inspect

    @functools.wraps(func)
    def this(*args, **kwargs):
        return func(this, *args, **kwargs)

    try:
        full_signature = inspect.signature(func)
    except (TypeError, ValueError):
        # func cannot be introspected; leave the wrapper's signature untouched.
        return this
    # Hide the injected leading parameter: callers never supply it themselves.
    this.__signature__ = full_signature.replace(
        parameters=list(full_signature.parameters.values())[1:]
    )
    return this
65+
66+
67+
@add_this_arg
68+
def compute_pairwise_similarities(this,
69+
string_series_1: pd.Series,
4870
string_series_2: pd.Series,
4971
**kwargs) -> pd.Series:
5072
"""
@@ -55,10 +77,21 @@ def compute_pairwise_similarities(string_series_1: pd.Series,
5577
:param kwargs: All other keyword arguments are passed to StringGrouperConfig
5678
:return: pandas.Series of similarity scores, the same length as string_series_1 and string_series_2
5779
"""
58-
return StringGrouper(string_series_1, string_series_2, **kwargs).dot()
59-
60-
61-
def group_similar_strings(strings_to_group: pd.Series,
80+
sg = StringGrouperPrime(string_series_1, string_series_2, **kwargs)
81+
# error handler (for input Series with values that are not strings)
82+
if sg.non_strings_present:
83+
sname = who(sg.bad_series_name,
84+
string_series_1, 'string_series_1',
85+
string_series_2, 'string_series_2')
86+
this.issues = sg.issues
87+
this.issues.rename(f'Non-strings in Series {sname}', inplace=True)
88+
raise TypeError(sg.error_msg(sname, 'compute_pairwise_similarities'))
89+
return sg.dot()
90+
91+
92+
@add_this_arg
93+
def group_similar_strings(this,
94+
strings_to_group: pd.Series,
6295
string_ids: Optional[pd.Series] = None,
6396
**kwargs) -> Union[pd.DataFrame, pd.Series]:
6497
"""
@@ -76,11 +109,22 @@ def group_similar_strings(strings_to_group: pd.Series,
76109
:param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional)
77110
:return: pandas.Series or pandas.DataFrame.
78111
"""
79-
string_grouper = StringGrouper(strings_to_group, master_id=string_ids, **kwargs).fit()
80-
return string_grouper.get_groups()
81-
82-
83-
def match_most_similar(master: pd.Series,
112+
sg = StringGrouperPrime(strings_to_group, master_id=string_ids, **kwargs)
113+
# error handler (for input Series with values that are not strings)
114+
if sg.non_strings_present:
115+
sname = who(sg.bad_series_name,
116+
strings_to_group, 'strings_to_group',
117+
None, '')
118+
this.issues = sg.issues
119+
this.issues.rename(f'Non-strings in Series {sname}', inplace=True)
120+
raise TypeError(sg.error_msg(sname, 'group_similar_strings'))
121+
fit_sg = sg.fit()
122+
return fit_sg.get_groups()
123+
124+
125+
@add_this_arg
126+
def match_most_similar(this,
127+
master: pd.Series,
84128
duplicates: pd.Series,
85129
master_id: Optional[pd.Series] = None,
86130
duplicates_id: Optional[pd.Series] = None,
@@ -105,15 +149,26 @@ def match_most_similar(master: pd.Series,
105149
:param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional)
106150
:return: pandas.Series or pandas.DataFrame.
107151
"""
108-
string_grouper = StringGrouper(master,
109-
duplicates=duplicates,
110-
master_id=master_id,
111-
duplicates_id=duplicates_id,
112-
**kwargs).fit()
113-
return string_grouper.get_groups()
114-
115-
116-
def match_strings(master: pd.Series,
152+
sg = StringGrouperPrime(master,
153+
duplicates=duplicates,
154+
master_id=master_id,
155+
duplicates_id=duplicates_id,
156+
**kwargs)
157+
# error handler (for input Series with values that are not strings)
158+
if sg.non_strings_present:
159+
sname = who(sg.bad_series_name,
160+
master, 'master',
161+
duplicates, 'duplicates')
162+
this.issues = sg.issues
163+
this.issues.rename(f'Non-strings in Series {sname}', inplace=True)
164+
raise TypeError(sg.error_msg(sname, 'match_most_similar'))
165+
fit_sg = sg.fit()
166+
return fit_sg.get_groups()
167+
168+
169+
@add_this_arg
170+
def match_strings(this,
171+
master: pd.Series,
117172
duplicates: Optional[pd.Series] = None,
118173
master_id: Optional[pd.Series] = None,
119174
duplicates_id: Optional[pd.Series] = None,
@@ -130,12 +185,20 @@ def match_strings(master: pd.Series,
130185
:param kwargs: All other keyword arguments are passed to StringGrouperConfig.
131186
:return: pandas.Dataframe.
132187
"""
133-
string_grouper = StringGrouper(master,
134-
duplicates=duplicates,
135-
master_id=master_id,
136-
duplicates_id=duplicates_id,
137-
**kwargs).fit()
138-
return string_grouper.get_matches()
188+
sg = StringGrouperPrime(master,
189+
duplicates=duplicates,
190+
master_id=master_id,
191+
duplicates_id=duplicates_id,
192+
**kwargs)
193+
if sg.non_strings_present:
194+
sname = who(sg.bad_series_name,
195+
master, 'master',
196+
duplicates, 'duplicates')
197+
this.issues = sg.issues
198+
this.issues.rename(f'Non-strings in Series {sname}', inplace=True)
199+
raise TypeError(sg.error_msg(sname, 'match_strings'))
200+
fit_sg = sg.fit()
201+
return fit_sg.get_matches()
139202

140203

141204
class StringGrouperConfig(NamedTuple):
@@ -189,11 +252,15 @@ def wrapper(*args, **kwargs):
189252
return wrapper
190253

191254

192-
class StringGrouperNotFitException(Exception):
255+
class StringGrouperNotFitException(TypeError):
    """Signals a call to a public function that requires the StringGrouper to be fit first."""
195258

196259

260+
class StringGrouperNotAllStringsException(TypeError):
    """Raised when either input Series master or duplicates contains non-strings.

    Derives from TypeError so that callers who catch TypeError for invalid string
    input keep catching this more specific exception as well.
    """
263+
197264
class StringGrouper(object):
198265
def __init__(self, master: pd.Series,
199266
duplicates: Optional[pd.Series] = None,
@@ -213,9 +280,9 @@ def __init__(self, master: pd.Series,
213280
:param kwargs: All other keyword arguments are passed to StringGrouperConfig
214281
"""
215282
# Validate match strings input
216-
if not StringGrouper._is_series_of_strings(master) or \
217-
(duplicates is not None and not StringGrouper._is_series_of_strings(duplicates)):
218-
raise TypeError('Input does not consist of pandas.Series containing only Strings')
283+
self.issues: pd.Series = None
284+
self._check_string_series(master, 'master')
285+
if (duplicates is not None): self._check_string_series(duplicates, 'duplicates')
219286
# Validate optional IDs input
220287
if not StringGrouper._is_input_data_combination_valid(duplicates, master_id, duplicates_id):
221288
raise Exception('List of data Series options is invalid')
@@ -601,6 +668,21 @@ def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series,
601668
dupe_indices = dupe_strings[dupe_strings == dupe_side].index.to_series().reset_index(drop=True)
602669
return master_indices, dupe_indices
603670

671+
def _check_string_series(self, series_to_test: pd.Series, which: str):
    """Validate one input Series, recording first which input is under test.

    :param series_to_test: the object to validate.
    :param which: label of the input being validated; stored in self.bad_series_name
        before any check runs, so it is available if a check raises.
    """
    self.bad_series_name = which
    # Type check first, then content check; either one may raise.
    for validate in (StringGrouper._check_type, self._check_content):
        validate(series_to_test, which)
675+
676+
def _check_content(self, series_to_test: pd.Series, which: str):
    """Verify that every value of a Series is a non-empty string.

    When offending values are found they are stored, under their original index
    labels, in self.issues, which is named
    'Non-strings in <which> Series <series name>'.

    :param series_to_test: the Series whose values are validated.
    :param which: label of the input ('master' or 'duplicates'), used in the name
        given to self.issues and in the exception message.
    :raises StringGrouperNotAllStringsException: if any value is not a str or is an
        empty string.
    """
    # Build the mask with a plain comprehension instead of DataFrame.applymap, which
    # pandas has deprecated in favour of DataFrame.map. dtype=bool keeps the mask
    # boolean even for an empty Series.
    is_invalid = pd.Series(
        [not isinstance(value, str) or len(value) == 0 for value in series_to_test],
        index=series_to_test.index,
        dtype=bool
    )
    if is_invalid.any():
        sname = f' {series_to_test.name}' if series_to_test.name else ''
        self.issues = series_to_test[is_invalid].rename(f'Non-strings in {which} Series{sname}')
        raise StringGrouperNotAllStringsException(
            f'Input {which} Series{sname} contains values that are not non-empty strings'
        )
685+
604686
def _validate_group_rep_specs(self):
605687
group_rep_options = (GROUP_REP_FIRST, GROUP_REP_CENTROID)
606688
if self._config.group_rep not in group_rep_options:
@@ -617,6 +699,11 @@ def _validate_replace_na_and_drop(self):
617699
"index if the number of index-levels does not equal the number of index-columns."
618700
)
619701

702+
@staticmethod
703+
def _check_type(series_to_test: pd.Series, which: str):
704+
if not isinstance(series_to_test, pd.Series):
705+
raise TypeError(f'Input {which} is not a pandas.Series containing only Strings')
706+
620707
@staticmethod
621708
def _symmetrize_matrix_and_fix_diagonal(AA: csr_matrix) -> csr_matrix:
622709
A = AA.tolil()
@@ -656,16 +743,6 @@ def _validate_strings_exist(master_side, dupe_side, master_strings, dupe_strings
656743
elif not dupe_strings.isin([dupe_side]).any():
657744
raise ValueError(f'{dupe_side} not found in StringGrouper dupe string series')
658745

659-
@staticmethod
660-
def _is_series_of_strings(series_to_test: pd.Series) -> bool:
661-
if not isinstance(series_to_test, pd.Series):
662-
return False
663-
elif series_to_test.to_frame().applymap(
664-
lambda x: not isinstance(x, str)
665-
).squeeze(axis=1).any():
666-
return False
667-
return True
668-
669746
@staticmethod
670747
def _is_input_data_combination_valid(duplicates, master_id, duplicates_id) -> bool:
671748
if duplicates is None and (duplicates_id is not None) \
@@ -680,3 +757,36 @@ def _validate_id_data(master, duplicates, master_id, duplicates_id):
680757
raise Exception('Both master and master_id must be pandas.Series of the same length.')
681758
if duplicates is not None and duplicates_id is not None and len(duplicates) != len(duplicates_id):
682759
raise Exception('Both duplicates and duplicates_id must be pandas.Series of the same length.')
760+
761+
762+
class StringGrouperPrime(StringGrouper):
    """StringGrouper variant for the high-level functions.

    Instead of letting StringGrouperNotAllStringsException escape from the
    constructor, it records that the exception occurred, so the caller can still
    read the captured details (issues, bad_series_name) from an instance that is
    not fully initialized.
    """

    def __init__(self, master: pd.Series,
                 duplicates: Optional[pd.Series] = None,
                 master_id: Optional[pd.Series] = None,
                 duplicates_id: Optional[pd.Series] = None,
                 **kwargs):
        """Run StringGrouper's initialisation, flagging non-string input instead of raising.

        Accepts the same arguments as StringGrouper.__init__.
        """
        # Defaults describing "no problem found"; set before the parent runs so they
        # exist even if the parent constructor aborts early.
        self.issues = None
        self.non_strings_present = False
        self.bad_series_name = None
        parent_args = dict(duplicates=duplicates,
                           master_id=master_id,
                           duplicates_id=duplicates_id)
        try:
            super().__init__(master, **parent_args, **kwargs)
        except StringGrouperNotAllStringsException:
            self.non_strings_present = True

    def error_msg(self, bad_series_name, function_name):
        """Compose the error text reported by a high-level function.

        :param bad_series_name: description of the offending Series.
        :param function_name: name of the high-level function whose ``issues``
            attribute holds the offending values.
        :return: the message; when there are between 1 and 11 offending values they
            are appended to it as a table.
        """
        headline = (
            f'\n\nERROR: Input pandas Series {bad_series_name} contains values that are not strings!\n'
            f'Display the pandas Series \'{function_name}.issues\' to find where these values are'
        )
        if 0 < len(self.issues) < 12:
            return f'{headline}:\n{self.issues.to_frame()}'
        return f'{headline}.'
792+

string_grouper/test/test_string_grouper.py

+28-7
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
from string_grouper.string_grouper import DEFAULT_MIN_SIMILARITY, \
66
DEFAULT_MAX_N_MATCHES, DEFAULT_REGEX, \
77
DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \
8-
StringGrouperConfig, StringGrouper, StringGrouperNotFitException, \
8+
StringGrouperConfig, StringGrouper, \
9+
StringGrouperNotFitException, StringGrouperNotAllStringsException, \
910
match_most_similar, group_similar_strings, match_strings,\
1011
compute_pairwise_similarities
1112
from unittest.mock import patch
@@ -144,12 +145,14 @@ def test_compute_pairwise_similarities_data_integrity(self):
144145
with self.assertRaises(Exception):
145146
_ = compute_pairwise_similarities(df1, df2[:-2])
146147

147-
@patch('string_grouper.string_grouper.StringGrouper')
148+
@patch('string_grouper.string_grouper.StringGrouperPrime')
148149
def test_group_similar_strings(self, mock_StringGouper):
149150
"""mocks StringGrouper to test if the high-level function group_similar_strings utilizes it as expected"""
150151
mock_StringGrouper_instance = mock_StringGouper.return_value
151152
mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance
152153
mock_StringGrouper_instance.get_groups.return_value = 'whatever'
154+
mock_StringGrouper_instance.non_strings_present = False
155+
mock_StringGrouper_instance.error_msg.return_value = 'mock_error'
153156

154157
test_series_1 = None
155158
test_series_id_1 = None
@@ -162,12 +165,14 @@ def test_group_similar_strings(self, mock_StringGouper):
162165
mock_StringGrouper_instance.get_groups.assert_called_once()
163166
self.assertEqual(df, 'whatever')
164167

165-
@patch('string_grouper.string_grouper.StringGrouper')
168+
@patch('string_grouper.string_grouper.StringGrouperPrime')
166169
def test_match_most_similar(self, mock_StringGouper):
167170
"""mocks StringGrouper to test if the high-level function match_most_similar utilizes it as expected"""
168171
mock_StringGrouper_instance = mock_StringGouper.return_value
169172
mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance
170173
mock_StringGrouper_instance.get_groups.return_value = 'whatever'
174+
mock_StringGrouper_instance.non_strings_present = False
175+
mock_StringGrouper_instance.error_msg.return_value = 'mock_error'
171176

172177
test_series_1 = None
173178
test_series_2 = None
@@ -184,12 +189,14 @@ def test_match_most_similar(self, mock_StringGouper):
184189
mock_StringGrouper_instance.get_groups.assert_called_once()
185190
self.assertEqual(df, 'whatever')
186191

187-
@patch('string_grouper.string_grouper.StringGrouper')
192+
@patch('string_grouper.string_grouper.StringGrouperPrime')
188193
def test_match_strings(self, mock_StringGouper):
189194
"""mocks StringGrouper to test if the high-level function match_strings utilizes it as expected"""
190195
mock_StringGrouper_instance = mock_StringGouper.return_value
191196
mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance
192197
mock_StringGrouper_instance.get_matches.return_value = 'whatever'
198+
mock_StringGrouper_instance.non_strings_present = False
199+
mock_StringGrouper_instance.error_msg.return_value = 'mock_error'
193200

194201
test_series_1 = None
195202
test_series_id_1 = None
@@ -792,18 +799,32 @@ def test_string_grouper_type_error(self):
792799
"""StringGrouper should raise an typeerror master or duplicates are not a series of strings"""
793800
with self.assertRaises(TypeError):
794801
_ = StringGrouper('foo', 'bar')
795-
with self.assertRaises(TypeError):
802+
with self.assertRaises(StringGrouperNotAllStringsException):
796803
_ = StringGrouper(pd.Series(['foo', 'bar']), pd.Series(['foo', 1]))
797-
with self.assertRaises(TypeError):
804+
with self.assertRaises(StringGrouperNotAllStringsException):
798805
_ = StringGrouper(pd.Series(['foo', np.nan]), pd.Series(['foo', 'j']))
806+
with self.assertRaises(StringGrouperNotAllStringsException):
807+
_ = StringGrouper(pd.Series(['foo', 'j']), pd.Series(['foo', np.nan]))
808+
809+
def test_not_all_strings_exception_in_high_level_fucntions(self):
    # Every high-level function must raise TypeError when one of its input Series
    # contains values that are not strings.
    good_series = pd.Series(['foo', 'bar'])
    bad_series = pd.Series([None, 'foo', 1, np.nan], name='dupes')
    offending_calls = (
        lambda: compute_pairwise_similarities(good_series, bad_series.rename_axis('dupes_id')),
        lambda: group_similar_strings(bad_series.rename_axis('string_id')),
        lambda: match_most_similar(bad_series.rename('master'), good_series),
        lambda: match_strings(good_series, bad_series.rename('dupes').rename_axis('dupes_id')),
    )
    for offending_call in offending_calls:
        with self.assertRaises(TypeError):
            _ = offending_call()
799820

800821
def test_prior_matches_added(self):
801822
"""When a new match is added, any pre-existing matches should also be updated"""
802823
sample = [
803824
'microsoftoffice 365 home',
804825
'microsoftoffice 365 pers',
805826
'microsoft office'
806-
]
827+
]
807828

808829
df = pd.DataFrame(sample, columns=['name'])
809830

0 commit comments

Comments
 (0)