41
41
# StringGrouper.get_nearest_matches
42
42
GROUP_REP_PREFIX : str = 'group_rep_' # used to prefix and name columns of the output of StringGrouper._deduplicate
43
43
44
+
44
45
# High level functions
45
46
46
47
47
- def compute_pairwise_similarities (string_series_1 : pd .Series ,
48
def who(bad_StringGrouper_param, param_1, param_name_1, param_2, param_name_2):
    """
    Private utility used by the high-level functions (those that call StringGrouper) to build a
    descriptive name for the input Series that caused a StringGrouperNotAllStringsException.

    :param bad_StringGrouper_param: name of the offending StringGrouper parameter; 'master' selects
        param_1, any other value selects param_2
    :param param_1: the Series the caller passed as StringGrouper's master (may be None)
    :param param_name_1: name of the caller's own parameter holding param_1
    :param param_2: the Series the caller passed as StringGrouper's duplicates (may be None)
    :param param_name_2: name of the caller's own parameter holding param_2
    :return: "'<series name>' (<parameter name>)" when the selected Series has a name, otherwise
        just the parameter name
    """
    if bad_StringGrouper_param == 'master':
        series, param_name = param_1, param_name_1
    else:
        series, param_name = param_2, param_name_2
    # getattr tolerates a missing Series (some callers pass None for param_2)
    series_name = getattr(series, 'name', None)
    # Only None and '' mean "unnamed"; a falsy-but-valid name such as 0 (e.g. from df[0]) is kept
    if series_name is None or (isinstance(series_name, str) and not series_name):
        return param_name
    return f"'{series_name}' ({param_name})"
56
+
57
+
58
def add_this_arg(func):
    """
    Behind-the-scenes decorator for the high-level functions.

    Shifts the parameters of "func" to the right by one: the returned wrapper calls
    func(wrapper, *args, **kwargs), so "func" receives a reference to its own public wrapper in
    the first position and can attach attributes (e.g. ".issues") to it.

    :param func: function whose first parameter is the reference to the wrapper
    :return: wrapper that keeps func's name and docstring and hides the injected first parameter
    """
    # Local imports keep this block self-contained
    import functools
    import inspect

    @functools.wraps(func)  # keep __name__/__doc__ so help() on the public function stays useful
    def this(*args, **kwargs):
        return func(this, *args, **kwargs)

    # functools.wraps makes inspect.signature() follow __wrapped__, which would expose the injected
    # first parameter; publish the signature callers actually use instead
    try:
        signature = inspect.signature(func)
    except (TypeError, ValueError):
        # no introspectable signature: leave the default behaviour untouched
        return this
    public_parameters = list(signature.parameters.values())[1:]
    this.__signature__ = signature.replace(parameters=public_parameters)
    return this
65
+
66
+
67
+ @add_this_arg
68
+ def compute_pairwise_similarities (this ,
69
+ string_series_1 : pd .Series ,
48
70
string_series_2 : pd .Series ,
49
71
** kwargs ) -> pd .Series :
50
72
"""
@@ -55,10 +77,21 @@ def compute_pairwise_similarities(string_series_1: pd.Series,
55
77
:param kwargs: All other keyword arguments are passed to StringGrouperConfig
56
78
:return: pandas.Series of similarity scores, the same length as string_series_1 and string_series_2
57
79
"""
58
- return StringGrouper (string_series_1 , string_series_2 , ** kwargs ).dot ()
59
-
60
-
61
- def group_similar_strings (strings_to_group : pd .Series ,
80
+ sg = StringGrouperPrime (string_series_1 , string_series_2 , ** kwargs )
81
+ # error handler (for input Series with values that are not strings)
82
+ if sg .non_strings_present :
83
+ sname = who (sg .bad_series_name ,
84
+ string_series_1 , 'string_series_1' ,
85
+ string_series_2 , 'string_series_2' )
86
+ this .issues = sg .issues
87
+ this .issues .rename (f'Non-strings in Series { sname } ' , inplace = True )
88
+ raise TypeError (sg .error_msg (sname , 'compute_pairwise_similarities' ))
89
+ return sg .dot ()
90
+
91
+
92
+ @add_this_arg
93
+ def group_similar_strings (this ,
94
+ strings_to_group : pd .Series ,
62
95
string_ids : Optional [pd .Series ] = None ,
63
96
** kwargs ) -> Union [pd .DataFrame , pd .Series ]:
64
97
"""
@@ -76,11 +109,22 @@ def group_similar_strings(strings_to_group: pd.Series,
76
109
:param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional)
77
110
:return: pandas.Series or pandas.DataFrame.
78
111
"""
79
- string_grouper = StringGrouper (strings_to_group , master_id = string_ids , ** kwargs ).fit ()
80
- return string_grouper .get_groups ()
81
-
82
-
83
- def match_most_similar (master : pd .Series ,
112
+ sg = StringGrouperPrime (strings_to_group , master_id = string_ids , ** kwargs )
113
+ # error handler (for input Series with values that are not strings)
114
+ if sg .non_strings_present :
115
+ sname = who (sg .bad_series_name ,
116
+ strings_to_group , 'strings_to_group' ,
117
+ None , '' )
118
+ this .issues = sg .issues
119
+ this .issues .rename (f'Non-strings in Series { sname } ' , inplace = True )
120
+ raise TypeError (sg .error_msg (sname , 'group_similar_strings' ))
121
+ fit_sg = sg .fit ()
122
+ return fit_sg .get_groups ()
123
+
124
+
125
+ @add_this_arg
126
+ def match_most_similar (this ,
127
+ master : pd .Series ,
84
128
duplicates : pd .Series ,
85
129
master_id : Optional [pd .Series ] = None ,
86
130
duplicates_id : Optional [pd .Series ] = None ,
@@ -105,15 +149,26 @@ def match_most_similar(master: pd.Series,
105
149
:param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional)
106
150
:return: pandas.Series or pandas.DataFrame.
107
151
"""
108
- string_grouper = StringGrouper (master ,
109
- duplicates = duplicates ,
110
- master_id = master_id ,
111
- duplicates_id = duplicates_id ,
112
- ** kwargs ).fit ()
113
- return string_grouper .get_groups ()
114
-
115
-
116
- def match_strings (master : pd .Series ,
152
+ sg = StringGrouperPrime (master ,
153
+ duplicates = duplicates ,
154
+ master_id = master_id ,
155
+ duplicates_id = duplicates_id ,
156
+ ** kwargs )
157
+ # error handler (for input Series with values that are not strings)
158
+ if sg .non_strings_present :
159
+ sname = who (sg .bad_series_name ,
160
+ master , 'master' ,
161
+ duplicates , 'duplicates' )
162
+ this .issues = sg .issues
163
+ this .issues .rename (f'Non-strings in Series { sname } ' , inplace = True )
164
+ raise TypeError (sg .error_msg (sname , 'match_most_similar' ))
165
+ fit_sg = sg .fit ()
166
+ return fit_sg .get_groups ()
167
+
168
+
169
+ @add_this_arg
170
+ def match_strings (this ,
171
+ master : pd .Series ,
117
172
duplicates : Optional [pd .Series ] = None ,
118
173
master_id : Optional [pd .Series ] = None ,
119
174
duplicates_id : Optional [pd .Series ] = None ,
@@ -130,12 +185,20 @@ def match_strings(master: pd.Series,
130
185
:param kwargs: All other keyword arguments are passed to StringGrouperConfig.
131
186
:return: pandas.Dataframe.
132
187
"""
133
- string_grouper = StringGrouper (master ,
134
- duplicates = duplicates ,
135
- master_id = master_id ,
136
- duplicates_id = duplicates_id ,
137
- ** kwargs ).fit ()
138
- return string_grouper .get_matches ()
188
+ sg = StringGrouperPrime (master ,
189
+ duplicates = duplicates ,
190
+ master_id = master_id ,
191
+ duplicates_id = duplicates_id ,
192
+ ** kwargs )
193
+ if sg .non_strings_present :
194
+ sname = who (sg .bad_series_name ,
195
+ master , 'master' ,
196
+ duplicates , 'duplicates' )
197
+ this .issues = sg .issues
198
+ this .issues .rename (f'Non-strings in Series { sname } ' , inplace = True )
199
+ raise TypeError (sg .error_msg (sname , 'match_strings' ))
200
+ fit_sg = sg .fit ()
201
+ return fit_sg .get_matches ()
139
202
140
203
141
204
class StringGrouperConfig (NamedTuple ):
@@ -189,11 +252,15 @@ def wrapper(*args, **kwargs):
189
252
return wrapper
190
253
191
254
192
- class StringGrouperNotFitException (Exception ):
255
class StringGrouperNotFitException(TypeError):
    """Raised when a public function that requires a fitted StringGrouper is called before fitting."""
    # NOTE(review): a "not fit" condition is a state error rather than a type error, so the TypeError
    # base looks unintended -- confirm before changing, since callers may catch TypeError.
195
258
196
259
260
class StringGrouperNotAllStringsException(TypeError):
    """
    Raised when either input Series master or duplicates contains values that are not
    non-empty strings.

    Derives from TypeError so that code which caught the TypeError raised by the former input
    validation of StringGrouper keeps working.
    """
263
+
197
264
class StringGrouper (object ):
198
265
def __init__ (self , master : pd .Series ,
199
266
duplicates : Optional [pd .Series ] = None ,
@@ -213,9 +280,9 @@ def __init__(self, master: pd.Series,
213
280
:param kwargs: All other keyword arguments are passed to StringGrouperConfig
214
281
"""
215
282
# Validate match strings input
216
- if not StringGrouper . _is_series_of_strings ( master ) or \
217
- ( duplicates is not None and not StringGrouper . _is_series_of_strings ( duplicates )):
218
- raise TypeError ( 'Input does not consist of pandas.Series containing only Strings ' )
283
+ self . issues : pd . Series = None
284
+ self . _check_string_series ( master , 'master' )
285
+ if ( duplicates is not None ): self . _check_string_series ( duplicates , 'duplicates ' )
219
286
# Validate optional IDs input
220
287
if not StringGrouper ._is_input_data_combination_valid (duplicates , master_id , duplicates_id ):
221
288
raise Exception ('List of data Series options is invalid' )
@@ -601,6 +668,21 @@ def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series,
601
668
dupe_indices = dupe_strings [dupe_strings == dupe_side ].index .to_series ().reset_index (drop = True )
602
669
return master_indices , dupe_indices
603
670
671
+ def _check_string_series (self , series_to_test : pd .Series , which : str ):
672
+ self .bad_series_name = which
673
+ StringGrouper ._check_type (series_to_test , which )
674
+ self ._check_content (series_to_test , which )
675
+
676
+ def _check_content (self , series_to_test : pd .Series , which : str ):
677
+ non_strings_exist = series_to_test .to_frame ().applymap (
678
+ lambda x : (not isinstance (x , str )) or len (x ) == 0
679
+ ).squeeze (axis = 1 )
680
+ if non_strings_exist .any ():
681
+ self .issues = series_to_test [non_strings_exist ]
682
+ sname = f' { series_to_test .name } ' if series_to_test .name else ''
683
+ self .issues .rename (f'Non-strings in { which } Series{ sname } ' , inplace = True )
684
+ raise StringGrouperNotAllStringsException
685
+
604
686
def _validate_group_rep_specs (self ):
605
687
group_rep_options = (GROUP_REP_FIRST , GROUP_REP_CENTROID )
606
688
if self ._config .group_rep not in group_rep_options :
@@ -617,6 +699,11 @@ def _validate_replace_na_and_drop(self):
617
699
"index if the number of index-levels does not equal the number of index-columns."
618
700
)
619
701
702
+ @staticmethod
703
+ def _check_type (series_to_test : pd .Series , which : str ):
704
+ if not isinstance (series_to_test , pd .Series ):
705
+ raise TypeError (f'Input { which } is not a pandas.Series containing only Strings' )
706
+
620
707
@staticmethod
621
708
def _symmetrize_matrix_and_fix_diagonal (AA : csr_matrix ) -> csr_matrix :
622
709
A = AA .tolil ()
@@ -656,16 +743,6 @@ def _validate_strings_exist(master_side, dupe_side, master_strings, dupe_strings
656
743
elif not dupe_strings .isin ([dupe_side ]).any ():
657
744
raise ValueError (f'{ dupe_side } not found in StringGrouper dupe string series' )
658
745
659
- @staticmethod
660
- def _is_series_of_strings (series_to_test : pd .Series ) -> bool :
661
- if not isinstance (series_to_test , pd .Series ):
662
- return False
663
- elif series_to_test .to_frame ().applymap (
664
- lambda x : not isinstance (x , str )
665
- ).squeeze (axis = 1 ).any ():
666
- return False
667
- return True
668
-
669
746
@staticmethod
670
747
def _is_input_data_combination_valid (duplicates , master_id , duplicates_id ) -> bool :
671
748
if duplicates is None and (duplicates_id is not None ) \
@@ -680,3 +757,36 @@ def _validate_id_data(master, duplicates, master_id, duplicates_id):
680
757
raise Exception ('Both master and master_id must be pandas.Series of the same length.' )
681
758
if duplicates is not None and duplicates_id is not None and len (duplicates ) != len (duplicates_id ):
682
759
raise Exception ('Both duplicates and duplicates_id must be pandas.Series of the same length.' )
760
+
761
+
762
class StringGrouperPrime(StringGrouper):
    """
    Child class of StringGrouper for use in the high-level functions.

    It swallows the StringGrouperNotAllStringsException raised during construction and instead
    records it in non_strings_present, so that the information about the offending input Series
    remains readable even though the StringGrouper instance is not fully initialized.
    """

    def __init__(self, master: pd.Series,
                 duplicates: Optional[pd.Series] = None,
                 master_id: Optional[pd.Series] = None,
                 duplicates_id: Optional[pd.Series] = None,
                 **kwargs):
        """
        Build a StringGrouper, recording (rather than propagating) a non-string input failure.

        :param master: pandas.Series of strings
        :param duplicates: optional pandas.Series of strings to match against master
        :param master_id: optional pandas.Series of IDs for master
        :param duplicates_id: optional pandas.Series of IDs for duplicates
        :param kwargs: all other keyword arguments are forwarded to StringGrouper
        """
        # Defaults are assigned before the parent constructor runs so the attributes exist even
        # when that constructor is interrupted
        self.issues = None
        self.non_strings_present = False
        self.bad_series_name = None
        parent_arguments = dict(duplicates=duplicates,
                                master_id=master_id,
                                duplicates_id=duplicates_id)
        try:
            super().__init__(master, **parent_arguments, **kwargs)
        except StringGrouperNotAllStringsException:
            self.non_strings_present = True

    def error_msg(self, bad_series_name, function_name):
        """
        Compose the message reported when an input Series contains non-string values.

        :param bad_series_name: descriptive name of the offending Series
        :param function_name: name of the high-level function whose ".issues" attribute holds
            the offending values
        :return: the message; the offending values are appended when there are between 1 and 11
        """
        # Only short listings are embedded in the message itself
        few_enough_to_show = 0 < len(self.issues) < 12
        if few_enough_to_show:
            ending = ':\n' + f'{self.issues.to_frame()}'
        else:
            ending = '.'
        heading = f'\n\nERROR: Input pandas Series {bad_series_name} contains values that are not strings!\n'
        pointer = f'Display the pandas Series \'{function_name}.issues\' to find where these values are'
        return heading + pointer + ending
792
+
0 commit comments