18
18
DEFAULT_IGNORE_CASE : bool = True # ignores case by default
19
19
DEFAULT_DROP_INDEX : bool = False # includes index-columns in output
20
20
DEFAULT_REPLACE_NA : bool = False # when finding the most similar strings, does not replace NaN values in most
21
- # similar string index-columns with corresponding duplicates-index values
22
- DEFAULT_INCLUDE_ZEROES : bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity
23
- # matches appear in the output
21
+ # similar string index-columns with corresponding duplicates-index values
22
+ DEFAULT_INCLUDE_ZEROES : bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity
23
+ # matches appear in the output
24
24
DEFAULT_SUPPRESS_WARNING : bool = False # when the minimum cosine similarity <=0 and zero-similarity matches are
25
- # requested, determines whether or not to suppress the message warning that
26
- # max_n_matches may be too small
25
+ # requested, determines whether or not to suppress the message warning that max_n_matches may be too small
27
26
GROUP_REP_CENTROID : str = 'centroid' # Option value to select the string in each group with the largest
28
- # similarity aggregate as group-representative:
27
+ # similarity aggregate as group-representative:
29
28
GROUP_REP_FIRST : str = 'first' # Option value to select the first string in each group as group-representative:
30
- DEFAULT_GROUP_REP : str = GROUP_REP_CENTROID # chooses group centroid as group-representative by default
29
+ DEFAULT_GROUP_REP : str = GROUP_REP_CENTROID # chooses group centroid as group-representative by default
31
30
32
31
# The following string constants are used by (but aren't [yet] options passed to) StringGrouper
33
32
DEFAULT_COLUMN_NAME : str = 'side' # used to name non-index columns of the output of StringGrouper.get_matches
34
- DEFAULT_ID_NAME : str = 'id' # used to name id-columns in the output of StringGrouper.get_matches
33
+ DEFAULT_ID_NAME : str = 'id' # used to name id-columns in the output of StringGrouper.get_matches
35
34
LEFT_PREFIX : str = 'left_' # used to prefix columns on the left of the output of StringGrouper.get_matches
36
35
RIGHT_PREFIX : str = 'right_' # used to prefix columns on the right of the output of StringGrouper.get_matches
37
36
MOST_SIMILAR_PREFIX : str = 'most_similar_' # used to prefix columns of the output of
38
- # StringGrouper._get_nearest_matches
39
- DEFAULT_MASTER_NAME : str = 'master' # used to name non-index column of the output of StringGrouper.get_nearest_matches
37
+ # StringGrouper._get_nearest_matches
38
+ DEFAULT_MASTER_NAME : str = 'master' # used to name non-index column of the output of StringGrouper.get_nearest_matches
40
39
DEFAULT_MASTER_ID_NAME : str = f'{ DEFAULT_MASTER_NAME } _{ DEFAULT_ID_NAME } ' # used to name id-column of the output of
41
- # StringGrouper.get_nearest_matches
40
+ # StringGrouper.get_nearest_matches
42
41
GROUP_REP_PREFIX : str = 'group_rep_' # used to prefix and name columns of the output of StringGrouper._deduplicate
43
42
44
43
@@ -65,7 +64,7 @@ def this(*args, **kwargs):
65
64
66
65
67
66
@add_this_arg
68
- def compute_pairwise_similarities (this ,
67
+ def compute_pairwise_similarities (this ,
69
68
string_series_1 : pd .Series ,
70
69
string_series_2 : pd .Series ,
71
70
** kwargs ) -> pd .Series :
@@ -214,11 +213,11 @@ class StringGrouperConfig(NamedTuple):
214
213
Defaults to number of cores on a machine - 1.
215
214
:param ignore_case: bool. Whether or not case should be ignored. Defaults to True (ignore case).
216
215
:param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to False.
217
- :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
216
+ :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
218
217
appear in the output. Defaults to True.
219
218
:param suppress_warning: when min_similarity <=0 and include_zeroes=True, determines whether or not to suppress
220
219
the message warning that max_n_matches may be too small. Defaults to False.
221
- :param replace_na: whether or not to replace NaN values in most similar string index-columns with
220
+ :param replace_na: whether or not to replace NaN values in most similar string index-columns with
222
221
corresponding duplicates-index values. Defaults to False.
223
222
:param group_rep: str. The scheme to select the group-representative. Default is 'centroid'.
224
223
The other choice is 'first'.
@@ -261,6 +260,7 @@ class StringGrouperNotAllStringsException(TypeError):
261
260
"""Raised when either input Series master or duplicates contains non-strings"""
262
261
pass
263
262
263
+
264
264
class StringGrouper (object ):
265
265
def __init__ (self , master : pd .Series ,
266
266
duplicates : Optional [pd .Series ] = None ,
@@ -282,7 +282,8 @@ def __init__(self, master: pd.Series,
282
282
# Validate match strings input
283
283
self .issues : pd .Series = None
284
284
self ._check_string_series (master , 'master' )
285
- if (duplicates is not None ): self ._check_string_series (duplicates , 'duplicates' )
285
+ if (duplicates is not None ):
286
+ self ._check_string_series (duplicates , 'duplicates' )
286
287
# Validate optional IDs input
287
288
if not StringGrouper ._is_input_data_combination_valid (duplicates , master_id , duplicates_id ):
288
289
raise Exception ('List of data Series options is invalid' )
@@ -320,7 +321,7 @@ def fit(self) -> 'StringGrouper':
320
321
matches = self ._build_matches (master_matrix , duplicate_matrix )
321
322
if self ._duplicates is None :
322
323
# the matrix of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A)
323
- # and each of its diagonal components must be equal to 1
324
+ # and each of its diagonal components must be equal to 1
324
325
matches = StringGrouper ._symmetrize_matrix_and_fix_diagonal (matches )
325
326
# retrieve all matches
326
327
self ._matches_list = self ._get_matches_list (matches )
@@ -339,15 +340,15 @@ def dot(self) -> pd.Series:
339
340
@validate_is_fit
340
341
def get_matches (self ,
341
342
ignore_index : Optional [bool ] = None ,
342
- include_zeroes : Optional [bool ]= None ,
343
- suppress_warning : Optional [bool ]= None ) -> pd .DataFrame :
343
+ include_zeroes : Optional [bool ] = None ,
344
+ suppress_warning : Optional [bool ] = None ) -> pd .DataFrame :
344
345
"""
345
346
Returns a DataFrame with all the matches and their cosine similarity.
346
347
If optional IDs are used, returned as extra columns with IDs matched to respective data rows
347
348
348
- :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
349
+ :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
349
350
self._config.ignore_index.
350
- :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
351
+ :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
351
352
appear in the output. Defaults to self._config.include_zeroes.
352
353
:param suppress_warning: when min_similarity <=0 and include_zeroes=True, determines whether or not to suppress
353
354
the message warning that max_n_matches may be too small. Defaults to self._config.suppress_warning.
@@ -372,19 +373,22 @@ def prefix_column_names(data: Union[pd.Series, pd.DataFrame], prefix: str):
372
373
else :
373
374
return data .rename (f"{ prefix } { data .name } " )
374
375
375
- if ignore_index is None : ignore_index = self ._config .ignore_index
376
- if include_zeroes is None : include_zeroes = self ._config .include_zeroes
377
- if suppress_warning is None : suppress_warning = self ._config .suppress_warning
376
+ if ignore_index is None :
377
+ ignore_index = self ._config .ignore_index
378
+ if include_zeroes is None :
379
+ include_zeroes = self ._config .include_zeroes
380
+ if suppress_warning is None :
381
+ suppress_warning = self ._config .suppress_warning
378
382
if self ._config .min_similarity > 0 or not include_zeroes :
379
383
matches_list = self ._matches_list
380
384
elif include_zeroes :
381
385
# Here's a fix to a bug pointed out by one GitHub user (@nbcvijanovic):
382
- # the fix includes zero-similarity matches that are missing by default
383
- # in _matches_list due to our use of sparse matrices
386
+ # the fix includes zero-similarity matches that are missing by default
387
+ # in _matches_list due to our use of sparse matrices
384
388
non_matches_list = self ._get_non_matches_list (suppress_warning )
385
389
matches_list = self ._matches_list if non_matches_list .empty else \
386
390
pd .concat ([self ._matches_list , non_matches_list ], axis = 0 , ignore_index = True )
387
-
391
+
388
392
left_side , right_side = get_both_sides (self ._master , self ._duplicates , drop_index = ignore_index )
389
393
similarity = matches_list .similarity .reset_index (drop = True )
390
394
if self ._master_id is None :
@@ -426,16 +430,18 @@ def get_groups(self,
426
430
If there are IDs (master_id and/or duplicates_id) then the IDs corresponding to the string outputs
427
431
above are returned as well altogether in a DataFrame.
428
432
429
- :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
433
+ :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
430
434
self._config.ignore_index.
431
- :param replace_na: whether or not to replace NaN values in most similar string index-columns with
435
+ :param replace_na: whether or not to replace NaN values in most similar string index-columns with
432
436
corresponding duplicates-index values. Defaults to self._config.replace_na.
433
437
"""
434
- if ignore_index is None : ignore_index = self ._config .ignore_index
438
+ if ignore_index is None :
439
+ ignore_index = self ._config .ignore_index
435
440
if self ._duplicates is None :
436
441
return self ._deduplicate (ignore_index = ignore_index )
437
442
else :
438
- if replace_na is None : replace_na = self ._config .replace_na
443
+ if replace_na is None :
444
+ replace_na = self ._config .replace_na
439
445
return self ._get_nearest_matches (ignore_index = ignore_index , replace_na = replace_na )
440
446
441
447
@validate_is_fit
@@ -524,7 +530,8 @@ def _get_non_matches_list(self, suppress_warning=False) -> pd.DataFrame:
524
530
all_pairs = pd .MultiIndex .from_product ([range (m_sz ), range (d_sz )], names = ['master_side' , 'dupe_side' ])
525
531
matched_pairs = pd .MultiIndex .from_frame (self ._matches_list [['master_side' , 'dupe_side' ]])
526
532
missing_pairs = all_pairs .difference (matched_pairs )
527
- if missing_pairs .empty : return pd .DataFrame ()
533
+ if missing_pairs .empty :
534
+ return pd .DataFrame ()
528
535
if (self ._config .max_n_matches < d_sz ) and not suppress_warning :
529
536
warnings .warn (f'WARNING: max_n_matches={ self ._config .max_n_matches } may be too small!\n '
530
537
f'\t \t Some zero-similarity matches returned may be false!\n '
@@ -542,8 +549,8 @@ def _get_nearest_matches(self,
542
549
master_label = f'{ prefix } { self ._master .name if self ._master .name else DEFAULT_MASTER_NAME } '
543
550
master = self ._master .rename (master_label ).reset_index (drop = ignore_index )
544
551
dupes = self ._duplicates .rename ('duplicates' ).reset_index (drop = ignore_index )
545
-
546
- # Rename new master-columns to avoid possible conflict with new dupes-columns when later merging
552
+
553
+ # Rename new master-columns to avoid possible conflict with new dupes-columns when later merging
547
554
if isinstance (dupes , pd .DataFrame ):
548
555
master .rename (
549
556
columns = {col : f'{ prefix } { col } ' for col in master .columns if str (col ) != master_label },
@@ -573,14 +580,14 @@ def _get_nearest_matches(self,
573
580
if self ._master_id is not None :
574
581
# Also update the master_id-series with the duplicates_id in cases where there is no match
575
582
dupes_max_sim .loc [rows_to_update , master_id_label ] = dupes_max_sim [rows_to_update ].duplicates_id
576
-
583
+
577
584
# For some weird reason, pandas' merge function changes int-datatype columns to float when NaN values
578
585
# appear within them. So here we change them back to their original datatypes if possible:
579
586
if dupes_max_sim [master_id_label ].dtype != self ._master_id .dtype and \
580
- self ._duplicates_id .dtype == self ._master_id .dtype :
587
+ self ._duplicates_id .dtype == self ._master_id .dtype :
581
588
dupes_max_sim .loc [:, master_id_label ] = \
582
- dupes_max_sim .loc [:, master_id_label ].astype (self ._master_id .dtype )
583
-
589
+ dupes_max_sim .loc [:, master_id_label ].astype (self ._master_id .dtype )
590
+
584
591
# Prepare the output:
585
592
required_column_list = [master_label ] if self ._master_id is None else [master_id_label , master_label ]
586
593
index_column_list = \
@@ -590,13 +597,13 @@ def _get_nearest_matches(self,
590
597
# Update the master index-columns with the duplicates index-column values in cases where there is no match
591
598
dupes_index_columns = [col for col in dupes .columns if str (col ) != 'duplicates' ]
592
599
dupes_max_sim .loc [rows_to_update , index_column_list ] = \
593
- dupes_max_sim .loc [rows_to_update , dupes_index_columns ].values
594
-
600
+ dupes_max_sim .loc [rows_to_update , dupes_index_columns ].values
601
+
595
602
# Restore their original datatypes if possible:
596
603
for m , d in zip (index_column_list , dupes_index_columns ):
597
604
if dupes_max_sim [m ].dtype != master [m ].dtype and dupes [d ].dtype == master [m ].dtype :
598
605
dupes_max_sim .loc [:, m ] = dupes_max_sim .loc [:, m ].astype (master [m ].dtype )
599
-
606
+
600
607
# Make sure to keep same order as duplicates
601
608
dupes_max_sim = dupes_max_sim .sort_values ('dupe_side' ).set_index ('dupe_side' )
602
609
output = dupes_max_sim [index_column_list + required_column_list ]
@@ -667,9 +674,9 @@ def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series,
667
674
master_indices = master_strings [master_strings == master_side ].index .to_series ().reset_index (drop = True )
668
675
dupe_indices = dupe_strings [dupe_strings == dupe_side ].index .to_series ().reset_index (drop = True )
669
676
return master_indices , dupe_indices
670
-
677
+
671
678
def _check_string_series (self , series_to_test : pd .Series , which : str ):
672
- self .bad_series_name = which
679
+ self .bad_series_name = which
673
680
StringGrouper ._check_type (series_to_test , which )
674
681
self ._check_content (series_to_test , which )
675
682
@@ -780,7 +787,7 @@ def __init__(self, master: pd.Series,
780
787
** kwargs )
781
788
except StringGrouperNotAllStringsException :
782
789
self .non_strings_present = True
783
-
790
+
784
791
def error_msg (self , bad_series_name , function_name ):
785
792
nl = ':\n '
786
793
return (
@@ -789,4 +796,3 @@ def error_msg(self, bad_series_name, function_name):
789
796
f'{ nl if 0 < len (self .issues ) < 12 else "." } '
790
797
f'{ self .issues .to_frame () if 0 < len (self .issues ) < 12 else "" } '
791
798
)
792
-
0 commit comments