26
26
# similar string index-columns with corresponding duplicates-index values
27
27
DEFAULT_INCLUDE_ZEROES : bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity
28
28
# matches appear in the output
29
+ DEFAULT_ENABLE_CACHE : bool = False # does not cache the master tf-idf matrix between queries which preserve master
29
30
GROUP_REP_CENTROID : str = 'centroid' # Option value to select the string in each group with the largest
30
31
# similarity aggregate as group-representative:
31
32
GROUP_REP_FIRST : str = 'first' # Option value to select the first string in each group as group-representative:
@@ -185,6 +186,9 @@ class StringGrouperConfig(NamedTuple):
185
186
before performing the string-comparisons block-wise. Defaults to 'guess', in which case the numbers of
186
187
blocks are estimated based on previous empirical results. If n_blocks = 'auto', then splitting is done
187
188
automatically in the event of an OverflowError.
189
+ :param enable_cache: bool. Whether or not to cache the tf-idf matrix for ``master`` between queries which
190
+ preserve ``master``. Defaults to False. Use with caution: setting this option to True may degrade
191
+ performance when ``master`` is too large to fit into RAM.
188
192
"""
189
193
190
194
ngram_size : int = DEFAULT_NGRAM_SIZE
@@ -200,6 +204,7 @@ class StringGrouperConfig(NamedTuple):
200
204
group_rep : str = DEFAULT_GROUP_REP
201
205
force_symmetries : bool = DEFAULT_FORCE_SYMMETRIES
202
206
n_blocks : Tuple [int , int ] = DEFAULT_N_BLOCKS
207
+ enable_cache : bool = DEFAULT_ENABLE_CACHE
203
208
204
209
205
210
def validate_is_fit (f ):
@@ -242,6 +247,7 @@ def __init__(self, master: pd.Series,
242
247
"""
243
248
# private members:
244
249
self .is_build = False
250
+ self ._cache = dict ()
245
251
246
252
self ._master : pd .DataFrame = pd .DataFrame ()
247
253
self ._duplicates : Optional [pd .Series ] = None
@@ -323,8 +329,24 @@ def reset_data(self,
323
329
:param duplicates_id: pandas.Series. If set, contains ID values for each row in duplicates Series.
324
330
:param kwargs: All other keyword arguments are passed to StringGrouperConfig
325
331
"""
332
+ self ._cache .clear ()
326
333
self ._set_data (master , duplicates , master_id , duplicates_id )
327
334
335
def _reset_duplicates_only(self, duplicates: Optional[pd.Series] = None,
                           duplicates_id: Optional[pd.Series] = None):
    """
    Replace only the duplicates data, keeping the currently stored master and master IDs.

    Unlike ``reset_data``, this method does not clear ``self._cache``.

    :param duplicates: pandas.Series. Series of strings to be matched with the stored master (Optional).
    :param duplicates_id: pandas.Series. If set, contains ID values for each row in duplicates Series.
    :raises Exception: if the combination of duplicates, stored master IDs and duplicates IDs is invalid.
    """
    # Validate input strings data (assignment goes through the public ``duplicates``
    # attribute -- presumably a validating setter that also stores ``_duplicates``).
    self.duplicates = duplicates

    # Validate optional IDs input
    if not StringGrouper._is_input_data_combination_valid(duplicates, self._master_id, duplicates_id):
        raise Exception('List of data Series options is invalid')
    StringGrouper._validate_id_data(self._master, duplicates, self._master_id, duplicates_id)
    self._duplicates_id = duplicates_id

    # Set some private members.  Without duplicates the query is a self-join on the
    # stored master, so the left-hand Series falls back to master instead of being
    # left as None (None is the "no data" state written by clear_data()).
    if self._duplicates is None:
        self._left_Series = self._master
    else:
        self._left_Series = self._duplicates

    # New data invalidates any previously built matches.
    self.is_build = False
349
+
328
350
def clear_data (self ):
329
351
self ._master = None
330
352
self ._duplicates = None
@@ -333,6 +355,7 @@ def clear_data(self):
333
355
self ._matches_list = None
334
356
self ._left_Series = None
335
357
self ._right_Series = None
358
+ self ._cache .clear ()
336
359
self .is_build = False
337
360
338
361
def update_options (self , ** kwargs ):
@@ -718,7 +741,7 @@ def get_groups(self,
718
741
return self ._get_nearest_matches (ignore_index = ignore_index , replace_na = replace_na )
719
742
720
743
def match_strings (self ,
721
- master : pd .Series ,
744
+ master : Optional [ pd .Series ] = None ,
722
745
duplicates : Optional [pd .Series ] = None ,
723
746
master_id : Optional [pd .Series ] = None ,
724
747
duplicates_id : Optional [pd .Series ] = None ,
@@ -729,14 +752,19 @@ def match_strings(self,
729
752
This can be seen as a self-join. If both master and duplicates are given, it will return highly similar strings
730
753
between master and duplicates. This can be seen as an inner-join.
731
754
732
- :param master: pandas.Series. Series of strings against which matches are calculated.
755
+ :param master: pandas.Series. Series of strings against which matches are calculated. If not set, or is set to
756
+ ``None``, then the currently stored ``master`` Series will be reused.
733
757
:param duplicates: pandas.Series. Series of strings that will be matched with master if given (Optional).
734
758
:param master_id: pandas.Series. Series of values that are IDs for master column rows (Optional).
735
759
:param duplicates_id: pandas.Series. Series of values that are IDs for duplicates column rows (Optional).
736
760
:param kwargs: All other keyword arguments are passed to StringGrouperConfig.
737
761
:return: pandas.Dataframe.
738
762
"""
739
- self .reset_data (master , duplicates , master_id , duplicates_id )
763
+ if master is None :
764
+ self ._reset_duplicates_only (duplicates , duplicates_id )
765
+ else :
766
+ self .reset_data (master , duplicates , master_id , duplicates_id )
767
+
740
768
self .update_options (** kwargs )
741
769
self = self .fit ()
742
770
return self .get_matches ()
@@ -761,14 +789,18 @@ def match_most_similar(self,
761
789
If IDs (both 'master_id' and 'duplicates_id') are also given, returns a DataFrame of the same strings
762
790
output in the above case with their corresponding IDs.
763
791
764
- :param master: pandas.Series. Series of strings that the duplicates will be matched with.
792
+ :param master: pandas.Series. Series of strings that the duplicates will be matched with. If it is
793
+ set to ``None``, then the currently stored ``master`` Series will be reused.
765
794
:param duplicates: pandas.Series. Series of strings that will be matched with the master.
766
795
:param master_id: pandas.Series. Series of values that are IDs for master column rows. (Optional)
767
796
:param duplicates_id: pandas.Series. Series of values that are IDs for duplicates column rows. (Optional)
768
797
:param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional)
769
798
:return: pandas.Series or pandas.DataFrame.
770
799
"""
771
- self .reset_data (master , duplicates , master_id , duplicates_id )
800
+ if master is None :
801
+ self ._reset_duplicates_only (duplicates , duplicates_id )
802
+ else :
803
+ self .reset_data (master , duplicates , master_id , duplicates_id )
772
804
773
805
old_max_n_matches = self ._max_n_matches
774
806
new_max_n_matches = None
@@ -875,8 +907,17 @@ def _get_right_tf_idf_matrix(self, partition=(None, None)):
875
907
# unlike _get_tf_idf_matrices(), _get_right_tf_idf_matrix
876
908
# does not set the corpus but rather
877
909
# builds a matrix using the existing corpus
878
- return self ._vectorizer .transform (
879
- self ._right_Series .iloc [slice (* partition )])
910
+ key = tuple (partition )
911
+ if self ._config .enable_cache and key in self ._cache :
912
+ matrix = self ._cache [key ]
913
+ else :
914
+ matrix = self ._vectorizer .transform (
915
+ self ._right_Series .iloc [slice (* partition )])
916
+
917
+ if self ._config .enable_cache :
918
+ self ._cache [key ] = matrix
919
+
920
+ return matrix
880
921
881
922
def _fit_vectorizer (self ) -> TfidfVectorizer :
882
923
# if both dupes and master string series are set - we concat them to fit the vectorizer on all
0 commit comments