Add ability to cache the master tf-idf matrix across queries in which master is unchanged

ParticularMiner · ParticularMiner · commit c981048cacdd · 2022-05-23T12:28:31.000+02:00
diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py
@@ -26,6 +26,7 @@
 # similar string index-columns with corresponding duplicates-index values
 DEFAULT_INCLUDE_ZEROES: bool = True  # when the minimum cosine similarity <=0, determines whether zero-similarity
 # matches appear in the output
+DEFAULT_ENABLE_CACHE: bool = False  # when True, the master tf-idf matrix is cached between queries that reuse master
 GROUP_REP_CENTROID: str = 'centroid'    # Option value to select the string in each group with the largest
 # similarity aggregate as group-representative:
 GROUP_REP_FIRST: str = 'first'  # Option value to select the first string in each group as group-representative:
@@ -185,6 +186,9 @@ class StringGrouperConfig(NamedTuple):
     before performing the string-comparisons block-wise.  Defaults to 'guess', in which case the numbers of
     blocks are estimated based on previous empirical results.  If n_blocks = 'auto', then splitting is done
     automatically in the event of an OverflowError.
+    :param enable_cache: bool. Whether or not to cache the tf-idf matrix for ``master`` between queries which
+    preserve ``master``.  Defaults to False.  Use with caution: setting this option to True may degrade
+    performance when ``master`` is too large to fit into RAM.
     """
 
     ngram_size: int = DEFAULT_NGRAM_SIZE
@@ -200,6 +204,7 @@ class StringGrouperConfig(NamedTuple):
     group_rep: str = DEFAULT_GROUP_REP
     force_symmetries: bool = DEFAULT_FORCE_SYMMETRIES
     n_blocks: Tuple[int, int] = DEFAULT_N_BLOCKS
+    enable_cache: bool = DEFAULT_ENABLE_CACHE
 
 
 def validate_is_fit(f):
@@ -242,6 +247,7 @@ def __init__(self, master: pd.Series,
         """
         # private members:
         self.is_build = False
+        self._cache = dict()
 
         self._master: pd.DataFrame = pd.DataFrame()
         self._duplicates: Optional[pd.Series] = None
@@ -323,8 +329,24 @@ def reset_data(self,
         :param duplicates_id: pandas.Series. If set, contains ID values for each row in duplicates Series.
         :param kwargs: All other keyword arguments are passed to StringGrouperConfig
         """
+        self._cache.clear()
         self._set_data(master, duplicates, master_id, duplicates_id)
 
+    def _reset_duplicates_only(self, duplicates: pd.Series = None, duplicates_id: Optional[pd.Series] = None):
+        # Validate input strings data
+        self.duplicates = duplicates
+
+        # Validate optional IDs input
+        if not StringGrouper._is_input_data_combination_valid(duplicates, self._master_id, duplicates_id):
+            raise Exception('List of data Series options is invalid')
+        StringGrouper._validate_id_data(self._master, duplicates, self._master_id, duplicates_id)
+        self._duplicates_id = duplicates_id
+
+        # Set some private members
+        self._left_Series = self._duplicates
+
+        self.is_build = False
+
     def clear_data(self):
         self._master = None
         self._duplicates = None
@@ -333,6 +355,7 @@ def clear_data(self):
         self._matches_list = None
         self._left_Series = None
         self._right_Series = None
+        self._cache.clear()
         self.is_build = False
 
     def update_options(self, **kwargs):
@@ -718,7 +741,7 @@ def get_groups(self,
             return self._get_nearest_matches(ignore_index=ignore_index, replace_na=replace_na)
 
     def match_strings(self,
-                      master: pd.Series,
+                      master: Optional[pd.Series] = None,
                       duplicates: Optional[pd.Series] = None,
                       master_id: Optional[pd.Series] = None,
                       duplicates_id: Optional[pd.Series] = None,
@@ -729,14 +752,19 @@ def match_strings(self,
         This can be seen as an self-join. If both master and duplicates is given, it will return highly similar strings
         between master and duplicates. This can be seen as an inner-join.
 
-        :param master: pandas.Series. Series of strings against which matches are calculated.
+        :param master: pandas.Series. Series of strings against which matches are calculated.  If not set, or set to
+        ``None``, then the currently stored ``master`` Series will be reused.
         :param duplicates: pandas.Series. Series of strings that will be matched with master if given (Optional).
         :param master_id: pandas.Series. Series of values that are IDs for master column rows (Optional).
         :param duplicates_id: pandas.Series. Series of values that are IDs for duplicates column rows (Optional).
         :param kwargs: All other keyword arguments are passed to StringGrouperConfig.
         :return: pandas.Dataframe.
         """
-        self.reset_data(master, duplicates, master_id, duplicates_id)
+        if master is None:
+            self._reset_duplicates_only(duplicates, duplicates_id)
+        else:
+            self.reset_data(master, duplicates, master_id, duplicates_id)
+
         self.update_options(**kwargs)
         self = self.fit()
         return self.get_matches()
@@ -761,14 +789,18 @@ def match_most_similar(self,
         If IDs (both 'master_id' and 'duplicates_id') are also given, returns a DataFrame of the same strings
         output in the above case with their corresponding IDs.
 
-        :param master: pandas.Series. Series of strings that the duplicates will be matched with.
+        :param master: pandas.Series. Series of strings that the duplicates will be matched with. If it is
+        set to ``None``, then the currently stored ``master`` Series will be reused.
         :param duplicates: pandas.Series. Series of strings that will me matched with the master.
         :param master_id: pandas.Series. Series of values that are IDs for master column rows. (Optional)
         :param duplicates_id: pandas.Series. Series of values that are IDs for duplicates column rows. (Optional)
         :param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional)
         :return: pandas.Series or pandas.DataFrame.
         """
-        self.reset_data(master, duplicates, master_id, duplicates_id)
+        if master is None:
+            self._reset_duplicates_only(duplicates, duplicates_id)
+        else:
+            self.reset_data(master, duplicates, master_id, duplicates_id)
 
         old_max_n_matches = self._max_n_matches
         new_max_n_matches = None
@@ -875,8 +907,17 @@ def _get_right_tf_idf_matrix(self, partition=(None, None)):
         # unlike _get_tf_idf_matrices(), _get_right_tf_idf_matrix
         # does not set the corpus but rather
         # builds a matrix using the existing corpus
-        return self._vectorizer.transform(
-            self._right_Series.iloc[slice(*partition)])
+        key = tuple(partition)
+        if self._config.enable_cache and key in self._cache:
+            matrix = self._cache[key]
+        else:
+            matrix = self._vectorizer.transform(
+                self._right_Series.iloc[slice(*partition)])
+
+            if self._config.enable_cache:
+                self._cache[key] = matrix
+
+        return matrix
 
     def _fit_vectorizer(self) -> TfidfVectorizer:
         # if both dupes and master string series are set - we concat them to fit the vectorizer on all
diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py
@@ -4,6 +4,7 @@
 from scipy.sparse.csr import csr_matrix
 from string_grouper.string_grouper import DEFAULT_MIN_SIMILARITY, \
     DEFAULT_REGEX, DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \
+    DEFAULT_ENABLE_CACHE, \
     StringGrouperConfig, StringGrouper, StringGrouperNotFitException, \
     match_most_similar, group_similar_strings, match_strings, \
     compute_pairwise_similarities
@@ -100,6 +101,7 @@ def test_config_defaults(self):
         self.assertEqual(config.ngram_size, DEFAULT_NGRAM_SIZE)
         self.assertEqual(config.number_of_processes, DEFAULT_N_PROCESSES)
         self.assertEqual(config.ignore_case, DEFAULT_IGNORE_CASE)
+        self.assertEqual(config.enable_cache, DEFAULT_ENABLE_CACHE)
 
     def test_config_immutable(self):
         """Configurations should be immutable"""
@@ -117,6 +119,35 @@ def test_config_non_default_values(self):
 
 class StringGrouperTest(unittest.TestCase):
 
+    def test_cache(self):
+        """tests caching when the option is enabled"""
+
+        sort_cols = ['right_index', 'left_index']
+
+        def fix_row_order(df):
+            return df.sort_values(sort_cols).reset_index(drop=True)
+
+        simple_example = SimpleExample()
+        df1 = simple_example.customers_df2['Customer Name']
+
+        sg = StringGrouper(df1, min_similarity=0.1)
+        assert sg._cache == dict()
+        matches = fix_row_order(sg.match_strings(df1))  # no cache
+        assert sg._cache == dict()
+
+        matches_ = fix_row_order(sg.match_strings(duplicates=df1, enable_cache=True))
+        assert len(sg._cache) > 0
+        for _, value in sg._cache.items():
+            assert isinstance(value, csr_matrix)
+        pd.testing.assert_frame_equal(matches_, matches)
+        matches__ = fix_row_order(sg.match_strings(duplicates=df1))
+        assert len(sg._cache) > 0
+        for _, value in sg._cache.items():
+            assert isinstance(value, csr_matrix)
+        pd.testing.assert_frame_equal(matches__, matches)
+        with self.assertRaises(Exception):
+            _ = sg.match_strings(duplicates=df1, duplicates_id=simple_example.customers_df2['Customer ID'])
+
     def test_auto_blocking_single_Series(self):
         """tests whether automatic blocking yields consistent results"""
         # This function will force an OverflowError to occur when
@@ -870,8 +901,10 @@ def test_get_groups_two_df(self):
         result = sg.get_groups()
         expected_result = pd.Series(['foooo', 'bar', 'baz', 'foooo'], name='most_similar_master')
         pd.testing.assert_series_equal(expected_result, result)
-        result = sg.match_most_similar(test_series_1, test_series_2, max_n_matches=3)
+        result = sg.match_most_similar(test_series_1, test_series_2, max_n_matches=3, enable_cache=True)
         pd.testing.assert_series_equal(expected_result, result)
+        result2 = sg.match_most_similar(None, test_series_2, max_n_matches=3)
+        pd.testing.assert_series_equal(expected_result, result2)
 
     def test_get_groups_2_string_series_2_id_series(self):
         """Should return a pd.DataFrame object with the length of the dupes. The series will contain the master string