added new keyword argument tfidf_matrix_dtype (the datatype for the

ParticularMiner · ParticularMiner · commit 0e1d65f84ece · 2021-06-12T09:52:07.000+02:00
tf-idf values of the matrix components). Allowed values are np.float32
and np.float64 (used by sparse_dot_topn v0.3.1).  Default is np.float32:
np.float32 often leads to faster processing but less precision than
np.float64
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,13 +10,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [0.4.1?] - 2021-06-11
 
 ### Added
-[No additions were made]
+
+* Added new keyword argument **`tfidf_matrix_dtype`** (the datatype for the tf-idf values of the matrix components). Allowed values are `np.float32` and `np.float64` (the only datatypes allowed by the required external package `sparse_dot_topn` version 0.3.1).  Default is `np.float32`.  (Note: `np.float32` often leads to faster processing and a smaller memory footprint, albeit with less numerical precision than `np.float64`.)
 
 ### Changed
 
 * Changed dependency on `sparse_dot_topn` from version 0.2.9 to 0.3.1
-* Changed the default value of the keyword argument `max_n_matches` from 20 to the number of strings in `duplicates` (or `master`, if 
-`duplicates` is not given). 
+* Changed the default value of the keyword argument `max_n_matches` from 20 to the number of strings in `duplicates` (or `master`, if `duplicates` is not given). 
 * Changed warning issued when the condition \[`include_zeroes=True` and `min_similarity` &le; 0 and `max_n_matches` is not sufficiently high to capture all nonzero-similarity-matches\] is met to an exception. 
  
 ### Removed
diff --git a/README.md b/README.md
@@ -134,6 +134,7 @@ All functions are built using a class **`StringGrouper`**. This class can be use
    All keyword arguments not mentioned in the function definitions above are used to update the default settings. The following optional arguments can be used:
 
    * **`ngram_size`**: The amount of characters in each n-gram. Default is `3`.
+   * **`tfidf_matrix_dtype`**: The datatype for the tf-idf values of the matrix components. Allowed values are `np.float32` and `np.float64`.  Default is `np.float32`.  (Note: `np.float32` often leads to faster processing and a smaller memory footprint, albeit with less numerical precision than `np.float64`.)
    * **`regex`**: The regex string used to clean-up the input string. Default is `"[,-./]|\s"`.
    * **`max_n_matches`**: The maximum number of matches allowed per string in `master`. Default is the number of strings in `duplicates` (or `master`, if `duplicates` is not given).
    * **`min_similarity`**: The minimum cosine similarity for two strings to be considered a match.
diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py
@@ -11,6 +11,7 @@
 from functools import wraps
 
 DEFAULT_NGRAM_SIZE: int = 3
+DEFAULT_TFIDF_MATRIX_DTYPE: type = np.float32   # (only types np.float32 and np.float64 are allowed by sparse_dot_topn)
 DEFAULT_REGEX: str = r'[,-./]|\s'
 DEFAULT_MAX_N_MATCHES: int = 20
 DEFAULT_MIN_SIMILARITY: float = 0.8  # minimum cosine similarity for an item to be considered a match
@@ -140,6 +141,10 @@ class StringGrouperConfig(NamedTuple):
     Class with configuration variables.
 
     :param ngram_size: int. The amount of characters in each n-gram. Default is 3.
+    :param tfidf_matrix_dtype: type. The datatype for the tf-idf values of the matrix components.
+    Possible values allowed by sparse_dot_topn are np.float32 and np.float64.  Default is np.float32.
+    (Note: np.float32 often leads to faster processing and a smaller memory footprint albeit less precision
+    than np.float64.)
     :param regex: str. The regex string used to cleanup the input string. Default is '[,-./]|\s'.
     :param max_n_matches: int. The maximum number of matches allowed per string. Default is 20.
     :param min_similarity: float. The minimum cosine similarity for two strings to be considered a match.
@@ -157,6 +162,7 @@ class StringGrouperConfig(NamedTuple):
     """
 
     ngram_size: int = DEFAULT_NGRAM_SIZE
+    tfidf_matrix_dtype: type = DEFAULT_TFIDF_MATRIX_DTYPE
     regex: str = DEFAULT_REGEX
     max_n_matches: Optional[int] = None
     min_similarity: float = DEFAULT_MIN_SIMILARITY
@@ -227,9 +233,10 @@ def __init__(self, master: pd.Series,
             self._max_n_matches = self._config.max_n_matches
 
         self._validate_group_rep_specs()
+        self._validate_tfidf_matrix_dtype()
         self._validate_replace_na_and_drop()
         self.is_build = False  # indicates if the grouper was fit or not
-        self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams)
+        self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams, dtype=self._config.tfidf_matrix_dtype)
         # After the StringGrouper is built, _matches_list will contain the indices and similarities of the matches
         self._matches_list: pd.DataFrame = pd.DataFrame()
         # _true_max_n_matches will contain the true maximum number of matches over all strings in master if
@@ -622,6 +629,13 @@ def _validate_group_rep_specs(self):
                 f"Invalid option value for group_rep. The only permitted values are\n {group_rep_options}"
             )
 
+    def _validate_tfidf_matrix_dtype(self):  # raises Exception unless the configured tf-idf dtype is a permitted one
+        dtype_options = (np.float32, np.float64)  # the only dtypes allowed by sparse_dot_topn
+        if self._config.tfidf_matrix_dtype not in dtype_options:
+            raise Exception(
+                f"Invalid option value for tfidf_matrix_dtype. The only permitted values are\n {dtype_options}"
+            )
+
     def _validate_replace_na_and_drop(self):
         if self._config.ignore_index and self._config.replace_na:
             raise Exception("replace_na can only be set to True when ignore_index=False.")
diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py
@@ -133,6 +133,7 @@ def test_compute_pairwise_similarities(self):
             ],
             name='similarity'
         )
+        expected_result = expected_result.astype(np.float32)
         pd.testing.assert_series_equal(expected_result, similarities)
 
     def test_compute_pairwise_similarities_data_integrity(self):
@@ -367,6 +368,7 @@ def test_build_matches_list(self):
         dupe_side = [0, 1]
         similarity = [1.0, 1.0]
         expected_df = pd.DataFrame({'master_side': master, 'dupe_side': dupe_side, 'similarity': similarity})
+        expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype)
         pd.testing.assert_frame_equal(expected_df, sg._matches_list)
 
     def test_case_insensitive_build_matches_list(self):
@@ -379,6 +381,7 @@ def test_case_insensitive_build_matches_list(self):
         dupe_side = [0, 1]
         similarity = [1.0, 1.0]
         expected_df = pd.DataFrame({'master_side': master, 'dupe_side': dupe_side, 'similarity': similarity})
+        expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype)
         pd.testing.assert_frame_equal(expected_df, sg._matches_list)
 
     def test_get_matches_two_dataframes(self):
@@ -393,6 +396,7 @@ def test_get_matches_two_dataframes(self):
         expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side,
                                     'similarity': similarity,
                                     'right_side': right_side, 'right_index': right_index})
+        expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype)
         pd.testing.assert_frame_equal(expected_df, sg.get_matches())
 
     def test_get_matches_single(self):
@@ -407,6 +411,7 @@ def test_get_matches_single(self):
         expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side,
                                     'similarity': similarity,
                                     'right_side': right_side, 'right_index': right_index})
+        expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype)
         pd.testing.assert_frame_equal(expected_df, sg.get_matches())
 
     def test_get_matches_1_series_1_id_series(self):
@@ -424,6 +429,7 @@ def test_get_matches_1_series_1_id_series(self):
         expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id,
                                     'similarity': similarity,
                                     'right_id': right_side_id, 'right_side': right_side, 'right_index': right_index})
+        expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype)
         pd.testing.assert_frame_equal(expected_df, sg.get_matches())
 
     def test_get_matches_2_series_2_id_series(self):
@@ -443,6 +449,7 @@ def test_get_matches_2_series_2_id_series(self):
         expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id,
                                     'similarity': similarity,
                                     'right_id': right_side_id, 'right_side': right_side, 'right_index': right_index})
+        expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype)
         pd.testing.assert_frame_equal(expected_df, sg.get_matches())
 
     def test_get_matches_raises_exception_if_unexpected_options_given(self):