Skip to content

Commit 0e1d65f

Browse files
added new keyword argument tfidf_matrix_dtype (the datatype for the
tf-idf values of the matrix components). Allowed values are np.float32 and np.float64 (used by sparse_dot_topn v0.3.1). Default is np.float32: np.float32 often leads to faster processing but less precision than np.float64
1 parent 36f7316 commit 0e1d65f

File tree

4 files changed

+26
-4
lines changed

4 files changed

+26
-4
lines changed

CHANGELOG.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1010
## [0.4.1?] - 2021-06-11
1111

1212
### Added
13-
[No additions were made]
13+
14+
* Added new keyword argument **`tfidf_matrix_dtype`** (the datatype for the tf-idf values of the matrix components). Allowed values are `np.float32` and `np.float64` (used by the required external package `sparse_dot_topn` version 0.3.1). Default is `np.float32`. (Note: `np.float32` often leads to faster processing and a smaller memory footprint, albeit with less numerical precision than `np.float64`.)
1415

1516
### Changed
1617

1718
* Changed dependency on `sparse_dot_topn` from version 0.2.9 to 0.3.1
18-
* Changed the default value of the keyword argument `max_n_matches` from 20 to the number of strings in `duplicates` (or `master`, if
19-
`duplicates` is not given).
19+
* Changed the default value of the keyword argument `max_n_matches` from 20 to the number of strings in `duplicates` (or `master`, if `duplicates` is not given).
2020
* Changed the warning that was issued when the condition \[`include_zeroes=True` and `min_similarity` ≤ 0 and `max_n_matches` is not sufficiently high to capture all nonzero-similarity-matches\] is met: an exception is now raised instead.
2121

2222
### Removed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ All functions are built using a class **`StringGrouper`**. This class can be use
134134
All keyword arguments not mentioned in the function definitions above are used to update the default settings. The following optional arguments can be used:
135135

136136
* **`ngram_size`**: The number of characters in each n-gram. Default is `3`.
137+
* **`tfidf_matrix_dtype`**: The datatype for the tf-idf values of the matrix components. Allowed values are `np.float32` and `np.float64`. Default is `np.float32`. (Note: `np.float32` often leads to faster processing and a smaller memory footprint, albeit with less numerical precision than `np.float64`.)
137138
* **`regex`**: The regex string used to clean up the input string. Default is `"[,-./]|\s"`.
138139
* **`max_n_matches`**: The maximum number of matches allowed per string in `master`. Default is the number of strings in `duplicates` (or `master`, if `duplicates` is not given).
139140
* **`min_similarity`**: The minimum cosine similarity for two strings to be considered a match.

string_grouper/string_grouper.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from functools import wraps
1212

1313
DEFAULT_NGRAM_SIZE: int = 3
14+
DEFAULT_TFIDF_MATRIX_DTYPE: type = np.float32 # (only types np.float32 and np.float64 are allowed by sparse_dot_topn)
1415
DEFAULT_REGEX: str = r'[,-./]|\s'
1516
DEFAULT_MAX_N_MATCHES: int = 20
1617
DEFAULT_MIN_SIMILARITY: float = 0.8 # minimum cosine similarity for an item to be considered a match
@@ -140,6 +141,10 @@ class StringGrouperConfig(NamedTuple):
140141
Class with configuration variables.
141142
142143
:param ngram_size: int. The amount of characters in each n-gram. Default is 3.
144+
:param tfidf_matrix_dtype: type. The datatype for the tf-idf values of the matrix components.
145+
Possible values allowed by sparse_dot_topn are np.float32 and np.float64. Default is np.float32.
146+
(Note: np.float32 often leads to faster processing and a smaller memory footprint albeit less precision
147+
than np.float64.)
143148
:param regex: str. The regex string used to cleanup the input string. Default is '[,-./]|\s'.
144149
:param max_n_matches: int. The maximum number of matches allowed per string. Default is the number of strings in duplicates (or master, if duplicates is not given).
145150
:param min_similarity: float. The minimum cosine similarity for two strings to be considered a match.
@@ -157,6 +162,7 @@ class StringGrouperConfig(NamedTuple):
157162
"""
158163

159164
ngram_size: int = DEFAULT_NGRAM_SIZE
165+
tfidf_matrix_dtype: type = DEFAULT_TFIDF_MATRIX_DTYPE  # a dtype class (np.float32 or np.float64), not an int
160166
regex: str = DEFAULT_REGEX
161167
max_n_matches: Optional[int] = None
162168
min_similarity: float = DEFAULT_MIN_SIMILARITY
@@ -227,9 +233,10 @@ def __init__(self, master: pd.Series,
227233
self._max_n_matches = self._config.max_n_matches
228234

229235
self._validate_group_rep_specs()
236+
self._validate_tfidf_matrix_dtype()
230237
self._validate_replace_na_and_drop()
231238
self.is_build = False # indicates if the grouper was fit or not
232-
self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams)
239+
self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams, dtype=self._config.tfidf_matrix_dtype)
233240
# After the StringGrouper is built, _matches_list will contain the indices and similarities of the matches
234241
self._matches_list: pd.DataFrame = pd.DataFrame()
235242
# _true_max_n_matches will contain the true maximum number of matches over all strings in master if
@@ -622,6 +629,13 @@ def _validate_group_rep_specs(self):
622629
f"Invalid option value for group_rep. The only permitted values are\n {group_rep_options}"
623630
)
624631

632+
def _validate_tfidf_matrix_dtype(self):
633+
dtype_options = (np.float32, np.float64)
634+
if self._config.tfidf_matrix_dtype not in dtype_options:
635+
raise Exception(
636+
f"Invalid option value for tfidf_matrix_dtype. The only permitted values are\n {dtype_options}"
637+
)
638+
625639
def _validate_replace_na_and_drop(self):
626640
if self._config.ignore_index and self._config.replace_na:
627641
raise Exception("replace_na can only be set to True when ignore_index=False.")

string_grouper/test/test_string_grouper.py

+7
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ def test_compute_pairwise_similarities(self):
133133
],
134134
name='similarity'
135135
)
136+
expected_result = expected_result.astype(np.float32)
136137
pd.testing.assert_series_equal(expected_result, similarities)
137138

138139
def test_compute_pairwise_similarities_data_integrity(self):
@@ -367,6 +368,7 @@ def test_build_matches_list(self):
367368
dupe_side = [0, 1]
368369
similarity = [1.0, 1.0]
369370
expected_df = pd.DataFrame({'master_side': master, 'dupe_side': dupe_side, 'similarity': similarity})
371+
expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype)
370372
pd.testing.assert_frame_equal(expected_df, sg._matches_list)
371373

372374
def test_case_insensitive_build_matches_list(self):
@@ -379,6 +381,7 @@ def test_case_insensitive_build_matches_list(self):
379381
dupe_side = [0, 1]
380382
similarity = [1.0, 1.0]
381383
expected_df = pd.DataFrame({'master_side': master, 'dupe_side': dupe_side, 'similarity': similarity})
384+
expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype)
382385
pd.testing.assert_frame_equal(expected_df, sg._matches_list)
383386

384387
def test_get_matches_two_dataframes(self):
@@ -393,6 +396,7 @@ def test_get_matches_two_dataframes(self):
393396
expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side,
394397
'similarity': similarity,
395398
'right_side': right_side, 'right_index': right_index})
399+
expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype)
396400
pd.testing.assert_frame_equal(expected_df, sg.get_matches())
397401

398402
def test_get_matches_single(self):
@@ -407,6 +411,7 @@ def test_get_matches_single(self):
407411
expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side,
408412
'similarity': similarity,
409413
'right_side': right_side, 'right_index': right_index})
414+
expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype)
410415
pd.testing.assert_frame_equal(expected_df, sg.get_matches())
411416

412417
def test_get_matches_1_series_1_id_series(self):
@@ -424,6 +429,7 @@ def test_get_matches_1_series_1_id_series(self):
424429
expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id,
425430
'similarity': similarity,
426431
'right_id': right_side_id, 'right_side': right_side, 'right_index': right_index})
432+
expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype)
427433
pd.testing.assert_frame_equal(expected_df, sg.get_matches())
428434

429435
def test_get_matches_2_series_2_id_series(self):
@@ -443,6 +449,7 @@ def test_get_matches_2_series_2_id_series(self):
443449
expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id,
444450
'similarity': similarity,
445451
'right_id': right_side_id, 'right_side': right_side, 'right_index': right_index})
452+
expected_df.loc[:, 'similarity'] = expected_df.loc[:, 'similarity'].astype(sg._config.tfidf_matrix_dtype)
446453
pd.testing.assert_frame_equal(expected_df, sg.get_matches())
447454

448455
def test_get_matches_raises_exception_if_unexpected_options_given(self):

0 commit comments

Comments
 (0)