diff --git a/CHANGELOG.md b/CHANGELOG.md index e73d33fd..399a1b45 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +## [0.5.2?] - 2021-07-05 + +* Provided a more user-friendly error message to be issued when any entries in the input string-Series are not strings. + +## [0.5.1?] - 2021-07-05 + +* Improved the performance of the function `match_most_similar`. +* Changed the default value of the keyword argument `max_n_matches` to the total number of strings in `master`. (`max_n_matches` is now defined as the maximum number of matches allowed per string in `duplicates` \[or `master` if `duplicates` is not given\]). + ## [0.5.0] - 2021-06-11 ### Added diff --git a/README.md b/README.md index 1b18c3c9..2da24cf5 100644 --- a/README.md +++ b/README.md @@ -136,7 +136,7 @@ All functions are built using a class **`StringGrouper`**. This class can be use * **`ngram_size`**: The amount of characters in each n-gram. Default is `3`. * **`tfidf_matrix_dtype`**: The datatype for the tf-idf values of the matrix components. Allowed values are `numpy.float32` and `numpy.float64`. Default is `numpy.float32`. (Note: `numpy.float32` often leads to faster processing and a smaller memory footprint albeit less numerical precision than `numpy.float64`.) * **`regex`**: The regex string used to clean-up the input string. Default is `"[,-./]|\s"`. - * **`max_n_matches`**: The maximum number of matches allowed per string in `master`. Default is the number of strings in `duplicates` (or `master`, if `duplicates` is not given). + * **`max_n_matches`**: The maximum number of matching strings in `master` allowed per string in `duplicates` (or `master` if `duplicates` is not given). Default is the total number of strings in `master`. * **`min_similarity`**: The minimum cosine similarity for two strings to be considered a match. Defaults to `0.8` * **`number_of_processes`**: The number of processes used by the cosine similarity calculation. Defaults to diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index d1612511..f7fa7b75 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -13,7 +13,6 @@ DEFAULT_NGRAM_SIZE: int = 3 DEFAULT_TFIDF_MATRIX_DTYPE: type = np.float32 # (only types np.float32 and np.float64 are allowed by sparse_dot_topn) DEFAULT_REGEX: str = r'[,-./]|\s' -DEFAULT_MAX_N_MATCHES: int = 20 DEFAULT_MIN_SIMILARITY: float = 0.8 # minimum cosine similarity for an item to be considered a match DEFAULT_N_PROCESSES: int = multiprocessing.cpu_count() - 1 DEFAULT_IGNORE_CASE: bool = True # ignores case by default @@ -42,7 +41,28 @@ # High level functions -def compute_pairwise_similarities(string_series_1: pd.Series, +def who(bad_StringGrouper_param, param_1, param_name_1, param_2, param_name_2): + # Private utility function used by high-level functions (that call StringGrouper) to form a + # descriptive name for their series input parameter which caused the exception of type + # StringGrouperNotAllStringsException to occur + if bad_StringGrouper_param == 'master': + return f'\'{param_1.name}\' ({param_name_1})' if param_1.name else param_name_1 + else: + return f'\'{param_2.name}\' ({param_name_2})' if param_2.name else param_name_2 + + +def add_this_arg(func): + # Behind-the-scenes function-wrapper (to be used as decorator for high-level functions "func") + # that shifts the parameters of "func" to the right by one, inserting a reference to local + # function "this" in the first parameter position + def this(*args, **kwargs): + return func(this, *args, **kwargs) + return this + + +@add_this_arg +def compute_pairwise_similarities(this, + string_series_1: pd.Series, string_series_2: pd.Series, **kwargs) -> pd.Series: """ @@ -53,10 +73,21 @@ def compute_pairwise_similarities(string_series_1: pd.Series, :param kwargs: All other keyword arguments are passed to StringGrouperConfig :return: pandas.Series of similarity scores, the same length as string_series_1 and string_series_2 """ - return StringGrouper(string_series_1, string_series_2, **kwargs).dot() - - -def group_similar_strings(strings_to_group: pd.Series, + sg = StringGrouperPrime(string_series_1, string_series_2, **kwargs) + # error handler (for input Series with values that are not strings) + if sg.non_strings_present: + sname = who(sg.bad_series_name, + string_series_1, 'string_series_1', + string_series_2, 'string_series_2') + this.issues = sg.issues + this.issues.rename(f'Non-strings in Series {sname}', inplace=True) + raise TypeError(sg.error_msg(sname, 'compute_pairwise_similarities')) + return sg.dot() + + +@add_this_arg +def group_similar_strings(this, + strings_to_group: pd.Series, string_ids: Optional[pd.Series] = None, **kwargs) -> Union[pd.DataFrame, pd.Series]: """ @@ -74,11 +105,22 @@ def group_similar_strings(strings_to_group: pd.Series, :param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional) :return: pandas.Series or pandas.DataFrame. """ - string_grouper = StringGrouper(strings_to_group, master_id=string_ids, **kwargs).fit() - return string_grouper.get_groups() - - -def match_most_similar(master: pd.Series, + sg = StringGrouperPrime(strings_to_group, master_id=string_ids, **kwargs) + # error handler (for input Series with values that are not strings) + if sg.non_strings_present: + sname = who(sg.bad_series_name, + strings_to_group, 'strings_to_group', + None, '') + this.issues = sg.issues + this.issues.rename(f'Non-strings in Series {sname}', inplace=True) + raise TypeError(sg.error_msg(sname, 'group_similar_strings')) + fit_sg = sg.fit() + return fit_sg.get_groups() + + +@add_this_arg +def match_most_similar(this, + master: pd.Series, duplicates: pd.Series, master_id: Optional[pd.Series] = None, duplicates_id: Optional[pd.Series] = None, @@ -103,15 +145,27 @@ def match_most_similar(master: pd.Series, :param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional) :return: pandas.Series or pandas.DataFrame. """ - string_grouper = StringGrouper(master, - duplicates=duplicates, - master_id=master_id, - duplicates_id=duplicates_id, - **kwargs).fit() - return string_grouper.get_groups() - - -def match_strings(master: pd.Series, + kwargs['max_n_matches'] = 1 + sg = StringGrouperPrime(master, + duplicates=duplicates, + master_id=master_id, + duplicates_id=duplicates_id, + **kwargs) + # error handler (for input Series with values that are not strings) + if sg.non_strings_present: + sname = who(sg.bad_series_name, + master, 'master', + duplicates, 'duplicates') + this.issues = sg.issues + this.issues.rename(f'Non-strings in Series {sname}', inplace=True) + raise TypeError(sg.error_msg(sname, 'match_most_similar')) + fit_sg = sg.fit() + return fit_sg.get_groups() + + +@add_this_arg +def match_strings(this, + master: pd.Series, duplicates: Optional[pd.Series] = None, master_id: Optional[pd.Series] = None, duplicates_id: Optional[pd.Series] = None, @@ -128,12 +182,20 @@ def match_strings(master: pd.Series, :param kwargs: All other keyword arguments are passed to StringGrouperConfig. :return: pandas.Dataframe. """ - string_grouper = StringGrouper(master, - duplicates=duplicates, - master_id=master_id, - duplicates_id=duplicates_id, - **kwargs).fit() - return string_grouper.get_matches() + sg = StringGrouperPrime(master, + duplicates=duplicates, + master_id=master_id, + duplicates_id=duplicates_id, + **kwargs) + if sg.non_strings_present: + sname = who(sg.bad_series_name, + master, 'master', + duplicates, 'duplicates') + this.issues = sg.issues + this.issues.rename(f'Non-strings in Series {sname}', inplace=True) + raise TypeError(sg.error_msg(sname, 'match_strings')) + fit_sg = sg.fit() + return fit_sg.get_matches() class StringGrouperConfig(NamedTuple): @@ -146,7 +208,8 @@ class StringGrouperConfig(NamedTuple): (Note: np.float32 often leads to faster processing and a smaller memory footprint albeit less precision than np.float64.) :param regex: str. The regex string used to cleanup the input string. Default is '[,-./]|\s'. - :param max_n_matches: int. The maximum number of matches allowed per string. Default is 20. + :param max_n_matches: int. The maximum number of matching strings in `master` allowed per string in + `duplicates` (or `master` if `duplicates` is not given). Default will be set by StringGrouper. :param min_similarity: float. The minimum cosine similarity for two strings to be considered a match. Defaults to 0.8. :param number_of_processes: int. The number of processes used by the cosine similarity calculation. @@ -194,6 +257,11 @@ class StringGrouperNotFitException(Exception): pass +class StringGrouperNotAllStringsException(TypeError): + """Raised when either input Series master or duplicates contains non-strings""" + pass + + class StringGrouper(object): def __init__(self, master: pd.Series, duplicates: Optional[pd.Series] = None, @@ -213,9 +281,10 @@ def __init__(self, master: pd.Series, :param kwargs: All other keyword arguments are passed to StringGrouperConfig """ # Validate match strings input - if not StringGrouper._is_series_of_strings(master) or \ - (duplicates is not None and not StringGrouper._is_series_of_strings(duplicates)): - raise TypeError('Input does not consist of pandas.Series containing only Strings') + self.issues: pd.Series = None + self._check_string_series(master, 'master') + if (duplicates is not None): + self._check_string_series(duplicates, 'duplicates') # Validate optional IDs input if not StringGrouper._is_input_data_combination_valid(duplicates, master_id, duplicates_id): raise Exception('List of data Series options is invalid') @@ -228,7 +297,7 @@ def __init__(self, master: pd.Series, self._config: StringGrouperConfig = StringGrouperConfig(**kwargs) if self._config.max_n_matches is None: - self._max_n_matches = len(self._master) if self._duplicates is None else len(self._duplicates) + self._max_n_matches = len(self._master) else: self._max_n_matches = self._config.max_n_matches @@ -455,8 +524,8 @@ def _fit_vectorizer(self) -> TfidfVectorizer: def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix) -> csr_matrix: """Builds the cossine similarity matrix of two csr matrices""" - tf_idf_matrix_1 = master_matrix - tf_idf_matrix_2 = duplicate_matrix.transpose() + tf_idf_matrix_1 = duplicate_matrix + tf_idf_matrix_2 = master_matrix.transpose() optional_kwargs = { 'return_best_ntop': True, @@ -622,6 +691,21 @@ def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series, dupe_indices = dupe_strings[dupe_strings == dupe_side].index.to_series().reset_index(drop=True) return master_indices, dupe_indices + def _check_string_series(self, series_to_test: pd.Series, which: str): + self.bad_series_name = which + StringGrouper._check_type(series_to_test, which) + self._check_content(series_to_test, which) + + def _check_content(self, series_to_test: pd.Series, which: str): + non_strings_exist = series_to_test.to_frame().applymap( + lambda x: (not isinstance(x, str)) or len(x) == 0 + ).squeeze(axis=1) + if non_strings_exist.any(): + self.issues = series_to_test[non_strings_exist] + sname = f' {series_to_test.name}' if series_to_test.name else '' + self.issues.rename(f'Non-strings in {which} Series{sname}', inplace=True) + raise StringGrouperNotAllStringsException + def _validate_group_rep_specs(self): group_rep_options = (GROUP_REP_FIRST, GROUP_REP_CENTROID) if self._config.group_rep not in group_rep_options: @@ -645,6 +729,11 @@ def _validate_replace_na_and_drop(self): "index if the number of index-levels does not equal the number of index-columns." ) + @staticmethod + def _check_type(series_to_test: pd.Series, which: str): + if not isinstance(series_to_test, pd.Series): + raise TypeError(f'Input {which} is not a pandas.Series containing only Strings') + @staticmethod def _fix_diagonal(m: lil_matrix) -> csr_matrix: r = np.arange(m.shape[0]) @@ -661,8 +750,8 @@ def _symmetrize_matrix(m_symmetric: lil_matrix) -> csr_matrix: def _get_matches_list(matches: csr_matrix) -> pd.DataFrame: """Returns a list of all the indices of matches""" r, c = matches.nonzero() - matches_list = pd.DataFrame({'master_side': r.astype(np.int64), - 'dupe_side': c.astype(np.int64), + matches_list = pd.DataFrame({'master_side': c.astype(np.int64), + 'dupe_side': r.astype(np.int64), 'similarity': matches.data}) return matches_list @@ -687,16 +776,6 @@ def _validate_strings_exist(master_side, dupe_side, master_strings, dupe_strings elif not dupe_strings.isin([dupe_side]).any(): raise ValueError(f'{dupe_side} not found in StringGrouper dupe string series') - @staticmethod - def _is_series_of_strings(series_to_test: pd.Series) -> bool: - if not isinstance(series_to_test, pd.Series): - return False - elif series_to_test.to_frame().applymap( - lambda x: not isinstance(x, str) - ).squeeze(axis=1).any(): - return False - return True - @staticmethod def _is_input_data_combination_valid(duplicates, master_id, duplicates_id) -> bool: if duplicates is None and (duplicates_id is not None) \ @@ -711,3 +790,35 @@ def _validate_id_data(master, duplicates, master_id, duplicates_id): raise Exception('Both master and master_id must be pandas.Series of the same length.') if duplicates is not None and duplicates_id is not None and len(duplicates) != len(duplicates_id): raise Exception('Both duplicates and duplicates_id must be pandas.Series of the same length.') + + +class StringGrouperPrime(StringGrouper): + # (To be used in high-level functions) + # Child class of StringGrouper that captures information about the input Series + # that caused the StringGrouperNotAllStringsException even when the StringGrouper + # instance is not fully initialized + def __init__(self, master: pd.Series, + duplicates: Optional[pd.Series] = None, + master_id: Optional[pd.Series] = None, + duplicates_id: Optional[pd.Series] = None, + **kwargs): + self.issues = None + self.non_strings_present = False + self.bad_series_name = None + try: + super().__init__(master, + duplicates=duplicates, + master_id=master_id, + duplicates_id=duplicates_id, + **kwargs) + except StringGrouperNotAllStringsException: + self.non_strings_present = True + + def error_msg(self, bad_series_name, function_name): + nl = ':\n' + return ( + f'\n\nERROR: Input pandas Series {bad_series_name} contains values that are not strings!\n' + f'Display the pandas Series \'{function_name}.issues\' to find where these values are' + f'{nl if 0 < len(self.issues) < 12 else "."}' + f'{self.issues.to_frame() if 0 < len(self.issues) < 12 else ""}' + ) diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index f5f0aac8..fa527216 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -4,8 +4,9 @@ from scipy.sparse.csr import csr_matrix from string_grouper.string_grouper import DEFAULT_MIN_SIMILARITY, \ DEFAULT_REGEX, DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \ - StringGrouperConfig, StringGrouper, StringGrouperNotFitException, \ - match_most_similar, group_similar_strings, match_strings, \ + StringGrouperConfig, StringGrouper, \ + StringGrouperNotFitException, StringGrouperNotAllStringsException, \ + match_most_similar, group_similar_strings, match_strings,\ compute_pairwise_similarities from unittest.mock import patch @@ -145,12 +146,14 @@ def test_compute_pairwise_similarities_data_integrity(self): with self.assertRaises(Exception): _ = compute_pairwise_similarities(df1, df2[:-2]) - @patch('string_grouper.string_grouper.StringGrouper') + @patch('string_grouper.string_grouper.StringGrouperPrime') def test_group_similar_strings(self, mock_StringGouper): """mocks StringGrouper to test if the high-level function group_similar_strings utilizes it as expected""" mock_StringGrouper_instance = mock_StringGouper.return_value mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance mock_StringGrouper_instance.get_groups.return_value = 'whatever' + mock_StringGrouper_instance.non_strings_present = False + mock_StringGrouper_instance.error_msg.return_value = 'mock_error' test_series_1 = None test_series_id_1 = None @@ -163,12 +166,14 @@ def test_group_similar_strings(self, mock_StringGouper): mock_StringGrouper_instance.get_groups.assert_called_once() self.assertEqual(df, 'whatever') - @patch('string_grouper.string_grouper.StringGrouper') + @patch('string_grouper.string_grouper.StringGrouperPrime') def test_match_most_similar(self, mock_StringGouper): """mocks StringGrouper to test if the high-level function match_most_similar utilizes it as expected""" mock_StringGrouper_instance = mock_StringGouper.return_value mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance mock_StringGrouper_instance.get_groups.return_value = 'whatever' + mock_StringGrouper_instance.non_strings_present = False + mock_StringGrouper_instance.error_msg.return_value = 'mock_error' test_series_1 = None test_series_2 = None @@ -185,12 +190,14 @@ def test_match_most_similar(self, mock_StringGouper): mock_StringGrouper_instance.get_groups.assert_called_once() self.assertEqual(df, 'whatever') - @patch('string_grouper.string_grouper.StringGrouper') + @patch('string_grouper.string_grouper.StringGrouperPrime') def test_match_strings(self, mock_StringGouper): """mocks StringGrouper to test if the high-level function match_strings utilizes it as expected""" mock_StringGrouper_instance = mock_StringGouper.return_value mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance mock_StringGrouper_instance.get_matches.return_value = 'whatever' + mock_StringGrouper_instance.non_strings_present = False + mock_StringGrouper_instance.error_msg.return_value = 'mock_error' test_series_1 = None test_series_id_1 = None @@ -405,8 +412,8 @@ def test_get_matches_single(self): sg = sg.fit() left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] - left_index = [0, 0, 1, 2, 3, 3] - right_index = [0, 3, 1, 2, 0, 3] + left_index = [0, 3, 1, 2, 0, 3] + right_index = [0, 0, 1, 2, 3, 3] similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'similarity': similarity, @@ -420,11 +427,11 @@ def test_get_matches_1_series_1_id_series(self): sg = StringGrouper(test_series_1, master_id=test_series_id_1) sg = sg.fit() left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] - left_side_id = ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'] - left_index = [0, 0, 1, 2, 3, 3] + left_side_id = ['A0', 'A3', 'A1', 'A2', 'A0', 'A3'] + left_index = [0, 3, 1, 2, 0, 3] right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'] - right_side_id = ['A0', 'A3', 'A1', 'A2', 'A0', 'A3'] - right_index = [0, 3, 1, 2, 0, 3] + right_side_id = ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'] + right_index = [0, 0, 1, 2, 3, 3] similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id, 'similarity': similarity, @@ -798,10 +805,24 @@ def test_string_grouper_type_error(self): """StringGrouper should raise an typeerror master or duplicates are not a series of strings""" with self.assertRaises(TypeError): _ = StringGrouper('foo', 'bar') - with self.assertRaises(TypeError): + with self.assertRaises(StringGrouperNotAllStringsException): _ = StringGrouper(pd.Series(['foo', 'bar']), pd.Series(['foo', 1])) - with self.assertRaises(TypeError): + with self.assertRaises(StringGrouperNotAllStringsException): _ = StringGrouper(pd.Series(['foo', np.nan]), pd.Series(['foo', 'j'])) + with self.assertRaises(StringGrouperNotAllStringsException): + _ = StringGrouper(pd.Series(['foo', 'j']), pd.Series(['foo', np.nan])) + + def test_not_all_strings_exception_in_high_level_fucntions(self): + good_series = pd.Series(['foo', 'bar']) + bad_series = pd.Series([None, 'foo', 1, np.nan], name='dupes') + with self.assertRaises(TypeError): + _ = compute_pairwise_similarities(good_series, bad_series.rename_axis('dupes_id')) + with self.assertRaises(TypeError): + _ = group_similar_strings(bad_series.rename_axis('string_id')) + with self.assertRaises(TypeError): + _ = match_most_similar(bad_series.rename('master'), good_series) + with self.assertRaises(TypeError): + _ = match_strings(good_series, bad_series.rename('dupes').rename_axis('dupes_id')) def test_prior_matches_added(self): """When a new match is added, any pre-existing matches should also be updated""" @@ -809,7 +830,7 @@ def test_prior_matches_added(self): 'microsoftoffice 365 home', 'microsoftoffice 365 pers', 'microsoft office' - ] + ] df = pd.DataFrame(sample, columns=['name'])