Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

created a more user-friendly error message when bad data is found #56

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next commit
boosted _symmetrize_matches_list() (5x) and _get_matches_list() (33x)
ParticularMiner committed Apr 26, 2021
commit 92436e351adcc58117199576ad8ee0884118d669
48 changes: 15 additions & 33 deletions string_grouper/string_grouper.py
Original file line number Diff line number Diff line change
@@ -251,11 +251,11 @@ def fit(self) -> 'StringGrouper':
master_matrix, duplicate_matrix = self._get_tf_idf_matrices()
# Calculate the matches using the cosine similarity
matches = self._build_matches(master_matrix, duplicate_matrix)
if self._duplicates is None:
# the matrix of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A)
matches = StringGrouper._symmetrize_matrix(matches)
# retrieve all matches
self._matches_list = self._get_matches_list(matches)
if self._duplicates is None:
# the list of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A)
self._symmetrize_matches_list()
self.is_build = True
return self

@@ -450,18 +450,6 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix
self._config.min_similarity,
**optional_kwargs)

def _symmetrize_matches_list(self):
# [symmetrized matches_list] = [matches_list] UNION [transposed matches_list] (i.e., column-names swapped):
self._matches_list = self._matches_list.set_index(['master_side', 'dupe_side'])\
.combine_first(
self._matches_list.rename(
columns={
'master_side': 'dupe_side',
'dupe_side': 'master_side'
}
).set_index(['master_side', 'dupe_side'])
).reset_index()

def _get_non_matches_list(self, suppress_warning=False) -> pd.DataFrame:
"""Returns a list of all the indices of non-matching pairs (with similarity set to 0)"""
m_sz, d_sz = len(self._master), len(self._master if self._duplicates is None else self._duplicates)
@@ -480,25 +468,19 @@ def _get_non_matches_list(self, suppress_warning=False) -> pd.DataFrame:
return missing_pairs

@staticmethod
def _get_matches_list(matches) -> pd.DataFrame:
def _symmetrize_matrix(AA: csr_matrix) -> csr_matrix:
A = AA.tolil()
r, c = A.nonzero()
A[c, r] = A[r, c]
return A.tocsr()

@staticmethod
def _get_matches_list(matches: csr_matrix) -> pd.DataFrame:
"""Returns a list of all the indices of matches"""
non_zeros = matches.nonzero()

sparserows = non_zeros[0]
sparsecols = non_zeros[1]
nr_matches = sparsecols.size
master_side = np.empty([nr_matches], dtype=np.int64)
dupe_side = np.empty([nr_matches], dtype=np.int64)
similarity = np.zeros(nr_matches)

for index in range(0, nr_matches):
master_side[index] = sparserows[index]
dupe_side[index] = sparsecols[index]
similarity[index] = matches.data[index]

matches_list = pd.DataFrame({'master_side': master_side,
'dupe_side': dupe_side,
'similarity': similarity})
r, c = matches.nonzero()
matches_list = pd.DataFrame({'master_side': r.astype(np.int64),
'dupe_side': c.astype(np.int64),
'similarity': matches.data})
return matches_list

def _get_nearest_matches(self,
8 changes: 5 additions & 3 deletions string_grouper/test/test_string_grouper.py
Original file line number Diff line number Diff line change
@@ -11,6 +11,8 @@
from unittest.mock import patch
import warnings

def mock_symmetrize_matrix(A: csr_matrix) -> csr_matrix:
return A

class SimpleExample(object):
def __init__(self):
@@ -197,14 +199,14 @@ def test_match_strings(self, mock_StringGouper):
mock_StringGrouper_instance.get_matches.assert_called_once()
self.assertEqual(df, 'whatever')

@patch('string_grouper.string_grouper.StringGrouper._symmetrize_matches_list')
def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matches_list):
@patch('string_grouper.string_grouper.StringGrouper._symmetrize_matrix', side_effect=mock_symmetrize_matrix)
def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matrix):
"""mocks StringGrouper._symmetrize_matrix so that this test fails whenever _matches_list is
**partially** symmetric which often occurs when the kwarg max_n_matches is too small"""
simple_example = SimpleExample()
df = simple_example.customers_df2['Customer Name']
sg = StringGrouper(df, max_n_matches=2).fit()
mock_symmetrize_matches_list.assert_called_once()
mock_symmetrize_matrix.assert_called_once()
# obtain the upper and lower triangular parts of the matrix of matches:
upper = sg._matches_list[sg._matches_list['master_side'] < sg._matches_list['dupe_side']]
lower = sg._matches_list[sg._matches_list['master_side'] > sg._matches_list['dupe_side']]