|
13 | 13 | DEFAULT_NGRAM_SIZE: int = 3
|
14 | 14 | DEFAULT_TFIDF_MATRIX_DTYPE: type = np.float32 # (only types np.float32 and np.float64 are allowed by sparse_dot_topn)
|
15 | 15 | DEFAULT_REGEX: str = r'[,-./]|\s'
|
16 |
| -DEFAULT_MAX_N_MATCHES: int = 20 |
17 | 16 | DEFAULT_MIN_SIMILARITY: float = 0.8 # minimum cosine similarity for an item to be considered a match
|
18 | 17 | DEFAULT_N_PROCESSES: int = multiprocessing.cpu_count() - 1
|
19 | 18 | DEFAULT_IGNORE_CASE: bool = True # ignores case by default
|
@@ -209,7 +208,8 @@ class StringGrouperConfig(NamedTuple):
|
209 | 208 | (Note: np.float32 often leads to faster processing and a smaller memory footprint albeit less precision
|
210 | 209 | than np.float64.)
|
211 | 210 | :param regex: str. The regex string used to cleanup the input string. Default is '[,-./]|\s'.
|
212 |
| - :param max_n_matches: int. The maximum number of matches allowed per string. Default is 20. |
| 211 | + :param max_n_matches: int. The maximum number of matches allowed per string in `duplicates` (or `master` |
| 212 | + if duplicates is not given). Default will be set by StringGrouper. |
213 | 213 | :param min_similarity: float. The minimum cosine similarity for two strings to be considered a match.
|
214 | 214 | Defaults to 0.8.
|
215 | 215 | :param number_of_processes: int. The number of processes used by the cosine similarity calculation.
|
@@ -297,7 +297,7 @@ def __init__(self, master: pd.Series,
|
297 | 297 |
|
298 | 298 | self._config: StringGrouperConfig = StringGrouperConfig(**kwargs)
|
299 | 299 | if self._config.max_n_matches is None:
|
300 |
| - self._max_n_matches = len(self._master) if self._duplicates is None else len(self._duplicates) |
| 300 | + self._max_n_matches = len(self._master) |
301 | 301 | else:
|
302 | 302 | self._max_n_matches = self._config.max_n_matches
|
303 | 303 |
|
|
0 commit comments