changed default value of kwarg max_n_matches to #strings in master

ParticularMiner · ParticularMiner · commit b6180ae78137 · 2021-07-05T05:51:39.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
+## [0.5.1?] - 2021-07-05
+
+* Improved the performance of the function `match_most_similar`.
+* Changed the default value of the keyword argument `max_n_matches` to the number of strings in `master`. (`max_n_matches` is now defined as the maximum number of matches allowed per string in `duplicates`, or in `master` if `duplicates` is not given.)
+
 ## [0.5.0] - 2021-06-11
 
 ### Added
diff --git a/README.md b/README.md
@@ -136,7 +136,7 @@ All functions are built using a class **`StringGrouper`**. This class can be use
    * **`ngram_size`**: The amount of characters in each n-gram. Default is `3`.
    * **`tfidf_matrix_dtype`**: The datatype for the tf-idf values of the matrix components. Allowed values are `numpy.float32` and `numpy.float64`.  Default is `numpy.float32`.  (Note: `numpy.float32` often leads to faster processing and a smaller memory footprint albeit less numerical precision than `numpy.float64`.)
    * **`regex`**: The regex string used to clean-up the input string. Default is `"[,-./]|\s"`.
-   * **`max_n_matches`**: The maximum number of matches allowed per string in `master`. Default is the number of strings in `duplicates` (or `master`, if `duplicates` is not given).
+   * **`max_n_matches`**: The maximum number of matches allowed per string in `duplicates` (or `master` if `duplicates` is not given). Default is the number of strings in `master`.
    * **`min_similarity`**: The minimum cosine similarity for two strings to be considered a match.
     Defaults to `0.8`
    * **`number_of_processes`**: The number of processes used by the cosine similarity calculation. Defaults to
diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py
@@ -13,7 +13,6 @@
 DEFAULT_NGRAM_SIZE: int = 3
 DEFAULT_TFIDF_MATRIX_DTYPE: type = np.float32   # (only types np.float32 and np.float64 are allowed by sparse_dot_topn)
 DEFAULT_REGEX: str = r'[,-./]|\s'
-DEFAULT_MAX_N_MATCHES: int = 20
 DEFAULT_MIN_SIMILARITY: float = 0.8  # minimum cosine similarity for an item to be considered a match
 DEFAULT_N_PROCESSES: int = multiprocessing.cpu_count() - 1
 DEFAULT_IGNORE_CASE: bool = True  # ignores case by default
@@ -209,7 +208,8 @@ class StringGrouperConfig(NamedTuple):
     (Note: np.float32 often leads to faster processing and a smaller memory footprint albeit less precision
     than np.float64.)
     :param regex: str. The regex string used to cleanup the input string. Default is '[,-./]|\s'.
-    :param max_n_matches: int. The maximum number of matches allowed per string. Default is 20.
+    :param max_n_matches: int. The maximum number of matches allowed per string in `duplicates` (or `master`
+    if `duplicates` is not given). Default will be set by StringGrouper.
     :param min_similarity: float. The minimum cosine similarity for two strings to be considered a match.
     Defaults to 0.8.
     :param number_of_processes: int. The number of processes used by the cosine similarity calculation.
@@ -297,7 +297,7 @@ def __init__(self, master: pd.Series,
 
         self._config: StringGrouperConfig = StringGrouperConfig(**kwargs)
         if self._config.max_n_matches is None:
-            self._max_n_matches = len(self._master) if self._duplicates is None else len(self._duplicates)
+            self._max_n_matches = len(self._master)
         else:
             self._max_n_matches = self._config.max_n_matches