ParticularMiner · ParticularMiner · May 12, 2021 · Jul 2, 2021 · Jul 2, 2021 · Aug 8, 2021
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -11,7 +11,7 @@ jobs:
     strategy:
       matrix:
         python-version: [3.7, 3.8, 3.9]
-        os: [ubuntu-latest, windows-latest]
+        os: [ubuntu-latest]
 
     steps:
     - uses: actions/checkout@v2
@@ -21,8 +21,13 @@ jobs:
       with:
         python-version: ${{ matrix.python-version }}
 
-    - name: Install package
-      run: pip install .
+    - name: Install dev-package
+      run: |
+        sudo apt-get install qemu tree
+        python -m pip install --upgrade pip
+        pip install -v -e .
+        qemu-x86_64 -R 20M python time_match_strings.py
+
 
     - name: Run tests
       run: python -m unittest
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,33 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.6.0] - 2021-09-21
+
+### Added
+
+* matrix-blocking/splitting as a performance-enhancer (see [README.md](https://github.com/ParticularMiner/string_grouper/tree/block#performance) for details)
+* new keyword arguments `force_symmetries` and `n_blocks` (see [README.md](https://github.com/ParticularMiner/string_grouper/tree/block#kwargs) for details)
+* new dependency on packages `topn` and `sparse_dot_topn_for_blocks` to help with the matrix-blocking
+* capability to reuse a previously initialized `StringGrouper` (that is, the corpus can now persist across high-level function calls like `match_strings()`; see [README.md](https://github.com/ParticularMiner/string_grouper/tree/block#corpus) for details)
+
+
+## [0.5.0] - 2021-06-11
+
+### Added
+
+* Added new keyword argument **`tfidf_matrix_dtype`** (the datatype for the tf-idf values of the matrix components). Allowed values are `numpy.float32` and `numpy.float64` (used by the required external package `sparse_dot_topn` version 0.3.1).  Default is `numpy.float32`.  (Note: `numpy.float32` often leads to faster processing and a smaller memory footprint albeit with less numerical precision than `numpy.float64`.)
+
+### Changed
+
+* Changed dependency on `sparse_dot_topn` from version 0.2.9 to 0.3.1
+* Changed the default datatype for cosine similarities from `numpy.float64` to `numpy.float32` to boost computational performance at the expense of numerical precision.
+* Changed the default value of the keyword argument `max_n_matches` from 20 to the number of strings in `duplicates` (or `master`, if `duplicates` is not given). 
+* Changed the warning issued when the condition \[`include_zeroes=True` and `min_similarity` &le; 0 and `max_n_matches` is not sufficiently high to capture all nonzero-similarity-matches\] is met into an exception.
+
+### Removed
+
+* Removed the keyword argument `suppress_warning`
+
 ## [0.4.0] - 2021-04-11
 
 ### Added

diff --git a/README.md b/README.md
diff --git a/images/BlockMatrix_1_1.png b/images/BlockMatrix_1_1.png
diff --git a/images/BlockMatrix_1_2.png b/images/BlockMatrix_1_2.png
diff --git a/images/BlockMatrix_2_2.png b/images/BlockMatrix_2_2.png
diff --git a/images/BlockNumberSpaceExploration1.png b/images/BlockNumberSpaceExploration1.png
diff --git a/images/Fuzzy_vs_Exact.png b/images/Fuzzy_vs_Exact.png
diff --git a/images/ScaledRuntimeContourPlot.png b/images/ScaledRuntimeContourPlot.png
diff --git a/images/ScaledTimePerComparison.png b/images/ScaledTimePerComparison.png
diff --git a/setup.py b/setup.py
@@ -9,8 +9,8 @@
 
 setup(
     name='string_grouper',
-    version='0.4.0',
-    packages=['string_grouper'],
+    version='0.6.0',
+    packages=['string_grouper', 'string_grouper_utils'],
     license='MIT License',
     description='String grouper contains functions to do string matching using TF-IDF and the cossine similarity. '
                 'Based on https://bergvca.github.io/2017/10/14/super-fast-string-matching.html',
@@ -25,6 +25,7 @@
                       , 'scipy'
                       , 'scikit-learn'
                       , 'numpy'
-                      , 'sparse_dot_topn>=0.2.6'
+                      , 'sparse_dot_topn_for_blocks>=0.3.1'
+                      , 'topn>=0.0.7'
                       ]
 )
diff --git a/string_grouper/__init__.py b/string_grouper/__init__.py
@@ -1,2 +1,2 @@
 from .string_grouper import compute_pairwise_similarities, group_similar_strings, match_most_similar, match_strings, \
-StringGrouperConfig, StringGrouper
+    StringGrouperConfig, StringGrouper
diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py