
Commit e6cc77f

Merge pull request #39 from zkarnin/sim_encode
feat: Similarity encoding
2 parents: c1d0945 + dae9334

5 files changed: +190 −0 lines changed


README.rst

Lines changed: 1 addition & 0 deletions
@@ -134,3 +134,4 @@ Overview of Submodules
 * :code:`RobustLabelEncoder` encode labels for seen and unseen labels
 * :code:`RobustStandardScaler` standardization for dense and sparse inputs
 * :code:`WOEEncoder` weight of evidence supervised encoder
+* :code:`SimilarityEncoder` encode categorical values based on their descriptive string

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -4,3 +4,4 @@ scikit-learn==0.23.2
 python-dateutil==2.8.0
 pandas==1.2.4
 tsfresh==0.18.0
+statsmodels==0.12.2

src/sagemaker_sklearn_extension/preprocessing/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,7 @@
 from .encoders import RobustOrdinalEncoder
 from .encoders import ThresholdOneHotEncoder
 from .encoders import WOEEncoder
+from .encoders import SimilarityEncoder
 
 __all__ = [
     "BaseExtremeValueTransformer",
@@ -39,4 +40,5 @@
     "log_transform",
     "quantile_transform_nonrandom",
     "WOEEncoder",
+    "SimilarityEncoder",
 ]

src/sagemaker_sklearn_extension/preprocessing/encoders.py

Lines changed: 138 additions & 0 deletions
@@ -916,3 +916,141 @@ def fit_transform(self, X, y):
 
     def _more_tags(self):
         return {"X_types": ["categorical"], "binary_only": True, "requires_y": True}
+
+
+class SimilarityAsserts(Enum):
+    TARGET_DIM = "Target dimension must be a positive integer."
+
+
+class SimilarityEncoder(BaseEstimator, TransformerMixin):
+    """Similarity encoder: encodes categorical features as a numerical vector
+    using their textual representation. Categories with similar descriptions are mapped to
+    similar vectors.
+
+    The underlying method is locality-sensitive hashing (LSH [2]) of the character-level 3-gram
+    tokens. The similarity between two category descriptions is defined as the Jaccard
+    similarity between their corresponding bags of 3-grams. The well-known min-hash [3] embedding
+    is then used to convert these token sets into vectors in a way that the l_0 distance, defined
+    as the number of differing entries, approximates the Jaccard distance. This technique was
+    introduced in [1] and shown to significantly outperform one-hot encoding in scenarios where
+    the number of categories is large.
+
+    Parameters
+    ----------
+    target_dimension: int, default=30
+        Dimension of the embedding. A small target dimension might not represent the categories
+        descriptively enough, while a large target dimension takes longer to compute and might
+        result in over-fitting. For large datasets with a number of categories much larger than
+        30, consider raising this value.
+
+    seed: int, default=None
+        Seed for random number generation. Used when fitting to set the hash functions.
+
+    Example
+    -------
+    >>> import numpy as np
+    >>> from sagemaker_sklearn_extension.preprocessing import SimilarityEncoder
+    >>> category_data = np.array(['table', 'chair', 'table (red)', 'ladder', 'table (blue)', 'table'])
+    >>> SimilarityEncoder(target_dimension=2, seed=112).fit_transform(category_data.reshape(-1, 1))
+    array([[0.06143999, 0.08793556],
+           [0.29021414, 0.29044514],
+           [0.06143999, 0.08793556],
+           [0.1312301 , 0.0455779 ],
+           [0.06143999, 0.08793556],
+           [0.06143999, 0.08793556]])
+
+    Attributes
+    ----------
+    hash_prime_: prime used for the hash functions
+        Hash functions operate on integers. A function consists of two numbers a, b, and an
+        integer x is hashed into x*a+b modulo hash_prime_. To avoid overflows we use int64 and a
+        prime p such that p*p < 2^63 - 1, the maximum int64 value.
+
+    References
+    ----------
+    [1] https://arxiv.org/abs/1907.01860
+    [2] https://en.wikipedia.org/wiki/Locality-sensitive_hashing
+    [3] https://en.wikipedia.org/wiki/MinHash
+    """
+
+    def __init__(self, target_dimension=30, seed=None):
+        self.target_dimension = target_dimension
+        self.seed = seed
+
+    def fit(self, X=None, y=None):
+        """Fit the similarity encoder.
+
+        Ignores the input data; fitting only fixes the hash functions to be used for the
+        min-hash encoding.
+
+        Parameters
+        ----------
+        X: array-like, shape (n_samples, n_features)
+            The data to encode.
+
+        y: array-like, shape (n_samples,)
+            Ignored; present for API consistency.
+
+        Returns
+        -------
+        self: SimilarityEncoder.
+        """
+        # Validate parameters
+        assert isinstance(self.target_dimension, int) and self.target_dimension > 0, SimilarityAsserts.TARGET_DIM
+
+        # prime to be used for the hash functions (a prime p such that p**2 is still within int64 range)
+        self.hash_prime_ = 2038074743
+        # random numbers for the hash functions
+        generator = np.random.RandomState(seed=self.seed)
+        self._mult = generator.randint(low=1, high=self.hash_prime_, size=(self.target_dimension, 1))
+        self._add = generator.randint(low=0, high=self.hash_prime_, size=(self.target_dimension, 1))
+        return self
+
+    def _minhash_index_sparse_vec(self, vec):
+        # prepare tokens as valid integers
+        ind = vec.indices.astype(np.int64)
+        ind %= self.hash_prime_
+        # if the vector was zero, ind is an empty array. In this case fill it with a single zero;
+        # this is needed to avoid an error below when taking a minimum along an axis
+        if ind.shape == (0,):
+            ind = np.zeros((1,), dtype=np.int64)
+
+        # compute for each token its hash values, creating a matrix of dimensions (num_hash, num_tokens)
+        all_hash_values = self._mult * ind.reshape((1, -1)) + self._add
+        all_hash_values %= self.hash_prime_
+
+        # compute the row-wise min to get a vector of length num_hash
+        hash_values = np.min(all_hash_values, axis=1)
+
+        # normalize into [0, 1)
+        return hash_values.astype(np.float64) / self.hash_prime_
+
+    def transform(self, X):
+        """Transform each column of `X` using the similarity encoding.
+
+        Returns
+        -------
+        X_encoded: array, shape (n_samples, n_encoded_features * target_dimension)
+            Array with each of the encoded columns.
+        """
+        check_is_fitted(self, "hash_prime_")
+        X = check_array(X, dtype=str)
+
+        # flatten X to one dimension and convert to string. Note: this makes sure all None values
+        # become the string 'None', which is acceptable behavior
+        str_list = X.reshape((-1,)).astype("str")
+        # tokenize each string into character 3-grams and hash each token into an integer index
+        from sklearn.feature_extraction.text import HashingVectorizer
+
+        # TODO: In the paper this function is based on, the ngram number was fixed at 3. As a
+        # follow-up, consider parametrizing this.
+        hv = HashingVectorizer(analyzer="char_wb", ngram_range=(3, 3), binary=True)
+        token_hash_matrix = hv.fit_transform(str_list)
+        # apply min-hash to every row
+        minhash_vectors = np.array([self._minhash_index_sparse_vec(row) for row in token_hash_matrix])
+        # reshape back so each input column contributes target_dimension output columns
+        return minhash_vectors.reshape((X.shape[0], X.shape[1] * self.target_dimension))
+
+    def fit_transform(self, X, y=None):
+        return self.fit(X, y).transform(X)
+
+    def _more_tags(self):
+        return {"X_types": ["string"]}
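The encoder above leans on the classic min-hash guarantee: for a random hash function, the probability that two token sets share the same minimum hash value equals their Jaccard similarity, so the fraction of agreeing coordinates across target_dimension independent hashes estimates that similarity. Below is a minimal standalone sketch of that property using the same a*x + b mod p hash family as the diff; the toy char_3grams tokenizer and all names are illustrative, not part of the package.

import numpy as np

P = 2038074743  # same prime as in the diff above; any prime with p*p inside int64 range works

def char_3grams(s):
    # toy tokenizer: the set of character-level 3-grams, hashed to integers below P
    return {hash(s[i:i + 3]) % P for i in range(len(s) - 2)}

def minhash(tokens, mult, add):
    ind = np.fromiter(tokens, dtype=np.int64).reshape((1, -1))
    # one hash function per row: h(x) = (a*x + b) mod P; keep the per-row minimum
    return np.min((mult * ind + add) % P, axis=1)

rng = np.random.RandomState(0)
d = 2000  # number of hash functions; more hashes give a tighter estimate
mult = rng.randint(1, P, size=(d, 1), dtype=np.int64)
add = rng.randint(0, P, size=(d, 1), dtype=np.int64)

a, b = char_3grams("table (red)"), char_3grams("table (blue)")
jaccard = len(a & b) / len(a | b)
# fraction of coordinates on which the two min-hash vectors agree
agreement = np.mean(minhash(a, mult, add) == minhash(b, mult, add))
print(jaccard, agreement)  # the two numbers should be close for large d

Note that Python's built-in hash is salted per process, so this sketch is only self-consistent within a single run; the encoder itself avoids that by tokenizing through HashingVectorizer, whose hashing is deterministic.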

test/test_preprocessing_encoders.py

Lines changed: 48 additions & 0 deletions
@@ -20,6 +20,7 @@
 from sagemaker_sklearn_extension.preprocessing import ThresholdOneHotEncoder
 from sagemaker_sklearn_extension.preprocessing import RobustOrdinalEncoder
 from sagemaker_sklearn_extension.preprocessing import WOEEncoder
+from sagemaker_sklearn_extension.preprocessing import SimilarityEncoder
 
 
 X = np.array([["hot dog", 1], ["hot dog", 1], ["apple", 2], ["hot dog", 3], ["hot dog", 1], ["banana", 3]])
@@ -578,3 +579,50 @@ def test_woe_multi_cols():
     Xe = enc.fit_transform(X, titanic_y)
     assert len(np.unique(Xe[:, 0])) == 4
     assert len(np.unique(Xe[:, 1])) == 4
+
+
+def test_similarity_consistent():
+    X = np.array(
+        [
+            "cat1",
+            "cat2",
+            "cat1",
+            "abcdefghijkkjihgfedcba",
+            "abcdefghijkkjihgfedcab",
+            "lmnopqrstuvwxyzzyxwvutsrqponml",
+            "a",
+            "b",
+        ]
+    ).reshape((-1, 1))
+    se = SimilarityEncoder(target_dimension=300, seed=5)
+    out = se.fit_transform(X)
+    # exactly equal strings should get equal vectors
+    assert np.array_equal(out[0], out[2])
+    # completely different strings should get different vectors
+    assert not np.array_equal(out[0], out[1])
+    # outputs 3 and 4 come from similar strings, so their vectors should be closer to each other
+    # than the vectors of 3 and 5, which come from very different strings
+    assert np.linalg.norm(out[3] - out[4]) < np.linalg.norm(out[3] - out[5])
+    # make sure single-character inputs also get different outputs
+    assert not np.array_equal(out[6], out[7])
+
+
+def test_similarity_multicol():
+    X = np.array([["cat1a", "cat1b"], ["cat2a", "cat2b"], ["cat1a", "cat1b"]])
+    se = SimilarityEncoder(target_dimension=3, seed=5)
+    out = se.fit_transform(X)
+    assert out.shape[1] == 6
+
+
+def test_similarity_fails_illegal_target_dim():
+    X = np.array(["cat1", "cat2", "cat1"]).reshape((-1, 1))
+    se = SimilarityEncoder(target_dimension=0, seed=5)
+    with pytest.raises(Exception):
+        se.fit_transform(X)
+
+
+def test_similarity_handles_empty_string():
+    X = np.array(["", " ", " ", "-1201230()*&(*&%$#!", None, np.nan]).reshape((-1, 1))
+    se = SimilarityEncoder(target_dimension=3, seed=5)
+    out = se.fit_transform(X)
+    assert out.shape == (6, 3)
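Since the new class follows the scikit-learn transformer contract (fit/transform plus _more_tags), it composes with standard tooling. A hypothetical usage sketch, not part of the commit; the column layout and parameter values are made up for illustration.

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sagemaker_sklearn_extension.preprocessing import SimilarityEncoder

X = np.array([["table", 1.0], ["chair", 2.0], ["table (red)", 3.0]], dtype=object)
pre = ColumnTransformer(
    [
        # 4 similarity dimensions for the string column
        ("category", SimilarityEncoder(target_dimension=4, seed=0), [0]),
        # pass the numeric column through a standard scaler
        ("numeric", StandardScaler(), [1]),
    ]
)
print(pre.fit_transform(X).shape)  # (3, 5): 4 encoded dims + 1 scaled numeric column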
