hlin117 · mmakowski · Jan 14, 2018 · Jan 14, 2018 · Jan 15, 2018 · Jan 15, 2018
diff --git a/mdlp/discretization.py b/mdlp/discretization.py
@@ -5,6 +5,8 @@
 
 from __future__ import division
 
+import itertools
+import multiprocessing as mp
 from sklearn.base import BaseEstimator
 from sklearn.base import TransformerMixin
 from sklearn.utils import (
@@ -17,6 +19,7 @@
 from mdlp._mdlp import MDLPDiscretize
 
 import numpy as np
+import scipy
 
 
 class MDLP(BaseEstimator, TransformerMixin):
@@ -36,6 +39,10 @@ class MDLP(BaseEstimator, TransformerMixin):
         If `X` is a 1-D array, then continuous_features should be None.
         Otherwise, for a 2-D array, defaults to `np.arange(X.shape[1])`.
 
+    drop_collapsed_features : bool (default=False)
+        If `True`, continuous features that end up with a single bin (no
+        cut points) are removed from the output of `transform`.
+
     min_depth : int (default=0)
         The minimum depth of the interval splitting. Overrides
         the MDLP stopping criterion. If the entropy at a given interval
@@ -48,6 +55,10 @@ class MDLP(BaseEstimator, TransformerMixin):
         random_state parameter. Thus, setting shuffle=False will override
         the affect of random_state.)
 
+    n_jobs : int (default=1)
+        The number of jobs to run in parallel for fit (but not transform).
+        If -1, then the number of jobs is set to the number of cores.
+
     random_state : int (default=None)
         Seed of pseudo RNG to use when shuffling the data. Affects the
         outcome of MDLP if there are multiple samples with the same
@@ -93,17 +104,23 @@ class MDLP(BaseEstimator, TransformerMixin):
     """
 
     def __init__(self, continuous_features=None, min_depth=0, shuffle=True,
-                 random_state=None):
+                 drop_collapsed_features=False, n_jobs=1, random_state=None):
         # Parameters
         self.continous_features = continuous_features
         self.min_depth = min_depth
         self.random_state = random_state
         self.shuffle = shuffle
+        self.drop_collapsed_features = drop_collapsed_features
+        if n_jobs < -1 or n_jobs == 0:  # only -1 or a positive count is valid
+            raise ValueError("Valid values for `n_jobs` are -1 or a positive integer. "
+                             "Supplied value: {0}".format(n_jobs))
+        self.n_jobs = n_jobs
 
         # Attributes
         self.continuous_features_ = continuous_features
         self.cut_points_ = None
         self.dimensions_ = None
+        self.__collapsed_features_count = 0  # set by fit(): features with no cut points
 
     def fit(self, X, y):
         """Finds the intervals of interest from the input data.
@@ -116,10 +133,11 @@ def fit(self, X, y):
 
         y : A list or array of class labels corresponding to `X`.
         """
-        X = check_array(X, force_all_finite=True, ensure_2d=False, dtype=np.float64)
+        X = check_array(X, accept_sparse=True, force_all_finite=True, \
+                        ensure_2d=False, dtype=np.float64)
         y = column_or_1d(y)
         y = check_array(y, ensure_2d=False, dtype=np.int64)
-        X, y = check_X_y(X, y)
+        X, y = check_X_y(X, y, accept_sparse=True)
 
         self.dimensions_ = len(X.shape)
 
@@ -138,16 +156,7 @@ def fit(self, X, y):
             y = y[perm]
 
         if self.dimensions_ == 2:
-            if self.continuous_features_ is None:
-                self.continuous_features_ = np.arange(X.shape[1])
-
-            self.cut_points_ = dict()
-
-            for index, col in enumerate(X.T):
-                if index not in self.continuous_features_:
-                    continue
-                cut_points = MDLPDiscretize(col, y, self.min_depth)
-                self.cut_points_[index] = cut_points
+            self.__fit_2d(X, y)
         else:
             if self.continuous_features_ is not None:
                 raise ValueError("Passed in a 1-d column of continuous features, "
@@ -156,24 +165,85 @@ def fit(self, X, y):
             cut_points = MDLPDiscretize(X, y, self.min_depth)
             self.cut_points_ = cut_points
 
+        self.__collapsed_features_count = 0 if self.dimensions_ != 2 else sum(
+            cp.size == 0 for cp in self.cut_points_.values())  # dict only in 2-D
+
         return self
 
+    def __fit_2d(self, X, y):
+        """Finds cut points for every continuous column of the 2-D `X` and
+        stores them in `self.cut_points_`, a dict keyed by column index.
+        """
+        if self.continuous_features_ is None:
+            self.continuous_features_ = np.arange(X.shape[1])
+
+        inputs = [(index, col, y, self.min_depth)
+                  for index, col in enumerate(X.T)
+                  if index in self.continuous_features_]
+
+        if self.n_jobs == 1:
+            results = [_calculate_cut_points(inp) for inp in inputs]
+        else:
+            # `n_jobs == -1` must use the pool too: `Pool(None)` lets
+            # multiprocessing pick the worker count from the CPU count.
+            pool = mp.Pool(None if self.n_jobs < 1 else self.n_jobs)
+            try:
+                results = pool.map(_calculate_cut_points, inputs)
+            finally:
+                pool.close()
+                pool.join()
+
+        self.cut_points_ = dict(results)
+
     def transform(self, X, y=None):
         """Discretizes values in X into {0, ..., k-1}.
 
         `k` is the number of bins the discretizer creates from a continuous
         feature.
         """
-        X = check_array(X, force_all_finite=True, ensure_2d=False)
+        X = check_array(X, accept_sparse=True, force_all_finite=True, ensure_2d=False)
         check_is_fitted(self, "cut_points_")
         if self.dimensions_ == 1:
             output = np.searchsorted(self.cut_points_, X)
+        else:  # fitted on 2-D data: discretize column by column
+            output = self.__transform_2d(X)
+        return output
+
+    def __transform_2d(self, X):
+        if self.drop_collapsed_features:  # output omits single-bin columns
+            kept = X.shape[1] - self.__collapsed_features_count
+            output = self.__make_output(X, (X.shape[0], kept))
+            output_col = 0
+            for input_col in range(X.shape[1]):
+                if input_col not in self.continuous_features_:
+                    values = self.__get_col(X, input_col)
+                else:
+                    cuts = self.cut_points_[input_col]
+                    if cuts.size == 0:
+                        continue  # collapsed feature gets no output column
+                    values = np.searchsorted(cuts, self.__get_col(X, input_col))
+                output[:, output_col] = values
+                output_col += 1
         else:
             output = X.copy()
             for i in self.continuous_features_:
-                output[:, i] = np.searchsorted(self.cut_points_[i], X[:, i])
+                output[:, i] = np.searchsorted(self.cut_points_[i], self.__get_col(X, i))
+        return output
+
+    def __make_output(self, X, shape):
+        if scipy.sparse.issparse(X):  # keep the output sparse for sparse input
+            output = scipy.sparse.dok_matrix(shape, dtype=X.dtype)
+        else:
+            output = np.empty(shape, dtype=X.dtype)  # uninitialised; caller fills it
         return output
 
+    def __get_col(self, X, col_index):
+        """Returns column `col_index` of `X`, densified if `X` is sparse."""
+        column = X[:, col_index]
+        if scipy.sparse.issparse(X):
+            column = column.toarray()  # dense copy of the sparse column
+        return column
+
     def cat2intervals(self, X, index=None):
         """Converts a categorical feature into a list of intervals.
         """
@@ -221,3 +291,14 @@ def _assign_intervals(self, cp_indices, index):
         backs[cp_indices != numCuts] = cut_points[non_numCuts_mask]
 
         return [(front, back) for front, back in zip(fronts, backs)]
+
+
+def _calculate_cut_points(inputs):
+    """Computes the cut points of one feature column.
+
+    `inputs` is an `(index, column, labels, min_depth)` tuple; the result
+    is `(index, cut_points)`. Used as the `pool.map` worker in `__fit_2d`.
+    """
+    index, column, labels, depth = inputs
+    dense = column.toarray()[0] if scipy.sparse.issparse(column) else column
+    return index, MDLPDiscretize(dense, labels, depth)
diff --git a/setup.py b/setup.py
@@ -43,7 +43,7 @@ def run(self):
 
     setup(
         name='mdlp-discretization',
-        version='0.3',
+        version='0.4',
         description=__doc__,
         license='BSD 3 Clause',
         url='github.com/hlin117/mdlp-discretization',

diff --git a/tests/test_mdlp.py b/tests/test_mdlp.py
@@ -1,5 +1,6 @@
 import itertools
 import numpy as np
+import scipy.sparse
 
 from numpy.testing import assert_almost_equal
 from numpy.testing import assert_array_almost_equal
@@ -32,45 +33,122 @@ def test_find_cut_no_cut():
     assert_equal(-1, k)
 
 def test_fit_transform_scale():
-  expected = [
-    [0, 0],
-    [0, 0],
-    [1, 0],
-    [2, 0],
-  ]
-
-  X = np.array([
-    [0.1, 0.1],
-    [0.2, 0.4],
-    [0.3, 0.2],
-    [0.4, 0.3]
-  ])
-  y = np.array([0, 0, 1, 2])
-  for i in range(10):
-    scaled_disc = MDLP(shuffle=False).fit_transform(X / 10**i, y)
-    assert_array_equal(expected, scaled_disc)
+    expected = [
+        [0, 0],
+        [0, 0],
+        [1, 0],
+        [2, 0],
+    ]
+
+    X = np.array([
+        [0.1, 0.1],
+        [0.2, 0.4],
+        [0.3, 0.2],
+        [0.4, 0.3]
+    ])
+    y = np.array([0, 0, 1, 2])
+    for i in range(10):
+        scaled_disc = MDLP(shuffle=False).fit_transform(X / 10**i, y)
+        assert_array_equal(expected, scaled_disc)
 
 def test_fit_transform_translate():
-  expected = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1]).reshape(-1, 1)
+    expected = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1]).reshape(-1, 1)
 
-  X = np.arange(9, dtype=float).reshape(-1, 1)
-  y = np.array([0, 0, 0, 0, 1, 0, 1, 1, 1])
-  transformed = MDLP(shuffle=False).fit_transform(X, y)
-  assert_array_equal(expected, transformed)
+    X = np.arange(9, dtype=float).reshape(-1, 1)
+    y = np.array([0, 0, 0, 0, 1, 0, 1, 1, 1])
+    transformed = MDLP(shuffle=False).fit_transform(X, y)
+    assert_array_equal(expected, transformed)
 
-  # translating data does not affect discretization result
-  translated = MDLP(shuffle=False).fit_transform(X - 5, y)
-  assert_array_equal(expected, translated)
+    # translating data does not affect discretization result
+    translated = MDLP(shuffle=False).fit_transform(X - 5, y)
+    assert_array_equal(expected, translated)
 
 def test_coerce_list():
-  expected = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1]).reshape(-1, 1)
+    expected = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1]).reshape(-1, 1)
+
+    X = [[i] for i in range(9)]
+    y = [0, 0, 0, 0, 1, 0, 1, 1, 1]
+    transformed = MDLP(shuffle=False).fit_transform(X, y)
+    assert_array_equal(expected, transformed)
+
+    np_X = np.arange(9).reshape(-1, 1)
+    np_y = np.array([0, 0, 0, 0, 1, 0, 1, 1, 1])
+    np_transformed = MDLP(shuffle=False).fit_transform(np_X, np_y)
+    assert_array_equal(expected, np_transformed)
+
+def test_drop_collapsed_features_dense():
+    expected = [  # two output columns for the five input columns below
+        [0, 0],
+        [0, 0],
+        [1, 1],
+        [2, 2],
+    ]
+
+    X = np.array([
+        [0.1, 0.1, 0.1, 0.1, 0.1],
+        [0.4, 0.2, 0.4, 0.2, 0.4],
+        [0.2, 0.3, 0.2, 0.3, 0.2],
+        [0.3, 0.4, 0.3, 0.4, 0.3]
+    ])
+    y = np.array([0, 0, 1, 2])
+    disc = MDLP(drop_collapsed_features=True, shuffle=False).fit_transform(X, y)
+    assert_array_equal(expected, disc)
+
+def test_sparse_input():
+    expected = [
+        [0, 0],
+        [0, 0],
+        [1, 0],
+        [2, 0],
+    ]
+
+    dense_X = np.array([
+        [0.1, 0.1],
+        [0.2, 0.4],
+        [0.3, 0.2],
+        [0.4, 0.3]
+    ])
+    X = scipy.sparse.csr_matrix(dense_X)  # feed the same values in CSR form
+    y = np.array([0, 0, 1, 2])
+    disc = MDLP(shuffle=False).fit_transform(X, y)
+    assert_array_equal(expected, disc.toarray())  # compare after densifying
+
+def test_drop_collapsed_features_sparse():
+    expected = [  # two output columns for the five input columns below
+        [0, 0],
+        [0, 0],
+        [1, 1],
+        [2, 2],
+    ]
+
+    dense_X = np.array([
+        [0.1, 0.1, 0.1, 0.1, 0.1],
+        [0.4, 0.2, 0.4, 0.2, 0.4],
+        [0.2, 0.3, 0.2, 0.3, 0.2],
+        [0.3, 0.4, 0.3, 0.4, 0.3]
+    ])
+    X = scipy.sparse.csr_matrix(dense_X)  # feed the same values in CSR form
+    y = np.array([0, 0, 1, 2])
+    disc = MDLP(drop_collapsed_features=True, shuffle=False).fit_transform(X, y)
+    assert_array_equal(expected, disc.toarray())  # compare after densifying
 
-  X = [[i] for i in range(9)]
-  y = [0, 0, 0, 0, 1, 0, 1, 1, 1]
-  transformed = MDLP(shuffle=False).fit_transform(X, y)
-  assert_array_equal(expected, transformed)
+def test_multiprocessing():
+    """Only tests that the functionality is not affected, not that parallel
+       processing actually takes place.
+    """
+    expected = [
+        [0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0],
+        [0, 1, 0, 1, 0],
+        [0, 2, 0, 2, 0],
+    ]
 
-  np_X = np.arange(9).reshape(-1, 1)
-  np_y = np.array([0, 0, 0, 0, 1, 0, 1, 1, 1])
-  np_transformed = MDLP(shuffle=False).fit_transform(np_X, np_y)
-  assert_array_equal(expected, np_transformed)
+    X = np.array([
+        [0.1, 0.1, 0.1, 0.1, 0.1],
+        [0.4, 0.2, 0.4, 0.2, 0.4],
+        [0.2, 0.3, 0.2, 0.3, 0.2],
+        [0.3, 0.4, 0.3, 0.4, 0.3]
+    ])
+    y = np.array([0, 0, 1, 2])
+    disc = MDLP(n_jobs=3, shuffle=False).fit_transform(X, y)
+    assert_array_equal(expected, disc)