diff --git a/mdlp/discretization.py b/mdlp/discretization.py index f04251f..1250e6e 100644 --- a/mdlp/discretization.py +++ b/mdlp/discretization.py @@ -5,6 +5,8 @@ from __future__ import division +import itertools +import multiprocessing as mp from sklearn.base import BaseEstimator from sklearn.base import TransformerMixin from sklearn.utils import ( @@ -17,6 +19,7 @@ from mdlp._mdlp import MDLPDiscretize import numpy as np +import scipy class MDLP(BaseEstimator, TransformerMixin): @@ -36,6 +39,10 @@ class MDLP(BaseEstimator, TransformerMixin): If `X` is a 1-D array, then continuous_features should be None. Otherwise, for a 2-D array, defaults to `np.arange(X.shape[1])`. + drop_collapsed_features : bool (default=False) + When set to `True` will remove single-bin features when transforming + `X`. + min_depth : int (default=0) The minimum depth of the interval splitting. Overrides the MDLP stopping criterion. If the entropy at a given interval @@ -48,6 +55,10 @@ class MDLP(BaseEstimator, TransformerMixin): random_state parameter. Thus, setting shuffle=False will override the affect of random_state.) + n_jobs : int (default=1) + The number of jobs to run in parallel for fit (but not transform). + If -1, then the number of jobs is set to the number of cores. + random_state : int (default=None) Seed of pseudo RNG to use when shuffling the data. Affects the outcome of MDLP if there are multiple samples with the same @@ -93,17 +104,23 @@ class MDLP(BaseEstimator, TransformerMixin): """ def __init__(self, continuous_features=None, min_depth=0, shuffle=True, - random_state=None): + drop_collapsed_features=False, n_jobs=1, random_state=None): # Parameters self.continous_features = continuous_features self.min_depth = min_depth self.random_state = random_state self.shuffle = shuffle + self.drop_collapsed_features = drop_collapsed_features + if n_jobs < -1 or n_jobs == 0: + raise ValueError("Valid values for `n_jobs` are -1 or a positive integer." 
def __fit_2d(self, X, y):
    """Compute the cut points of every continuous column of a 2-D `X`.

    Stores the result in `self.cut_points_` as a dict built from the
    `(index, cut_points)` pairs returned by `_calculate_cut_points`, one
    per continuous column. If `self.continuous_features_` is None, it is
    first set to every column index of `X`.

    Parameters
    ----------
    X : 2-D array or sparse matrix
        Its columns are visited by iterating over `X.T`.

    y : Class labels, forwarded unchanged with every column.
    """
    if self.continuous_features_ is None:
        self.continuous_features_ = np.arange(X.shape[1])

    # Build the lookup once rather than scanning the index array again
    # for every column of X.
    continuous = set(np.asarray(self.continuous_features_).ravel().tolist())

    # One (index, column, labels, min_depth) task per continuous column.
    tasks = [(index, col, y, self.min_depth)
             for index, col in enumerate(X.T)
             if index in continuous]

    if self.n_jobs == 1:
        results = [_calculate_cut_points(task) for task in tasks]
    else:
        # Any value other than 1 is parallel. The previous `n_jobs > 1`
        # test silently ran the documented `n_jobs=-1` sequentially.
        # Passing None lets multiprocessing choose the number of workers.
        pool = mp.Pool(None if self.n_jobs < 1 else self.n_jobs)
        try:
            results = pool.map(_calculate_cut_points, tasks)
        finally:
            # Release the worker processes even when a task raised.
            pool.close()
            pool.join()

    self.cut_points_ = dict(results)
def _calculate_cut_points(inputs):
    """Run `MDLPDiscretize` on a single feature column.

    `inputs` is a 4-tuple `(index, column, labels, min_depth)`. A sparse
    column is replaced by the first row of its dense form before being
    discretized. Returns the pair `(index, cut_points)`.
    """
    feature_index, column, labels, depth = inputs
    dense_column = (
        column.toarray()[0] if scipy.sparse.issparse(column) else column
    )
    cut_points = MDLPDiscretize(dense_column, labels, depth)
    return feature_index, cut_points
def test_drop_collapsed_features_dense():
    """With `drop_collapsed_features=True`, a dense 5-column input is
    expected to come back with only 2 discretized columns."""
    labels = np.array([0, 0, 1, 2])
    features = np.array([
        [0.1, 0.1, 0.1, 0.1, 0.1],
        [0.4, 0.2, 0.4, 0.2, 0.4],
        [0.2, 0.3, 0.2, 0.3, 0.2],
        [0.3, 0.4, 0.3, 0.4, 0.3]
    ])

    discretizer = MDLP(drop_collapsed_features=True, shuffle=False)
    result = discretizer.fit_transform(features, labels)

    assert_array_equal([[0, 0], [0, 0], [1, 1], [2, 2]], result)
def test_multiprocessing():
    """Check that the `n_jobs` setting does not change the fitted result.

    Only tests that the functionality is not affected, not that parallel
    processing actually takes place.
    """
    expected = [
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 1, 0, 1, 0],
        [0, 2, 0, 2, 0],
    ]

    X = np.array([
        [0.1, 0.1, 0.1, 0.1, 0.1],
        [0.4, 0.2, 0.4, 0.2, 0.4],
        [0.2, 0.3, 0.2, 0.3, 0.2],
        [0.3, 0.4, 0.3, 0.4, 0.3]
    ])
    y = np.array([0, 0, 1, 2])

    # 1 is the sequential baseline, 3 an explicit worker count, and -1 the
    # documented "number of cores" setting; all must agree.
    for n_jobs in (1, 3, -1):
        disc = MDLP(n_jobs=n_jobs, shuffle=False).fit_transform(X, y)
        assert_array_equal(expected, disc)