Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 96 additions & 15 deletions mdlp/discretization.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from __future__ import division

import itertools
import multiprocessing as mp
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.utils import (
Expand All @@ -17,6 +19,7 @@
from mdlp._mdlp import MDLPDiscretize

import numpy as np
import scipy


class MDLP(BaseEstimator, TransformerMixin):
Expand All @@ -36,6 +39,10 @@ class MDLP(BaseEstimator, TransformerMixin):
If `X` is a 1-D array, then continuous_features should be None.
Otherwise, for a 2-D array, defaults to `np.arange(X.shape[1])`.

drop_collapsed_features : bool (default=False)
When set to `True` will remove single-bin features when transforming
`X`.

min_depth : int (default=0)
The minimum depth of the interval splitting. Overrides
the MDLP stopping criterion. If the entropy at a given interval
Expand All @@ -48,6 +55,10 @@ class MDLP(BaseEstimator, TransformerMixin):
random_state parameter. Thus, setting shuffle=False will override
the effect of random_state.)

n_jobs : int (default=1)
The number of jobs to run in parallel for fit (but not transform).
If -1, then the number of jobs is set to the number of cores.

random_state : int (default=None)
Seed of pseudo RNG to use when shuffling the data. Affects the
outcome of MDLP if there are multiple samples with the same
Expand Down Expand Up @@ -93,17 +104,23 @@ class MDLP(BaseEstimator, TransformerMixin):
"""

def __init__(self, continuous_features=None, min_depth=0, shuffle=True,
random_state=None):
drop_collapsed_features=False, n_jobs=1, random_state=None):
# Parameters
self.continous_features = continuous_features
self.min_depth = min_depth
self.random_state = random_state
self.shuffle = shuffle
self.drop_collapsed_features = drop_collapsed_features
if n_jobs < -1 or n_jobs == 0:
raise ValueError("Valid values for `n_jobs` are -1 or a positive integer."
"supplied value: {0}".format(n_jobs))
self.n_jobs = n_jobs

# Attributes
self.continuous_features_ = continuous_features
self.cut_points_ = None
self.dimensions_ = None
self.__collapsed_features_count = 0

def fit(self, X, y):
"""Finds the intervals of interest from the input data.
Expand All @@ -116,10 +133,11 @@ def fit(self, X, y):

y : A list or array of class labels corresponding to `X`.
"""
X = check_array(X, force_all_finite=True, ensure_2d=False, dtype=np.float64)
X = check_array(X, accept_sparse=True, force_all_finite=True, \
ensure_2d=False, dtype=np.float64)
y = column_or_1d(y)
y = check_array(y, ensure_2d=False, dtype=np.int64)
X, y = check_X_y(X, y)
X, y = check_X_y(X, y, accept_sparse=True)

self.dimensions_ = len(X.shape)

Expand All @@ -138,16 +156,7 @@ def fit(self, X, y):
y = y[perm]

if self.dimensions_ == 2:
if self.continuous_features_ is None:
self.continuous_features_ = np.arange(X.shape[1])

self.cut_points_ = dict()

for index, col in enumerate(X.T):
if index not in self.continuous_features_:
continue
cut_points = MDLPDiscretize(col, y, self.min_depth)
self.cut_points_[index] = cut_points
self.__fit_2d(X, y)
else:
if self.continuous_features_ is not None:
raise ValueError("Passed in a 1-d column of continuous features, "
Expand All @@ -156,24 +165,85 @@ def fit(self, X, y):
cut_points = MDLPDiscretize(X, y, self.min_depth)
self.cut_points_ = cut_points

self.__collapsed_features_count = \
len([cp for cp in self.cut_points_.values() if cp.size == 0])

return self

def __fit_2d(self, X, y):
    """Computes the cut points of every continuous column of a 2-D `X`.

    When `continuous_features_` is None, all columns are treated as
    continuous. The result is stored in `self.cut_points_` as a dict
    mapping column index to that column's cut points.
    """
    if self.continuous_features_ is None:
        self.continuous_features_ = np.arange(X.shape[1])

    # One work item per continuous column, in the tuple layout that
    # `_calculate_cut_points` expects.
    inputs = [(index, col, y, self.min_depth)
              for index, col in enumerate(X.T)
              if index in self.continuous_features_]

    if self.n_jobs == 1:
        results = [_calculate_cut_points(inp) for inp in inputs]
    else:
        # n_jobs == -1 must also take this branch; passing None lets
        # multiprocessing size the pool to the number of cores.
        pool = mp.Pool(None if self.n_jobs < 1 else self.n_jobs)
        try:
            results = pool.map(_calculate_cut_points, inputs)
        finally:
            # Release the worker processes even if a worker raised.
            pool.close()
            pool.join()

    self.cut_points_ = dict(results)

def transform(self, X, y=None):
    """Discretizes values in X into {0, ..., k-1}.

    `k` is the number of bins the discretizer creates from a continuous
    feature.

    Parameters
    ----------
    X : dense array or sparse matrix of values to discretize.
    y : not used by this method.

    Raises
    ------
    NotFittedError
        If no cut points have been computed yet.
    """
    from sklearn.exceptions import NotFittedError

    X = check_array(X, accept_sparse=True, force_all_finite=True, ensure_2d=False)
    check_is_fitted(self, "cut_points_")
    # `cut_points_` is preset to None by the constructor, so the attribute
    # exists before fitting; test the value explicitly as well.
    if self.cut_points_ is None:
        raise NotFittedError("This MDLP instance is not fitted yet. Call "
                             "'fit' before using 'transform'.")

    if self.dimensions_ == 1:
        return np.searchsorted(self.cut_points_, X)
    return self.__transform_2d(X)

def __transform_2d(self, X):
if self.drop_collapsed_features:
new_shape = (X.shape[0], X.shape[1] - self.__collapsed_features_count)
output = self.__make_output(X, new_shape)
output_col = 0
for input_col in range(X.shape[1]):
if input_col in self.continuous_features_:
if self.cut_points_[input_col].size > 0:
output[:, output_col] = \
np.searchsorted(self.cut_points_[input_col], \
self.__get_col(X, input_col))
output_col += 1
else:
output[:, output_col] = self.__get_col(X, input_col)
output_col += 1
else:
output = X.copy()
for i in self.continuous_features_:
output[:, i] = np.searchsorted(self.cut_points_[i], X[:, i])
output[:, i] = np.searchsorted(self.cut_points_[i], self.__get_col(X, i))
return output

def __make_output(self, X, shape):
if scipy.sparse.issparse(X):
output = scipy.sparse.dok_matrix(shape, dtype=X.dtype)
else:
output = np.ndarray(shape=shape, dtype=X.dtype)
return output

def __get_col(self, X, col_index):
if scipy.sparse.issparse(X):
col = X[:, col_index].toarray()
else:
col = X[:, col_index]
return col

def cat2intervals(self, X, index=None):
"""Converts a categorical feature into a list of intervals.
"""
Expand Down Expand Up @@ -221,3 +291,14 @@ def _assign_intervals(self, cp_indices, index):
backs[cp_indices != numCuts] = cut_points[non_numCuts_mask]

return [(front, back) for front, back in zip(fronts, backs)]


def _calculate_cut_points(inputs):
    """Computes the cut points of one feature column.

    `inputs` is a tuple `(index, column, labels, min_depth)`. A sparse
    column is densified and its first row is used. Returns the pair
    `(index, cut_points)`.
    """
    feature_index, column, labels, depth = inputs
    dense_column = column.toarray()[0] if scipy.sparse.issparse(column) else column
    return feature_index, MDLPDiscretize(dense_column, labels, depth)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def run(self):

setup(
name='mdlp-discretization',
version='0.3',
version='0.4',
description=__doc__,
license='BSD 3 Clause',
url='github.com/hlin117/mdlp-discretization',
Expand Down
146 changes: 112 additions & 34 deletions tests/test_mdlp.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import itertools
import numpy as np
import scipy.sparse

from numpy.testing import assert_almost_equal
from numpy.testing import assert_array_almost_equal
Expand Down Expand Up @@ -32,45 +33,122 @@ def test_find_cut_no_cut():
assert_equal(-1, k)

def test_fit_transform_scale():
    """Checks that dividing the features by powers of ten keeps the bins."""
    expected = [
        [0, 0],
        [0, 0],
        [1, 0],
        [2, 0],
    ]

    X = np.array([
        [0.1, 0.1],
        [0.2, 0.4],
        [0.3, 0.2],
        [0.4, 0.3]
    ])
    y = np.array([0, 0, 1, 2])
    for i in range(10):
        scaled_disc = MDLP(shuffle=False).fit_transform(X / 10**i, y)
        assert_array_equal(expected, scaled_disc)

def test_fit_transform_translate():
    """Checks that shifting the feature values keeps the bins."""
    expected = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1]).reshape(-1, 1)

    X = np.arange(9, dtype=float).reshape(-1, 1)
    y = np.array([0, 0, 0, 0, 1, 0, 1, 1, 1])
    transformed = MDLP(shuffle=False).fit_transform(X, y)
    assert_array_equal(expected, transformed)

    # translating data does not affect discretization result
    translated = MDLP(shuffle=False).fit_transform(X - 5, y)
    assert_array_equal(expected, translated)

def test_coerce_list():
    """Checks that list input gives the same bins as array input."""
    expected = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1]).reshape(-1, 1)

    X = [[i] for i in range(9)]
    y = [0, 0, 0, 0, 1, 0, 1, 1, 1]
    transformed = MDLP(shuffle=False).fit_transform(X, y)
    assert_array_equal(expected, transformed)

    np_X = np.arange(9).reshape(-1, 1)
    np_y = np.array([0, 0, 0, 0, 1, 0, 1, 1, 1])
    np_transformed = MDLP(shuffle=False).fit_transform(np_X, np_y)
    assert_array_equal(expected, np_transformed)

def test_drop_collapsed_features_dense():
    """Checks that single-bin columns are dropped from a dense result."""
    labels = np.array([0, 0, 1, 2])
    features = np.array([
        [0.1, 0.1, 0.1, 0.1, 0.1],
        [0.4, 0.2, 0.4, 0.2, 0.4],
        [0.2, 0.3, 0.2, 0.3, 0.2],
        [0.3, 0.4, 0.3, 0.4, 0.3]
    ])
    discretizer = MDLP(drop_collapsed_features=True, shuffle=False)
    result = discretizer.fit_transform(features, labels)

    # Only the two columns that received cut points remain.
    assert_array_equal([[0, 0], [0, 0], [1, 1], [2, 2]], result)

def test_sparse_input():
    """Checks that a CSR matrix is discretized to the expected bins."""
    labels = np.array([0, 0, 1, 2])
    sparse_features = scipy.sparse.csr_matrix(np.array([
        [0.1, 0.1],
        [0.2, 0.4],
        [0.3, 0.2],
        [0.4, 0.3]
    ]))
    discretizer = MDLP(shuffle=False)
    result = discretizer.fit_transform(sparse_features, labels)

    # The result is densified before the comparison.
    assert_array_equal([[0, 0], [0, 0], [1, 0], [2, 0]], result.toarray())

def test_drop_collapsed_features_sparse():
    """Checks that single-bin columns are dropped from a sparse result."""
    expected = [
        [0, 0],
        [0, 0],
        [1, 1],
        [2, 2],
    ]

    dense_X = np.array([
        [0.1, 0.1, 0.1, 0.1, 0.1],
        [0.4, 0.2, 0.4, 0.2, 0.4],
        [0.2, 0.3, 0.2, 0.3, 0.2],
        [0.3, 0.4, 0.3, 0.4, 0.3]
    ])
    X = scipy.sparse.csr_matrix(dense_X)
    y = np.array([0, 0, 1, 2])
    disc = MDLP(drop_collapsed_features=True, shuffle=False).fit_transform(X, y)
    assert_array_equal(expected, disc.toarray())
def test_multiprocessing():
    """Only tests that the functionality is not affected, not that parallel
    processing actually takes place.
    """
    expected = [
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 1, 0, 1, 0],
        [0, 2, 0, 2, 0],
    ]

    X = np.array([
        [0.1, 0.1, 0.1, 0.1, 0.1],
        [0.4, 0.2, 0.4, 0.2, 0.4],
        [0.2, 0.3, 0.2, 0.3, 0.2],
        [0.3, 0.4, 0.3, 0.4, 0.3]
    ])
    y = np.array([0, 0, 1, 2])
    disc = MDLP(n_jobs=3, shuffle=False).fit_transform(X, y)
    assert_array_equal(expected, disc)