Allow features to be AbstractVector (in addition to AbstractMatrix) #150


Open · wants to merge 9 commits into base: dev
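In a sentence: every public entry point that previously required `features::AbstractMatrix` now also accepts an `AbstractVector`, interpreted as N samples of a single feature. A hedged sketch of the intended usage (the data and variable names are invented, and assume this PR's branch of DecisionTree.jl):

using DecisionTree  # this PR's branch

# A single feature per sample can now be passed as a plain Vector;
# previously it had to be reshaped into an N×1 matrix.
features = [1.0, 2.0, 3.0, 10.0, 11.0, 12.0]   # 6 samples, 1 feature
labels   = ["a", "a", "a", "b", "b", "b"]

model = build_tree(labels, features)   # instead of build_tree(labels, reshape(features, :, 1))
preds = apply_tree(model, features)    # 6 predictions, one per sample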
70 changes: 35 additions & 35 deletions src/classification/main.jl
@@ -23,7 +23,7 @@ end

# Applies `row_fun(X_row)::AbstractVector` to each row in X
# and returns a matrix containing the resulting vectors, stacked vertically
-function stack_function_results(row_fun::Function, X::AbstractMatrix)
+function stack_function_results(row_fun::Function, X::AbstractVecOrMat)
N = size(X, 1)
N_cols = length(row_fun(X[1, :])) # gets the number of columns
out = Array{Float64}(undef, N, N_cols)
@@ -52,7 +52,7 @@ end

function build_stump(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
weights = nothing;
rng = Random.GLOBAL_RNG) where {S, T}

@@ -73,7 +73,7 @@ end

function build_tree(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
n_subfeatures = 0,
max_depth = -1,
min_samples_leaf = 1,
@@ -138,23 +138,23 @@ function prune_tree(tree::LeafOrNode{S, T}, purity_thresh=1.0) where {S, T}
end


-apply_tree(leaf::Leaf{T}, feature::AbstractVector{S}) where {S, T} = leaf.majority
+_apply_tree(leaf::Leaf{T}, feature::AbstractVector{S}) where {S, T} = leaf.majority

-function apply_tree(tree::Node{S, T}, features::AbstractVector{S}) where {S, T}
+function _apply_tree(tree::Node{S, T}, features::AbstractVector{S}) where {S, T}
if tree.featid == 0
-return apply_tree(tree.left, features)
+return _apply_tree(tree.left, features)
elseif features[tree.featid] < tree.featval
-return apply_tree(tree.left, features)
+return _apply_tree(tree.left, features)
else
-return apply_tree(tree.right, features)
+return _apply_tree(tree.right, features)
end
end

-function apply_tree(tree::LeafOrNode{S, T}, features::AbstractMatrix{S}) where {S, T}
+function apply_tree(tree::LeafOrNode{S, T}, features::AbstractVecOrMat{S}) where {S, T}
N = size(features,1)
predictions = Array{T}(undef, N)
for i in 1:N
-predictions[i] = apply_tree(tree, features[i, :])
+predictions[i] = _apply_tree(tree, features[i, :])
end
if T <: Float64
return Float64.(predictions)
@@ -171,25 +171,25 @@ n_labels` matrix of probabilities, each row summing up to 1.
`col_labels` is a vector containing the distinct labels
(eg. ["versicolor", "virginica", "setosa"]). It specifies the column ordering
of the output matrix. """
-apply_tree_proba(leaf::Leaf{T}, features::AbstractVector{S}, labels) where {S, T} =
+_apply_tree_proba(leaf::Leaf{T}, features::AbstractVector{S}, labels) where {S, T} =
compute_probabilities(labels, leaf.values)

-function apply_tree_proba(tree::Node{S, T}, features::AbstractVector{S}, labels) where {S, T}
+function _apply_tree_proba(tree::Node{S, T}, features::AbstractVector{S}, labels) where {S, T}
if tree.featval === nothing
-return apply_tree_proba(tree.left, features, labels)
+return _apply_tree_proba(tree.left, features, labels)
elseif features[tree.featid] < tree.featval
-return apply_tree_proba(tree.left, features, labels)
+return _apply_tree_proba(tree.left, features, labels)
else
-return apply_tree_proba(tree.right, features, labels)
+return _apply_tree_proba(tree.right, features, labels)
end
end

-apply_tree_proba(tree::LeafOrNode{S, T}, features::AbstractMatrix{S}, labels) where {S, T} =
-stack_function_results(row->apply_tree_proba(tree, row, labels), features)
+apply_tree_proba(tree::LeafOrNode{S, T}, features::AbstractVecOrMat{S}, labels) where {S, T} =
+stack_function_results(row->_apply_tree_proba(tree, row, labels), features)
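Per the docstring above, `apply_tree_proba` returns an `N x n_labels` matrix whose rows sum to 1; with the widened signature it accepts vector features as well. Continuing the earlier sketch (hypothetical `model` and `features`):

probs = apply_tree_proba(model, features, ["a", "b"])  # 6×2 matrix, each row sums to 1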

function build_forest(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
n_subfeatures = -1,
n_trees = 10,
partial_sampling = 0.7,
@@ -254,11 +254,11 @@ function build_forest(
return Ensemble{S, T}(forest)
end

-function apply_forest(forest::Ensemble{S, T}, features::AbstractVector{S}) where {S, T}
+function _apply_forest(forest::Ensemble{S, T}, features::AbstractVector{S}) where {S, T}
n_trees = length(forest)
votes = Array{T}(undef, n_trees)
for i in 1:n_trees
-votes[i] = apply_tree(forest.trees[i], features)
+votes[i] = _apply_tree(forest.trees[i], features)
end

if T <: Float64
@@ -268,11 +268,11 @@ function apply_forest(forest::Ensemble{S, T}, features::AbstractVector{S}) where
end
end

-function apply_forest(forest::Ensemble{S, T}, features::AbstractMatrix{S}) where {S, T}
+function apply_forest(forest::Ensemble{S, T}, features::AbstractVecOrMat{S}) where {S, T}
N = size(features,1)
predictions = Array{T}(undef, N)
for i in 1:N
-predictions[i] = apply_forest(forest, features[i, :])
+predictions[i] = _apply_forest(forest, features[i, :])
end
return predictions
end
@@ -285,18 +285,18 @@ n_labels` matrix of probabilities, each row summing up to 1.
`col_labels` is a vector containing the distinct labels
(eg. ["versicolor", "virginica", "setosa"]). It specifies the column ordering
of the output matrix. """
-function apply_forest_proba(forest::Ensemble{S, T}, features::AbstractVector{S}, labels) where {S, T}
-votes = [apply_tree(tree, features) for tree in forest.trees]
+function _apply_forest_proba(forest::Ensemble{S, T}, features::AbstractVector{S}, labels) where {S, T}
+votes = [_apply_tree(tree, features) for tree in forest.trees]
return compute_probabilities(labels, votes)
end

-apply_forest_proba(forest::Ensemble{S, T}, features::AbstractMatrix{S}, labels) where {S, T} =
-stack_function_results(row->apply_forest_proba(forest, row, labels),
+apply_forest_proba(forest::Ensemble{S, T}, features::AbstractVecOrMat{S}, labels) where {S, T} =
+stack_function_results(row->_apply_forest_proba(forest, row, labels),
features)

function build_adaboost_stumps(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
n_iterations :: Integer;
rng = Random.GLOBAL_RNG) where {S, T}
N = length(labels)
@@ -321,11 +321,11 @@ function build_adaboost_stumps(
return (Ensemble{S, T}(stumps), coeffs)
end

-function apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractVector{S}) where {S, T}
+function _apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractVector{S}) where {S, T}
n_stumps = length(stumps)
counts = Dict()
for i in 1:n_stumps
-prediction = apply_tree(stumps.trees[i], features)
+prediction = _apply_tree(stumps.trees[i], features)
counts[prediction] = get(counts, prediction, 0.0) + coeffs[i]
end
top_prediction = stumps.trees[1].left.majority
@@ -339,11 +339,11 @@ function apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Fl
return top_prediction
end

-function apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractMatrix{S}) where {S, T}
+function apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractVecOrMat{S}) where {S, T}
n_samples = size(features, 1)
predictions = Array{T}(undef, n_samples)
for i in 1:n_samples
-predictions[i] = apply_adaboost_stumps(stumps, coeffs, features[i,:])
+predictions[i] = _apply_adaboost_stumps(stumps, coeffs, features[i,:])
end
return predictions
end
@@ -356,13 +356,13 @@ n_labels` matrix of probabilities, each row summing up to 1.
`col_labels` is a vector containing the distinct labels
(eg. ["versicolor", "virginica", "setosa"]). It specifies the column ordering
of the output matrix. """
-function apply_adaboost_stumps_proba(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64},
+function _apply_adaboost_stumps_proba(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64},
features::AbstractVector{S}, labels::AbstractVector{T}) where {S, T}
-votes = [apply_tree(stump, features) for stump in stumps.trees]
+votes = [_apply_tree(stump, features) for stump in stumps.trees]
compute_probabilities(labels, votes, coeffs)
end

function apply_adaboost_stumps_proba(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64},
-features::AbstractMatrix{S}, labels::AbstractVector{T}) where {S, T}
-stack_function_results(row->apply_adaboost_stumps_proba(stumps, coeffs, row, labels), features)
+features::AbstractVecOrMat{S}, labels::AbstractVector{T}) where {S, T}
+stack_function_results(row->_apply_adaboost_stumps_proba(stumps, coeffs, row, labels), features)
end
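A pattern recurs throughout this file: the public methods keep their names and widen `features` to `AbstractVecOrMat`, while the per-sample helpers gain a leading underscore. The rename is what keeps dispatch unambiguous: once a `Vector` is a legal dataset (N samples, one feature), it can no longer also mean "a single sample's feature row" in the same public method. A minimal standalone sketch of the scheme (hypothetical names, not package code):

# One sample's feature row -> one prediction.
_predict_one(row::AbstractVector) = row[1] < 5.0 ? "a" : "b"

# Public method: rows are samples, whether X is a Vector or a Matrix.
function predict(X::AbstractVecOrMat)
    N = size(X, 1)
    # When X is a Vector, X[i, :] is the 1-element vector [X[i]]:
    # a single sample with a single feature.
    return [_predict_one(X[i, :]) for i in 1:N]
end

predict([1.0, 9.0])          # Vector: 2 samples × 1 feature -> ["a", "b"]
predict([1.0 0.0; 9.0 0.0])  # Matrix: 2 samples × 2 features -> ["a", "b"]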
13 changes: 6 additions & 7 deletions src/classification/tree.jl
@@ -43,7 +43,7 @@ module treeclassifier
# find an optimal split that satisfy the given constraints
# (max_depth, min_samples_split, min_purity_increase)
function _split!(
-X :: AbstractMatrix{S}, # the feature array
+X :: AbstractVecOrMat{S}, # the feature array
Y :: AbstractVector{Int}, # the label array
W :: AbstractVector{U}, # the weight vector
purity_function :: Function,
@@ -226,7 +226,7 @@ module treeclassifier
end

function _fit(
-X :: AbstractMatrix{S},
+X :: AbstractVecOrMat{S},
Y :: AbstractVector{Int},
W :: AbstractVector{U},
loss :: Function,
@@ -237,9 +237,8 @@
min_samples_split :: Int,
min_purity_increase :: Float64,
rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where {S, U}

-n_samples, n_features = size(X)
-
+n_samples, n_features = util.find_n_samples_and_n_features(X)
nc = Array{U}(undef, n_classes)
ncl = Array{U}(undef, n_classes)
ncr = Array{U}(undef, n_classes)
@@ -273,7 +272,7 @@ module treeclassifier
end

function fit(;
-X :: AbstractMatrix{S},
+X :: AbstractVecOrMat{S},
Y :: AbstractVector{T},
W :: Union{Nothing, AbstractVector{U}},
loss=util.entropy :: Function,
@@ -284,7 +283,7 @@
min_purity_increase :: Float64,
rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where {S, T, U}

-n_samples, n_features = size(X)
+n_samples, n_features = util.find_n_samples_and_n_features(X)
list, Y_ = util.assign(Y)
if W == nothing
W = fill(1, n_samples)
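Both `_fit` and `fit` now call `util.find_n_samples_and_n_features(X)` where they previously destructured `size(X)`; the old form would throw for a `Vector`, whose `size` is a 1-tuple. The helper's definition is not among the hunks shown here, so the following is only a plausible reconstruction consistent with how it is called:

# Assumed sketch of the helper (its actual definition lies outside this diff):
find_n_samples_and_n_features(X::AbstractMatrix) = size(X)
find_n_samples_and_n_features(X::AbstractVector) = (length(X), 1)  # N samples, one feature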
14 changes: 7 additions & 7 deletions src/measures.jl
@@ -72,7 +72,7 @@ function confusion_matrix(actual::AbstractVector, predicted::AbstractVector)
return ConfusionMatrix(classes, CM, accuracy, kappa)
end

-function _nfoldCV(classifier::Symbol, labels::AbstractVector{T}, features::AbstractMatrix{S}, args...; verbose, rng) where {S, T}
+function _nfoldCV(classifier::Symbol, labels::AbstractVector{T}, features::AbstractVecOrMat{S}, args...; verbose, rng) where {S, T}
_rng = mk_rng(rng)::Random.AbstractRNG
nfolds = args[1]
if nfolds < 2
@@ -151,7 +151,7 @@ end

function nfoldCV_tree(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
n_folds :: Integer,
pruning_purity :: Float64 = 1.0,
max_depth :: Integer = -1,
@@ -165,7 +165,7 @@
end
function nfoldCV_forest(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
n_folds :: Integer,
n_subfeatures :: Integer = -1,
n_trees :: Integer = 10,
@@ -181,7 +181,7 @@
end
function nfoldCV_stumps(
labels ::AbstractVector{T},
-features ::AbstractMatrix{S},
+features ::AbstractVecOrMat{S},
n_folds ::Integer,
n_iterations ::Integer = 10;
verbose :: Bool = true,
@@ -203,7 +203,7 @@ function R2(actual, predicted)
return 1.0 - ss_residual/ss_total
end

-function _nfoldCV(regressor::Symbol, labels::AbstractVector{T}, features::AbstractMatrix, args...; verbose, rng) where T <: Float64
+function _nfoldCV(regressor::Symbol, labels::AbstractVector{T}, features::AbstractVecOrMat, args...; verbose, rng) where T <: Float64
_rng = mk_rng(rng)::Random.AbstractRNG
nfolds = args[1]
if nfolds < 2
@@ -279,7 +279,7 @@ end

function nfoldCV_tree(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
n_folds :: Integer,
pruning_purity :: Float64 = 1.0,
max_depth :: Integer = -1,
@@ -293,7 +293,7 @@ _nfoldCV(:tree, labels, features, n_folds, pruning_purity, max_depth,
end
function nfoldCV_forest(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
n_folds :: Integer,
n_subfeatures :: Integer = -1,
n_trees :: Integer = 10,
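With `_nfoldCV` and the public `nfoldCV_*` wrappers widened to `AbstractVecOrMat`, cross-validation runs directly on single-feature data. A sketch with made-up data (assumes `using DecisionTree` on this branch):

labels   = repeat(["a", "b"], 30)                               # 60 samples
features = [l == "a" ? randn() : 4.0 + randn() for l in labels] # one feature each
accuracies = nfoldCV_tree(labels, features, 3)                  # 3-fold CV, no reshape needed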
6 changes: 3 additions & 3 deletions src/regression/main.jl
@@ -10,13 +10,13 @@ function _convert(node::treeregressor.NodeMeta{S}, labels::Array{T}) where {S, T
end
end

-function build_stump(labels::AbstractVector{T}, features::AbstractMatrix{S}; rng = Random.GLOBAL_RNG) where {S, T <: Float64}
+function build_stump(labels::AbstractVector{T}, features::AbstractVecOrMat{S}; rng = Random.GLOBAL_RNG) where {S, T <: Float64}
return build_tree(labels, features, 0, 1)
end

function build_tree(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
n_subfeatures = 0,
max_depth = -1,
min_samples_leaf = 5,
@@ -48,7 +48,7 @@ end

function build_forest(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
n_subfeatures = -1,
n_trees = 10,
partial_sampling = 0.7,
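The regression entry points mirror the classification changes, so a univariate regression fit needs no reshape either. A sketch with invented data:

X = collect(0.0:0.1:10.0)               # 101 samples, one feature, as a Vector
y = sin.(X) .+ 0.1 .* randn(length(X))
model = build_tree(y, X)                # regression: labels first, then features
y_hat = apply_tree(model, X)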
11 changes: 5 additions & 6 deletions src/regression/tree.jl
@@ -42,7 +42,7 @@ module treeregressor
# find an optimal split that satisfy the given constraints
# (max_depth, min_samples_split, min_purity_increase)
function _split!(
-X :: AbstractMatrix{S}, # the feature array
+X :: AbstractVecOrMat{S}, # the feature array
Y :: AbstractVector{Float64}, # the label array
W :: AbstractVector{U},
node :: NodeMeta{S}, # the node to split
@@ -229,7 +229,7 @@ module treeregressor
end

function _fit(
-X :: AbstractMatrix{S},
+X :: AbstractVecOrMat{S},
Y :: AbstractVector{Float64},
W :: AbstractVector{U},
max_features :: Int,
@@ -239,8 +239,7 @@
min_purity_increase :: Float64,
rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where {S, U}

-n_samples, n_features = size(X)
-
+n_samples, n_features = util.find_n_samples_and_n_features(X)
Yf = Array{Float64}(undef, n_samples)
Xf = Array{S}(undef, n_samples)
Wf = Array{U}(undef, n_samples)
@@ -272,7 +271,7 @@ module treeregressor
end

function fit(;
-X :: AbstractMatrix{S},
+X :: AbstractVecOrMat{S},
Y :: AbstractVector{Float64},
W :: Union{Nothing, AbstractVector{U}},
max_features :: Int,
@@ -282,7 +281,7 @@
min_purity_increase :: Float64,
rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where {S, U}

-n_samples, n_features = size(X)
+n_samples, n_features = util.find_n_samples_and_n_features(X)
if W == nothing
W = fill(1.0, n_samples)
end