diff --git a/src/classification/main.jl b/src/classification/main.jl
index 52e7bdb7..3ae84a4e 100644
--- a/src/classification/main.jl
+++ b/src/classification/main.jl
@@ -23,7 +23,7 @@ end
 
 # Applies `row_fun(X_row)::AbstractVector` to each row in X
 # and returns a matrix containing the resulting vectors, stacked vertically
-function stack_function_results(row_fun::Function, X::AbstractMatrix)
+function stack_function_results(row_fun::Function, X::AbstractVecOrMat)
     N = size(X, 1)
     N_cols = length(row_fun(X[1, :])) # gets the number of columns
     out = Array{Float64}(undef, N, N_cols)
@@ -52,7 +52,7 @@ end
 
 function build_stump(
        labels      :: AbstractVector{T},
-       features    :: AbstractMatrix{S},
+       features    :: AbstractVecOrMat{S},
        weights      = nothing;
        rng          = Random.GLOBAL_RNG) where {S, T}
@@ -73,7 +73,7 @@ end
 
 function build_tree(
        labels              :: AbstractVector{T},
-       features            :: AbstractMatrix{S},
+       features            :: AbstractVecOrMat{S},
        n_subfeatures        = 0,
        max_depth            = -1,
        min_samples_leaf     = 1,
@@ -138,23 +138,23 @@ function prune_tree(tree::LeafOrNode{S, T}, purity_thresh=1.0) where {S, T}
 end
 
-apply_tree(leaf::Leaf{T}, feature::AbstractVector{S}) where {S, T} = leaf.majority
+_apply_tree(leaf::Leaf{T}, feature::AbstractVector{S}) where {S, T} = leaf.majority
 
-function apply_tree(tree::Node{S, T}, features::AbstractVector{S}) where {S, T}
+function _apply_tree(tree::Node{S, T}, features::AbstractVector{S}) where {S, T}
     if tree.featid == 0
-        return apply_tree(tree.left, features)
+        return _apply_tree(tree.left, features)
     elseif features[tree.featid] < tree.featval
-        return apply_tree(tree.left, features)
+        return _apply_tree(tree.left, features)
     else
-        return apply_tree(tree.right, features)
+        return _apply_tree(tree.right, features)
     end
 end
 
-function apply_tree(tree::LeafOrNode{S, T}, features::AbstractMatrix{S}) where {S, T}
+function apply_tree(tree::LeafOrNode{S, T}, features::AbstractVecOrMat{S}) where {S, T}
     N = size(features,1)
     predictions = Array{T}(undef, N)
     for i in 1:N
-        predictions[i] = apply_tree(tree, features[i, :])
+        predictions[i] = _apply_tree(tree, features[i, :])
     end
     if T <: Float64
         return Float64.(predictions)
@@ -171,25 +171,25 @@ n_labels` matrix of probabilities, each row summing up to 1.
 
 `col_labels` is a vector containing the distinct labels
 (eg. ["versicolor", "virginica", "setosa"]). It specifies the column ordering
 of the output matrix.
""" -apply_tree_proba(leaf::Leaf{T}, features::AbstractVector{S}, labels) where {S, T} = +_apply_tree_proba(leaf::Leaf{T}, features::AbstractVector{S}, labels) where {S, T} = compute_probabilities(labels, leaf.values) -function apply_tree_proba(tree::Node{S, T}, features::AbstractVector{S}, labels) where {S, T} +function _apply_tree_proba(tree::Node{S, T}, features::AbstractVector{S}, labels) where {S, T} if tree.featval === nothing - return apply_tree_proba(tree.left, features, labels) + return _apply_tree_proba(tree.left, features, labels) elseif features[tree.featid] < tree.featval - return apply_tree_proba(tree.left, features, labels) + return _apply_tree_proba(tree.left, features, labels) else - return apply_tree_proba(tree.right, features, labels) + return _apply_tree_proba(tree.right, features, labels) end end -apply_tree_proba(tree::LeafOrNode{S, T}, features::AbstractMatrix{S}, labels) where {S, T} = - stack_function_results(row->apply_tree_proba(tree, row, labels), features) +apply_tree_proba(tree::LeafOrNode{S, T}, features::AbstractVecOrMat{S}, labels) where {S, T} = + stack_function_results(row->_apply_tree_proba(tree, row, labels), features) function build_forest( labels :: AbstractVector{T}, - features :: AbstractMatrix{S}, + features :: AbstractVecOrMat{S}, n_subfeatures = -1, n_trees = 10, partial_sampling = 0.7, @@ -254,11 +254,11 @@ function build_forest( return Ensemble{S, T}(forest) end -function apply_forest(forest::Ensemble{S, T}, features::AbstractVector{S}) where {S, T} +function _apply_forest(forest::Ensemble{S, T}, features::AbstractVector{S}) where {S, T} n_trees = length(forest) votes = Array{T}(undef, n_trees) for i in 1:n_trees - votes[i] = apply_tree(forest.trees[i], features) + votes[i] = _apply_tree(forest.trees[i], features) end if T <: Float64 @@ -268,11 +268,11 @@ function apply_forest(forest::Ensemble{S, T}, features::AbstractVector{S}) where end end -function apply_forest(forest::Ensemble{S, T}, features::AbstractMatrix{S}) where {S, T} +function apply_forest(forest::Ensemble{S, T}, features::AbstractVecOrMat{S}) where {S, T} N = size(features,1) predictions = Array{T}(undef, N) for i in 1:N - predictions[i] = apply_forest(forest, features[i, :]) + predictions[i] = _apply_forest(forest, features[i, :]) end return predictions end @@ -285,18 +285,18 @@ n_labels` matrix of probabilities, each row summing up to 1. `col_labels` is a vector containing the distinct labels (eg. ["versicolor", "virginica", "setosa"]). It specifies the column ordering of the output matrix. 
""" -function apply_forest_proba(forest::Ensemble{S, T}, features::AbstractVector{S}, labels) where {S, T} - votes = [apply_tree(tree, features) for tree in forest.trees] +function _apply_forest_proba(forest::Ensemble{S, T}, features::AbstractVector{S}, labels) where {S, T} + votes = [_apply_tree(tree, features) for tree in forest.trees] return compute_probabilities(labels, votes) end -apply_forest_proba(forest::Ensemble{S, T}, features::AbstractMatrix{S}, labels) where {S, T} = - stack_function_results(row->apply_forest_proba(forest, row, labels), +apply_forest_proba(forest::Ensemble{S, T}, features::AbstractVecOrMat{S}, labels) where {S, T} = + stack_function_results(row->_apply_forest_proba(forest, row, labels), features) function build_adaboost_stumps( labels :: AbstractVector{T}, - features :: AbstractMatrix{S}, + features :: AbstractVecOrMat{S}, n_iterations :: Integer; rng = Random.GLOBAL_RNG) where {S, T} N = length(labels) @@ -321,11 +321,11 @@ function build_adaboost_stumps( return (Ensemble{S, T}(stumps), coeffs) end -function apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractVector{S}) where {S, T} +function _apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractVector{S}) where {S, T} n_stumps = length(stumps) counts = Dict() for i in 1:n_stumps - prediction = apply_tree(stumps.trees[i], features) + prediction = _apply_tree(stumps.trees[i], features) counts[prediction] = get(counts, prediction, 0.0) + coeffs[i] end top_prediction = stumps.trees[1].left.majority @@ -339,11 +339,11 @@ function apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Fl return top_prediction end -function apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractMatrix{S}) where {S, T} +function apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractVecOrMat{S}) where {S, T} n_samples = size(features, 1) predictions = Array{T}(undef, n_samples) for i in 1:n_samples - predictions[i] = apply_adaboost_stumps(stumps, coeffs, features[i,:]) + predictions[i] = _apply_adaboost_stumps(stumps, coeffs, features[i,:]) end return predictions end @@ -356,13 +356,13 @@ n_labels` matrix of probabilities, each row summing up to 1. `col_labels` is a vector containing the distinct labels (eg. ["versicolor", "virginica", "setosa"]). It specifies the column ordering of the output matrix. 
""" -function apply_adaboost_stumps_proba(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, +function _apply_adaboost_stumps_proba(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractVector{S}, labels::AbstractVector{T}) where {S, T} - votes = [apply_tree(stump, features) for stump in stumps.trees] + votes = [_apply_tree(stump, features) for stump in stumps.trees] compute_probabilities(labels, votes, coeffs) end function apply_adaboost_stumps_proba(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, - features::AbstractMatrix{S}, labels::AbstractVector{T}) where {S, T} - stack_function_results(row->apply_adaboost_stumps_proba(stumps, coeffs, row, labels), features) + features::AbstractVecOrMat{S}, labels::AbstractVector{T}) where {S, T} + stack_function_results(row->_apply_adaboost_stumps_proba(stumps, coeffs, row, labels), features) end diff --git a/src/classification/tree.jl b/src/classification/tree.jl index ce22a9b0..00b567de 100644 --- a/src/classification/tree.jl +++ b/src/classification/tree.jl @@ -43,7 +43,7 @@ module treeclassifier # find an optimal split that satisfy the given constraints # (max_depth, min_samples_split, min_purity_increase) function _split!( - X :: AbstractMatrix{S}, # the feature array + X :: AbstractVecOrMat{S}, # the feature array Y :: AbstractVector{Int}, # the label array W :: AbstractVector{U}, # the weight vector purity_function :: Function, @@ -226,7 +226,7 @@ module treeclassifier end function _fit( - X :: AbstractMatrix{S}, + X :: AbstractVecOrMat{S}, Y :: AbstractVector{Int}, W :: AbstractVector{U}, loss :: Function, @@ -237,9 +237,8 @@ module treeclassifier min_samples_split :: Int, min_purity_increase :: Float64, rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where {S, U} - - n_samples, n_features = size(X) - + + n_samples, n_features = util.find_n_samples_and_n_features(X) nc = Array{U}(undef, n_classes) ncl = Array{U}(undef, n_classes) ncr = Array{U}(undef, n_classes) @@ -273,7 +272,7 @@ module treeclassifier end function fit(; - X :: AbstractMatrix{S}, + X :: AbstractVecOrMat{S}, Y :: AbstractVector{T}, W :: Union{Nothing, AbstractVector{U}}, loss=util.entropy :: Function, @@ -284,7 +283,7 @@ module treeclassifier min_purity_increase :: Float64, rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where {S, T, U} - n_samples, n_features = size(X) + n_samples, n_features = util.find_n_samples_and_n_features(X) list, Y_ = util.assign(Y) if W == nothing W = fill(1, n_samples) diff --git a/src/measures.jl b/src/measures.jl index 06de1e18..35a62c3f 100644 --- a/src/measures.jl +++ b/src/measures.jl @@ -72,7 +72,7 @@ function confusion_matrix(actual::AbstractVector, predicted::AbstractVector) return ConfusionMatrix(classes, CM, accuracy, kappa) end -function _nfoldCV(classifier::Symbol, labels::AbstractVector{T}, features::AbstractMatrix{S}, args...; verbose, rng) where {S, T} +function _nfoldCV(classifier::Symbol, labels::AbstractVector{T}, features::AbstractVecOrMat{S}, args...; verbose, rng) where {S, T} _rng = mk_rng(rng)::Random.AbstractRNG nfolds = args[1] if nfolds < 2 @@ -151,7 +151,7 @@ end function nfoldCV_tree( labels :: AbstractVector{T}, - features :: AbstractMatrix{S}, + features :: AbstractVecOrMat{S}, n_folds :: Integer, pruning_purity :: Float64 = 1.0, max_depth :: Integer = -1, @@ -165,7 +165,7 @@ function nfoldCV_tree( end function nfoldCV_forest( labels :: AbstractVector{T}, - features :: AbstractMatrix{S}, + features :: AbstractVecOrMat{S}, n_folds :: Integer, n_subfeatures :: Integer = -1, n_trees :: 
@@ -181,7 +181,7 @@ function nfoldCV_forest(
 end
 function nfoldCV_stumps(
        labels       ::AbstractVector{T},
-       features     ::AbstractMatrix{S},
+       features     ::AbstractVecOrMat{S},
        n_folds      ::Integer,
        n_iterations ::Integer = 10;
        verbose      :: Bool = true,
@@ -203,7 +203,7 @@ function R2(actual, predicted)
     return 1.0 - ss_residual/ss_total
 end
 
-function _nfoldCV(regressor::Symbol, labels::AbstractVector{T}, features::AbstractMatrix, args...; verbose, rng) where T <: Float64
+function _nfoldCV(regressor::Symbol, labels::AbstractVector{T}, features::AbstractVecOrMat, args...; verbose, rng) where T <: Float64
     _rng = mk_rng(rng)::Random.AbstractRNG
     nfolds = args[1]
     if nfolds < 2
@@ -279,7 +279,7 @@ end
 
 function nfoldCV_tree(
        labels              :: AbstractVector{T},
-       features            :: AbstractMatrix{S},
+       features            :: AbstractVecOrMat{S},
        n_folds             :: Integer,
        pruning_purity      :: Float64 = 1.0,
        max_depth           :: Integer = -1,
@@ -293,7 +293,7 @@ _nfoldCV(:tree, labels, features, n_folds, pruning_purity, max_depth,
 end
 function nfoldCV_forest(
        labels              :: AbstractVector{T},
-       features            :: AbstractMatrix{S},
+       features            :: AbstractVecOrMat{S},
        n_folds             :: Integer,
        n_subfeatures       :: Integer = -1,
        n_trees             :: Integer = 10,
diff --git a/src/regression/main.jl b/src/regression/main.jl
index 2d012aa0..af234e4c 100644
--- a/src/regression/main.jl
+++ b/src/regression/main.jl
@@ -10,13 +10,13 @@ function _convert(node::treeregressor.NodeMeta{S}, labels::Array{T}) where {S, T
     end
 end
 
-function build_stump(labels::AbstractVector{T}, features::AbstractMatrix{S}; rng = Random.GLOBAL_RNG) where {S, T <: Float64}
+function build_stump(labels::AbstractVector{T}, features::AbstractVecOrMat{S}; rng = Random.GLOBAL_RNG) where {S, T <: Float64}
     return build_tree(labels, features, 0, 1)
 end
 
 function build_tree(
        labels              :: AbstractVector{T},
-       features            :: AbstractMatrix{S},
+       features            :: AbstractVecOrMat{S},
        n_subfeatures        = 0,
        max_depth            = -1,
        min_samples_leaf     = 5,
@@ -48,7 +48,7 @@ end
 
 function build_forest(
        labels              :: AbstractVector{T},
-       features            :: AbstractMatrix{S},
+       features            :: AbstractVecOrMat{S},
        n_subfeatures        = -1,
        n_trees              = 10,
        partial_sampling     = 0.7,
diff --git a/src/regression/tree.jl b/src/regression/tree.jl
index 06ade6e2..dc42c063 100644
--- a/src/regression/tree.jl
+++ b/src/regression/tree.jl
@@ -42,7 +42,7 @@ module treeregressor
     # find an optimal split that satisfy the given constraints
     # (max_depth, min_samples_split, min_purity_increase)
     function _split!(
-            X                   :: AbstractMatrix{S}, # the feature array
+            X                   :: AbstractVecOrMat{S}, # the feature array
            Y                   :: AbstractVector{Float64}, # the label array
            W                   :: AbstractVector{U},
            node                :: NodeMeta{S}, # the node to split
@@ -229,7 +229,7 @@ module treeregressor
     end
 
     function _fit(
-            X                   :: AbstractMatrix{S},
+            X                   :: AbstractVecOrMat{S},
            Y                   :: AbstractVector{Float64},
            W                   :: AbstractVector{U},
            max_features        :: Int,
@@ -239,8 +239,7 @@ module treeregressor
            min_purity_increase :: Float64,
            rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where {S, U}
 
-        n_samples, n_features = size(X)
-
+        n_samples, n_features = util.find_n_samples_and_n_features(X)
         Yf  = Array{Float64}(undef, n_samples)
         Xf  = Array{S}(undef, n_samples)
         Wf  = Array{U}(undef, n_samples)
@@ -272,7 +271,7 @@ module treeregressor
     end
 
     function fit(;
-            X                   :: AbstractMatrix{S},
+            X                   :: AbstractVecOrMat{S},
            Y                   :: AbstractVector{Float64},
            W                   :: Union{Nothing, AbstractVector{U}},
            max_features        :: Int,
@@ -282,7 +281,7 @@ module treeregressor
            min_purity_increase :: Float64,
            rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where {S, U}
 
-        n_samples, n_features = size(X)
+        n_samples, n_features = util.find_n_samples_and_n_features(X)
         if W == nothing
             W = fill(1.0, n_samples)
         end
diff --git a/src/scikitlearnAPI.jl b/src/scikitlearnAPI.jl
index 249e531a..03fb7df9 100644
--- a/src/scikitlearnAPI.jl
+++ b/src/scikitlearnAPI.jl
@@ -49,8 +49,8 @@ get_classes(dt::DecisionTreeClassifier) = dt.classes
                          [:pruning_purity_threshold, :max_depth, :min_samples_leaf,
                           :min_samples_split, :min_purity_increase, :rng])
 
-function fit!(dt::DecisionTreeClassifier, X, y)
-    n_samples, n_features = size(X)
+function fit!(dt::DecisionTreeClassifier, X::AbstractVecOrMat, y)
+    n_samples, n_features = find_n_samples_and_n_features(X)
     dt.root = build_tree(
         y, X,
         dt.n_subfeatures,
@@ -136,8 +136,8 @@ end
                          [:pruning_purity_threshold, :min_samples_leaf, :n_subfeatures,
                           :max_depth, :min_samples_split, :min_purity_increase, :rng])
 
-function fit!(dt::DecisionTreeRegressor, X::AbstractMatrix, y::AbstractVector)
-    n_samples, n_features = size(X)
+function fit!(dt::DecisionTreeRegressor, X::AbstractVecOrMat, y::AbstractVector)
+    n_samples, n_features = find_n_samples_and_n_features(X)
     dt.root = build_tree(
         float.(y), X,
         dt.n_subfeatures,
@@ -213,8 +213,8 @@ get_classes(rf::RandomForestClassifier) = rf.classes
                           :min_samples_leaf, :min_samples_split, :min_purity_increase,
                           :rng])
 
-function fit!(rf::RandomForestClassifier, X::AbstractMatrix, y::AbstractVector)
-    n_samples, n_features = size(X)
+function fit!(rf::RandomForestClassifier, X::AbstractVecOrMat, y::AbstractVector)
+    n_samples, n_features = find_n_samples_and_n_features(X)
     rf.ensemble = build_forest(
         y, X,
         rf.n_subfeatures,
@@ -297,8 +297,8 @@ end
                           # since it'll change throughout fitting, but it works
                           :max_depth, :rng])
 
-function fit!(rf::RandomForestRegressor, X::AbstractMatrix, y::AbstractVector)
-    n_samples, n_features = size(X)
+function fit!(rf::RandomForestRegressor, X::AbstractVecOrMat, y::AbstractVector)
+    n_samples, n_features = find_n_samples_and_n_features(X)
     rf.ensemble = build_forest(
         float.(y), X,
         rf.n_subfeatures,
@@ -388,3 +388,15 @@ length(dt::DecisionTreeRegressor) = length(dt.root)
 print_tree(dt::DecisionTreeClassifier, depth=-1; kwargs...) = print_tree(dt.root, depth; kwargs...)
 print_tree(dt::DecisionTreeRegressor, depth=-1; kwargs...) = print_tree(dt.root, depth; kwargs...)
 print_tree(n::Nothing, depth=-1; kwargs...) = show(n)
+
+# Due to the current project structure, this can't currently be loaded from util
+function find_n_samples_and_n_features(X::AbstractVecOrMat)
+    n_samples, n_features = (0, 0)
+    if isa(X, AbstractVector)
+        n_samples = length(X)
+        n_features = 1
+    elseif isa(X, AbstractMatrix)
+        n_samples, n_features = size(X)
+    end
+    return (n_samples, n_features)
+end
\ No newline at end of file
diff --git a/src/util.jl b/src/util.jl
index fe1fe63d..1b687c7f 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -3,7 +3,7 @@ module util
 
-    export gini, entropy, zero_one, q_bi_sort!, hypergeometric, check_input
+    export gini, entropy, zero_one, q_bi_sort!, hypergeometric, check_input, find_n_samples_and_n_features
 
     function assign(Y :: AbstractVector{T}, list :: AbstractVector{T}) where T
         dict = Dict{T, Int}()
@@ -297,8 +297,21 @@ module util
         end
     end
 
+    # Find the appropriate values for n_samples and n_features.
+    # This is a shared need across multiple functions and files.
+    function find_n_samples_and_n_features(X::AbstractVecOrMat)
+        n_samples, n_features = (0, 0)
+        if isa(X, AbstractVector)
+            n_samples = length(X)
+            n_features = 1
+        elseif isa(X, AbstractMatrix)
+            n_samples, n_features = size(X)
+        end
+        return (n_samples, n_features)
+    end
+
     function check_input(
-            X                   :: AbstractMatrix{S},
+            X                   :: AbstractVecOrMat{S},
            Y                   :: AbstractVector{T},
            W                   :: AbstractVector{U},
            max_features        :: Int,
@@ -306,7 +319,12 @@ module util
            min_samples_leaf    :: Int,
            min_samples_split   :: Int,
            min_purity_increase :: Float64) where {S, T, U}
-        n_samples, n_features = size(X)
+        if isa(X, AbstractVector)
+            n_samples = length(X)
+            n_features = 1
+        elseif isa(X, AbstractMatrix)
+            n_samples, n_features = size(X)
+        end
         if length(Y) != n_samples
             throw("dimension mismatch between X and Y ($(size(X)) vs $(size(Y))")
         elseif length(W) != n_samples
diff --git a/test/classification/scikitlearn.jl b/test/classification/scikitlearn.jl
index c4329439..efc287ab 100644
--- a/test/classification/scikitlearn.jl
+++ b/test/classification/scikitlearn.jl
@@ -18,6 +18,21 @@ model = fit!(AdaBoostStumpClassifier(), features, labels)
 # Adaboost isn't so hot on this task, disabled for now
 mean(predict(model, features) .== labels)
 
+# Repeat above but for input as vector
+features = rand(n);
+labels = round.(Int, features);
+
+model = fit!(DecisionTreeClassifier(pruning_purity_threshold=0.9), features, labels)
+@test mean(predict(model, features) .== labels) > 0.8
+
+model = fit!(RandomForestClassifier(), features, labels)
+@test mean(predict(model, features) .== labels) > 0.8
+
+model = fit!(AdaBoostStumpClassifier(), features, labels)
+# Adaboost isn't so hot on this task, disabled for now
+mean(predict(model, features) .== labels)
+# End Vector input tests
+
 Random.seed!(2)
 N = 3000
 X = randn(N, 10)
diff --git a/test/miscellaneous/convert.jl b/test/miscellaneous/convert.jl
index c131dbd4..a2660cc8 100644
--- a/test/miscellaneous/convert.jl
+++ b/test/miscellaneous/convert.jl
@@ -1,20 +1,19 @@
 # Test conversion of Leaf to Node
-
 @testset "convert.jl" begin
 
 lf = Leaf(1, [1])
 nv = Node{Int, Int}[]
 push!(nv, lf)
-@test apply_tree(nv[1], [0]) == 1
+@test DecisionTree._apply_tree(nv[1], [0]) == 1
 
 lf = Leaf(1.0, [0.0, 1.0])
 nv = Node{Int, Float64}[]
 push!(nv, lf)
-@test apply_tree(nv[1], [0]) == 1.0
+@test DecisionTree._apply_tree(nv[1], [0]) == 1.0
 
 lf = Leaf("A", ["B", "A"])
 nv = Node{Int, String}[]
 push!(nv, lf)
-@test apply_tree(nv[1], [0]) == "A"
+@test DecisionTree._apply_tree(nv[1], [0]) == "A"
 
 end # @testset
diff --git a/test/regression/scikitlearn.jl b/test/regression/scikitlearn.jl
index 13e78742..2b1cbccb 100644
--- a/test/regression/scikitlearn.jl
+++ b/test/regression/scikitlearn.jl
@@ -15,6 +15,18 @@ model = fit!(DecisionTreeRegressor(min_samples_split=5), features, labels)
 model = fit!(RandomForestRegressor(n_trees=10, min_samples_leaf=5, n_subfeatures=2), features, labels)
 @test R2(labels, predict(model, features)) > 0.8
 
+# Repeat the above but for single feature vector
+features = rand(n)
+labels = features .* 2
+model = fit!(DecisionTreeRegressor(min_samples_leaf=5, pruning_purity_threshold=0.1), features, labels)
+@test R2(labels, predict(model, features)) > 0.8
+
+model = fit!(DecisionTreeRegressor(min_samples_split=5), features, labels)
+@test R2(labels, predict(model, features)) > 0.8
+
+model = fit!(RandomForestRegressor(n_trees=10, min_samples_leaf=5, n_subfeatures=1), features, labels)
+@test R2(labels, predict(model, features)) > 0.8
+
 Random.seed!(2)
 N = 3000
 X = randn(N, 10)
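
Usage sketch (illustrative, not part of the patch above): with these changes a plain vector passed as `features` is read as `length(X)` samples of a single feature rather than as one sample's feature vector, and the single-sample code paths now live behind the internal `_apply_*` helpers. The snippet below shows the intended call pattern through the native API, assuming a build of DecisionTree.jl with this patch applied; the data and variable names are made up for the example.

using DecisionTree

# 100 samples with one feature each, passed as a Vector instead of a 100x1 matrix.
features = rand(100)
labels   = round.(Int, features)          # toy, perfectly learnable target

model = build_tree(labels, features)      # accepts AbstractVecOrMat after this patch
preds = apply_tree(model, features)       # one prediction per element of `features`
probs = apply_tree_proba(model, features, [0, 1])   # 100x2 matrix, rows sum to 1

# The shared helper reports sample/feature counts uniformly for both shapes:
# a Vector yields (length(X), 1), a Matrix yields size(X).
DecisionTree.util.find_n_samples_and_n_features(features)      # (100, 1)
DecisionTree.util.find_n_samples_and_n_features(rand(100, 3))  # (100, 3)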