From d9933fd93826b740bf2430729bee42b7cb2b33e7 Mon Sep 17 00:00:00 2001 From: Sander Valentin Date: Wed, 23 Feb 2022 10:45:52 +0100 Subject: [PATCH 1/5] Change AbstractMatrix to AbstractVecOrMat --- src/classification/main.jl | 22 +++++++++++----------- src/classification/tree.jl | 6 +++--- src/measures.jl | 14 +++++++------- src/regression/main.jl | 6 +++--- src/regression/tree.jl | 6 +++--- src/scikitlearnAPI.jl | 6 +++--- src/util.jl | 2 +- 7 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/classification/main.jl b/src/classification/main.jl index 52e7bdb7..2f909a78 100644 --- a/src/classification/main.jl +++ b/src/classification/main.jl @@ -23,7 +23,7 @@ end # Applies `row_fun(X_row)::AbstractVector` to each row in X # and returns a matrix containing the resulting vectors, stacked vertically -function stack_function_results(row_fun::Function, X::AbstractMatrix) +function stack_function_results(row_fun::Function, X::AbstractVecOrMat) N = size(X, 1) N_cols = length(row_fun(X[1, :])) # gets the number of columns out = Array{Float64}(undef, N, N_cols) @@ -52,7 +52,7 @@ end function build_stump( labels :: AbstractVector{T}, - features :: AbstractMatrix{S}, + features :: AbstractVecOrMat{S}, weights = nothing; rng = Random.GLOBAL_RNG) where {S, T} @@ -73,7 +73,7 @@ end function build_tree( labels :: AbstractVector{T}, - features :: AbstractMatrix{S}, + features :: AbstractVecOrMat{S}, n_subfeatures = 0, max_depth = -1, min_samples_leaf = 1, @@ -150,7 +150,7 @@ function apply_tree(tree::Node{S, T}, features::AbstractVector{S}) where {S, T} end end -function apply_tree(tree::LeafOrNode{S, T}, features::AbstractMatrix{S}) where {S, T} +function apply_tree(tree::LeafOrNode{S, T}, features::AbstractVecOrMat{S}) where {S, T} N = size(features,1) predictions = Array{T}(undef, N) for i in 1:N @@ -184,12 +184,12 @@ function apply_tree_proba(tree::Node{S, T}, features::AbstractVector{S}, labels) end end -apply_tree_proba(tree::LeafOrNode{S, T}, 
features::AbstractMatrix{S}, labels) where {S, T} = +apply_tree_proba(tree::LeafOrNode{S, T}, features::AbstractVecOrMat{S}, labels) where {S, T} = stack_function_results(row->apply_tree_proba(tree, row, labels), features) function build_forest( labels :: AbstractVector{T}, - features :: AbstractMatrix{S}, + features :: AbstractVecOrMat{S}, n_subfeatures = -1, n_trees = 10, partial_sampling = 0.7, @@ -268,7 +268,7 @@ function apply_forest(forest::Ensemble{S, T}, features::AbstractVector{S}) where end end -function apply_forest(forest::Ensemble{S, T}, features::AbstractMatrix{S}) where {S, T} +function apply_forest(forest::Ensemble{S, T}, features::AbstractVecOrMat{S}) where {S, T} N = size(features,1) predictions = Array{T}(undef, N) for i in 1:N @@ -290,13 +290,13 @@ function apply_forest_proba(forest::Ensemble{S, T}, features::AbstractVector{S}, return compute_probabilities(labels, votes) end -apply_forest_proba(forest::Ensemble{S, T}, features::AbstractMatrix{S}, labels) where {S, T} = +apply_forest_proba(forest::Ensemble{S, T}, features::AbstractVecOrMat{S}, labels) where {S, T} = stack_function_results(row->apply_forest_proba(forest, row, labels), features) function build_adaboost_stumps( labels :: AbstractVector{T}, - features :: AbstractMatrix{S}, + features :: AbstractVecOrMat{S}, n_iterations :: Integer; rng = Random.GLOBAL_RNG) where {S, T} N = length(labels) @@ -339,7 +339,7 @@ function apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Fl return top_prediction end -function apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractMatrix{S}) where {S, T} +function apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractVecOrMat{S}) where {S, T} n_samples = size(features, 1) predictions = Array{T}(undef, n_samples) for i in 1:n_samples @@ -363,6 +363,6 @@ function apply_adaboost_stumps_proba(stumps::Ensemble{S, T}, coeffs::AbstractVec end function 
apply_adaboost_stumps_proba(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, - features::AbstractMatrix{S}, labels::AbstractVector{T}) where {S, T} + features::AbstractVecOrMat{S}, labels::AbstractVector{T}) where {S, T} stack_function_results(row->apply_adaboost_stumps_proba(stumps, coeffs, row, labels), features) end diff --git a/src/classification/tree.jl b/src/classification/tree.jl index ce22a9b0..3dccfca7 100644 --- a/src/classification/tree.jl +++ b/src/classification/tree.jl @@ -43,7 +43,7 @@ module treeclassifier # find an optimal split that satisfy the given constraints # (max_depth, min_samples_split, min_purity_increase) function _split!( - X :: AbstractMatrix{S}, # the feature array + X :: AbstractVecOrMat{S}, # the feature array Y :: AbstractVector{Int}, # the label array W :: AbstractVector{U}, # the weight vector purity_function :: Function, @@ -226,7 +226,7 @@ module treeclassifier end function _fit( - X :: AbstractMatrix{S}, + X :: AbstractVecOrMat{S}, Y :: AbstractVector{Int}, W :: AbstractVector{U}, loss :: Function, @@ -273,7 +273,7 @@ module treeclassifier end function fit(; - X :: AbstractMatrix{S}, + X :: AbstractVecOrMat{S}, Y :: AbstractVector{T}, W :: Union{Nothing, AbstractVector{U}}, loss=util.entropy :: Function, diff --git a/src/measures.jl b/src/measures.jl index 06de1e18..35a62c3f 100644 --- a/src/measures.jl +++ b/src/measures.jl @@ -72,7 +72,7 @@ function confusion_matrix(actual::AbstractVector, predicted::AbstractVector) return ConfusionMatrix(classes, CM, accuracy, kappa) end -function _nfoldCV(classifier::Symbol, labels::AbstractVector{T}, features::AbstractMatrix{S}, args...; verbose, rng) where {S, T} +function _nfoldCV(classifier::Symbol, labels::AbstractVector{T}, features::AbstractVecOrMat{S}, args...; verbose, rng) where {S, T} _rng = mk_rng(rng)::Random.AbstractRNG nfolds = args[1] if nfolds < 2 @@ -151,7 +151,7 @@ end function nfoldCV_tree( labels :: AbstractVector{T}, - features :: AbstractMatrix{S}, + features 
:: AbstractVecOrMat{S}, n_folds :: Integer, pruning_purity :: Float64 = 1.0, max_depth :: Integer = -1, @@ -165,7 +165,7 @@ function nfoldCV_tree( end function nfoldCV_forest( labels :: AbstractVector{T}, - features :: AbstractMatrix{S}, + features :: AbstractVecOrMat{S}, n_folds :: Integer, n_subfeatures :: Integer = -1, n_trees :: Integer = 10, @@ -181,7 +181,7 @@ function nfoldCV_forest( end function nfoldCV_stumps( labels ::AbstractVector{T}, - features ::AbstractMatrix{S}, + features ::AbstractVecOrMat{S}, n_folds ::Integer, n_iterations ::Integer = 10; verbose :: Bool = true, @@ -203,7 +203,7 @@ function R2(actual, predicted) return 1.0 - ss_residual/ss_total end -function _nfoldCV(regressor::Symbol, labels::AbstractVector{T}, features::AbstractMatrix, args...; verbose, rng) where T <: Float64 +function _nfoldCV(regressor::Symbol, labels::AbstractVector{T}, features::AbstractVecOrMat, args...; verbose, rng) where T <: Float64 _rng = mk_rng(rng)::Random.AbstractRNG nfolds = args[1] if nfolds < 2 @@ -279,7 +279,7 @@ end function nfoldCV_tree( labels :: AbstractVector{T}, - features :: AbstractMatrix{S}, + features :: AbstractVecOrMat{S}, n_folds :: Integer, pruning_purity :: Float64 = 1.0, max_depth :: Integer = -1, @@ -293,7 +293,7 @@ _nfoldCV(:tree, labels, features, n_folds, pruning_purity, max_depth, end function nfoldCV_forest( labels :: AbstractVector{T}, - features :: AbstractMatrix{S}, + features :: AbstractVecOrMat{S}, n_folds :: Integer, n_subfeatures :: Integer = -1, n_trees :: Integer = 10, diff --git a/src/regression/main.jl b/src/regression/main.jl index 2d012aa0..af234e4c 100644 --- a/src/regression/main.jl +++ b/src/regression/main.jl @@ -10,13 +10,13 @@ function _convert(node::treeregressor.NodeMeta{S}, labels::Array{T}) where {S, T end end -function build_stump(labels::AbstractVector{T}, features::AbstractMatrix{S}; rng = Random.GLOBAL_RNG) where {S, T <: Float64} +function build_stump(labels::AbstractVector{T}, features::AbstractVecOrMat{S}; 
rng = Random.GLOBAL_RNG) where {S, T <: Float64} return build_tree(labels, features, 0, 1) end function build_tree( labels :: AbstractVector{T}, - features :: AbstractMatrix{S}, + features :: AbstractVecOrMat{S}, n_subfeatures = 0, max_depth = -1, min_samples_leaf = 5, @@ -48,7 +48,7 @@ end function build_forest( labels :: AbstractVector{T}, - features :: AbstractMatrix{S}, + features :: AbstractVecOrMat{S}, n_subfeatures = -1, n_trees = 10, partial_sampling = 0.7, diff --git a/src/regression/tree.jl b/src/regression/tree.jl index 06ade6e2..ad3eedd6 100644 --- a/src/regression/tree.jl +++ b/src/regression/tree.jl @@ -42,7 +42,7 @@ module treeregressor # find an optimal split that satisfy the given constraints # (max_depth, min_samples_split, min_purity_increase) function _split!( - X :: AbstractMatrix{S}, # the feature array + X :: AbstractVecOrMat{S}, # the feature array Y :: AbstractVector{Float64}, # the label array W :: AbstractVector{U}, node :: NodeMeta{S}, # the node to split @@ -229,7 +229,7 @@ module treeregressor end function _fit( - X :: AbstractMatrix{S}, + X :: AbstractVecOrMat{S}, Y :: AbstractVector{Float64}, W :: AbstractVector{U}, max_features :: Int, @@ -272,7 +272,7 @@ module treeregressor end function fit(; - X :: AbstractMatrix{S}, + X :: AbstractVecOrMat{S}, Y :: AbstractVector{Float64}, W :: Union{Nothing, AbstractVector{U}}, max_features :: Int, diff --git a/src/scikitlearnAPI.jl b/src/scikitlearnAPI.jl index 249e531a..a510086c 100644 --- a/src/scikitlearnAPI.jl +++ b/src/scikitlearnAPI.jl @@ -136,7 +136,7 @@ end [:pruning_purity_threshold, :min_samples_leaf, :n_subfeatures, :max_depth, :min_samples_split, :min_purity_increase, :rng]) -function fit!(dt::DecisionTreeRegressor, X::AbstractMatrix, y::AbstractVector) +function fit!(dt::DecisionTreeRegressor, X::AbstractVecOrMat, y::AbstractVector) n_samples, n_features = size(X) dt.root = build_tree( float.(y), X, @@ -213,7 +213,7 @@ get_classes(rf::RandomForestClassifier) = rf.classes 
:min_samples_leaf, :min_samples_split, :min_purity_increase, :rng]) -function fit!(rf::RandomForestClassifier, X::AbstractMatrix, y::AbstractVector) +function fit!(rf::RandomForestClassifier, X::AbstractVecOrMat, y::AbstractVector) n_samples, n_features = size(X) rf.ensemble = build_forest( y, X, @@ -297,7 +297,7 @@ end # since it'll change throughout fitting, but it works :max_depth, :rng]) -function fit!(rf::RandomForestRegressor, X::AbstractMatrix, y::AbstractVector) +function fit!(rf::RandomForestRegressor, X::AbstractVecOrMat, y::AbstractVector) n_samples, n_features = size(X) rf.ensemble = build_forest( float.(y), X, diff --git a/src/util.jl b/src/util.jl index fe1fe63d..e81a244b 100644 --- a/src/util.jl +++ b/src/util.jl @@ -298,7 +298,7 @@ module util end function check_input( - X :: AbstractMatrix{S}, + X :: AbstractVecOrMat{S}, Y :: AbstractVector{T}, W :: AbstractVector{U}, max_features :: Int, From 01aa8146ca3dc2f99f46fae248cea18269320e22 Mon Sep 17 00:00:00 2001 From: Sander Valentin Date: Wed, 23 Feb 2022 11:16:43 +0100 Subject: [PATCH 2/5] Fix n_samples and n_features --- src/regression/tree.jl | 16 ++++++++++++---- src/util.jl | 7 ++++++- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/regression/tree.jl b/src/regression/tree.jl index ad3eedd6..da000dee 100644 --- a/src/regression/tree.jl +++ b/src/regression/tree.jl @@ -239,8 +239,12 @@ module treeregressor min_purity_increase :: Float64, rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where {S, U} - n_samples, n_features = size(X) - + if isa(X, AbstractVector) + n_samples = length(X) + n_features = 1 + elseif isa(X, AbstractMatrix) + n_samples, n_features = size(X) + end Yf = Array{Float64}(undef, n_samples) Xf = Array{S}(undef, n_samples) Wf = Array{U}(undef, n_samples) @@ -281,8 +285,12 @@ module treeregressor min_samples_split :: Int, min_purity_increase :: Float64, rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where {S, U} - - n_samples, n_features = size(X) + if isa(X, 
AbstractVector) + n_samples = length(X) + n_features = 1 + elseif isa(X, AbstractMatrix) + n_samples, n_features = size(X) + end if W == nothing W = fill(1.0, n_samples) end diff --git a/src/util.jl b/src/util.jl index e81a244b..0ef64139 100644 --- a/src/util.jl +++ b/src/util.jl @@ -306,7 +306,12 @@ module util min_samples_leaf :: Int, min_samples_split :: Int, min_purity_increase :: Float64) where {S, T, U} - n_samples, n_features = size(X) + if isa(X, AbstractVector) + n_samples = length(X) + n_features = 1 + elseif isa(X, AbstractMatrix) + n_samples, n_features = size(X) + end if length(Y) != n_samples throw("dimension mismatch between X and Y ($(size(X)) vs $(size(Y))") elseif length(W) != n_samples From 57684723827723ad2c7309e26f285d8db976d0c0 Mon Sep 17 00:00:00 2001 From: sanderbboisen <63041604+sanderbboisen@users.noreply.github.com> Date: Thu, 3 Mar 2022 13:29:22 +0100 Subject: [PATCH 3/5] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7fd2fb74..2c6ff992 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Codecov](https://codecov.io/gh/bensadeghi/DecisionTree.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/bensadeghi/DecisionTree.jl) [![Docs Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://juliahub.com/docs/DecisionTree/pEDeB/0.10.11/) -### Disclaimer: this package is no longer maintained. +Most of this work is not mine, but since the original repo is no longer maintained, I will try to adapt this fork to my own needs. 
Julia implementation of Decision Tree (CART) and Random Forest algorithms From 63f012eeb42176654fd594fbed5ce26fb121e9d2 Mon Sep 17 00:00:00 2001 From: Sander Valentin Date: Wed, 18 May 2022 12:00:22 +0100 Subject: [PATCH 4/5] Make classification and regression trees work as intended with vector input --- src/classification/main.jl | 44 +++++++++++++++++++------------------- src/classification/tree.jl | 7 +++--- src/regression/tree.jl | 15 +++---------- src/scikitlearnAPI.jl | 22 ++++++++++++++----- src/util.jl | 15 ++++++++++++- 5 files changed, 59 insertions(+), 44 deletions(-) diff --git a/src/classification/main.jl b/src/classification/main.jl index 2f909a78..d3b6f71d 100644 --- a/src/classification/main.jl +++ b/src/classification/main.jl @@ -138,15 +138,15 @@ function prune_tree(tree::LeafOrNode{S, T}, purity_thresh=1.0) where {S, T} end -apply_tree(leaf::Leaf{T}, feature::AbstractVector{S}) where {S, T} = leaf.majority +_apply_tree(leaf::Leaf{T}, feature::AbstractVector{S}) where {S, T} = leaf.majority -function apply_tree(tree::Node{S, T}, features::AbstractVector{S}) where {S, T} +function _apply_tree(tree::Node{S, T}, features::AbstractVector{S}) where {S, T} if tree.featid == 0 - return apply_tree(tree.left, features) + return _apply_tree(tree.left, features) elseif features[tree.featid] < tree.featval - return apply_tree(tree.left, features) + return _apply_tree(tree.left, features) else - return apply_tree(tree.right, features) + return _apply_tree(tree.right, features) end end @@ -154,7 +154,7 @@ function apply_tree(tree::LeafOrNode{S, T}, features::AbstractVecOrMat{S}) where N = size(features,1) predictions = Array{T}(undef, N) for i in 1:N - predictions[i] = apply_tree(tree, features[i, :]) + predictions[i] = _apply_tree(tree, features[i, :]) end if T <: Float64 return Float64.(predictions) @@ -171,16 +171,16 @@ n_labels` matrix of probabilities, each row summing up to 1. `col_labels` is a vector containing the distinct labels (eg. 
["versicolor", "virginica", "setosa"]). It specifies the column ordering of the output matrix. """ -apply_tree_proba(leaf::Leaf{T}, features::AbstractVector{S}, labels) where {S, T} = +_apply_tree_proba(leaf::Leaf{T}, features::AbstractVector{S}, labels) where {S, T} = compute_probabilities(labels, leaf.values) -function apply_tree_proba(tree::Node{S, T}, features::AbstractVector{S}, labels) where {S, T} +function _apply_tree_proba(tree::Node{S, T}, features::AbstractVector{S}, labels) where {S, T} if tree.featval === nothing - return apply_tree_proba(tree.left, features, labels) + return _apply_tree_proba(tree.left, features, labels) elseif features[tree.featid] < tree.featval - return apply_tree_proba(tree.left, features, labels) + return _apply_tree_proba(tree.left, features, labels) else - return apply_tree_proba(tree.right, features, labels) + return _apply_tree_proba(tree.right, features, labels) end end @@ -254,11 +254,11 @@ function build_forest( return Ensemble{S, T}(forest) end -function apply_forest(forest::Ensemble{S, T}, features::AbstractVector{S}) where {S, T} +function _apply_forest(forest::Ensemble{S, T}, features::AbstractVector{S}) where {S, T} n_trees = length(forest) votes = Array{T}(undef, n_trees) for i in 1:n_trees - votes[i] = apply_tree(forest.trees[i], features) + votes[i] = _apply_tree(forest.trees[i], features) end if T <: Float64 @@ -272,7 +272,7 @@ function apply_forest(forest::Ensemble{S, T}, features::AbstractVecOrMat{S}) whe N = size(features,1) predictions = Array{T}(undef, N) for i in 1:N - predictions[i] = apply_forest(forest, features[i, :]) + predictions[i] = _apply_forest(forest, features[i, :]) end return predictions end @@ -285,13 +285,13 @@ n_labels` matrix of probabilities, each row summing up to 1. `col_labels` is a vector containing the distinct labels (eg. ["versicolor", "virginica", "setosa"]). It specifies the column ordering of the output matrix. 
""" -function apply_forest_proba(forest::Ensemble{S, T}, features::AbstractVector{S}, labels) where {S, T} - votes = [apply_tree(tree, features) for tree in forest.trees] +function _apply_forest_proba(forest::Ensemble{S, T}, features::AbstractVector{S}, labels) where {S, T} + votes = [_apply_tree(tree, features) for tree in forest.trees] return compute_probabilities(labels, votes) end apply_forest_proba(forest::Ensemble{S, T}, features::AbstractVecOrMat{S}, labels) where {S, T} = - stack_function_results(row->apply_forest_proba(forest, row, labels), + stack_function_results(row->_apply_forest_proba(forest, row, labels), features) function build_adaboost_stumps( @@ -321,11 +321,11 @@ function build_adaboost_stumps( return (Ensemble{S, T}(stumps), coeffs) end -function apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractVector{S}) where {S, T} +function _apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractVector{S}) where {S, T} n_stumps = length(stumps) counts = Dict() for i in 1:n_stumps - prediction = apply_tree(stumps.trees[i], features) + prediction = _apply_tree(stumps.trees[i], features) counts[prediction] = get(counts, prediction, 0.0) + coeffs[i] end top_prediction = stumps.trees[1].left.majority @@ -343,7 +343,7 @@ function apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Fl n_samples = size(features, 1) predictions = Array{T}(undef, n_samples) for i in 1:n_samples - predictions[i] = apply_adaboost_stumps(stumps, coeffs, features[i,:]) + predictions[i] = _apply_adaboost_stumps(stumps, coeffs, features[i,:]) end return predictions end @@ -356,7 +356,7 @@ n_labels` matrix of probabilities, each row summing up to 1. `col_labels` is a vector containing the distinct labels (eg. ["versicolor", "virginica", "setosa"]). It specifies the column ordering of the output matrix. 
""" -function apply_adaboost_stumps_proba(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, +function _apply_adaboost_stumps_proba(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractVector{S}, labels::AbstractVector{T}) where {S, T} votes = [apply_tree(stump, features) for stump in stumps.trees] compute_probabilities(labels, votes, coeffs) @@ -364,5 +364,5 @@ end function apply_adaboost_stumps_proba(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractVecOrMat{S}, labels::AbstractVector{T}) where {S, T} - stack_function_results(row->apply_adaboost_stumps_proba(stumps, coeffs, row, labels), features) + stack_function_results(row->_apply_adaboost_stumps_proba(stumps, coeffs, row, labels), features) end diff --git a/src/classification/tree.jl b/src/classification/tree.jl index 3dccfca7..00b567de 100644 --- a/src/classification/tree.jl +++ b/src/classification/tree.jl @@ -237,9 +237,8 @@ module treeclassifier min_samples_split :: Int, min_purity_increase :: Float64, rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where {S, U} - - n_samples, n_features = size(X) - + + n_samples, n_features = util.find_n_samples_and_n_features(X) nc = Array{U}(undef, n_classes) ncl = Array{U}(undef, n_classes) ncr = Array{U}(undef, n_classes) @@ -284,7 +283,7 @@ module treeclassifier min_purity_increase :: Float64, rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where {S, T, U} - n_samples, n_features = size(X) + n_samples, n_features = util.find_n_samples_and_n_features(X) list, Y_ = util.assign(Y) if W == nothing W = fill(1, n_samples) diff --git a/src/regression/tree.jl b/src/regression/tree.jl index da000dee..dc42c063 100644 --- a/src/regression/tree.jl +++ b/src/regression/tree.jl @@ -239,12 +239,7 @@ module treeregressor min_purity_increase :: Float64, rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where {S, U} - if isa(X, AbstractVector) - n_samples = length(X) - n_features = 1 - elseif isa(X, AbstractMatrix) - n_samples, n_features = 
size(X) - end + n_samples, n_features = util.find_n_samples_and_n_features(X) Yf = Array{Float64}(undef, n_samples) Xf = Array{S}(undef, n_samples) Wf = Array{U}(undef, n_samples) @@ -285,12 +280,8 @@ module treeregressor min_samples_split :: Int, min_purity_increase :: Float64, rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where {S, U} - if isa(X, AbstractVector) - n_samples = length(X) - n_features = 1 - elseif isa(X, AbstractMatrix) - n_samples, n_features = size(X) - end + + n_samples, n_features = util.find_n_samples_and_n_features(X) if W == nothing W = fill(1.0, n_samples) end diff --git a/src/scikitlearnAPI.jl b/src/scikitlearnAPI.jl index a510086c..03fb7df9 100644 --- a/src/scikitlearnAPI.jl +++ b/src/scikitlearnAPI.jl @@ -49,8 +49,8 @@ get_classes(dt::DecisionTreeClassifier) = dt.classes [:pruning_purity_threshold, :max_depth, :min_samples_leaf, :min_samples_split, :min_purity_increase, :rng]) -function fit!(dt::DecisionTreeClassifier, X, y) - n_samples, n_features = size(X) +function fit!(dt::DecisionTreeClassifier, X::AbstractVecOrMat, y) + n_samples, n_features = find_n_samples_and_n_features(X) dt.root = build_tree( y, X, dt.n_subfeatures, @@ -137,7 +137,7 @@ end :max_depth, :min_samples_split, :min_purity_increase, :rng]) function fit!(dt::DecisionTreeRegressor, X::AbstractVecOrMat, y::AbstractVector) - n_samples, n_features = size(X) + n_samples, n_features = find_n_samples_and_n_features(X) dt.root = build_tree( float.(y), X, dt.n_subfeatures, @@ -214,7 +214,7 @@ get_classes(rf::RandomForestClassifier) = rf.classes :rng]) function fit!(rf::RandomForestClassifier, X::AbstractVecOrMat, y::AbstractVector) - n_samples, n_features = size(X) + n_samples, n_features = find_n_samples_and_n_features(X) rf.ensemble = build_forest( y, X, rf.n_subfeatures, @@ -298,7 +298,7 @@ end :max_depth, :rng]) function fit!(rf::RandomForestRegressor, X::AbstractVecOrMat, y::AbstractVector) - n_samples, n_features = size(X) + n_samples, n_features = 
find_n_samples_and_n_features(X) rf.ensemble = build_forest( float.(y), X, rf.n_subfeatures, @@ -388,3 +388,15 @@ length(dt::DecisionTreeRegressor) = length(dt.root) print_tree(dt::DecisionTreeClassifier, depth=-1; kwargs...) = print_tree(dt.root, depth; kwargs...) print_tree(dt::DecisionTreeRegressor, depth=-1; kwargs...) = print_tree(dt.root, depth; kwargs...) print_tree(n::Nothing, depth=-1; kwargs...) = show(n) + +# Due to the current project structure, this can't currently be loaded from util +function find_n_samples_and_n_features(X::AbstractVecOrMat) + n_samples, n_features = (0, 0) + if isa(X, AbstractVector) + n_samples = length(X) + n_features = 1 + elseif isa(X, AbstractMatrix) + n_samples, n_features = size(X) + end + return (n_samples, n_features) +end \ No newline at end of file diff --git a/src/util.jl b/src/util.jl index 0ef64139..1b687c7f 100644 --- a/src/util.jl +++ b/src/util.jl @@ -3,7 +3,7 @@ module util - export gini, entropy, zero_one, q_bi_sort!, hypergeometric, check_input + export gini, entropy, zero_one, q_bi_sort!, hypergeometric, check_input, find_n_samples_and_n_features function assign(Y :: AbstractVector{T}, list :: AbstractVector{T}) where T dict = Dict{T, Int}() @@ -297,6 +297,19 @@ module util end end + # Find the appropriate values for n_samples and n_features. + # This is a shared need across multiple functions and files. 
+ function find_n_samples_and_n_features(X::AbstractVecOrMat) + n_samples, n_features = (0, 0) + if isa(X, AbstractVector) + n_samples = length(X) + n_features = 1 + elseif isa(X, AbstractMatrix) + n_samples, n_features = size(X) + end + return (n_samples, n_features) + end + function check_input( X :: AbstractVecOrMat{S}, Y :: AbstractVector{T}, From f2684ecc989299a8837aa65fd6772c548b55c038 Mon Sep 17 00:00:00 2001 From: Sander Valentin Date: Wed, 18 May 2022 12:35:47 +0100 Subject: [PATCH 5/5] Add new tests to scikitlearnAPI and fix errors that occurred during testing --- src/classification/main.jl | 4 ++-- test/classification/scikitlearn.jl | 15 +++++++++++++++ test/miscellaneous/convert.jl | 7 +++---- test/regression/scikitlearn.jl | 12 ++++++++++++ 4 files changed, 32 insertions(+), 6 deletions(-) diff --git a/src/classification/main.jl b/src/classification/main.jl index d3b6f71d..3ae84a4e 100644 --- a/src/classification/main.jl +++ b/src/classification/main.jl @@ -185,7 +185,7 @@ function _apply_tree_proba(tree::Node{S, T}, features::AbstractVector{S}, labels end apply_tree_proba(tree::LeafOrNode{S, T}, features::AbstractVecOrMat{S}, labels) where {S, T} = - stack_function_results(row->apply_tree_proba(tree, row, labels), features) + stack_function_results(row->_apply_tree_proba(tree, row, labels), features) function build_forest( labels :: AbstractVector{T}, @@ -358,7 +358,7 @@ n_labels` matrix of probabilities, each row summing up to 1. of the output matrix.
""" function _apply_adaboost_stumps_proba(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractVector{S}, labels::AbstractVector{T}) where {S, T} - votes = [apply_tree(stump, features) for stump in stumps.trees] + votes = [_apply_tree(stump, features) for stump in stumps.trees] compute_probabilities(labels, votes, coeffs) end diff --git a/test/classification/scikitlearn.jl b/test/classification/scikitlearn.jl index c4329439..efc287ab 100644 --- a/test/classification/scikitlearn.jl +++ b/test/classification/scikitlearn.jl @@ -18,6 +18,21 @@ model = fit!(AdaBoostStumpClassifier(), features, labels) # Adaboost isn't so hot on this task, disabled for now mean(predict(model, features) .== labels) +# Repeat above but for input as vector +features = rand(n); +labels = round.(Int, features); + +model = fit!(DecisionTreeClassifier(pruning_purity_threshold=0.9), features, labels) +@test mean(predict(model, features) .== labels) > 0.8 + +model = fit!(RandomForestClassifier(), features, labels) +@test mean(predict(model, features) .== labels) > 0.8 + +model = fit!(AdaBoostStumpClassifier(), features, labels) +# Adaboost isn't so hot on this task, disabled for now +mean(predict(model, features) .== labels) +# End Vector input tests + Random.seed!(2) N = 3000 X = randn(N, 10) diff --git a/test/miscellaneous/convert.jl b/test/miscellaneous/convert.jl index c131dbd4..a2660cc8 100644 --- a/test/miscellaneous/convert.jl +++ b/test/miscellaneous/convert.jl @@ -1,20 +1,19 @@ # Test conversion of Leaf to Node - @testset "convert.jl" begin lf = Leaf(1, [1]) nv = Node{Int, Int}[] push!(nv, lf) -@test apply_tree(nv[1], [0]) == 1 +@test DecisionTree._apply_tree(nv[1], [0]) == 1 lf = Leaf(1.0, [0.0, 1.0]) nv = Node{Int, Float64}[] push!(nv, lf) -@test apply_tree(nv[1], [0]) == 1.0 +@test DecisionTree._apply_tree(nv[1], [0]) == 1.0 lf = Leaf("A", ["B", "A"]) nv = Node{Int, String}[] push!(nv, lf) -@test apply_tree(nv[1], [0]) == "A" +@test 
DecisionTree._apply_tree(nv[1], [0]) == "A" end # @testset diff --git a/test/regression/scikitlearn.jl b/test/regression/scikitlearn.jl index 13e78742..2b1cbccb 100644 --- a/test/regression/scikitlearn.jl +++ b/test/regression/scikitlearn.jl @@ -15,6 +15,18 @@ model = fit!(DecisionTreeRegressor(min_samples_split=5), features, labels) model = fit!(RandomForestRegressor(n_trees=10, min_samples_leaf=5, n_subfeatures=2), features, labels) @test R2(labels, predict(model, features)) > 0.8 +# Repeat the above but for single feature vector +features = rand(n) +labels = features .* 2 +model = fit!(DecisionTreeRegressor(min_samples_leaf=5, pruning_purity_threshold=0.1), features, labels) +@test R2(labels, predict(model, features)) > 0.8 + +model = fit!(DecisionTreeRegressor(min_samples_split=5), features, labels) +@test R2(labels, predict(model, features)) > 0.8 + +model = fit!(RandomForestRegressor(n_trees=10, min_samples_leaf=5, n_subfeatures=1), features, labels) +@test R2(labels, predict(model, features)) > 0.8 + Random.seed!(2) N = 3000 X = randn(N, 10)