Allow features to be AbstractVector (in addition to AbstractMatrix) #150


Open · wants to merge 9 commits into base: dev
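In a sentence: every public entry point that previously required `features::AbstractMatrix` now also accepts an `AbstractVector`, interpreted as N samples of a single feature. A hedged sketch of the intended usage (the data and variable names are invented, and assume this PR's branch of DecisionTree.jl):

using DecisionTree  # this PR's branch

# A single feature per sample can now be passed as a plain Vector;
# previously it had to be reshaped into an N×1 matrix.
features = [1.0, 2.0, 3.0, 10.0, 11.0, 12.0]   # 6 samples, 1 feature
labels   = ["a", "a", "a", "b", "b", "b"]

model = build_tree(labels, features)   # instead of build_tree(labels, reshape(features, :, 1))
preds = apply_tree(model, features)    # 6 predictions, one per sample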
70 changes: 35 additions & 35 deletions src/classification/main.jl
@@ -23,7 +23,7 @@ end

# Applies `row_fun(X_row)::AbstractVector` to each row in X
# and returns a matrix containing the resulting vectors, stacked vertically
-function stack_function_results(row_fun::Function, X::AbstractMatrix)
+function stack_function_results(row_fun::Function, X::AbstractVecOrMat)
N = size(X, 1)
N_cols = length(row_fun(X[1, :])) # gets the number of columns
out = Array{Float64}(undef, N, N_cols)
@@ -52,7 +52,7 @@ end

function build_stump(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
weights = nothing;
rng = Random.GLOBAL_RNG) where {S, T}

@@ -73,7 +73,7 @@ end

function build_tree(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
n_subfeatures = 0,
max_depth = -1,
min_samples_leaf = 1,
@@ -138,23 +138,23 @@ function prune_tree(tree::LeafOrNode{S, T}, purity_thresh=1.0) where {S, T}
end


-apply_tree(leaf::Leaf{T}, feature::AbstractVector{S}) where {S, T} = leaf.majority
+_apply_tree(leaf::Leaf{T}, feature::AbstractVector{S}) where {S, T} = leaf.majority

-function apply_tree(tree::Node{S, T}, features::AbstractVector{S}) where {S, T}
+function _apply_tree(tree::Node{S, T}, features::AbstractVector{S}) where {S, T}
if tree.featid == 0
-return apply_tree(tree.left, features)
+return _apply_tree(tree.left, features)
elseif features[tree.featid] < tree.featval
-return apply_tree(tree.left, features)
+return _apply_tree(tree.left, features)
else
-return apply_tree(tree.right, features)
+return _apply_tree(tree.right, features)
end
end

-function apply_tree(tree::LeafOrNode{S, T}, features::AbstractMatrix{S}) where {S, T}
+function apply_tree(tree::LeafOrNode{S, T}, features::AbstractVecOrMat{S}) where {S, T}
N = size(features,1)
predictions = Array{T}(undef, N)
for i in 1:N
-predictions[i] = apply_tree(tree, features[i, :])
+predictions[i] = _apply_tree(tree, features[i, :])
end
if T <: Float64
return Float64.(predictions)
@@ -171,25 +171,25 @@ n_labels` matrix of probabilities, each row summing up to 1.
`col_labels` is a vector containing the distinct labels
(eg. ["versicolor", "virginica", "setosa"]). It specifies the column ordering
of the output matrix. """
-apply_tree_proba(leaf::Leaf{T}, features::AbstractVector{S}, labels) where {S, T} =
+_apply_tree_proba(leaf::Leaf{T}, features::AbstractVector{S}, labels) where {S, T} =
compute_probabilities(labels, leaf.values)

-function apply_tree_proba(tree::Node{S, T}, features::AbstractVector{S}, labels) where {S, T}
+function _apply_tree_proba(tree::Node{S, T}, features::AbstractVector{S}, labels) where {S, T}
if tree.featval === nothing
-return apply_tree_proba(tree.left, features, labels)
+return _apply_tree_proba(tree.left, features, labels)
elseif features[tree.featid] < tree.featval
-return apply_tree_proba(tree.left, features, labels)
+return _apply_tree_proba(tree.left, features, labels)
else
-return apply_tree_proba(tree.right, features, labels)
+return _apply_tree_proba(tree.right, features, labels)
end
end

-apply_tree_proba(tree::LeafOrNode{S, T}, features::AbstractMatrix{S}, labels) where {S, T} =
-stack_function_results(row->apply_tree_proba(tree, row, labels), features)
+apply_tree_proba(tree::LeafOrNode{S, T}, features::AbstractVecOrMat{S}, labels) where {S, T} =
+stack_function_results(row->_apply_tree_proba(tree, row, labels), features)
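Per the docstring above, `apply_tree_proba` returns an `N x n_labels` matrix whose rows sum to 1; with the widened signature it accepts vector features as well. Continuing the earlier sketch (hypothetical `model` and `features`):

probs = apply_tree_proba(model, features, ["a", "b"])  # 6×2 matrix, each row sums to 1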

function build_forest(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
n_subfeatures = -1,
n_trees = 10,
partial_sampling = 0.7,
@@ -254,11 +254,11 @@ function build_forest(
return Ensemble{S, T}(forest)
end

-function apply_forest(forest::Ensemble{S, T}, features::AbstractVector{S}) where {S, T}
+function _apply_forest(forest::Ensemble{S, T}, features::AbstractVector{S}) where {S, T}
n_trees = length(forest)
votes = Array{T}(undef, n_trees)
for i in 1:n_trees
-votes[i] = apply_tree(forest.trees[i], features)
+votes[i] = _apply_tree(forest.trees[i], features)
end

if T <: Float64
@@ -268,11 +268,11 @@ function apply_forest(forest::Ensemble{S, T}, features::AbstractVector{S}) where
end
end

-function apply_forest(forest::Ensemble{S, T}, features::AbstractMatrix{S}) where {S, T}
+function apply_forest(forest::Ensemble{S, T}, features::AbstractVecOrMat{S}) where {S, T}
N = size(features,1)
predictions = Array{T}(undef, N)
for i in 1:N
-predictions[i] = apply_forest(forest, features[i, :])
+predictions[i] = _apply_forest(forest, features[i, :])
end
return predictions
end
@@ -285,18 +285,18 @@ n_labels` matrix of probabilities, each row summing up to 1.
`col_labels` is a vector containing the distinct labels
(eg. ["versicolor", "virginica", "setosa"]). It specifies the column ordering
of the output matrix. """
-function apply_forest_proba(forest::Ensemble{S, T}, features::AbstractVector{S}, labels) where {S, T}
-votes = [apply_tree(tree, features) for tree in forest.trees]
+function _apply_forest_proba(forest::Ensemble{S, T}, features::AbstractVector{S}, labels) where {S, T}
+votes = [_apply_tree(tree, features) for tree in forest.trees]
return compute_probabilities(labels, votes)
end

-apply_forest_proba(forest::Ensemble{S, T}, features::AbstractMatrix{S}, labels) where {S, T} =
-stack_function_results(row->apply_forest_proba(forest, row, labels),
+apply_forest_proba(forest::Ensemble{S, T}, features::AbstractVecOrMat{S}, labels) where {S, T} =
+stack_function_results(row->_apply_forest_proba(forest, row, labels),
features)

function build_adaboost_stumps(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
n_iterations :: Integer;
rng = Random.GLOBAL_RNG) where {S, T}
N = length(labels)
@@ -321,11 +321,11 @@ function build_adaboost_stumps(
return (Ensemble{S, T}(stumps), coeffs)
end

-function apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractVector{S}) where {S, T}
+function _apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractVector{S}) where {S, T}
n_stumps = length(stumps)
counts = Dict()
for i in 1:n_stumps
-prediction = apply_tree(stumps.trees[i], features)
+prediction = _apply_tree(stumps.trees[i], features)
counts[prediction] = get(counts, prediction, 0.0) + coeffs[i]
end
top_prediction = stumps.trees[1].left.majority
@@ -339,11 +339,11 @@ function apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Fl
return top_prediction
end

-function apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractMatrix{S}) where {S, T}
+function apply_adaboost_stumps(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64}, features::AbstractVecOrMat{S}) where {S, T}
n_samples = size(features, 1)
predictions = Array{T}(undef, n_samples)
for i in 1:n_samples
-predictions[i] = apply_adaboost_stumps(stumps, coeffs, features[i,:])
+predictions[i] = _apply_adaboost_stumps(stumps, coeffs, features[i,:])
end
return predictions
end
@@ -356,13 +356,13 @@ n_labels` matrix of probabilities, each row summing up to 1.
`col_labels` is a vector containing the distinct labels
(eg. ["versicolor", "virginica", "setosa"]). It specifies the column ordering
of the output matrix. """
-function apply_adaboost_stumps_proba(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64},
+function _apply_adaboost_stumps_proba(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64},
features::AbstractVector{S}, labels::AbstractVector{T}) where {S, T}
-votes = [apply_tree(stump, features) for stump in stumps.trees]
+votes = [_apply_tree(stump, features) for stump in stumps.trees]
compute_probabilities(labels, votes, coeffs)
end

function apply_adaboost_stumps_proba(stumps::Ensemble{S, T}, coeffs::AbstractVector{Float64},
-features::AbstractMatrix{S}, labels::AbstractVector{T}) where {S, T}
-stack_function_results(row->apply_adaboost_stumps_proba(stumps, coeffs, row, labels), features)
+features::AbstractVecOrMat{S}, labels::AbstractVector{T}) where {S, T}
+stack_function_results(row->_apply_adaboost_stumps_proba(stumps, coeffs, row, labels), features)
end
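A pattern recurs throughout this file: the public methods keep their names and widen `features` to `AbstractVecOrMat`, while the per-sample helpers gain a leading underscore. The rename is what keeps dispatch unambiguous: once a `Vector` is a legal dataset (N samples, one feature), it can no longer also mean "a single sample's feature row" in the same public method. A minimal standalone sketch of the scheme (hypothetical names, not package code):

# One sample's feature row -> one prediction.
_predict_one(row::AbstractVector) = row[1] < 5.0 ? "a" : "b"

# Public method: rows are samples, whether X is a Vector or a Matrix.
function predict(X::AbstractVecOrMat)
    N = size(X, 1)
    # When X is a Vector, X[i, :] is the 1-element vector [X[i]]:
    # a single sample with a single feature.
    return [_predict_one(X[i, :]) for i in 1:N]
end

predict([1.0, 9.0])          # Vector: 2 samples × 1 feature -> ["a", "b"]
predict([1.0 0.0; 9.0 0.0])  # Matrix: 2 samples × 2 features -> ["a", "b"]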
13 changes: 6 additions & 7 deletions src/classification/tree.jl
@@ -43,7 +43,7 @@ module treeclassifier
# find an optimal split that satisfy the given constraints
# (max_depth, min_samples_split, min_purity_increase)
function _split!(
-X :: AbstractMatrix{S}, # the feature array
+X :: AbstractVecOrMat{S}, # the feature array
Y :: AbstractVector{Int}, # the label array
W :: AbstractVector{U}, # the weight vector
purity_function :: Function,
@@ -226,7 +226,7 @@ module treeclassifier
end

function _fit(
-X :: AbstractMatrix{S},
+X :: AbstractVecOrMat{S},
Y :: AbstractVector{Int},
W :: AbstractVector{U},
loss :: Function,
@@ -237,9 +237,8 @@
min_samples_split :: Int,
min_purity_increase :: Float64,
rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where {S, U}

-n_samples, n_features = size(X)
-
+n_samples, n_features = util.find_n_samples_and_n_features(X)
nc = Array{U}(undef, n_classes)
ncl = Array{U}(undef, n_classes)
ncr = Array{U}(undef, n_classes)
@@ -273,7 +272,7 @@ module treeclassifier
end

function fit(;
-X :: AbstractMatrix{S},
+X :: AbstractVecOrMat{S},
Y :: AbstractVector{T},
W :: Union{Nothing, AbstractVector{U}},
loss=util.entropy :: Function,
@@ -284,7 +283,7 @@
min_purity_increase :: Float64,
rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where {S, T, U}

-n_samples, n_features = size(X)
+n_samples, n_features = util.find_n_samples_and_n_features(X)
list, Y_ = util.assign(Y)
if W == nothing
W = fill(1, n_samples)
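Both `_fit` and `fit` now call `util.find_n_samples_and_n_features(X)` where they previously destructured `size(X)`; the old form would throw for a `Vector`, whose `size` is a 1-tuple. The helper's definition is not among the hunks shown here, so the following is only a plausible reconstruction consistent with how it is called:

# Assumed sketch of the helper (its actual definition lies outside this diff):
find_n_samples_and_n_features(X::AbstractMatrix) = size(X)
find_n_samples_and_n_features(X::AbstractVector) = (length(X), 1)  # N samples, one feature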
14 changes: 7 additions & 7 deletions src/measures.jl
@@ -72,7 +72,7 @@ function confusion_matrix(actual::AbstractVector, predicted::AbstractVector)
return ConfusionMatrix(classes, CM, accuracy, kappa)
end

-function _nfoldCV(classifier::Symbol, labels::AbstractVector{T}, features::AbstractMatrix{S}, args...; verbose, rng) where {S, T}
+function _nfoldCV(classifier::Symbol, labels::AbstractVector{T}, features::AbstractVecOrMat{S}, args...; verbose, rng) where {S, T}
_rng = mk_rng(rng)::Random.AbstractRNG
nfolds = args[1]
if nfolds < 2
@@ -151,7 +151,7 @@ end

function nfoldCV_tree(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
n_folds :: Integer,
pruning_purity :: Float64 = 1.0,
max_depth :: Integer = -1,
@@ -165,7 +165,7 @@
end
function nfoldCV_forest(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
n_folds :: Integer,
n_subfeatures :: Integer = -1,
n_trees :: Integer = 10,
@@ -181,7 +181,7 @@
end
function nfoldCV_stumps(
labels ::AbstractVector{T},
-features ::AbstractMatrix{S},
+features ::AbstractVecOrMat{S},
n_folds ::Integer,
n_iterations ::Integer = 10;
verbose :: Bool = true,
@@ -203,7 +203,7 @@ function R2(actual, predicted)
return 1.0 - ss_residual/ss_total
end

-function _nfoldCV(regressor::Symbol, labels::AbstractVector{T}, features::AbstractMatrix, args...; verbose, rng) where T <: Float64
+function _nfoldCV(regressor::Symbol, labels::AbstractVector{T}, features::AbstractVecOrMat, args...; verbose, rng) where T <: Float64
_rng = mk_rng(rng)::Random.AbstractRNG
nfolds = args[1]
if nfolds < 2
@@ -279,7 +279,7 @@ end

function nfoldCV_tree(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
n_folds :: Integer,
pruning_purity :: Float64 = 1.0,
max_depth :: Integer = -1,
@@ -293,7 +293,7 @@ _nfoldCV(:tree, labels, features, n_folds, pruning_purity, max_depth,
end
function nfoldCV_forest(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
n_folds :: Integer,
n_subfeatures :: Integer = -1,
n_trees :: Integer = 10,
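With `_nfoldCV` and the public `nfoldCV_*` wrappers widened to `AbstractVecOrMat`, cross-validation runs directly on single-feature data. A sketch with made-up data (assumes `using DecisionTree` on this branch):

labels   = repeat(["a", "b"], 30)                               # 60 samples
features = [l == "a" ? randn() : 4.0 + randn() for l in labels] # one feature each
accuracies = nfoldCV_tree(labels, features, 3)                  # 3-fold CV, no reshape needed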
6 changes: 3 additions & 3 deletions src/regression/main.jl
@@ -10,13 +10,13 @@ function _convert(node::treeregressor.NodeMeta{S}, labels::Array{T}) where {S, T
end
end

-function build_stump(labels::AbstractVector{T}, features::AbstractMatrix{S}; rng = Random.GLOBAL_RNG) where {S, T <: Float64}
+function build_stump(labels::AbstractVector{T}, features::AbstractVecOrMat{S}; rng = Random.GLOBAL_RNG) where {S, T <: Float64}
return build_tree(labels, features, 0, 1)
end

function build_tree(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
n_subfeatures = 0,
max_depth = -1,
min_samples_leaf = 5,
@@ -48,7 +48,7 @@ end

function build_forest(
labels :: AbstractVector{T},
-features :: AbstractMatrix{S},
+features :: AbstractVecOrMat{S},
n_subfeatures = -1,
n_trees = 10,
partial_sampling = 0.7,
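The regression entry points mirror the classification changes, so a univariate regression fit needs no reshape either. A sketch with invented data:

X = collect(0.0:0.1:10.0)               # 101 samples, one feature, as a Vector
y = sin.(X) .+ 0.1 .* randn(length(X))
model = build_tree(y, X)                # regression: labels first, then features
y_hat = apply_tree(model, X)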
11 changes: 5 additions & 6 deletions src/regression/tree.jl
@@ -42,7 +42,7 @@ module treeregressor
# find an optimal split that satisfy the given constraints
# (max_depth, min_samples_split, min_purity_increase)
function _split!(
-X :: AbstractMatrix{S}, # the feature array
+X :: AbstractVecOrMat{S}, # the feature array
Y :: AbstractVector{Float64}, # the label array
W :: AbstractVector{U},
node :: NodeMeta{S}, # the node to split
@@ -229,7 +229,7 @@ module treeregressor
end

function _fit(
-X :: AbstractMatrix{S},
+X :: AbstractVecOrMat{S},
Y :: AbstractVector{Float64},
W :: AbstractVector{U},
max_features :: Int,
@@ -239,8 +239,7 @@
min_purity_increase :: Float64,
rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where {S, U}

-n_samples, n_features = size(X)
-
+n_samples, n_features = util.find_n_samples_and_n_features(X)
Yf = Array{Float64}(undef, n_samples)
Xf = Array{S}(undef, n_samples)
Wf = Array{U}(undef, n_samples)
@@ -272,7 +271,7 @@ module treeregressor
end

function fit(;
-X :: AbstractMatrix{S},
+X :: AbstractVecOrMat{S},
Y :: AbstractVector{Float64},
W :: Union{Nothing, AbstractVector{U}},
max_features :: Int,
@@ -282,7 +281,7 @@
min_purity_increase :: Float64,
rng=Random.GLOBAL_RNG :: Random.AbstractRNG) where {S, U}

-n_samples, n_features = size(X)
+n_samples, n_features = util.find_n_samples_and_n_features(X)
if W == nothing
W = fill(1.0, n_samples)
end