Skip to content

Commit

Permalink
KMeans OOP (#1770)
Browse files Browse the repository at this point in the history
* kmeans oop init commit

* reformat

* reformat

* experimental

* address ci failures

* deselected tests

* will be reverted

* enable deselected tests

* include elkan

* address CI failure

* address ci failures

* enable all deselected tests

* deselected tests

* compiler update

* init signature

* deselected tests

* format

* add sparsity support

* lint

* minor fix

* callable init

* lint

* table fix

* minor

* minor

* rename attribute

* test, revert later

* minor

* add sparsity

* lint

* replace basic stat with numpy

* remove skip

* CI fixes

* CI fixes

* lint

* minor

* fix sample_weight

* pandas dtype

* lint

* remove deselected tests

* use numpy variance

* test sparse offset

* revert b51e6bd

* remove basic_statistics changes

* remove comments

* minor

* update

* update

* add result option

* refactor for csr

* lint

* refactor and ci

* add version check for oneDAL

* update

* fix for CI

* ci fix

* minor

* some fixes

* ci fixes

* lint

* add version checks

* csr condition for policy

* version check for stability check

* update test

* floating methods

* minor

* ci fixes

* minor

* address review

* address review

* minor

* update comments

* refactor

* ci

* address ci

* update test

* version check

* lint

* minor fix

* lint

* basic stat fix

* score

* minor

* ci fix + refactor

* more fixes

* not a table

* minor

* sample weight

* import

* preview remove

* SPMD fix

* SPMD fix

* SPMD fix

* refactor

* deselect

* deselect refactor

* deselect update

* deselect update

* deselect update

* deselect

* reverting to previous

* update daal version

* refactor deselected tests

* update daal check

* address comments

* address comments

* test fix

* address comments

* minor

* refactor

* refactor

* refactor

* ci fix

* ci fix

* minor

* update checks

* import

* fix import

* refactor

* update test

* update test

* ci fixes

* lint

* minor

* minor

* ci fix

* fix ci

* fix ci

* fix ci

* fix ci

---------

Co-authored-by: md.shafiul.alam <[email protected]>
  • Loading branch information
md-shafiul-alam and md.shafiul.alam authored Sep 6, 2024
1 parent 48714b0 commit fede266
Show file tree
Hide file tree
Showing 16 changed files with 714 additions and 615 deletions.
49 changes: 4 additions & 45 deletions deselected_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,6 @@ deselected_tests:

# test_non_uniform_strategies fails due to differences in handling of vacuous clusters after update
# See https://github.com/IntelPython/daal4py/issues/69
- cluster/tests/test_k_means.py::test_relocated_clusters >=0.23,<0.24
- cluster/tests/test_k_means.py::test_kmeans_relocated_clusters >=0.24

# In scikit-learn, these algorithms are not included in this test. However, scikit-learn-intelex
Expand Down Expand Up @@ -258,9 +257,6 @@ deselected_tests:
# scikit-learn-intelex and scikit-learn linear regression give different results when sample weights are used. Need to investigate.
- inspection/tests/test_permutation_importance.py::test_permutation_importance_sample_weight >=0.24

# Patched and unpatched kmeans set same values to different clusters. Need to investigate.
- preprocessing/tests/test_discretization.py::test_nonuniform_strategies[kmeans-expected_2bins1-expected_3bins1-expected_5bins1] >=0.24

# OOB scores in scikit-learn and oneDAL are different because of different random number generators
- ensemble/tests/test_forest.py::test_forest_classifier_oob[X1-y1-0.65-array-ExtraTreesClassifier]
- ensemble/tests/test_forest.py::test_forest_classifier_oob[True-X1-y1-0.65-array-ExtraTreesClassifier] >=1.3
Expand Down Expand Up @@ -362,14 +358,6 @@ deselected_tests:
- tests/test_common.py::test_estimators[LogisticRegression()-check_sample_weights_invariance(kind=zeros)] >=1.4
- tests/test_multioutput.py::test_classifier_chain_fit_and_predict_with_sparse_data >=1.4

# New failing sklearn1.4.1 tests for kmeans associated with incorrect n_iter_ values in daal4py
- cluster/tests/test_k_means.py::test_relocating_with_duplicates[lloyd-dense] >=1.4
- cluster/tests/test_k_means.py::test_relocating_with_duplicates[lloyd-sparse_matrix] >=1.4
- cluster/tests/test_k_means.py::test_relocating_with_duplicates[lloyd-sparse_array] >=1.4
- cluster/tests/test_k_means.py::test_relocating_with_duplicates[elkan-dense] >=1.4
- cluster/tests/test_k_means.py::test_relocating_with_duplicates[elkan-sparse_matrix] >=1.4
- cluster/tests/test_k_means.py::test_relocating_with_duplicates[elkan-sparse_array] >=1.4

# Deselected tests for incremental algorithms
# Need to rework how the policy is obtained so that it works for methods without data (finalize_fit)
# and is not kept in a class attribute; also need to investigate how to implement
Expand Down Expand Up @@ -466,16 +454,15 @@ public:
- neighbors/tests/test_neighbors.py::test_KNeighborsClassifier_raise_on_all_zero_weights

# --------------------------------------------------------
# The following tests currently fail with GPU offload
# The following tests currently fail with GPU offloading
gpu:

# Segfaults
- ensemble/tests/test_weight_boosting.py

# Fails
- cluster/tests/test_dbscan.py::test_weighted_dbscan
- cluster/tests/test_k_means.py::test_k_means_fit_predict
- cluster/tests/test_k_means.py::test_predict
- cluster/tests/test_k_means.py::test_kmeans_elkan_results[42-1e-100-sparse-normal]
- cluster/tests/test_k_means.py::test_kmeans_elkan_results[42-1e-100-sparse-blobs]
- model_selection/tests/test_search.py::test_unsupervised_grid_search

- ensemble/tests/test_bagging.py::test_gridsearch
- ensemble/tests/test_bagging.py::test_estimators_samples
Expand Down Expand Up @@ -609,8 +596,6 @@ gpu:
- tests/test_common.py::test_estimators[GaussianMixture()-check_fit_idempotent]
- tests/test_common.py::test_estimators[GaussianMixture()-check_n_features_in]
- tests/test_common.py::test_estimators[GaussianMixture()-check_fit2d_predict1d]
- tests/test_common.py::test_estimators[KMeans()-check_clustering]
- tests/test_common.py::test_estimators[KMeans()-check_clustering(readonly_memmap=True)]
- tests/test_common.py::test_estimators[RandomForestClassifier()-check_class_weight_classifiers]
- tests/test_common.py::test_estimators[SVC()-check_sample_weights_pandas_series]
- tests/test_common.py::test_estimators[SVC()-check_sample_weights_not_an_array]
Expand Down Expand Up @@ -645,7 +630,6 @@ gpu:
- tests/test_multiclass.py::test_ovr_coef_
- tests/test_multiclass.py::test_ovr_deprecated_coef_intercept
- tests/test_multiclass.py::test_pairwise_cross_val_score

- tests/test_multioutput.py::test_multiclass_multioutput_estimator_predict_proba
- tests/test_multioutput.py::test_classifier_chain_fit_and_predict_with_sparse_data

Expand All @@ -658,25 +642,6 @@ gpu:
- tests/test_common.py::test_search_cv
- manifold/tests/test_t_sne.py::test_n_iter_without_progress

# KMeans based (unsupported for GPU)
- cluster/tests/test_k_means.py
- tests/test_common.py::test_pandas_column_name_consistency[KMeans()]
- tests/test_common.py::test_pandas_column_name_consistency[GaussianMixture()]
- tests/test_common.py::test_pandas_column_name_consistency[BayesianGaussianMixture()]
- tests/test_common.py::test_estimators[KMeans()
- tests/test_common.py::test_estimators[BayesianGaussianMixture()-check_fit_check_is_fitted]
- tests/test_common.py::test_estimators[GaussianMixture()-check_fit_check_is_fitted]
- tests/test_common.py::test_check_n_features_in_after_fitting[BayesianGaussianMixture()]
- tests/test_common.py::test_check_n_features_in_after_fitting[GaussianMixture()]
- tests/test_common.py::test_check_n_features_in_after_fitting[KMeans()]
- tests/test_common.py::test_set_output_transform[KMeans()]
- tests/test_common.py::test_set_output_transform_pandas[KMeans()]
- tests/test_common.py::test_global_output_transform_pandas[KMeans()]
- mixture/tests/test_gaussian_mixture.py
- model_selection/tests/test_validation.py::test_cross_val_predict
- metrics/tests/test_score_objects.py::test_supervised_cluster_scorers
- tests/test_pipeline.py::test_fit_predict_on_pipeline
- tests/test_discriminant_analysis.py::test_lda_predict
# Other device issues
- tests/test_metaestimators.py::test_meta_estimators_delegate_data_validation[StackingClassifier]
- tests/test_multiclass.py::test_ovr_always_present
Expand Down Expand Up @@ -759,9 +724,3 @@ gpu:
# RuntimeError: Device support is not implemented, failing as result of fallback to cpu false
- svm/tests/test_svm.py::test_unfitted
- tests/test_common.py::test_estimators[SVC()-check_estimators_unfitted]

preview:
- cluster/tests/test_k_means.py::test_kmeans_elkan_results
- cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[KMeans-dense] <1.2
- cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[42-KMeans-dense] >=1.2
- cluster/tests/test_k_means.py::test_predict_sample_weight_deprecation_warning[KMeans] >=1.3
37 changes: 21 additions & 16 deletions onedal/cluster/kmeans.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ struct method2t {
const auto method = params["method"].cast<std::string>();
ONEDAL_PARAM_DISPATCH_VALUE(method, "by_default", ops, Float, method::by_default);
ONEDAL_PARAM_DISPATCH_VALUE(method, "lloyd_dense", ops, Float, method::lloyd_dense);
#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700
ONEDAL_PARAM_DISPATCH_VALUE(method, "lloyd_csr", ops, Float, method::lloyd_csr);
#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700
ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(method);
}

Expand All @@ -47,14 +50,10 @@ struct method2t {
template <typename Float, typename Method, typename Task>
struct descriptor_creator {};

template <typename Float>
struct descriptor_creator<Float,
dal::kmeans::method::by_default,
dal::kmeans::task::clustering > {
template <typename Float, typename Method>
struct descriptor_creator<Float, Method, dal::kmeans::task::clustering> {
static auto get() {
return dal::kmeans::descriptor<Float,
dal::kmeans::method::by_default,
dal::kmeans::task::clustering>{};
return dal::kmeans::descriptor<Float, Method, dal::kmeans::task::clustering>{};
}
};

Expand All @@ -65,10 +64,15 @@ struct params2desc {

auto desc = descriptor_creator<Float, Method, Task>::get();

desc.set_cluster_count( params["cluster_count"].cast<std::int64_t>() );
desc.set_accuracy_threshold( params["accuracy_threshold"].cast<Float>() );
desc.set_max_iteration_count( params["max_iteration_count"].cast<std::int64_t>() );

desc.set_cluster_count(params["cluster_count"].cast<std::int64_t>());
desc.set_accuracy_threshold(params["accuracy_threshold"].cast<Float>());
desc.set_max_iteration_count(params["max_iteration_count"].cast<std::int64_t>());
#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240200
auto result_options = params["result_options"].cast<std::string>();
if (result_options == "compute_exact_objective_function") {
desc.set_result_options(result_options::compute_exact_objective_function);
}
#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240200
return desc;
}
};
Expand Down Expand Up @@ -153,7 +157,8 @@ void init_infer_result(py::module_& m) {

auto cls = py::class_<result_t>(m, "infer_result")
.def(py::init())
.DEF_ONEDAL_PY_PROPERTY(responses, result_t);
.DEF_ONEDAL_PY_PROPERTY(responses, result_t)
.DEF_ONEDAL_PY_PROPERTY(objective_function_value, result_t);
}

ONEDAL_PY_DECLARE_INSTANTIATOR(init_model);
Expand All @@ -173,10 +178,10 @@ ONEDAL_PY_INIT_MODULE(kmeans) {
auto sub = m.def_submodule("kmeans");

#ifdef ONEDAL_DATA_PARALLEL_SPMD
#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230200
ONEDAL_PY_INSTANTIATE(init_train_ops, sub, policy_spmd, task_list);
ONEDAL_PY_INSTANTIATE(init_infer_ops, sub, policy_spmd, task_list);
#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230200
#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230200
ONEDAL_PY_INSTANTIATE(init_train_ops, sub, policy_spmd, task_list);
ONEDAL_PY_INSTANTIATE(init_infer_ops, sub, policy_spmd, task_list);
#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230200
#else // ONEDAL_DATA_PARALLEL_SPMD
ONEDAL_PY_INSTANTIATE(init_train_ops, sub, policy_list, task_list);
ONEDAL_PY_INSTANTIATE(init_infer_ops, sub, policy_list, task_list);
Expand Down
Loading

0 comments on commit fede266

Please sign in to comment.