KMeans OOP (#1770)

* kmeans oop init commit * reformat * reformat * experimental * address ci failures * deselected tests * will be reverted * enable deslected tests * include elkan * address CI failure * address ci failures * enable all deselected tests * deselected tests * compiler update * init signature * deselected tests * format * add sparsity support * lint * minor fix * callable init * lint * table fix * minor * minor * rename attribute * test, revert later * minor * add sparsity * lint * replace basic stat with numpy * remove skip * CI fixes * CI fixes * lint * minor * fix sample_weight * pandas dtype * lint * remove deselected tests * use numpy variance * test sparse offset * revert b51e6bd * remove basic_statistics changes * remove comments * minor * update * update * add result option * refactor for csr * lint * refactor and ci * add version check for oneDAL * update * fix for CI * ci fix * minor * some fixes * ci fixes * lint * add version checks * csr condition for policy * version check for stability check * update test * floating methods * minor * ci fixes * minor * address review * address review * minor * update comments * refactor * ci * address ci * update test * version check * lint * minor fix * lint * basic stat fix * score * minor * ci fix + refactor * more fixes * not a table * minor * sample weight * import * preview remove * SPMD fix * SPMD fix * SPMD fix * refactor * deselect * deselect refactor * deselect update * deselect update * deselect update * deselect * reverting to previous * update daal version * refactor deselected tests * update daal check * address comments * address comments * test fix * address comments * minor * refactor * refactor * refactor * ci fix * ci fix * minor * update checks * import * fix import * refactor * update test * update test * ci fixes * lint * minor * minor * ci fix * fix ci * fix ci * fix ci * fix ci --------- Co-authored-by: md.shafiul.alam <[email protected]>
uxlfoundation · Sep 6, 2024 · fede266
1 parent 48714b0
commit fede266
Show file tree

Hide file tree

Showing 16 changed files with 714 additions and 615 deletions.
diff --git a/deselected_tests.yaml b/deselected_tests.yaml
@@ -177,7 +177,6 @@ deselected_tests:
 
   # test_non_uniform_strategies fails due to differences in handling of vacuous clusters after update
   # See https://github.com/IntelPython/daal4py/issues/69
-  - cluster/tests/test_k_means.py::test_relocated_clusters >=0.23,<0.24
   - cluster/tests/test_k_means.py::test_kmeans_relocated_clusters >=0.24
 
   # In scikit-learn, these algorithms are not included in this test. However, scikit-learn-intelex
@@ -258,9 +257,6 @@ deselected_tests:
   # Different results scikit-learn-intelex and scikit-learn linear regression with weights. Need to investigate.
   - inspection/tests/test_permutation_importance.py::test_permutation_importance_sample_weight >=0.24
 
-  # Patched and unpatched kmeans set same values to different clusters. Need to investigate.
-  - preprocessing/tests/test_discretization.py::test_nonuniform_strategies[kmeans-expected_2bins1-expected_3bins1-expected_5bins1] >=0.24
-
   # OOB scores in scikit-learn and oneDAL are different because of different random number generators
   - ensemble/tests/test_forest.py::test_forest_classifier_oob[X1-y1-0.65-array-ExtraTreesClassifier]
   - ensemble/tests/test_forest.py::test_forest_classifier_oob[True-X1-y1-0.65-array-ExtraTreesClassifier] >=1.3
@@ -362,14 +358,6 @@ deselected_tests:
   - tests/test_common.py::test_estimators[LogisticRegression()-check_sample_weights_invariance(kind=zeros)] >=1.4
   - tests/test_multioutput.py::test_classifier_chain_fit_and_predict_with_sparse_data >=1.4
 
-  # New failing sklearn1.4.1 tests for kmeans associated with incorrect n_iter_ values in daal4py
-  - cluster/tests/test_k_means.py::test_relocating_with_duplicates[lloyd-dense] >=1.4
-  - cluster/tests/test_k_means.py::test_relocating_with_duplicates[lloyd-sparse_matrix] >=1.4
-  - cluster/tests/test_k_means.py::test_relocating_with_duplicates[lloyd-sparse_array] >=1.4
-  - cluster/tests/test_k_means.py::test_relocating_with_duplicates[elkan-dense] >=1.4
-  - cluster/tests/test_k_means.py::test_relocating_with_duplicates[elkan-sparse_matrix] >=1.4
-  - cluster/tests/test_k_means.py::test_relocating_with_duplicates[elkan-sparse_array] >=1.4
-
   # Deselected tests for incremental algorithms
   # Need to rework getting policy to correctly obtain it for method without data (finalize_fit)
   # and avoid keeping it in class attribute, also need to investigate how to implement
@@ -466,16 +454,15 @@ public:
   - neighbors/tests/test_neighbors.py::test_KNeighborsClassifier_raise_on_all_zero_weights
 
   # --------------------------------------------------------
-  # The following tests currently fail with GPU offload
+  # The following tests currently fail with GPU offloading
 gpu:
-
   # Segfaults
   - ensemble/tests/test_weight_boosting.py
-
   # Fails
   - cluster/tests/test_dbscan.py::test_weighted_dbscan
-  - cluster/tests/test_k_means.py::test_k_means_fit_predict
-  - cluster/tests/test_k_means.py::test_predict
+  - cluster/tests/test_k_means.py::test_kmeans_elkan_results[42-1e-100-sparse-normal]
+  - cluster/tests/test_k_means.py::test_kmeans_elkan_results[42-1e-100-sparse-blobs]
+  - model_selection/tests/test_search.py::test_unsupervised_grid_search
 
   - ensemble/tests/test_bagging.py::test_gridsearch
   - ensemble/tests/test_bagging.py::test_estimators_samples
@@ -609,8 +596,6 @@ gpu:
   - tests/test_common.py::test_estimators[GaussianMixture()-check_fit_idempotent]
   - tests/test_common.py::test_estimators[GaussianMixture()-check_n_features_in]
   - tests/test_common.py::test_estimators[GaussianMixture()-check_fit2d_predict1d]
-  - tests/test_common.py::test_estimators[KMeans()-check_clustering]
-  - tests/test_common.py::test_estimators[KMeans()-check_clustering(readonly_memmap=True)]
   - tests/test_common.py::test_estimators[RandomForestClassifier()-check_class_weight_classifiers]
   - tests/test_common.py::test_estimators[SVC()-check_sample_weights_pandas_series]
   - tests/test_common.py::test_estimators[SVC()-check_sample_weights_not_an_array]
@@ -645,7 +630,6 @@ gpu:
   - tests/test_multiclass.py::test_ovr_coef_
   - tests/test_multiclass.py::test_ovr_deprecated_coef_intercept
   - tests/test_multiclass.py::test_pairwise_cross_val_score
-
   - tests/test_multioutput.py::test_multiclass_multioutput_estimator_predict_proba
   - tests/test_multioutput.py::test_classifier_chain_fit_and_predict_with_sparse_data
 
@@ -658,25 +642,6 @@ gpu:
   - tests/test_common.py::test_search_cv
   - manifold/tests/test_t_sne.py::test_n_iter_without_progress
 
-  # KMeans based (unsupported for GPU)
-  - cluster/tests/test_k_means.py
-  - tests/test_common.py::test_pandas_column_name_consistency[KMeans()]
-  - tests/test_common.py::test_pandas_column_name_consistency[GaussianMixture()]
-  - tests/test_common.py::test_pandas_column_name_consistency[BayesianGaussianMixture()]
-  - tests/test_common.py::test_estimators[KMeans()
-  - tests/test_common.py::test_estimators[BayesianGaussianMixture()-check_fit_check_is_fitted]
-  - tests/test_common.py::test_estimators[GaussianMixture()-check_fit_check_is_fitted]
-  - tests/test_common.py::test_check_n_features_in_after_fitting[BayesianGaussianMixture()]
-  - tests/test_common.py::test_check_n_features_in_after_fitting[GaussianMixture()]
-  - tests/test_common.py::test_check_n_features_in_after_fitting[KMeans()]
-  - tests/test_common.py::test_set_output_transform[KMeans()]
-  - tests/test_common.py::test_set_output_transform_pandas[KMeans()]
-  - tests/test_common.py::test_global_output_transform_pandas[KMeans()]
-  - mixture/tests/test_gaussian_mixture.py
-  - model_selection/tests/test_validation.py::test_cross_val_predict
-  - metrics/tests/test_score_objects.py::test_supervised_cluster_scorers
-  - tests/test_pipeline.py::test_fit_predict_on_pipeline
-  - tests/test_discriminant_analysis.py::test_lda_predict
   # Other device issues
   - tests/test_metaestimators.py::test_meta_estimators_delegate_data_validation[StackingClassifier]
   - tests/test_multiclass.py::test_ovr_always_present
@@ -759,9 +724,3 @@ gpu:
   # RuntimeError: Device support is not implemented, failing as result of fallback to cpu false
   - svm/tests/test_svm.py::test_unfitted
   - tests/test_common.py::test_estimators[SVC()-check_estimators_unfitted]
-
-preview:
-  - cluster/tests/test_k_means.py::test_kmeans_elkan_results
-  - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[KMeans-dense] <1.2
-  - cluster/tests/test_k_means.py::test_unit_weights_vs_no_weights[42-KMeans-dense] >=1.2
-  - cluster/tests/test_k_means.py::test_predict_sample_weight_deprecation_warning[KMeans] >=1.3
diff --git a/onedal/cluster/kmeans.cpp b/onedal/cluster/kmeans.cpp
@@ -38,6 +38,9 @@ struct method2t {
         const auto method = params["method"].cast<std::string>();
         ONEDAL_PARAM_DISPATCH_VALUE(method, "by_default", ops, Float, method::by_default);
         ONEDAL_PARAM_DISPATCH_VALUE(method, "lloyd_dense", ops, Float, method::lloyd_dense);
+#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700
+        ONEDAL_PARAM_DISPATCH_VALUE(method, "lloyd_csr", ops, Float, method::lloyd_csr);
+#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240700
         ONEDAL_PARAM_DISPATCH_THROW_INVALID_VALUE(method);
     }
 
@@ -47,14 +50,10 @@ struct method2t {
 template <typename Float, typename Method, typename Task>
 struct descriptor_creator {};
 
-template <typename Float>
-struct descriptor_creator<Float,
-                          dal::kmeans::method::by_default,
-                          dal::kmeans::task::clustering > {
+template <typename Float, typename Method>
+struct descriptor_creator<Float, Method, dal::kmeans::task::clustering> {
     static auto get() {
-        return dal::kmeans::descriptor<Float,
-                                  dal::kmeans::method::by_default,
-                                  dal::kmeans::task::clustering>{};
+        return dal::kmeans::descriptor<Float, Method, dal::kmeans::task::clustering>{};
     }
 };
 
@@ -65,10 +64,15 @@ struct params2desc {
 
         auto desc = descriptor_creator<Float, Method, Task>::get();
 
-        desc.set_cluster_count( params["cluster_count"].cast<std::int64_t>() );
-        desc.set_accuracy_threshold( params["accuracy_threshold"].cast<Float>() );
-        desc.set_max_iteration_count( params["max_iteration_count"].cast<std::int64_t>() );
-
+        desc.set_cluster_count(params["cluster_count"].cast<std::int64_t>());
+        desc.set_accuracy_threshold(params["accuracy_threshold"].cast<Float>());
+        desc.set_max_iteration_count(params["max_iteration_count"].cast<std::int64_t>());
+#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240200
+        auto result_options = params["result_options"].cast<std::string>();
+        if (result_options == "compute_exact_objective_function") {
+            desc.set_result_options(result_options::compute_exact_objective_function);
+        }
+#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20240200
         return desc;
     }
 };
@@ -153,7 +157,8 @@ void init_infer_result(py::module_& m) {
 
     auto cls = py::class_<result_t>(m, "infer_result")
                    .def(py::init())
-                   .DEF_ONEDAL_PY_PROPERTY(responses, result_t);
+                   .DEF_ONEDAL_PY_PROPERTY(responses, result_t)
+                   .DEF_ONEDAL_PY_PROPERTY(objective_function_value, result_t);
 }
 
 ONEDAL_PY_DECLARE_INSTANTIATOR(init_model);
@@ -173,10 +178,10 @@ ONEDAL_PY_INIT_MODULE(kmeans) {
     auto sub = m.def_submodule("kmeans");
 
 #ifdef ONEDAL_DATA_PARALLEL_SPMD
-    #if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230200
-        ONEDAL_PY_INSTANTIATE(init_train_ops, sub, policy_spmd, task_list);
-        ONEDAL_PY_INSTANTIATE(init_infer_ops, sub, policy_spmd, task_list);
-    #endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230200
+#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230200
+    ONEDAL_PY_INSTANTIATE(init_train_ops, sub, policy_spmd, task_list);
+    ONEDAL_PY_INSTANTIATE(init_infer_ops, sub, policy_spmd, task_list);
+#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230200
 #else // ONEDAL_DATA_PARALLEL_SPMD
     ONEDAL_PY_INSTANTIATE(init_train_ops, sub, policy_list, task_list);
     ONEDAL_PY_INSTANTIATE(init_infer_ops, sub, policy_list, task_list);