Merge branch 'main' into dev/bs_zero
icfaust authored Dec 3, 2024
2 parents 2ebf71b + 675a2da commit 8e4cde0
Showing 43 changed files with 1,367 additions and 754 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/ci.yml
@@ -60,7 +60,7 @@ jobs:
- name: Get run ID of "Nightly-build" workflow
id: get-run-id
run: |
OTHER_REPO="oneapi-src/oneDAL"
OTHER_REPO="uxlfoundation/oneDAL"
WF_NAME="Nightly-build"
JQ_QUERY='map(select(.event == "workflow_dispatch" or .event == "schedule")) | .[0].databaseId'
RUN_ID=`gh run --repo ${OTHER_REPO} list --workflow "${WF_NAME}" --json databaseId,event --status success --jq "${JQ_QUERY}"`
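The `JQ_QUERY` above keeps only runs triggered by `workflow_dispatch` or `schedule` and takes the `databaseId` of the newest one (the `gh run list` output is newest-first). A rough pure-Python analogue of that selection, over made-up run data:

```python
# Sketch of the JQ_QUERY selection logic: from the newest-first list of
# successful runs, keep only scheduled or manually dispatched ones and
# take the first databaseId. The run dicts are illustrative, not real data.
runs = [
    {"databaseId": 310, "event": "push"},
    {"databaseId": 295, "event": "schedule"},
    {"databaseId": 288, "event": "workflow_dispatch"},
]
eligible = [r for r in runs if r["event"] in ("workflow_dispatch", "schedule")]
run_id = eligible[0]["databaseId"] if eligible else None
print(run_id)  # 295
```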
@@ -73,15 +73,15 @@ jobs:
with:
name: __release_lnx
github-token: ${{ github.token }}
repository: oneapi-src/oneDAL
repository: uxlfoundation/oneDAL
run-id: ${{ steps.get-run-id.outputs.run-id }}
path: ./__release_lnx
- name: Download oneDAL environment artifact
uses: actions/download-artifact@v4
with:
name: oneDAL_env
github-token: ${{ github.token }}
repository: oneapi-src/oneDAL
repository: uxlfoundation/oneDAL
run-id: ${{ steps.get-run-id.outputs.run-id }}
path: .ci/env
- name: Set Environment Variables
@@ -181,7 +181,7 @@ jobs:
id: get-run-id
shell: bash
run: |
OTHER_REPO="oneapi-src/oneDAL"
OTHER_REPO="uxlfoundation/oneDAL"
WF_NAME="Nightly-build"
JQ_QUERY='map(select(.event == "workflow_dispatch" or .event == "schedule")) | .[0].databaseId'
RUN_ID=`gh run --repo ${OTHER_REPO} list --workflow "${WF_NAME}" --json databaseId,event --status success --jq "${JQ_QUERY}"`
@@ -194,15 +194,15 @@ jobs:
with:
name: __release_win
github-token: ${{ github.token }}
repository: oneapi-src/oneDAL
repository: uxlfoundation/oneDAL
run-id: ${{ steps.get-run-id.outputs.run-id }}
path: ./__release_win
- name: Download Intel BaseKit artifact
uses: actions/download-artifact@v4
with:
name: intel_oneapi_basekit
github-token: ${{ github.token }}
repository: oneapi-src/oneDAL
repository: uxlfoundation/oneDAL
run-id: ${{ steps.get-run-id.outputs.run-id }}
- name: Decompress Intel BaseKit
shell: cmd
@@ -224,7 +224,7 @@ jobs:
with:
name: opencl_rt_installer
github-token: ${{ github.token }}
repository: oneapi-src/oneDAL
repository: uxlfoundation/oneDAL
run-id: ${{ steps.get-run-id.outputs.run-id }}
path: .
- name: Install Intel OpenCL CPU Runtime
12 changes: 3 additions & 9 deletions deselected_tests.yaml
@@ -280,12 +280,8 @@ deselected_tests:
# Need to rework policy retrieval to correctly obtain it for methods without data (finalize_fit)
# and avoid keeping it in a class attribute; also need to investigate how to implement
# partial result serialization
- tests/test_common.py::test_estimators[IncrementalEmpiricalCovariance()-check_estimators_pickle]
- tests/test_common.py::test_estimators[IncrementalEmpiricalCovariance()-check_estimators_pickle(readonly_memmap=True)]
- tests/test_common.py::test_estimators[IncrementalLinearRegression()-check_estimators_pickle]
- tests/test_common.py::test_estimators[IncrementalLinearRegression()-check_estimators_pickle(readonly_memmap=True)]
- tests/test_common.py::test_estimators[IncrementalPCA()-check_estimators_pickle]
- tests/test_common.py::test_estimators[IncrementalPCA()-check_estimators_pickle(readonly_memmap=True)]
- tests/test_common.py::test_estimators[IncrementalRidge()-check_estimators_pickle]
- tests/test_common.py::test_estimators[IncrementalRidge()-check_estimators_pickle(readonly_memmap=True)]
# There is not enough data to run the onedal backend
@@ -305,6 +301,9 @@ deselected_tests:
# Fails in stock scikit-learn: checks that data is modified in-place when not strictly required
- linear_model/tests/test_base.py::test_inplace_data_preprocessing

# Failure occurs in Python 3.9 on Windows CPU only - not easy to reproduce
- ensemble/tests/test_weight_boosting.py::test_estimator >= 1.4 win32

# --------------------------------------------------------
# No need to test daal4py patching
reduced_tests:
@@ -462,8 +461,3 @@ gpu:
# RuntimeError: Device support is not implemented, failing as result of fallback to cpu false
- svm/tests/test_svm.py::test_unfitted
- tests/test_common.py::test_estimators[SVC()-check_estimators_unfitted]

# Failed on the onedal's LinearRegression call.
# RuntimeError: oneapi::mkl::lapack::potrf: computation error: info = 2: Leading principal minor of order
# 2 is not positive, and the factorization could not be completed.
- ensemble/tests/test_stacking.py::test_stacking_prefit[StackingRegressor-DummyRegressor-predict-final_estimator1-X1-y1]
4 changes: 2 additions & 2 deletions doc/sources/algorithms.rst
@@ -304,7 +304,7 @@ Regression

- ``normalize`` != `False`
- ``sample_weight`` != `None`
- Only dense data is supported, `#observations` should be >= `#features` and there should be no linearly dependent features in the data.
- Only dense data is supported.

Clustering
**********
@@ -456,7 +456,7 @@ Regression

- ``normalize`` != `False`
- ``sample_weight`` != `None`
- Only dense data is supported, `#observations` should be >= `#features` and there should be no linearly dependent features in the data.
- Only dense data is supported.

Clustering
**********
9 changes: 3 additions & 6 deletions doc/sources/preview.rst
@@ -50,10 +50,10 @@ Then, you can import Scikit-learn estimator patched with a preview one from `skl

from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.linear_model import Ridge
print(Ridge.__module__)
from sklearn.decomposition import IncrementalPCA
print(IncrementalPCA.__module__)
# output:
# sklearnex.preview.linear_model.ridge
# sklearnex.preview.decomposition.incremental_pca

Current list of preview estimators:

@@ -71,6 +71,3 @@ Current list of preview estimators:
* - IncrementalPCA
- sklearnex.preview.decomposition
- Yes
* - Ridge
- sklearnex.preview.linear_model
- Yes
30 changes: 29 additions & 1 deletion onedal/basic_statistics/basic_statistics.cpp
@@ -19,6 +19,9 @@
#include "onedal/common.hpp"
#include "onedal/version.hpp"

#define NO_IMPORT_ARRAY // import_array called in table.cpp
#include "onedal/datatypes/data_conversion.hpp"

#include <string>
#include <regex>
#include <map>
@@ -204,7 +207,32 @@ void init_partial_compute_result(py::module_& m) {
.DEF_ONEDAL_PY_PROPERTY(partial_max, result_t)
.DEF_ONEDAL_PY_PROPERTY(partial_sum, result_t)
.DEF_ONEDAL_PY_PROPERTY(partial_sum_squares, result_t)
.DEF_ONEDAL_PY_PROPERTY(partial_sum_squares_centered, result_t);
.DEF_ONEDAL_PY_PROPERTY(partial_sum_squares_centered, result_t)
.def(py::pickle(
[](const result_t& res) {
return py::make_tuple(
py::cast<py::object>(convert_to_pyobject(res.get_partial_n_rows())),
py::cast<py::object>(convert_to_pyobject(res.get_partial_min())),
py::cast<py::object>(convert_to_pyobject(res.get_partial_max())),
py::cast<py::object>(convert_to_pyobject(res.get_partial_sum())),
py::cast<py::object>(convert_to_pyobject(res.get_partial_sum_squares())),
py::cast<py::object>(convert_to_pyobject(res.get_partial_sum_squares_centered()))
);
},
[](py::tuple t) {
if (t.size() != 6)
throw std::runtime_error("Invalid state!");
result_t res;
if (py::cast<int>(t[0].attr("size")) != 0) res.set_partial_n_rows(convert_to_table(t[0].ptr()));
if (py::cast<int>(t[1].attr("size")) != 0) res.set_partial_min(convert_to_table(t[1].ptr()));
if (py::cast<int>(t[2].attr("size")) != 0) res.set_partial_max(convert_to_table(t[2].ptr()));
if (py::cast<int>(t[3].attr("size")) != 0) res.set_partial_sum(convert_to_table(t[3].ptr()));
if (py::cast<int>(t[4].attr("size")) != 0) res.set_partial_sum_squares(convert_to_table(t[4].ptr()));
if (py::cast<int>(t[5].attr("size")) != 0) res.set_partial_sum_squares_centered(convert_to_table(t[5].ptr()));

return res;
}
));
}
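The `py::pickle` pair above mirrors Python's `__getstate__`/`__setstate__` protocol: the first lambda packs each partial table into a tuple, the second validates the tuple length and restores only the non-empty entries. A toy pure-Python sketch of the same pack/validate/restore round-trip (the class and field names here are illustrative, not the real binding):

```python
import pickle

class PartialState:
    """Toy analogue of the py::pickle pair: pack fields into a tuple on
    save, validate the length and restore them on load."""

    FIELDS = ("n_rows", "minimum", "maximum")

    def __init__(self):
        for name in self.FIELDS:
            setattr(self, name, None)

    def __getstate__(self):
        # Like the first lambda: serialize the state as a fixed-size tuple.
        return tuple(getattr(self, name) for name in self.FIELDS)

    def __setstate__(self, state):
        # Like the second lambda: reject tuples of the wrong arity.
        if len(state) != len(self.FIELDS):
            raise RuntimeError("Invalid state!")
        for name, value in zip(self.FIELDS, state):
            setattr(self, name, value)

ps = PartialState()
ps.n_rows = 10
ps.minimum = -0.3
restored = pickle.loads(pickle.dumps(ps))
print(restored.n_rows, restored.minimum, restored.maximum)  # 10 -0.3 None
```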

ONEDAL_PY_DECLARE_INSTANTIATOR(init_compute_result);
49 changes: 32 additions & 17 deletions onedal/basic_statistics/incremental_basic_statistics.py
@@ -65,10 +65,21 @@ def __init__(self, result_options="all", algorithm="by_default"):
self._reset()

def _reset(self):
self._need_to_finalize = False
self._partial_result = self._get_backend(
"basic_statistics", None, "partial_compute_result"
)

def __getstate__(self):
# Since finalize_fit can't be dispatched without a directly provided queue
# and the dispatching policy can't be serialized, the computation is finalized
# here and the policy is not saved in the serialized data.
self.finalize_fit()
data = self.__dict__.copy()
data.pop("_queue", None)

return data
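The pattern above — finalize any pending partial result inside `__getstate__` and drop the unpicklable queue handle — can be illustrated with a toy incremental estimator (a hypothetical class, not part of the library):

```python
import pickle

class IncrementalMean:
    """Toy incremental estimator mirroring the finalize-on-pickle pattern."""

    def __init__(self):
        self._sum = 0.0
        self._count = 0
        self._need_to_finalize = False
        self.mean_ = None

    def partial_fit(self, values):
        self._sum += sum(values)
        self._count += len(values)
        self._need_to_finalize = True
        return self

    def finalize_fit(self):
        if self._need_to_finalize:
            self.mean_ = self._sum / self._count
            self._need_to_finalize = False
        return self

    def __getstate__(self):
        # Finalize before serializing so no unfinalized backend state
        # (or device queue) has to cross the pickle boundary.
        self.finalize_fit()
        data = self.__dict__.copy()
        data.pop("_queue", None)  # a queue handle would not be picklable
        return data

est = IncrementalMean().partial_fit([1.0, 2.0, 3.0])
restored = pickle.loads(pickle.dumps(est))
print(restored.mean_)  # 2.0
print(restored._need_to_finalize)  # False
```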

def partial_fit(self, X, sample_weight=None, queue=None):
"""
Computes partial data for basic statistics
@@ -106,6 +117,9 @@ def partial_fit(self, X, sample_weight=None, queue=None):
sample_weight,
)

self._need_to_finalize = True
return self

def finalize_fit(self, queue=None):
"""
Finalizes basic statistics computation and obtains result
@@ -121,22 +135,23 @@
self : object
Returns the instance itself.
"""

if queue is not None:
policy = self._get_policy(queue)
else:
policy = self._get_policy(self._queue)

result = self._get_backend(
"basic_statistics",
None,
"finalize_compute",
policy,
self._onedal_params,
self._partial_result,
)

for opt in self.options:
setattr(self, opt, from_table(getattr(result, opt))[0])
if self._need_to_finalize:
if queue is not None:
policy = self._get_policy(queue)
else:
policy = self._get_policy(self._queue)

result = self._get_backend(
"basic_statistics",
None,
"finalize_compute",
policy,
self._onedal_params,
self._partial_result,
)
for opt in self.options:
setattr(self, opt, from_table(getattr(result, opt))[0])

self._need_to_finalize = False

return self
88 changes: 88 additions & 0 deletions onedal/basic_statistics/tests/test_incremental_basic_statistics.py
@@ -20,6 +20,7 @@

from onedal.basic_statistics import IncrementalBasicStatistics
from onedal.basic_statistics.tests.utils import options_and_tests
from onedal.datatypes import from_table
from onedal.tests.utils._device_selection import get_queues


@@ -189,3 +190,90 @@ def test_all_option_on_random_data(
gtr = function(data)
tol = fp32tol if res.dtype == np.float32 else fp64tol
assert_allclose(gtr, res, atol=tol)


@pytest.mark.parametrize("queue", get_queues())
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_incremental_estimator_pickle(queue, dtype):
import pickle

from onedal.basic_statistics import IncrementalBasicStatistics

incbs = IncrementalBasicStatistics()

# Check that estimator can be serialized without any data.
dump = pickle.dumps(incbs)
incbs_loaded = pickle.loads(dump)
seed = 77
gen = np.random.default_rng(seed)
X = gen.uniform(low=-0.3, high=+0.7, size=(10, 10))
X = X.astype(dtype)
X_split = np.array_split(X, 2)
incbs.partial_fit(X_split[0], queue=queue)
incbs_loaded.partial_fit(X_split[0], queue=queue)

assert incbs._need_to_finalize
assert incbs_loaded._need_to_finalize

# Check that estimator can be serialized after partial_fit call.
dump = pickle.dumps(incbs)
incbs_loaded = pickle.loads(dump)
assert not incbs._need_to_finalize
# Finalize is called during serialization to make sure partial results are finalized correctly.
assert not incbs_loaded._need_to_finalize

partial_n_rows = from_table(incbs._partial_result.partial_n_rows)
partial_n_rows_loaded = from_table(incbs_loaded._partial_result.partial_n_rows)
assert_allclose(partial_n_rows, partial_n_rows_loaded)

partial_min = from_table(incbs._partial_result.partial_min)
partial_min_loaded = from_table(incbs_loaded._partial_result.partial_min)
assert_allclose(partial_min, partial_min_loaded)

partial_max = from_table(incbs._partial_result.partial_max)
partial_max_loaded = from_table(incbs_loaded._partial_result.partial_max)
assert_allclose(partial_max, partial_max_loaded)

partial_sum = from_table(incbs._partial_result.partial_sum)
partial_sum_loaded = from_table(incbs_loaded._partial_result.partial_sum)
assert_allclose(partial_sum, partial_sum_loaded)

partial_sum_squares = from_table(incbs._partial_result.partial_sum_squares)
partial_sum_squares_loaded = from_table(
incbs_loaded._partial_result.partial_sum_squares
)
assert_allclose(partial_sum_squares, partial_sum_squares_loaded)

partial_sum_squares_centered = from_table(
incbs._partial_result.partial_sum_squares_centered
)
partial_sum_squares_centered_loaded = from_table(
incbs_loaded._partial_result.partial_sum_squares_centered
)
assert_allclose(partial_sum_squares_centered, partial_sum_squares_centered_loaded)

incbs.partial_fit(X_split[1], queue=queue)
incbs_loaded.partial_fit(X_split[1], queue=queue)
assert incbs._need_to_finalize
assert incbs_loaded._need_to_finalize

dump = pickle.dumps(incbs_loaded)
incbs_loaded = pickle.loads(dump)

assert incbs._need_to_finalize
assert not incbs_loaded._need_to_finalize

incbs.finalize_fit()
incbs_loaded.finalize_fit()

# Check that finalized estimator can be serialized.
dump = pickle.dumps(incbs_loaded)
incbs_loaded = pickle.loads(dump)

for result_option in options_and_tests:
_, tols = options_and_tests[result_option]
fp32tol, fp64tol = tols
res = getattr(incbs, result_option)
res_loaded = getattr(incbs_loaded, result_option)
tol = fp32tol if res.dtype == np.float32 else fp64tol
assert_allclose(res, res_loaded, atol=tol)
6 changes: 5 additions & 1 deletion onedal/common/tests/test_sycl.py
@@ -109,7 +109,11 @@ def test_sycl_device_attributes(queue):

@pytest.mark.skipif(not _is_dpc_backend, reason="requires dpc backend")
def test_backend_queue():
q = _backend.SyclQueue("cpu")
try:
q = _backend.SyclQueue("cpu")
except RuntimeError:
pytest.skip("OpenCL CPU runtime not installed")

# verify copying via a py capsule object is functional
q2 = _backend.SyclQueue(q._get_capsule())
# verify copying via the _get_capsule attribute