From b84094126bbdc58a883d8dd930d4f9b50456ba72 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 21 Aug 2023 20:45:17 +0800
Subject: [PATCH 1/3] Move feature weight to skl parameters.

Revert "Move feature weight to skl parameters."

This reverts commit e45e3b3f567b9c4328781d8737f8747b51c87fab.

Revert "Revert "Move feature weight to skl parameters.""

This reverts commit 1749338652b097db25c41574707bf68bc5a98199.
---
 python-package/xgboost/sklearn.py        | 51 +++++++++++++++++++-----
 python-package/xgboost/testing/shared.py |  8 +++-
 2 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index cf15f5d5d717..237305c58d4d 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -389,7 +389,13 @@ def task(i: int) -> float:
         Used for specifying feature types without constructing a dataframe. See
         :py:class:`DMatrix` for details.

-    max_cat_to_onehot : {Optional[int]}
+    feature_weights : Optional[ArrayLike]
+
+        Weight for each feature, defines the probability of each feature being selected
+        when colsample is being used. All values must be greater than 0, otherwise a
+        `ValueError` is thrown.
+
+    max_cat_to_onehot : Optional[int]

         .. versionadded:: 1.6.0

@@ -607,7 +613,7 @@ def _wrap_evaluation_matrices(
     qid: Optional[Any],
     sample_weight: Optional[Any],
     base_margin: Optional[Any],
-    feature_weights: Optional[Any],
+    feature_weights: Optional[ArrayLike],
     eval_set: Optional[Sequence[Tuple[Any, Any]]],
     sample_weight_eval_set: Optional[Sequence[Any]],
     base_margin_eval_set: Optional[Sequence[Any]],
@@ -753,6 +759,7 @@ def __init__(
         validate_parameters: Optional[bool] = None,
         enable_categorical: bool = False,
         feature_types: Optional[FeatureTypes] = None,
+        feature_weights: Optional[ArrayLike] = None,
         max_cat_to_onehot: Optional[int] = None,
         max_cat_threshold: Optional[int] = None,
         multi_strategy: Optional[str] = None,
@@ -799,6 +806,7 @@ def __init__(
         self.validate_parameters = validate_parameters
         self.enable_categorical = enable_categorical
         self.feature_types = feature_types
+        self.feature_weights = feature_weights
         self.max_cat_to_onehot = max_cat_to_onehot
         self.max_cat_threshold = max_cat_threshold
         self.multi_strategy = multi_strategy
@@ -1065,10 +1073,12 @@ def _configure_fit(
         self,
         booster: Optional[Union[Booster, "XGBModel", str]],
         params: Dict[str, Any],
+        feature_weights: Optional[ArrayLike],
     ) -> Tuple[
         Optional[Union[Booster, str, "XGBModel"]],
         Optional[Metric],
         Dict[str, Any],
+        Optional[ArrayLike],
     ]:
         """Configure parameters for :py:meth:`fit`."""
         if isinstance(booster, XGBModel):
@@ -1101,13 +1111,23 @@ def _duplicated(parameter: str) -> None:
         else:
             params.update({"eval_metric": self.eval_metric})

+        if feature_weights is not None:
+            _deprecated("feature_weights")
+        if feature_weights is not None and self.feature_types is not None:
+            _duplicated("feature_weights")
+        feature_weights = (
+            self.feature_weights
+            if self.feature_weights is not None
+            else feature_weights
+        )
+
         tree_method = params.get("tree_method", None)
         if self.enable_categorical and tree_method == "exact":
             raise ValueError(
                 "Experimental support for categorical data is not implemented for"
                 " current tree method yet."
             )
-        return model, metric, params
+        return model, metric, params, feature_weights

     def _create_dmatrix(self, ref: Optional[DMatrix], **kwargs: Any) -> DMatrix:
         # Use `QuantileDMatrix` to save memory.
@@ -1184,12 +1204,20 @@ def fit(
             A list of the form [M_1, M_2, ..., M_n], where each M_i is an array like
             object storing base margin for the i-th validation set.
         feature_weights :
-            Weight for each feature, defines the probability of each feature being
-            selected when colsample is being used. All values must be greater than 0,
-            otherwise a `ValueError` is thrown.
+
+            .. deprecated:: 3.0.0
+
+               Use `feature_weights` in :py:meth:`__init__` or :py:meth:`set_params`
+               instead.

         """
         with config_context(verbosity=self.verbosity):
+            params = self.get_xgb_params()
+
+            model, metric, params, feature_weights = self._configure_fit(
+                xgb_model, params, feature_weights
+            )
+
             evals_result: TrainingCallback.EvalsLog = {}
             train_dmatrix, evals = _wrap_evaluation_matrices(
                 missing=self.missing,
@@ -1209,7 +1237,6 @@ def fit(
                 enable_categorical=self.enable_categorical,
                 feature_types=self.feature_types,
             )
-            params = self.get_xgb_params()

             if callable(self.objective):
                 obj: Optional[Objective] = _objective_decorator(self.objective)
@@ -1217,7 +1244,6 @@ def fit(
             else:
                 obj = None

-            model, metric, params = self._configure_fit(xgb_model, params)
             self._Booster = train(
                 params,
                 train_dmatrix,
@@ -1631,7 +1657,9 @@ def fit(
             params["objective"] = "multi:softprob"
             params["num_class"] = self.n_classes_

-        model, metric, params = self._configure_fit(xgb_model, params)
+        model, metric, params, feature_weights = self._configure_fit(
+            xgb_model, params, feature_weights
+        )
         train_dmatrix, evals = _wrap_evaluation_matrices(
             missing=self.missing,
             X=X,
@@ -2148,8 +2176,9 @@ def fit(
         evals_result: TrainingCallback.EvalsLog = {}
         params = self.get_xgb_params()

-        model, metric, params = self._configure_fit(xgb_model, params)
-
+        model, metric, params, feature_weights = self._configure_fit(
+            xgb_model, params, feature_weights
+        )
         self._Booster = train(
             params,
             train_dmatrix,
diff --git a/python-package/xgboost/testing/shared.py b/python-package/xgboost/testing/shared.py
index 46e4feacc93d..32d5962e7c30 100644
--- a/python-package/xgboost/testing/shared.py
+++ b/python-package/xgboost/testing/shared.py
@@ -63,9 +63,13 @@ def get_feature_weights(
     """Get feature weights using the demo parser."""
     with tempfile.TemporaryDirectory() as tmpdir:
         colsample_bynode = 0.5
-        reg = model(tree_method=tree_method, colsample_bynode=colsample_bynode)
+        reg = model(
+            tree_method=tree_method,
+            colsample_bynode=colsample_bynode,
+            feature_weights=fw,
+        )

-        reg.fit(X, y, feature_weights=fw)
+        reg.fit(X, y)
         model_path = os.path.join(tmpdir, "model.json")
         reg.save_model(model_path)
         with open(model_path, "r", encoding="utf-8") as fd:

From f8ca296a5280cca8820ab018d3ae324fe719ebf8 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Thu, 20 Feb 2025 23:33:09 +0800
Subject: [PATCH 2/3] dask, spark.
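
Propagate the new `feature_weights` constructor argument through the dask and
spark interfaces: the dask estimators now route it through `_configure_fit`,
the sklearn wrapper registers it as a wrapper-specific parameter, and the
spark estimators accept it as a default and forward it when building the
DMatrix.

A minimal sketch of the intended dask usage once this series is applied; the
local cluster and random data below are illustrative only and not part of the
patch:

    import numpy as np
    from dask import array as da
    from dask.distributed import Client, LocalCluster

    import xgboost as xgb

    if __name__ == "__main__":
        with Client(LocalCluster(n_workers=2)) as client:
            X = da.random.random((1024, 4), chunks=(256, 4))
            y = da.random.random(1024, chunks=(256,))
            # The weights are estimator configuration now, not a `fit`
            # argument, so they survive `get_params`/`set_params` round trips.
            reg = xgb.dask.DaskXGBRegressor(
                colsample_bynode=0.5,
                feature_weights=np.arange(1.0, 5.0),
            )
            reg.client = client
            reg.fit(X, y)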
---
 python-package/xgboost/dask/__init__.py | 14 +++++++++++---
 python-package/xgboost/sklearn.py       |  2 +-
 python-package/xgboost/spark/core.py    |  1 +
 python-package/xgboost/spark/data.py    |  1 +
 4 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/python-package/xgboost/dask/__init__.py b/python-package/xgboost/dask/__init__.py
index 8ae7fe1fe0a2..e187d73f3f64 100644
--- a/python-package/xgboost/dask/__init__.py
+++ b/python-package/xgboost/dask/__init__.py
@@ -1639,6 +1639,10 @@ async def _fit_async(
         feature_weights: Optional[_DaskCollection],
     ) -> _DaskCollection:
         params = self.get_xgb_params()
+        model, metric, params, feature_weights = self._configure_fit(
+            xgb_model, params, feature_weights
+        )
+
         dtrain, evals = await _async_wrap_evaluation_matrices(
             client=self.client,
             device=self.device,
@@ -1665,7 +1669,6 @@ async def _fit_async(
             obj: Optional[Callable] = _objective_decorator(self.objective)
         else:
             obj = None
-        model, metric, params = self._configure_fit(xgb_model, params)
         results = await self.client.sync(
             _train_async,
             asynchronous=True,
@@ -1729,6 +1732,10 @@ async def _fit_async(
         feature_weights: Optional[_DaskCollection],
     ) -> "DaskXGBClassifier":
         params = self.get_xgb_params()
+        model, metric, params, feature_weights = self._configure_fit(
+            xgb_model, params, feature_weights
+        )
+
         dtrain, evals = await _async_wrap_evaluation_matrices(
             self.client,
             device=self.device,
@@ -1773,7 +1780,6 @@ async def _fit_async(
             obj: Optional[Callable] = _objective_decorator(self.objective)
         else:
             obj = None
-        model, metric, params = self._configure_fit(xgb_model, params)
         results = await self.client.sync(
             _train_async,
             asynchronous=True,
@@ -1953,6 +1959,9 @@ async def _fit_async(
         feature_weights: Optional[_DaskCollection],
     ) -> "DaskXGBRanker":
         params = self.get_xgb_params()
+        model, metric, params, feature_weights = self._configure_fit(
+            xgb_model, params, feature_weights
+        )
         dtrain, evals = await _async_wrap_evaluation_matrices(
             self.client,
             device=self.device,
@@ -1974,7 +1983,6 @@ async def _fit_async(
             enable_categorical=self.enable_categorical,
             feature_types=self.feature_types,
         )
-        model, metric, params = self._configure_fit(xgb_model, params)
         results = await self.client.sync(
             _train_async,
             asynchronous=True,
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 237305c58d4d..44a2eb9411b7 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -903,6 +903,7 @@ def _wrapper_params(self) -> Set[str]:
             "early_stopping_rounds",
             "callbacks",
             "feature_types",
+            "feature_weights",
         }
         return wrapper_specific

@@ -1213,7 +1214,6 @@ def fit(
         """
         with config_context(verbosity=self.verbosity):
             params = self.get_xgb_params()
-
             model, metric, params, feature_weights = self._configure_fit(
                 xgb_model, params, feature_weights
             )
diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
index df9a57ba8428..efc965e74e0f 100644
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -641,6 +641,7 @@ def __init__(self) -> None:
             repartition_random_shuffle=False,
             feature_names=None,
             feature_types=None,
+            feature_weights=None,
             arbitrary_params_dict={},
             launch_tracker_on_driver=True,
         )
diff --git a/python-package/xgboost/spark/data.py b/python-package/xgboost/spark/data.py
index 75f0e6e8ab7f..254db221cdf9 100644
--- a/python-package/xgboost/spark/data.py
+++ b/python-package/xgboost/spark/data.py
@@ -352,6 +352,7 @@ def pred_contribs(
         missing=model.missing,
         nthread=model.n_jobs,
         feature_types=model.feature_types,
+        feature_weights=model.feature_weights,
         enable_categorical=model.enable_categorical,
     )
     return model.get_booster().predict(

From e0ad91bf32c485a3af1d1ee22bbee57ab911b96f Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Fri, 21 Feb 2025 00:44:03 +0800
Subject: [PATCH 3/3] Check duplicated.

---
 python-package/xgboost/sklearn.py | 2 +-
 tests/python/test_with_sklearn.py | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 44a2eb9411b7..86f550298a35 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -1114,7 +1114,7 @@ def _duplicated(parameter: str) -> None:

         if feature_weights is not None:
             _deprecated("feature_weights")
-        if feature_weights is not None and self.feature_types is not None:
+        if feature_weights is not None and self.feature_weights is not None:
             _duplicated("feature_weights")
         feature_weights = (
             self.feature_weights
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index efca92a8464a..8842112cf2c0 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -1212,6 +1212,10 @@ def test_feature_weights(tree_method):
     assert poly_increasing[0] > 0.08
     assert poly_decreasing[0] < -0.08

+    reg = xgb.XGBRegressor(feature_weights=np.ones((kCols, )))
+    with pytest.raises(ValueError, match="Use the one in"):
+        reg.fit(X, y, feature_weights=np.ones((kCols, )))
+

 def run_boost_from_prediction_binary(tree_method, X, y, as_frame: Optional[Callable]):
     """
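
--
A short migration sketch for the single-node sklearn interface, assuming this
series is applied (the synthetic data below is illustrative, not part of the
patch):

    import numpy as np
    from sklearn.datasets import make_regression

    import xgboost as xgb

    X, y = make_regression(n_samples=256, n_features=8, random_state=0)
    fw = np.full(X.shape[1], 2.0)  # one entry per feature, all values > 0

    # Deprecated by this series: passing the weights per `fit` call.
    xgb.XGBRegressor(colsample_bynode=0.5).fit(X, y, feature_weights=fw)

    # Preferred: configure the weights on the estimator itself.
    reg = xgb.XGBRegressor(colsample_bynode=0.5, feature_weights=fw)
    reg.fit(X, y)

    # Setting both at once raises a ValueError, as exercised by the test
    # added in PATCH 3/3.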