
Commit 7e6f97a

Remove Kaggle dataset source

1 parent 4337565  commit 7e6f97a

9 files changed: +3 -90 lines changed

configs/regular/xgboost_binary.json

Lines changed: 0 additions & 19 deletions

@@ -42,25 +42,6 @@
       }
     }
   },
-  {
-    "data": {
-      "dataset": "bosch",
-      "split_kwargs": {
-        "train_size": 200000,
-        "test_size": null
-      }
-    },
-    "algorithm": {
-      "estimator_params": {
-        "learning_rate": 0.01,
-        "reg_alpha": 1.0,
-        "max_leaves": 256,
-        "colsample_bytree": 0.5,
-        "colsample_bynode": 0.5,
-        "n_estimators": 200
-      }
-    }
-  },
   {
     "data": {
       "dataset": "epsilon",

configs/weekly/pca.json

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@
   },
   {
     "data": {
-      "dataset": ["airline_depdelay", "bosch", "epsilon"],
+      "dataset": ["airline_depdelay", "epsilon"],
       "split_kwargs": { "ignore": true }
     }
   }

configs/weekly/train_test_split.json

Lines changed: 1 addition & 2 deletions

@@ -6,8 +6,7 @@
     "data": {
       "dataset": [
         "airline_depdelay",
-        "higgs",
-        "bosch"
+        "higgs"
       ]
     }
   },

envs/conda-env-rapids.yml

Lines changed: 0 additions & 1 deletion

@@ -13,7 +13,6 @@ dependencies:
   - tabulate
   - fastparquet
   - h5py
-  - kaggle
   - openpyxl
   - tqdm
   - psutil

envs/conda-env-sklearn.yml

Lines changed: 0 additions & 1 deletion

@@ -15,7 +15,6 @@ dependencies:
   - tabulate
   - fastparquet
   - h5py
-  - kaggle
   - openpyxl
   - tqdm
   - psutil

envs/requirements-sklearn.txt

Lines changed: 0 additions & 1 deletion

@@ -13,7 +13,6 @@ pandas
 tabulate
 fastparquet
 h5py
-kaggle
 openpyxl
 tqdm
 psutil

sklbench/datasets/README.md

Lines changed: 0 additions & 4 deletions

@@ -10,13 +10,9 @@ Data handling steps:
 Existing data sources:
 - Synthetic data from sklearn
 - OpenML datasets
-- Kaggle competition datasets
 - Custom loaders for named datasets
 - User-provided datasets in compatible format

-Kaggle API keys and competition rules acceptance are required for next dataset:
-- [Bosch Production Line Performance (`bosch`)](https://www.kaggle.com/c/bosch-production-line-performance/overview)
-
 ## Data Caching

 There are two levels of caching with corresponding directories: `raw cache` for files downloaded from external sources, and just `cache` for files applicable for fast-loading in benchmarks.

sklbench/datasets/downloaders.py

Lines changed: 0 additions & 33 deletions

@@ -23,13 +23,6 @@
 from scipy.sparse import csr_matrix
 from sklearn.datasets import fetch_openml

-try:
-    import kaggle
-
-    kaggle_is_imported = True
-except (ImportError, OSError, ValueError):
-    kaggle_is_imported = False
-

 def retrieve(url: str, filename: str) -> None:
     if os.path.isfile(filename):
@@ -95,29 +88,3 @@ def download_and_read_csv(url: str, raw_data_cache_dir: str, **reading_kwargs):
     retrieve(url, local_path)
     data = pd.read_csv(local_path, **reading_kwargs)
     return data
-
-
-def download_kaggle_files(
-    kaggle_type: str, kaggle_name: str, filenames: List[str], raw_data_cache_dir: str
-):
-    if not kaggle_is_imported:
-        raise ValueError(
-            "Kaggle API is not available. Please, check if 'kaggle' package and Kaggle API key are installed."
-        )
-    api = kaggle.KaggleApi()
-    api.authenticate()
-
-    if kaggle_type == "competition":
-        download_method = api.competition_download_file
-    elif kaggle_type == "dataset":
-        download_method = api.dataset_download_file
-    else:
-        raise ValueError(
-            f"Unknown {kaggle_type} type for " '"download_kaggle_files" function.'
-        )
-
-    output_file_paths = {}
-    for filename in filenames:
-        download_method(kaggle_name, filename, raw_data_cache_dir)
-        output_file_paths[filename] = os.path.join(raw_data_cache_dir, filename)
-    return output_file_paths
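With download_kaggle_files gone, downloaders.py keeps only the plain-HTTP path: retrieve() fetches a URL into the raw-data cache and download_and_read_csv() reads the result with pandas. A minimal usage sketch, assuming only the signature shown in the hunk header above (the URL and cache directory are placeholders, not taken from the repository):

    # Sketch only: placeholder URL and cache directory, not part of the commit.
    from sklbench.datasets.downloaders import download_and_read_csv

    # Fetches the file into the raw-data cache (retrieve() first checks whether the
    # file already exists there) and forwards extra kwargs to pandas.read_csv.
    df = download_and_read_csv(
        "https://example.com/some_dataset.csv",
        "data_cache/raw",
        dtype="float32",
    )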

sklbench/datasets/loaders.py

Lines changed: 1 addition & 28 deletions

@@ -32,12 +32,7 @@
 )

 from .common import cache, load_data_description, load_data_from_cache, preprocess
-from .downloaders import (
-    download_and_read_csv,
-    download_kaggle_files,
-    load_openml,
-    retrieve,
-)
+from .downloaders import download_and_read_csv, load_openml, retrieve


 @preprocess
@@ -175,27 +170,6 @@ def load_airline_depdelay(
     return {"x": x, "y": y}, data_description


-@cache
-def load_bosch(
-    data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict
-) -> Tuple[Dict, Dict]:
-    data_filename = "train_numeric.csv.zip"
-
-    data_path = download_kaggle_files(
-        "competition",
-        "bosch-production-line-performance",
-        [data_filename],
-        raw_data_cache,
-    )[data_filename]
-
-    data = pd.read_csv(data_path, index_col=0, compression="zip", dtype=np.float32)
-    y = data.iloc[:, -1].to_numpy(dtype=np.float32)
-    x = data.drop(labels=[data.columns[-1]], axis=1)
-
-    data_desc = {"default_split": {"test_size": 0.2, "random_state": 77}}
-    return {"x": x, "y": y}, data_desc
-
-
 @cache
 def load_hepmass(
     data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict
@@ -833,7 +807,6 @@ def load_gist(
     # classification
     "airline_depdelay": load_airline_depdelay,
     "a9a": load_a9a,
-    "bosch": load_bosch,
     "codrnanorm": load_codrnanorm,
     "covtype": load_covtype,
     "creditcard": load_creditcard,
