
Commit 7e6f97a

Remove Kaggle dataset source

1 parent 4337565  commit 7e6f97a

9 files changed: +3 -90 lines changed

configs/regular/xgboost_binary.json

Lines changed: 0 additions & 19 deletions

@@ -42,25 +42,6 @@
       }
     }
   },
-  {
-    "data": {
-      "dataset": "bosch",
-      "split_kwargs": {
-        "train_size": 200000,
-        "test_size": null
-      }
-    },
-    "algorithm": {
-      "estimator_params": {
-        "learning_rate": 0.01,
-        "reg_alpha": 1.0,
-        "max_leaves": 256,
-        "colsample_bytree": 0.5,
-        "colsample_bynode": 0.5,
-        "n_estimators": 200
-      }
-    }
-  },
   {
     "data": {
       "dataset": "epsilon",

configs/weekly/pca.json

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@
   },
   {
     "data": {
-      "dataset": ["airline_depdelay", "bosch", "epsilon"],
+      "dataset": ["airline_depdelay", "epsilon"],
       "split_kwargs": { "ignore": true }
     }
   }

configs/weekly/train_test_split.json

Lines changed: 1 addition & 2 deletions

@@ -6,8 +6,7 @@
     "data": {
       "dataset": [
         "airline_depdelay",
-        "higgs",
-        "bosch"
+        "higgs"
       ]
     }
   },

envs/conda-env-rapids.yml

Lines changed: 0 additions & 1 deletion

@@ -13,7 +13,6 @@ dependencies:
   - tabulate
   - fastparquet
   - h5py
-  - kaggle
   - openpyxl
   - tqdm
   - psutil

envs/conda-env-sklearn.yml

Lines changed: 0 additions & 1 deletion

@@ -15,7 +15,6 @@ dependencies:
   - tabulate
   - fastparquet
   - h5py
-  - kaggle
   - openpyxl
   - tqdm
   - psutil

envs/requirements-sklearn.txt

Lines changed: 0 additions & 1 deletion

@@ -13,7 +13,6 @@ pandas
 tabulate
 fastparquet
 h5py
-kaggle
 openpyxl
 tqdm
 psutil

sklbench/datasets/README.md

Lines changed: 0 additions & 4 deletions

@@ -10,13 +10,9 @@ Data handling steps:
 Existing data sources:
 - Synthetic data from sklearn
 - OpenML datasets
-- Kaggle competition datasets
 - Custom loaders for named datasets
 - User-provided datasets in compatible format

-Kaggle API keys and competition rules acceptance are required for next dataset:
-- [Bosch Production Line Performance (`bosch`)](https://www.kaggle.com/c/bosch-production-line-performance/overview)
-
 ## Data Caching

 There are two levels of caching with corresponding directories: `raw cache` for files downloaded from external sources, and just `cache` for files applicable for fast-loading in benchmarks.

sklbench/datasets/downloaders.py

Lines changed: 0 additions & 33 deletions

@@ -23,13 +23,6 @@
 from scipy.sparse import csr_matrix
 from sklearn.datasets import fetch_openml

-try:
-    import kaggle
-
-    kaggle_is_imported = True
-except (ImportError, OSError, ValueError):
-    kaggle_is_imported = False
-

 def retrieve(url: str, filename: str) -> None:
     if os.path.isfile(filename):
@@ -95,29 +88,3 @@ def download_and_read_csv(url: str, raw_data_cache_dir: str, **reading_kwargs):
     retrieve(url, local_path)
     data = pd.read_csv(local_path, **reading_kwargs)
     return data
-
-
-def download_kaggle_files(
-    kaggle_type: str, kaggle_name: str, filenames: List[str], raw_data_cache_dir: str
-):
-    if not kaggle_is_imported:
-        raise ValueError(
-            "Kaggle API is not available. Please, check if 'kaggle' package and Kaggle API key are installed."
-        )
-    api = kaggle.KaggleApi()
-    api.authenticate()
-
-    if kaggle_type == "competition":
-        download_method = api.competition_download_file
-    elif kaggle_type == "dataset":
-        download_method = api.dataset_download_file
-    else:
-        raise ValueError(
-            f"Unknown {kaggle_type} type for " '"download_kaggle_files" function.'
-        )
-
-    output_file_paths = {}
-    for filename in filenames:
-        download_method(kaggle_name, filename, raw_data_cache_dir)
-        output_file_paths[filename] = os.path.join(raw_data_cache_dir, filename)
-    return output_file_paths
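With download_kaggle_files gone, downloaders.py keeps only the plain-HTTP path: retrieve() fetches a URL into the raw-data cache and download_and_read_csv() reads the result with pandas. A minimal usage sketch, assuming only the signature shown in the hunk header above (the URL and cache directory are placeholders, not taken from the repository):

    # Sketch only: placeholder URL and cache directory, not part of the commit.
    from sklbench.datasets.downloaders import download_and_read_csv

    # Fetches the file into the raw-data cache (retrieve() first checks whether the
    # file already exists there) and forwards extra kwargs to pandas.read_csv.
    df = download_and_read_csv(
        "https://example.com/some_dataset.csv",
        "data_cache/raw",
        dtype="float32",
    )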

sklbench/datasets/loaders.py

Lines changed: 1 addition & 28 deletions

@@ -32,12 +32,7 @@
 )

 from .common import cache, load_data_description, load_data_from_cache, preprocess
-from .downloaders import (
-    download_and_read_csv,
-    download_kaggle_files,
-    load_openml,
-    retrieve,
-)
+from .downloaders import download_and_read_csv, load_openml, retrieve


 @preprocess
@@ -175,27 +170,6 @@ def load_airline_depdelay(
     return {"x": x, "y": y}, data_description


-@cache
-def load_bosch(
-    data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict
-) -> Tuple[Dict, Dict]:
-    data_filename = "train_numeric.csv.zip"
-
-    data_path = download_kaggle_files(
-        "competition",
-        "bosch-production-line-performance",
-        [data_filename],
-        raw_data_cache,
-    )[data_filename]
-
-    data = pd.read_csv(data_path, index_col=0, compression="zip", dtype=np.float32)
-    y = data.iloc[:, -1].to_numpy(dtype=np.float32)
-    x = data.drop(labels=[data.columns[-1]], axis=1)
-
-    data_desc = {"default_split": {"test_size": 0.2, "random_state": 77}}
-    return {"x": x, "y": y}, data_desc
-
-
 @cache
 def load_hepmass(
     data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict
@@ -833,7 +807,6 @@ def load_gist(
     # classification
     "airline_depdelay": load_airline_depdelay,
     "a9a": load_a9a,
-    "bosch": load_bosch,
     "codrnanorm": load_codrnanorm,
     "covtype": load_covtype,
     "creditcard": load_creditcard,
