
Commit 2c1fec5

Configs for xpu: decision forest classifier (#100)
* df_clsf
* newline
* disable float64
* pep8
* sqrt and log2 parse
* fix codefactor
* fix
* return ternary
* enable all devices
* enable float64
* delete extra files
* replace None with none
* column major
* data-order F back
* float scientific notation handling
* pep8
1 parent 6891388 commit 2c1fec5

File tree

6 files changed: +219 -5 lines changed

- bench.py
- configs/xpu/df_clsf.json
- datasets/load_datasets.py
- datasets/loader_classification.py
- sklearn_bench/df_clsf.py
- sklearn_bench/df_regr.py

bench.py

+11 -1

```diff
@@ -19,6 +19,7 @@
 import logging
 import sys
 import timeit
+import re
 
 import numpy as np
 import sklearn
@@ -64,8 +65,17 @@ def _parse_size(string, dim=2):
     return tup
 
 
+def is_float(string):
+    return bool(re.match(r"^[-+]?(?:\b[0-9]+(?:\.[0-9]*)?|\.[0-9]+\b)(?:[eE][-+]?[0-9]+\b)?$",
+                         string))
+
+
 def float_or_int(string):
-    return float(string) if '.' in string else int(string)
+    return int(string) if string.isdigit() else float(string)
+
+
+def float_or_int_or_str(string):
+    return int(string) if string.isdigit() else float(string) if is_float(string) else string
 
 
 def get_optimal_cache_size(n_rows, dtype=np.double, max_cache=64):
```
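For context, a quick demonstration (not part of the commit) of how the new helpers classify typical `--max-features` values: digit strings become ints, plain or scientific-notation numerics become floats, and names such as `sqrt` or `log2` pass through unchanged.

```python
import re


def is_float(string):
    # Same pattern as the commit: matches plain and scientific-notation floats.
    return bool(re.match(r"^[-+]?(?:\b[0-9]+(?:\.[0-9]*)?|\.[0-9]+\b)(?:[eE][-+]?[0-9]+\b)?$",
                         string))


def float_or_int_or_str(string):
    return int(string) if string.isdigit() else float(string) if is_float(string) else string


for value in ('10', '0.5', '1e-3', 'sqrt', 'log2'):
    print(f'{value!r} -> {float_or_int_or_str(value)!r}')
# '10' -> 10, '0.5' -> 0.5, '1e-3' -> 0.001, 'sqrt' -> 'sqrt', 'log2' -> 'log2'
```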

configs/xpu/df_clsf.json

+113 (new file)

```json
{
    "common": {
        "lib": "sklearn",
        "algorithm": "df_clsf",
        "data-format": "pandas",
        "data-order": "F",
        "dtype": ["float32", "float64"],
        "max-features": "sqrt",
        "device": ["host", "cpu", "gpu", "none"]
    },
    "cases": [
        {
            "dataset": [
                {
                    "source": "npy",
                    "name": "susy",
                    "training": {
                        "x": "data/susy_x_train.npy",
                        "y": "data/susy_y_train.npy"
                    },
                    "testing": {
                        "x": "data/susy_x_test.npy",
                        "y": "data/susy_y_test.npy"
                    }
                }
            ],
            "num-trees": 10,
            "max-depth": 5
        },
        {
            "dataset": [
                {
                    "source": "npy",
                    "name": "susy",
                    "training": {
                        "x": "data/susy_x_train.npy",
                        "y": "data/susy_y_train.npy"
                    },
                    "testing": {
                        "x": "data/susy_x_test.npy",
                        "y": "data/susy_y_test.npy"
                    }
                }
            ],
            "num-trees": 100,
            "max-depth": 8
        },
        {
            "dataset": [
                {
                    "source": "npy",
                    "name": "susy",
                    "training": {
                        "x": "data/susy_x_train.npy",
                        "y": "data/susy_y_train.npy"
                    },
                    "testing": {
                        "x": "data/susy_x_test.npy",
                        "y": "data/susy_y_test.npy"
                    }
                }
            ],
            "num-trees": 20,
            "max-depth": 16
        },
        {
            "dataset": [
                {
                    "source": "npy",
                    "name": "mnist",
                    "training": {
                        "x": "data/mnist_x_train.npy",
                        "y": "data/mnist_y_train.npy"
                    },
                    "testing": {
                        "x": "data/mnist_x_test.npy",
                        "y": "data/mnist_y_test.npy"
                    }
                }
            ],
            "num-trees": 100,
            "max-depth": 10
        },
        {
            "dataset": [
                {
                    "source": "npy",
                    "name": "hepmass_150K",
                    "training": {
                        "x": "data/hepmass_150K_x_train.npy",
                        "y": "data/hepmass_150K_y_train.npy"
                    },
                    "testing": {
                        "x": "data/hepmass_150K_x_test.npy",
                        "y": "data/hepmass_150K_y_test.npy"
                    }
                }
            ],
            "num-trees": 50,
            "max-depth": 15
        }
    ]
}
```
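The list-valued `common` parameters (`dtype`, `device`) multiply each case into several benchmark runs. A hypothetical sketch of that expansion follows; this is illustrative only, and the repository's actual runner may work differently.

```python
import itertools
import json

# Hypothetical expansion of the config above: merge "common" into each
# case, then fan out every list-valued parameter as a Cartesian product.
with open('configs/xpu/df_clsf.json') as f:
    config = json.load(f)

for case in config['cases']:
    params = {**config['common'], **case}
    datasets = params.pop('dataset')
    list_keys = sorted(k for k, v in params.items() if isinstance(v, list))
    for dataset in datasets:
        for combo in itertools.product(*(params[k] for k in list_keys)):
            run = {**params, **dict(zip(list_keys, combo))}
            print(dataset['name'], run['dtype'], run['device'],
                  run['num-trees'], run['max-depth'])
```

Under this reading, each of the five cases fans out over 2 dtypes and 4 devices, i.e. 40 runs in total.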

datasets/load_datasets.py

+4 -2

```diff
@@ -23,8 +23,8 @@
 
 from .loader_classification import (a_nine_a, airline, airline_ohe, bosch,
                                     census, codrnanorm, creditcard, epsilon, fraud,
-                                    gisette, higgs, higgs_one_m, ijcnn,
-                                    klaverjas, santander, skin_segmentation)
+                                    gisette, hepmass_150K, higgs, higgs_one_m, ijcnn,
+                                    klaverjas, santander, skin_segmentation, susy)
 from .loader_multiclass import (connect, covertype, covtype, letters, mlsr,
                                 mnist, msrank, plasticc, sensit)
 from .loader_regression import (abalone, california_housing, fried,
@@ -49,6 +49,7 @@
     "fraud": fraud,
     "fried": fried,
     "gisette": gisette,
+    "hepmass_150K": hepmass_150K,
     "higgs": higgs,
     "higgs1m": higgs_one_m,
     "ijcnn": ijcnn,
@@ -63,6 +64,7 @@
     "santander": santander,
     "sensit": sensit,
     "skin_segmentation": skin_segmentation,
+    "susy": susy,
     "twodplanes": twodplanes,
     "year_prediction_msd": year_prediction_msd,
     "yolanda": yolanda,
```

datasets/loader_classification.py

+89

```diff
@@ -446,6 +446,55 @@ def gisette(dataset_dir: Path) -> bool:
     return True
 
 
+def hepmass_150K(dataset_dir: Path) -> bool:
+    """
+    HEPMASS dataset from UCI machine learning repository (
+    https://archive.ics.uci.edu/ml/datasets/HEPMASS).
+
+    Classification task. n_classes = 2.
+    hepmass_150K X train dataset (100000, 28)
+    hepmass_150K y train dataset (100000, 1)
+    hepmass_150K X test dataset  (50000, 28)
+    hepmass_150K y test dataset  (50000, 1)
+    """
+    dataset_name = 'hepmass_150K'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url_test = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00347/all_test.csv.gz'
+    url_train = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00347/all_train.csv.gz'
+
+    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
+    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
+
+    if not os.path.isfile(local_url_test):
+        logging.info(f'Started loading {dataset_name}, test')
+        retrieve(url_test, local_url_test)
+    if not os.path.isfile(local_url_train):
+        logging.info(f'Started loading {dataset_name}, train')
+        retrieve(url_train, local_url_train)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+
+    nrows_train, nrows_test, dtype = 100000, 50000, np.float32
+    data_test: Any = pd.read_csv(local_url_test, delimiter=",",
+                                 compression="gzip", dtype=dtype,
+                                 nrows=nrows_test)
+    data_train: Any = pd.read_csv(local_url_train, delimiter=",",
+                                  compression="gzip", dtype=dtype,
+                                  nrows=nrows_train)
+
+    x_test = np.ascontiguousarray(data_test.values[:nrows_test, 1:], dtype=dtype)
+    y_test = np.ascontiguousarray(data_test.values[:nrows_test, 0], dtype=dtype)
+    x_train = np.ascontiguousarray(data_train.values[:nrows_train, 1:], dtype=dtype)
+    y_train = np.ascontiguousarray(data_train.values[:nrows_train, 0], dtype=dtype)
+
+    for data, name in zip((x_train, x_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
+
 def higgs(dataset_dir: Path) -> bool:
     """
     Higgs dataset from UCI machine learning repository
@@ -637,3 +686,43 @@ def skin_segmentation(dataset_dir: Path) -> bool:
         np.save(os.path.join(dataset_dir, filename), data)
     logging.info(f'dataset {dataset_name} is ready.')
     return True
+
+
+def susy(dataset_dir: Path) -> bool:
+    """
+    SUSY dataset from UCI machine learning repository (
+    https://archive.ics.uci.edu/ml/datasets/SUSY).
+
+    Classification task. n_classes = 2.
+    susy X train dataset (4500000, 18)
+    susy y train dataset (4500000, 1)
+    susy X test dataset  (500000, 18)
+    susy y test dataset  (500000, 1)
+    """
+    dataset_name = 'susy'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00279/SUSY.csv.gz'
+    local_url = os.path.join(dataset_dir, os.path.basename(url))
+    if not os.path.isfile(local_url):
+        logging.info(f'Started loading {dataset_name}')
+        retrieve(url, local_url)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+
+    nrows_train, nrows_test, dtype = 4500000, 500000, np.float32
+    data: Any = pd.read_csv(local_url, delimiter=",", header=None,
+                            compression="gzip", dtype=dtype,
+                            nrows=nrows_train + nrows_test)
+
+    X = data[data.columns[1:]]
+    y = data[data.columns[0:1]]
+
+    x_train, x_test, y_train, y_test = train_test_split(
+        X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False)
+
+    for data, name in zip((x_train, x_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
```

(Note: SUSY carries 18 features after the leading label column, so the X shapes in the docstring are given as (·, 18) here. The susy loader splits with shuffle=False, making the split deterministic: the first 4,500,000 rows train, the next 500,000 test.)
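Both loaders download the raw CSVs, parse them at a fixed float32 dtype, and persist four .npy files apiece. A small sanity check (not part of the commit, assuming the files were written into `data/` as the config above expects):

```python
import numpy as np

# Verify the arrays the new loaders saved; expected shapes follow the
# docstrings above (e.g. susy x_train -> (4500000, 18)).
for name in ('susy', 'hepmass_150K'):
    for part in ('x_train', 'x_test', 'y_train', 'y_test'):
        arr = np.load(f'data/{name}_{part}.npy')
        print(f'{name}_{part}: shape={arr.shape}, dtype={arr.dtype}')
```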

sklearn_bench/df_clsf.py

+1 -1

```diff
@@ -81,7 +81,7 @@ def main():
                         help='The function to measure the quality of a split')
     parser.add_argument('--num-trees', type=int, default=100,
                         help='Number of trees in the forest')
-    parser.add_argument('--max-features', type=bench.float_or_int, default=None,
+    parser.add_argument('--max-features', type=bench.float_or_int_or_str, default=None,
                         help='Upper bound on features used at each split')
     parser.add_argument('--max-depth', type=int, default=None,
                         help='Upper bound on depth of constructed trees')
```

sklearn_bench/df_regr.py

+1 -1

```diff
@@ -72,7 +72,7 @@ def main():
                         help='The function to measure the quality of a split')
     parser.add_argument('--num-trees', type=int, default=100,
                         help='Number of trees in the forest')
-    parser.add_argument('--max-features', type=bench.float_or_int, default=None,
+    parser.add_argument('--max-features', type=bench.float_or_int_or_str, default=None,
                         help='Upper bound on features used at each split')
     parser.add_argument('--max-depth', type=int, default=None,
                         help='Upper bound on depth of constructed trees')
```
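The switch to `bench.float_or_int_or_str` in both benchmarks lets `--max-features` carry the string values that scikit-learn's forests accept natively. A quick standalone check (not part of the commit) that `'sqrt'` is a valid `max_features`:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# 'sqrt' as max_features is accepted directly by scikit-learn's forests;
# previously the CLI parser could only produce ints or floats.
X, y = make_classification(n_samples=200, n_features=20, random_state=0)
clf = RandomForestClassifier(n_estimators=10, max_features='sqrt',
                             max_depth=5, random_state=0).fit(X, y)
print(clf.score(X, y))
```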
