@@ -446,6 +446,55 @@ def gisette(dataset_dir: Path) -> bool:
446
446
return True
447
447
448
448
449
def hepmass_150K(dataset_dir: Path) -> bool:
    """
    HEPMASS dataset from UCI machine learning repository (
    https://archive.ics.uci.edu/ml/datasets/HEPMASS).

    Classification task. n_classes = 2.
    hepmass_150K X train dataset (100000, 28)
    hepmass_150K y train dataset (100000, 1)
    hepmass_150K X test dataset (50000, 28)
    hepmass_150K y test dataset (50000, 1)

    Parameters
    ----------
    dataset_dir : Path
        Directory where the raw ``.csv.gz`` archives are cached and the
        parsed ``.npy`` files are written. Created if it does not exist.

    Returns
    -------
    bool
        True once all four ``.npy`` files have been saved.
    """
    dataset_name = 'hepmass_150K'
    os.makedirs(dataset_dir, exist_ok=True)

    url_test = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00347/all_test.csv.gz'
    url_train = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00347/all_train.csv.gz'

    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))

    # Download only when the archive is not already cached locally.
    if not os.path.isfile(local_url_test):
        logging.info(f'Started loading {dataset_name}, test')
        retrieve(url_test, local_url_test)
    if not os.path.isfile(local_url_train):
        logging.info(f'Started loading {dataset_name}, train')
        retrieve(url_train, local_url_train)
    logging.info(f'{dataset_name} is loaded, started parsing...')

    nrows_train, nrows_test, dtype = 100000, 50000, np.float32
    data_test: Any = pd.read_csv(local_url_test, delimiter=",",
                                 compression="gzip", dtype=dtype,
                                 nrows=nrows_test)
    data_train: Any = pd.read_csv(local_url_train, delimiter=",",
                                  compression="gzip", dtype=dtype,
                                  nrows=nrows_train)

    # Materialize each frame's ndarray once: every `.values` access
    # rebuilds the array, so accessing it twice per frame (as the label
    # and feature slices otherwise would) doubles that conversion cost.
    values_test = data_test.values
    values_train = data_train.values

    # Column 0 is the class label; the remaining columns are features.
    x_test = np.ascontiguousarray(values_test[:nrows_test, 1:], dtype=dtype)
    y_test = np.ascontiguousarray(values_test[:nrows_test, 0], dtype=dtype)
    x_train = np.ascontiguousarray(values_train[:nrows_train, 1:], dtype=dtype)
    y_train = np.ascontiguousarray(values_train[:nrows_train, 0], dtype=dtype)

    for data, name in zip((x_train, x_test, y_train, y_test),
                          ('x_train', 'x_test', 'y_train', 'y_test')):
        filename = f'{dataset_name}_{name}.npy'
        np.save(os.path.join(dataset_dir, filename), data)
    logging.info(f'dataset {dataset_name} is ready.')
    return True
+
449
498
def higgs (dataset_dir : Path ) -> bool :
450
499
"""
451
500
Higgs dataset from UCI machine learning repository
@@ -637,3 +686,43 @@ def skin_segmentation(dataset_dir: Path) -> bool:
637
686
np .save (os .path .join (dataset_dir , filename ), data )
638
687
logging .info (f'dataset { dataset_name } is ready.' )
639
688
return True
689
+
690
+
691
def susy(dataset_dir: Path) -> bool:
    """
    SUSY dataset from UCI machine learning repository (
    https://archive.ics.uci.edu/ml/datasets/SUSY).

    Classification task. n_classes = 2.
    susy X train dataset (4500000, 28)
    susy y train dataset (4500000, 1)
    susy X test dataset (500000, 28)
    susy y test dataset (500000, 1)
    """
    dataset_name = 'susy'
    os.makedirs(dataset_dir, exist_ok=True)

    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00279/SUSY.csv.gz'
    local_url = os.path.join(dataset_dir, os.path.basename(url))

    # Fetch the archive only if a cached copy is not present.
    if not os.path.isfile(local_url):
        logging.info(f'Started loading {dataset_name}')
        retrieve(url, local_url)
    logging.info(f'{dataset_name} is loaded, started parsing...')

    nrows_train, nrows_test, dtype = 4500000, 500000, np.float32
    frame: Any = pd.read_csv(local_url, delimiter=",", header=None,
                             compression="gzip", dtype=dtype,
                             nrows=nrows_train + nrows_test)

    # The first column carries the binary label; all others are features.
    features = frame[frame.columns[1:]]
    labels = frame[frame.columns[0:1]]

    # shuffle=False keeps the original row order, so this is a
    # deterministic head/tail split of the loaded rows.
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, train_size=nrows_train, test_size=nrows_test,
        shuffle=False)

    splits = {'x_train': x_train, 'x_test': x_test,
              'y_train': y_train, 'y_test': y_test}
    for name, split in splits.items():
        np.save(os.path.join(dataset_dir, f'{dataset_name}_{name}.npy'), split)
    logging.info(f'dataset {dataset_name} is ready.')
    return True
0 commit comments