# Extract features from MPU dataset:
# https://sites.google.com/view/mobile-phone-use-dataset
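#
# Example invocations (a sketch; the filename 'extract_features.py' is assumed, adjust to your copy):
#   python extract_features.py              # parallel run, auto-selects ~80% of the available CPUs
#   python extract_features.py -p 1         # single-core run
#   python extract_features.py -sd u000     # 'sudden death': test the extraction on participant u000 only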

import sys
import os
import time
import argparse
import shutil

import pandas as pd
# import numpy as np

from tqdm import tqdm
from datetime import datetime, timedelta

from multiprocessing import Pool, cpu_count
import traceback


##############################################################################################
# FILE PATHS
##############################################################################################

GLOBAL_PATH = "."                # The global path that holds the INPUT_DATA and OUTPUT_DATA folders. e.g. '~/Users/<user-name>/Project'.
INPUT_DATA = "mobile_phone_use"  # The MPU dataset folder
OUTPUT_DATA = "features"         # The folder that will store the extracted features per user

PARTICIPANTS_INFO_FILENAME = "pinfo.csv"  # Should be 'pinfo.csv'
GT_COLUMN = "Esm_TiredAwake"              # The ground-truth column of the Esm sensor events.


##############################################################################################
# METHODS
##############################################################################################

def extract_features(pinfo, df, ff):

    # This is the place where the feature extraction should happen (per user).
    # 'pinfo' is a dict with the participant's information.
    # 'df' is the dataframe with the participant's mobile phone use data.
    # 'ff' is an empty dataframe that will store the features. GT_COLUMN is also copied.

    # This example uses the 'Acc' sensor (Accelerometer) and extracts the
    # average acceleration of the last measurement before each Esm event.
    df['Acc_Avg'] = df['Acc_Avg'].bfill()  # back-fill so rows without a reading (e.g. the Esm rows) take the next available value
    ff['ft_last_acc'] = df['Acc_Avg']
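
    # A further sketch (not in the original example): a rolling mean over the back-filled
    # Acc_Avg column, aligned to each Esm row by index. The window size and the
    # 'ft_acc_mean_w10' feature name are illustrative assumptions; uncomment and adapt.
    # acc_rolling = df['Acc_Avg'].rolling(window=10, min_periods=1).mean()
    # ff['ft_acc_mean_w10'] = acc_rolling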


def extract_features_per_core(params):

    # unpack parameters
    pinfo, input_data_path, output_data_path = params

    try:
        # prepare paths
        input_file_path = os.path.join(input_data_path, "%s.csv" % pinfo.uuid)
        output_file_path = os.path.join(output_data_path, "%s.csv" % pinfo.uuid)

        # read data file
        df = pd.read_csv(input_file_path, low_memory=False)

        # init ff (features dataframe) and set GT
        ff = df[df.sensor_id == "Esm"][[GT_COLUMN]].copy().dropna()
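        # note: 'ff' keeps the original row index of the Esm events, so feature columns
        # assigned inside extract_features() align with it by index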

        # extract features using pinfo, from df to ff.
        extract_features(pinfo, df, ff)

        # sort columns
        sorted_columns = sort_columns(ff.columns)
        ff = ff[sorted_columns]

        # save into csv
        ff.to_csv(output_file_path, index=False)

        # Status ok
        return True

    except KeyboardInterrupt:
        return False

    except Exception:
        e = sys.exc_info()[0]
        msg = sys.exc_info()[1]
        tb = sys.exc_info()[2]
        message = "exception: %s '%s'" % (e, msg)
        tqdm.write(message)
        traceback.print_tb(tb)
        return False


def extract_all_features(pdf, input_data_path, output_data_path, nproc):

    # choose between single core vs multi-core
    if nproc <= 1:

        for _, pinfo in tqdm(pdf.iterrows(), total=len(pdf), desc='User', ncols=80):

            # pack params and extract features
            params = (pinfo, input_data_path, output_data_path)
            status = extract_features_per_core(params)

            # check for KeyboardInterrupt
            if status is False:
                raise KeyboardInterrupt

    else:
        # init pool with nproc
        pool = Pool(processes=nproc)

        # prepare parameters
        params = [(pinfo, input_data_path, output_data_path) for _, pinfo in pdf.iterrows()]

        try:
            for status in tqdm(pool.imap_unordered(extract_features_per_core, params), total=len(pdf), desc='User', ncols=80):

                # check for KeyboardInterrupt
                if status is False:
                    raise KeyboardInterrupt

            # no more tasks will be submitted
            pool.close()

        except KeyboardInterrupt:
            pool.terminate()

        finally:
            pool.join()


def sort_columns(columns):

    # sort columns by name, GT_COLUMN should be the last column
    columns = sorted(list(columns))
    # columns.insert(0, columns.pop(columns.index("u2")))  # first
    columns.append(columns.pop(columns.index(GT_COLUMN)))  # last

    return columns


def ensure_path(path, clean=False):

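    # make sure 'path' exists; when clean=True, remove any existing contents first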
    if clean and os.path.exists(path):
        shutil.rmtree(path)

    if not os.path.exists(path):
        os.makedirs(path)


def parse_arguments(args):

    parser = argparse.ArgumentParser(description="Extract Features (using 'ft_' column prefix)")
    parser.add_argument('-p', '--parallel', dest='parallel', type=int, nargs=1, metavar='nproc', default=[0],
                        help='execute in parallel, nproc=number of processors to use')
    parser.add_argument('-sd', '--sudden-death', dest='sudden_death', action='store', nargs='*', metavar='uuid',
                        help='sudden death: test the feature extraction on particular uuid(s); specify one or more uuids, or omit them to use the default from code (u000)')
    parsed = vars(parser.parse_args(args))

    return parsed


##############################################################################################
# MAIN
##############################################################################################

def main(args):
    '''main function'''

    # determine number of CPUs
    nproc = args['parallel'][0]
    if nproc <= 0:
        # automatically selects about 80% of the available CPUs
        cpus = cpu_count()
        nproc = int(cpus * 0.8 + 0.5)
    else:
        nproc = min([nproc, cpu_count()])
    print("using %d CPUs" % nproc)

    # get paths
    global_path = os.path.expanduser(GLOBAL_PATH)
    input_data_path = os.path.join(global_path, INPUT_DATA, "data")
    output_data_path = os.path.join(global_path, OUTPUT_DATA)

    # clean and ensure dir
    ensure_path(output_data_path, clean=True)

    # load pinfo.csv
    pinfo_path = os.path.join(global_path, INPUT_DATA, PARTICIPANTS_INFO_FILENAME)
    print(pinfo_path)
    if not os.path.isfile(pinfo_path):
        sys.exit("Participant's info file with name '%s' does not exist." % PARTICIPANTS_INFO_FILENAME)

    # load the csv into a dataframe
    with open(pinfo_path) as data_file:
        pdf = pd.read_csv(data_file)

    # determine sudden_death
    sudden_death = args['sudden_death']
    if sudden_death is not None:
        if len(sudden_death) == 0:
            sudden_death = ['u000']  # default user

        # apply sudden_death
        pdf = pdf[pdf.uuid.isin(sudden_death)]

    # begin feature extraction
    extract_all_features(pdf, input_data_path, output_data_path, nproc)


if __name__ == '__main__':

    try:
        # track time
        print("Started at: %s" % (datetime.now()))
        start_time = time.time()

        # parse args
        args = parse_arguments(sys.argv[1:])

        # call main
        main(args)

        # save and report elapsed time
        elapsed_time = time.time() - start_time
        print("\nSuccess! Duration: %s" % str(timedelta(seconds=int(elapsed_time))))

    except KeyboardInterrupt:
        sys.exit("Interrupted: Exiting on request.")

    except SystemExit:
        e = sys.exc_info()[0]
        msg = sys.exc_info()[1]
        tb = sys.exc_info()[2]
        message = "exception: %s '%s'" % (e, msg)
        tqdm.write(message)
        traceback.print_tb(tb)