Skip to content

Commit e82f01d

Browse files
committed
Add script
1 parent 5f47fbc commit e82f01d

File tree

2 files changed

+235
-0
lines changed

2 files changed

+235
-0
lines changed

.gitignore

+6
Original file line numberDiff line numberDiff line change
@@ -102,3 +102,9 @@ venv.bak/
102102

103103
# mypy
104104
.mypy_cache/
105+
106+
.DS_Store
107+
108+
# data folders
109+
mobile_phone_use/*
110+
features/*

extract_features.py

+229
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
# Extract features from MPU dataset:
2+
# https://sites.google.com/view/mobile-phone-use-dataset
3+
4+
import sys
5+
import os
6+
import time
7+
import argparse
8+
import shutil
9+
10+
import pandas as pd
11+
# import numpy as np
12+
13+
from tqdm import tqdm
14+
from datetime import datetime, timedelta
15+
16+
from multiprocessing import Pool, cpu_count
17+
import traceback
18+
19+
20+
##############################################################################################
# FILE PATHS
##############################################################################################

# Root folder that holds the INPUT_DATA and OUTPUT_DATA folders,
# e.g. '~/Users/<user-name>/Project'. '~' is expanded in main().
GLOBAL_PATH = "." # The global path that holds the INPUT_DATA and OUTPUT_DATA folders. e.g. '~/Users/<user-name>/Project'.
INPUT_DATA = "mobile_phone_use" # The MPU dataset folder
OUTPUT_DATA = "features" # The folder that will store the extracted features per user

PARTICIPANTS_INFO_FILENAME = "pinfo.csv" # Should be 'pinfo.csv'
GT_COLUMN = "Esm_TiredAwake" # Your Ground Truth column of the Esm sensor event.


##############################################################################################
# METHODS
##############################################################################################
35+
36+
def extract_features(pinfo, df, ff):
    """Extract per-user features from the raw mobile phone use data.

    This is the place where the feature extraction should happen (per user).

    Parameters:
        pinfo: participant information record (unused by this example feature).
        df: DataFrame with the participant's mobile phone use data.
        ff: DataFrame indexed on the Esm events, already holding GT_COLUMN;
            feature columns (prefixed 'ft_') are added to it in place.
    """
    # Example feature using the 'Acc' sensor (Accelerometer): the average
    # acceleration of the last measurement before each Esm event.
    # bfill() propagates the next valid observation backwards onto the Esm
    # rows.  BUGFIX: fillna(method='backfill', inplace=True) is deprecated
    # and removed in pandas 2.x, and the in-place chained call mutated the
    # caller's df; the assignment form is equivalent for ff and side-effect
    # free (values align on ff's index, a subset of df's).
    ff['ft_last_acc'] = df['Acc_Avg'].bfill()
47+
48+
49+
def extract_features_per_core(params):
    """Worker: run feature extraction for a single participant.

    'params' is a (pinfo, input_data_path, output_data_path) tuple so the
    function can be fed to Pool.imap_unordered.  Returns True on success and
    False on KeyboardInterrupt or any other error (errors are reported via
    tqdm.write so the progress bar is not corrupted).
    """
    pinfo, input_data_path, output_data_path = params

    try:
        # build the per-participant input/output csv paths
        input_file_path = os.path.join(input_data_path, "%s.csv" % pinfo.uuid)
        output_file_path = os.path.join(output_data_path, "%s.csv" % pinfo.uuid)

        # load the participant's raw data
        df = pd.read_csv(input_file_path, low_memory=False)

        # seed the features frame with the ground-truth column of the Esm events
        ff = df[df.sensor_id == "Esm"][[GT_COLUMN]].copy().dropna()

        # fill ff with 'ft_' features derived from df (and pinfo)
        extract_features(pinfo, df, ff)

        # reorder columns (GT_COLUMN last) and persist the result
        ff = ff[sort_columns(ff.columns)]
        ff.to_csv(output_file_path, index=False)

        return True

    except KeyboardInterrupt:
        # signal the caller to abort the whole run
        return False

    except Exception:
        # report the failure without breaking the progress bar, then keep going
        exc_type, exc_value, exc_tb = sys.exc_info()
        tqdm.write("exception: %s '%s'" % (exc_type, exc_value))
        traceback.print_tb(exc_tb)
        return False
89+
90+
91+
def extract_all_features(pdf, input_data_path, output_data_path, nproc):
    """Extract features for every participant listed in 'pdf'.

    Parameters:
        pdf: DataFrame of participants (one row per participant, with 'uuid').
        input_data_path: folder holding the per-participant raw csv files.
        output_data_path: folder that receives the per-participant feature csv.
        nproc: number of worker processes; <= 1 runs sequentially.

    Raises KeyboardInterrupt when a worker reports interruption (status False).
    """
    # choose between single core vs multi-core
    if nproc <= 1:

        for _, pinfo in tqdm(pdf.iterrows(), total=len(pdf), desc='User', ncols=80):

            # pack params and extract features
            params = (pinfo, input_data_path, output_data_path)
            status = extract_features_per_core(params)

            # the worker returns False on KeyboardInterrupt
            if status is False:
                raise KeyboardInterrupt

    else:
        # init pool with nproc workers
        pool = Pool(processes=nproc)

        # prepare one parameter tuple per participant
        params = [(pinfo, input_data_path, output_data_path) for _, pinfo in pdf.iterrows()]

        try:
            for status in tqdm(pool.imap_unordered(extract_features_per_core, params), total=len(pdf), desc='User', ncols=80):

                # the worker returns False on KeyboardInterrupt
                if status is False:
                    raise KeyboardInterrupt

            # BUGFIX: close the pool on the success path; the original leaked
            # the worker processes (close/join were never called).
            pool.close()

        except KeyboardInterrupt:
            pool.terminate()

        # reap the worker processes after close() or terminate()
        pool.join()
122+
123+
124+
def sort_columns(columns):
    """Return the column names sorted alphabetically, with GT_COLUMN forced last.

    Raises ValueError if GT_COLUMN is not among 'columns'.
    """
    ordered = sorted(columns)

    # the ground-truth column always goes at the very end
    ordered.remove(GT_COLUMN)
    ordered.append(GT_COLUMN)

    return ordered
132+
133+
134+
def ensure_path(path, clean=False):
    """Ensure that 'path' exists as a directory.

    Parameters:
        path: directory to create if missing.
        clean: when True, remove the existing directory (and all its
            contents) first, yielding an empty directory.
    """
    if clean and os.path.exists(path):
        shutil.rmtree(path)

    # exist_ok avoids the check-then-create race of the original
    # `if not os.path.exists(path): os.makedirs(path)` pattern.
    os.makedirs(path, exist_ok=True)
141+
142+
143+
def parse_arguments(args):
    """Parse the command-line arguments and return them as a plain dict.

    'args' is the argv list (without the program name), e.g. sys.argv[1:].
    """
    arg_parser = argparse.ArgumentParser(
        description="Extract Features (using 'ft_' column prefix)")
    arg_parser.add_argument(
        '-p', '--parallel', dest='parallel', type=int, nargs=1,
        metavar='nproc', default=[0],
        help='execute in parallel, nproc=number of processors to use')
    arg_parser.add_argument(
        '-sd', '--sudden-death', dest='sudden_death', action='store',
        nargs='*', metavar='uuid',
        help='sudden death: use particular uuid to test the features extraction; either specify the uuid or omit it and it reads out a default one from code (u000)')

    return vars(arg_parser.parse_args(args))
153+
154+
155+
##############################################################################################
156+
# MAIN
157+
##############################################################################################
158+
159+
def main(args):
    '''Run the end-to-end feature extraction.

    'args' is the dict produced by parse_arguments():
      - args['parallel']: [nproc]; a value <= 0 auto-selects ~80% of CPUs.
      - args['sudden_death']: None (all users), [] (default user u000),
        or a list of uuids to restrict extraction to.
    '''

    # determine number of CPUs
    nproc = args['parallel'][0]
    if nproc <= 0:
        # automatically selects about 80% of the available CPUs
        cpus = cpu_count()
        nproc = int(cpus * 0.8 + 0.5)
    else:
        # never exceed the machine's CPU count
        nproc = min([nproc, cpu_count()])
    print("using %d CPUs" % nproc)

    # build input/output paths (expands '~' in GLOBAL_PATH)
    global_path = os.path.expanduser(GLOBAL_PATH)
    input_data_path = os.path.join(global_path, INPUT_DATA, "data")
    output_data_path = os.path.join(global_path, OUTPUT_DATA)

    # clean and ensure output dir (removes any previously extracted features)
    ensure_path(output_data_path, clean=True)

    # locate pinfo.csv (participants' info); abort if it is missing
    pinfo_path = os.path.join(global_path, INPUT_DATA, PARTICIPANTS_INFO_FILENAME)
    print(pinfo_path)
    if not os.path.isfile(pinfo_path):
        sys.exit("Participant's info file with name '%s' does not exist." % PARTICIPANTS_INFO_FILENAME)

    # load the participants' info csv
    with open(pinfo_path) as data_file:
        pdf = pd.read_csv(data_file)

    # determine sudden_death (restrict the run to specific participants)
    sudden_death = args['sudden_death']
    if sudden_death is not None:
        if len(sudden_death) == 0:
            sudden_death = ['u000'] # default user

        # apply sudden_death filter
        pdf = pdf[pdf.uuid.isin(sudden_death)]

    # begin feature extraction
    extract_all_features(pdf, input_data_path, output_data_path, nproc)
201+
202+
203+
if __name__ == '__main__':

    try:
        # track time
        print("Started at: %s" % (datetime.now()))
        start_time = time.time()

        # parse args
        args = parse_arguments(sys.argv[1:])

        # call main
        main(args)

        # report elapsed time
        elapsed_time = time.time() - start_time
        print("\nSuccess! Duration: %s" % str(timedelta(seconds=int(elapsed_time))))

    except KeyboardInterrupt:
        sys.exit("Interrupted: Exiting on request.")

    except SystemExit:
        # BUGFIX: the original printed the SystemExit and then swallowed it,
        # so error paths that called sys.exit("...") terminated with exit
        # status 0.  Report the exit reason, then re-raise to preserve the
        # process exit status.
        e = sys.exc_info()[0]
        msg = sys.exc_info()[1]
        tb = sys.exc_info()[2]
        message = "exception: %s '%s'" % (e, msg)
        tqdm.write(message)
        traceback.print_tb(tb)
        raise

0 commit comments

Comments
 (0)