# main.py
import os
import re
import math
import random
import pickle
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
import soundfile as sf
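# Assumed environment (not pinned by the original script): the calls below use
# librosa.feature.rms and pd.concat, which also work on recent releases where
# the older librosa.feature.rmse and DataFrame.append no longer exist.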
# Part 1: Extract Audio Labels
def extract_audio_labels():
    info_line = re.compile(r'\[.+\]\n', re.IGNORECASE)
    start_times, end_times, wav_file_names, emotions, vals, acts, doms = \
        [], [], [], [], [], [], []
    for sess in range(1, 6):
        emo_evaluation_dir = \
            'data/IEMOCAP_full_release/Session{}/dialog/EmoEvaluation/'.format(sess)
        evaluation_files = [l for l in os.listdir(emo_evaluation_dir) if 'Ses' in l]
        for file in evaluation_files:
            with open(emo_evaluation_dir + file) as f:
                content = f.read()
            info_lines = re.findall(info_line, content)
            for line in info_lines[1:]:  # the first line is a header
                start_end_time, wav_file_name, emotion, val_act_dom = \
                    line.strip().split('\t')
                start_time, end_time = start_end_time[1:-1].split('-')
                val, act, dom = val_act_dom[1:-1].split(',')
                val, act, dom = float(val), float(act), float(dom)
                start_time, end_time = float(start_time), float(end_time)
                start_times.append(start_time)
                end_times.append(end_time)
                wav_file_names.append(wav_file_name)
                emotions.append(emotion)
                vals.append(val)
                acts.append(act)
                doms.append(dom)
    df_iemocap = pd.DataFrame(columns=['start_time', 'end_time', 'wav_file',
                                       'emotion', 'val', 'act', 'dom'])
    df_iemocap['start_time'] = start_times
    df_iemocap['end_time'] = end_times
    df_iemocap['wav_file'] = wav_file_names
    df_iemocap['emotion'] = emotions
    df_iemocap['val'] = vals
    df_iemocap['act'] = acts
    df_iemocap['dom'] = doms
    df_iemocap.to_csv('data/pre-processed/df_iemocap.csv', index=False)
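# For reference, the parsing above expects tab-separated EmoEvaluation entries
# with bracketed start/end times, the utterance name, the categorical emotion,
# and bracketed valence/activation/dominance scores. The line below is an
# illustrative example (the utterance name and values are made up):
#   [6.2901 - 8.2357]<TAB>Ses01F_impro01_F000<TAB>neu<TAB>[2.5000, 2.5000, 2.5000]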
# Part 2: Build Audio Vectors
def build_audio_vectors():
    labels_df = pd.read_csv('data/pre-processed/df_iemocap.csv')
    iemocap_dir = 'data/IEMOCAP_full_release/'
    sr = 44100
    audio_vectors = {}
    for sess in range(1, 6):  # all five sessions; restrict this range if memory is tight
        wav_file_path = '{}Session{}/dialog/wav/'.format(iemocap_dir, sess)
        orig_wav_files = os.listdir(wav_file_path)
        for orig_wav_file in tqdm(orig_wav_files):
            try:
                orig_wav_vector, _sr = librosa.load(
                    wav_file_path + orig_wav_file, sr=sr)
                orig_wav_file, file_format = orig_wav_file.split('.')
                for index, row in labels_df[labels_df['wav_file'].str.contains(
                        orig_wav_file)].iterrows():
                    start_time, end_time, truncated_wav_file_name, emotion,\
                        val, act, dom = row['start_time'], row['end_time'],\
                        row['wav_file'], row['emotion'], row['val'],\
                        row['act'], row['dom']
                    start_frame = math.floor(start_time * sr)
                    end_frame = math.floor(end_time * sr)
                    truncated_wav_vector = orig_wav_vector[start_frame:end_frame + 1]
                    audio_vectors[truncated_wav_file_name] = truncated_wav_vector
            except Exception as e:
                print('An exception occurred for {}: {}'.format(orig_wav_file, e))
        with open('data/pre-processed/audio_vectors_{}.pkl'.format(sess), 'wb') as f:
            pickle.dump(audio_vectors, f)
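# Slicing sketch (assuming sr = 44100 as above, with illustrative timestamps):
# an utterance annotated from 6.29 s to 8.24 s maps to samples
# floor(6.29 * 44100) = 277389 through floor(8.24 * 44100) = 363384, so the
# stored vector holds roughly 86k samples for a ~2 s segment.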
# Part 3: Extract Audio Features
def extract_audio_features():
    data_dir = 'data/pre-processed/'
    labels_path = '{}df_iemocap.csv'.format(data_dir)
    audio_vectors_path = '{}audio_vectors_'.format(data_dir)
    labels_df = pd.read_csv(labels_path)
    columns = ['wav_file', 'label', 'sig_mean', 'sig_std', 'rmse_mean',
               'rmse_std', 'silence', 'harmonic', 'auto_corr_max', 'auto_corr_std']
    df_features = pd.DataFrame(columns=columns)
    emotion_dict = {'ang': 0,
                    'hap': 1,
                    'exc': 2,
                    'sad': 3,
                    'fru': 4,
                    'fea': 5,
                    'sur': 6,
                    'neu': 7,
                    'xxx': 8,
                    'oth': 8}
    for sess in range(1, 6):
        audio_vectors = pickle.load(open('{}{}.pkl'.format(audio_vectors_path, sess), 'rb'))
        for index, row in tqdm(labels_df[labels_df['wav_file'].str.contains('Ses0{}'.format(sess))].iterrows()):
            try:
                wav_file_name = row['wav_file']
                label = emotion_dict[row['emotion']]
                y = audio_vectors[wav_file_name]
                feature_list = [wav_file_name, label]  # wav_file, label
                sig_mean = np.mean(abs(y))
                feature_list.append(sig_mean)  # sig_mean
                feature_list.append(np.std(y))  # sig_std
                # librosa.feature.rmse was renamed to librosa.feature.rms in librosa >= 0.8
                rmse = librosa.feature.rms(y=y + 0.0001)[0]
                feature_list.append(np.mean(rmse))  # rmse_mean
                feature_list.append(np.std(rmse))  # rmse_std
                silence = 0
                for e in rmse:
                    if e <= 0.4 * np.mean(rmse):
                        silence += 1
                silence /= float(len(rmse))
                feature_list.append(silence)  # silence
                y_harmonic = librosa.effects.hpss(y)[0]
                feature_list.append(np.mean(y_harmonic) * 1000)  # harmonic (scaled by 1000)
                # based on the pitch detection algorithm mentioned here:
                # http://access.feld.cvut.cz/view.php?cisloclanku=2009060001
                cl = 0.45 * sig_mean
                center_clipped = []
                for s in y:
                    if s >= cl:
                        center_clipped.append(s - cl)
                    elif s <= -cl:
                        center_clipped.append(s + cl)
                    elif np.abs(s) < cl:
                        center_clipped.append(0)
                auto_corrs = librosa.autocorrelate(np.array(center_clipped))
                feature_list.append(1000 * np.max(auto_corrs) / len(auto_corrs))  # auto_corr_max (scaled by 1000)
                feature_list.append(np.std(auto_corrs))  # auto_corr_std
                # DataFrame.append was removed in pandas 2.0; build the row and concat instead
                df_features = pd.concat(
                    [df_features, pd.DataFrame([feature_list], columns=columns)],
                    ignore_index=True)
            except Exception as e:
                print('Some exception occurred: {}'.format(e))
    df_features.to_csv('data/pre-processed/audio_features.csv', index=False)
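# Quick sanity check on the emitted CSV (a hypothetical snippet, not part of
# the pipeline itself):
#   feats = pd.read_csv('data/pre-processed/audio_features.csv')
#   print(feats['label'].value_counts())   # emotion class distribution
#   print(feats.describe())                # rough feature ranges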
def main():
    print('Part 1: Extract Audio Labels')
    extract_audio_labels()
    print('Part 2: Build Audio Vectors')
    build_audio_vectors()
    print('Part 3: Extract Audio Features')
    extract_audio_features()


if __name__ == '__main__':
    main()
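# Usage (assumed layout, matching the paths hard-coded above): place the
# IEMOCAP release under data/IEMOCAP_full_release/ and create
# data/pre-processed/, then run:
#   python main.py
# This writes df_iemocap.csv, audio_vectors_1.pkl ... audio_vectors_5.pkl and
# audio_features.csv into data/pre-processed/.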