# courseload-analytics-LAK/utils.py (CAHLR/courseload-analytics-LAK, main branch)
"""
Collection of objects and functions used across multiple scripts.
"""
import ast
import inspect
import itertools
import json
import math
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import random
import re
import sys
import tensorflow as tf
import textwrap
import time
import warnings
import xgboost as xgb

from datetime import timedelta
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasRegressor
from scipy.stats import pearsonr, mode
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.linear_model import ElasticNet, ElasticNetCV, LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, roc_auc_score, f1_score, accuracy_score, mean_absolute_error, log_loss
from sklearn.model_selection import cross_val_score, RepeatedKFold, KFold
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.svm import SVR
from tqdm import tqdm

# C2V: course2vec embedding matrix plus the course-ID -> row-number lookup.
# Both files are read once at import time; the paths are relative to the
# current working directory.
COURSE2VEC = np.load('../LAK_paper_data/course2vec.npy')
# Use a context manager so the JSON file handle is closed right after parsing
# instead of being left open until garbage collection.
with open('../LAK_paper_data/course2idx.json') as _c2v_idx_file:
    COURSE2VEC_idx = json.load(_c2v_idx_file)
# Mean of COURSE2VEC along axis 0.
AVG_C2V = COURSE2VEC.mean(axis=0)

# get course vector for a given course ID (cid)
def get_c2v(cid: str) -> "np.ndarray | float":
    """
    Return the course2vec row for a course ID, or ``np.nan`` if it is unknown.

    Three spellings of the ID are tried in order against ``COURSE2VEC_idx``
    and the first hit wins:

    1. ``cid`` exactly as given;
    2. ``cid`` with its *last* space replaced by an underscore
       (e.g. ``'Computer Science 61A'`` -> ``'Computer Science_61A'``);
    3. ``ABBR_CID2CID[cid]``.

    Parameters
    ----------
    cid : str
        Course ID to look up.

    Returns
    -------
    The matching row of ``COURSE2VEC``, or ``np.nan`` when none of the
    three spellings is found.
    """
    try:
        # minus 1 because the stored index starts from 1 rather than 0
        idx = COURSE2VEC_idx[cid] - 1
    except KeyError:
        try:
            # rsplit/join swaps only the last space for an underscore
            idx = COURSE2VEC_idx['_'.join(cid.rsplit(' ', 1))] - 1
        except KeyError:
            try:
                idx = COURSE2VEC_idx[ABBR_CID2CID[cid]] - 1
            # Only a failed lookup means "unknown course". The previous bare
            # `except:` also swallowed KeyboardInterrupt/SystemExit.
            # NOTE(review): ABBR_CID2CID is not defined in the visible part of
            # this module; NameError is still caught so the earlier
            # best-effort behaviour is kept - confirm where it is defined.
            except (KeyError, NameError):
                return np.nan
    return COURSE2VEC[idx]

# Survey answers to numeric
def answers_to_numeric(row, col: str) -> float:
    """
    Translate a verbatim survey scale response into its numeric score
    (e.g., 'Sometimes' becomes 3.0).

    The same scores are shared by several answer scales (hours per week,
    frequency, manageability, amount, importance). Each course load type
    (time load, mental effort, psychological stress) comes with a control
    question on how manageable the load was; those control ratings are either
    subtracted from the main rating (high stress with high manageability is
    low actual stress, high stress with low manageability is high actual
    stress) or passed to the predictive model as control variables.

    Parameters
    ----------
    row : mapping-like
        Object indexable by ``col``.
    col : str
        Key of the survey item to convert.

    Returns
    -------
    float
        Score between 1.0 and 6.0.

    Raises
    ------
    ColumnError
        If ``row[col]`` is not one of the known answer strings.
    """
    response = row[col]

    # (score, accepted answer strings), checked from lowest to highest score
    scale = (
        (1.0, ('0-5 hours per week', 'Nearly never', 'Nearly always unmanageable',
               'A very low amount', 'Not important at all')),
        (2.0, ('6-10 hours per week', 'Seldom', 'Mostly unmanageable',
               'A low amount', 'Slightly important')),
        (3.0, ('11-15 hours per week', 'Sometimes', 'Sometimes manageable',
               'A moderate amount', 'Moderately important')),
        (4.0, ('16-20 hours per week', 'Frequently', 'Mostly manageable',
               'A high amount', 'Important')),
        (5.0, ('21-25 hours per week', 'Nearly always', 'Nearly always manageable',
               'A very high amount', 'Very important')),
        (6.0, ('26+ hours per week',)),
    )
    for score, labels in scale:
        if response in labels:
            return score

    # NOTE(review): ColumnError is not defined in the visible part of this
    # module - confirm it is defined or imported elsewhere, otherwise this
    # line surfaces as a NameError.
    raise ColumnError('No relevant column values detected, please check which column you passed as argument.')

# STEM mapping of majors and courses at the department level

# Department name -> STEM flag.
# True marks the department as STEM and False as non-STEM. np.nan is used for
# entries that carry no single label: '-', 'FPF-Letters & Science',
# 'Grad Division Other Programs' and 'L&S Undeclared'.
d_dept_stem = {
'-': np.nan,
'African American Studies': False,
'Ag & Env Chem Grad Grp': True,
'Ag & Resource Econ & Pol': True,
'Anc Hist Med Arc Grad Grp': False,
'Ancient Greek & Roman Studies': False,
'Anthropology': False,
'Applied Sci & Tech Grad Grp': True,
'Architecture': True,
'Art Practice': False,
'Asian Studies Grad Grp': False,
'Astronomy': True,
'Bioengineering': True,
'Bioengineering-UCSF Grad Grp': True,
'Biophysics Grad Grp': True,
'Biostatistics Grad Grp': True,
'Buddhist Studies Grad Grp': False,
'Business': False,
'Chem & Biomolecular Eng': True,
'Chemistry': True,
'City & Regional Planning': True,
'Civil & Environmental Eng': True,
'Classics': False,
'College Writing Programs': False,
'Comparative Biochem Grad Grp': True,
'Comparative Literature': False,
'Computational Biology Grad Grp': True,
'Critical Theory Grad Grp': False,
'Data Science': True,
'Demography': False,
'Design Innovation': False,
'Development Eng Grad Grp': True,
'Development Practice Grad Grp': False,
'Earth & Planetary Science': True,
'East Asian Lang & Culture': False,
'Economics': True,
'Education': False,
'Electrical Eng & Computer Sci': True,
'Endocrinology Grad Grp': True,
'Energy & Resources Grad Grp': True,
'Engineering Joint Programs': True,
'Engineering Science': True,
'English': False,
'Env Sci, Policy, & Mgmt': True,
'Environmental Health Sci GG': True,
'Epidemiology Grad Grp': True,
'Ethnic Studies': False,
'European Studies Grad Grp': False,
'FPF-African American Studies': False,
'FPF-Anc Greek & Roman Studies': False,
'FPF-Anthropology': False,
'FPF-Art Practice': False,
'FPF-Astronomy': True,
'FPF-Chemistry': True,
'FPF-Classics': False,
'FPF-College Writing Program': False,
'FPF-Comparative Literature': False,
'FPF-Earth & Planetary Science': True,
'FPF-English': False,
'FPF-Env Sci, Policy, & Mgmt': True,
'FPF-Ethnic Studies': False,
'FPF-Film & Media': False,
'FPF-Gender & Womens Studies': False,
'FPF-Geography': False,
'FPF-History': False,
'FPF-History of Art': False,
'FPF-IAS Teaching Program': False,
'FPF-Integrative Biology': True,
'FPF-Interdisc Social Sci Pgms': False,
'FPF-Legal Studies': False,
'FPF-Letters & Science': np.nan, # comprises multiple departments
'FPF-Linguistics': False,
'FPF-Mathematics': True,
'FPF-Molecular & Cell Biology': True,
'FPF-Music': False,
'FPF-Philosophy': False,
'FPF-Political Science': False,
'FPF-Psychology': True,
'FPF-Rhetoric': False,
'FPF-Sociology': False,
'FPF-South & SE Asian Studies': False,
'FPF-Statistics': True,
'FPF-UG Interdisciplinary Stds': False,
'Film and Media': False,
'Folklore Grad Grp': False,
'French': False,
'Gender & Womens Studies': False,
'Geography': False,
'German': False,
'Global Metro Std Grad Grp': False,
'Global Studies Grad Grp': False,
'Grad Division Other Programs': np.nan,
'Health & Medical Sci Grad Grp': True,
'Health Policy GG': False,
'History': False,
'History of Art': False,
'Industrial Eng & Ops Research': True,
'Infectious Diseases & Immun GG': True,
'Information': True,
'Integrative Biology': True,
'Interdisc Social Science Pgms': False,
'Interdisciplinary Doctoral Pgm': False,
'Italian Studies': False,
'JSP Graduate Program': False,
'Jewish Studies Program': False,
'Journalism': False,
'L&S Chemistry': True,
'L&S Computer Science': True,
'L&S Data Science': True,
'L&S Envir Econ & Policy': True,
'L&S Legal Studies': False,
'L&S Ops Rsch & Mgmt Sci': True,
'L&S Public Health': True,
'L&S Social Welfare': False,
'L&S Undeclared': np.nan,
'Landscape Arch & Env Plan': True,
'Latin American Studies GG': False,
'Law': False,
'Linguistics': False,
'Logic and Method of Science GG': False,
'Materials Science & Eng': True,
'Mathematics': True,
'Mechanical Engineering': True,
'Medieval Studies Program': False,
'Metabolic Biology Grad Grp': True,
'Microbiology Grad Grp': True,
'Middle Eastern Lang & Cultures': False,
'Military Affairs Program': False,
'Molecular & Cell Biology': True,
'Molecular Toxicology Grad Grp': True,
'Music': False,
'Nano Sci & Eng Grad Grp': True,
'Near Eastern Religions GG': False,
'Near Eastern Studies': False,
'Neuroscience Graduate Program': True,
'New Media Grad Grp': False,
'Nuclear Engineering': True,
'Nutritional Sciences & Tox': True,
'Optometry': True,
'Other Arts & Humanities Pgms': False,
'Other Bio Sciences Pgms': True,
'Other Clg of Natural Res Pgms': True,
'Other EVCP Programs': False,
'Other Env Design Programs': True,
'Other Math & Physical Sci Pgms': True,
'Other Social Sciences Programs': False,
'Performance Studies Grad Grp': False,
'Philosophy': False,
'Physical Education': False,
'Physics': True,
'Plant & Microbial Biology': True,
'Political Science': False,
'Psychology': True,
'Public Health': True,
'Public Policy': False,
'Rangeland & Wildlife Mgmt GG': False,
'Rhetoric': False,
'Romance Lang & Lit Grad Pgm': False,
'Scandinavian': False,
'Sci & Tech Stds Grad Grp': True,
'Science & Math Educ Grad Grp': True,
'Slavic Languages & Literatures': False,
'Social Welfare': False,
'Sociology': False,
'Sociology and Demography GG': False,
'South & SE Asian Studies': False,
'Spanish & Portuguese': False,
'Statistics': True,
'Study of Religion Grad Grp': False,
'Theater Dance & Perf Stds': False,
'UC Education Abroad Program': False,
'UCBX-Concurrent Enrollment Dpt': False,
'UG Interdisciplinary Studies': False,
'Urban Design Grad Grp': False,
'Vision Science Grad Grp': True
}

# Hand-coded mapping of courses and majors from survey data
# Based on https://www.ice.gov/sites/default/files/documents/stem-list.pdf
# Course (as named in the survey data) -> True if coded as STEM, else False.
# Inline comments record why a subject was coded the way it was; "the list"
# refers to the STEM designated degree program list linked above.
# Every key appears exactly once: a repeated key in a dict literal is silently
# overwritten by the later entry, which hides copy/paste mistakes.
d_stem_courses = {
"American Studies 101": False,
"American Studies C172": False,
"Anthropology 115": False,
"Anthropology 121AC": False,
"Anthropology 141": False,
"Anthropology 160AC": False,
"Anthropology 3AC": False,
# Architecture: the list only includes Naval Architecture and Marine Engineering.
"Architecture 11B": False,
"Architecture 170B": False,
"Architecture 198BC": False,
"Asian Am & Asn Diaspora Stds 121": False,
"Asian Am & Asn Diaspora Stds 132AC": False,
"Asian Am & Asn Diaspora Stds 171": False,
"Asian Am & Asn Diaspora Stds 20A": False,
"Astronomy 84": True,
"Astronomy C12": True,
"Bioengineering 100": True,
"Bioengineering 104": True,
"Bioengineering 11": True,
"Bioengineering 110": True,
"Bioengineering 153": True,
"Bioengineering 25": True,
"Bioengineering 98": True,
"Biology 1A": True,
"Biology 1AL": True,
"Biology 1B": True,
# Business: the list includes business, but only business statistics and not
# administration.
"Business Admin-Undergrad 10": False,
"Business Admin-Undergrad 102B": False,
"Business Admin-Undergrad 103": False,
"Business Admin-Undergrad 105": False,
"Business Admin-Undergrad 106": False,
"Business Admin-Undergrad 131": False,
"Business Admin-Undergrad 135": False,
"Business Admin-Undergrad 141": False,
"Business Admin-Undergrad 147": False,
"Business Admin-Undergrad 169": False,
"Business Admin-Undergrad 192T": False,
"Business Admin-Undergrad 194": False,
"Business Admin-Undergrad 198": False,
"Celtic Studies R1B": False,
"Chemical Engineering 141": True,
"Chemical Engineering 150A": True,
"Chemical Engineering 98": True,
"Chemistry 12B": True,
"Chemistry 1A": True,
"Chemistry 1AL": True,
"Chemistry 1B": True,
"Chemistry 3A": True,
"Chemistry 3AL": True,
"Chemistry 3B": True,
"Chemistry 3BL": True,
"Chemistry 98": True,
"Chinese 10Y": False,
"Chinese 1A": False,
"Civil & Environmental Eng 105": True,
"Civil & Environmental Eng 107": True,
"Civil & Environmental Eng 11": True,
"Civil & Environmental Eng 113": True,
"Civil & Environmental Eng 123": True,
"Civil & Environmental Eng 155": True,
"Civil & Environmental Eng 166": True,
"Civil & Environmental Eng 175": True,
"Civil & Environmental Eng 198": True,
"Civil & Environmental Eng 199": True,
"Civil & Environmental Eng C88": True,
"Classics 10B": False,
"Classics 130E": False,
"Classics 28": False,
"Cognitive Science 1": True,
"Cognitive Science 131": True,
"Cognitive Science 190": True,
"College Writing Programs R1A": False,
"College Writing Programs R4B": False,
"Computer Science 10": True,
"Computer Science 161": True,
"Computer Science 162": True,
"Computer Science 170": True,
"Computer Science 188": True,
"Computer Science 189": True,
"Computer Science 194": True,
"Computer Science 195": True,
"Computer Science 197": True,
"Computer Science 198": True,
"Computer Science 370": True,
"Computer Science 47B": True,
"Computer Science 61A": True,
"Computer Science 61B": True,
"Computer Science 61C": True,
"Computer Science 70": True,
"Computer Science 88": True,
"Computer Science W182": True,
"Computer Science W186": True,
# Data science is usually listed as 11.0401 Information Science/Studies.
"Data Science, Undergraduate 198": True,
"Data Science, Undergraduate C100": True,
"Data Science, Undergraduate C104": True,
"Data Science, Undergraduate C8": True,
"Demography C175": False,
"Design Innovation 10": False,
"Design Innovation 15": False,
"Design Innovation 198": False,
"Design Innovation 98": False,
"Dutch 171AC": False,
"Earth & Planetary Science C12": True,
# Economics: the list only includes econometrics/quantitative economics.
"Economics 1": False,
"Economics 100A": False,
"Economics 100B": False,
"Economics 101A": False,
"Economics 115": False,
"Economics 157": False,
"Economics 172": False,
"Education 130": False,
"Education 197": False,
"Electrical Eng & Computer Sci 126": True,
"Electrical Eng & Computer Sci 127": True,
"Electrical Eng & Computer Sci 16A": True,
"Electrical Eng & Computer Sci 16B": True,
"Energy and Resources 98": True,
"Engineering 125": True,
"Engineering 26": True,
"Engineering 29": True,
"English 110": False,
"English 170": False,
"English 24": False,
"English 43B": False,
"English 45C": False,
"English R1B": False,
# Environmental Science is listed.
"Env Sci, Policy, & Mgmt 114": True,
"Env Sci, Policy, & Mgmt 131": True,
"Env Sci, Policy, & Mgmt 152": True,
"Env Sci, Policy, & Mgmt 40": True,
"Env Sci, Policy, & Mgmt 50AC": True,
"Env Sci, Policy, & Mgmt 98": True,
"Env Sci, Policy, & Mgmt 98BC": True,
"Env Sci, Policy, & Mgmt C167": True,
"Environ Econ & Policy C101": False,
"Environmental Design 100": False,
"Ethnic Studies 101A": False,
"Ethnic Studies 190": False,
"Ethnic Studies 197": False,
"Film 171": False,
"Film R1B": False,
"French 1": False,
"French 2": False,
"Gender & Womens Studies 100AC": False,
"Gender & Womens Studies 139": False,
# Geography: the list only includes Geographic Information Science and Cartography.
"Geography 130": False,
"Geography 70AC": False,
"Global Poverty & Practice 105": False,
"Global Studies 110Q": False,
"Global Studies 173": False,
"Global Studies C10A": False,
"History 100M": False,
"History 109C": False,
"History 160": False,
"History 190": False,
"History 6B": False,
"History C139C": False,
"History R1B": False,
"History of Art 190F": False,
"Industrial Eng & Ops Rsch 135": True,
"Industrial Eng & Ops Rsch 162": True,
"Industrial Eng & Ops Rsch 165": True,
"Industrial Eng & Ops Rsch 166": True,
"Industrial Eng & Ops Rsch 170": True,
"Industrial Eng & Ops Rsch 173": True,
"Industrial Eng & Ops Rsch 185": True,
"Industrial Eng & Ops Rsch 186": True,
"Industrial Eng & Ops Rsch 190E": True,
"Industrial Eng & Ops Rsch 195": True,
"Industrial Eng & Ops Rsch 221": True,
"Industrial Eng & Ops Rsch 95": True,
"Information C265": True,  # iSchool course on interface design
"Integrative Biology 169": True,
"Integrative Biology 192": True,
"Integrative Biology 198": True,
"Integrative Biology 77B": True,
"Integrative Biology 84": True,
"Integrative Biology 98": True,
"Integrative Biology 98BC": True,
"Integrative Biology C32": True,
"Interdisciplinary Studies 100J": False,  # "The Social Life of Computing", historical and ethnographic methods
"Italian Studies R5B": False,
"Korean 10B": False,
"Korean 112": False,
"LGBT Studies 145": False,
"Landscape Arch & Env Plan 1": False,  # the list only includes Naval Architecture and Marine Engineering
"Latin 100": False,
"Letters & Science 22": False,  # interdisciplinary studies
"Letters & Science 25": False,  # interdisciplinary studies
# Linguistics: the list only includes Cognitive Psychology and Psycholinguistics.
"Linguistics 100": False,
"Linguistics 115": False,
"Linguistics 47": False,
"Linguistics C105": False,
"Materials Science & Eng 45": True,
"Mathematics 104": True,
"Mathematics 10B": True,
"Mathematics 110": True,
"Mathematics 124": True,
"Mathematics 128A": True,
"Mathematics 152": True,
"Mathematics 160": True,
"Mathematics 16B": True,
"Mathematics 1A": True,
"Mathematics 1B": True,
"Mathematics 53": True,
"Mathematics 54": True,
"Mathematics 55": True,
"Mathematics 98": True,
"Mathematics 98BC": True,
"Mechanical Engineering 104": True,
"Mechanical Engineering 40": True,
"Mechanical Engineering C85": True,
"Media Studies 111": False,
"Media Studies 113": False,
"Military Affairs 180": False,  # must be applied military technology to be STEM
"Molecular & Cell Biology 100B": True,
"Molecular & Cell Biology 102": True,
"Molecular & Cell Biology 140": True,
"Molecular & Cell Biology 140L": True,
"Molecular & Cell Biology 198": True,
"Molecular & Cell Biology 199": True,
"Molecular & Cell Biology 38": True,
"Molecular & Cell Biology 50": True,
"Molecular & Cell Biology 90E": True,
"Molecular & Cell Biology C61": True,
"Molecular & Cell Biology C95B": True,
"Music 128": False,
"Music 159": False,
"Music 168B": False,
"Music 168C": False,
"Music 168CS": False,
"Music 170": False,
"Music 20A": False,
"Music 25": False,
"Music 27": False,
"Music 45M": False,
"Music 52A": False,
"Music 52B": False,
"Music 53A": False,
"Music 53B": False,
"Music 80": False,
"Music R1B": False,
"Near Eastern Studies 10": False,
"Near Eastern Studies 18": False,
"Nuclear Engineering 155": True,
"Nuclear Engineering 162": True,
"Nutritional Science & Tox 10S": True,
"Nutritional Science & Tox 11": True,
"Nutritional Science & Tox 160": True,
"Nutritional Science & Tox 170": True,
"Nutritional Science & Tox 190": True,
"Nutritional Science & Tox 198": True,
"Nutritional Science & Tox 20": True,
"Philosophy 104": False,
"Philosophy 121": False,
"Philosophy 12A": False,
"Philosophy 135": False,
"Philosophy 161": False,
"Philosophy 25B": False,
"Philosophy 3": False,
"Physical Education 1": False,
"Physics 112": True,
"Physics 137A": True,
"Physics 137B": True,
"Physics 7A": True,
"Physics 7B": True,
"Physics 8A": True,
"Physics 8B": True,
"Physics C21": True,
"Plant & Microbial Biology 122": True,
"Plant & Microbial Biology 40": True,
"Plant & Microbial Biology C112L": True,
"Political Science 103": False,
"Political Science 111AC": False,
"Political Science 112C": False,
"Political Science 146A": False,
"Political Science 148A": False,
"Political Science 149E": False,
"Political Science 149P": False,
"Political Science 179": False,
"Political Science 197": False,
"Political Science 2": False,
"Psychology 1": True,
"Psychology 110": True,
"Psychology 114": True,
"Psychology 130": True,
"Psychology 135": True,
"Psychology 160": True,
"Psychology 167AC": True,
"Psychology 198": True,
"Psychology 290B": True,
"Psychology C116": True,
"Psychology W1": True,
# Public Health: the list only includes Veterinary Preventive Medicine,
# Epidemiology, and Public Health and Health Engineering.
"Public Health 126": False,
"Public Health 142": False,
"Public Health 150E": False,
"Public Health 198": False,
"Public Health W250B": False,
"Public Policy 101": False,
"Public Policy 157": False,
"Public Policy 192AC": False,
"Public Policy 198": False,
"Public Policy C103": False,
"Rhetoric R1B": False,
"Scandinavian 106": False,
"Slavic Languages & Lit R5B": False,
"Social Welfare 112": False,
"Social Welfare 114": False,
"Sociology 1": False,
"Sociology 127": False,
"Sociology 140": False,
"Sociology 167": False,
"Sociology 198": False,
"Sociology 3AC": False,
"Southeast Asian 148": False,
"Southeast Asian R5B": False,
"Spanish 131": False,
"Spanish 135": False,
"Statistics 133": True,
"Statistics 134": True,
"Statistics 135": True,
"Statistics 150": True,
"Statistics 20": True,
"Statistics 33B": True,
"Statistics 88": True,
"Statistics 89A": True,
"Statistics C131A": True,
"Statistics C140": True,
"Theater Dance & Perf Stds 111": False,
"Theater Dance & Perf Stds 172": False,
"Theater Dance & Perf Stds 52AC": False,
"Theater Dance & Perf Stds R1B": False,
"UGIS-UG Interdisc Studies 192A": False,
"UGIS-UG Interdisc Studies 192B": False,
"UGIS-UG Interdisc Studies 192D": False,
"UGIS-UG Interdisc Studies 192E": False,
"UGIS-UG Interdisc Studies C122": False,
}

# Major (as named in the survey data) -> True if coded as STEM, else False.
# 'Letters & Sci Undeclared' has no label and is stored as np.nan.
# np.nan (lower case) is used because the np.NaN alias was removed in
# NumPy 2.0; this also matches the spelling used in d_dept_stem.
d_stem_majors = {
"Anthropology": False,
"Applied Mathematics": True,
"Architecture": False,
"Bioengineering": True,
"Business Administration": False,
"Chemical Engineering": True,
"Chemistry": True,
"Civil & Environmental Eng": True,
"Civil Engineering": True,
"Cognitive Science": True,
"Computer Science": True,
"Economics": False,
"Electrical Eng & Comp Sci": True,
"Engineering Physics": True,
"English": False,
"Environmental Sciences": True,
"Gender & Womens Studies": False,
"Global Studies": False,
"Industrial Eng & Ops Rsch": True,
"Integrative Biology": True,
"L&S Computer Science": True,
"L&S Data Science": True,
"L&S Public Health": False,
"L&S Social Welfare": False,
"Letters & Sci Undeclared": np.nan,
"Linguistics": False,
"MCB-Biochem & Mol Biol": True,
"MCB-Cell & Dev Biology": True,
"MCB-Genetics": True,
"MCB-Neurobiology": True,
"Mathematics": True,
"Mechanical Engineering": True,
"Media Studies": False,
"Microbial Biology": True,
"Molecular & Cell Biology": True,
"Molecular Environ Biology": True,
"Music": False,
"Nut Sci-Physio & Metabol": True,
"Nutritional Sci-Dietetics": True,
"Nutritional Sci-Toxicology": True,
"Nutritional Science": True,
"Physics": True,
"Political Economy": False,
"Political Science": False,
"Psychology": True,
"Public Health": False,
"Sociology": False,
"Statistics": True,
}

# LMS Functions

# Semester start and end dates
# Reference:
# https://registrar.ANON-UNIVERSITY.DOMAIN/calendar/
# '<year> <term>' -> (start, end) of the semester as timestamp strings.
# The start is midnight of the first day and the end is the last millisecond
# of the last day, so only the two dates are spelled out per term.
semester_frames = {
    term: (first_day + ' 00:00:00.000', last_day + ' 23:59:59.999')
    for term, first_day, last_day in (
        ('2017 Spring', '2017-01-10', '2017-05-12'),
        ('2017 Fall', '2017-08-15', '2017-12-15'),
        ('2018 Spring', '2018-01-09', '2018-05-11'),
        ('2018 Fall', '2018-08-15', '2018-12-14'),
        ('2019 Spring', '2019-01-15', '2019-05-17'),
        ('2019 Fall', '2019-08-21', '2019-12-20'),
        ('2020 Spring', '2020-01-14', '2020-05-15'),
        ('2020 Fall', '2020-08-19', '2020-12-18'),
        ('2021 Spring', '2021-01-12', '2021-05-14'),
    )
}

# Preprocessing

def _lms_apply_rows(frame, func):
    """Apply ``func`` to each row of ``frame`` and return the results as a Series.

    ``DataFrame.apply(axis=1)`` on a frame without rows does not yield a
    per-row Series that can be assigned to a single column, so empty frames
    short-circuit to an empty Series instead.
    """
    if frame.shape[0] == 0:
        return pd.Series(index=frame.index, dtype='object')
    return frame.apply(func, axis=1)


def _lms_student_dropout_status(row):
    """Return -1 for non-students, 0 for active/completed, 1 for deleted, else NaN."""
    if row['user_role'] != 'Student':
        return -1
    if row['enrollment_state'] in ['active', 'completed']:
        return 0
    if row['enrollment_state'] == 'deleted':
        return 1
    return np.nan


def _lms_dropout_timestamp(row):
    """Return the last enrollment update for dropout rows, NaN for everyone else."""
    if row['is_student_dropout'] != 1:
        return np.nan
    return row['enrollment_updated_at']


def _lms_latest_override(overrides, column):
    """Return the most recently updated non-null ``column`` value per assignment."""
    latest = overrides.sort_values(by=['updated_at'], ascending=False)
    # .copy() so the id normalisation below never writes into a view of the input
    latest = latest.loc[latest[column].notnull(), ['assignment_id', column]].copy()
    latest['assignment_id'] = latest['assignment_id'].fillna(0).astype(int)
    return latest.drop_duplicates(subset='assignment_id', keep='first')


def _lms_prefer_override(frame, original_col, override_col):
    """Pick the override value where present, else the original; parse as datetime."""
    picked = _lms_apply_rows(
        frame[[original_col, override_col]],
        lambda row: row[original_col] if pd.isnull(row[override_col]) else row[override_col]
    )
    return pd.to_datetime(picked, errors='coerce')


def lms_preproc(d: dict, semester_start='2021-01-18 00:00:00.000', semester_end='2021-05-13 23:59:59.999'):
    """Enrich the raw LMS tables in ``d`` with roles, dropout flags and cleaned dates.

    Columns read from the input tables:

    * ``enrollments``: course_id, user_id, enrollment_role_type,
      enrollment_updated_at, enrollment_state
    * ``discussion_entry``, ``submissions``: course_id, user_id
      (``submissions`` also assignment_id)
    * ``submission_comments``: course_id, author_id
    * ``assignments``: assignment_id, course_id, asn_due_at, asn_unlock_at,
      workflow_state
    * ``assignments_overrides``: assignment_id, updated_at, due_at, unlock_at
    * ``course_section``: canvas_course_global_id, course_subject_name_number,
      section_num

    Parameters
    ----------
    d : dict
        Mapping of table name to DataFrame. The dict is updated in place
        (entries are replaced by new frames) and also returned.
    semester_start, semester_end
        Semester bounds; parsed with ``pd.to_datetime(..., errors='coerce')``.

    Returns
    -------
    dict
        ``d`` with these changes:

        * ``discussion_entry``, ``submissions``, ``submission_comments`` gain
          user_role, enrollment_updated_at, enrollment_state,
          is_student_dropout and dropout_at;
        * ``assignments`` gains due_at_correct and unlock_at_updated, keeps
          published rows only, keeps only assignments with a student
          submission (when any submissions exist) and drops rows due before
          the semester start;
        * new keys ``assignments_with_due``, ``assignments_with_due-unlock``,
          ``temp_dropout_reference`` and ``temp_teaching_staff_reference``;
        * every table above plus ``enrollments`` gains course_name_number and
          section_num.
    """
    join_keys = ['course_id', 'user_id']
    semester_start = pd.to_datetime(semester_start, errors='coerce')
    semester_end = pd.to_datetime(semester_end, errors='coerce')

    # --- One role per (course, user) -------------------------------------
    roles = d['enrollments'][['course_id', 'user_id', 'enrollment_role_type']]
    roles = roles[roles['enrollment_role_type'].isin(
        ['StudentEnrollment', 'TeacherEnrollment', 'TaEnrollment'])].copy()
    roles['enrollment_role_type'] = roles['enrollment_role_type'].str.replace(
        'Enrollment', '', regex=False)  # make string cleaner

    # Users can hold several roles in one course (teachers may enrol as
    # students for testing). Descending order puts 'Teacher' before 'Ta'
    # before 'Student', so keeping the first row prefers the staff role.
    roles = roles.sort_values(by='enrollment_role_type', ascending=False)
    roles = roles.drop_duplicates(subset=join_keys, keep='first')
    user_role_reference_table = roles.rename(columns={'enrollment_role_type': 'user_role'})

    # --- Most recent enrollment state per (course, user) -----------------
    enrollment_status = d['enrollments'][
        ['course_id', 'user_id', 'enrollment_updated_at', 'enrollment_state']]
    enrollment_status = enrollment_status.sort_values(
        by=['enrollment_updated_at', 'course_id', 'user_id'], ascending=False)
    enrollment_status = enrollment_status.drop_duplicates(subset=join_keys, keep='first')

    # Comments name the user 'author_id' and may contain missing ids; align
    # the column name and dtype with the other tables before joining.
    comments = d['submission_comments'].rename(columns={'author_id': 'user_id'})
    comments['user_id'] = comments['user_id'].fillna(0).astype(int)
    d['submission_comments'] = comments

    # --- Role, enrollment state and dropout info on each activity table --
    # is_student_dropout: -1 non-student, 0 active/completed, 1 deleted, NaN otherwise
    # dropout_at: last enrollment update for dropouts, NaN otherwise
    # Tables without rows (submission data is missing for some semesters)
    # get empty columns instead of relying on a swallowed exception.
    for key in ('discussion_entry', 'submissions', 'submission_comments'):
        table = d[key].merge(user_role_reference_table, on=join_keys, how='left')
        table = table.merge(enrollment_status, on=join_keys, how='left')
        table['is_student_dropout'] = _lms_apply_rows(table, _lms_student_dropout_status)
        table['dropout_at'] = _lms_apply_rows(table, _lms_dropout_timestamp)
        d[key] = table

    # --- Assignments: due and unlock dates -------------------------------
    overrides = d['assignments_overrides']
    assignments = pd.merge(d['assignments'], _lms_latest_override(overrides, 'due_at'),
                           on='assignment_id', how='left')
    assignments['due_at_correct'] = _lms_prefer_override(assignments, 'asn_due_at', 'due_at')

    assignments = pd.merge(assignments, _lms_latest_override(overrides, 'unlock_at'),
                           on='assignment_id', how='left')
    assignments['unlock_at_updated'] = _lms_prefer_override(assignments, 'asn_unlock_at', 'unlock_at')

    assignments = assignments[assignments['workflow_state'] == 'published']

    # Submission data misses for some semesters; only restrict assignments
    # to those with student submissions when submissions exist at all.
    submissions = d['submissions']
    if submissions.shape[0] > 0:
        by_students = submissions[
            (submissions['user_role'] == 'Student') &
            (submissions['assignment_id'].isin(assignments['assignment_id']))]
        assignment_ids_with_submissions = set(by_students['assignment_id'])
        assignments = assignments[assignments['assignment_id'].isin(assignment_ids_with_submissions)]

    # Drop assignments that were due before the semester started; rows
    # without a due date are kept.
    due = assignments['due_at_correct']
    assignments = assignments.loc[~(due.notna() & (due < semester_start))]

    d['assignments'] = assignments
    d['assignments_with_due'] = assignments[assignments['due_at_correct'].notna()]
    d['assignments_with_due-unlock'] = d['assignments_with_due'][
        d['assignments_with_due']['unlock_at_updated'].notna()]

    # --- Course name / section number on every table ---------------------
    canvas_courses = d['course_section'][
        ['canvas_course_global_id', 'course_subject_name_number', 'section_num']
    ].rename(columns={
        'canvas_course_global_id': 'course_id',
        'course_subject_name_number': 'course_name_number',
    })
    for key in ('submissions', 'submission_comments', 'assignments', 'assignments_with_due',
                'assignments_with_due-unlock', 'discussion_entry', 'enrollments'):
        d[key] = d[key].merge(canvas_courses, on='course_id', how='left')

    # --- Most recent enrollment per (course name, section, user) ---------
    section_keys = ['course_name_number', 'section_num', 'user_id']
    latest = d['enrollments'][
        section_keys + ['enrollment_updated_at', 'enrollment_state', 'enrollment_role_type']]
    latest = latest.sort_values(by=['enrollment_updated_at'] + section_keys, ascending=False)
    latest = latest.drop_duplicates(subset=section_keys, keep='first')

    # --- Student dropout reference, split into semester quarters ---------
    students = latest[latest['enrollment_role_type'] == 'StudentEnrollment'].copy()
    students['enrollment_updated_at'] = pd.to_datetime(
        students['enrollment_updated_at'], errors='coerce')
    # Students whose last update lies outside the semester are removed
    # rather than having the semester start/end imputed.
    updated = students['enrollment_updated_at']
    students = students[(updated >= semester_start) & (updated <= semester_end)].copy()

    semester_quarter_limits = pd.date_range(semester_start, semester_end, periods=5)
    updated = students['enrollment_updated_at']
    is_deleted = students['enrollment_state'] == 'deleted'
    # NOTE: both bounds are inclusive, so a timestamp that falls exactly on
    # an interior limit is flagged in two adjacent quarters.
    for quarter in range(1, 5):
        lower = semester_quarter_limits[quarter - 1]
        upper = semester_quarter_limits[quarter]
        students['dropped_out_q' + str(quarter)] = is_deleted & (lower <= updated) & (updated <= upper)

    d['temp_dropout_reference'] = students

    # --- Teaching staff reference ----------------------------------------
    d['temp_teaching_staff_reference'] = latest[
        latest['enrollment_role_type'].isin(['TeacherEnrollment', 'TaEnrollment'])]

    return d


# Features

def get_assignment_spread(df, name1: str, section1: str, section2: list) -> float:
    """Return the standard deviation of assignment due dates, in nanoseconds.

    Rows are selected where ``course_name_number`` equals ``name1`` and
    ``section_num`` is ``section1`` or one of ``section2``.

    Returns 0 when fewer than two rows match, because a sample standard
    deviation needs at least two data points.

    The spread is computed on offsets from the earliest due date rather than
    on ``astype(int)`` epoch values. That avoids the platform-dependent width
    of ``int`` and any dependence on the datetime resolution of the column,
    while the result stays in nanoseconds (standard deviation is unaffected
    by shifting all values by a constant).
    """
    temp = df[(df.course_name_number == name1) & (df.section_num.isin([section1] + section2))]
    if temp.shape[0] < 2:  # standard deviation requires 2+ data points
        return 0
    due = temp['due_at_correct']
    offsets_seconds = (due - due.min()).dt.total_seconds()
    return offsets_seconds.std() * 1e9

# parallel assignments
## Current approach which adds a time period in front of assignment deadlines and counts pair-wise overlap ##

# For all courses
# 1. Create a timeframe for each assignment from (deadline - grace period) to the deadline
# 2. Count number of timeframes that overlap in each course

def check_overlap(tuple1, tuple2, i, j):
    """Return the sorted index pair if two timeframes overlap, else False.

    ``tuple1`` and ``tuple2`` are indexed as (start, end); ``i`` and ``j``
    are their positions. A timeframe is never reported as overlapping itself
    (``i == j``), and frames that merely touch do not count because both
    comparisons are strict.
    """
    # The overlap test is symmetric, so it does not matter which frame
    # starts first. Sorting the indices lets callers collapse (i, j) and
    # (j, i) into a single entry.
    if i != j and tuple1[0] < tuple2[1] and tuple2[0] < tuple1[1]:
        return tuple(sorted((i, j)))
    return False

def get_parallel_assingments(df, name1: str, section1: str, section2: list, grace_period_days: int) -> int:
    """Count pairs of assignments whose work windows overlap.

    Each matching assignment gets a window from
    ``due_at_correct - grace_period_days`` to ``due_at_correct``. Two windows
    overlap when each starts strictly before the other ends (the same strict
    test as ``check_overlap``), so windows that merely touch do not count.

    Rows are selected where ``course_name_number`` equals ``name1`` and
    ``section_num`` is ``section1`` or one of ``section2``.

    Returns ``np.nan`` when no row matches, otherwise the number of
    unordered overlapping pairs.
    """
    from itertools import combinations  # local import keeps the block self-contained

    temp = df[(df.course_name_number == name1) & (df.section_num.isin([section1] + section2))]
    if temp.shape[0] == 0:
        return np.nan

    # Build the windows from plain Series instead of writing helper columns
    # onto the filtered slice of the caller's frame.
    due = temp['due_at_correct']
    frames = list(zip(due - timedelta(days=grace_period_days), due))

    # combinations() visits every unordered pair exactly once, so there is
    # no self-comparison and no mirrored duplicate to filter out afterwards.
    return sum(
        1
        for (start1, end1), (start2, end2) in combinations(frames, 2)
        if start1 < end2 and start2 < end1
    )

# Flexible approach based on graded (3 days) or not graded (1 day)

def timeframe_conditions(row):
    """Return the start of an assignment's work window.

    The window begins three days before ``row['due_at_correct']`` when
    ``row['grading_type']`` is one of the graded types, and one day before
    it otherwise.
    """
    graded_types = ('points', 'percent', 'letter_grade', 'gpa_scale')
    lead_days = 3 if row['grading_type'] in graded_types else 1
    return row['due_at_correct'] - timedelta(days=lead_days)

def get_parallel_assingments_flexible(df, name1: str, section1: str, section2: list) -> int:
    """Count overlapping assignment windows, with longer windows for graded work.

    Each matching assignment gets a window ending at ``due_at_correct``. The
    window starts three days earlier for graded assignments
    (``grading_type`` in points / percent / letter_grade / gpa_scale) and one
    day earlier otherwise, the same rule ``timeframe_conditions`` applies per
    row. Two windows overlap when each starts strictly before the other ends.

    Rows are selected where ``course_name_number`` equals ``name1`` and
    ``section_num`` is ``section1`` or one of ``section2``.

    Returns ``np.nan`` when no row matches, otherwise the number of
    unordered overlapping pairs.
    """
    from itertools import combinations  # local import keeps the block self-contained

    temp = df[(df.course_name_number == name1) & (df.section_num.isin([section1] + section2))]
    if temp.shape[0] == 0:
        return np.nan

    # Compute the window starts vectorised and without writing helper
    # columns onto the filtered slice of the caller's frame.
    due = temp['due_at_correct']
    graded = temp['grading_type'].isin(['points', 'percent', 'letter_grade', 'gpa_scale'])
    window = pd.to_timedelta(graded.map({True: 3, False: 1}), unit='D')
    frames = list(zip(due - window, due))

    # combinations() visits every unordered pair exactly once, so there is
    # no self-comparison and no mirrored duplicate to filter out afterwards.
    return sum(
        1
        for (start1, end1), (start2, end2) in combinations(frames, 2)
        if start1 < end2 and start2 < end1
    )

def get_n_course_assignments(df, name1: str, section1: str, section2: list, graded_only=False) -> int:
    """Count the assignments of a course across the given sections.

    Rows are selected where ``course_name_number`` equals ``name1`` and
    ``section_num`` is ``section1`` or one of ``section2``. With
    ``graded_only=True`` only rows whose ``grading_type`` is points, percent,
    letter_grade or gpa_scale are counted. Returns 0 when nothing matches.
    """
    sections = [section1] + section2
    selected = df.loc[(df.course_name_number == name1) & df.section_num.isin(sections)]
    if graded_only:
        is_graded = selected['grading_type'].isin(['points', 'percent', 'letter_grade', 'gpa_scale'])
        selected = selected.loc[is_graded]
    return selected.shape[0]

def get_graded_assignments_week(df, week_start_dates, week_end_dates, name1: str, section1: str, section2: list,
                                metric='average'):
    """
    Summarise how many graded assignments are due per calendar week.

    Pass 'average' as metric to get the average number of graded assignments
    due per week, or 'max' to get the largest number of graded assignments
    due in any single week.

    Rows are selected where ``course_name_number`` equals ``name1`` and
    ``section_num`` is ``section1`` or one of ``section2``; only rows whose
    ``grading_type`` is points, percent, letter_grade or gpa_scale count.
    Weeks are formed by pairing ``week_start_dates`` with ``week_end_dates``;
    an assignment belongs to a week when its ``due_at_correct`` lies strictly
    between that week's start and end.

    Returns ``np.nan`` when no weeks are supplied.

    Raises
    ------
    ValueError
        If ``metric`` is neither 'average' nor 'max'.
    """
    # Validate up front so a bad metric fails before any work is done.
    if metric not in ('average', 'max'):
        raise ValueError('Please parse either "average" or "max" to argument "metric"')

    temp = df[(df.course_name_number == name1) & (df.section_num.isin([section1] + section2))]
    temp = temp[temp['grading_type'].isin(['points', 'percent', 'letter_grade', 'gpa_scale'])]
    due = temp['due_at_correct']

    assignments_due_per_week_list = [
        int(((week_start < due) & (due < week_end)).sum())  # due this week only
        for week_start, week_end in zip(week_start_dates, week_end_dates)
    ]

    if not assignments_due_per_week_list:
        return np.nan  # no weeks: neither an average nor a maximum is defined
    if metric == 'average':
        return sum(assignments_due_per_week_list) / len(assignments_due_per_week_list)
    return max(assignments_due_per_week_list)


def get_avg_submission_time_to_deadline_minutes(df, name1: str, section1: str, section2: list, dropout_status=0) -> float:
    """Return the mean time between submission and deadline, in minutes.

    ``df`` is indexed by table name: ``df['assignments_with_due']`` and
    ``df['submissions']`` are each restricted to rows where
    ``course_name_number`` equals ``name1`` and ``section_num`` is
    ``section1`` or one of ``section2``. Only submissions with
    ``user_role == 'Student'`` and ``is_student_dropout == dropout_status``
    are joined to the assignments on ``assignment_id``.

    The result is the mean of ``due_at_correct - submitted_at``; positive
    values mean the submission came before the deadline. Assignments without
    a matching submission contribute no value to the mean. Returns 0 when
    either table has no rows for the course.
    """
    sections = [section1] + section2
    assignments = df['assignments_with_due']
    course_assignments = assignments[
        (assignments.course_name_number == name1) & (assignments['section_num'].isin(sections))]
    submissions = df['submissions']
    course_submissions = submissions[
        (submissions.course_name_number == name1) & (submissions['section_num'].isin(sections))]

    if course_assignments.shape[0] == 0 or course_submissions.shape[0] == 0:
        return 0

    wanted = (course_submissions.user_role == 'Student') & \
             (course_submissions.is_student_dropout == dropout_status)
    student_submissions = course_submissions.loc[wanted, ['assignment_id', 'submitted_at']]

    # Left join keeps every assignment; ones without a submission end up
    # with a missing submitted_at and therefore a missing difference.
    merged = course_assignments[['course_id', 'assignment_id', 'due_at_correct']].merge(
        student_submissions, on='assignment_id', how='left')
    merged['submitted_at'] = pd.to_datetime(merged['submitted_at'])

    lead_time = merged['due_at_correct'] - merged['submitted_at']
    return lead_time.dt.total_seconds().mean() / 60


def get_early_assignment_availability_ratio(df, semester_start_plus_two_weeks,
                                            name1: str, section1: str, section2: list) -> float:
    """Return the share of assignments unlocked by the given cutoff.

    Rows are selected where ``course_name_number`` equals ``name1`` and
    ``section_num`` is ``section1`` or one of ``section2``. The result is the
    number of rows whose ``unlock_at_updated`` is on or before
    ``semester_start_plus_two_weeks`` divided by the number of selected rows,
    or ``np.nan`` when no row matches.
    """
    sections = [section1] + section2
    selected = df.loc[(df.course_name_number == name1) & df.section_num.isin(sections)]
    n_rows = selected.shape[0]
    if n_rows == 0:
        return np.nan
    unlocked_early = selected['unlock_at_updated'] <= semester_start_plus_two_weeks
    return unlocked_early.sum() / n_rows