-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
81 lines (62 loc) · 2.34 KB
/
utils.py
File metadata and controls
81 lines (62 loc) · 2.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import pandas as pd
from numpy import sqrt, abs, round
from scipy.stats import norm
from scipy.stats import t as t_dist
import matplotlib.pyplot as plt
import seaborn as sns
def TwoSampleZTest(X1, X2, sigma1, sigma2, N1, N2):
    """
    Return the two-sided p-value of a two-sample Z-test from summary statistics.

    Parameters
    ----------
    X1, X2 : means of the two samples
    sigma1, sigma2 : standard deviations of the two samples
    N1, N2 : number of observations in each sample

    Returns
    -------
    Two-sided p-value for the difference between the two means.
    """
    # standard error of the difference between the two means
    ovr_sigma = sqrt(sigma1 ** 2 / N1 + sigma2 ** 2 / N2)
    z = (X1 - X2) / ovr_sigma
    # Use the survival function rather than 1 - cdf: for large |z| the cdf
    # rounds to exactly 1.0 and the subtraction would collapse the p-value to 0.
    p_val = 2 * norm.sf(abs(z))
    return p_val
def TwoSampleTTest(X1, X2, sd1, sd2, n1, n2):
    """
    Return the two-sided p-value of a two-sample T-test from summary statistics.

    Parameters
    ----------
    X1, X2 : means of the two samples
    sd1, sd2 : standard deviations of the two samples
    n1, n2 : number of observations in each sample

    Returns
    -------
    Two-sided p-value for the difference between the two means, evaluated on
    a t-distribution with ``n1 + n2 - 2`` degrees of freedom.
    """
    # standard error of the difference, built from the two separate variances
    ovr_sd = sqrt(sd1 ** 2 / n1 + sd2 ** 2 / n2)
    t = (X1 - X2) / ovr_sd
    # NOTE(review): the standard error above is the unpooled (unequal-variance)
    # form, while n1 + n2 - 2 is the pooled-variance degrees of freedom.
    # Confirm which test is intended before relying on small-sample results.
    df = n1 + n2 - 2
    # Use the survival function rather than 1 - cdf: for large |t| the cdf
    # rounds to exactly 1.0 and the subtraction would collapse the p-value to 0.
    p_val = 2 * t_dist.sf(abs(t), df)
    return p_val
def Bivariate_plot(data, cont, cat, category):
    """
    Plot a continuous column against one level of a categorical column and
    annotate the figure with two-sample hypothesis-test p-values.

    The rows of ``data`` are split into two samples: rows where ``data[cat]``
    equals ``category`` and all remaining rows. The means of ``cont`` in the
    two samples are compared with TwoSampleTTest and TwoSampleZTest, and a new
    matplotlib figure with three panels is drawn: a bar plot of the two means,
    KDE curves of the two samples, and a box plot of ``cont`` for every level
    of ``cat``.

    Parameters
    ----------
    data : pandas DataFrame holding both columns
    cont : name of the continuous column
    cat : name of the categorical column
    category : value of ``cat`` that defines the first sample

    Returns
    -------
    None. The figure is created via ``plt.figure`` and is not shown or saved
    here.
    """
    # creating 2 samples: rows in the category vs. every other row
    in_category = data[cat] == category
    x1 = data.loc[in_category, cont]
    x2 = data.loc[~in_category, cont]

    # calculating descriptives
    n1, n2 = x1.shape[0], x2.shape[0]
    m1, m2 = x1.mean(), x2.mean()
    # both spreads must be standard deviations; they feed the standard error
    std1, std2 = x1.std(), x2.std()

    # calculating p-values
    t_p_val = TwoSampleTTest(m1, m2, std1, std2, n1, n2)
    z_p_val = TwoSampleZTest(m1, m2, std1, std2, n1, n2)

    # table of per-level means, shown in the bar plot title;
    # the string aggregator avoids depending on a numpy module alias
    table = pd.pivot_table(data=data, values=cont, columns=cat, aggfunc='mean')

    # the same pair of labels is used by the bar plot and the KDE legend
    in_label = str(category)
    out_label = 'not {}'.format(category)

    # plotting
    plt.figure(figsize=(20, 4), dpi=140)

    # barplot
    plt.subplot(1, 3, 1)
    # x and y are passed by keyword; recent seaborn no longer accepts them positionally
    sns.barplot(x=[in_label, out_label], y=[m1, m2])
    plt.ylabel('mean {}'.format(cont))
    plt.xlabel(cat)
    plt.title('t-test p-value = {} \n z-test p-value = {}\n {}'.format(t_p_val,
                                                                      z_p_val,
                                                                      table))

    # category-wise distribution
    plt.subplot(1, 3, 2)
    # `fill` replaces the deprecated `shade` keyword
    sns.kdeplot(x1, fill=True, color='blue', label=in_label)
    sns.kdeplot(x2, fill=False, color='green', label=out_label)
    plt.title('Categorical distribution')
    plt.legend()

    # boxplot
    plt.subplot(1, 3, 3)
    sns.boxplot(x=cat, y=cont, data=data)
    plt.title('Categorical boxplot')