
Commit 16a7329

Committed Apr 14, 2022

add handling imbalanced datasets tutorial

1 parent be6e68b

File tree

5 files changed: +784 −0 lines


README.md (+1 line)
@@ -86,6 +86,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepy
 - [Credit Card Fraud Detection in Python](https://www.thepythoncode.com/article/credit-card-fraud-detection-using-sklearn-in-python#near-miss). ([code](machine-learning/credit-card-fraud-detection))
 - [Customer Churn Prediction in Python](https://www.thepythoncode.com/article/customer-churn-detection-using-sklearn-in-python). ([code](machine-learning/customer-churn-detection))
 - [Recommender Systems using Association Rules Mining in Python](https://www.thepythoncode.com/article/build-a-recommender-system-with-association-rule-mining-in-python). ([code](machine-learning/recommender-system-using-association-rules))
+- [Handling Imbalanced Datasets: A Case Study with Customer Churn](https://www.thepythoncode.com/article/handling-imbalanced-datasets-sklearn-in-python). ([code](machine-learning/handling-inbalance-churn-data))
 
 - ### [General Python Topics](https://www.thepythoncode.com/topic/general-python-topics)
 - [How to Make Facebook Messenger bot in Python](https://www.thepythoncode.com/article/make-bot-fbchat-python). ([code](general/messenger-bot))
Handling_Imbalance_Datasets_PythonCodeTutorial.ipynb

@@ -0,0 +1,551 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pyYnq_d3jX9y"
   },
   "source": [
    "## Loading the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "oAKgqSfg4Av_",
    "outputId": "2efaaeba-9191-4899-cc23-a30daead997d"
   },
   "outputs": [],
   "source": [
    "!pip install --upgrade gdown"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "o_YiySaL5JXh",
    "outputId": "de9b84e1-f860-42a1-f5c2-33d3c5f8d7c0"
   },
   "outputs": [],
   "source": [
    "!gdown --id 12vfq3DYFId3bsXuNj_PhsACMzrLTfObs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "95RkXV8bgVAV",
    "outputId": "e1db356b-e9e0-4e61-fe7a-4ed19f740637"
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import statsmodels.api as sm\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix\n",
    "from sklearn.preprocessing import OrdinalEncoder\n",
    "from sklearn.utils import resample\n",
    "from imblearn.over_sampling import SMOTE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 427
    },
    "id": "LbJgEpuFsXo8",
    "outputId": "55c9df3b-26bc-4fda-90db-14a88914f4cf"
   },
   "outputs": [],
   "source": [
    "data = pd.read_csv(\"data_regression.csv\")\n",
    "# get the first 10 rows\n",
    "data.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "wc6EPUEBWvIq"
   },
   "outputs": [],
   "source": [
    "# check for missing values and variable types\n",
    "def datainspection(dataframe):\n",
    "    print(\"Types of the variables we are working with:\")\n",
    "    print(dataframe.dtypes)\n",
    "\n",
    "    print(\"Total samples with missing values:\")\n",
    "    print(dataframe.isnull().any(axis=1).sum())  # rows with at least one null value\n",
    "\n",
    "    print(\"Total missing values per variable:\")\n",
    "    print(dataframe.isnull().sum())\n",
    "    print(\"Map of missing values:\")\n",
    "    sns.heatmap(dataframe.isnull())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "id": "aB6NkvMPX9Fx",
    "outputId": "ad7a6b57-1cfb-4e30-9288-83191e943c59"
   },
   "outputs": [],
   "source": [
    "datainspection(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "kQVFmR-pjiJj"
   },
   "outputs": [],
   "source": [
    "data = data.dropna()  # clean up rows with null values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 270
    },
    "id": "bRdAmm8yvWJJ",
    "outputId": "071d8e5e-58be-4e3e-f6c8-136bd932dfd7"
   },
   "outputs": [],
   "source": [
    "# function for encoding categorical variables\n",
    "def encode_cat(data, cat_cols):\n",
    "    ord_en = OrdinalEncoder()\n",
    "    for v in cat_cols:\n",
    "        name = v + '_code'  # add a _code suffix for the encoded variable\n",
    "        data[name] = ord_en.fit_transform(data[[v]])\n",
    "        print('The encoded values for ' + v + ' are:')\n",
    "        print(data[name].unique())\n",
    "    return data\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 379
    },
    "id": "xGnuojAZzgSG",
    "outputId": "9df3f4b2-d04d-427c-e1c3-cf0febfbcc09"
   },
   "outputs": [],
   "source": [
    "# check the encoded variables\n",
    "data = encode_cat(data, ['gender', 'multi_screen', 'mail_subscribed'])\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "dVcl9m6-XhX2"
   },
   "outputs": [],
   "source": [
    "def full_plot(data, class_col, cols_to_exclude):\n",
    "    cols = data.select_dtypes(include=np.number).columns.tolist()  # all numerical columns in the dataframe\n",
    "    X = data[cols]  # a dataframe with only the numerical columns\n",
    "    X = X[X.columns.difference(cols_to_exclude)]  # drop the excluded columns\n",
    "    sns.pairplot(X, hue=class_col)  # plot the remaining features, colored by class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "id": "mhtOwq2-YADb",
    "outputId": "185665dd-2c1d-4936-a34d-076ea1f2bd2d"
   },
   "outputs": [],
   "source": [
    "full_plot(data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "KkQ0mOtRUQOb"
   },
   "outputs": [],
   "source": [
    "# function for plotting selected columns only\n",
    "def selected_diagnostic(data, class_col, cols_to_eval):\n",
    "    X = data[cols_to_eval + [class_col]]  # the selected columns plus the class column\n",
    "    sns.pairplot(X, hue=class_col)  # plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 374
    },
    "id": "on5q6dJuWqG_",
    "outputId": "32663e8c-deb5-4ba4-8fdf-c1433adedf10"
   },
   "outputs": [],
   "source": [
    "selected_diagnostic(data, class_col='churn', cols_to_eval=['videos_watched', 'no_of_days_subscribed'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "9hOwb2lcZjOZ"
   },
   "outputs": [],
   "source": [
    "def logistic_regression(data, class_col, cols_to_exclude):\n",
    "    cols = data.select_dtypes(include=np.number).columns.tolist()\n",
    "    X = data[cols]\n",
    "    X = X[X.columns.difference([class_col])]\n",
    "    X = X[X.columns.difference(cols_to_exclude)]  # drop unwanted columns\n",
    "\n",
    "    y = data[class_col]  # the target variable\n",
    "    logit_model = sm.Logit(y, X)\n",
    "    result = logit_model.fit()  # fit the model\n",
    "    print(result.summary2())  # print the summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "CylJ_cx8lLgS",
    "outputId": "021114d2-813c-4579-aba0-b0112da318e7"
   },
   "outputs": [],
   "source": [
    "logistic_regression(data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "H9VkVEB6lTvZ"
   },
   "outputs": [],
   "source": [
    "def prepare_data(data, class_col, cols_to_exclude):\n",
    "    ## Split into training and test set\n",
    "    ## Select only the numerical columns and exclude the columns we specified in the function\n",
    "    cols = data.select_dtypes(include=np.number).columns.tolist()\n",
    "    X = data[cols]\n",
    "    X = X[X.columns.difference([class_col])]\n",
    "    X = X[X.columns.difference(cols_to_exclude)]\n",
    "    ## Select y as a column\n",
    "    y = data[class_col]\n",
    "    return train_test_split(X, y, test_size=0.3, random_state=0)  # perform the train/test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "HaJzCmF0l6n9"
   },
   "outputs": [],
   "source": [
    "def run_model(X_train, X_test, y_train, y_test):\n",
    "    # fit the logistic regression\n",
    "    logreg = LogisticRegression(random_state=13)\n",
    "    logreg.fit(X_train, y_train)\n",
    "    # predict on the test data\n",
    "    y_pred = logreg.predict(X_test)\n",
    "    logit_roc_auc = roc_auc_score(y_test, y_pred)\n",
    "    print(classification_report(y_test, y_pred))  # print the classification report\n",
    "    print(\"The area under the curve is:\", logit_roc_auc)  # print the AUC\n",
    "    return y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "GsjB3X51m5Fh",
    "outputId": "0dce3d78-e373-42d7-e5ea-f63b6b9991ab"
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = prepare_data(data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])\n",
    "y_pred = run_model(X_train, X_test, y_train, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "l1foHWuxfpr7"
   },
   "outputs": [],
   "source": [
    "def confusion_m(y_test, y_pred):\n",
    "    cm = confusion_matrix(y_test, y_pred)\n",
    "    print(cm)\n",
    "    tn, fp, fn, tp = cm.ravel()\n",
    "    print(\"TN:\", tn)\n",
    "    print(\"TP:\", tp)\n",
    "    print(\"FN:\", fn)\n",
    "    print(\"FP:\", fp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "cipNEx9R9iRE",
    "outputId": "09c7dcb6-3923-46c7-e0dc-3a21721fb343"
   },
   "outputs": [],
   "source": [
    "## Call the function\n",
    "confusion_m(y_test, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ksRCpBZCng5k"
   },
   "outputs": [],
   "source": [
    "# class imbalance method 1: balanced class weights\n",
    "def run_model_bweights(X_train, X_test, y_train, y_test):\n",
    "    logreg = LogisticRegression(random_state=13, class_weight='balanced')  # set the class_weight parameter\n",
    "    logreg.fit(X_train, y_train)  # fit the model\n",
    "    y_pred = logreg.predict(X_test)  # predict on the test data\n",
    "    logit_roc_auc = roc_auc_score(y_test, y_pred)  # ROC AUC score\n",
    "    print(classification_report(y_test, y_pred))\n",
    "    print(\"The area under the curve is:\", logit_roc_auc)  # AUC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "wAWyH-cBoYow",
    "outputId": "24347c88-d87f-4bc8-fe08-f6314d83bad7"
   },
   "outputs": [],
   "source": [
    "run_model_bweights(X_train, X_test, y_train, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "eHQ5X6-Dobc6"
   },
   "outputs": [],
   "source": [
    "# class imbalance method 2: manually chosen class weights\n",
    "def run_model_aweights(X_train, X_test, y_train, y_test, w):\n",
    "    logreg = LogisticRegression(random_state=13, class_weight=w)  # set the class_weight parameter\n",
    "    logreg.fit(X_train, y_train)  # fit the model\n",
    "    y_pred = logreg.predict(X_test)  # predict on the test data\n",
    "    logit_roc_auc = roc_auc_score(y_test, y_pred)  # ROC AUC score\n",
    "    print(classification_report(y_test, y_pred))\n",
    "    print(\"The area under the curve is: %0.2f\" % logit_roc_auc)  # AUC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "XCU6OwiNxabC",
    "outputId": "5eb4ea9a-d72b-4611-8ab9-c6881bf394a6"
   },
   "outputs": [],
   "source": [
    "run_model_aweights(X_train, X_test, y_train, y_test, {0: 90, 1: 10})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "PoF1mh1xopI9"
   },
   "outputs": [],
   "source": [
    "# class imbalance method 3: resampling\n",
    "def adjust_imbalance(X_train, y_train, class_col):\n",
    "    X = pd.concat([X_train, y_train], axis=1)\n",
    "    # separate the two classes into minority and majority\n",
    "    class0 = X[X[class_col] == 0]\n",
    "    class1 = X[X[class_col] == 1]\n",
    "    # case 1 - bootstrap (upsample) the minority class\n",
    "    if len(class1) < len(class0):\n",
    "        resampled = resample(class1,\n",
    "                             replace=True,  # upsampling with replacement\n",
    "                             n_samples=len(class0),  # match the majority class\n",
    "                             random_state=10)\n",
    "        resampled_data = pd.concat([resampled, class0])  # combine the majority and the upsampled minority class\n",
    "    # case 2 - downsample the majority class\n",
    "    else:\n",
    "        resampled = resample(class1,\n",
    "                             replace=False,  # without replacement, unlike above\n",
    "                             n_samples=len(class0),\n",
    "                             random_state=10)\n",
    "        resampled_data = pd.concat([resampled, class0])\n",
    "    return resampled_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ztC2PFvPsE70"
   },
   "outputs": [],
   "source": [
    "## Call the function\n",
    "resampled_data = adjust_imbalance(X_train, y_train, class_col='churn')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "fTWnG5RBqf7f",
    "outputId": "8adb9d43-543e-4695-ee50-b899941db5f3"
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = prepare_data(resampled_data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])\n",
    "run_model(X_train, X_test, y_train, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "zy5_fe3xrt_k"
   },
   "outputs": [],
   "source": [
    "def prepare_data_smote(data, class_col, cols_to_exclude):\n",
    "    # Synthetic Minority Oversampling Technique:\n",
    "    # generates new minority-class instances from the existing minority cases.\n",
    "    cols = data.select_dtypes(include=np.number).columns.tolist()\n",
    "    X = data[cols]\n",
    "    X = X[X.columns.difference([class_col])]\n",
    "    X = X[X.columns.difference(cols_to_exclude)]\n",
    "    y = data[class_col]\n",
    "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)\n",
    "    smote = SMOTE(random_state=0, sampling_strategy=1.0)  # named smote to avoid shadowing the statsmodels alias sm\n",
    "    # run SMOTE on the training set only\n",
    "    X_train, y_train = smote.fit_resample(X_train, y_train)\n",
    "    return X_train, X_test, y_train, y_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "b2N_k-aCs8ck",
    "outputId": "06770f5c-366e-4aad-fd84-475a2c2652b8"
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = prepare_data_smote(data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])\n",
    "run_model(X_train, X_test, y_train, y_test)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "Handling_Imbalance_Datasets_PythonCodeTutorial.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
README.md (tutorial folder)

@@ -0,0 +1 @@
# [Handling Imbalanced Datasets: A Case Study with Customer Churn](https://www.thepythoncode.com/article/handling-imbalanced-datasets-sklearn-in-python)
@@ -0,0 +1,224 @@
# %% [markdown]
# ## Loading the dataset

# %%
# !pip install --upgrade gdown

# %%
# !gdown --id 12vfq3DYFId3bsXuNj_PhsACMzrLTfObs

# %%
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

# %%
data = pd.read_csv("data_regression.csv")
# get the first 10 rows
data.head(10)

# %%
# check for missing values and variable types
def datainspection(dataframe):
    print("Types of the variables we are working with:")
    print(dataframe.dtypes)

    print("Total samples with missing values:")
    print(dataframe.isnull().any(axis=1).sum())  # rows with at least one null value

    print("Total missing values per variable:")
    print(dataframe.isnull().sum())
    print("Map of missing values:")
    sns.heatmap(dataframe.isnull())

# %%
datainspection(data)

# %%
data = data.dropna()  # clean up rows with null values

# %%
# function for encoding categorical variables
def encode_cat(data, cat_cols):
    ord_en = OrdinalEncoder()
    for v in cat_cols:
        name = v + '_code'  # add a _code suffix for the encoded variable
        data[name] = ord_en.fit_transform(data[[v]])
        print('The encoded values for ' + v + ' are:')
        print(data[name].unique())
    return data
data.head()

# %%
# check the encoded variables
data = encode_cat(data, ['gender', 'multi_screen', 'mail_subscribed'])
data.head()
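
# %%
# Aside (a sketch, not part of the original flow): OrdinalEncoder maps each
# column's sorted unique values to 0, 1, ..., k-1, so the learned mapping can
# be inspected by fitting a fresh encoder on one column, e.g. 'gender':
enc = OrdinalEncoder().fit(data[['gender']])
print(dict(enumerate(enc.categories_[0])))  # encoded value -> original category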
# %%
def full_plot(data, class_col, cols_to_exclude):
    cols = data.select_dtypes(include=np.number).columns.tolist()  # all numerical columns in the dataframe
    X = data[cols]  # a dataframe with only the numerical columns
    X = X[X.columns.difference(cols_to_exclude)]  # drop the excluded columns
    sns.pairplot(X, hue=class_col)  # plot the remaining features, colored by class

# %%
full_plot(data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])

# %%
# function for plotting selected columns only
def selected_diagnostic(data, class_col, cols_to_eval):
    X = data[cols_to_eval + [class_col]]  # the selected columns plus the class column
    sns.pairplot(X, hue=class_col)  # plot

# %%
selected_diagnostic(data, class_col='churn', cols_to_eval=['videos_watched', 'no_of_days_subscribed'])

# %%
def logistic_regression(data, class_col, cols_to_exclude):
    cols = data.select_dtypes(include=np.number).columns.tolist()
    X = data[cols]
    X = X[X.columns.difference([class_col])]
    X = X[X.columns.difference(cols_to_exclude)]  # drop unwanted columns

    y = data[class_col]  # the target variable
    logit_model = sm.Logit(y, X)
    result = logit_model.fit()  # fit the model
    print(result.summary2())  # print the summary

# %%
logistic_regression(data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])

# %%
def prepare_data(data, class_col, cols_to_exclude):
    ## Split into training and test set
    ## Select only the numerical columns and exclude the columns we specified in the function
    cols = data.select_dtypes(include=np.number).columns.tolist()
    X = data[cols]
    X = X[X.columns.difference([class_col])]
    X = X[X.columns.difference(cols_to_exclude)]
    ## Select y as a column
    y = data[class_col]
    return train_test_split(X, y, test_size=0.3, random_state=0)  # perform the train/test split

# %%
def run_model(X_train, X_test, y_train, y_test):
    # fit the logistic regression
    logreg = LogisticRegression(random_state=13)
    logreg.fit(X_train, y_train)
    # predict on the test data
    y_pred = logreg.predict(X_test)
    logit_roc_auc = roc_auc_score(y_test, y_pred)
    print(classification_report(y_test, y_pred))  # print the classification report
    print("The area under the curve is:", logit_roc_auc)  # print the AUC
    return y_pred

# %%
X_train, X_test, y_train, y_test = prepare_data(data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])
y_pred = run_model(X_train, X_test, y_train, y_test)
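
# %%
# Aside (a sketch): on an imbalanced target, plain accuracy is misleading,
# because a constant "never churns" model already scores the majority share.
print(y_test.value_counts(normalize=True))  # class shares in the test split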
# %%
def confusion_m(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    print("TN:", tn)
    print("TP:", tp)
    print("FN:", fn)
    print("FP:", fp)

# %%
## Call the function
confusion_m(y_test, y_pred)
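
# %%
# Aside (a sketch): the report's headline numbers can be recomputed directly
# from the four confusion-matrix counts.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
precision = tp / (tp + fp)  # of predicted churners, the share that actually churned
recall = tp / (tp + fn)     # of actual churners, the share the model caught
print("precision: %0.2f, recall: %0.2f" % (precision, recall))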
# %%
# class imbalance method 1: balanced class weights
def run_model_bweights(X_train, X_test, y_train, y_test):
    logreg = LogisticRegression(random_state=13, class_weight='balanced')  # set the class_weight parameter
    logreg.fit(X_train, y_train)  # fit the model
    y_pred = logreg.predict(X_test)  # predict on the test data
    logit_roc_auc = roc_auc_score(y_test, y_pred)  # ROC AUC score
    print(classification_report(y_test, y_pred))
    print("The area under the curve is:", logit_roc_auc)  # AUC

# %%
run_model_bweights(X_train, X_test, y_train, y_test)
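
# %%
# Aside (a sketch): class_weight='balanced' resolves to
# n_samples / (n_classes * np.bincount(y)), so the rarer class weighs more.
w0, w1 = len(y_train) / (2 * np.bincount(y_train.astype(int)))
print({0: w0, 1: w1})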
# %%
# class imbalance method 2: manually chosen class weights
def run_model_aweights(X_train, X_test, y_train, y_test, w):
    logreg = LogisticRegression(random_state=13, class_weight=w)  # set the class_weight parameter
    logreg.fit(X_train, y_train)  # fit the model
    y_pred = logreg.predict(X_test)  # predict on the test data
    logit_roc_auc = roc_auc_score(y_test, y_pred)  # ROC AUC score
    print(classification_report(y_test, y_pred))
    print("The area under the curve is: %0.2f" % logit_roc_auc)  # AUC

# %%
run_model_aweights(X_train, X_test, y_train, y_test, {0: 90, 1: 10})
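
# %%
# Aside (a sketch): in the dict form, a class's weight scales the cost of
# misclassifying that class, so {0: 90, 1: 10} favors the majority class;
# flipping it would push the model toward catching more churners instead:
run_model_aweights(X_train, X_test, y_train, y_test, {0: 10, 1: 90})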
# %%
# class imbalance method 3: resampling
def adjust_imbalance(X_train, y_train, class_col):
    X = pd.concat([X_train, y_train], axis=1)
    # separate the two classes into minority and majority
    class0 = X[X[class_col] == 0]
    class1 = X[X[class_col] == 1]
    # case 1 - bootstrap (upsample) the minority class
    if len(class1) < len(class0):
        resampled = resample(class1,
                             replace=True,  # upsampling with replacement
                             n_samples=len(class0),  # match the majority class
                             random_state=10)
        resampled_data = pd.concat([resampled, class0])  # combine the majority and the upsampled minority class
    # case 2 - downsample the majority class
    else:
        resampled = resample(class1,
                             replace=False,  # without replacement, unlike above
                             n_samples=len(class0),
                             random_state=10)
        resampled_data = pd.concat([resampled, class0])
    return resampled_data

# %%
## Call the function
resampled_data = adjust_imbalance(X_train, y_train, class_col='churn')

# %%
X_train, X_test, y_train, y_test = prepare_data(resampled_data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])
run_model(X_train, X_test, y_train, y_test)
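
# %%
# Aside (a sketch): confirm that the resampled frame is now balanced.
print(resampled_data['churn'].value_counts())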
# %%
def prepare_data_smote(data, class_col, cols_to_exclude):
    # Synthetic Minority Oversampling Technique:
    # generates new minority-class instances from the existing minority cases.
    cols = data.select_dtypes(include=np.number).columns.tolist()
    X = data[cols]
    X = X[X.columns.difference([class_col])]
    X = X[X.columns.difference(cols_to_exclude)]
    y = data[class_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    smote = SMOTE(random_state=0, sampling_strategy=1.0)  # named smote to avoid shadowing the statsmodels alias sm
    # run SMOTE on the training set only
    X_train, y_train = smote.fit_resample(X_train, y_train)
    return X_train, X_test, y_train, y_test

# %%
X_train, X_test, y_train, y_test = prepare_data_smote(data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])
run_model(X_train, X_test, y_train, y_test)
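
# %%
# Aside (a sketch): confirm that SMOTE rebalanced only the training split.
from collections import Counter
print("train:", Counter(y_train))  # roughly 50/50 after SMOTE
print("test: ", Counter(y_test))   # untouched, keeps the original imbalance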
requirements.txt

@@ -0,0 +1,7 @@
numpy
scikit-learn
imbalanced-learn
pandas
statsmodels
seaborn
gdown
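
To reproduce the environment, the usual install applies (assuming the file above is saved as requirements.txt):

pip install -r requirements.txt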
