
Commit 16a7329

Committed Apr 14, 2022

add handling imbalanced datasets tutorial

1 parent be6e68b

File tree

5 files changed: +784 −0 lines


README.md (+1 line)
@@ -86,6 +86,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepy
 - [Credit Card Fraud Detection in Python](https://www.thepythoncode.com/article/credit-card-fraud-detection-using-sklearn-in-python#near-miss). ([code](machine-learning/credit-card-fraud-detection))
 - [Customer Churn Prediction in Python](https://www.thepythoncode.com/article/customer-churn-detection-using-sklearn-in-python). ([code](machine-learning/customer-churn-detection))
 - [Recommender Systems using Association Rules Mining in Python](https://www.thepythoncode.com/article/build-a-recommender-system-with-association-rule-mining-in-python). ([code](machine-learning/recommender-system-using-association-rules))
+- [Handling Imbalanced Datasets: A Case Study with Customer Churn](https://www.thepythoncode.com/article/handling-imbalanced-datasets-sklearn-in-python). ([code](machine-learning/handling-inbalance-churn-data))
 
 - ### [General Python Topics](https://www.thepythoncode.com/topic/general-python-topics)
 - [How to Make Facebook Messenger bot in Python](https://www.thepythoncode.com/article/make-bot-fbchat-python). ([code](general/messenger-bot))
Handling_Imbalance_Datasets_PythonCodeTutorial.ipynb

@@ -0,0 +1,551 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pyYnq_d3jX9y"
   },
   "source": [
    "## Loading the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "oAKgqSfg4Av_",
    "outputId": "2efaaeba-9191-4899-cc23-a30daead997d"
   },
   "outputs": [],
   "source": [
    "!pip install --upgrade gdown"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "o_YiySaL5JXh",
    "outputId": "de9b84e1-f860-42a1-f5c2-33d3c5f8d7c0"
   },
   "outputs": [],
   "source": [
    "!gdown --id 12vfq3DYFId3bsXuNj_PhsACMzrLTfObs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "95RkXV8bgVAV",
    "outputId": "e1db356b-e9e0-4e61-fe7a-4ed19f740637"
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import statsmodels.api as sm\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix\n",
    "from sklearn.preprocessing import OrdinalEncoder\n",
    "from sklearn.utils import resample\n",
    "from imblearn.over_sampling import SMOTE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 427
    },
    "id": "LbJgEpuFsXo8",
    "outputId": "55c9df3b-26bc-4fda-90db-14a88914f4cf"
   },
   "outputs": [],
   "source": [
    "data = pd.read_csv(\"data_regression.csv\")\n",
    "# get the first 10 rows\n",
    "data.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "wc6EPUEBWvIq"
   },
   "outputs": [],
   "source": [
    "# check for missing values and variable types\n",
    "def datainspection(dataframe):\n",
    "    print(\"Types of the variables we are working with:\")\n",
    "    print(dataframe.dtypes)\n",
    "\n",
    "    print(\"Total samples with missing values:\")\n",
    "    print(dataframe.isnull().any(axis=1).sum())  # rows with at least one null value\n",
    "\n",
    "    print(\"Total missing values per variable:\")\n",
    "    print(dataframe.isnull().sum())\n",
    "    print(\"Map of missing values:\")\n",
    "    sns.heatmap(dataframe.isnull())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "id": "aB6NkvMPX9Fx",
    "outputId": "ad7a6b57-1cfb-4e30-9288-83191e943c59"
   },
   "outputs": [],
   "source": [
    "datainspection(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "kQVFmR-pjiJj"
   },
   "outputs": [],
   "source": [
    "data = data.dropna()  # clean up rows with null values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 270
    },
    "id": "bRdAmm8yvWJJ",
    "outputId": "071d8e5e-58be-4e3e-f6c8-136bd932dfd7"
   },
   "outputs": [],
   "source": [
    "# function for encoding categorical variables\n",
    "def encode_cat(data, cat_cols):\n",
    "    ord_en = OrdinalEncoder()\n",
    "    for v in cat_cols:\n",
    "        name = v + '_code'  # add a _code suffix for the encoded variable\n",
    "        data[name] = ord_en.fit_transform(data[[v]])\n",
    "        print('The encoded values for ' + v + ' are:')\n",
    "        print(data[name].unique())\n",
    "    return data\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 379
    },
    "id": "xGnuojAZzgSG",
    "outputId": "9df3f4b2-d04d-427c-e1c3-cf0febfbcc09"
   },
   "outputs": [],
   "source": [
    "# check the encoded variables\n",
    "data = encode_cat(data, ['gender', 'multi_screen', 'mail_subscribed'])\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "dVcl9m6-XhX2"
   },
   "outputs": [],
   "source": [
    "def full_plot(data, class_col, cols_to_exclude):\n",
    "    cols = data.select_dtypes(include=np.number).columns.tolist()  # all numerical columns in the dataframe\n",
    "    X = data[cols]  # a dataframe with only the numerical columns\n",
    "    X = X[X.columns.difference(cols_to_exclude)]  # drop the excluded columns\n",
    "    sns.pairplot(X, hue=class_col)  # plot the remaining features, colored by class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "id": "mhtOwq2-YADb",
    "outputId": "185665dd-2c1d-4936-a34d-076ea1f2bd2d"
   },
   "outputs": [],
   "source": [
    "full_plot(data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "KkQ0mOtRUQOb"
   },
   "outputs": [],
   "source": [
    "# function for plotting selected columns only\n",
    "def selected_diagnostic(data, class_col, cols_to_eval):\n",
    "    X = data[cols_to_eval + [class_col]]  # the selected columns plus the class column\n",
    "    sns.pairplot(X, hue=class_col)  # plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 374
    },
    "id": "on5q6dJuWqG_",
    "outputId": "32663e8c-deb5-4ba4-8fdf-c1433adedf10"
   },
   "outputs": [],
   "source": [
    "selected_diagnostic(data, class_col='churn', cols_to_eval=['videos_watched', 'no_of_days_subscribed'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "9hOwb2lcZjOZ"
   },
   "outputs": [],
   "source": [
    "def logistic_regression(data, class_col, cols_to_exclude):\n",
    "    cols = data.select_dtypes(include=np.number).columns.tolist()\n",
    "    X = data[cols]\n",
    "    X = X[X.columns.difference([class_col])]\n",
    "    X = X[X.columns.difference(cols_to_exclude)]  # drop unwanted columns\n",
    "\n",
    "    y = data[class_col]  # the target variable\n",
    "    logit_model = sm.Logit(y, X)\n",
    "    result = logit_model.fit()  # fit the model\n",
    "    print(result.summary2())  # print the summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "CylJ_cx8lLgS",
    "outputId": "021114d2-813c-4579-aba0-b0112da318e7"
   },
   "outputs": [],
   "source": [
    "logistic_regression(data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "H9VkVEB6lTvZ"
   },
   "outputs": [],
   "source": [
    "def prepare_data(data, class_col, cols_to_exclude):\n",
    "    ## Split into training and test set\n",
    "    ## Select only the numerical columns and exclude the columns we specified in the function\n",
    "    cols = data.select_dtypes(include=np.number).columns.tolist()\n",
    "    X = data[cols]\n",
    "    X = X[X.columns.difference([class_col])]\n",
    "    X = X[X.columns.difference(cols_to_exclude)]\n",
    "    ## Select y as a column\n",
    "    y = data[class_col]\n",
    "    return train_test_split(X, y, test_size=0.3, random_state=0)  # perform the train/test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "HaJzCmF0l6n9"
   },
   "outputs": [],
   "source": [
    "def run_model(X_train, X_test, y_train, y_test):\n",
    "    # fit the logistic regression\n",
    "    logreg = LogisticRegression(random_state=13)\n",
    "    logreg.fit(X_train, y_train)\n",
    "    # predict on the test data\n",
    "    y_pred = logreg.predict(X_test)\n",
    "    logit_roc_auc = roc_auc_score(y_test, y_pred)\n",
    "    print(classification_report(y_test, y_pred))  # print the classification report\n",
    "    print(\"The area under the curve is:\", logit_roc_auc)  # print the AUC\n",
    "    return y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "GsjB3X51m5Fh",
    "outputId": "0dce3d78-e373-42d7-e5ea-f63b6b9991ab"
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = prepare_data(data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])\n",
    "y_pred = run_model(X_train, X_test, y_train, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "l1foHWuxfpr7"
   },
   "outputs": [],
   "source": [
    "def confusion_m(y_test, y_pred):\n",
    "    cm = confusion_matrix(y_test, y_pred)\n",
    "    print(cm)\n",
    "    tn, fp, fn, tp = cm.ravel()\n",
    "    print(\"TN:\", tn)\n",
    "    print(\"TP:\", tp)\n",
    "    print(\"FN:\", fn)\n",
    "    print(\"FP:\", fp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "cipNEx9R9iRE",
    "outputId": "09c7dcb6-3923-46c7-e0dc-3a21721fb343"
   },
   "outputs": [],
   "source": [
    "## Call the function\n",
    "confusion_m(y_test, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ksRCpBZCng5k"
   },
   "outputs": [],
   "source": [
    "# class imbalance method 1: balanced class weights\n",
    "def run_model_bweights(X_train, X_test, y_train, y_test):\n",
    "    logreg = LogisticRegression(random_state=13, class_weight='balanced')  # set the class_weight parameter\n",
    "    logreg.fit(X_train, y_train)  # fit the model\n",
    "    y_pred = logreg.predict(X_test)  # predict on the test data\n",
    "    logit_roc_auc = roc_auc_score(y_test, y_pred)  # ROC AUC score\n",
    "    print(classification_report(y_test, y_pred))\n",
    "    print(\"The area under the curve is:\", logit_roc_auc)  # AUC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "wAWyH-cBoYow",
    "outputId": "24347c88-d87f-4bc8-fe08-f6314d83bad7"
   },
   "outputs": [],
   "source": [
    "run_model_bweights(X_train, X_test, y_train, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "eHQ5X6-Dobc6"
   },
   "outputs": [],
   "source": [
    "# class imbalance method 2: manually chosen class weights\n",
    "def run_model_aweights(X_train, X_test, y_train, y_test, w):\n",
    "    logreg = LogisticRegression(random_state=13, class_weight=w)  # set the class_weight parameter\n",
    "    logreg.fit(X_train, y_train)  # fit the model\n",
    "    y_pred = logreg.predict(X_test)  # predict on the test data\n",
    "    logit_roc_auc = roc_auc_score(y_test, y_pred)  # ROC AUC score\n",
    "    print(classification_report(y_test, y_pred))\n",
    "    print(\"The area under the curve is: %0.2f\" % logit_roc_auc)  # AUC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "XCU6OwiNxabC",
    "outputId": "5eb4ea9a-d72b-4611-8ab9-c6881bf394a6"
   },
   "outputs": [],
   "source": [
    "run_model_aweights(X_train, X_test, y_train, y_test, {0: 90, 1: 10})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "PoF1mh1xopI9"
   },
   "outputs": [],
   "source": [
    "# class imbalance method 3: resampling\n",
    "def adjust_imbalance(X_train, y_train, class_col):\n",
    "    X = pd.concat([X_train, y_train], axis=1)\n",
    "    # separate the two classes into minority and majority\n",
    "    class0 = X[X[class_col] == 0]\n",
    "    class1 = X[X[class_col] == 1]\n",
    "    # case 1 - bootstrap (upsample) the minority class\n",
    "    if len(class1) < len(class0):\n",
    "        resampled = resample(class1,\n",
    "                             replace=True,  # upsampling with replacement\n",
    "                             n_samples=len(class0),  # match the majority class\n",
    "                             random_state=10)\n",
    "        resampled_data = pd.concat([resampled, class0])  # combine the majority and the upsampled minority class\n",
    "    # case 2 - downsample the majority class\n",
    "    else:\n",
    "        resampled = resample(class1,\n",
    "                             replace=False,  # without replacement, unlike above\n",
    "                             n_samples=len(class0),\n",
    "                             random_state=10)\n",
    "        resampled_data = pd.concat([resampled, class0])\n",
    "    return resampled_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ztC2PFvPsE70"
   },
   "outputs": [],
   "source": [
    "## Call the function\n",
    "resampled_data = adjust_imbalance(X_train, y_train, class_col='churn')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "fTWnG5RBqf7f",
    "outputId": "8adb9d43-543e-4695-ee50-b899941db5f3"
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = prepare_data(resampled_data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])\n",
    "run_model(X_train, X_test, y_train, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "zy5_fe3xrt_k"
   },
   "outputs": [],
   "source": [
    "def prepare_data_smote(data, class_col, cols_to_exclude):\n",
    "    # Synthetic Minority Oversampling Technique:\n",
    "    # generates new minority-class instances from the existing minority cases.\n",
    "    cols = data.select_dtypes(include=np.number).columns.tolist()\n",
    "    X = data[cols]\n",
    "    X = X[X.columns.difference([class_col])]\n",
    "    X = X[X.columns.difference(cols_to_exclude)]\n",
    "    y = data[class_col]\n",
    "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)\n",
    "    smote = SMOTE(random_state=0, sampling_strategy=1.0)  # named smote to avoid shadowing the statsmodels alias sm\n",
    "    # run SMOTE on the training set only\n",
    "    X_train, y_train = smote.fit_resample(X_train, y_train)\n",
    "    return X_train, X_test, y_train, y_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "b2N_k-aCs8ck",
    "outputId": "06770f5c-366e-4aad-fd84-475a2c2652b8"
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = prepare_data_smote(data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])\n",
    "run_model(X_train, X_test, y_train, y_test)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "Handling_Imbalance_Datasets_PythonCodeTutorial.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
README.md (tutorial folder)

@@ -0,0 +1 @@
# [Handling Imbalanced Datasets: A Case Study with Customer Churn](https://www.thepythoncode.com/article/handling-imbalanced-datasets-sklearn-in-python)
@@ -0,0 +1,224 @@
# %% [markdown]
# ## Loading the dataset

# %%
# !pip install --upgrade gdown

# %%
# !gdown --id 12vfq3DYFId3bsXuNj_PhsACMzrLTfObs

# %%
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

# %%
data = pd.read_csv("data_regression.csv")
# get the first 10 rows
data.head(10)

# %%
# check for missing values and variable types
def datainspection(dataframe):
    print("Types of the variables we are working with:")
    print(dataframe.dtypes)

    print("Total samples with missing values:")
    print(dataframe.isnull().any(axis=1).sum())  # rows with at least one null value

    print("Total missing values per variable:")
    print(dataframe.isnull().sum())
    print("Map of missing values:")
    sns.heatmap(dataframe.isnull())

# %%
datainspection(data)

# %%
data = data.dropna()  # clean up rows with null values

# %%
# function for encoding categorical variables
def encode_cat(data, cat_cols):
    ord_en = OrdinalEncoder()
    for v in cat_cols:
        name = v + '_code'  # add a _code suffix for the encoded variable
        data[name] = ord_en.fit_transform(data[[v]])
        print('The encoded values for ' + v + ' are:')
        print(data[name].unique())
    return data
data.head()

# %%
# check the encoded variables
data = encode_cat(data, ['gender', 'multi_screen', 'mail_subscribed'])
data.head()
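
# %%
# Aside (a sketch, not part of the original flow): OrdinalEncoder maps each
# column's sorted unique values to 0, 1, ..., k-1, so the learned mapping can
# be inspected by fitting a fresh encoder on one column, e.g. 'gender':
enc = OrdinalEncoder().fit(data[['gender']])
print(dict(enumerate(enc.categories_[0])))  # encoded value -> original category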
# %%
def full_plot(data, class_col, cols_to_exclude):
    cols = data.select_dtypes(include=np.number).columns.tolist()  # all numerical columns in the dataframe
    X = data[cols]  # a dataframe with only the numerical columns
    X = X[X.columns.difference(cols_to_exclude)]  # drop the excluded columns
    sns.pairplot(X, hue=class_col)  # plot the remaining features, colored by class

# %%
full_plot(data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])

# %%
# function for plotting selected columns only
def selected_diagnostic(data, class_col, cols_to_eval):
    X = data[cols_to_eval + [class_col]]  # the selected columns plus the class column
    sns.pairplot(X, hue=class_col)  # plot

# %%
selected_diagnostic(data, class_col='churn', cols_to_eval=['videos_watched', 'no_of_days_subscribed'])

# %%
def logistic_regression(data, class_col, cols_to_exclude):
    cols = data.select_dtypes(include=np.number).columns.tolist()
    X = data[cols]
    X = X[X.columns.difference([class_col])]
    X = X[X.columns.difference(cols_to_exclude)]  # drop unwanted columns

    y = data[class_col]  # the target variable
    logit_model = sm.Logit(y, X)
    result = logit_model.fit()  # fit the model
    print(result.summary2())  # print the summary

# %%
logistic_regression(data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])

# %%
def prepare_data(data, class_col, cols_to_exclude):
    ## Split into training and test set
    ## Select only the numerical columns and exclude the columns we specified in the function
    cols = data.select_dtypes(include=np.number).columns.tolist()
    X = data[cols]
    X = X[X.columns.difference([class_col])]
    X = X[X.columns.difference(cols_to_exclude)]
    ## Select y as a column
    y = data[class_col]
    return train_test_split(X, y, test_size=0.3, random_state=0)  # perform the train/test split

# %%
def run_model(X_train, X_test, y_train, y_test):
    # fit the logistic regression
    logreg = LogisticRegression(random_state=13)
    logreg.fit(X_train, y_train)
    # predict on the test data
    y_pred = logreg.predict(X_test)
    logit_roc_auc = roc_auc_score(y_test, y_pred)
    print(classification_report(y_test, y_pred))  # print the classification report
    print("The area under the curve is:", logit_roc_auc)  # print the AUC
    return y_pred

# %%
X_train, X_test, y_train, y_test = prepare_data(data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])
y_pred = run_model(X_train, X_test, y_train, y_test)
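
# %%
# Aside (a sketch): on an imbalanced target, plain accuracy is misleading,
# because a constant "never churns" model already scores the majority share.
print(y_test.value_counts(normalize=True))  # class shares in the test split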
# %%
def confusion_m(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    tn, fp, fn, tp = cm.ravel()
    print("TN:", tn)
    print("TP:", tp)
    print("FN:", fn)
    print("FP:", fp)

# %%
## Call the function
confusion_m(y_test, y_pred)
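
# %%
# Aside (a sketch): the report's headline numbers can be recomputed directly
# from the four confusion-matrix counts.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
precision = tp / (tp + fp)  # of predicted churners, the share that actually churned
recall = tp / (tp + fn)     # of actual churners, the share the model caught
print("precision: %0.2f, recall: %0.2f" % (precision, recall))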
# %%
# class imbalance method 1: balanced class weights
def run_model_bweights(X_train, X_test, y_train, y_test):
    logreg = LogisticRegression(random_state=13, class_weight='balanced')  # set the class_weight parameter
    logreg.fit(X_train, y_train)  # fit the model
    y_pred = logreg.predict(X_test)  # predict on the test data
    logit_roc_auc = roc_auc_score(y_test, y_pred)  # ROC AUC score
    print(classification_report(y_test, y_pred))
    print("The area under the curve is:", logit_roc_auc)  # AUC

# %%
run_model_bweights(X_train, X_test, y_train, y_test)
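
# %%
# Aside (a sketch): class_weight='balanced' resolves to
# n_samples / (n_classes * np.bincount(y)), so the rarer class weighs more.
w0, w1 = len(y_train) / (2 * np.bincount(y_train.astype(int)))
print({0: w0, 1: w1})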
# %%
# class imbalance method 2: manually chosen class weights
def run_model_aweights(X_train, X_test, y_train, y_test, w):
    logreg = LogisticRegression(random_state=13, class_weight=w)  # set the class_weight parameter
    logreg.fit(X_train, y_train)  # fit the model
    y_pred = logreg.predict(X_test)  # predict on the test data
    logit_roc_auc = roc_auc_score(y_test, y_pred)  # ROC AUC score
    print(classification_report(y_test, y_pred))
    print("The area under the curve is: %0.2f" % logit_roc_auc)  # AUC

# %%
run_model_aweights(X_train, X_test, y_train, y_test, {0: 90, 1: 10})
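
# %%
# Aside (a sketch): in the dict form, a class's weight scales the cost of
# misclassifying that class, so {0: 90, 1: 10} favors the majority class;
# flipping it would push the model toward catching more churners instead:
run_model_aweights(X_train, X_test, y_train, y_test, {0: 10, 1: 90})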
# %%
# class imbalance method 3: resampling
def adjust_imbalance(X_train, y_train, class_col):
    X = pd.concat([X_train, y_train], axis=1)
    # separate the two classes into minority and majority
    class0 = X[X[class_col] == 0]
    class1 = X[X[class_col] == 1]
    # case 1 - bootstrap (upsample) the minority class
    if len(class1) < len(class0):
        resampled = resample(class1,
                             replace=True,  # upsampling with replacement
                             n_samples=len(class0),  # match the majority class
                             random_state=10)
        resampled_data = pd.concat([resampled, class0])  # combine the majority and the upsampled minority class
    # case 2 - downsample the majority class
    else:
        resampled = resample(class1,
                             replace=False,  # without replacement, unlike above
                             n_samples=len(class0),
                             random_state=10)
        resampled_data = pd.concat([resampled, class0])
    return resampled_data

# %%
## Call the function
resampled_data = adjust_imbalance(X_train, y_train, class_col='churn')

# %%
X_train, X_test, y_train, y_test = prepare_data(resampled_data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])
run_model(X_train, X_test, y_train, y_test)
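
# %%
# Aside (a sketch): confirm that the resampled frame is now balanced.
print(resampled_data['churn'].value_counts())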
# %%
def prepare_data_smote(data, class_col, cols_to_exclude):
    # Synthetic Minority Oversampling Technique:
    # generates new minority-class instances from the existing minority cases.
    cols = data.select_dtypes(include=np.number).columns.tolist()
    X = data[cols]
    X = X[X.columns.difference([class_col])]
    X = X[X.columns.difference(cols_to_exclude)]
    y = data[class_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    smote = SMOTE(random_state=0, sampling_strategy=1.0)  # named smote to avoid shadowing the statsmodels alias sm
    # run SMOTE on the training set only
    X_train, y_train = smote.fit_resample(X_train, y_train)
    return X_train, X_test, y_train, y_test

# %%
X_train, X_test, y_train, y_test = prepare_data_smote(data, class_col='churn', cols_to_exclude=['customer_id', 'phone_no', 'year'])
run_model(X_train, X_test, y_train, y_test)
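
# %%
# Aside (a sketch): confirm that SMOTE rebalanced only the training split.
from collections import Counter
print("train:", Counter(y_train))  # roughly 50/50 after SMOTE
print("test: ", Counter(y_test))   # untouched, keeps the original imbalance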
requirements.txt

@@ -0,0 +1,7 @@
numpy
scikit-learn
imbalanced-learn
pandas
statsmodels
seaborn
gdown
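
To reproduce the environment, the usual install applies (assuming the file above is saved as requirements.txt):

pip install -r requirements.txt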
