Empty file added GradientBoosting/__init__.py
53 changes: 53 additions & 0 deletions GradientBoosting/models/Check.py
@@ -0,0 +1,53 @@
def fill_if_null(data):
    """
    Fill null values in a DataFrame with the mean of each column.

    Parameters:
    - data: pandas DataFrame

    Returns:
    - data: pandas DataFrame with nulls filled (modified in place)
    """
    # Columns that contain at least one null value (mean() assumes numeric dtypes)
    null_cols = data.columns[data.isnull().any()]
    for col in null_cols:
        data[col] = data[col].fillna(data[col].mean())
    return data


def check_null(data):
    """
    Check for null values in a DataFrame and fill them if found.

    Parameters:
    - data: pandas DataFrame

    Returns:
    - None: Prints the count of null values in each column.
    """
    if data.isnull().values.any():
        fill_if_null(data)
    # Print the per-column null counts (all zeros after filling)
    print(data.isnull().sum())


def XandY(data, target_column):
    """
    Split the DataFrame into features (X) and target (Y).

    Parameters:
    - data: pandas DataFrame
    - target_column: str, name of the target column

    Returns:
    - X: NumPy array of features
    - Y: NumPy array of target
    """
    Y = data[target_column].to_numpy()
    # Note: this drops the target column from the caller's DataFrame in place
    data.drop(target_column, axis=1, inplace=True)
    X = data.to_numpy()

    return X, Y
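
For reviewers, a minimal usage sketch of these helpers (illustrative only; the toy column names and values below are made up, not project code):

import pandas as pd

from GradientBoosting.models.Check import check_null, XandY

df = pd.DataFrame({'x_0': [1.0, None, 3.0], 'y': [2.0, 4.0, 6.0]})
check_null(df)         # fills the NaN in x_0 with the column mean, then prints null counts
X, Y = XandY(df, 'y')  # X has shape (3, 1), Y has shape (3,)
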
188 changes: 188 additions & 0 deletions GradientBoosting/models/GradientBoosting.py
@@ -0,0 +1,188 @@
import numpy as np


class DecisionTree:
    def __init__(self, max_depth=3):
        """
        Initialize the DecisionTree with a specified maximum depth.

        Parameters:
        - max_depth: Maximum depth of the decision tree.
        """
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """
        Fit a decision tree to the given data.

        Parameters:
        - X: Input features (NumPy array).
        - y: Target variable (NumPy array).
        """
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """
        Recursively grow the decision tree by splitting nodes.

        Parameters:
        - X: Input features for the current node.
        - y: Target variable for the current node.
        - depth: Current depth of the tree.

        Returns:
        - A dictionary representing the tree structure.
        """
        n_samples, n_features = X.shape

        if depth >= self.max_depth or n_samples <= 1:
            leaf_value = np.mean(y)
            return {'leaf': leaf_value}

        best_split = self._find_best_split(X, y, n_features)

        if not best_split:
            leaf_value = np.mean(y)
            return {'leaf': leaf_value}

        left_indices, right_indices = best_split['left_indices'], best_split['right_indices']
        left_tree = self._grow_tree(X[left_indices], y[left_indices], depth + 1)
        right_tree = self._grow_tree(X[right_indices], y[right_indices], depth + 1)

        return {
            'feature': best_split['feature'],
            'threshold': best_split['threshold'],
            'left': left_tree,
            'right': right_tree,
        }

    def _find_best_split(self, X, y, n_features):
        """
        Find the best feature and threshold to split the data.

        Parameters:
        - X: Input features.
        - y: Target variable.
        - n_features: Number of features.

        Returns:
        - A dictionary containing the best split information, or None if no split is found.
        """
        best_split = {}
        min_mse = float('inf')

        for feature_index in range(n_features):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                left_indices = np.where(X[:, feature_index] <= threshold)[0]
                right_indices = np.where(X[:, feature_index] > threshold)[0]

                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                mse = self._calculate_mse(y[left_indices], y[right_indices])
                if mse < min_mse:
                    min_mse = mse
                    best_split = {
                        'feature': feature_index,
                        'threshold': threshold,
                        'left_indices': left_indices,
                        'right_indices': right_indices,
                    }
        return best_split if best_split else None

    def _calculate_mse(self, left_y, right_y):
        """
        Calculate the sample-weighted mean squared error for a split.

        Parameters:
        - left_y: Target values for the left split.
        - right_y: Target values for the right split.

        Returns:
        - Weighted MSE of predicting each side's mean (within-split variance).
        """
        left_mse = np.var(left_y) * len(left_y)
        right_mse = np.var(right_y) * len(right_y)
        return (left_mse + right_mse) / (len(left_y) + len(right_y))

    def predict(self, X):
        """
        Predict target values using the fitted decision tree.

        Parameters:
        - X: Input features.

        Returns:
        - Predicted target values.
        """
        return np.array([self._predict_sample(sample) for sample in X])

    def _predict_sample(self, sample):
        """
        Predict a single sample by traversing the tree.

        Parameters:
        - sample: A single input sample.

        Returns:
        - Predicted value for the sample.
        """
        node = self.tree
        while 'leaf' not in node:
            if sample[node['feature']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['leaf']


class GradientBoosting:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        """
        Initialize the GradientBoosting model.

        Parameters:
        - n_estimators: Number of decision trees in the ensemble.
        - learning_rate: Step size for updating residuals.
        - max_depth: Maximum depth of each decision tree.
        """
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.initial_prediction = 0

    def fit(self, X, y):
        """
        Fit the Gradient Boosting model to the data.

        Parameters:
        - X: Input features.
        - y: Target variable.
        """
        # Start from the mean of y; under squared-error loss the residuals
        # are simply the differences between y and the current prediction
        self.initial_prediction = np.mean(y)
        residuals = y - self.initial_prediction

        for _ in range(self.n_estimators):
            # Fit each new tree to the current residuals, then shrink its
            # contribution by the learning rate before updating the residuals
            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X, residuals)
            predictions = tree.predict(X)
            residuals -= self.learning_rate * predictions
            self.trees.append(tree)

    def predict(self, X):
        """
        Predict using the fitted Gradient Boosting model.

        Parameters:
        - X: Input features.

        Returns:
        - Predicted target values as a NumPy array.
        """
        y_pred = np.full(X.shape[0], self.initial_prediction)
        for tree in self.trees:
            y_pred += self.learning_rate * tree.predict(X)
        return y_pred
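
A quick end-to-end smoke test for reviewers (a sketch, not project code; the synthetic linear target and seed are assumptions):

import numpy as np

from GradientBoosting.models.GradientBoosting import GradientBoosting

rng = np.random.default_rng(0)
X = rng.uniform(-10, 10, size=(200, 3))
y = 2.0 * X[:, 1] - 3.0 * X[:, 2] + rng.normal(scale=0.5, size=200)  # made-up target

model = GradientBoosting(n_estimators=50, learning_rate=0.1, max_depth=3)
model.fit(X, y)
mse = np.mean((y - model.predict(X)) ** 2)
print(mse)  # training MSE should come out far below np.var(y)
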
54 changes: 54 additions & 0 deletions GradientBoosting/models/grid_search.py
@@ -0,0 +1,54 @@
from GradientBoosting.models.GradientBoosting import GradientBoosting
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from itertools import product


def grid_search(X, y, param_grid):
    """
    Perform grid search to find the best hyperparameters for the Gradient Boosting model.

    Parameters:
    - X: Input features (NumPy array or pandas DataFrame).
    - y: Target variable (NumPy array or pandas Series).
    - param_grid: Dictionary of hyperparameters to search, e.g.,
      {'n_estimators': [50, 100], 'learning_rate': [0.05, 0.1], 'max_depth': [3, 5]}.

    Returns:
    - A dictionary containing the best hyperparameters and the corresponding evaluation metric.
    """
    best_params = None
    best_score = float('inf')  # Lower score is better (MSE)

    # Split once, up front: with a fixed random_state the split is identical
    # on every call, so re-splitting inside the loop would be wasted work
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    # Generate all combinations of hyperparameters
    keys, values = zip(*param_grid.items())
    param_combinations = [dict(zip(keys, v)) for v in product(*values)]

    for params in param_combinations:
        # Create and fit the model with the current hyperparameters
        model = GradientBoosting(
            n_estimators=params['n_estimators'],
            learning_rate=params['learning_rate'],
            max_depth=params['max_depth']
        )
        model.fit(X_train, y_train)

        # Evaluate the model on the held-out test set
        preds = model.predict(X_test)
        mse = mean_squared_error(y_test, preds)

        # Update the best parameters if the current score is better
        if mse < best_score:
            best_score = mse
            best_params = params

    return {
        'best_params': best_params,
        'best_score': best_score
    }
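
A hedged example of driving the search with the test CSV added below (a sketch; the parameter grid values are arbitrary):

import pandas as pd

from GradientBoosting.models.Check import XandY
from GradientBoosting.models.grid_search import grid_search

df = pd.read_csv('GradientBoosting/tests/small_test.csv')
X, y = XandY(df, 'y')
result = grid_search(X, y, {
    'n_estimators': [50, 100],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
})
print(result['best_params'], result['best_score'])
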
51 changes: 51 additions & 0 deletions GradientBoosting/tests/small_test.csv
@@ -0,0 +1,51 @@
x_0,x_1,x_2,y
-2.421348566501347,6.290215260063935,2.516304163087373,10.240119830146476
8.13465811997068,-6.975968662410185,-3.2810945459842866,-6.8962940548446845
-0.4531238994261493,0.05889462611191654,-3.592293253611172,14.10428803155231
3.979832584128687,-8.129001764124755,9.202914789330517,-43.788867687445624
-4.354231825431758,2.4724749171156333,8.45972163584499,-12.067617018047834
8.726620980175113,-9.607722575405269,-5.092837184080405,-8.265643240683891
-0.29136484802189955,8.224663789274086,-3.8193339707565555,32.98185595386334
1.4118708853910462,6.003042800612462,3.9968255952773095,0.7267789346532836
0.21525181834957507,-3.321041549359367,-5.352746248495515,11.93444109619503
4.80226153299567,9.818246112545182,4.936296097738831,3.5995719453822046
9.71733974143089,0.1440918710436101,8.74993701189404,-34.917122745540794
4.098687611436789,-9.75205878861841,7.980744101999381,-43.32805584620358
-2.398060521804659,2.8278192128541733,-1.626174948927721,16.91539285950553
5.398272903061114,7.583046908728093,2.758295974535457,4.437457748228852
3.371527871466675,-5.430064318728407,2.1915998058530857,-16.03565826569788
2.0863644528269365,0.10824916542728857,8.144465640869694,-25.094326089867696
2.8255940202840684,-2.286321234798363,4.771241059098381,-18.000440202657604
-8.150227640024978,-4.259315052105519,1.8923353680502952,-1.3930242667026356
-6.067265316809651,3.6776254617776942,8.4817269440159,-10.278522746897893
8.64017362219969,9.717801217085075,4.980672567111553,-0.9266647796977245
-4.636910653452324,0.9373715699813872,4.978170771263397,-3.8217233698137143
-7.940395120999431,2.953441321061362,-0.9370552302607145,21.291726783530805
7.692709298116139,-5.485844206553388,-6.019643260327971,2.1873435652525455
-6.485086441297707,7.06589989184231,-8.842925435171665,50.35981404591074
5.036321300769028,2.0420739888497152,-4.368234397412891,15.435100617505809
-2.203566631709222,-6.141030616852454,-1.822186931753599,-0.5890454529472771
3.2620868350599768,7.851306022896178,-4.479265977335616,27.896949611024628
6.402611257683294,-4.018677430646336,0.48600102750762986,-12.289355696825485
5.378501224056757,4.355667003325474,-7.565417868242747,31.017195148404717
2.0486633392332614,8.253411759540757,-3.966950647644751,29.555547834722987
2.626017326894857,3.314924154867276,9.810418858378235,-22.85112181951592
-0.04750452520510429,5.935777040113393,-0.3470621837504506,16.516617979443822
-6.775500897482147,-0.8747563332852692,-2.758815934335188,16.55155644731519
-5.130765599150095,8.959898235120185,1.1701541118251235,22.753375944830324
9.607901921761815,-9.108821424255002,5.524296399378377,-41.93781490943017
-2.9201254899877434,5.134928295361929,-9.896226148902585,43.58829658171542
6.956501039100711,0.8359369151964895,-6.1636372998431295,16.225403196517274
7.725179239543149,-4.913104095867496,-1.110476120153832,-9.936035489824537
-6.142683379729563,1.4244393989902058,1.8529074318076262,5.554396424524908
-2.0474061706133977,-1.2170618863263076,8.899325908803291,-23.596187786238964
9.359523403637155,3.4124788823300065,-1.4222946765509725,2.4507844709064064
-8.642800876507275,-9.508822574677566,2.9901775243378577,-16.775543378589024
-2.470992582133973,5.1672327675732195,-8.753045094764744,40.855147394263106
-7.756097982925145,5.227601844332813,-3.179199348468109,30.739018818654756
5.393783291304004,-1.5186710515725927,-7.469139234639499,17.503383657767756
-7.644671911438172,1.8115363641056241,-6.167155079348694,33.57677356652164
6.557442460132911,-4.44188855380612,-6.368621306151785,7.435670420087931
0.21009363927752744,-2.719754693698011,1.0885820356480096,-6.289562485886653
-8.571672299069252,8.890348599509473,5.468260371802332,15.412904086362603
7.872454219630789,-3.9905860234116357,0.9068940749874717,-16.017543419998542