Empty file added GradientBoosting/__init__.py
53 changes: 53 additions & 0 deletions GradientBoosting/models/Check.py
@@ -0,0 +1,53 @@
def fill_if_null(data):
    """
    Fill null values in a DataFrame with the mean of each column.

    Parameters:
    - data: pandas DataFrame

    Returns:
    - data: pandas DataFrame with nulls filled (modified in place)
    """
    # Columns that contain at least one null value (mean() assumes numeric dtypes)
    null_cols = data.columns[data.isnull().any()]
    for col in null_cols:
        data[col] = data[col].fillna(data[col].mean())
    return data


def check_null(data):
    """
    Check for null values in a DataFrame and fill them if found.

    Parameters:
    - data: pandas DataFrame

    Returns:
    - None: Prints the count of null values in each column.
    """
    if data.isnull().values.any():
        fill_if_null(data)
    # Print the per-column null counts (all zeros after filling)
    print(data.isnull().sum())


def XandY(data, target_column):
    """
    Split the DataFrame into features (X) and target (Y).

    Parameters:
    - data: pandas DataFrame
    - target_column: str, name of the target column

    Returns:
    - X: NumPy array of features
    - Y: NumPy array of target
    """
    Y = data[target_column].to_numpy()
    # Note: this drops the target column from the caller's DataFrame in place
    data.drop(target_column, axis=1, inplace=True)
    X = data.to_numpy()

    return X, Y
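
For reviewers, a minimal usage sketch of these helpers (illustrative only; the toy column names and values below are made up, not project code):

import pandas as pd

from GradientBoosting.models.Check import check_null, XandY

df = pd.DataFrame({'x_0': [1.0, None, 3.0], 'y': [2.0, 4.0, 6.0]})
check_null(df)         # fills the NaN in x_0 with the column mean, then prints null counts
X, Y = XandY(df, 'y')  # X has shape (3, 1), Y has shape (3,)
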
188 changes: 188 additions & 0 deletions GradientBoosting/models/GradientBoosting.py
@@ -0,0 +1,188 @@
import numpy as np


class DecisionTree:
    def __init__(self, max_depth=3):
        """
        Initialize the DecisionTree with a specified maximum depth.

        Parameters:
        - max_depth: Maximum depth of the decision tree.
        """
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """
        Fit a decision tree to the given data.

        Parameters:
        - X: Input features (NumPy array).
        - y: Target variable (NumPy array).
        """
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """
        Recursively grow the decision tree by splitting nodes.

        Parameters:
        - X: Input features for the current node.
        - y: Target variable for the current node.
        - depth: Current depth of the tree.

        Returns:
        - A dictionary representing the tree structure.
        """
        n_samples, n_features = X.shape

        if depth >= self.max_depth or n_samples <= 1:
            leaf_value = np.mean(y)
            return {'leaf': leaf_value}

        best_split = self._find_best_split(X, y, n_features)

        if not best_split:
            leaf_value = np.mean(y)
            return {'leaf': leaf_value}

        left_indices, right_indices = best_split['left_indices'], best_split['right_indices']
        left_tree = self._grow_tree(X[left_indices], y[left_indices], depth + 1)
        right_tree = self._grow_tree(X[right_indices], y[right_indices], depth + 1)

        return {
            'feature': best_split['feature'],
            'threshold': best_split['threshold'],
            'left': left_tree,
            'right': right_tree,
        }

    def _find_best_split(self, X, y, n_features):
        """
        Find the best feature and threshold to split the data.

        Parameters:
        - X: Input features.
        - y: Target variable.
        - n_features: Number of features.

        Returns:
        - A dictionary containing the best split information, or None if no split is found.
        """
        best_split = {}
        min_mse = float('inf')

        for feature_index in range(n_features):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                left_indices = np.where(X[:, feature_index] <= threshold)[0]
                right_indices = np.where(X[:, feature_index] > threshold)[0]

                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                mse = self._calculate_mse(y[left_indices], y[right_indices])
                if mse < min_mse:
                    min_mse = mse
                    best_split = {
                        'feature': feature_index,
                        'threshold': threshold,
                        'left_indices': left_indices,
                        'right_indices': right_indices,
                    }
        return best_split if best_split else None

    def _calculate_mse(self, left_y, right_y):
        """
        Calculate the sample-weighted mean squared error for a split.

        Parameters:
        - left_y: Target values for the left split.
        - right_y: Target values for the right split.

        Returns:
        - Weighted MSE of predicting each side's mean (within-split variance).
        """
        left_mse = np.var(left_y) * len(left_y)
        right_mse = np.var(right_y) * len(right_y)
        return (left_mse + right_mse) / (len(left_y) + len(right_y))

    def predict(self, X):
        """
        Predict target values using the fitted decision tree.

        Parameters:
        - X: Input features.

        Returns:
        - Predicted target values.
        """
        return np.array([self._predict_sample(sample) for sample in X])

    def _predict_sample(self, sample):
        """
        Predict a single sample by traversing the tree.

        Parameters:
        - sample: A single input sample.

        Returns:
        - Predicted value for the sample.
        """
        node = self.tree
        while 'leaf' not in node:
            if sample[node['feature']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['leaf']


class GradientBoosting:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        """
        Initialize the GradientBoosting model.

        Parameters:
        - n_estimators: Number of decision trees in the ensemble.
        - learning_rate: Step size for updating residuals.
        - max_depth: Maximum depth of each decision tree.
        """
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.initial_prediction = 0

    def fit(self, X, y):
        """
        Fit the Gradient Boosting model to the data.

        Parameters:
        - X: Input features.
        - y: Target variable.
        """
        # Start from the mean of y; under squared-error loss the residuals
        # are simply the differences between y and the current prediction
        self.initial_prediction = np.mean(y)
        residuals = y - self.initial_prediction

        for _ in range(self.n_estimators):
            # Fit each new tree to the current residuals, then shrink its
            # contribution by the learning rate before updating the residuals
            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X, residuals)
            predictions = tree.predict(X)
            residuals -= self.learning_rate * predictions
            self.trees.append(tree)

    def predict(self, X):
        """
        Predict using the fitted Gradient Boosting model.

        Parameters:
        - X: Input features.

        Returns:
        - Predicted target values as a NumPy array.
        """
        y_pred = np.full(X.shape[0], self.initial_prediction)
        for tree in self.trees:
            y_pred += self.learning_rate * tree.predict(X)
        return y_pred
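
A quick end-to-end smoke test for reviewers (a sketch, not project code; the synthetic linear target and seed are assumptions):

import numpy as np

from GradientBoosting.models.GradientBoosting import GradientBoosting

rng = np.random.default_rng(0)
X = rng.uniform(-10, 10, size=(200, 3))
y = 2.0 * X[:, 1] - 3.0 * X[:, 2] + rng.normal(scale=0.5, size=200)  # made-up target

model = GradientBoosting(n_estimators=50, learning_rate=0.1, max_depth=3)
model.fit(X, y)
mse = np.mean((y - model.predict(X)) ** 2)
print(mse)  # training MSE should come out far below np.var(y)
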
54 changes: 54 additions & 0 deletions GradientBoosting/models/grid_search.py
@@ -0,0 +1,54 @@
from GradientBoosting.models.GradientBoosting import GradientBoosting
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from itertools import product


def grid_search(X, y, param_grid):
    """
    Perform grid search to find the best hyperparameters for the Gradient Boosting model.

    Parameters:
    - X: Input features (NumPy array or pandas DataFrame).
    - y: Target variable (NumPy array or pandas Series).
    - param_grid: Dictionary of hyperparameters to search, e.g.,
      {'n_estimators': [50, 100], 'learning_rate': [0.05, 0.1], 'max_depth': [3, 5]}.

    Returns:
    - A dictionary containing the best hyperparameters and the corresponding evaluation metric.
    """
    best_params = None
    best_score = float('inf')  # Lower score is better (MSE)

    # Split once, up front: with a fixed random_state the split is identical
    # on every call, so re-splitting inside the loop would be wasted work
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    # Generate all combinations of hyperparameters
    keys, values = zip(*param_grid.items())
    param_combinations = [dict(zip(keys, v)) for v in product(*values)]

    for params in param_combinations:
        # Create and fit the model with the current hyperparameters
        model = GradientBoosting(
            n_estimators=params['n_estimators'],
            learning_rate=params['learning_rate'],
            max_depth=params['max_depth']
        )
        model.fit(X_train, y_train)

        # Evaluate the model on the held-out test set
        preds = model.predict(X_test)
        mse = mean_squared_error(y_test, preds)

        # Update the best parameters if the current score is better
        if mse < best_score:
            best_score = mse
            best_params = params

    return {
        'best_params': best_params,
        'best_score': best_score
    }
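
A hedged example of driving the search with the test CSV added below (a sketch; the parameter grid values are arbitrary):

import pandas as pd

from GradientBoosting.models.Check import XandY
from GradientBoosting.models.grid_search import grid_search

df = pd.read_csv('GradientBoosting/tests/small_test.csv')
X, y = XandY(df, 'y')
result = grid_search(X, y, {
    'n_estimators': [50, 100],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
})
print(result['best_params'], result['best_score'])
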
51 changes: 51 additions & 0 deletions GradientBoosting/tests/small_test.csv
@@ -0,0 +1,51 @@
x_0,x_1,x_2,y
-2.421348566501347,6.290215260063935,2.516304163087373,10.240119830146476
8.13465811997068,-6.975968662410185,-3.2810945459842866,-6.8962940548446845
-0.4531238994261493,0.05889462611191654,-3.592293253611172,14.10428803155231
3.979832584128687,-8.129001764124755,9.202914789330517,-43.788867687445624
-4.354231825431758,2.4724749171156333,8.45972163584499,-12.067617018047834
8.726620980175113,-9.607722575405269,-5.092837184080405,-8.265643240683891
-0.29136484802189955,8.224663789274086,-3.8193339707565555,32.98185595386334
1.4118708853910462,6.003042800612462,3.9968255952773095,0.7267789346532836
0.21525181834957507,-3.321041549359367,-5.352746248495515,11.93444109619503
4.80226153299567,9.818246112545182,4.936296097738831,3.5995719453822046
9.71733974143089,0.1440918710436101,8.74993701189404,-34.917122745540794
4.098687611436789,-9.75205878861841,7.980744101999381,-43.32805584620358
-2.398060521804659,2.8278192128541733,-1.626174948927721,16.91539285950553
5.398272903061114,7.583046908728093,2.758295974535457,4.437457748228852
3.371527871466675,-5.430064318728407,2.1915998058530857,-16.03565826569788
2.0863644528269365,0.10824916542728857,8.144465640869694,-25.094326089867696
2.8255940202840684,-2.286321234798363,4.771241059098381,-18.000440202657604
-8.150227640024978,-4.259315052105519,1.8923353680502952,-1.3930242667026356
-6.067265316809651,3.6776254617776942,8.4817269440159,-10.278522746897893
8.64017362219969,9.717801217085075,4.980672567111553,-0.9266647796977245
-4.636910653452324,0.9373715699813872,4.978170771263397,-3.8217233698137143
-7.940395120999431,2.953441321061362,-0.9370552302607145,21.291726783530805
7.692709298116139,-5.485844206553388,-6.019643260327971,2.1873435652525455
-6.485086441297707,7.06589989184231,-8.842925435171665,50.35981404591074
5.036321300769028,2.0420739888497152,-4.368234397412891,15.435100617505809
-2.203566631709222,-6.141030616852454,-1.822186931753599,-0.5890454529472771
3.2620868350599768,7.851306022896178,-4.479265977335616,27.896949611024628
6.402611257683294,-4.018677430646336,0.48600102750762986,-12.289355696825485
5.378501224056757,4.355667003325474,-7.565417868242747,31.017195148404717
2.0486633392332614,8.253411759540757,-3.966950647644751,29.555547834722987
2.626017326894857,3.314924154867276,9.810418858378235,-22.85112181951592
-0.04750452520510429,5.935777040113393,-0.3470621837504506,16.516617979443822
-6.775500897482147,-0.8747563332852692,-2.758815934335188,16.55155644731519
-5.130765599150095,8.959898235120185,1.1701541118251235,22.753375944830324
9.607901921761815,-9.108821424255002,5.524296399378377,-41.93781490943017
-2.9201254899877434,5.134928295361929,-9.896226148902585,43.58829658171542
6.956501039100711,0.8359369151964895,-6.1636372998431295,16.225403196517274
7.725179239543149,-4.913104095867496,-1.110476120153832,-9.936035489824537
-6.142683379729563,1.4244393989902058,1.8529074318076262,5.554396424524908
-2.0474061706133977,-1.2170618863263076,8.899325908803291,-23.596187786238964
9.359523403637155,3.4124788823300065,-1.4222946765509725,2.4507844709064064
-8.642800876507275,-9.508822574677566,2.9901775243378577,-16.775543378589024
-2.470992582133973,5.1672327675732195,-8.753045094764744,40.855147394263106
-7.756097982925145,5.227601844332813,-3.179199348468109,30.739018818654756
5.393783291304004,-1.5186710515725927,-7.469139234639499,17.503383657767756
-7.644671911438172,1.8115363641056241,-6.167155079348694,33.57677356652164
6.557442460132911,-4.44188855380612,-6.368621306151785,7.435670420087931
0.21009363927752744,-2.719754693698011,1.0885820356480096,-6.289562485886653
-8.571672299069252,8.890348599509473,5.468260371802332,15.412904086362603
7.872454219630789,-3.9905860234116357,0.9068940749874717,-16.017543419998542