2 changes: 2 additions & 0 deletions .gitignore
@@ -2,6 +2,7 @@
__pycache__/
*.py[cod]
*$py.class
.DS_Store

# C extensions
*.so
@@ -160,3 +161,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/
Binary file added BONUS CREDIT/plot1.jpeg
Binary file added BONUS CREDIT/plot2.jpeg
Binary file added BONUS CREDIT/plot3.jpeg
113 changes: 113 additions & 0 deletions BONUS CREDIT/visualization_notebook.py
@@ -0,0 +1,113 @@
# visualization_notebook.py
import numpy as np
import matplotlib.pyplot as plt
from linear_regression import LinearRegression
from model_selection import k_fold_cross_validation, bootstrapping

# Set style for better visualizations
plt.style.use('tableau-colorblind10')

# Generate synthetic data with more features
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X[:, 0] + np.random.randn(100)

# Initialize model
model = LinearRegression()
model.fit(X, y)

# 1. Enhanced Regression Plot
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='#1f77b4', alpha=0.5, label='Data points')
X_test = np.linspace(0, 2, 100).reshape(-1, 1)
y_pred = model.predict(X_test)

# Add confidence intervals
y_std = np.std(y - model.predict(X))
plt.plot(X_test, y_pred, color='#d62728', label='Regression line', linewidth=2)
plt.fill_between(X_test.flatten(),
                 y_pred - 2*y_std,
                 y_pred + 2*y_std,
                 alpha=0.2,
                 color='#d62728',
                 label='95% Confidence interval')
plt.title('Linear Regression with Confidence Intervals')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.grid(True, alpha=0.3)

# 2. K-fold Cross-validation Analysis
k_values = [3, 5, 7, 10]
cv_errors = []

for k in k_values:
    error = k_fold_cross_validation(model, X, y, k=k, random_seed=42)
    cv_errors.append(error)

plt.figure(figsize=(8, 5))
plt.plot(k_values, cv_errors, marker='o', linestyle='-',
         color='#2ca02c', linewidth=2, markersize=8)
plt.title('Cross-validation Error vs K-folds')
plt.xlabel('Number of folds')
plt.ylabel('Mean Squared Error')
plt.grid(True, alpha=0.3)

# 3. Bootstrap Distribution Plot
n_iterations = 100
bootstrap_errors = []

for _ in range(n_iterations):
    error = bootstrapping(model, X, y, n_iterations=1, test_size=0.3, random_seed=None)
    if error is not None:
        bootstrap_errors.append(error)

plt.figure(figsize=(8, 5))
plt.hist(bootstrap_errors, bins=20, density=True, alpha=0.7,
         color='#ff7f0e', edgecolor='black')
plt.axvline(np.mean(bootstrap_errors), color='#d62728', linestyle='--',
            label=f'Mean MSE: {np.mean(bootstrap_errors):.3f}')
plt.title('Bootstrap Error Distribution')
plt.xlabel('Mean Squared Error')
plt.ylabel('Density')
plt.legend()

# 4. Residual Analysis
y_pred_all = model.predict(X)
residuals = y - y_pred_all

plt.figure(figsize=(8, 5))
plt.scatter(y_pred_all, residuals, alpha=0.5, color='#9467bd')
plt.axhline(y=0, color='#d62728', linestyle='--')
plt.title('Residual Plot')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.grid(True, alpha=0.3)

# 5. Learning Curve
train_sizes = np.linspace(0.1, 1.0, 10)
train_errors = []
val_errors = []

for size in train_sizes:
    n_samples = int(len(X) * size)
    X_subset = X[:n_samples]
    y_subset = y[:n_samples]
    train_error = k_fold_cross_validation(model, X_subset, y_subset, k=5)
    val_error = k_fold_cross_validation(model, X_subset, y_subset, k=3)
    train_errors.append(train_error)
    val_errors.append(val_error)

plt.figure(figsize=(8, 5))
plt.plot(train_sizes * 100, train_errors, label='Training Error',
         color='#1f77b4', linewidth=2)
plt.plot(train_sizes * 100, val_errors, label='Validation Error',
         color='#d62728', linewidth=2)
plt.title('Learning Curve')
plt.xlabel('Training Set Size (%)')
plt.ylabel('Mean Squared Error')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
146 changes: 128 additions & 18 deletions README.md
@@ -1,29 +1,139 @@
# Project 2
# Project 2: MODEL SELECTION

Select one of the following two options:
**Course:** CS584 - Machine Learning <br>
**Instructor:** Steve Avsec<br>
**Group Members:**
- [email protected] (FNU Saurav) - A20536122
- [email protected] (Pallavi Savant) - A20540976
- [email protected] (Satwik Sinha) - A20547790
- [email protected] (Aditya Ramchandra Kutre) - A20544809
- [email protected] (Tejaswi Yerra) - A20545536

## Boosting Trees

Implement the gradient-boosting tree algorithm (with the usual fit-predict interface) as described in Sections 10.9-10.10 of Elements of Statistical Learning (2nd Edition). Answer the questions below as you did for Project 1.
## Project Overview

Put your README below. Answer the following questions.
This project implements two model selection techniques:
1. **k-Fold Cross-Validation**:
- Evaluates a machine learning model by splitting the dataset into \( k \) folds and using each fold as a validation set while training on the remaining \( k-1 \) folds.
- The average loss (e.g., Mean Squared Error) across all folds is calculated to estimate the model’s predictive performance.

* What does the model you have implemented do and when should it be used?
* How did you test your model to determine if it is working reasonably correctly?
* What parameters have you exposed to users of your implementation in order to tune performance? (Also perhaps provide some basic usage examples.)
* Are there specific inputs that your implementation has trouble with? Given more time, could you work around these or is it fundamental?
2. **Bootstrapping**:
- Evaluates a model by generating multiple bootstrap samples (random sampling with replacement) from the dataset and using the out-of-bag (OOB) samples for validation.
- The average error across all bootstrap iterations is computed to measure model performance.

## Model Selection
Both methods are implemented for general-purpose models that provide `fit()` and `predict()` methods.
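For orientation, the following is a minimal sketch of how a k-fold routine could be written against that `fit()`/`predict()` contract. The actual implementation lives in `model_selection.py` (not shown in this diff), so the name and details below are illustrative only:

```python
import numpy as np

def k_fold_cross_validation_sketch(model, X, y, k=5, random_seed=None):
    """Illustrative only -- see model_selection.py for the real implementation."""
    rng = np.random.default_rng(random_seed)
    folds = np.array_split(rng.permutation(len(X)), k)  # shuffled, near-equal folds

    errors = []
    for i in range(k):
        val_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(k) if j != i])

        # Train on k-1 folds, score MSE on the held-out fold.
        model.fit(X[train_idx], y[train_idx])
        y_pred = model.predict(X[val_idx])
        errors.append(np.mean((y[val_idx] - y_pred) ** 2))

    return np.mean(errors)  # average validation MSE across the k folds
```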

Implement generic k-fold cross-validation and bootstrapping model selection methods.
## Code Files
1. **`main.py`**:
- Demonstrates the usage of the implemented k-fold cross-validation and bootstrapping methods.
- Uses a simple linear regression model on synthetic data as an example.

In your README, answer the following questions:
2. **`linear_regression.py`**:
- Implements a basic Linear Regression model using the **Normal Equation**.
- Includes methods for fitting the model (`fit()`) and making predictions (`predict()`).

* Do your cross-validation and bootstrapping model selectors agree with a simpler model selector like AIC in simple cases (like linear regression)?
* In what cases might the methods you've written fail or give incorrect or undesirable results?
* What could you implement given more time to mitigate these cases or help users of your methods?
* What parameters have you exposed to your users in order to use your model selectors.
3. **`model_selection.py`**:
- Contains implementations for:
- **k-Fold Cross-Validation**: Evaluates model performance using \( k \)-fold splitting.
- **Bootstrapping**: Evaluates model performance using random sampling with replacement.

See sections 7.10-7.11 of Elements of Statistical Learning and the lecture notes. Pay particular attention to Section 7.10.2.
## Functions Overview

As usual, above-and-beyond efforts will be considered for bonus points.
| **Function** | **Description** |
|-----------------------------|-------------------------------------------------------------------------------------------------------|
| **`k_fold_cross_validation`** | Performs k-fold cross-validation on the given model by splitting the data into `k` folds, training on `k-1` folds, and testing on the remaining fold. Returns the average error across all folds. |
| **`bootstrapping`** | Implements bootstrapping with out-of-bag (OOB) evaluation by resampling data with replacement. Calculates the average error across all bootstrap iterations. |
| **`LinearRegression.fit`** | Fits a linear regression model to the given training data using the Normal Equation. Calculates weights and intercept for the model. |
| **`LinearRegression.predict`** | Predicts target values for the given input data based on the weights and intercept obtained from the `fit` method. |
| **`main.py`** | Combines all components to generate synthetic data, perform k-fold cross-validation and bootstrapping, and print the results. |
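For reference, `LinearRegression.fit` solves for the weights in closed form via the Normal Equation, after a bias (intercept) column is prepended to \( X \):

\[
\hat{w} = (X^\top X)^{-1} X^\top y
\]

`LinearRegression.predict` then prepends the same bias column and returns \( X\hat{w} \).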

## How to Run the Code

1. Clone the repository and navigate to the project directory.
2. Install the required Python libraries (add `matplotlib` if you also want to run the bonus visualization script):
```bash
pip install numpy scikit-learn
```
3. Run the main script:
```bash
python main.py
```
4. Observe the cross-validation and bootstrapping errors printed in the console, for example:
```text
5-Fold Cross-Validation Error (MSE): 0.9938904780907099
Bootstrap Error (MSE): 1.0599287186388127
```

## Key Questions

### 1. Do your cross-validation and bootstrapping model selectors agree with a simpler model selector like AIC in simple cases (like linear regression)?

Yes, in simple cases like linear regression, cross-validation and bootstrapping often agree with a simpler model selector like the Akaike Information Criterion (AIC).
- **Cross-validation and bootstrapping** directly estimate the model's predictive performance on unseen data by simulating multiple train-test splits.
- **AIC (Akaike Information Criterion)**, on the other hand, penalizes model complexity to discourage over-fitting, but it relies on an assumed underlying likelihood function for the model.
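
As a concrete way to make that comparison (not part of this repository), AIC for an ordinary least-squares fit with Gaussian errors can be computed from the residual sum of squares and ranked alongside the cross-validation/bootstrap MSE. The helper below is only a sketch, and `aic_linear_regression` is a hypothetical name:

```python
import numpy as np

def aic_linear_regression(model, X, y):
    # AIC for OLS with Gaussian noise, constant terms dropped:
    # AIC = n * ln(RSS / n) + 2p, with p = number of fitted parameters.
    n = len(y)
    residuals = y - model.predict(X)   # model must already be fitted
    rss = np.sum(residuals ** 2)
    p = X.shape[1] + 2                 # feature weights + intercept + noise variance
    return n * np.log(rss / n) + 2 * p
```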

---

### 2. In what cases might the methods you've written fail or give incorrect or undesirable results?

The methods may fail or give incorrect results in the following cases:
1. **Imbalanced Datasets**:
- If the dataset is highly imbalanced, random splits in cross-validation or bootstrapping may fail to represent minority classes adequately, leading to biased error estimates.
2. **Small Datasets**:
- With limited data, random splitting in both methods might cause high variance in error estimates due to insufficient training data or small validation sets.
3. **Correlated features or data points**:
- Cross-validation and bootstrapping may underestimate error when features or data points are correlated, because random splits can leak that correlation structure between the training and validation sets.
4. **Violated model assumptions**:
- Our example uses linear regression, whose edge cases and assumptions we have handled; the selectors themselves, however, accept any model exposing `fit()` and `predict()`.
- If a supplied model's assumptions are inappropriate for the dataset, the methods may yield unreliable performance estimates.
5. **Computational Constraints**:
- For very large datasets, the computational overhead of these methods might be impractical without optimized implementations.
6. **Non-IID data**:
- If data isn't independent and identically distributed, random splits may ignore dependencies, resulting in unreliable error estimates.
7. **Overlap in bootstrapping samples**:
- Bootstrapping involves sampling with replacement, which means some samples can appear multiple times in a single bootstrap iteration.
- This can bias the model towards overfitting specific samples in the training set, particularly in small datasets.

---

### 3. What could you implement given more time to mitigate these cases or help users of your methods?

1. **Balanced k-Fold Cross-Validation**:
- For imbalanced datasets, we can implement balanced sampling to ensure proportional representation of each class in all folds.
- This will provide unbiased error estimates for minority classes.
2. **Parallelization**:
- Can use parallel computing libraries to speed up the computation for both cross-validation and bootstrapping.
- This can make the methods practical for large datasets or complex models.
3. **Dimensionality Reduction**:
- Add automatic feature selection or dimensionality reduction techniques to better handle high-dimensional datasets or datasets with strongly correlated features.
4. **Handling Overlap in Bootstrapping**:
- Add checks to identify excessive sample repetition in bootstrapping iterations and adjust sampling strategies dynamically to ensure diversity in bootstrap samples.
5. **Blocked Cross-Validation**:
- For non-IID data, we could implement blocked or grouped cross-validation, which keeps dependent observations together by splitting the data into meaningful groups (see the sketch after this list).
6. **OOB Evaluation Improvements**:
- More robust handling of bootstrap edge cases.
7. **Documentation and Examples**:
- Provide more detailed documentation and examples to help users understand and apply the methods effectively.
8. **Error handling**:
- Add more error handling for scenarios where the methods might break down, for example too few data points, highly imbalanced data, or strongly correlated features.
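
For item 5 above, a grouped (blocked) split could be sketched as below; nothing like this exists in the repository yet, and the `groups` array is an assumed per-row group label:

```python
import numpy as np

def grouped_k_fold_indices(groups, k=5, random_seed=None):
    # Assign whole groups to folds so correlated rows never straddle
    # the train/validation boundary.
    rng = np.random.default_rng(random_seed)
    unique_groups = rng.permutation(np.unique(groups))
    for fold_groups in np.array_split(unique_groups, k):
        val_mask = np.isin(groups, fold_groups)
        yield np.where(~val_mask)[0], np.where(val_mask)[0]
```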

---

### 4. What parameters have you exposed to your users in order to use your model selectors?

#### **k-Fold Cross-Validation Parameters**:
1. **`model`**: A machine learning model with `fit()` and `predict()` methods.
2. **`X` and `y`**: Input features and target values.
3. **`k`**: Number of folds (default = 5).
4. **`random_seed`**: Seed for reproducibility of random splits (optional).

#### **Bootstrapping Parameters**:
1. **`model`**: A machine learning model with `fit()` and `predict()` methods.
2. **`X` and `y`**: Input features and target values.
3. **`n_iterations`**: Number of bootstrap iterations (default = 100).
4. **`test_size`**: Proportion of OOB samples (default = 0.3).
5. **`random_seed`**: Seed for reproducibility (optional).
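
Putting the bootstrapping parameters together, a minimal sketch might look as follows. The repository's `bootstrapping` in `model_selection.py` is the authoritative version; in particular, treating `test_size` as controlling the training-sample size (so roughly that fraction of rows ends up out-of-bag) is an assumption made only for this sketch:

```python
import numpy as np

def bootstrapping_sketch(model, X, y, n_iterations=100, test_size=0.3, random_seed=None):
    """Illustrative only -- see model_selection.py for the real implementation."""
    rng = np.random.default_rng(random_seed)
    n = len(X)
    n_train = int(n * (1 - test_size))  # assumed meaning of test_size
    errors = []
    for _ in range(n_iterations):
        # Draw a bootstrap training sample with replacement; rows never
        # drawn form the out-of-bag (OOB) validation set.
        boot_idx = rng.integers(0, n, size=n_train)
        oob_idx = np.setdiff1d(np.arange(n), boot_idx)
        if oob_idx.size == 0:
            continue  # no OOB rows this iteration
        model.fit(X[boot_idx], y[boot_idx])
        y_pred = model.predict(X[oob_idx])
        errors.append(np.mean((y[oob_idx] - y_pred) ** 2))
    return np.mean(errors) if errors else None
```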

## Extra Credits

- The **BONUS CREDIT/visualization_notebook.py** script provides enhanced data visualizations and performance analysis for the implemented linear regression model using k-fold cross-validation and bootstrapping.
22 changes: 22 additions & 0 deletions linear_regression.py
@@ -0,0 +1,22 @@
# linear_regression.py

import numpy as np

class LinearRegression:
    def __init__(self):
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        # Adding bias term (intercept) to the features
        X = np.hstack((np.ones((X.shape[0], 1)), X))

        # Closed-form solution to linear regression (Normal Equation)
        # w = (X^T X)^-1 X^T y
        X_transpose = X.T
        self.weights = np.linalg.inv(X_transpose @ X) @ X_transpose @ y

    def predict(self, X):
        # Adding bias term to test data
        X = np.hstack((np.ones((X.shape[0], 1)), X))
        return X @ self.weights
21 changes: 21 additions & 0 deletions main.py
@@ -0,0 +1,21 @@
# main.py

import numpy as np
from model_selection import k_fold_cross_validation, bootstrapping
from linear_regression import LinearRegression

# Generate synthetic data
np.random.seed(0)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X[:, 0] + np.random.randn(100)

# Any model exposing fit() and predict() works here; we use our LinearRegression
model = LinearRegression()

# Perform 5-fold cross-validation
kfold_error = k_fold_cross_validation(model, X, y, k=5, random_seed=42)
print("5-Fold Cross-Validation Error (MSE):", kfold_error)

# Perform bootstrap with 100 iterations
bootstrap_error = bootstrapping(model, X, y, n_iterations=100, test_size=0.33, random_seed=42)
print("Bootstrap Error (MSE):", bootstrap_error)