2 changes: 2 additions & 0 deletions .gitignore
@@ -2,6 +2,7 @@
__pycache__/
*.py[cod]
*$py.class
.DS_Store

# C extensions
*.so
@@ -160,3 +161,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/
Binary file added BONUS CREDIT/plot1.jpeg
Binary file added BONUS CREDIT/plot2.jpeg
Binary file added BONUS CREDIT/plot3.jpeg
113 changes: 113 additions & 0 deletions BONUS CREDIT/visualization_notebook.py
@@ -0,0 +1,113 @@
# visualization_notebook.py
import numpy as np
import matplotlib.pyplot as plt
from linear_regression import LinearRegression
from model_selection import k_fold_cross_validation, bootstrapping

# Set style for better visualizations
plt.style.use('tableau-colorblind10')

# Generate synthetic data with more features
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X[:, 0] + np.random.randn(100)

# Initialize model
model = LinearRegression()
model.fit(X, y)

# 1. Enhanced Regression Plot
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='#1f77b4', alpha=0.5, label='Data points')
X_test = np.linspace(0, 2, 100).reshape(-1, 1)
y_pred = model.predict(X_test)

# Add confidence intervals
y_std = np.std(y - model.predict(X))
plt.plot(X_test, y_pred, color='#d62728', label='Regression line', linewidth=2)
plt.fill_between(X_test.flatten(),
                 y_pred - 2*y_std,
                 y_pred + 2*y_std,
                 alpha=0.2,
                 color='#d62728',
                 label='95% Confidence interval')
plt.title('Linear Regression with Confidence Intervals')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.grid(True, alpha=0.3)

# 2. K-fold Cross-validation Analysis
k_values = [3, 5, 7, 10]
cv_errors = []

for k in k_values:
    error = k_fold_cross_validation(model, X, y, k=k, random_seed=42)
    cv_errors.append(error)

plt.figure(figsize=(8, 5))
plt.plot(k_values, cv_errors, marker='o', linestyle='-',
         color='#2ca02c', linewidth=2, markersize=8)
plt.title('Cross-validation Error vs K-folds')
plt.xlabel('Number of folds')
plt.ylabel('Mean Squared Error')
plt.grid(True, alpha=0.3)

# 3. Bootstrap Distribution Plot
n_iterations = 100
bootstrap_errors = []

for _ in range(n_iterations):
    error = bootstrapping(model, X, y, n_iterations=1, test_size=0.3, random_seed=None)
    if error is not None:
        bootstrap_errors.append(error)

plt.figure(figsize=(8, 5))
plt.hist(bootstrap_errors, bins=20, density=True, alpha=0.7,
         color='#ff7f0e', edgecolor='black')
plt.axvline(np.mean(bootstrap_errors), color='#d62728', linestyle='--',
            label=f'Mean MSE: {np.mean(bootstrap_errors):.3f}')
plt.title('Bootstrap Error Distribution')
plt.xlabel('Mean Squared Error')
plt.ylabel('Density')
plt.legend()

# 4. Residual Analysis
y_pred_all = model.predict(X)
residuals = y - y_pred_all

plt.figure(figsize=(8, 5))
plt.scatter(y_pred_all, residuals, alpha=0.5, color='#9467bd')
plt.axhline(y=0, color='#d62728', linestyle='--')
plt.title('Residual Plot')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.grid(True, alpha=0.3)

# 5. Learning Curve
train_sizes = np.linspace(0.1, 1.0, 10)
train_errors = []
val_errors = []

for size in train_sizes:
    n_samples = int(len(X) * size)
    X_subset = X[:n_samples]
    y_subset = y[:n_samples]
    train_error = k_fold_cross_validation(model, X_subset, y_subset, k=5)
    val_error = k_fold_cross_validation(model, X_subset, y_subset, k=3)
    train_errors.append(train_error)
    val_errors.append(val_error)

plt.figure(figsize=(8, 5))
plt.plot(train_sizes * 100, train_errors, label='Training Error',
         color='#1f77b4', linewidth=2)
plt.plot(train_sizes * 100, val_errors, label='Validation Error',
         color='#d62728', linewidth=2)
plt.title('Learning Curve')
plt.xlabel('Training Set Size (%)')
plt.ylabel('Mean Squared Error')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
146 changes: 128 additions & 18 deletions README.md
@@ -1,29 +1,139 @@
# Project 2
# Project 2: MODEL SELECTION

Select one of the following two options:
**Course:** CS584 - Machine Learning <br>
**Instructor:** Steve Avsec<br>
**Group Members:**
- [email protected] (FNU Saurav) - A20536122
- [email protected] (Pallavi Savant) - A20540976
- [email protected] (Satwik Sinha) - A20547790
- [email protected] (Aditya Ramchandra Kutre) - A20544809
- [email protected] (Tejaswi Yerra) - A20545536

## Boosting Trees

Implement the gradient-boosting tree algorithm (with the usual fit-predict interface) as described in Sections 10.9-10.10 of Elements of Statistical Learning (2nd Edition). Answer the questions below as you did for Project 1.
## Project Overview

Put your README below. Answer the following questions.
This project implements two model selection techniques:
1. **k-Fold Cross-Validation**:
- Evaluates a machine learning model by splitting the dataset into \( k \) folds and using each fold as a validation set while training on the remaining \( k-1 \) folds.
- The average loss (e.g., Mean Squared Error) across all folds is calculated to estimate the model’s predictive performance.

* What does the model you have implemented do and when should it be used?
* How did you test your model to determine if it is working reasonably correctly?
* What parameters have you exposed to users of your implementation in order to tune performance? (Also perhaps provide some basic usage examples.)
* Are there specific inputs that your implementation has trouble with? Given more time, could you work around these or is it fundamental?
2. **Bootstrapping**:
- Evaluates a model by generating multiple bootstrap samples (random sampling with replacement) from the dataset and using the out-of-bag (OOB) samples for validation.
- The average error across all bootstrap iterations is computed to measure model performance.

## Model Selection
Both methods are implemented for general-purpose models that provide `fit()` and `predict()` methods.
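For orientation, the following is a minimal sketch of how a k-fold routine could be written against that `fit()`/`predict()` contract. The actual implementation lives in `model_selection.py` (not shown in this diff), so the name and details below are illustrative only:

```python
import numpy as np

def k_fold_cross_validation_sketch(model, X, y, k=5, random_seed=None):
    """Illustrative only -- see model_selection.py for the real implementation."""
    rng = np.random.default_rng(random_seed)
    folds = np.array_split(rng.permutation(len(X)), k)  # shuffled, near-equal folds

    errors = []
    for i in range(k):
        val_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(k) if j != i])

        # Train on k-1 folds, score MSE on the held-out fold.
        model.fit(X[train_idx], y[train_idx])
        y_pred = model.predict(X[val_idx])
        errors.append(np.mean((y[val_idx] - y_pred) ** 2))

    return np.mean(errors)  # average validation MSE across the k folds
```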

Implement generic k-fold cross-validation and bootstrapping model selection methods.
## Code Files
1. **`main.py`**:
- Demonstrates the usage of the implemented k-fold cross-validation and bootstrapping methods.
- Uses a simple linear regression model on synthetic data as an example.

In your README, answer the following questions:
2. **`linear_regression.py`**:
- Implements a basic Linear Regression model using the **Normal Equation**.
- Includes methods for fitting the model (`fit()`) and making predictions (`predict()`).

* Do your cross-validation and bootstrapping model selectors agree with a simpler model selector like AIC in simple cases (like linear regression)?
* In what cases might the methods you've written fail or give incorrect or undesirable results?
* What could you implement given more time to mitigate these cases or help users of your methods?
* What parameters have you exposed to your users in order to use your model selectors.
3. **`model_selection.py`**:
- Contains implementations for:
- **k-Fold Cross-Validation**: Evaluates model performance using \( k \)-fold splitting.
- **Bootstrapping**: Evaluates model performance using random sampling with replacement.

See sections 7.10-7.11 of Elements of Statistical Learning and the lecture notes. Pay particular attention to Section 7.10.2.
## Functions Overview

As usual, above-and-beyond efforts will be considered for bonus points.
| **Function** | **Description** |
|-----------------------------|-------------------------------------------------------------------------------------------------------|
| **`k_fold_cross_validation`** | Performs k-fold cross-validation on the given model by splitting the data into `k` folds, training on `k-1` folds, and testing on the remaining fold. Returns the average error across all folds. |
| **`bootstrapping`** | Implements bootstrapping with out-of-bag (OOB) evaluation by resampling data with replacement. Calculates the average error across all bootstrap iterations. |
| **`LinearRegression.fit`** | Fits a linear regression model to the given training data using the Normal Equation. Calculates weights and intercept for the model. |
| **`LinearRegression.predict`** | Predicts target values for the given input data based on the weights and intercept obtained from the `fit` method. |
| **`main.py`** | Combines all components to generate synthetic data, perform k-fold cross-validation and bootstrapping, and print the results. |
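For reference, `LinearRegression.fit` solves for the weights in closed form via the Normal Equation, after a bias (intercept) column is prepended to \( X \):

\[
\hat{w} = (X^\top X)^{-1} X^\top y
\]

`LinearRegression.predict` then prepends the same bias column and returns \( X\hat{w} \).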

## How to Run the Code

1. Clone the repository and navigate to the project directory.
2. Install the required Python libraries (add `matplotlib` if you also want to run the bonus visualization script):
```bash
pip install numpy scikit-learn
```
3. Run the main script:
```bash
python main.py
```
4. Observe the cross-validation and bootstrapping errors printed in the console, for example:
```text
5-Fold Cross-Validation Error (MSE): 0.9938904780907099
Bootstrap Error (MSE): 1.0599287186388127
```

## Key Questions

### 1. Do your cross-validation and bootstrapping model selectors agree with a simpler model selector like AIC in simple cases (like linear regression)?

Yes, in simple cases like linear regression, cross-validation and bootstrapping often agree with a simpler model selector like the Akaike Information Criterion (AIC).
- **Cross-validation and bootstrapping** directly estimate the model's predictive performance on unseen data by simulating multiple train-test splits.
- **AIC (Akaike Information Criterion)**, on the other hand, penalizes model complexity to discourage over-fitting, but it relies on an assumed underlying likelihood function for the model.
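
As a concrete way to make that comparison (not part of this repository), AIC for an ordinary least-squares fit with Gaussian errors can be computed from the residual sum of squares and ranked alongside the cross-validation/bootstrap MSE. The helper below is only a sketch, and `aic_linear_regression` is a hypothetical name:

```python
import numpy as np

def aic_linear_regression(model, X, y):
    # AIC for OLS with Gaussian noise, constant terms dropped:
    # AIC = n * ln(RSS / n) + 2p, with p = number of fitted parameters.
    n = len(y)
    residuals = y - model.predict(X)   # model must already be fitted
    rss = np.sum(residuals ** 2)
    p = X.shape[1] + 2                 # feature weights + intercept + noise variance
    return n * np.log(rss / n) + 2 * p
```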

---

### 2. In what cases might the methods you've written fail or give incorrect or undesirable results?

The methods may fail or give incorrect results in the following cases:
1. **Imbalanced Datasets**:
- If the dataset is highly imbalanced, random splits in cross-validation or bootstrapping may fail to represent minority classes adequately, leading to biased error estimates.
2. **Small Datasets**:
- With limited data, random splitting in both methods might cause high variance in error estimates due to insufficient training data or small validation sets.
3. **Correlated features or data points**:
- Cross-validation and bootstrapping may underestimate error when features or data points are correlated, because random splits can leak that correlation structure between the training and validation sets.
4. **Violated model assumptions**:
- Our example uses linear regression, whose edge cases and assumptions we have handled; the selectors themselves, however, accept any model exposing `fit()` and `predict()`.
- If a supplied model's assumptions are inappropriate for the dataset, the methods may yield unreliable performance estimates.
5. **Computational Constraints**:
- For very large datasets, the computational overhead of these methods might be impractical without optimized implementations.
6. **Non-IID data**:
- If data isn't independent and identically distributed, random splits may ignore dependencies, resulting in unreliable error estimates.
7. **Overlap in bootstrapping samples**:
- Bootstrapping involves sampling with replacement, which means some samples can appear multiple times in a single bootstrap iteration.
- This can bias the model towards overfitting specific samples in the training set, particularly in small datasets.

---

### 3. What could you implement given more time to mitigate these cases or help users of your methods?

1. **Balanced k-Fold Cross-Validation**:
- For imbalanced datasets, we can implement balanced sampling to ensure proportional representation of each class in all folds.
- This will provide unbiased error estimates for minority classes.
2. **Parallelization**:
- Can use parallel computing libraries to speed up the computation for both cross-validation and bootstrapping.
- This can make the methods practical for large datasets or complex models.
3. **Dimensionality Reduction**:
- Add automatic feature selection or dimensionality reduction techniques to better handle high-dimensional datasets or datasets with strongly correlated features.
4. **Handling Overlap in Bootstrapping**:
- Add checks to identify excessive sample repetition in bootstrapping iterations and adjust sampling strategies dynamically to ensure diversity in bootstrap samples.
5. **Blocked Cross-Validation**:
- For non-IID data, we could implement blocked or grouped cross-validation, which keeps dependent observations together by splitting the data into meaningful groups (see the sketch after this list).
6. **OOB Evaluation Improvements**:
- More robust handling of bootstrap edge cases.
7. **Documentation and Examples**:
- Provide more detailed documentation and examples to help users understand and apply the methods effectively.
8. **Error handling**:
- Add more error handling for scenarios where the methods might break down, for example too few data points, highly imbalanced data, or strongly correlated features.
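
For item 5 above, a grouped (blocked) split could be sketched as below; nothing like this exists in the repository yet, and the `groups` array is an assumed per-row group label:

```python
import numpy as np

def grouped_k_fold_indices(groups, k=5, random_seed=None):
    # Assign whole groups to folds so correlated rows never straddle
    # the train/validation boundary.
    rng = np.random.default_rng(random_seed)
    unique_groups = rng.permutation(np.unique(groups))
    for fold_groups in np.array_split(unique_groups, k):
        val_mask = np.isin(groups, fold_groups)
        yield np.where(~val_mask)[0], np.where(val_mask)[0]
```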

---

### 4. What parameters have you exposed to your users in order to use your model selectors?

#### **k-Fold Cross-Validation Parameters**:
1. **`model`**: A machine learning model with `fit()` and `predict()` methods.
2. **`X` and `y`**: Input features and target values.
3. **`k`**: Number of folds (default = 5).
4. **`random_seed`**: Seed for reproducibility of random splits (optional).

#### **Bootstrapping Parameters**:
1. **`model`**: A machine learning model with `fit()` and `predict()` methods.
2. **`X` and `y`**: Input features and target values.
3. **`n_iterations`**: Number of bootstrap iterations (default = 100).
4. **`test_size`**: Proportion of OOB samples (default = 0.3).
5. **`random_seed`**: Seed for reproducibility (optional).
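
Putting the bootstrapping parameters together, a minimal sketch might look as follows. The repository's `bootstrapping` in `model_selection.py` is the authoritative version; in particular, treating `test_size` as controlling the training-sample size (so roughly that fraction of rows ends up out-of-bag) is an assumption made only for this sketch:

```python
import numpy as np

def bootstrapping_sketch(model, X, y, n_iterations=100, test_size=0.3, random_seed=None):
    """Illustrative only -- see model_selection.py for the real implementation."""
    rng = np.random.default_rng(random_seed)
    n = len(X)
    n_train = int(n * (1 - test_size))  # assumed meaning of test_size
    errors = []
    for _ in range(n_iterations):
        # Draw a bootstrap training sample with replacement; rows never
        # drawn form the out-of-bag (OOB) validation set.
        boot_idx = rng.integers(0, n, size=n_train)
        oob_idx = np.setdiff1d(np.arange(n), boot_idx)
        if oob_idx.size == 0:
            continue  # no OOB rows this iteration
        model.fit(X[boot_idx], y[boot_idx])
        y_pred = model.predict(X[oob_idx])
        errors.append(np.mean((y[oob_idx] - y_pred) ** 2))
    return np.mean(errors) if errors else None
```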

## Extra Credits

- The **BONUS CREDIT/visualization_notebook.py** script provides enhanced data visualizations and performance analysis for the implemented linear regression model using k-fold cross-validation and bootstrapping.
22 changes: 22 additions & 0 deletions linear_regression.py
@@ -0,0 +1,22 @@
# linear_regression.py

import numpy as np

class LinearRegression:
    def __init__(self):
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        # Adding bias term (intercept) to the features
        X = np.hstack((np.ones((X.shape[0], 1)), X))

        # Closed-form solution to linear regression (Normal Equation)
        # w = (X^T X)^-1 X^T y
        X_transpose = X.T
        self.weights = np.linalg.inv(X_transpose @ X) @ X_transpose @ y

    def predict(self, X):
        # Adding bias term to test data
        X = np.hstack((np.ones((X.shape[0], 1)), X))
        return X @ self.weights
21 changes: 21 additions & 0 deletions main.py
@@ -0,0 +1,21 @@
# main.py

import numpy as np
from model_selection import k_fold_cross_validation, bootstrapping
from linear_regression import LinearRegression

# Generate synthetic data
np.random.seed(0)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X[:, 0] + np.random.randn(100)

# Any model exposing fit() and predict() works here; we use our LinearRegression
model = LinearRegression()

# Perform 5-fold cross-validation
kfold_error = k_fold_cross_validation(model, X, y, k=5, random_seed=42)
print("5-Fold Cross-Validation Error (MSE):", kfold_error)

# Perform bootstrap with 100 iterations
bootstrap_error = bootstrapping(model, X, y, n_iterations=100, test_size=0.33, random_seed=42)
print("Bootstrap Error (MSE):", bootstrap_error)