diff --git a/docs/neural-network/hidden-layers/activation.md b/docs/neural-network/hidden-layers/activation.md index a4e4cde73..57d4dc46c 100644 --- a/docs/neural-network/hidden-layers/activation.md +++ b/docs/neural-network/hidden-layers/activation.md @@ -1,4 +1,4 @@ -[source] +[source] # Activation Activation layers apply a user-defined non-linear activation function to their inputs. They often work in conjunction with [Dense](dense.md) layers as a way to transform their output. @@ -10,8 +10,8 @@ Activation layers apply a user-defined non-linear activation function to their i ## Example ```php -use Rubix\ML\NeuralNet\Layers\Activation; -use Rubix\ML\NeuralNet\ActivationFunctions\ReLU; +use Rubix\ML\NeuralNet\Layers\Activation\Activation; +use Rubix\ML\NeuralNet\ActivationFunctions\ReLU\ReLU; $layer = new Activation(new ReLU()); -``` \ No newline at end of file +``` diff --git a/docs/neural-network/hidden-layers/batch-norm.md b/docs/neural-network/hidden-layers/batch-norm.md index 99fdefd22..373113e14 100644 --- a/docs/neural-network/hidden-layers/batch-norm.md +++ b/docs/neural-network/hidden-layers/batch-norm.md @@ -1,4 +1,4 @@ -[source] +[source] # Batch Norm Batch Norm layers normalize the activations of the previous layer such that the mean activation is *close* to 0 and the standard deviation is *close* to 1. Adding Batch Norm reduces the amount of covariate shift within the network which makes it possible to use higher learning rates and thus converge faster under some circumstances. @@ -12,12 +12,12 @@ Batch Norm layers normalize the activations of the previous layer such that the ## Example ```php -use Rubix\ML\NeuralNet\Layers\BatchNorm; -use Rubix\ML\NeuralNet\Initializers\Constant; -use Rubix\ML\NeuralNet\Initializers\Normal; +use Rubix\ML\NeuralNet\Layers\BatchNorm\BatchNorm; +use Rubix\ML\NeuralNet\Initializers\Constant\Constant; +use Rubix\ML\NeuralNet\Initializers\Normal\Normal; $layer = new BatchNorm(0.7, new Constant(0.), new Normal(1.)); ``` ## References -[^1]: S. Ioffe et al. (2015). Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. \ No newline at end of file +[^1]: S. Ioffe et al. (2015). Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. diff --git a/docs/neural-network/optimizers/adagrad.md b/docs/neural-network/optimizers/adagrad.md index 9cfddff25..2e55a9953 100644 --- a/docs/neural-network/optimizers/adagrad.md +++ b/docs/neural-network/optimizers/adagrad.md @@ -1,8 +1,24 @@ -[source] +[source] # AdaGrad Short for *Adaptive Gradient*, the AdaGrad Optimizer speeds up the learning of parameters that do not change often and slows down the learning of parameters that do enjoy heavy activity. Due to AdaGrad's infinitely decaying step size, training may be slow or fail to converge using a low learning rate. +## Mathematical formulation +Per step (element-wise), AdaGrad accumulates the sum of squared gradients and scales the update by the root of this sum: + +$$ +\begin{aligned} +\mathbf{n}_t &= \mathbf{n}_{t-1} + \mathbf{g}_t^{2} \\ +\Delta{\theta}_t &= \alpha\, \frac{\mathbf{g}_t}{\sqrt{\mathbf{n}_t} + \varepsilon} +\end{aligned} +$$ + +where: +- $t$ is the current step, +- $\alpha$ is the learning rate (`rate`), +- $\mathbf{g}_t$ is the current gradient, and $\mathbf{g}_t^{2}$ denotes element-wise square, +- $\varepsilon$ is a small constant for numerical stability (in the implementation, the denominator is clipped from below by `EPSILON`). 
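+
+As a quick sanity check, here is a minimal plain-PHP scalar sketch of a single AdaGrad step (not the library API; the `EPSILON` value below is an assumed stand-in for the library constant):
+
+```php
+$rate = 0.125;
+$epsilon = 1e-8; // assumed stand-in for EPSILON
+
+$norm = 0.0;     // running sum of squared gradients for one parameter
+$g = 0.5;        // current gradient
+
+$norm += $g ** 2;
+
+$step = $rate * $g / max(sqrt($norm), $epsilon);
+```
+
+Because the accumulated sum only ever grows, the effective step size shrinks monotonically, which is the decaying behaviour described above.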
+ ## Parameters | # | Name | Default | Type | Description | |---|---|---|---|---| @@ -10,10 +26,10 @@ Short for *Adaptive Gradient*, the AdaGrad Optimizer speeds up the learning of p ## Example ```php -use Rubix\ML\NeuralNet\Optimizers\AdaGrad; +use Rubix\ML\NeuralNet\Optimizers\AdaGrad\AdaGrad; $optimizer = new AdaGrad(0.125); ``` ## References -[^1]: J. Duchi et al. (2011). Adaptive Subgradient Methods for Online Learning and Stochastic Optimization. \ No newline at end of file +[^1]: J. Duchi et al. (2011). Adaptive Subgradient Methods for Online Learning and Stochastic Optimization. diff --git a/docs/neural-network/optimizers/adam.md b/docs/neural-network/optimizers/adam.md index 3b9898649..d10a469f3 100644 --- a/docs/neural-network/optimizers/adam.md +++ b/docs/neural-network/optimizers/adam.md @@ -1,8 +1,27 @@ -[source] +[source] # Adam Short for *Adaptive Moment Estimation*, the Adam Optimizer combines both Momentum and RMS properties. In addition to storing an exponentially decaying average of past squared gradients like [RMSprop](rms-prop.md), Adam also keeps an exponentially decaying average of past gradients, similar to [Momentum](momentum.md). Whereas Momentum can be seen as a ball running down a slope, Adam behaves like a heavy ball with friction. +## Mathematical formulation +Per step (element-wise), Adam maintains exponentially decaying moving averages of the gradient and its element-wise square and uses them to scale the update: + +$$ +\begin{aligned} +\mathbf{v}_t &= (1 - \beta_1)\,\mathbf{v}_{t-1} + \beta_1\,\mathbf{g}_t \\ +\mathbf{n}_t &= (1 - \beta_2)\,\mathbf{n}_{t-1} + \beta_2\,\mathbf{g}_t^{2} \\ +\Delta{\theta}_t &= \alpha\, \frac{\mathbf{v}_t}{\sqrt{\mathbf{n}_t} + \varepsilon} +\end{aligned} +$$ + +where: +- $t$ is the current step, +- $\alpha$ is the learning rate (`rate`), +- $\beta_1$ is the momentum decay (`momentumDecay`), +- $\beta_2$ is the norm decay (`normDecay`), +- $\mathbf{g}_t$ is the current gradient, and $\mathbf{g}_t^{2}$ denotes element-wise square, +- $\varepsilon$ is a small constant for numerical stability (in the implementation, the denominator is clipped from below by `EPSILON`). + ## Parameters | # | Name | Default | Type | Description | |---|---|---|---|---| @@ -12,10 +31,10 @@ Short for *Adaptive Moment Estimation*, the Adam Optimizer combines both Momentu ## Example ```php -use Rubix\ML\NeuralNet\Optimizers\Adam; +use Rubix\ML\NeuralNet\Optimizers\Adam\Adam; $optimizer = new Adam(0.0001, 0.1, 0.001); ``` ## References -[^1]: D. P. Kingma et al. (2014). Adam: A Method for Stochastic Optimization. \ No newline at end of file +[^1]: D. P. Kingma et al. (2014). Adam: A Method for Stochastic Optimization. diff --git a/docs/neural-network/optimizers/adamax.md b/docs/neural-network/optimizers/adamax.md index 6b1d9ea05..ff02f925a 100644 --- a/docs/neural-network/optimizers/adamax.md +++ b/docs/neural-network/optimizers/adamax.md @@ -1,8 +1,27 @@ -[source] +[source] # AdaMax A version of the [Adam](adam.md) optimizer that replaces the RMS property with the infinity norm of the past gradients. As such, AdaMax is generally more suitable for sparse parameter updates and noisy gradients. 
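+
+To make the contrast concrete, here is a minimal plain-PHP scalar sketch of the two accumulators (not the library API), using the same `normDecay` convention as the formulation below:
+
+```php
+$beta2 = 0.001;  // normDecay
+$g = 0.5;        // current gradient for a single parameter
+
+$n = 0.0;        // Adam: running average of squared gradients (RMS-style)
+$u = 0.0;        // AdaMax: decayed infinity norm of past gradients
+
+$n = $n + $beta2 * ($g ** 2 - $n);
+$u = max((1.0 - $beta2) * $u, abs($g));
+```
+
+The infinity norm tracks the largest recent gradient magnitude rather than an average, which is why AdaMax tends to behave better with sparse or noisy gradients.
+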
+## Mathematical formulation
+Per step (element-wise), AdaMax maintains an exponentially decaying moving average of the gradient (velocity) and an infinity-norm accumulator of past gradients, and uses them to scale the update:
+
+$$
+\begin{aligned}
+\mathbf{v}_t &= (1 - \beta_1)\,\mathbf{v}_{t-1} + \beta_1\,\mathbf{g}_t \\
+\mathbf{u}_t &= \max\big((1 - \beta_2)\,\mathbf{u}_{t-1},\ |\mathbf{g}_t|\big) \\
+\Delta{\theta}_t &= \alpha\, \frac{\mathbf{v}_t}{\max(\mathbf{u}_t, \varepsilon)}
+\end{aligned}
+$$
+
+where:
+- $t$ is the current step,
+- $\alpha$ is the learning rate (`rate`),
+- $\beta_1$ is the momentum decay (`momentumDecay`),
+- $\beta_2$ is the norm decay (`normDecay`),
+- $\mathbf{g}_t$ is the current gradient and $|\mathbf{g}_t|$ denotes element-wise absolute value,
+- $\varepsilon$ is a small constant for numerical stability (in the implementation, the denominator is clipped from below by `EPSILON`).
+
 ## Parameters
 | # | Name | Default | Type | Description |
 |---|---|---|---|---|
@@ -12,10 +31,10 @@ A version of the [Adam](adam.md) optimizer that replaces the RMS property with t
 ## Example
 ```php
-use Rubix\ML\NeuralNet\Optimizers\AdaMax;
+use Rubix\ML\NeuralNet\Optimizers\AdaMax\AdaMax;
 
 $optimizer = new AdaMax(0.0001, 0.1, 0.001);
 ```
 
 ## References
-[^1]: D. P. Kingma et al. (2014). Adam: A Method for Stochastic Optimization.
\ No newline at end of file
+[^1]: D. P. Kingma et al. (2014). Adam: A Method for Stochastic Optimization.
diff --git a/docs/neural-network/optimizers/cyclical.md b/docs/neural-network/optimizers/cyclical.md
index 9773004da..eed8b2779 100644
--- a/docs/neural-network/optimizers/cyclical.md
+++ b/docs/neural-network/optimizers/cyclical.md
@@ -1,8 +1,28 @@
-[source]
+[source]
 
 # Cyclical
 The Cyclical optimizer uses a global learning rate that cycles between the lower and upper bound over a designated period while also decaying the upper bound by a factor at each step. Cyclical learning rates have been shown to help escape bad local minima and saddle points of the gradient.
 
+## Mathematical formulation
+Per step (element-wise), the cyclical learning rate and update are computed as:
+
+$$
+\begin{aligned}
+\text{cycle} &= \left\lfloor 1 + \frac{t}{2\,\text{steps}} \right\rfloor \\
+x &= \left| \frac{t}{\text{steps}} - 2\,\text{cycle} + 1 \right| \\
+\text{scale} &= \text{decay}^{\,t} \\
+\eta_t &= \text{lower} + (\text{upper} - \text{lower})\,\max\bigl(0,\,1 - x\bigr)\,\text{scale} \\
+\Delta\theta_t &= \eta_t\,g_t
+\end{aligned}
+$$
+
+where:
+- $t$ is the current step counter,
+- `steps` is the number of steps in every half cycle,
+- `lower` and `upper` are the learning rate bounds,
+- `decay` is the multiplicative decay applied each step,
+- $g_t$ is the current gradient.
+
 ## Parameters
 | # | Name | Default | Type | Description |
 |---|---|---|---|---|
@@ -13,10 +33,10 @@ The Cyclical optimizer uses a global learning rate that cycles between the lower
 ## Example
 ```php
-use Rubix\ML\NeuralNet\Optimizers\Cyclical;
+use Rubix\ML\NeuralNet\Optimizers\Cyclical\Cyclical;
 
 $optimizer = new Cyclical(0.001, 0.005, 1000);
 ```
 
 ## References
-[^1]: L. N. Smith. (2017). Cyclical Learning Rates for Training Neural Networks.
\ No newline at end of file
+[^1]: L. N. Smith. (2017). Cyclical Learning Rates for Training Neural Networks.
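+
+## Worked example
+A minimal plain-PHP sketch of the schedule above (not the library API), using the default hyper-parameter values:
+
+```php
+$lower = 0.001;
+$upper = 0.006;
+$steps = 2000;   // steps per half cycle
+$decay = 0.99994;
+
+for ($t = 0; $t < 3; ++$t) {
+    $cycle = floor(1 + $t / (2 * $steps));
+    $x = abs($t / $steps - 2 * $cycle + 1);
+
+    $rate = $lower + ($upper - $lower) * max(0, 1 - $x) * $decay ** $t;
+
+    // $rate then scales the current gradient for this step
+}
+```
+
+At $t = 0$ the rate starts at `lower` and then climbs toward the (decayed) upper bound over the first half cycle.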
diff --git a/docs/neural-network/optimizers/momentum.md b/docs/neural-network/optimizers/momentum.md
index 7556ca008..e9c787a2f 100644
--- a/docs/neural-network/optimizers/momentum.md
+++ b/docs/neural-network/optimizers/momentum.md
@@ -1,8 +1,33 @@
-[source]
+[source]
 
 # Momentum
 Momentum accelerates each update step by accumulating velocity from past updates and adding a factor of the previous velocity to the current step. Momentum can help speed up training and escape bad local minima when compared with [Stochastic](stochastic.md) Gradient Descent.
 
+## Mathematical formulation
+Per step (element-wise), Momentum updates the velocity and applies it as the parameter step:
+
+$$
+\begin{aligned}
+\beta &= 1 - \text{decay}, \quad \eta = \text{rate} \\
+\text{Velocity update:}\quad v_t &= \beta\,v_{t-1} + \eta\,g_t \\
+\text{Returned step:}\quad \Delta\theta_t &= v_t
+\end{aligned}
+$$
+
+Nesterov lookahead (when `lookahead = true`) is approximated by applying the velocity update a second time:
+
+$$
+\begin{aligned}
+v_t &\leftarrow \beta\,v_t + \eta\,g_t
+\end{aligned}
+$$
+
+where:
+- $g_t$ is the current gradient,
+- $v_t$ is the velocity (accumulated update),
+- $\beta$ is the momentum coefficient ($1 - \text{decay}$),
+- $\eta$ is the learning rate (`rate`).
+
 ## Parameters
 | # | Name | Default | Type | Description |
 |---|---|---|---|---|
@@ -12,7 +37,7 @@ Momentum accelerates each update step by accumulating velocity from past updates
 ## Example
 ```php
-use Rubix\ML\NeuralNet\Optimizers\Momentum;
+use Rubix\ML\NeuralNet\Optimizers\Momentum\Momentum;
 
 $optimizer = new Momentum(0.01, 0.1, true);
 ```
diff --git a/docs/neural-network/optimizers/rms-prop.md b/docs/neural-network/optimizers/rms-prop.md
index fdca6fd05..c531a863e 100644
--- a/docs/neural-network/optimizers/rms-prop.md
+++ b/docs/neural-network/optimizers/rms-prop.md
@@ -1,7 +1,25 @@
-[source]
+[source]
 
 # RMS Prop
-An adaptive gradient technique that divides the current gradient over a rolling window of the magnitudes of recent gradients. Unlike [AdaGrad](adagrad.md), RMS Prop does not suffer from an infinitely decaying step size.
+An adaptive gradient technique that divides the current gradient over a rolling window of magnitudes of recent gradients. Unlike [AdaGrad](adagrad.md), RMS Prop does not suffer from an infinitely decaying step size.
+
+## Mathematical formulation
+Per step (element-wise), RMSProp maintains a running average of squared gradients and scales the step by the root-mean-square:
+
+$$
+\begin{aligned}
+\rho &= 1 - \text{decay}, \quad \eta = \text{rate} \\
+\text{Running average:}\quad v_t &= \rho\,v_{t-1} + (1 - \rho)\,g_t^{\,2} \\
+\text{Returned step:}\quad \Delta\theta_t &= \frac{\eta\,g_t}{\max\bigl(\sqrt{v_t},\,\varepsilon\bigr)}
+\end{aligned}
+$$
+
+where:
+- $g_t$ is the current gradient,
+- $v_t$ is the running average of squared gradients,
+- $\rho$ is the averaging coefficient ($1 - \text{decay}$),
+- $\eta$ is the learning rate (`rate`),
+- $\varepsilon$ is a small constant to avoid division by zero (implemented by clipping $\sqrt{v_t}$ to $[\varepsilon, +\infty)$).
 
 ## Parameters
 | # | Name | Default | Type | Description |
@@ -11,10 +29,10 @@ An adaptive gradient technique that divides the current gradient over a rolling
 ## Example
 ```php
-use Rubix\ML\NeuralNet\Optimizers\RMSProp;
+use Rubix\ML\NeuralNet\Optimizers\RMSProp\RMSProp;
 
 $optimizer = new RMSProp(0.01, 0.1);
 ```
 
 ## References
-[^1]: T. Tieleman et al. (2012). Lecture 6e rmsprop: Divide the gradient by a running average of its recent magnitude.
\ No newline at end of file +[^1]: T. Tieleman et al. (2012). Lecture 6e rmsprop: Divide the gradient by a running average of its recent magnitude. diff --git a/docs/neural-network/optimizers/step-decay.md b/docs/neural-network/optimizers/step-decay.md index 1a21f0804..f5da99c8b 100644 --- a/docs/neural-network/optimizers/step-decay.md +++ b/docs/neural-network/optimizers/step-decay.md @@ -1,8 +1,26 @@ -[source] +[source] # Step Decay A learning rate decay optimizer that reduces the global learning rate by a factor whenever it reaches a new *floor*. The number of steps needed to reach a new floor is defined by the *steps* hyper-parameter. +## Mathematical formulation +Per step (element-wise), the Step Decay learning rate and update are: + +$$ +\begin{aligned} +\text{floor} &= \left\lfloor \frac{t}{k} \right\rfloor \\ +\eta_t &= \frac{\eta_0}{1 + \text{floor}\cdot \lambda} \\ +\Delta\theta_t &= \eta_t\,g_t +\end{aligned} +$$ + +where: +- $t$ is the current step number, +- $k$ is the number of steps per floor, +- $\eta_0$ is the initial learning rate ($rate$), +- $\lambda$ is the decay factor ($decay$), +- $g_t$ is the current gradient. + ## Parameters | # | Name | Default | Type | Description | |---|---|---|---|---| @@ -12,7 +30,7 @@ A learning rate decay optimizer that reduces the global learning rate by a facto ## Example ```php -use Rubix\ML\NeuralNet\Optimizers\StepDecay; +use Rubix\ML\NeuralNet\Optimizers\StepDecay\StepDecay; $optimizer = new StepDecay(0.1, 50, 1e-3); -``` \ No newline at end of file +``` diff --git a/docs/neural-network/optimizers/stochastic.md b/docs/neural-network/optimizers/stochastic.md index 4422e0ddc..bb0096b87 100644 --- a/docs/neural-network/optimizers/stochastic.md +++ b/docs/neural-network/optimizers/stochastic.md @@ -3,6 +3,20 @@ # Stochastic A constant learning rate optimizer based on vanilla Stochastic Gradient Descent (SGD). +## Mathematical formulation +Per step (element-wise), the SGD update scales the gradient by a constant learning rate: + +$$ +\begin{aligned} +\eta &= \text{rate} \\ +\Delta\theta_t &= \eta\,g_t +\end{aligned} +$$ + +where: +- $g_t$ is the current gradient, +- $\eta$ is the learning rate ($rate$). + ## Parameters | # | Name | Default | Type | Description | |---|---|---|---|---| diff --git a/src/NeuralNet/Layers/Activation/Activation.php b/src/NeuralNet/Layers/Activation/Activation.php new file mode 100644 index 000000000..4394350b4 --- /dev/null +++ b/src/NeuralNet/Layers/Activation/Activation.php @@ -0,0 +1,184 @@ + + */ +class Activation implements Hidden +{ + /** + * The function that computes the output of the layer. + * + * @var ActivationFunction + */ + protected ActivationFunction $activationFn; + + /** + * The width of the layer. + * + * @var positive-int|null + */ + protected ?int $width = null; + + /** + * The memorized input matrix. + * + * @var NDArray|null + */ + protected ?NDArray $input = null; + + /** + * The memorized activation matrix. + * + * @var NDArray|null + */ + protected ?NDArray $output = null; + + /** + * @param ActivationFunction $activationFn + */ + public function __construct(ActivationFunction $activationFn) + { + $this->activationFn = $activationFn; + } + + /** + * Return the width of the layer. 
+ * + * @internal + * + * @throws RuntimeException + * @return positive-int + */ + public function width() : int + { + if ($this->width === null) { + throw new RuntimeException('Layer has not been initialized.'); + } + + return $this->width; + } + + /** + * Initialize the layer with the fan in from the previous layer and return + * the fan out for this layer. + * + * @internal + * + * @param positive-int $fanIn + * @return positive-int + */ + public function initialize(int $fanIn) : int + { + $fanOut = $fanIn; + + $this->width = $fanOut; + + return $fanOut; + } + + /** + * Compute a forward pass through the layer. + * + * @internal + * + * @param NDArray $input + * @return NDArray + */ + public function forward(NDArray $input) : NDArray + { + $output = $this->activationFn->activate($input); + + $this->input = $input; + $this->output = $output; + + return $output; + } + + /** + * Compute an inferential pass through the layer. + * + * @internal + * + * @param NDArray $input + * @return NDArray + */ + public function infer(NDArray $input) : NDArray + { + return $this->activationFn->activate($input); + } + + /** + * Calculate the gradient and update the parameters of the layer. + * + * @internal + * + * @param Deferred $prevGradient + * @param Optimizer $optimizer + * @throws RuntimeException + * @return Deferred + */ + public function back(Deferred $prevGradient, Optimizer $optimizer) : Deferred + { + if (!$this->input or !$this->output) { + throw new RuntimeException('Must perform forward pass before backpropagating.'); + } + + $input = $this->input; + $output = $this->output; + + $this->input = $this->output = null; + + return new Deferred( + [$this, 'gradient'], + [$input, $output, $prevGradient] + ); + } + + /** + * Calculate the gradient for the previous layer. + * + * @internal + * + * @param NDArray $input + * @param NDArray $output + * @param Deferred $prevGradient + * @return NDArray + */ + public function gradient(NDArray $input, NDArray $output, Deferred $prevGradient) : NDArray + { + return NumPower::multiply( + $this->activationFn->differentiate($input), + $prevGradient() + ); + } + + /** + * Return the string representation of the object. + * + * @internal + * + * @return string + */ + public function __toString() : string + { + return "Activation (activation fn: {$this->activationFn})"; + } +} diff --git a/src/NeuralNet/Layers/Base/Contracts/Hidden.php b/src/NeuralNet/Layers/Base/Contracts/Hidden.php new file mode 100644 index 000000000..f903e3916 --- /dev/null +++ b/src/NeuralNet/Layers/Base/Contracts/Hidden.php @@ -0,0 +1,28 @@ + + */ +interface Hidden extends Layer +{ + /** + * Calculate the gradient and update the parameters of the layer. + * + * @internal + * + * @param Deferred $prevGradient + * @param Optimizer $optimizer + * @return Deferred + */ + public function back(Deferred $prevGradient, Optimizer $optimizer) : Deferred; +} diff --git a/src/NeuralNet/Layers/Base/Contracts/Layer.php b/src/NeuralNet/Layers/Base/Contracts/Layer.php new file mode 100644 index 000000000..10cf17b6e --- /dev/null +++ b/src/NeuralNet/Layers/Base/Contracts/Layer.php @@ -0,0 +1,57 @@ + + */ +interface Layer extends Stringable +{ + /** + * The width of the layer. i.e. the number of neurons or computation nodes. + * + * @internal + * + * @return positive-int + */ + public function width() : int; + + /** + * Initialize the layer with the fan in from the previous layer and return + * the fan out for this layer. 
+ * + * @internal + * + * @param positive-int $fanIn + * @return positive-int + */ + public function initialize(int $fanIn) : int; + + /** + * Feed the input forward to the next layer in the network. + * + * @internal + * + * @param NDArray $input + * @return NDArray + */ + public function forward(NDArray $input) : NDArray; + + /** + * Forward pass during inference. + * + * @internal + * + * @param NDArray $input + * @return NDArray + */ + public function infer(NDArray $input) : NDArray; +} diff --git a/src/NeuralNet/Layers/Base/Contracts/Output.php b/src/NeuralNet/Layers/Base/Contracts/Output.php new file mode 100644 index 000000000..49e11bb4b --- /dev/null +++ b/src/NeuralNet/Layers/Base/Contracts/Output.php @@ -0,0 +1,29 @@ + + */ +interface Output extends Layer +{ + /** + * Compute the gradient and loss at the output. + * + * @param (string|int|float)[] $labels + * @param Optimizer $optimizer + * @throws RuntimeException + * @return mixed[] + */ + public function back(array $labels, Optimizer $optimizer) : array; +} diff --git a/src/NeuralNet/Layers/Base/Contracts/Parametric.php b/src/NeuralNet/Layers/Base/Contracts/Parametric.php new file mode 100644 index 000000000..ed772c85d --- /dev/null +++ b/src/NeuralNet/Layers/Base/Contracts/Parametric.php @@ -0,0 +1,33 @@ + + */ +interface Parametric +{ + /** + * Return the parameters of the layer. + * + * @return Generator<\Rubix\ML\NeuralNet\Parameter> + */ + public function parameters() : Generator; + + /** + * Restore the parameters on the layer from an associative array. + * + * @param Parameter[] $parameters + */ + public function restore(array $parameters) : void; +} diff --git a/src/NeuralNet/Layers/BatchNorm/BatchNorm.php b/src/NeuralNet/Layers/BatchNorm/BatchNorm.php new file mode 100644 index 000000000..a15b1fac5 --- /dev/null +++ b/src/NeuralNet/Layers/BatchNorm/BatchNorm.php @@ -0,0 +1,389 @@ + + */ +class BatchNorm implements Hidden, Parametric +{ + /** + * The decay rate of the previous running averages of the global mean and variance. + * + * @var float + */ + protected float $decay; + + /** + * The initializer for the beta parameter. + * + * @var Initializer + */ + protected Initializer $betaInitializer; + + /** + * The initializer for the gamma parameter. + * + * @var Initializer + */ + protected Initializer $gammaInitializer; + + /** + * The width of the layer. i.e. the number of neurons. + * + * @var positive-int|null + */ + protected ?int $width = null; + + /** + * The learnable centering parameter. + * + * @var Parameter|null + */ + protected ?Parameter $beta = null; + + /** + * The learnable scaling parameter. + * + * @var Parameter|null + */ + protected ?Parameter $gamma = null; + + /** + * The running mean of each input dimension. + * + * @var NDArray|null + */ + protected ?NDArray $mean = null; + + /** + * The running variance of each input dimension. + * + * @var NDArray|null + */ + protected ?NDArray $variance = null; + + /** + * A cache of inverse standard deviations calculated during the forward pass. + * + * @var NDArray|null + */ + protected ?NDArray $stdInv = null; + + /** + * A cache of normalized inputs to the layer. + * + * @var NDArray|null + */ + protected ?NDArray $xHat = null; + + /** + * Row-wise or column-wise normalization. 
+ * + * @var int + */ + protected const int AXIS_SAMPLES = 0; + protected const int AXIS_FEATURES = 1; + + /** + * @param float $decay + * @param Initializer|null $betaInitializer + * @param Initializer|null $gammaInitializer + * @throws InvalidArgumentException + */ + public function __construct(float $decay = 0.1, ?Initializer $betaInitializer = null, ?Initializer $gammaInitializer = null) + { + if ($decay < 0.0 or $decay > 1.0) { + throw new InvalidArgumentException("Decay must be between 0 and 1, $decay given."); + } + + $this->decay = $decay; + $this->betaInitializer = $betaInitializer ?? new Constant(0.0); + $this->gammaInitializer = $gammaInitializer ?? new Constant(1.0); + } + + /** + * Return the width of the layer. + * + * @internal + * + * @throws RuntimeException + * @return positive-int + */ + public function width() : int + { + if ($this->width === null) { + throw new RuntimeException('Layer has not been initialized.'); + } + + return $this->width; + } + + /** + * Initialize the layer with the fan in from the previous layer and return + * the fan out for this layer. + * + * @internal + * + * @param positive-int $fanIn + * @return positive-int + */ + public function initialize(int $fanIn) : int + { + $fanOut = $fanIn; + + // Initialize beta and gamma as vectors of length fanOut + // We request a [fanOut, 1] NDArray and then flatten to 1-D + $betaMat = $this->betaInitializer->initialize(1, $fanOut); + $gammaMat = $this->gammaInitializer->initialize(1, $fanOut); + + $beta = NumPower::flatten($betaMat); + $gamma = NumPower::flatten($gammaMat); + + $this->beta = new Parameter($beta); + $this->gamma = new Parameter($gamma); + + $this->width = $fanOut; + + return $fanOut; + } + + /** + * Compute a forward pass through the layer. + * + * @internal + * + * @param NDArray $input + * @throws RuntimeException + * @return NDArray + */ + public function forward(NDArray $input) : NDArray + { + if (!$this->beta or !$this->gamma) { + throw new RuntimeException('Layer has not been initialized.'); + } + + [$m, $n] = $input->shape(); + + // Row-wise mean across features (axis 1), length m + $sum = NumPower::sum($input, 1); + $mean = NumPower::divide($sum, $n); + + // Center the input: broadcast mean to [m, n] + $centered = NumPower::subtract($input, NumPower::reshape($mean, [$m, 1])); + + // Row-wise variance across features (axis 1) + $centeredSq = NumPower::multiply($centered, $centered); + $varSum = NumPower::sum($centeredSq, 1); + $variance = NumPower::divide($varSum, $n); + $variance = NumPower::clip($variance, EPSILON, PHP_FLOAT_MAX); + + // Inverse std from clipped variance + $stdInv = NumPower::reciprocal(NumPower::sqrt($variance)); + + // Normalize: (x - mean) * stdInv + $xHat = NumPower::multiply($centered, NumPower::reshape($stdInv, [$m, 1])); + + // Initialize running stats if needed + if (!$this->mean or !$this->variance) { + $this->mean = $mean; + $this->variance = $variance; + } + + // Update running mean/variance using exponential moving average (EMA) + // Convention: running = running*(1 - decay) + current*decay + $this->mean = NumPower::add( + NumPower::multiply($this->mean, 1.0 - $this->decay), + NumPower::multiply($mean, $this->decay) + ); + + $this->variance = NumPower::add( + NumPower::multiply($this->variance, 1.0 - $this->decay), + NumPower::multiply($variance, $this->decay) + ); + + $this->stdInv = $stdInv; + $this->xHat = $xHat; + + // gamma * xHat + beta (per-column scale/shift) using NDArray ops + return NumPower::add(NumPower::multiply($xHat, $this->gamma->param()), 
$this->beta->param()); + } + + /** + * Compute an inferential pass through the layer. + * + * @internal + * + * @param NDArray $input + * @throws RuntimeException + * @return NDArray + */ + public function infer(NDArray $input) : NDArray + { + if (!$this->mean or !$this->variance or !$this->beta or !$this->gamma) { + throw new RuntimeException('Layer has not been initialized.'); + } + + $m = $input->shape()[0]; + + // Use clipped variance for numerical stability during inference + $varianceClipped = NumPower::clip($this->variance, EPSILON, PHP_FLOAT_MAX); + $xHat = NumPower::divide( + NumPower::subtract($input, NumPower::reshape($this->mean, [$m, 1])), + NumPower::reshape(NumPower::sqrt($varianceClipped), [$m, 1]) + ); + + return NumPower::add( + NumPower::multiply( + $xHat, + $this->gamma->param() + ), + $this->beta->param() + ); + } + + /** + * Calculate the errors and gradients of the layer and update the parameters. + * + * @internal + * + * @param Deferred $prevGradient + * @param Optimizer $optimizer + * @throws RuntimeException + * @return Deferred + */ + public function back(Deferred $prevGradient, Optimizer $optimizer) : Deferred + { + if (!$this->beta or !$this->gamma) { + throw new RuntimeException('Layer has not been initialized.'); + } + + if (!$this->stdInv or !$this->xHat) { + throw new RuntimeException('Must perform forward pass before backpropagating.'); + } + + $dOut = $prevGradient(); + // Sum across samples (axis 0) for parameter gradients + $dBeta = NumPower::sum($dOut, self::AXIS_SAMPLES); + $dGamma = NumPower::sum(NumPower::multiply($dOut, $this->xHat), self::AXIS_SAMPLES); + $gamma = $this->gamma->param(); + + $this->beta->update($dBeta, $optimizer); + $this->gamma->update($dGamma, $optimizer); + + $stdInv = $this->stdInv; + $xHat = $this->xHat; + + $this->stdInv = $this->xHat = null; + + return new Deferred( + [$this, 'gradient'], + [$dOut, $gamma, $stdInv, $xHat] + ); + } + + /** + * Calculate the gradient for the previous layer. + * + * @internal + * + * @param NDArray $dOut + * @param NDArray $gamma + * @param NDArray $stdInv + * @param NDArray $xHat + * @return NDArray + */ + public function gradient(NDArray $dOut, NDArray $gamma, NDArray $stdInv, NDArray $xHat) : NDArray + { + $dXHat = NumPower::multiply($dOut, $gamma); + $xHatSigma = NumPower::sum(NumPower::multiply($dXHat, $xHat), self::AXIS_FEATURES); + $dXHatSigma = NumPower::sum($dXHat, self::AXIS_FEATURES); + + $m = $dOut->shape()[0]; + + // Compute gradient per formula: dX = (dXHat * m - dXHatSigma - xHat * xHatSigma) * (stdInv / m) + return NumPower::multiply( + NumPower::subtract( + NumPower::subtract( + NumPower::multiply($dXHat, $m), + NumPower::reshape($dXHatSigma, [$m, 1]) + ), + NumPower::multiply($xHat, NumPower::reshape($xHatSigma, [$m, 1])) + ), + NumPower::reshape(NumPower::divide($stdInv, $m), [$m, 1]) + ); + } + + /** + * Return the parameters of the layer. + * + * @internal + * + * @throws RuntimeException + * @return Generator + */ + public function parameters() : Generator + { + if (!$this->beta or !$this->gamma) { + throw new RuntimeException('Layer has not been initialized.'); + } + + yield 'beta' => $this->beta; + yield 'gamma' => $this->gamma; + } + + /** + * Restore the parameters in the layer from an associative array. + * + * @internal + * + * @param Parameter[] $parameters + */ + public function restore(array $parameters) : void + { + $this->beta = $parameters['beta']; + $this->gamma = $parameters['gamma']; + } + + /** + * Return the string representation of the object. 
+ * + * @internal + * + * @return string + */ + public function __toString() : string + { + return "Batch Norm (decay: {$this->decay}, beta initializer: {$this->betaInitializer}," + . " gamma initializer: {$this->gammaInitializer})"; + } +} diff --git a/src/NeuralNet/Layers/Binary/Binary.php b/src/NeuralNet/Layers/Binary/Binary.php new file mode 100644 index 000000000..37b6f145b --- /dev/null +++ b/src/NeuralNet/Layers/Binary/Binary.php @@ -0,0 +1,222 @@ + + */ +class Binary implements Output +{ + /** + * The labels of either of the possible outcomes. + * + * @var float[] + */ + protected array $classes = [ + // + ]; + + /** + * The function that computes the loss of erroneous activations. + * + * @var ClassificationLoss + */ + protected ClassificationLoss $costFn; + + /** + * The sigmoid activation function. + * + * @var Sigmoid + */ + protected Sigmoid $sigmoid; + + /** + * The memorized input matrix. + * + * @var NDArray|null + */ + protected ?NDArray $input = null; + + /** + * The memorized activation matrix. + * + * @var NDArray|null + */ + protected ?NDArray $output = null; + + /** + * @param string[] $classes + * @param ClassificationLoss|null $costFn + * @throws InvalidArgumentException + */ + public function __construct(array $classes, ?ClassificationLoss $costFn = null) + { + $classes = array_values(array_unique($classes)); + + if (count($classes) !== 2) { + throw new InvalidArgumentException('Number of classes must be 2, ' . count($classes) . ' given.'); + } + + $classes = [ + $classes[0] => 0.0, + $classes[1] => 1.0, + ]; + + $this->classes = $classes; + $this->costFn = $costFn ?? new CrossEntropy(); + $this->sigmoid = new Sigmoid(); + } + + /** + * Return the width of the layer. + * + * @return positive-int + */ + public function width() : int + { + return 1; + } + + /** + * Initialize the layer with the fan in from the previous layer and return + * the fan out for this layer. + * + * @param positive-int $fanIn + * @throws InvalidArgumentException + * @return positive-int + */ + public function initialize(int $fanIn) : int + { + if ($fanIn !== 1) { + throw new InvalidArgumentException("Fan in must be equal to 1, $fanIn given."); + } + + return 1; + } + + /** + * Compute a forward pass through the layer. + * + * @param NDArray $input + * @return NDArray + */ + public function forward(NDArray $input) : NDArray + { + $output = $this->sigmoid->activate($input); + + $this->input = $input; + $this->output = $output; + + return $output; + } + + /** + * Compute an inferential pass through the layer. + * + * @param NDArray $input + * @return NDArray + */ + public function infer(NDArray $input) : NDArray + { + return $this->sigmoid->activate($input); + } + + /** + * Compute the gradient and loss at the output. 
+ * + * @param string[] $labels + * @param Optimizer $optimizer + * @throws RuntimeException + * @return (Deferred|float)[] + */ + public function back(array $labels, Optimizer $optimizer) : array + { + if (!$this->input or !$this->output) { + throw new RuntimeException('Must perform forward pass before backpropagating.'); + } + + $expected = []; + + foreach ($labels as $label) { + $expected[] = $this->classes[$label]; + } + + $expected = NumPower::array([$expected]); + + $input = $this->input; + $output = $this->output; + + $gradient = new Deferred([$this, 'gradient'], [$input, $output, $expected]); + + $loss = $this->costFn->compute($output, $expected); + + $this->input = $this->output = null; + + return [$gradient, $loss]; + } + + /** + * Calculate the gradient for the previous layer. + * + * @param NDArray $input + * @param NDArray $output + * @param NDArray $expected + * @return NDArray + */ + public function gradient(NDArray $input, NDArray $output, NDArray $expected) : NDArray + { + $n = $output->shape()[1]; + + if ($this->costFn instanceof CrossEntropy) { + return NumPower::divide( + NumPower::subtract($output, $expected), + $n + ); + } + + $dLoss = NumPower::divide( + $this->costFn->differentiate($output, $expected), + $n + ); + + return NumPower::multiply( + $this->sigmoid->differentiate($output), + $dLoss + ); + } + + /** + * Return the string representation of the object. + * + * @internal + * + * @return string + */ + public function __toString() : string + { + return "Binary (cost function: {$this->costFn})"; + } +} diff --git a/src/NeuralNet/Layers/Continuous/Continuous.php b/src/NeuralNet/Layers/Continuous/Continuous.php new file mode 100644 index 000000000..7a07e9735 --- /dev/null +++ b/src/NeuralNet/Layers/Continuous/Continuous.php @@ -0,0 +1,157 @@ + + */ +class Continuous implements Output +{ + /** + * The function that computes the loss of erroneous activations. + * + * @var RegressionLoss + */ + protected RegressionLoss $costFn; + + /** + * The memorized input matrix. + * + * @var NDArray|null + */ + protected ?NDArray $input = null; + + /** + * @param RegressionLoss|null $costFn + */ + public function __construct(?RegressionLoss $costFn = null) + { + $this->costFn = $costFn ?? new LeastSquares(); + } + + /** + * Return the width of the layer. + * + * @return positive-int + */ + public function width() : int + { + return 1; + } + + /** + * Initialize the layer with the fan in from the previous layer and return + * the fan out for this layer. + * + * @param positive-int $fanIn + * @throws InvalidArgumentException + * @return positive-int + */ + public function initialize(int $fanIn) : int + { + if ($fanIn !== 1) { + throw new InvalidArgumentException("Fan in must be equal to 1, $fanIn given."); + } + + return 1; + } + + /** + * Compute a forward pass through the layer. + * + * @param NDArray $input + * @return NDArray + */ + public function forward(NDArray $input) : NDArray + { + $this->input = $input; + + return $input; + } + + /** + * Compute an inferential pass through the layer. + * + * @param NDArray $input + * @return NDArray + */ + public function infer(NDArray $input) : NDArray + { + return $input; + } + + /** + * Compute the gradient and loss at the output. 
+ * + * @param (int|float)[] $labels + * @param Optimizer $optimizer + * @throws RuntimeException + * @return (Deferred|float)[] + */ + public function back(array $labels, Optimizer $optimizer) : array + { + if (!$this->input) { + throw new RuntimeException('Must perform forward pass before backpropagating.'); + } + + $expected = NumPower::array([$labels]); + + $input = $this->input; + + $gradient = new Deferred([$this, 'gradient'], [$input, $expected]); + + $loss = $this->costFn->compute($input, $expected); + + $this->input = null; + + return [$gradient, $loss]; + } + + /** + * Calculate the gradient for the previous layer. + * + * @param NDArray $input + * @param NDArray $expected + * @return NDArray + */ + public function gradient(NDArray $input, NDArray $expected) : NDArray + { + $n = $input->shape()[1]; + + return NumPower::divide( + $this->costFn->differentiate($input, $expected), + $n + ); + } + + /** + * Return the string representation of the object. + * + * @internal + * + * @return string + */ + public function __toString() : string + { + return "Continuous (cost function: {$this->costFn})"; + } +} diff --git a/src/NeuralNet/Optimizers/AdaGrad/AdaGrad.php b/src/NeuralNet/Optimizers/AdaGrad/AdaGrad.php new file mode 100644 index 000000000..b6c92bd56 --- /dev/null +++ b/src/NeuralNet/Optimizers/AdaGrad/AdaGrad.php @@ -0,0 +1,134 @@ + + */ +class AdaGrad implements Optimizer, Adaptive +{ + /** + * The learning rate that controls the global step size. + * + * @var float + */ + protected float $rate; + + /** + * The cache of sum of squared gradients. + * + * @var NDArray[] + */ + protected array $cache = [ + // + ]; + + /** + * @param float $rate + * @throws InvalidArgumentException + */ + public function __construct(float $rate = 0.01) + { + if ($rate <= 0.0) { + throw new InvalidArgumentException("Learning rate must be greater than 0, $rate given."); + } + + $this->rate = $rate; + } + + /** + * Warm the parameter cache. + * + * @internal + * + * @param Parameter $param + * @throws RuntimeException + */ + public function warm(Parameter $param) : void + { + $class = get_class($param->param()); + + if (!$class) { + throw new RuntimeException('Could not locate parameter class.'); + } + + $this->cache[$param->id()] = NumPower::zeros($param->param()->shape()); + } + + /** + * Take a step of gradient descent for a given parameter. + * + * AdaGrad update (element-wise): + * n_t = n_{t-1} + g_t^2 + * Δθ_t = η · g_t / max(√n_t, ε) + * + * where: + * - g_t is the current gradient, + * - n_t is the accumulated (running) sum of squared gradients, + * - η is the learning rate (rate), + * - ε is a small constant to avoid division by zero (implemented via clipping √n_t to [ε, +∞)). + * + * @internal + * + * @param Parameter $param + * @param NDArray $gradient + * @return NDArray + */ + public function step(Parameter $param, NDArray $gradient) : NDArray + { + $norm = $this->cache[$param->id()]; + + // Update accumulated squared gradients: norm = norm + gradient^2 + $norm = NumPower::add($norm, NumPower::square($gradient)); + + $this->cache[$param->id()] = $norm; + + // denominator = max(sqrt(norm), EPSILON) + $denominator = NumPower::sqrt($norm); + $denominator = NumPower::clip($denominator, EPSILON, PHP_FLOAT_MAX); + + // return rate * gradient / denominator + return NumPower::divide( + NumPower::multiply($gradient, $this->rate), + $denominator + ); + } + + /** + * Return the string representation of the object. 
+ * + * @internal + * + * @return string + */ + public function __toString() : string + { + return "AdaGrad (rate: {$this->rate})"; + } +} diff --git a/src/NeuralNet/Optimizers/AdaMax/AdaMax.php b/src/NeuralNet/Optimizers/AdaMax/AdaMax.php new file mode 100644 index 000000000..ae13d2249 --- /dev/null +++ b/src/NeuralNet/Optimizers/AdaMax/AdaMax.php @@ -0,0 +1,90 @@ + + */ +class AdaMax extends Adam +{ + /** + * @param float $rate + * @param float $momentumDecay + * @param float $normDecay + */ + public function __construct(float $rate = 0.001, float $momentumDecay = 0.1, float $normDecay = 0.001) + { + parent::__construct($rate, $momentumDecay, $normDecay); + } + + /** + * Take a step of gradient descent for a given parameter. + * + * AdaMax update (element-wise): + * v_t = v_{t-1} + β1 · (g_t − v_{t-1}) + * u_t = max(β2 · u_{t-1}, |g_t|) + * Δθ_t = η · v_t / max(u_t, ε) + * + * @internal + * + * @param Parameter $param + * @param NDArray $gradient + * @return NDArray + */ + public function step(Parameter $param, NDArray $gradient) : NDArray + { + [$velocity, $norm] = $this->cache[$param->id()]; + + $vHat = NumPower::multiply( + NumPower::subtract($gradient, $velocity), + $this->momentumDecay + ); + + $velocity = NumPower::add($velocity, $vHat); + + // Infinity norm accumulator + $norm = NumPower::multiply($norm, 1.0 - $this->normDecay); + $absGrad = NumPower::abs($gradient); + $norm = NumPower::maximum($norm, $absGrad); + + $this->cache[$param->id()] = [$velocity, $norm]; + + $norm = NumPower::clip($norm, EPSILON, PHP_FLOAT_MAX); + + return NumPower::multiply( + NumPower::divide($velocity, $norm), + $this->rate + ); + } + + /** + * Return the string representation of the object. + * + * @internal + * + * @return string + */ + public function __toString() : string + { + return "AdaMax (rate: {$this->rate}, momentum decay: {$this->momentumDecay}," + . " norm decay: {$this->normDecay})"; + } +} diff --git a/src/NeuralNet/Optimizers/Adam/Adam.php b/src/NeuralNet/Optimizers/Adam/Adam.php new file mode 100644 index 000000000..fad8ac1bf --- /dev/null +++ b/src/NeuralNet/Optimizers/Adam/Adam.php @@ -0,0 +1,181 @@ + + */ +class Adam implements Optimizer, Adaptive +{ + /** + * The learning rate that controls the global step size. + * + * @var float + */ + protected float $rate; + + /** + * The momentum decay rate. + * + * @var float + */ + protected float $momentumDecay; + + /** + * The decay rate of the previous norms. + * + * @var float + */ + protected float $normDecay; + + /** + * The parameter cache of running velocity and squared gradients. + * + * @var array{0: NDArray, 1: NDArray}[] + */ + protected array $cache = [ + // id => [velocity, norm] + ]; + + /** + * @param float $rate + * @param float $momentumDecay + * @param float $normDecay + * @throws InvalidArgumentException + */ + public function __construct(float $rate = 0.001, float $momentumDecay = 0.1, float $normDecay = 0.001) + { + if ($rate <= 0.0) { + throw new InvalidArgumentException( + "Learning rate must be greater than 0, $rate given." + ); + } + + if ($momentumDecay <= 0.0 or $momentumDecay >= 1.0) { + throw new InvalidArgumentException( + "Momentum decay must be between 0 and 1, $momentumDecay given." + ); + } + + if ($normDecay <= 0.0 or $normDecay >= 1.0) { + throw new InvalidArgumentException( + "Norm decay must be between 0 and 1, $normDecay given." + ); + } + + $this->rate = $rate; + $this->momentumDecay = $momentumDecay; + $this->normDecay = $normDecay; + } + + /** + * Warm the parameter cache. 
+ * + * @internal + * + * @param Parameter $param + * @throws RuntimeException + */ + public function warm(Parameter $param) : void + { + $class = get_class($param->param()); + + if (!$class) { + throw new RuntimeException('Could not locate parameter class.'); + } + + $zeros = NumPower::zeros($param->param()->shape()); + + $this->cache[$param->id()] = [clone $zeros, $zeros]; + } + + /** + * Take a step of gradient descent for a given parameter. + * + * Adam update (element-wise): + * v_t = v_{t-1} + β1 · (g_t − v_{t-1}) // exponential moving average of gradients + * n_t = n_{t-1} + β2 · (g_t^2 − n_{t-1}) // exponential moving average of squared gradients + * Δθ_t = η · v_t / max(√n_t, ε) + * + * where: + * - g_t is the current gradient, + * - v_t is the running average of gradients ("velocity"), β1 = momentumDecay, + * - n_t is the running average of squared gradients ("norm"), β2 = normDecay, + * - η is the learning rate (rate), ε is a small constant to avoid division by zero (implemented by clipping √n_t to [ε, +∞)). + * + * @internal + * + * @param Parameter $param + * @param NDArray $gradient + * @return NDArray + */ + public function step(Parameter $param, NDArray $gradient) : NDArray + { + [$velocity, $norm] = $this->cache[$param->id()]; + + $vHat = NumPower::multiply( + NumPower::subtract($gradient, $velocity), + $this->momentumDecay + ); + + $velocity = NumPower::add($velocity, $vHat); + + $nHat = NumPower::multiply( + NumPower::subtract(NumPower::square($gradient), $norm), + $this->normDecay + ); + + $norm = NumPower::add($norm, $nHat); + + $this->cache[$param->id()] = [$velocity, $norm]; + + $denominator = NumPower::sqrt($norm); + $denominator = NumPower::clip($denominator, EPSILON, PHP_FLOAT_MAX); + + return NumPower::divide( + NumPower::multiply($velocity, $this->rate), + $denominator + ); + } + + /** + * Return the string representation of the object. + * + * @internal + * + * @return string + */ + public function __toString() : string + { + return "Adam (rate: {$this->rate}, momentum decay: {$this->momentumDecay}," + . " norm decay: {$this->normDecay})"; + } +} diff --git a/src/NeuralNet/Optimizers/Base/Adaptive.php b/src/NeuralNet/Optimizers/Base/Adaptive.php new file mode 100644 index 000000000..35ee5323b --- /dev/null +++ b/src/NeuralNet/Optimizers/Base/Adaptive.php @@ -0,0 +1,25 @@ + + */ +interface Adaptive extends Optimizer +{ + /** + * Warm the parameter cache. + * + * @param Parameter $param + */ + public function warm(Parameter $param) : void; +} diff --git a/src/NeuralNet/Optimizers/Cyclical/Cyclical.php b/src/NeuralNet/Optimizers/Cyclical/Cyclical.php new file mode 100644 index 000000000..ac22d9d52 --- /dev/null +++ b/src/NeuralNet/Optimizers/Cyclical/Cyclical.php @@ -0,0 +1,166 @@ + + */ +class Cyclical implements Optimizer +{ + /** + * The lower bound on the learning rate. + * + * @var float + */ + protected float $lower; + + /** + * The upper bound on the learning rate. + * + * @var float + */ + protected float $upper; + + /** + * The range of the learning rate. + * + * @var float + */ + protected float $range; + + /** + * The number of steps in every cycle. + * + * @var int + */ + protected int $losses; + + /** + * The exponential scaling factor applied to each step as decay. + * + * @var float + */ + protected float $decay; + + /** + * The number of steps taken so far. 
+ * + * @var int + */ + protected int $t = 0; + + /** + * @param float $lower + * @param float $upper + * @param int $losses + * @param float $decay + * @throws InvalidArgumentException + */ + public function __construct( + float $lower = 0.001, + float $upper = 0.006, + int $losses = 2000, + float $decay = 0.99994 + ) { + if ($lower <= 0.0) { + throw new InvalidArgumentException( + "Lower bound must be greater than 0, $lower given." + ); + } + + if ($lower > $upper) { + throw new InvalidArgumentException( + 'Lower bound cannot be reater than the upper bound.' + ); + } + + if ($losses < 1) { + throw new InvalidArgumentException( + "The number of steps per cycle must be greater than 0, $losses given." + ); + } + + if ($decay <= 0.0 or $decay >= 1.0) { + throw new InvalidArgumentException( + "Decay must be between 0 and 1, $decay given." + ); + } + + $this->lower = $lower; + $this->upper = $upper; + $this->range = $upper - $lower; + $this->losses = $losses; + $this->decay = $decay; + } + + /** + * Take a step of gradient descent for a given parameter. + * + * Cyclical learning rate schedule (per-step, element-wise update): + * - Cycle index: cycle = floor(1 + t / (2 · losses)) + * - Triangular position: x = | t / losses − 2 · cycle + 1 | + * - Exponential decay: scale = decay^t + * - Learning rate at t: η_t = lower + (upper − lower) · max(0, 1 − x) · scale + * - Returned step: Δθ_t = η_t · g_t + * + * where: + * - t is the current step counter (incremented after computing η_t), + * - losses is the number of steps per cycle, + * - lower and upper are the learning rate bounds, + * - decay is the multiplicative decay applied each step, + * - g_t is the current gradient. + * + * @internal + * + * @param Parameter $param + * @param NDArray $gradient + * @return NDArray + */ + public function step(Parameter $param, NDArray $gradient) : NDArray + { + $cycle = floor(1 + $this->t / (2 * $this->losses)); + + $x = abs($this->t / $this->losses - 2 * $cycle + 1); + + $scale = $this->decay ** $this->t; + + $rate = $this->lower + $this->range * max(0, 1 - $x) * $scale; + + ++$this->t; + + return NumPower::multiply($gradient, $rate); + } + + /** + * Return the string representation of the object. + * + * @internal + * + * @return string + */ + public function __toString() : string + { + return "Cyclical (lower: {$this->lower}, upper: {$this->upper}," + . " steps: {$this->losses}, decay: {$this->decay})"; + } +} diff --git a/src/NeuralNet/Optimizers/Momentum/Momentum.php b/src/NeuralNet/Optimizers/Momentum/Momentum.php new file mode 100644 index 000000000..05e62fa0b --- /dev/null +++ b/src/NeuralNet/Optimizers/Momentum/Momentum.php @@ -0,0 +1,164 @@ + + */ +class Momentum implements Optimizer, Adaptive +{ + /** + * The learning rate that controls the global step size. + * + * @var float + */ + protected float $rate; + + /** + * The rate at which the momentum force decays. + * + * @var float + */ + protected float $decay; + + /** + * Should we employ Nesterov's lookahead (NAG) when updating the parameters? + * + * @var bool + */ + protected bool $lookahead; + + /** + * The parameter cache of velocity NDArrays. + * + * @var NDArray[] + */ + protected array $cache = [ + // + ]; + + /** + * @param float $rate + * @param float $decay + * @param bool $lookahead + * @throws InvalidArgumentException + */ + public function __construct(float $rate = 0.001, float $decay = 0.1, bool $lookahead = false) + { + if ($rate <= 0.0) { + throw new InvalidArgumentException( + "Learning rate must be greater than 0, $rate given." 
+ ); + } + + if ($decay <= 0.0 or $decay >= 1.0) { + throw new InvalidArgumentException( + "Decay must be between 0 and 1, $decay given." + ); + } + + $this->rate = $rate; + $this->decay = $decay; + $this->lookahead = $lookahead; + } + + /** + * Warm the cache. + * + * @internal + * + * @param Parameter $param + * @throws RuntimeException + */ + public function warm(Parameter $param) : void + { + $class = get_class($param->param()); + + if (!$class) { + throw new RuntimeException('Could not locate parameter class.'); + } + + $this->cache[$param->id()] = NumPower::zeros($param->param()->shape()); + } + + /** + * Take a step of gradient descent for a given parameter. + * + * Mathematical formulation (per-parameter element): + * - Velocity update: v_t = β · v_{t-1} + η · g_t + * where β = 1 − decay and η = rate, and g_t is the current gradient. + * - Returned step (the amount added to the parameter by the trainer): Δθ_t = v_t + * + * Nesterov lookahead (when lookahead = true): + * - We apply the same velocity update a second time to approximate NAG: + * v_t ← β · v_t + η · g_t + * + * Notes: + * - This method updates and caches the velocity tensor per Parameter id. + * - The actual parameter update is performed by the training loop using the returned velocity. + * + * @internal + * + * @param Parameter $param + * @param NDArray $gradient + * @return NDArray + */ + public function step(Parameter $param, NDArray $gradient) : NDArray + { + $velocity = $this->cache[$param->id()]; + + // velocity = gradient * rate + velocity * (1 - decay) + $velocity = NumPower::add( + NumPower::multiply($gradient, $this->rate), + NumPower::multiply($velocity, 1.0 - $this->decay) + ); + + $this->cache[$param->id()] = $velocity; + + if ($this->lookahead) { + // Apply lookahead: velocity = gradient * rate + velocity * (1 - decay) + $velocity = NumPower::add( + NumPower::multiply($gradient, $this->rate), + NumPower::multiply($velocity, 1.0 - $this->decay) + ); + } + + return $velocity; + } + + /** + * Return the string representation of the object. + * + * @internal + * + * @return string + */ + public function __toString() : string + { + return "Momentum (rate: {$this->rate}, decay: {$this->decay}," + . ' lookahead: ' . Params::toString($this->lookahead) . ')'; + } +} diff --git a/src/NeuralNet/Optimizers/RMSProp/RMSProp.php b/src/NeuralNet/Optimizers/RMSProp/RMSProp.php new file mode 100644 index 000000000..7c08aebb2 --- /dev/null +++ b/src/NeuralNet/Optimizers/RMSProp/RMSProp.php @@ -0,0 +1,158 @@ + + */ +class RMSProp implements Optimizer, Adaptive +{ + /** + * The learning rate that controls the global step size. + * + * @var float + */ + protected float $rate; + + /** + * The rms decay rate. + * + * @var float + */ + protected float $decay; + + /** + * The opposite of the rms decay rate. + * + * @var float + */ + protected float $rho; + + /** + * The cache of running squared gradients. + * + * @var NDArray[] + */ + protected array $cache = [ + // + ]; + + /** + * @param float $rate + * @param float $decay + * @throws InvalidArgumentException + */ + public function __construct(float $rate = 0.001, float $decay = 0.1) + { + if ($rate <= 0.0) { + throw new InvalidArgumentException( + "Learning rate must be greater than 0, $rate given." + ); + } + + if ($decay <= 0.0 or $decay >= 1.0) { + throw new InvalidArgumentException( + "Decay must be between 0 and 1, $decay given." + ); + } + + $this->rate = $rate; + $this->decay = $decay; + $this->rho = 1.0 - $decay; + } + + /** + * Warm the parameter cache. 
+ * + * @internal + * + * @param Parameter $param + * @throws RuntimeException + */ + public function warm(Parameter $param) : void + { + $class = get_class($param->param()); + + if (!$class) { + throw new RuntimeException('Could not locate parameter class.'); + } + + $this->cache[$param->id()] = NumPower::zeros($param->param()->shape()); + } + + /** + * Take a step of gradient descent for a given parameter. + * + * RMSProp update (element-wise): + * v_t = ρ · v_{t-1} + (1 − ρ) · g_t^2 + * Δθ_t = η · g_t / max(sqrt(v_t), ε) + * + * where: + * - g_t is the current gradient, + * - v_t is the running average of squared gradients, + * - ρ = 1 − decay, η is the learning rate, + * - ε is a small constant to avoid division by zero (implemented by clipping √v_t to [ε, +∞)). + * + * @internal + * + * @param Parameter $param + * @param NDArray $gradient + * @return NDArray + */ + public function step(Parameter $param, NDArray $gradient) : NDArray + { + $norm = $this->cache[$param->id()]; + + $norm = NumPower::add( + NumPower::multiply($norm, $this->rho), + NumPower::multiply(NumPower::square($gradient), $this->decay) + ); + + $this->cache[$param->id()] = $norm; + + $denominator = NumPower::sqrt($norm); + $denominator = NumPower::clip($denominator, EPSILON, PHP_FLOAT_MAX); + + return NumPower::divide( + NumPower::multiply($gradient, $this->rate), + $denominator + ); + } + + /** + * Return the string representation of the object. + * + * @internal + * + * @return string + */ + public function __toString() : string + { + return "RMS Prop (rate: {$this->rate}, decay: {$this->decay})"; + } +} diff --git a/src/NeuralNet/Optimizers/StepDecay/StepDecay.php b/src/NeuralNet/Optimizers/StepDecay/StepDecay.php new file mode 100644 index 000000000..abfeb6f7e --- /dev/null +++ b/src/NeuralNet/Optimizers/StepDecay/StepDecay.php @@ -0,0 +1,127 @@ + + */ +class StepDecay implements Optimizer +{ + /** + * The learning rate that controls the global step size. + * + * @var float + */ + protected float $rate; + + /** + * The size of every floor in steps. i.e. the number of steps to take before applying another factor of decay. + * + * @var int + */ + protected int $losses; + + /** + * The factor to decrease the learning rate by over a period of k steps. + * + * @var float + */ + protected float $decay; + + /** + * The number of steps taken so far. + * + * @var int + */ + protected int $steps = 0; + + /** + * @param float $rate + * @param int $losses + * @param float $decay + * @throws InvalidArgumentException + */ + public function __construct(float $rate = 0.01, int $losses = 100, float $decay = 1e-3) + { + if ($rate <= 0.0) { + throw new InvalidArgumentException( + "Learning rate must be greater than 0, $rate given." + ); + } + + if ($losses < 1) { + throw new InvalidArgumentException( + "The number of steps per floor must be greater than 0, $losses given." + ); + } + + if ($decay < 0.0) { + throw new InvalidArgumentException( + "Decay rate must be positive, $decay given." + ); + } + + $this->rate = $rate; + $this->losses = $losses; + $this->decay = $decay; + } + + /** + * Take a step of gradient descent for a given parameter. + * + * Step Decay update (element-wise): + * floor = ⌊t / k⌋ + * η_t = η₀ / (1 + floor · λ) + * Δθ_t = η_t · g_t + * + * where: + * - t is the current step number, + * - k is the number of steps per floor, + * - η₀ is the initial learning rate, + * - λ is the decay factor, + * - g_t is the current gradient. 
+ * + * @internal + * + * @param Parameter $param + * @param NDArray $gradient + * @return NDArray + */ + public function step(Parameter $param, NDArray $gradient) : NDArray + { + $floor = floor($this->steps / $this->losses); + + $rate = $this->rate * (1.0 / (1.0 + $floor * $this->decay)); + + ++$this->steps; + + return NumPower::multiply($gradient, $rate); + } + + /** + * Return the string representation of the object. + * + * @internal + * + * @return string + */ + public function __toString() : string + { + return "Step Decay (rate: {$this->rate}, steps: {$this->losses}, decay: {$this->decay})"; + } +} diff --git a/src/NeuralNet/Optimizers/Stochastic/Stochastic.php b/src/NeuralNet/Optimizers/Stochastic/Stochastic.php index ffd9daf30..b2cd6ebac 100644 --- a/src/NeuralNet/Optimizers/Stochastic/Stochastic.php +++ b/src/NeuralNet/Optimizers/Stochastic/Stochastic.php @@ -35,7 +35,9 @@ class Stochastic implements Optimizer public function __construct(float $rate = 0.01) { if ($rate <= 0.0) { - throw new InvalidArgumentException("Learning rate must be greater than 0, $rate given."); + throw new InvalidArgumentException( + "Learning rate must be greater than 0, $rate given." + ); } $this->rate = $rate; @@ -44,6 +46,13 @@ public function __construct(float $rate = 0.01) /** * Take a step of gradient descent for a given parameter. * + * SGD update (element-wise): + * Δθ_t = η · g_t + * + * where: + * - g_t is the current gradient, + * - η is the learning rate. + * * @internal * * @param Parameter $param diff --git a/src/NeuralNet/Parameters/Parameter.php b/src/NeuralNet/Parameters/Parameter.php index efa7cf88a..0cef2e87a 100644 --- a/src/NeuralNet/Parameters/Parameter.php +++ b/src/NeuralNet/Parameters/Parameter.php @@ -22,7 +22,6 @@ /** * Parameter - * */ class Parameter { @@ -61,7 +60,7 @@ public function __construct(NDArray $param) * * @return int */ - public function id(): int + public function id() : int { return $this->id; } @@ -71,7 +70,7 @@ public function id(): int * * @return NDArray */ - public function param(): NDArray + public function param() : NDArray { return $this->param; } @@ -79,10 +78,10 @@ public function param(): NDArray /** * Update the parameter with the gradient and optimizer. * - * @param NDArray $gradient - * @param Optimizer $optimizer + * @param NDArray $gradient + * @param Optimizer $optimizer */ - public function update(NDArray $gradient, Optimizer $optimizer): void + public function update(NDArray $gradient, Optimizer $optimizer) : void { $step = $optimizer->step($this, $gradient); @@ -92,7 +91,7 @@ public function update(NDArray $gradient, Optimizer $optimizer): void /** * Perform a deep copy of the object upon cloning. 
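+ * The wrapped NDArray is cloned as well, so the copy holds its own tensor
+ * instance rather than sharing the original's.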
*/ - public function __clone(): void + public function __clone() : void { $this->param = clone $this->param; } diff --git a/tests/NeuralNet/Layers/Activation/ActivationTest.php b/tests/NeuralNet/Layers/Activation/ActivationTest.php new file mode 100644 index 000000000..2c203ad18 --- /dev/null +++ b/tests/NeuralNet/Layers/Activation/ActivationTest.php @@ -0,0 +1,181 @@ + + */ + public static function forwardProvider() : array + { + return [ + [ + NumPower::array([ + [1.0, 2.5, -0.1], + [0.1, 0.0, 3.0], + [0.002, -6.0, -0.5], + ]), + [ + [1.0, 2.5, 0.0], + [0.1, 0.0, 3.0], + [0.002, 0.0, 0.0], + ], + ], + ]; + } + + /** + * @return array + */ + public static function backProvider() : array + { + return [ + [ + NumPower::array([ + [1.0, 2.5, -0.1], + [0.1, 0.0, 3.0], + [0.002, -6.0, -0.5], + ]), + NumPower::array([ + [0.25, 0.7, 0.1], + [0.50, 0.2, 0.01], + [0.25, 0.1, 0.89], + ]), + [ + [0.25, 0.7, 0.0], + [0.5, 0.0, 0.01], + [0.25, 0, 0.0], + ], + ], + ]; + } + + protected function setUp() : void + { + $this->fanIn = 3; + + $this->input = NumPower::array([ + [1.0, 2.5, -0.1], + [0.1, 0.0, 3.0], + [0.002, -6.0, -0.5], + ]); + + $this->prevGrad = new Deferred(fn: function () : NDArray { + return NumPower::array([ + [0.25, 0.7, 0.1], + [0.50, 0.2, 0.01], + [0.25, 0.1, 0.89], + ]); + }); + + $this->optimizer = new Stochastic(0.001); + + $this->layer = new Activation(new ReLU()); + } + + #[Test] + #[TestDox('Can be cast to a string')] + public function testToString() : void + { + self::assertEquals('Activation (activation fn: ReLU)', (string) $this->layer); + } + + #[Test] + #[TestDox('Initializes width equal to fan-in')] + public function testInitializeSetsWidth() : void + { + $this->layer->initialize($this->fanIn); + + self::assertEquals($this->fanIn, $this->layer->width()); + } + + #[Test] + #[TestDox('Computes forward activations')] + #[DataProvider('forwardProvider')] + public function testForward(NDArray $input, array $expected) : void + { + $this->layer->initialize($this->fanIn); + + $forward = $this->layer->forward($input); + self::assertEqualsWithDelta($expected, $forward->toArray(), 1e-7); + } + + #[Test] + #[TestDox('Computes backpropagated gradients after forward pass')] + #[DataProvider('backProvider')] + public function testBack(NDArray $input, NDArray $prevGrad, array $expected) : void + { + $this->layer->initialize($this->fanIn); + + // Forward pass to set internal input/output state + $this->layer->forward($input); + + $gradient = $this->layer + ->back(prevGradient: new Deferred(fn: fn () => $prevGrad), optimizer: $this->optimizer) + ->compute(); + + self::assertEqualsWithDelta($expected, $gradient->toArray(), 1e-7); + } + + #[Test] + #[TestDox('Computes inference activations')] + #[DataProvider('forwardProvider')] + public function testInfer(NDArray $input, array $expected) : void + { + $this->layer->initialize($this->fanIn); + + $infer = $this->layer->infer($input); + self::assertEqualsWithDelta($expected, $infer->toArray(), 1e-7); + } + + #[Test] + #[TestDox('Computes gradient correctly given input, output, and previous gradient')] + #[DataProvider('backProvider')] + public function testGradient(NDArray $input, NDArray $prevGrad, array $expected) : void + { + $this->layer->initialize($this->fanIn); + + // Produce output to pass explicitly to gradient + $output = $this->layer->forward($input); + + $gradient = $this->layer->gradient( + $input, + $output, + new Deferred(fn: fn () => $prevGrad) + ); + + self::assertEqualsWithDelta($expected, $gradient->toArray(), 1e-7); + } +} 
diff --git a/tests/NeuralNet/Layers/BatchNorm/BatchNormTest.php b/tests/NeuralNet/Layers/BatchNorm/BatchNormTest.php new file mode 100644 index 000000000..dd5380941 --- /dev/null +++ b/tests/NeuralNet/Layers/BatchNorm/BatchNormTest.php @@ -0,0 +1,393 @@ + + */ + public static function initializeProvider() : array + { + return [ + 'fanIn=3' => [3], + ]; + } + + /** + * @return array + */ + public static function forwardProvider() : array + { + return [ + 'expectedForward' => [[ + [-0.1251222, 1.2825031, -1.1573808], + [-0.6708631, -0.7427414, 1.4136046], + [0.7974158, -1.4101899, 0.6127743], + ]], + ]; + } + + /** + * @return array + */ + public static function backProvider() : array + { + return [ + 'expectedGradient' => [[ + [-0.0644587, 0.0272710, 0.0371877], + [0.1137590, -0.1099670, -0.0037919], + [-0.1190978, -0.0108703, 0.1299681], + ]], + ]; + } + + /** + * @return array + */ + public static function inferProvider() : array + { + return [ + 'expectedInfer' => [[ + [-0.1251222, 1.2825031, -1.1573808], + [-0.6708631, -0.7427414, 1.4136046], + [0.7974158, -1.4101899, 0.6127743], + ]], + ]; + } + + /** + * Additional inputs to validate behavior across different batch sizes. + * + * @return array + */ + public static function batchInputsProvider() : array + { + return [ + 'batch1x3' => [[ + [2.0, -1.0, 0.0], + ]], + 'batch2x3' => [[ + [1.0, 2.0, 3.0], + [3.0, 3.0, 3.0], + ]], + 'batch4x3' => [[ + [0.5, -0.5, 1.5], + [10.0, -10.0, 0.0], + [7.2, 3.3, -2.4], + [-1.0, -2.0, 4.0], + ]], + ]; + } + + /** + * @return array + */ + public static function gradientProvider() : array + { + return [ + 'expectedGradient' => [[ + [-0.0644587, 0.0272710, 0.0371877], + [0.1137590, -0.1099670, -0.0037919], + [-0.1190978, -0.0108703, 0.1299681], + ]], + ]; + } + + /** + * @return array + */ + public static function badDecayProvider() : array + { + return [ + 'negative' => [-0.01], + 'greaterThanOne' => [1.01], + ]; + } + + protected function setUp() : void + { + $this->fanIn = 3; + + $this->input = NumPower::array([ + [1.0, 2.5, -0.1], + [0.1, 0.0, 3.0], + [0.002, -6.0, -0.5], + ]); + + $this->prevGrad = new Deferred(fn: function () : NDArray { + return NumPower::array([ + [0.25, 0.7, 0.1], + [0.50, 0.2, 0.01], + [0.25, 0.1, 0.89], + ]); + }); + + $this->optimizer = new Stochastic(0.001); + + $this->layer = new BatchNorm( + decay: 0.9, + betaInitializer: new Constant(0.0), + gammaInitializer: new Constant(1.0) + ); + } + + #[Test] + #[TestDox('Can be cast to a string')] + public function testToString() : void + { + self::assertEquals( + 'Batch Norm (decay: 0.9, beta initializer: Constant (value: 0), gamma initializer: Constant (value: 1))', + (string) $this->layer + ); + } + + #[Test] + #[TestDox('Initializes width and returns fan out')] + #[DataProvider('initializeProvider')] + public function testInitialize(int $fanIn) : void + { + $fanOut = $this->layer->initialize($fanIn); + self::assertEquals($fanIn, $fanOut); + self::assertEquals($fanIn, $this->layer->width()); + } + + #[Test] + #[TestDox('Computes forward pass')] + #[DataProvider('forwardProvider')] + public function testForward(array $expected) : void + { + $this->layer->initialize($this->fanIn); + + $forward = $this->layer->forward($this->input); + + self::assertEqualsWithDelta($expected, $forward->toArray(), 1e-7); + } + + #[Test] + #[TestDox('Backpropagates and returns gradient for previous layer')] + #[DataProvider('backProvider')] + public function testBack(array $expected) : void + { + $this->layer->initialize($this->fanIn); + 
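+ // Forward pass first so the layer caches the batch statistics that back() needs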
$this->layer->forward($this->input); + + $gradient = $this->layer->back( + prevGradient: $this->prevGrad, + optimizer: $this->optimizer + )->compute(); + + self::assertInstanceOf(NDArray::class, $gradient); + self::assertEqualsWithDelta($expected, $gradient->toArray(), 1e-7); + } + + #[Test] + #[TestDox('Infers using running statistics')] + #[DataProvider('inferProvider')] + public function testInfer(array $expected) : void + { + $this->layer->initialize($this->fanIn); + // Perform a forward pass to set running mean/variance + $this->layer->forward($this->input); + + $infer = $this->layer->infer($this->input); + + self::assertEqualsWithDelta($expected, $infer->toArray(), 1e-7); + } + + #[Test] + #[TestDox('Computes forward pass (row-wise) with zero mean and unit variance per sample for various batch sizes')] + #[DataProvider('batchInputsProvider')] + public function testForwardStatsMultipleBatches(array $input) : void + { + $this->layer->initialize($this->fanIn); + + $forward = $this->layer->forward(NumPower::array($input)); + $out = $forward->toArray(); + + // Check per-row mean ~ 0 and variance ~ 1 (allow 0 for degenerate rows) + $this->assertRowwiseStats($input, $out, true); + } + + #[Test] + #[TestDox('Infers (row-wise) with zero mean and unit variance per sample for various batch sizes')] + #[DataProvider('batchInputsProvider')] + public function testInferStatsMultipleBatches(array $input) : void + { + $this->layer->initialize($this->fanIn); + + // Perform a forward pass on the same input to initialize running stats + $this->layer->forward(NumPower::array($input)); + + $infer = $this->layer->infer(NumPower::array($input)); + $out = $infer->toArray(); + + $this->assertRowwiseStats($input, $out, false); + } + + #[Test] + #[TestDox('Throws when width is requested before initialization')] + public function testWidthThrowsBeforeInitialize() : void + { + $layer = new BatchNorm(); + $this->expectException(RubixRuntimeException::class); + $layer->width(); + } + + #[Test] + #[TestDox('Constructor rejects invalid decay values')] + #[DataProvider('badDecayProvider')] + public function testConstructorRejectsInvalidDecay(float $decay) : void + { + $this->expectException(InvalidArgumentException::class); + new BatchNorm(decay: $decay); + } + + #[Test] + #[TestDox('Yields trainable parameters beta and gamma')] + public function testParameters() : void + { + $this->layer->initialize($this->fanIn); + + $params = iterator_to_array($this->layer->parameters()); + + self::assertArrayHasKey('beta', $params); + self::assertArrayHasKey('gamma', $params); + self::assertInstanceOf(TrainableParameter::class, $params['beta']); + self::assertInstanceOf(TrainableParameter::class, $params['gamma']); + + self::assertEquals([0.0, 0.0, 0.0], $params['beta']->param()->toArray()); + self::assertEquals([1.0, 1.0, 1.0], $params['gamma']->param()->toArray()); + } + + #[Test] + #[TestDox('Restores parameters from array')] + public function testRestore() : void + { + $this->layer->initialize($this->fanIn); + + $betaNew = new TrainableParameter(NumPower::full([3], 2.0)); + $gammaNew = new TrainableParameter(NumPower::full([3], 3.0)); + + $this->layer->restore([ + 'beta' => $betaNew, + 'gamma' => $gammaNew, + ]); + + $restored = iterator_to_array($this->layer->parameters()); + self::assertSame($betaNew, $restored['beta']); + self::assertSame($gammaNew, $restored['gamma']); + self::assertEquals([2.0, 2.0, 2.0], $restored['beta']->param()->toArray()); + self::assertEquals([3.0, 3.0, 3.0], $restored['gamma']->param()->toArray()); 
+ } + + #[Test] + #[TestDox('Computes gradient for previous layer directly')] + #[DataProvider('gradientProvider')] + public function testGradient(array $expected) : void + { + $this->layer->initialize($this->fanIn); + + // Compute forward-time caches manually to pass into gradient() + $input = $this->input; + $rows = $input->shape()[0]; + $meanArr = []; + $varArr = []; + $stdInvArr = []; + + for ($i = 0; $i < $rows; $i++) { + $row = $input->toArray()[$i]; + $meanArr[$i] = NumPower::mean($row); + $varArr[$i] = NumPower::variance($row); + $stdInvArr[$i] = 1.0 / sqrt($varArr[$i]); + } + + $mean = NumPower::array($meanArr); + $stdInv = NumPower::array($stdInvArr); + + $xHat = NumPower::multiply( + NumPower::subtract(NumPower::transpose($input, [1, 0]), $mean), + $stdInv + ); + $xHat = NumPower::transpose($xHat, [1, 0]); + + // Use provided prevGrad as dOut and current gamma parameter + $dOut = ($this->prevGrad)(); + $gamma = iterator_to_array($this->layer->parameters())['gamma']->param(); + + $gradient = $this->layer->gradient($dOut, $gamma, $stdInv, $xHat); + + self::assertEqualsWithDelta($expected, $gradient->toArray(), 1e-7); + } + + /** + * @param array> $inputRows + * @param array> $outRows + */ + private function assertRowwiseStats(array $inputRows, array $outRows, bool $checkMean) : void + { + foreach ($outRows as $i => $row) { + $mean = array_sum($row) / count($row); + $var = 0.0; + foreach ($row as $v) { + $var += ($v - $mean) * ($v - $mean); + } + $var /= count($row); + + $orig = $inputRows[$i]; + $origMean = array_sum($orig) / count($orig); + $origVar = 0.0; + foreach ($orig as $ov) { + $origVar += ($ov - $origMean) * ($ov - $origMean); + } + $origVar /= count($orig); + + $expectedVar = $origVar < 1e-12 ? 0.0 : 1.0; + + if ($checkMean) { + self::assertEqualsWithDelta(0.0, $mean, 1e-7); + } + + if ($expectedVar === 0.0) { + self::assertLessThan(1e-6, $var); + } else { + self::assertEqualsWithDelta(1.0, $var, 1e-6); + } + } + } +} diff --git a/tests/NeuralNet/Layers/Binary/BinaryTest.php b/tests/NeuralNet/Layers/Binary/BinaryTest.php new file mode 100644 index 000000000..645d7c86b --- /dev/null +++ b/tests/NeuralNet/Layers/Binary/BinaryTest.php @@ -0,0 +1,192 @@ + + */ + public static function forwardProvider() : array + { + return [ + [ + [ + [0.7310585, 0.9241418, 0.4750207], + ], + ], + ]; + } + + /** + * @return array + */ + public static function backProvider() : array + { + return [ + [ + [ + [0.2436861, -0.0252860, 0.1583402], + ], + ], + ]; + } + + /** + * @return array}> + */ + public static function badClassesProvider() : array + { + return [ + 'empty' => [[]], + 'single' => [['hot']], + 'duplicatesToOne' => [['hot', 'hot']], + 'threeUnique' => [['hot', 'cold', 'warm']], + ]; + } + + protected function setUp() : void + { + $this->input = NumPower::array([ + [1.0, 2.5, -0.1], + ]); + + $this->labels = ['hot', 'cold', 'hot']; + + $this->optimizer = new Stochastic(0.001); + + $this->layer = new Binary(classes: ['hot', 'cold'], costFn: new CrossEntropy()); + } + + #[Test] + #[TestDox('Returns string representation')] + public function testToString() : void + { + $this->layer->initialize(1); + + self::assertEquals('Binary (cost function: Cross Entropy)', (string) $this->layer); + } + + #[Test] + #[TestDox('Initializes and reports width')] + public function testInitializeWidth() : void + { + $this->layer->initialize(1); + self::assertEquals(1, $this->layer->width()); + } + + #[Test] + #[TestDox('Constructor rejects invalid classes arrays')] + #[DataProvider('badClassesProvider')] 
+ public function testConstructorRejectsInvalidClasses(array $classes) : void + { + $this->expectException(InvalidArgumentException::class); + new Binary(classes: $classes, costFn: new CrossEntropy()); + } + + #[Test] + #[TestDox('Constructor accepts classes arrays that dedupe to exactly 2 labels')] + public function testConstructorAcceptsDuplicateClassesThatDedupeToTwo() : void + { + $layer = new Binary(classes: ['hot', 'cold', 'hot'], costFn: new CrossEntropy()); + // Should initialize without throwing and report correct width + $layer->initialize(1); + self::assertEquals(1, $layer->width()); + } + + #[Test] + #[TestDox('Computes forward pass')] + #[DataProvider('forwardProvider')] + public function testForward(array $expected) : void + { + $this->layer->initialize(1); + + $forward = $this->layer->forward($this->input); + self::assertEqualsWithDelta($expected, $forward->toArray(), 1e-7); + } + + #[Test] + #[TestDox('Backpropagates and returns gradient for previous layer')] + #[DataProvider('backProvider')] + public function testBack(array $expectedGradient) : void + { + $this->layer->initialize(1); + $this->layer->forward($this->input); + + [$computation, $loss] = $this->layer->back(labels: $this->labels, optimizer: $this->optimizer); + + self::assertInstanceOf(Deferred::class, $computation); + self::assertIsFloat($loss); + + $gradient = $computation->compute(); + + self::assertInstanceOf(NDArray::class, $gradient); + self::assertEqualsWithDelta($expectedGradient, $gradient->toArray(), 1e-7); + } + + #[Test] + #[TestDox('Computes gradient directly given input, output, expected, and batch size')] + #[DataProvider('backProvider')] + public function testGradient(array $expectedGradient) : void + { + $this->layer->initialize(1); + + $input = $this->input; + $output = $this->layer->forward($input); + + // Build expected NDArray (1, batch) using the Binary classes mapping: hot=>0.0, cold=>1.0 + $expected = []; + foreach ($this->labels as $label) { + $expected[] = ($label === 'cold') ? 
1.0 : 0.0; + } + $expected = NumPower::array([$expected]); + + $batchSize = count($this->labels); + + $gradient = $this->layer->gradient($input, $output, $expected, $batchSize); + + self::assertInstanceOf(NDArray::class, $gradient); + self::assertEqualsWithDelta($expectedGradient, $gradient->toArray(), 1e-7); + } + + #[Test] + #[TestDox('Computes inference activations')] + #[DataProvider('forwardProvider')] + public function testInfer(array $expected) : void + { + $this->layer->initialize(1); + + $infer = $this->layer->infer($this->input); + self::assertEqualsWithDelta($expected, $infer->toArray(), 1e-7); + } +} diff --git a/tests/NeuralNet/Layers/Continuous/ContinuousTest.php b/tests/NeuralNet/Layers/Continuous/ContinuousTest.php new file mode 100644 index 000000000..39592cdcb --- /dev/null +++ b/tests/NeuralNet/Layers/Continuous/ContinuousTest.php @@ -0,0 +1,159 @@ + + */ + public static function forwardProvider() : array + { + return [ + [ + [ + [2.5, 0.0, -6.0], + ], + ], + ]; + } + + /** + * @return array + */ + public static function gradientProvider() : array + { + return [ + [ + [ + [0.8333333, 0.8333333, -32.0], + ], + ], + ]; + } + + protected function setUp() : void + { + $this->input = NumPower::array([ + [2.5, 0.0, -6.0], + ]); + + $this->labels = [0.0, -2.5, 90.0]; + + $this->optimizer = new Stochastic(0.001); + + $this->layer = new Continuous(new LeastSquares()); + } + + #[Test] + #[TestDox('Returns string representation')] + public function testToString() : void + { + $this->layer->initialize(1); + + self::assertEquals('Continuous (cost function: Least Squares)', (string) $this->layer); + } + + #[Test] + #[TestDox('Initializes and reports width')] + public function testInitializeWidth() : void + { + $this->layer->initialize(1); + self::assertEquals(1, $this->layer->width()); + } + + #[Test] + #[TestDox('Initialize rejects fan-in not equal to 1')] + public function testInitializeRejectsInvalidFanIn() : void + { + $this->expectException(InvalidArgumentException::class); + $this->layer->initialize(2); + } + + #[Test] + #[TestDox('Computes forward pass')] + #[DataProvider('forwardProvider')] + public function testForward(array $expected) : void + { + $this->layer->initialize(1); + + $forward = $this->layer->forward($this->input); + self::assertEqualsWithDelta($expected, $forward->toArray(), 1e-7); + } + + #[Test] + #[TestDox('Backpropagates and returns gradient for previous layer')] + #[DataProvider('gradientProvider')] + public function testBack(array $expectedGradient) : void + { + $this->layer->initialize(1); + $this->layer->forward($this->input); + + [$computation, $loss] = $this->layer->back(labels: $this->labels, optimizer: $this->optimizer); + + self::assertInstanceOf(Deferred::class, $computation); + self::assertIsFloat($loss); + + $gradient = $computation->compute(); + + self::assertInstanceOf(NDArray::class, $gradient); + self::assertEqualsWithDelta($expectedGradient, $gradient->toArray(), 1e-7); + } + + #[Test] + #[TestDox('Computes gradient directly given input and expected')] + #[DataProvider('gradientProvider')] + public function testGradient(array $expectedGradient) : void + { + $this->layer->initialize(1); + + $input = $this->input; + $expected = NumPower::array([$this->labels]); + + $gradient = $this->layer->gradient($input, $expected); + + self::assertInstanceOf(NDArray::class, $gradient); + self::assertEqualsWithDelta($expectedGradient, $gradient->toArray(), 1e-7); + } + + #[Test] + #[TestDox('Computes inference activations')] + 
#[DataProvider('forwardProvider')] + public function testInfer(array $expected) : void + { + $this->layer->initialize(1); + + $infer = $this->layer->infer($this->input); + self::assertEqualsWithDelta($expected, $infer->toArray(), 1e-7); + } +} diff --git a/tests/NeuralNet/Optimizers/AdaGrad/AdaGradTest.php b/tests/NeuralNet/Optimizers/AdaGrad/AdaGradTest.php new file mode 100644 index 000000000..44ff773f5 --- /dev/null +++ b/tests/NeuralNet/Optimizers/AdaGrad/AdaGradTest.php @@ -0,0 +1,94 @@ + [0.0]; + yield 'negative rate' => [-0.001]; + } + + public static function stepProvider() : Generator + { + yield [ + new Parameter(NumPower::array([ + [0.1, 0.6, -0.4], + [0.5, 0.6, -0.4], + [0.1, 0.1, -0.7], + ])), + NumPower::array([ + [0.01, 0.05, -0.02], + [-0.01, 0.02, 0.03], + [0.04, -0.01, -0.5], + ]), + [ + [0.001, 0.001, -0.001], + [-0.001, 0.001, 0.001], + [0.001, -0.001, -0.001], + ], + ]; + } + + protected function setUp() : void + { + $this->optimizer = new AdaGrad(0.001); + } + + #[Test] + #[TestDox('Can be cast to a string')] + public function testToString() : void + { + self::assertSame('AdaGrad (rate: 0.01)', (string) (new AdaGrad())); + } + + /** + * @param float $rate + */ + #[Test] + #[DataProvider('invalidConstructorProvider')] + #[TestDox('Throws exception when constructed with invalid arguments')] + public function testInvalidConstructorParams(float $rate) : void + { + $this->expectException(InvalidArgumentException::class); + + new AdaGrad(rate: $rate); + } + + /** + * @param Parameter $param + * @param NDArray $gradient + * @param list> $expected + */ + #[Test] + #[DataProvider('stepProvider')] + #[TestDox('Can compute the step')] + public function testStep(Parameter $param, NDArray $gradient, array $expected) : void + { + $this->optimizer->warm($param); + + $step = $this->optimizer->step(param: $param, gradient: $gradient); + + self::assertEqualsWithDelta($expected, $step->toArray(), 1e-7); + } +} diff --git a/tests/NeuralNet/Optimizers/AdaMax/AdaMaxTest.php b/tests/NeuralNet/Optimizers/AdaMax/AdaMaxTest.php new file mode 100644 index 000000000..0ca059561 --- /dev/null +++ b/tests/NeuralNet/Optimizers/AdaMax/AdaMaxTest.php @@ -0,0 +1,108 @@ + [0.0, 0.1, 0.001]; + yield 'negative rate' => [-0.001, 0.1, 0.001]; + yield 'zero momentum decay' => [0.001, 0.0, 0.001]; + yield 'momentum decay == 1' => [0.001, 1.0, 0.001]; + yield 'momentum decay > 1' => [0.001, 1.5, 0.001]; + yield 'negative momentum decay' => [0.001, -0.1, 0.001]; + yield 'zero norm decay' => [0.001, 0.1, 0.0]; + yield 'norm decay == 1' => [0.001, 0.1, 1.0]; + yield 'norm decay > 1' => [0.001, 0.1, 1.5]; + yield 'negative norm decay' => [0.001, 0.1, -0.1]; + } + + public static function stepProvider() : Generator + { + yield [ + new Parameter(NumPower::array([ + [0.1, 0.6, -0.4], + [0.5, 0.6, -0.4], + [0.1, 0.1, -0.7], + ])), + NumPower::array([ + [0.01, 0.05, -0.02], + [-0.01, 0.02, 0.03], + [0.04, -0.01, -0.5], + ]), + [ + [0.0001, 0.0001, -0.0001], + [-0.0001, 0.0001, 0.0001], + [0.0001, -0.0001, -0.0001], + ], + ]; + } + + protected function setUp() : void + { + $this->optimizer = new AdaMax( + rate: 0.001, + momentumDecay: 0.1, + normDecay: 0.001 + ); + } + + #[Test] + #[TestDox('Can be cast to a string')] + public function testToString() : void + { + self::assertEquals('AdaMax (rate: 0.001, momentum decay: 0.1, norm decay: 0.001)', (string) $this->optimizer); + } + + /** + * @param float $rate + * @param float $momentumDecay + * @param float $normDecay + */ + #[Test] + 
#[DataProvider('invalidConstructorProvider')] + #[TestDox('Throws exception when constructed with invalid arguments')] + public function testInvalidConstructorParams(float $rate, float $momentumDecay, float $normDecay) : void + { + $this->expectException(InvalidArgumentException::class); + + new AdaMax(rate: $rate, momentumDecay: $momentumDecay, normDecay: $normDecay); + } + + /** + * @param Parameter $param + * @param NDArray $gradient + * @param list> $expected + */ + #[Test] + #[DataProvider('stepProvider')] + #[TestDox('Can compute the step')] + public function testStep(Parameter $param, NDArray $gradient, array $expected) : void + { + $this->optimizer->warm($param); + + $step = $this->optimizer->step(param: $param, gradient: $gradient); + + self::assertEqualsWithDelta($expected, $step->toArray(), 1e-7); + } +} diff --git a/tests/NeuralNet/Optimizers/Adam/AdamTest.php b/tests/NeuralNet/Optimizers/Adam/AdamTest.php new file mode 100644 index 000000000..bcf19d344 --- /dev/null +++ b/tests/NeuralNet/Optimizers/Adam/AdamTest.php @@ -0,0 +1,141 @@ + [0.0, 0.1, 0.001]; + yield 'negative rate' => [-0.5, 0.1, 0.001]; + + // Invalid momentumDecay (<= 0 or >= 1) + yield 'zero momentumDecay' => [0.001, 0.0, 0.001]; + yield 'negative momentumDecay' => [0.001, -0.1, 0.001]; + yield 'momentumDecay == 1' => [0.001, 1.0, 0.001]; + yield 'momentumDecay > 1' => [0.001, 1.1, 0.001]; + + // Invalid normDecay (<= 0 or >= 1) + yield 'zero normDecay' => [0.001, 0.1, 0.0]; + yield 'negative normDecay' => [0.001, 0.1, -0.1]; + yield 'normDecay == 1' => [0.001, 0.1, 1.0]; + yield 'normDecay > 1' => [0.001, 0.1, 1.1]; + } + + public static function stepProvider() : Generator + { + yield [ + new Parameter(NumPower::array([ + [0.1, 0.6, -0.4], + [0.5, 0.6, -0.4], + [0.1, 0.1, -0.7], + ])), + NumPower::array([ + [0.01, 0.05, -0.02], + [-0.01, 0.02, 0.03], + [0.04, -0.01, -0.5], + ]), + [ + [0.0031622, 0.0031622, -0.0031622], + [-0.0031622, 0.0031622, 0.0031622], + [0.0031622, -0.0031622, -0.0031622], + ], + ]; + } + + protected function setUp() : void + { + $this->optimizer = new Adam( + rate: 0.001, + momentumDecay: 0.1, + normDecay: 0.001 + ); + } + + #[Test] + #[TestDox('Can be cast to a string')] + public function testToString() : void + { + $expected = 'Adam (rate: 0.001, momentum decay: 0.1, norm decay: 0.001)'; + self::assertSame($expected, (string) $this->optimizer); + } + + #[Test] + #[TestDox('Warm initializes zeroed velocity and norm caches with the parameter\'s shape')] + public function testWarmInitializesZeroedCache() : void + { + $param = new Parameter(NumPower::array([ + [1.0, 2.0, 3.0], + [4.0, 5.0, 6.0], + ])); + + // Warm the optimizer for this parameter + $this->optimizer->warm($param); + + // Inspect protected cache via reflection + $ref = new \ReflectionClass($this->optimizer); + $prop = $ref->getProperty('cache'); + $prop->setAccessible(true); + $cache = $prop->getValue($this->optimizer); + + self::assertArrayHasKey($param->id(), $cache); + + [$velocity, $norm] = $cache[$param->id()]; + + $zeros = NumPower::zeros($param->param()->shape()); + self::assertEqualsWithDelta($zeros->toArray(), $velocity->toArray(), 0.0); + self::assertEqualsWithDelta($zeros->toArray(), $norm->toArray(), 0.0); + } + + /** + * @param float $rate + * @param float $momentumDecay + * @param float $normDecay + */ + #[Test] + #[DataProvider('invalidConstructorProvider')] + #[TestDox('Throws exception when constructed with invalid arguments')] + public function testInvalidConstructorParams(float $rate, float 
$momentumDecay, float $normDecay) : void + { + $this->expectException(InvalidArgumentException::class); + new Adam(rate: $rate, momentumDecay: $momentumDecay, normDecay: $normDecay); + } + + /** + * @param Parameter $param + * @param NDArray $gradient + * @param list> $expected + */ + #[Test] + #[DataProvider('stepProvider')] + #[TestDox('Can compute the step')] + public function testStep(Parameter $param, NDArray $gradient, array $expected) : void + { + $this->optimizer->warm($param); + + $step = $this->optimizer->step(param: $param, gradient: $gradient); + + self::assertEqualsWithDelta($expected, $step->toArray(), 1e-7); + } +} diff --git a/tests/NeuralNet/Optimizers/Cyclical/CyclicalTest.php b/tests/NeuralNet/Optimizers/Cyclical/CyclicalTest.php new file mode 100644 index 000000000..302b770be --- /dev/null +++ b/tests/NeuralNet/Optimizers/Cyclical/CyclicalTest.php @@ -0,0 +1,106 @@ + [0.0, 0.006, 2000, null]; + yield 'negative lower' => [-0.001, 0.006, 2000, null]; + yield 'lower > upper' => [0.01, 0.006, 2000, null]; + yield 'zero steps' => [0.001, 0.006, 0, null]; + yield 'negative steps' => [0.001, 0.006, -5, null]; + yield 'zero decay' => [0.001, 0.006, 2000, 0.0]; + yield 'decay == 1' => [0.001, 0.006, 2000, 1.0]; + yield 'decay > 1' => [0.001, 0.006, 2000, 1.5]; + yield 'negative decay' => [0.001, 0.006, 2000, -0.1]; + } + + public static function stepProvider() : Generator + { + yield [ + new Parameter(NumPower::array([ + [0.1, 0.6, -0.4], + [0.5, 0.6, -0.4], + [0.1, 0.1, -0.7], + ])), + NumPower::array([ + [0.01, 0.05, -0.02], + [-0.01, 0.02, 0.03], + [0.04, -0.01, -0.5], + ]), + [ + [0.00001, 0.00005, -0.00002], + [-0.00001, 0.00002, 0.00003], + [0.00004, -0.00001, -0.0005], + ], + ]; + } + + protected function setUp() : void + { + $this->optimizer = new Cyclical(lower: 0.001, upper: 0.006, losses: 2000); + } + + #[Test] + #[TestDox('Can be cast to a string')] + public function testToString() : void + { + self::assertEquals('Cyclical (lower: 0.001, upper: 0.006, steps: 2000, decay: 0.99994)', (string) $this->optimizer); + } + + /** + * @param float $lower + * @param float $upper + * @param int $losses + * @param float|null $decay + */ + #[Test] + #[DataProvider('invalidConstructorProvider')] + #[TestDox('Throws exception when constructed with invalid arguments')] + public function testConstructorInvalidArgs(float $lower, float $upper, int $losses, ?float $decay) : void + { + $this->expectException(InvalidArgumentException::class); + + if ($decay === null) { + new Cyclical(lower: $lower, upper: $upper, losses: $losses); + } else { + new Cyclical(lower: $lower, upper: $upper, losses: $losses, decay: $decay); + } + } + + /** + * @param Parameter $param + * @param NDArray $gradient + * @param list> $expected + */ + #[Test] + #[DataProvider('stepProvider')] + #[TestDox('Can compute the step')] + public function testStep(Parameter $param, NDArray $gradient, array $expected) : void + { + $step = $this->optimizer->step(param: $param, gradient: $gradient); + + self::assertEqualsWithDelta($expected, $step->toArray(), 1e-7); + } +} diff --git a/tests/NeuralNet/Optimizers/Momentum/MomentumTest.php b/tests/NeuralNet/Optimizers/Momentum/MomentumTest.php new file mode 100644 index 000000000..03b65f9a7 --- /dev/null +++ b/tests/NeuralNet/Optimizers/Momentum/MomentumTest.php @@ -0,0 +1,126 @@ + [0.0, 0.1]; + yield 'negative rate' => [-0.001, 0.1]; + yield 'zero decay' => [0.001, 0.0]; + yield 'decay == 1' => [0.001, 1.0]; + yield 'decay > 1' => [0.001, 1.5]; + yield 'negative decay' => [0.001, 
-0.1]; + } + + public static function stepProvider() : Generator + { + yield [ + new Parameter(NumPower::array([ + [0.1, 0.6, -0.4], + [0.5, 0.6, -0.4], + [0.1, 0.1, -0.7], + ])), + NumPower::array([ + [0.01, 0.05, -0.02], + [-0.01, 0.02, 0.03], + [0.04, -0.01, -0.5], + ]), + [ + [0.00001, 0.00005, -0.00002], + [-0.00001, 0.00002, 0.00003], + [0.00004, -0.00001, -0.0005], + ], + ]; + } + + protected function setUp() : void + { + $this->optimizer = new Momentum(rate: 0.001, decay: 0.1, lookahead: false); + } + + #[Test] + #[TestDox('Can be cast to a string')] + public function testToString() : void + { + self::assertEquals('Momentum (rate: 0.001, decay: 0.1, lookahead: false)', (string) $this->optimizer); + } + + /** + * @param float $rate + * @param float $decay + */ + #[Test] + #[DataProvider('invalidConstructorProvider')] + #[TestDox('Throws exception when constructed with invalid arguments')] + public function testInvalidConstructorParams(float $rate, float $decay) : void + { + $this->expectException(InvalidArgumentException::class); + + new Momentum(rate: $rate, decay: $decay); + } + + #[Test] + #[TestDox('Warm initializes a zeroed velocity cache with the parameter\'s shape')] + public function testWarmInitializesZeroedCache() : void + { + $param = new Parameter(NumPower::array([ + [1.0, 2.0, 3.0], + [4.0, 5.0, 6.0], + ])); + + // Warm the optimizer for this parameter + $this->optimizer->warm($param); + + // Use reflection to read the protected cache + $ref = new \ReflectionClass($this->optimizer); + $prop = $ref->getProperty('cache'); + $prop->setAccessible(true); + $cache = $prop->getValue($this->optimizer); + + self::assertArrayHasKey($param->id(), $cache); + + $velocity = $cache[$param->id()]; + + // Verify the velocity is an all-zeros tensor of the correct shape + $zeros = NumPower::zeros($param->param()->shape()); + self::assertEqualsWithDelta($zeros->toArray(), $velocity->toArray(), 0.0); + } + + /** + * @param Parameter $param + * @param NDArray $gradient + * @param list> $expected + */ + #[Test] + #[DataProvider('stepProvider')] + #[TestDox('Can compute the step')] + public function testStep(Parameter $param, NDArray $gradient, array $expected) : void + { + $this->optimizer->warm($param); + + $step = $this->optimizer->step(param: $param, gradient: $gradient); + + self::assertEqualsWithDelta($expected, $step->toArray(), 1e-7); + } +} diff --git a/tests/NeuralNet/Optimizers/RMSProp/RMSPropTest.php b/tests/NeuralNet/Optimizers/RMSProp/RMSPropTest.php new file mode 100644 index 000000000..f47e4f2b3 --- /dev/null +++ b/tests/NeuralNet/Optimizers/RMSProp/RMSPropTest.php @@ -0,0 +1,122 @@ + [0.0, 0.1]; + yield 'negative rate' => [-0.001, 0.1]; + yield 'zero decay' => [0.001, 0.0]; + yield 'decay == 1' => [0.001, 1.0]; + yield 'decay > 1' => [0.001, 1.5]; + yield 'negative decay' => [0.001, -0.1]; + } + + public static function stepProvider() : Generator + { + yield [ + new Parameter(NumPower::array([ + [0.1, 0.6, -0.4], + [0.5, 0.6, -0.4], + [0.1, 0.1, -0.7], + ])), + NumPower::array([ + [0.01, 0.05, -0.02], + [-0.01, 0.02, 0.03], + [0.04, -0.01, -0.5], + ]), + [ + [0.0031622, 0.0031622, -0.0031622], + [-0.0031622, 0.0031622, 0.0031622], + [0.0031622, -0.0031622, -0.0031622], + ], + ]; + } + + protected function setUp() : void + { + $this->optimizer = new RMSProp(rate: 0.001, decay: 0.1); + } + + #[Test] + #[TestDox('Can be cast to a string')] + public function testToString() : void + { + self::assertEquals('RMS Prop (rate: 0.001, decay: 0.1)', (string) $this->optimizer); + } + + 
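+ /**
+ * Illustrative sketch of the running-average behaviour: applying the same
+ * gradient twice grows the accumulated norm from 0.1·g² to 0.19·g², so the
+ * magnitude of the second step should drop to roughly 0.001 / √0.19 ≈ 0.0022942.
+ * Expected values below are hand-computed from the update rule documented in step().
+ */
+ #[Test]
+ #[TestDox('Shrinks the step as the squared-gradient norm accumulates')]
+ public function testStepAccumulatesNorm() : void
+ {
+ $param = new Parameter(NumPower::array([
+ [0.1, 0.6, -0.4],
+ ]));
+
+ $gradient = NumPower::array([
+ [0.01, 0.05, -0.02],
+ ]);
+
+ $this->optimizer->warm($param);
+
+ $first = $this->optimizer->step(param: $param, gradient: $gradient);
+ $second = $this->optimizer->step(param: $param, gradient: $gradient);
+
+ self::assertEqualsWithDelta([[0.0031622, 0.0031622, -0.0031622]], $first->toArray(), 1e-6);
+ self::assertEqualsWithDelta([[0.0022942, 0.0022942, -0.0022942]], $second->toArray(), 1e-6);
+ }
+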
#[Test] + #[DataProvider('invalidConstructorProvider')] + #[TestDox('Throws exception when constructed with invalid arguments')] + public function testInvalidConstructorParams(float $rate, float $decay) : void + { + $this->expectException(InvalidArgumentException::class); + + new RMSProp(rate: $rate, decay: $decay); + } + + #[Test] + #[TestDox('Warm initializes a zeroed velocity cache with the parameter\'s shape')] + public function testWarmInitializesZeroedCache() : void + { + $param = new Parameter(NumPower::array([ + [1.0, 2.0, 3.0], + [4.0, 5.0, 6.0], + ])); + + // Warm the optimizer for this parameter + $this->optimizer->warm($param); + + // Use reflection to read the protected cache + $ref = new \ReflectionClass($this->optimizer); + $prop = $ref->getProperty('cache'); + $prop->setAccessible(true); + $cache = $prop->getValue($this->optimizer); + + self::assertArrayHasKey($param->id(), $cache); + + $velocity = $cache[$param->id()]; + + // Verify the velocity is an all-zeros tensor of the correct shape + $zeros = NumPower::zeros($param->param()->shape()); + self::assertEqualsWithDelta($zeros->toArray(), $velocity->toArray(), 0.0); + } + + /** + * @param Parameter $param + * @param NDArray $gradient + * @param list> $expected + */ + #[Test] + #[DataProvider('stepProvider')] + #[TestDox('Can compute the step')] + public function testStep(Parameter $param, NDArray $gradient, array $expected) : void + { + $this->optimizer->warm($param); + + $step = $this->optimizer->step(param: $param, gradient: $gradient); + + self::assertEqualsWithDelta($expected, $step->toArray(), 1e-7); + } +} diff --git a/tests/NeuralNet/Optimizers/StepDecay/StepDecayTest.php b/tests/NeuralNet/Optimizers/StepDecay/StepDecayTest.php new file mode 100644 index 000000000..ae7f78810 --- /dev/null +++ b/tests/NeuralNet/Optimizers/StepDecay/StepDecayTest.php @@ -0,0 +1,97 @@ + [0.0, 100, 0.001]; + yield 'negative rate' => [-0.001, 100, 0.001]; + yield 'zero losses' => [0.01, 0, 0.001]; + yield 'negative losses' => [0.01, -5, 0.001]; + yield 'negative decay' => [0.01, 100, -0.1]; + } + + public static function stepProvider() : Generator + { + yield [ + new Parameter(NumPower::array([ + [0.1, 0.6, -0.4], + [0.5, 0.6, -0.4], + [0.1, 0.1, -0.7], + ])), + NumPower::array([ + [0.01, 0.05, -0.02], + [-0.01, 0.02, 0.03], + [0.04, -0.01, -0.5], + ]), + [ + [0.00001, 0.00005, -0.00002], + [-0.00001, 0.00002, 0.00003], + [0.00004, -0.00001, -0.0005], + ], + ]; + } + + protected function setUp() : void + { + $this->optimizer = new StepDecay(rate: 0.001); + } + + #[Test] + #[TestDox('Can be cast to a string')] + public function testToString() : void + { + self::assertEquals('Step Decay (rate: 0.001, steps: 100, decay: 0.001)', (string) $this->optimizer); + } + + /** + * @param float $rate + * @param int $losses + * @param float $decay + */ + #[Test] + #[DataProvider('invalidConstructorProvider')] + #[TestDox('Throws exception when constructed with invalid arguments')] + public function testInvalidConstructorParams(float $rate, int $losses, float $decay) : void + { + $this->expectException(InvalidArgumentException::class); + + new StepDecay(rate: $rate, losses: $losses, decay: $decay); + } + + /** + * @param Parameter $param + * @param NDArray $gradient + * @param list> $expected + */ + #[Test] + #[DataProvider('stepProvider')] + #[TestDox('Can compute the step')] + public function testStep(Parameter $param, NDArray $gradient, array $expected) : void + { + $step = $this->optimizer->step(param: $param, gradient: $gradient); + + 
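+ // On the very first step the floor is 0, so the effective rate is still the initial 0.001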
self::assertEqualsWithDelta($expected, $step->toArray(), 1e-7); + } +} diff --git a/tests/NeuralNet/Optimizers/Stochastic/StochasticTest.php b/tests/NeuralNet/Optimizers/Stochastic/StochasticTest.php index 57a50335f..c24b990f7 100644 --- a/tests/NeuralNet/Optimizers/Stochastic/StochasticTest.php +++ b/tests/NeuralNet/Optimizers/Stochastic/StochasticTest.php @@ -23,6 +23,12 @@ class StochasticTest extends TestCase { protected Stochastic $optimizer; + public static function invalidConstructorProvider() : Generator + { + yield 'zero rate' => [0.0]; + yield 'negative rate' => [-0.001]; + } + public static function stepProvider() : Generator { yield [ @@ -50,19 +56,23 @@ protected function setUp() : void } #[Test] - #[TestDox('Throws exception when constructed with invalid learning rate')] - public function testConstructorWithInvalidRate() : void + #[TestDox('Can be cast to a string')] + public function testToString() : void { - $this->expectException(InvalidArgumentException::class); - - new Stochastic(0.0); + self::assertEquals('Stochastic (rate: 0.001)', (string) $this->optimizer); } + /** + * @param float $rate + */ #[Test] - #[TestDox('Can be cast to a string')] - public function testToString() : void + #[DataProvider('invalidConstructorProvider')] + #[TestDox('Throws exception when constructed with invalid arguments')] + public function testInvalidConstructorParams(float $rate) : void { - self::assertEquals('Stochastic (rate: 0.001)', (string) $this->optimizer); + $this->expectException(InvalidArgumentException::class); + + new Stochastic($rate); } /** @@ -70,7 +80,9 @@ public function testToString() : void * @param NDArray $gradient * @param list> $expected */ + #[Test] #[DataProvider('stepProvider')] + #[TestDox('Can compute the step')] public function testStep(Parameter $param, NDArray $gradient, array $expected) : void { $step = $this->optimizer->step(param: $param, gradient: $gradient);
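+ // Plain SGD accumulates no state between steps; the step is just the gradient scaled by the learning rate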