@@ -155,6 +155,32 @@ pure subroutine minimize_sgd_1d(self, param, gradient)
   end subroutine minimize_sgd_1d
 
 
+  pure subroutine minimize_sgd_2d(self, param, gradient)
+    !! Concrete implementation of a stochastic gradient descent optimizer
+    !! update rule for 2D arrays.
+    class(sgd), intent(inout) :: self
+    real, intent(inout) :: param(:,:)
+    real, intent(in) :: gradient(:,:)
+
+    if (self % momentum > 0) then
+      ! Apply momentum update
+      self % velocity = self % momentum * self % velocity &
+        - self % learning_rate * reshape(gradient, [size(gradient)])
+      if (self % nesterov) then
+        ! Apply Nesterov update
+        param = param + reshape(self % momentum * self % velocity &
+          - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param))
+      else
+        param = param + reshape(self % velocity, shape(param))
+      end if
+    else
+      ! Apply regular update
+      param = param - self % learning_rate * gradient
+    end if
+
+  end subroutine minimize_sgd_2d
+
+
   impure elemental subroutine init_rmsprop(self, num_params)
     class(rmsprop), intent(inout) :: self
     integer, intent(in) :: num_params
@@ -182,6 +208,23 @@ pure subroutine minimize_rmsprop_1d(self, param, gradient)
   end subroutine minimize_rmsprop_1d
 
 
+  pure subroutine minimize_rmsprop_2d(self, param, gradient)
+    !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays.
+    class(rmsprop), intent(inout) :: self
+    real, intent(inout) :: param(:,:)
+    real, intent(in) :: gradient(:,:)
+
+    ! Compute the RMS of the gradient using the RMSProp rule
+    self % rms_gradient = self % decay_rate * self % rms_gradient &
+      + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2
+
+    ! Update the network parameters based on the new RMS of the gradient
+    param = param - self % learning_rate &
+      / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient
+
+  end subroutine minimize_rmsprop_2d
+
+
   impure elemental subroutine init_adam(self, num_params)
     class(adam), intent(inout) :: self
     integer, intent(in) :: num_params
@@ -224,6 +267,37 @@ pure subroutine minimize_adam_1d(self, param, gradient)
   end subroutine minimize_adam_1d
 
 
+  pure subroutine minimize_adam_2d(self, param, gradient)
+    !! Concrete implementation of an Adam optimizer update rule for 2D arrays.
+    class(adam), intent(inout) :: self
+    real, intent(inout) :: param(:,:)
+    real, intent(in) :: gradient(:,:)
+
+    self % t = self % t + 1
+
+    ! If weight_decay_l2 > 0, use L2 regularization;
+    ! otherwise, default to regular Adam.
+    associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)]))
+      self % m = self % beta1 * self % m + (1 - self % beta1) * g
+      self % v = self % beta2 * self % v + (1 - self % beta2) * g**2
+    end associate
+
+    ! Compute bias-corrected first and second moment estimates.
+    associate( &
+      m_hat => self % m / (1 - self % beta1**self % t), &
+      v_hat => self % v / (1 - self % beta2**self % t) &
+    )
+
+      ! Update parameters.
+      param = param &
+        - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) &
+        - self % learning_rate * self % weight_decay_decoupled * param
+
+    end associate
+
+  end subroutine minimize_adam_2d
+
+
   impure elemental subroutine init_adagrad(self, num_params)
     class(adagrad), intent(inout) :: self
     integer, intent(in) :: num_params
@@ -262,80 +336,6 @@ pure subroutine minimize_adagrad_1d(self, param, gradient)
   end subroutine minimize_adagrad_1d
 
 
-  pure subroutine minimize_sgd_2d(self, param, gradient)
-    !! Concrete implementation of a stochastic gradient descent optimizer
-    !! update rule for 2D arrays.
-    class(sgd), intent(inout) :: self
-    real, intent(inout) :: param(:,:)
-    real, intent(in) :: gradient(:,:)
-
-    if (self % momentum > 0) then
-      ! Apply momentum update
-      self % velocity = self % momentum * self % velocity &
-        - self % learning_rate * reshape(gradient, [size(gradient)])
-      if (self % nesterov) then
-        ! Apply Nesterov update
-        param = param + reshape(self % momentum * self % velocity &
-          - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param))
-      else
-        param = param + reshape(self % velocity, shape(param))
-      end if
-    else
-      ! Apply regular update
-      param = param - self % learning_rate * gradient
-    end if
-
-  end subroutine minimize_sgd_2d
-
-
-  pure subroutine minimize_rmsprop_2d(self, param, gradient)
-    !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays.
-    class(rmsprop), intent(inout) :: self
-    real, intent(inout) :: param(:,:)
-    real, intent(in) :: gradient(:,:)
-
-    ! Compute the RMS of the gradient using the RMSProp rule
-    self % rms_gradient = self % decay_rate * self % rms_gradient &
-      + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2
-
-    ! Update the network parameters based on the new RMS of the gradient
-    param = param - self % learning_rate &
-      / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient
-
-  end subroutine minimize_rmsprop_2d
-
-
-  pure subroutine minimize_adam_2d(self, param, gradient)
-    !! Concrete implementation of an Adam optimizer update rule for 2D arrays.
-    class(adam), intent(inout) :: self
-    real, intent(inout) :: param(:,:)
-    real, intent(in) :: gradient(:,:)
-
-    self % t = self % t + 1
-
-    ! If weight_decay_l2 > 0, use L2 regularization;
-    ! otherwise, default to regular Adam.
-    associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)]))
-      self % m = self % beta1 * self % m + (1 - self % beta1) * g
-      self % v = self % beta2 * self % v + (1 - self % beta2) * g**2
-    end associate
-
-    ! Compute bias-corrected first and second moment estimates.
-    associate( &
-      m_hat => self % m / (1 - self % beta1**self % t), &
-      v_hat => self % v / (1 - self % beta2**self % t) &
-    )
-
-      ! Update parameters.
-      param = param &
-        - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) &
-        - self % learning_rate * self % weight_decay_decoupled * param
-
-    end associate
-
-  end subroutine minimize_adam_2d
-
-
   pure subroutine minimize_adagrad_2d(self, param, gradient)
     !! Concrete implementation of an Adagrad optimizer update rule for 2D arrays.
     class(adagrad), intent(inout) :: self
@@ -363,4 +363,4 @@ pure subroutine minimize_adagrad_2d(self, param, gradient)
 
   end subroutine minimize_adagrad_2d
 
-end module nf_optimizers
+end module nf_optimizers
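
For reference, `minimize_adam_2d` appears to apply the same rule as the 1D Adam routine, just with `reshape` used to run the flattened gradient and parameters through the 1D moment buffers `self % m` and `self % v` and reshape the result back to `shape(param)`. A sketch of the update as written in the code, using my own notation (eta for `learning_rate`, lambda_2 for `weight_decay_l2`, lambda_d for `weight_decay_decoupled`):

\[
\begin{aligned}
g_t &= \nabla_\theta \mathcal{L}(\theta_{t-1}) + \lambda_2\,\theta_{t-1} \\
m_t &= \beta_1 m_{t-1} + (1-\beta_1)\,g_t \\
v_t &= \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2 \\
\hat{m}_t &= m_t / (1-\beta_1^t), \qquad \hat{v}_t = v_t / (1-\beta_2^t) \\
\theta_t &= \theta_{t-1} - \eta\,\frac{\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon} - \eta\,\lambda_d\,\theta_{t-1}
\end{aligned}
\]

The lambda_2 term is plain L2 regularization folded into the gradient before the moment updates, while the final lambda_d term is the decoupled (AdamW-style) weight decay applied directly to the parameters.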