const std = @import("std");
const Matrix = @import("matrix.zig").Matrix;
const Activation = @import("activation.zig").Activation;
+ const Network = @import("network.zig").Network;
const testing = std.testing;

/// Neural network layer implementation supporting both standard and gated architectures
@@ -601,7 +602,10 @@ pub const GatedLayer = struct {
    }
};

- // Tests
+ // Tests basic layer initialization and dimension verification
+ // Verifies that a layer correctly maintains its input and output dimensions
+ // Mathematical representation:
+ // Layer: ℝ^n → ℝ^m where n = input_size and m = output_size
test "layer initialization" {
    const allocator = testing.allocator;

@@ -614,6 +618,13 @@ test "layer initialization" {
    try testing.expectEqual(@as(usize, 2), layer.getOutputSize());
}

+ // Tests forward propagation through a layer with known weights and biases
+ // Verifies that the layer correctly computes y = σ(W·x + b)
+ // Where:
+ // - W is the weight matrix
+ // - x is the input vector
+ // - b is the bias vector
+ // - σ is the sigmoid activation function
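+ // The final assertion of this test expects 0.7310585786300049 = σ(1) = 1/(1 + e⁻¹),
+ // i.e. the weights, bias, and input are chosen so that the pre-activation W·x + b equals 1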
test "layer forward propagation" {
    const allocator = testing.allocator;

@@ -642,6 +653,12 @@ test "layer forward propagation" {
    try testing.expectApproxEqAbs(@as(f64, 0.7310585786300049), output.get(0, 0), 0.0001);
}

+ // Tests initialization of gated layers (both GLU and SwiGLU variants)
+ // Verifies correct dimensions for both linear and gating components
+ // Mathematical representation:
+ // GLU: output = (W₁·x + b₁) ⊗ sigmoid(W₂·x + b₂)
+ // SwiGLU: output = (W₁·x + b₁) ⊗ swish(W₂·x + b₂)
+ // Where ⊗ represents element-wise multiplication
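+ // and swish(z) = z·sigmoid(z)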
test "gated layer initialization" {
    const allocator = testing.allocator;

@@ -664,6 +681,12 @@ test "gated layer initialization" {
    try testing.expectEqual(@as(usize, 3), swiglu_layer.getOutputSize());
}

+ // Tests forward propagation through a GLU layer with known weights and biases
+ // Verifies the GLU computation: output = (W₁·x + b₁) ⊗ sigmoid(W₂·x + b₂)
+ // Where:
+ // - W₁, b₁ are linear transformation parameters
+ // - W₂, b₂ are gating parameters
+ // - ⊗ represents element-wise multiplication
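+ // Unlike a plain sigmoid layer, the GLU output is not confined to (0, 1), since the
+ // linear path is unbounded; the final assertion of this test checks that the gated
+ // product evaluates to 1.0 for the particular weights and input it sets up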
test "GLU layer forward propagation" {
    const allocator = testing.allocator;

@@ -697,6 +720,19 @@ test "GLU layer forward propagation" {
    try testing.expectApproxEqAbs(@as(f64, 1.0), output.get(0, 0), 0.0001);
}

+ // Tests backpropagation through a standard layer
+ // Verifies gradient computation and weight updates
+ // Mathematical process:
+ // 1. Forward: y = σ(W·x + b)
+ // 2. Backward:
+ //    - δz = δy ⊗ σ'(z)
+ //    - δW = xᵀ·δz
+ //    - δb = sum(δz, axis=0)
+ //    - δx = δz·Wᵀ
+ // Where:
+ // - δy is the output gradient
+ // - z is the pre-activation (W·x + b)
+ // - σ' is the activation derivative
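+ // Shape check: with x ∈ ℝ^{1×n} and δz ∈ ℝ^{1×m}, δW = xᵀ·δz is n×m (matching W)
+ // and δx = δz·Wᵀ is 1×n, which is what the input_gradient dimension checks below verify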
test "layer backpropagation" {
    const allocator = testing.allocator;

@@ -735,6 +771,17 @@ test "layer backpropagation" {
    try testing.expectEqual(@as(usize, 2), input_gradient.cols);
}

+ // Tests backpropagation through a gated layer
+ // Verifies gradient computation for both linear and gating components
+ // Mathematical process for GLU:
+ // 1. Forward: output = (W₁·x + b₁) ⊗ sigmoid(W₂·x + b₂)
+ // 2. Backward:
+ //    - δlinear = δoutput ⊗ sigmoid(W₂·x + b₂)
+ //    - δgate = δoutput ⊗ (W₁·x + b₁) ⊗ sigmoid'(W₂·x + b₂)
+ //    - δW₁ = xᵀ·δlinear
+ //    - δW₂ = xᵀ·δgate
+ //    - δb₁ = sum(δlinear, axis=0)
+ //    - δb₂ = sum(δgate, axis=0)
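+ //    - δx = δlinear·W₁ᵀ + δgate·W₂ᵀ (the input gradient whose shape is checked below)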
test "gated layer backpropagation" {
    const allocator = testing.allocator;

@@ -777,3 +824,263 @@ test "gated layer backpropagation" {
    try testing.expectEqual(@as(usize, 1), input_gradient.rows);
    try testing.expectEqual(@as(usize, 2), input_gradient.cols);
}
+
+ // Tests weight initialization statistics for different activation functions
+ // Verifies that the initialization schemes produce the expected statistical properties
+ // Mathematical expectations:
+ // 1. ReLU (He initialization):
+ //    - Mean ≈ 0
+ //    - Variance ≈ 2/n where n is input size
+ // 2. tanh (Xavier/Glorot initialization):
+ //    - Mean ≈ 0
+ //    - Variance ≈ 1/n
+ // 3. softmax (scaled Xavier/Glorot):
+ //    - Mean ≈ 0
+ //    - Variance ≈ 0.1/n
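+ // The empirical estimates below use mean = E[w] and variance = E[w²] − E[w]² over all
+ // 10⁶ weights; the absolute tolerance of 0.1 is loose relative to the expected variances
+ // (at most 0.002 here), so the variance checks mainly guard against gross scaling errors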
+ test "weight initialization statistics" {
+     const allocator = testing.allocator;
+     const input_size: usize = 1000;
+     const output_size: usize = 1000;
+
+     // Test ReLU initialization
+     var relu_layer = try Layer.init(allocator, input_size, output_size, Activation.relu, Activation.relu_derivative);
+     defer relu_layer.deinit();
+
+     // Calculate mean and variance of weights
+     var relu_mean: f64 = 0.0;
+     var relu_variance: f64 = 0.0;
+     for (0..input_size) |i| {
+         for (0..output_size) |j| {
+             const weight = relu_layer.weights.get(i, j);
+             relu_mean += weight;
+             relu_variance += weight * weight;
+         }
+     }
+     relu_mean /= @as(f64, @floatFromInt(input_size * output_size));
+     relu_variance = relu_variance / @as(f64, @floatFromInt(input_size * output_size)) - relu_mean * relu_mean;
+
+     // For ReLU, expected mean should be close to 0 and variance close to 2/input_size
+     const expected_relu_variance = 2.0 / @as(f64, @floatFromInt(input_size));
+     try testing.expectApproxEqAbs(@as(f64, 0.0), relu_mean, 0.1);
+     try testing.expectApproxEqAbs(expected_relu_variance, relu_variance, 0.1);
+
+     // Test tanh initialization
+     var tanh_layer = try Layer.init(allocator, input_size, output_size, Activation.tanh, Activation.tanh_derivative);
+     defer tanh_layer.deinit();
+
+     var tanh_mean: f64 = 0.0;
+     var tanh_variance: f64 = 0.0;
+     for (0..input_size) |i| {
+         for (0..output_size) |j| {
+             const weight = tanh_layer.weights.get(i, j);
+             tanh_mean += weight;
+             tanh_variance += weight * weight;
+         }
+     }
+     tanh_mean /= @as(f64, @floatFromInt(input_size * output_size));
+     tanh_variance = tanh_variance / @as(f64, @floatFromInt(input_size * output_size)) - tanh_mean * tanh_mean;
+
+     // For tanh, expected mean should be close to 0 and variance close to 1/input_size
+     const expected_tanh_variance = 1.0 / @as(f64, @floatFromInt(input_size));
+     try testing.expectApproxEqAbs(@as(f64, 0.0), tanh_mean, 0.1);
+     try testing.expectApproxEqAbs(expected_tanh_variance, tanh_variance, 0.1);
+
+     // Test softmax initialization
+     var softmax_layer = try Layer.init(allocator, input_size, output_size, Activation.softmax, Activation.softmax_derivative);
+     defer softmax_layer.deinit();
+
+     var softmax_mean: f64 = 0.0;
+     var softmax_variance: f64 = 0.0;
+     for (0..input_size) |i| {
+         for (0..output_size) |j| {
+             const weight = softmax_layer.weights.get(i, j);
+             softmax_mean += weight;
+             softmax_variance += weight * weight;
+         }
+     }
+     softmax_mean /= @as(f64, @floatFromInt(input_size * output_size));
+     softmax_variance = softmax_variance / @as(f64, @floatFromInt(input_size * output_size)) - softmax_mean * softmax_mean;
+
+     // For softmax, expected mean should be close to 0 and variance close to 0.1/input_size
+     const expected_softmax_variance = 0.1 / @as(f64, @floatFromInt(input_size));
+     try testing.expectApproxEqAbs(@as(f64, 0.0), softmax_mean, 0.1);
+     try testing.expectApproxEqAbs(expected_softmax_variance, softmax_variance, 0.1);
+ }
+
+ // Tests the impact of different weight initialization schemes on training performance
+ // Compares three initialization methods:
+ // 1. Xavier/Glorot: W ~ N(0, 1/n) for tanh/sigmoid
+ // 2. He: W ~ N(0, 2/n) for ReLU
+ // 3. LeCun: W ~ N(0, 1/n) for tanh/sigmoid
+ // Uses a simple XOR-like task to evaluate convergence speed and final loss
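+ // The XOR target (1 when x1 ≠ x2, else 0) is not linearly separable, so the hidden
+ // layer is required; this makes the task a reasonable probe of how the initial weight
+ // scale affects convergence over a small number of epochs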
+ test "weight initialization impact on training" {
+     const allocator = testing.allocator;
+     const input_size: usize = 2;
+     const hidden_size: usize = 4;
+     const output_size: usize = 1;
+     const num_samples: usize = 1000;
+
+     // Create training data for a simple task (XOR-like)
+     var inputs = try Matrix.init(allocator, num_samples, input_size);
+     defer inputs.deinit();
+     var targets = try Matrix.init(allocator, num_samples, output_size);
+     defer targets.deinit();
+
+     var prng = std.Random.DefaultPrng.init(42);
+     const rand = prng.random();
+
+     // Generate training data
+     for (0..num_samples) |i| {
+         const x1: f64 = if (rand.boolean()) 1.0 else 0.0;
+         const x2: f64 = if (rand.boolean()) 1.0 else 0.0;
+         inputs.set(i, 0, x1);
+         inputs.set(i, 1, x2);
+         targets.set(i, 0, if (x1 != x2) 1.0 else 0.0);
+     }
+
+     // Test different initialization schemes
+     var best_loss: f64 = std.math.inf(f64);
+     var best_initialization: []const u8 = undefined;
+
+     // Test 1: Standard Xavier/Glorot initialization
+     {
+         var network = Network.init(allocator, 0.1, .MeanSquaredError);
+         defer network.deinit();
+
+         try network.addLayer(input_size, hidden_size, Activation.tanh, Activation.tanh_derivative);
+         try network.addLayer(hidden_size, output_size, Activation.sigmoid, Activation.sigmoid_derivative);
+
+         const loss_history = try network.train(inputs, targets, 10, 32);
+         defer allocator.free(loss_history);
+
+         const final_loss = loss_history[loss_history.len - 1];
+         if (final_loss < best_loss) {
+             best_loss = final_loss;
+             best_initialization = "Xavier/Glorot";
+         }
+     }
+
+     // Test 2: He initialization (scaled for ReLU)
+     {
+         var network = Network.init(allocator, 0.1, .MeanSquaredError);
+         defer network.deinit();
+
+         try network.addLayer(input_size, hidden_size, Activation.relu, Activation.relu_derivative);
+         try network.addLayer(hidden_size, output_size, Activation.sigmoid, Activation.sigmoid_derivative);
+
+         const loss_history = try network.train(inputs, targets, 10, 32);
+         defer allocator.free(loss_history);
+
+         const final_loss = loss_history[loss_history.len - 1];
+         if (final_loss < best_loss) {
+             best_loss = final_loss;
+             best_initialization = "He";
+         }
+     }
+
+     // Test 3: LeCun initialization
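+     // Note: the initialization scheme is selected from the layer's activation (see the
+     // statistics test above), and LeCun's 1/n variance coincides with the Xavier/tanh
+     // scaling, so this block exercises the same configuration as Test 1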
+     {
+         var network = Network.init(allocator, 0.1, .MeanSquaredError);
+         defer network.deinit();
+
+         try network.addLayer(input_size, hidden_size, Activation.tanh, Activation.tanh_derivative);
+         try network.addLayer(hidden_size, output_size, Activation.sigmoid, Activation.sigmoid_derivative);
+
+         const loss_history = try network.train(inputs, targets, 10, 32);
+         defer allocator.free(loss_history);
+
+         const final_loss = loss_history[loss_history.len - 1];
+         if (final_loss < best_loss) {
+             best_loss = final_loss;
+             best_initialization = "LeCun";
+         }
+     }
+
+     // Verify that we got reasonable results
+     try testing.expect(best_loss < 0.5);
+ }
+
+ // Tests weight initialization statistics for gated layers (GLU and SwiGLU)
+ // Verifies that both linear and gating components follow the expected statistical properties
+ // Mathematical expectations:
+ // For both GLU and SwiGLU:
+ // - Mean ≈ 0
+ // - Variance ≈ 2/(n + m) where n is input size and m is output size
+ // This initialization scheme is designed to maintain variance across the gated layer
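+ // With n = m = 1000 the expected variance is 2/2000 = 0.001; this is the standard
+ // Glorot/Xavier scaling applied independently to the linear and gate weight matrices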
+ test "gated layer weight initialization statistics" {
+     const allocator = testing.allocator;
+     const input_size: usize = 1000;
+     const output_size: usize = 1000;
+
+     // Test GLU initialization
+     var glu_layer = try GatedLayer.init(allocator, input_size, output_size, false);
+     defer glu_layer.deinit();
+
+     // Calculate mean and variance of linear weights
+     var linear_mean: f64 = 0.0;
+     var linear_variance: f64 = 0.0;
+     for (0..input_size) |i| {
+         for (0..output_size) |j| {
+             const weight = glu_layer.linear_weights.get(i, j);
+             linear_mean += weight;
+             linear_variance += weight * weight;
+         }
+     }
+     linear_mean /= @as(f64, @floatFromInt(input_size * output_size));
+     linear_variance = linear_variance / @as(f64, @floatFromInt(input_size * output_size)) - linear_mean * linear_mean;
+
+     // Calculate mean and variance of gate weights
+     var gate_mean: f64 = 0.0;
+     var gate_variance: f64 = 0.0;
+     for (0..input_size) |i| {
+         for (0..output_size) |j| {
+             const weight = glu_layer.gate_weights.get(i, j);
+             gate_mean += weight;
+             gate_variance += weight * weight;
+         }
+     }
+     gate_mean /= @as(f64, @floatFromInt(input_size * output_size));
+     gate_variance = gate_variance / @as(f64, @floatFromInt(input_size * output_size)) - gate_mean * gate_mean;
+
+     // For GLU, expected mean should be close to 0 and variance close to 2/(input_size + output_size)
+     const expected_variance = 2.0 / @as(f64, @floatFromInt(input_size + output_size));
+     try testing.expectApproxEqAbs(@as(f64, 0.0), linear_mean, 0.1);
+     try testing.expectApproxEqAbs(@as(f64, 0.0), gate_mean, 0.1);
+     try testing.expectApproxEqAbs(expected_variance, linear_variance, 0.1);
+     try testing.expectApproxEqAbs(expected_variance, gate_variance, 0.1);
+
+     // Test SwiGLU initialization
+     var swiglu_layer = try GatedLayer.init(allocator, input_size, output_size, true);
+     defer swiglu_layer.deinit();
+
+     // Calculate statistics for SwiGLU weights
+     var swiglu_linear_mean: f64 = 0.0;
+     var swiglu_linear_variance: f64 = 0.0;
+     for (0..input_size) |i| {
+         for (0..output_size) |j| {
+             const weight = swiglu_layer.linear_weights.get(i, j);
+             swiglu_linear_mean += weight;
+             swiglu_linear_variance += weight * weight;
+         }
+     }
+     swiglu_linear_mean /= @as(f64, @floatFromInt(input_size * output_size));
+     swiglu_linear_variance = swiglu_linear_variance / @as(f64, @floatFromInt(input_size * output_size)) - swiglu_linear_mean * swiglu_linear_mean;
+
+     var swiglu_gate_mean: f64 = 0.0;
+     var swiglu_gate_variance: f64 = 0.0;
+     for (0..input_size) |i| {
+         for (0..output_size) |j| {
+             const weight = swiglu_layer.gate_weights.get(i, j);
+             swiglu_gate_mean += weight;
+             swiglu_gate_variance += weight * weight;
+         }
+     }
+     swiglu_gate_mean /= @as(f64, @floatFromInt(input_size * output_size));
+     swiglu_gate_variance = swiglu_gate_variance / @as(f64, @floatFromInt(input_size * output_size)) - swiglu_gate_mean * swiglu_gate_mean;
+
+     // For SwiGLU, expected mean should be close to 0 and variance close to 2/(input_size + output_size)
+     try testing.expectApproxEqAbs(@as(f64, 0.0), swiglu_linear_mean, 0.1);
+     try testing.expectApproxEqAbs(@as(f64, 0.0), swiglu_gate_mean, 0.1);
+     try testing.expectApproxEqAbs(expected_variance, swiglu_linear_variance, 0.1);
+     try testing.expectApproxEqAbs(expected_variance, swiglu_gate_variance, 0.1);
+ }