Commit 273852d

test: add comprehensive layer tests
Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>
1 parent 23c4c8e commit 273852d

File tree: 1 file changed (+308 -1)

src/layer.zig

Lines changed: 308 additions & 1 deletion
@@ -1,6 +1,7 @@
 const std = @import("std");
 const Matrix = @import("matrix.zig").Matrix;
 const Activation = @import("activation.zig").Activation;
+const Network = @import("network.zig").Network;
 const testing = std.testing;
 
 /// Neural network layer implementation supporting both standard and gated architectures
@@ -601,7 +602,10 @@ pub const GatedLayer = struct {
     }
 };
 
-// Tests
+// Tests basic layer initialization and dimension verification
+// Verifies that a layer correctly maintains its input and output dimensions
+// Mathematical representation:
+// Layer: ℝ^n → ℝ^m where n = input_size and m = output_size
 test "layer initialization" {
     const allocator = testing.allocator;
 
@@ -614,6 +618,13 @@ test "layer initialization" {
     try testing.expectEqual(@as(usize, 2), layer.getOutputSize());
 }
 
+// Tests forward propagation through a layer with known weights and biases
+// Verifies that the layer correctly computes y = σ(W·x + b)
+// Where:
+// - W is the weight matrix
+// - x is the input vector
+// - b is the bias vector
+// - σ is the sigmoid activation function
 test "layer forward propagation" {
     const allocator = testing.allocator;
 
@@ -642,6 +653,12 @@ test "layer forward propagation" {
     try testing.expectApproxEqAbs(@as(f64, 0.7310585786300049), output.get(0, 0), 0.0001);
 }
 
+// Tests initialization of gated layers (both GLU and SwiGLU variants)
+// Verifies correct dimensions for both linear and gating components
+// Mathematical representation:
+// GLU: output = (W₁·x + b₁) ⊗ sigmoid(W₂·x + b₂)
+// SwiGLU: output = (W₁·x + b₁) ⊗ swish(W₂·x + b₂)
+// Where ⊗ represents element-wise multiplication
 test "gated layer initialization" {
     const allocator = testing.allocator;
 
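For reference on the expected value asserted in the hunk above: 0.7310585786300049 is σ(1.0), so the test's known weights, bias, and input presumably combine to a pre-activation of 1.0. A minimal standalone check of that constant (not part of this commit, standard library only):

const std = @import("std");

test "sigma(1.0) reference value" {
    // sigma(z) = 1 / (1 + e^(-z)); for z = 1.0 this is ~0.7310585786300049
    const z: f64 = 1.0;
    const sigma = 1.0 / (1.0 + std.math.exp(-z));
    try std.testing.expectApproxEqAbs(@as(f64, 0.7310585786300049), sigma, 1e-12);
}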

@@ -664,6 +681,12 @@ test "gated layer initialization" {
     try testing.expectEqual(@as(usize, 3), swiglu_layer.getOutputSize());
 }
 
+// Tests forward propagation through a GLU layer with known weights and biases
+// Verifies the GLU computation: output = (W₁·x + b₁) ⊗ sigmoid(W₂·x + b₂)
+// Where:
+// - W₁, b₁ are linear transformation parameters
+// - W₂, b₂ are gating parameters
+// - ⊗ represents element-wise multiplication
 test "GLU layer forward propagation" {
     const allocator = testing.allocator;
 
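A quick illustration of the difference between the two gate functions named above, not part of this commit: the sigmoid gate is bounded in (0, 1), while swish(z) = z·σ(z) is unbounded above and can exceed 1 for positive z. The values below are hypothetical.

const std = @import("std");

test "sigmoid vs swish gate values" {
    // Hypothetical gate pre-activation; sigma(2) ~ 0.8808, swish(2) = 2 * sigma(2) ~ 1.7616
    const z: f64 = 2.0;
    const sig = 1.0 / (1.0 + std.math.exp(-z));
    const swish = z * sig;
    try std.testing.expect(sig > 0.0 and sig < 1.0);
    try std.testing.expect(swish > 1.0);
}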

@@ -697,6 +720,19 @@ test "GLU layer forward propagation" {
     try testing.expectApproxEqAbs(@as(f64, 1.0), output.get(0, 0), 0.0001);
 }
 
+// Tests backpropagation through a standard layer
+// Verifies gradient computation and weight updates
+// Mathematical process:
+// 1. Forward: y = σ(W·x + b)
+// 2. Backward:
+// - δz = δy ⊗ σ'(z)
+// - δW = xᵀ·δz
+// - δb = sum(δz, axis=0)
+// - δx = δz·Wᵀ
+// Where:
+// - δy is the output gradient
+// - z is the pre-activation (W·x + b)
+// - σ' is the activation derivative
 test "layer backpropagation" {
     const allocator = testing.allocator;
 
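The backward equations listed in the comment block above reduce to the ordinary chain rule in the scalar case. A minimal sketch with hypothetical values (not part of this commit), including a finite-difference check of δw:

const std = @import("std");

test "scalar backprop sketch" {
    // Forward: z = w*x + b, y = sigma(z); all values hypothetical
    const x: f64 = 2.0;
    const w: f64 = 0.5;
    const b: f64 = 0.1;
    const z = w * x + b;
    const y = 1.0 / (1.0 + std.math.exp(-z));

    // Backward, with upstream gradient dL/dy = 1:
    const dy: f64 = 1.0;
    const dz = dy * y * (1.0 - y); // sigma'(z) = sigma(z) * (1 - sigma(z))
    const dw = x * dz;             // dW = x^T * dz
    const db = dz;                 // db = sum(dz)
    const dx = dz * w;             // dx = dz * W^T

    // Finite-difference check on dw
    const eps: f64 = 1e-6;
    const z_eps = (w + eps) * x + b;
    const y_eps = 1.0 / (1.0 + std.math.exp(-z_eps));
    const numeric_dw = (y_eps - y) / eps;
    try std.testing.expectApproxEqAbs(dw, numeric_dw, 1e-4);
    try std.testing.expect(db > 0.0 and dx > 0.0);
}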

@@ -735,6 +771,17 @@ test "layer backpropagation" {
     try testing.expectEqual(@as(usize, 2), input_gradient.cols);
 }
 
+// Tests backpropagation through a gated layer
+// Verifies gradient computation for both linear and gating components
+// Mathematical process for GLU:
+// 1. Forward: output = (W₁·x + b₁) ⊗ sigmoid(W₂·x + b₂)
+// 2. Backward:
+// - δlinear = δoutput ⊗ sigmoid(W₂·x + b₂)
+// - δgate = δoutput ⊗ (W₁·x + b₁) ⊗ sigmoid'(W₂·x + b₂)
+// - δW₁ = xᵀ·δlinear
+// - δW₂ = xᵀ·δgate
+// - δb₁ = sum(δlinear, axis=0)
+// - δb₂ = sum(δgate, axis=0)
 test "gated layer backpropagation" {
     const allocator = testing.allocator;
 
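The δlinear/δgate split in the comment block above is the product rule applied to output = a ⊗ σ(g), with a = W₁·x + b₁ and g = W₂·x + b₂. A scalar sketch with hypothetical values (not part of this commit):

const std = @import("std");

test "scalar GLU gradient sketch" {
    // output = a * sigma(g); hypothetical pre-activations
    const a: f64 = 2.0;
    const g: f64 = 0.5;
    const sig = 1.0 / (1.0 + std.math.exp(-g));
    const d_output: f64 = 1.0; // upstream gradient

    // Product rule:
    const d_linear = d_output * sig;                 // d(output)/da
    const d_gate = d_output * a * sig * (1.0 - sig); // d(output)/dg, sigma' = sigma * (1 - sigma)

    // Finite-difference check on the gate path
    const eps: f64 = 1e-6;
    const sig_eps = 1.0 / (1.0 + std.math.exp(-(g + eps)));
    const numeric_d_gate = (a * sig_eps - a * sig) / eps;
    try std.testing.expectApproxEqAbs(d_gate, numeric_d_gate, 1e-4);
    try std.testing.expect(d_linear > 0.0);
}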

@@ -777,3 +824,263 @@ test "gated layer backpropagation" {
     try testing.expectEqual(@as(usize, 1), input_gradient.rows);
     try testing.expectEqual(@as(usize, 2), input_gradient.cols);
 }
+
+// Tests weight initialization statistics for different activation functions
+// Verifies that the initialization schemes produce the expected statistical properties
+// Mathematical expectations:
+// 1. ReLU (He initialization):
+// - Mean ≈ 0
+// - Variance ≈ 2/n where n is input size
+// 2. tanh (Xavier/Glorot initialization):
+// - Mean ≈ 0
+// - Variance ≈ 1/n
+// 3. softmax (scaled Xavier/Glorot):
+// - Mean ≈ 0
+// - Variance ≈ 0.1/n
+test "weight initialization statistics" {
+    const allocator = testing.allocator;
+    const input_size: usize = 1000;
+    const output_size: usize = 1000;
+
+    // Test ReLU initialization
+    var relu_layer = try Layer.init(allocator, input_size, output_size, Activation.relu, Activation.relu_derivative);
+    defer relu_layer.deinit();
+
+    // Calculate mean and variance of weights
+    var relu_mean: f64 = 0.0;
+    var relu_variance: f64 = 0.0;
+    for (0..input_size) |i| {
+        for (0..output_size) |j| {
+            const weight = relu_layer.weights.get(i, j);
+            relu_mean += weight;
+            relu_variance += weight * weight;
+        }
+    }
+    relu_mean /= @as(f64, @floatFromInt(input_size * output_size));
+    relu_variance = relu_variance / @as(f64, @floatFromInt(input_size * output_size)) - relu_mean * relu_mean;
+
+    // For ReLU, expected mean should be close to 0 and variance close to 2/input_size
+    const expected_relu_variance = 2.0 / @as(f64, @floatFromInt(input_size));
+    try testing.expectApproxEqAbs(@as(f64, 0.0), relu_mean, 0.1);
+    try testing.expectApproxEqAbs(expected_relu_variance, relu_variance, 0.1);
+
+    // Test tanh initialization
+    var tanh_layer = try Layer.init(allocator, input_size, output_size, Activation.tanh, Activation.tanh_derivative);
+    defer tanh_layer.deinit();
+
+    var tanh_mean: f64 = 0.0;
+    var tanh_variance: f64 = 0.0;
+    for (0..input_size) |i| {
+        for (0..output_size) |j| {
+            const weight = tanh_layer.weights.get(i, j);
+            tanh_mean += weight;
+            tanh_variance += weight * weight;
+        }
+    }
+    tanh_mean /= @as(f64, @floatFromInt(input_size * output_size));
+    tanh_variance = tanh_variance / @as(f64, @floatFromInt(input_size * output_size)) - tanh_mean * tanh_mean;
+
+    // For tanh, expected mean should be close to 0 and variance close to 1/input_size
+    const expected_tanh_variance = 1.0 / @as(f64, @floatFromInt(input_size));
+    try testing.expectApproxEqAbs(@as(f64, 0.0), tanh_mean, 0.1);
+    try testing.expectApproxEqAbs(expected_tanh_variance, tanh_variance, 0.1);
+
+    // Test softmax initialization
+    var softmax_layer = try Layer.init(allocator, input_size, output_size, Activation.softmax, Activation.softmax_derivative);
+    defer softmax_layer.deinit();
+
+    var softmax_mean: f64 = 0.0;
+    var softmax_variance: f64 = 0.0;
+    for (0..input_size) |i| {
+        for (0..output_size) |j| {
+            const weight = softmax_layer.weights.get(i, j);
+            softmax_mean += weight;
+            softmax_variance += weight * weight;
+        }
+    }
+    softmax_mean /= @as(f64, @floatFromInt(input_size * output_size));
+    softmax_variance = softmax_variance / @as(f64, @floatFromInt(input_size * output_size)) - softmax_mean * softmax_mean;
+
+    // For softmax, expected mean should be close to 0 and variance close to 0.1/input_size
+    const expected_softmax_variance = 0.1 / @as(f64, @floatFromInt(input_size));
+    try testing.expectApproxEqAbs(@as(f64, 0.0), softmax_mean, 0.1);
+    try testing.expectApproxEqAbs(expected_softmax_variance, softmax_variance, 0.1);
+}
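The statistics computed in the test above amount to: draw input_size·output_size weights, then check that the sample mean is ≈ 0 and the sample variance matches the scheme's target (2/n for He, 1/n for Xavier, 0.1/n for the scaled variant; 0.002, 0.001, and 0.0001 respectively for n = 1000). A standalone sketch of the same bookkeeping against std.Random directly (not part of this commit; it assumes Gaussian He weights, which is an assumption about the Layer implementation rather than something shown in this diff):

const std = @import("std");

test "He initialization sample statistics sketch" {
    const n: usize = 1000;
    const m: usize = 1000;
    var prng = std.Random.DefaultPrng.init(0);
    const rand = prng.random();

    // Draw n*m samples from N(0, 2/n), mirroring the mean/variance loop above
    const std_dev = std.math.sqrt(2.0 / @as(f64, @floatFromInt(n)));
    var mean: f64 = 0.0;
    var sq_sum: f64 = 0.0;
    for (0..n * m) |_| {
        const w = rand.floatNorm(f64) * std_dev;
        mean += w;
        sq_sum += w * w;
    }
    const count = @as(f64, @floatFromInt(n * m));
    mean /= count;
    const variance = sq_sum / count - mean * mean;

    try std.testing.expectApproxEqAbs(@as(f64, 0.0), mean, 0.01);
    try std.testing.expectApproxEqAbs(2.0 / @as(f64, @floatFromInt(n)), variance, 0.001);
}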
+
+// Tests the impact of different weight initialization schemes on training performance
+// Compares three initialization methods:
+// 1. Xavier/Glorot: W ~ N(0, 1/n) for tanh/sigmoid
+// 2. He: W ~ N(0, 2/n) for ReLU
+// 3. LeCun: W ~ N(0, 1/n) for tanh/sigmoid
+// Uses a simple XOR-like task to evaluate convergence speed and final loss
+test "weight initialization impact on training" {
+    const allocator = testing.allocator;
+    const input_size: usize = 2;
+    const hidden_size: usize = 4;
+    const output_size: usize = 1;
+    const num_samples: usize = 1000;
+
+    // Create training data for a simple task (XOR-like)
+    var inputs = try Matrix.init(allocator, num_samples, input_size);
+    defer inputs.deinit();
+    var targets = try Matrix.init(allocator, num_samples, output_size);
+    defer targets.deinit();
+
+    var prng = std.Random.DefaultPrng.init(42);
+    const rand = prng.random();
+
+    // Generate training data
+    for (0..num_samples) |i| {
+        const x1: f64 = if (rand.boolean()) 1.0 else 0.0;
+        const x2: f64 = if (rand.boolean()) 1.0 else 0.0;
+        inputs.set(i, 0, x1);
+        inputs.set(i, 1, x2);
+        targets.set(i, 0, if (x1 != x2) 1.0 else 0.0);
+    }
+
+    // Test different initialization schemes
+    var best_loss: f64 = std.math.inf(f64);
+    var best_initialization: []const u8 = undefined;
+
+    // Test 1: Standard Xavier/Glorot initialization
+    {
+        var network = Network.init(allocator, 0.1, .MeanSquaredError);
+        defer network.deinit();
+
+        try network.addLayer(input_size, hidden_size, Activation.tanh, Activation.tanh_derivative);
+        try network.addLayer(hidden_size, output_size, Activation.sigmoid, Activation.sigmoid_derivative);
+
+        const loss_history = try network.train(inputs, targets, 10, 32);
+        defer allocator.free(loss_history);
+
+        const final_loss = loss_history[loss_history.len - 1];
+        if (final_loss < best_loss) {
+            best_loss = final_loss;
+            best_initialization = "Xavier/Glorot";
+        }
+    }
+
+    // Test 2: He initialization (scaled for ReLU)
+    {
+        var network = Network.init(allocator, 0.1, .MeanSquaredError);
+        defer network.deinit();
+
+        try network.addLayer(input_size, hidden_size, Activation.relu, Activation.relu_derivative);
+        try network.addLayer(hidden_size, output_size, Activation.sigmoid, Activation.sigmoid_derivative);
+
+        const loss_history = try network.train(inputs, targets, 10, 32);
+        defer allocator.free(loss_history);
+
+        const final_loss = loss_history[loss_history.len - 1];
+        if (final_loss < best_loss) {
+            best_loss = final_loss;
+            best_initialization = "He";
+        }
+    }
+
+    // Test 3: LeCun initialization
+    {
+        var network = Network.init(allocator, 0.1, .MeanSquaredError);
+        defer network.deinit();
+
+        try network.addLayer(input_size, hidden_size, Activation.tanh, Activation.tanh_derivative);
+        try network.addLayer(hidden_size, output_size, Activation.sigmoid, Activation.sigmoid_derivative);
+
+        const loss_history = try network.train(inputs, targets, 10, 32);
+        defer allocator.free(loss_history);
+
+        const final_loss = loss_history[loss_history.len - 1];
+        if (final_loss < best_loss) {
+            best_loss = final_loss;
+            best_initialization = "LeCun";
+        }
+    }
+
+    // Verify that we got reasonable results
+    try testing.expect(best_loss < 0.5);
+}
+
+// Tests weight initialization statistics for gated layers (GLU and SwiGLU)
+// Verifies that both linear and gating components follow the expected statistical properties
+// Mathematical expectations:
+// For both GLU and SwiGLU:
+// - Mean ≈ 0
+// - Variance ≈ 2/(n + m) where n is input size and m is output size
+// This initialization scheme is designed to maintain variance across the gated layer
+test "gated layer weight initialization statistics" {
+    const allocator = testing.allocator;
+    const input_size: usize = 1000;
+    const output_size: usize = 1000;
+
+    // Test GLU initialization
+    var glu_layer = try GatedLayer.init(allocator, input_size, output_size, false);
+    defer glu_layer.deinit();
+
+    // Calculate mean and variance of linear weights
+    var linear_mean: f64 = 0.0;
+    var linear_variance: f64 = 0.0;
+    for (0..input_size) |i| {
+        for (0..output_size) |j| {
+            const weight = glu_layer.linear_weights.get(i, j);
+            linear_mean += weight;
+            linear_variance += weight * weight;
+        }
+    }
+    linear_mean /= @as(f64, @floatFromInt(input_size * output_size));
+    linear_variance = linear_variance / @as(f64, @floatFromInt(input_size * output_size)) - linear_mean * linear_mean;
+
+    // Calculate mean and variance of gate weights
+    var gate_mean: f64 = 0.0;
+    var gate_variance: f64 = 0.0;
+    for (0..input_size) |i| {
+        for (0..output_size) |j| {
+            const weight = glu_layer.gate_weights.get(i, j);
+            gate_mean += weight;
+            gate_variance += weight * weight;
+        }
+    }
+    gate_mean /= @as(f64, @floatFromInt(input_size * output_size));
+    gate_variance = gate_variance / @as(f64, @floatFromInt(input_size * output_size)) - gate_mean * gate_mean;
+
+    // For GLU, expected mean should be close to 0 and variance close to 2/(input_size + output_size)
+    const expected_variance = 2.0 / @as(f64, @floatFromInt(input_size + output_size));
+    try testing.expectApproxEqAbs(@as(f64, 0.0), linear_mean, 0.1);
+    try testing.expectApproxEqAbs(@as(f64, 0.0), gate_mean, 0.1);
+    try testing.expectApproxEqAbs(expected_variance, linear_variance, 0.1);
+    try testing.expectApproxEqAbs(expected_variance, gate_variance, 0.1);
+
+    // Test SwiGLU initialization
+    var swiglu_layer = try GatedLayer.init(allocator, input_size, output_size, true);
+    defer swiglu_layer.deinit();
+
+    // Calculate statistics for SwiGLU weights
+    var swiglu_linear_mean: f64 = 0.0;
+    var swiglu_linear_variance: f64 = 0.0;
+    for (0..input_size) |i| {
+        for (0..output_size) |j| {
+            const weight = swiglu_layer.linear_weights.get(i, j);
+            swiglu_linear_mean += weight;
+            swiglu_linear_variance += weight * weight;
+        }
+    }
+    swiglu_linear_mean /= @as(f64, @floatFromInt(input_size * output_size));
+    swiglu_linear_variance = swiglu_linear_variance / @as(f64, @floatFromInt(input_size * output_size)) - swiglu_linear_mean * swiglu_linear_mean;
+
+    var swiglu_gate_mean: f64 = 0.0;
+    var swiglu_gate_variance: f64 = 0.0;
+    for (0..input_size) |i| {
+        for (0..output_size) |j| {
+            const weight = swiglu_layer.gate_weights.get(i, j);
+            swiglu_gate_mean += weight;
+            swiglu_gate_variance += weight * weight;
+        }
+    }
+    swiglu_gate_mean /= @as(f64, @floatFromInt(input_size * output_size));
+    swiglu_gate_variance = swiglu_gate_variance / @as(f64, @floatFromInt(input_size * output_size)) - swiglu_gate_mean * swiglu_gate_mean;
+
+    // For SwiGLU, expected mean should be close to 0 and variance close to 2/(input_size + output_size)
+    try testing.expectApproxEqAbs(@as(f64, 0.0), swiglu_linear_mean, 0.1);
+    try testing.expectApproxEqAbs(@as(f64, 0.0), swiglu_gate_mean, 0.1);
+    try testing.expectApproxEqAbs(expected_variance, swiglu_linear_variance, 0.1);
+    try testing.expectApproxEqAbs(expected_variance, swiglu_gate_variance, 0.1);
+}
