minor improvements
ZuseZ4 committed Feb 11, 2024
1 parent 7516f1b commit 96782cd
Showing 1 changed file, src/usage/usage.md, with 30 additions and 18 deletions.

```rust
fn f(x: &[f32], y: &mut f32) {
    *y = x[0] * x[0] + x[1] * x[0];
}
```
We also support functions that return a float value:
```rust
fn g(x: &[f32]) -> f32 {
    x[0] * x[0] + x[1] * x[0]
}
```

In forward mode we compute, next to the original return value \\(y\\), also its tangent \\(\dot{y}\\):

\\[
\begin{aligned}
y &= f(x) \\\\
\dot{y} &= \nabla f(x) \cdot \dot{x}
\end{aligned}
\\]
To obtain the first element of the gradient using the forward model
we have to seed \\(\dot{x}\\) with \\(\dot{x}=[1.0,0.0]\\).

In the forward mode, the second element, which gets added for `Dual` arguments, stores the tangent.
```rust
#[autodiff(df, Forward, Dual, Dual)]
fn f(x: &[f32], y: &mut f32) { ... }

fn main() {
    let x = [2.0, 2.0];
    let mut dx = [1.0, 0.0];
    let mut y = 0.0;
    let mut dy = 0.0;
    df(&x, &mut dx, &mut y, &mut dy);
}
```
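
With this seed the call computes the directional derivative of `f` in the direction `[1.0, 0.0]`, which for the `f` defined at the top of this page works out to

\\[
\dot{y} = \nabla f(x) \cdot \dot{x} = (2 \cdot 2.0 + 2.0) \cdot 1.0 + 2.0 \cdot 0.0 = 6.0.
\\]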

In the returning case we would write similar code; note that in this case
the generated `df` returns the tangent together with the original return value as a tuple.

```rust
#[autodiff(df, Forward, Dual, Dual)]
fn f(x: &[f32]) -> f32 { ... }

fn main() {
    let x = [2.0, 2.0];
    let mut dx = [1.0, 0.0];
    let (y, dy) = df(&x, &mut dx);
}
```
Note that to acquire the full gradient one needs to execute the forward model a second time with the seed `dx` set to `[0.0, 1.0]`.
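
Running the forward model once with each unit seed therefore yields the full gradient of the example function,

\\[
\nabla f([2.0, 2.0]) = (2 \cdot 2.0 + 2.0,\ 2.0) = (6.0,\ 2.0),
\\]

which matches the values asserted in the reverse-mode example further down.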

In reverse mode we use the `Duplicated` activity, which adds a shadow argument for each marked parameter:

```rust
#[autodiff(df, Reverse, Duplicated, Duplicated)]
fn f(x: &[f32], y: &mut f32) { ... }

fn main() {
    let x = [2.0, 2.0];
    let mut bx = [0.0, 0.0];
    let mut y = 0.0;
    let mut by = 1.0;
    df(&x, &mut bx, &mut y, &mut by);
}
```
This yields the gradient of `f` in `bx` at point `x = [2.0, 2.0]`.
`by` is called the seed and has to be set to `1.0` in order to compute
the gradient. Please note that unlike `Dual`, the `Duplicated` seed
is zeroed during the call, which is required for correctness in certain cases.
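
A minimal sketch of the consequence, assuming the seed has to be re-set before `df` is used again and that the gradient accumulates into the shadow `bx`:

```rust
fn main() {
    let x = [2.0, 2.0];
    let mut bx = [0.0, 0.0];
    let mut y = 0.0;
    let mut by = 1.0;

    // First call: bx receives the gradient, by is zeroed by the call.
    df(&x, &mut bx, &mut y, &mut by);

    // Hypothetical second call: re-seed by and clear bx (assuming the
    // gradient accumulates into the shadow across calls).
    by = 1.0;
    bx = [0.0, 0.0];
    df(&x, &mut bx, &mut y, &mut by);
}
```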

We can again also handle functions returning a float value. In this case we mark the
return value as `Active`. The seed is then passed as an extra,
final input argument.

```rust
#[autodiff(dg, Reverse, Duplicated, Active)]
fn g(x: &[f32]) -> f32 { ... }

fn main() {
    let x = [2.0, 2.0];
    let mut bx = [0.0, 0.0];
    let seed = 1.0;
    let y = dg(&x, &mut bx, seed);
    // The gradient of g at x has been accumulated into bx.
    assert!(bx[0] == 6.0 && bx[1] == 2.0);
}
```

We can now verify that the reverse mode and the forward mode indeed yield the same result.

```rust
#[autodiff(df_f, Forward, Dual, Dual)]
#[autodiff(df_r, Reverse, Duplicated, Duplicated)]
fn f(x: &[f32], y: &mut f32) { ... }

fn main() {
    let x = [2.0, 2.0];

    // Forward mode: one call per input, seeding dx with the corresponding unit vector.
    let mut dx_1 = [1.0, 0.0];
    let mut dx_2 = [0.0, 1.0];
    let mut y = 0.0;
    let mut dy_f = [0.0, 0.0];
    df_f(&x, &mut dx_1, &mut y, &mut dy_f[0]);
    df_f(&x, &mut dx_2, &mut y, &mut dy_f[1]);

    // Reverse mode: a single call writes the full gradient into bx.
    let mut bx = [0.0, 0.0];
    let mut dy_r = 1.0;
    df_r(&x, &mut bx, &mut y, &mut dy_r);

    // assert_approx_eq! can come from the assert_approx_eq crate.
    assert_approx_eq!(dy_f[0], bx[0]);
    assert_approx_eq!(dy_f[1], bx[1]);
}
```


As we can see, the number of calls in forward mode scales with the number of
input values, while reverse mode scales with the number of output values. Reverse
mode is therefore preferable when we have fewer outputs than inputs. A common example
is the training of neural networks, where we have a single output (the loss)
but a large number of inputs (the weights).
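
As a rough rule of thumb, for a function \\(f: \mathbb{R}^n \to \mathbb{R}^m\\) assembling the full Jacobian costs on the order of

\\[
n \cdot \mathrm{cost}(f) \ \text{(forward mode)} \qquad \text{vs.} \qquad m \cdot \mathrm{cost}(f) \ \text{(reverse mode)},
\\]

so reverse mode wins whenever \\(m \ll n\\), for example \\(m = 1\\) for a scalar loss.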
