@@ -1,7 +1,7 @@
'''
JAX Optimization Algorithms Module

- * Last Updated: 2025-03-09
+ * Last Updated: 2025-03-15
 * Author: HugoPhi, [GitHub](https://github.com/HugoPhi)

@@ -39,7 +39,7 @@
'''

import jax.numpy as jnp
-from jax import grad, tree, lax
+from jax import grad, tree, lax, random
from abc import ABC, abstractmethod

@@ -89,19 +89,20 @@ def flash(self):
        '''
        pass

-    def open(self, loss_function, x_train: jnp.ndarray, y_train: jnp.ndarray, short_batch='drop'):
+    def open(self, loss_function, x_train: jnp.ndarray, y_train: jnp.ndarray, short_batch='drop', key=random.PRNGKey(42)):
        '''
        Prepares the optimizer for training by initializing its state and setting up the training data.

        Args:
            loss_function: A loss function that computes the scalar loss given model parameters,
                input data, and true labels. It must be JIT-compiled.
-               Signature: `f(params, x, y_true) -> scalar`.
+               Signature: `f(params, x, y_true, key=random.PRNGKey(42)) -> scalar`.
            x_train: Input data for training. Shape: `(num_samples, ...)`.
            y_train: True labels for training. Shape: `(num_samples, ...)`.
            short_batch: The strategy for handling a short final batch. One of:
                - 'drop': drop the short batch; used when dataset size >> batch size, num_batches = N // B
                - 'pad': append arr[-B:] to the trimmed arr; used when dataset size >~ batch size, num_batches = N // B + 1
+           key: A random number generator key used for initialization.

        Notes:
            - The training data is divided into batches based on the `batch_size` attribute.
@@ -114,6 +115,7 @@ def open(self, loss_function, x_train: jnp.ndarray, y_train: jnp.ndarray, short_
        else:
            self.flash()
        self._loss = loss_function
+       self.key = key

        if short_batch == 'drop':
            self.num_batches = x_train.shape[0] // self.batch_size
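
For orientation, here is a minimal sketch of a loss function matching the new signature and of how `open` might be called with it. The dropout-style forward pass, the parameter names `w`/`b`, and the `Adam(...)` constructor in the comment are illustrative assumptions; only the `f(params, x, y_true, key) -> scalar` contract and the `key` argument to `open` come from this change.

```python
import jax.numpy as jnp
from jax import jit, random

@jit
def loss_fn(params, x, y_true, key=random.PRNGKey(42)):
    # Hypothetical forward pass that consumes the PRNG key (e.g. dropout),
    # which is why open()/update() now thread a key into the loss.
    h = jnp.dot(x, params['w']) + params['b']
    keep = random.bernoulli(key, 0.8, shape=h.shape)  # dropout mask
    h = jnp.where(keep, h / 0.8, 0.0)
    return jnp.mean((h - y_true) ** 2)  # stand-in scalar loss

# Assumed usage (constructor name and arguments are hypothetical):
# opt = Adam(params, lr=1e-3, batch_size=32)
# opt.open(loss_fn, x_train, y_train, short_batch='drop', key=random.PRNGKey(0))
```
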
@@ -247,6 +249,8 @@ def update(self):
        ixs = jnp.arange(self.num_batches)
        bxs = self.x_train.reshape(self.num_batches, self.batch_size, *self.x_train.shape[1:])
        bys = self.y_train.reshape(self.num_batches, self.batch_size, *self.y_train.shape[1:])
+       subkeys = random.split(self.key, self.num_batches + 1)
+       self.key, subkeys = subkeys[0], subkeys[1:]  # update self.key & get subkeys

        def one_batch(carry, ix):
@@ -270,7 +274,8 @@ def adam(d_w, w, v, vv):

            bx = bxs[ix]
            by = bys[ix]
-           d_params = grad(self._loss, argnums=0)(carry['params'], bx, by)
+           kkey = subkeys[ix]
+           d_params = grad(self._loss, argnums=0)(carry['params'], bx, by, kkey)

            pack = tree.map(adam, d_params, carry['params'], carry['V'], carry['VV'])  # use Adam
            carry['params'] = tree.map(lambda x: x[0], pack)
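
The pair of added lines above (splitting `self.key` into per-batch subkeys and passing `subkeys[ix]` to the loss gradient) is repeated in every optimizer's `update()` below. As a self-contained illustration of the pattern outside the class, a bare-bones epoch loop with plain gradient descent might look like the following; `scan_epoch`, `loss_fn`, and `lr` are assumed names, not part of this module.

```python
import jax.numpy as jnp
from jax import grad, lax, random, tree

def scan_epoch(params, bxs, bys, key, loss_fn, lr=0.1):
    # One epoch over pre-batched data: draw one fresh PRNG subkey per batch
    # and keep one key for the next epoch, mirroring the change in update().
    num_batches = bxs.shape[0]
    subkeys = random.split(key, num_batches + 1)
    key, subkeys = subkeys[0], subkeys[1:]

    def one_batch(carry, ix):
        d_params = grad(loss_fn, argnums=0)(carry, bxs[ix], bys[ix], subkeys[ix])
        carry = tree.map(lambda w, g: w - lr * g, carry, d_params)  # plain GD step
        return carry, None

    params, _ = lax.scan(one_batch, params, jnp.arange(num_batches))
    return params, key
```
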
@@ -346,6 +351,8 @@ def update(self):
        ixs = jnp.arange(self.num_batches)
        bxs = self.x_train.reshape(self.num_batches, self.batch_size, *self.x_train.shape[1:])
        bys = self.y_train.reshape(self.num_batches, self.batch_size, *self.y_train.shape[1:])
+       subkeys = random.split(self.key, self.num_batches + 1)
+       self.key, subkeys = subkeys[0], subkeys[1:]  # update self.key & get subkeys

        def one_batch(carry, ix):
@@ -355,8 +362,9 @@ def gd(d_w, w):

            bx = bxs[ix]
            by = bys[ix]
+           kkey = subkeys[ix]

-           d_params = grad(self._loss, argnums=0)(carry['params'], bx, by)
+           d_params = grad(self._loss, argnums=0)(carry['params'], bx, by, kkey)

            pack = tree.map(gd, d_params, carry['params'])
            carry['params'] = pack
@@ -436,6 +444,8 @@ def update(self):
        ixs = jnp.arange(self.num_batches)
        bxs = self.x_train.reshape(self.num_batches, self.batch_size, *self.x_train.shape[1:])
        bys = self.y_train.reshape(self.num_batches, self.batch_size, *self.y_train.shape[1:])
+       subkeys = random.split(self.key, self.num_batches + 1)
+       self.key, subkeys = subkeys[0], subkeys[1:]  # update self.key & get subkeys

        def one_batch(carry, ix):
@@ -446,7 +456,9 @@ def momentum(d_w, w, v):

            bx = bxs[ix]
            by = bys[ix]
-           d_params = grad(self._loss, argnums=0)(carry['params'], bx, by)
+           kkey = subkeys[ix]
+
+           d_params = grad(self._loss, argnums=0)(carry['params'], bx, by, kkey)

            pack = tree.map(momentum, d_params, carry['params'], carry['V'])
            carry['params'] = tree.map(lambda x: x[0], pack)
@@ -529,6 +541,8 @@ def update(self):
        ixs = jnp.arange(self.num_batches)
        bxs = self.x_train.reshape(self.num_batches, self.batch_size, *self.x_train.shape[1:])
        bys = self.y_train.reshape(self.num_batches, self.batch_size, *self.y_train.shape[1:])
+       subkeys = random.split(self.key, self.num_batches + 1)
+       self.key, subkeys = subkeys[0], subkeys[1:]  # update self.key & get subkeys

        def one_batch(carry, ix):
@@ -541,7 +555,9 @@ def nag(d_w, w, v):

            bx = bxs[ix]
            by = bys[ix]
-           d_params = grad(self._loss, argnums=0)(carry['params'], bx, by)
+           kkey = subkeys[ix]
+
+           d_params = grad(self._loss, argnums=0)(carry['params'], bx, by, kkey)

            pack = tree.map(nag, d_params, carry['params'], carry['V'])
            carry['params'] = tree.map(lambda x: x[0], pack)
@@ -623,6 +639,8 @@ def update(self):
        ixs = jnp.arange(self.num_batches)
        bxs = self.x_train.reshape(self.num_batches, self.batch_size, *self.x_train.shape[1:])
        bys = self.y_train.reshape(self.num_batches, self.batch_size, *self.y_train.shape[1:])
+       subkeys = random.split(self.key, self.num_batches + 1)
+       self.key, subkeys = subkeys[0], subkeys[1:]  # update self.key & get subkeys

        def one_batch(carry, ix):
@@ -633,7 +651,9 @@ def adagrad(d_w, w, g):

            bx = bxs[ix]
            by = bys[ix]
-           d_params = grad(self._loss, argnums=0)(carry['params'], bx, by)
+           kkey = subkeys[ix]
+
+           d_params = grad(self._loss, argnums=0)(carry['params'], bx, by, kkey)

            pack = tree.map(adagrad, d_params, carry['params'], carry['G'])
            carry['params'] = tree.map(lambda x: x[0], pack)
@@ -720,6 +740,8 @@ def update(self):
        ixs = jnp.arange(self.num_batches)
        bxs = self.x_train.reshape(self.num_batches, self.batch_size, *self.x_train.shape[1:])
        bys = self.y_train.reshape(self.num_batches, self.batch_size, *self.y_train.shape[1:])
+       subkeys = random.split(self.key, self.num_batches + 1)
+       self.key, subkeys = subkeys[0], subkeys[1:]  # update self.key & get subkeys

        def one_batch(carry, ix):
@@ -730,7 +752,9 @@ def rmsprop(d_w, w, g):

            bx = bxs[ix]
            by = bys[ix]
-           d_params = grad(self._loss, argnums=0)(carry['params'], bx, by)
+           kkey = subkeys[ix]
+
+           d_params = grad(self._loss, argnums=0)(carry['params'], bx, by, kkey)

            pack = tree.map(rmsprop, d_params, carry['params'], carry['G'])
            carry['params'] = tree.map(lambda x: x[0], pack)
@@ -817,6 +841,8 @@ def update(self):
        ixs = jnp.arange(self.num_batches)
        bxs = self.x_train.reshape(self.num_batches, self.batch_size, *self.x_train.shape[1:])
        bys = self.y_train.reshape(self.num_batches, self.batch_size, *self.y_train.shape[1:])
+       subkeys = random.split(self.key, self.num_batches + 1)
+       self.key, subkeys = subkeys[0], subkeys[1:]  # update self.key & get subkeys

        def one_batch(carry, ix):
@@ -829,7 +855,9 @@ def adadelta(d_w, w, e_g2, e_dx2):

            bx = bxs[ix]
            by = bys[ix]
-           d_params = grad(self._loss, argnums=0)(carry['params'], bx, by)
+           kkey = subkeys[ix]
+
+           d_params = grad(self._loss, argnums=0)(carry['params'], bx, by, kkey)

            pack = tree.map(adadelta, d_params, carry['params'], carry['E_g2'], carry['E_dx2'])
            carry['params'] = tree.map(lambda x: x[0], pack)