
Commit 2ecc0b8

notebook doc
1 parent fdd45a2 commit 2ecc0b8

10 files changed: +1717 −82 lines

README.md (+4 −4)
@@ -41,11 +41,11 @@ On MNIST: (a) acc[96.80%] & loss vs. epochs for mlp; (b) acc[97.86%] & loss vs.
 
 Some small tests for debug during the development of this project:
 
-- How to Use Jax Gradient, <ins>*Ideas about how I manage parameters in this Framework*</ins>.
-- When to use JIT in Jax? <ins>*About Time & Space*</ins> <mark>TODO</mark>
 - How to Use Mini-torch? <ins>*A brief e.g. Doc*</ins> <mark>TODO</mark>
-- Kaiming Initialization[[2](#reference)] used in MLP & Conv, <ins>*With math derivation*</ins>
-- Difference between Conv2d Operation by python loop and by **Jax.Lax**.
+- How to Use Jax Gradient, <ins>*Ideas about how I manage parameters in this Framework*</ins>.
+- Some Jax Tips, <ins>*About How to Use Jax Builtins & JIT to Optimize Loops & Matrix Operations.*</ins>
+- Kaiming Initialization[[2](#reference)] used in MLP & Conv, <ins>*With math derivation.*</ins>
+- Difference between Conv2d Operation by python loop and by <ins>**Jax.lax**</ins>.
 - Dropout mechanism impl, <ins>*About Seed in Jax*.</ins>
 - Runge-Kuta solver for Neural ODE.
 
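For context only (not part of this commit): the Kaiming-initialization item in the list above refers to the separate derivation doc. A minimal JAX sketch of the idea, with hypothetical layer sizes, scales weights by sqrt(2 / fan_in) so ReLU activations keep roughly unit variance; this is illustrative and is not the repo's Initer code.

```python
import jax.numpy as jnp
from jax import random

def kaiming_normal(key, fan_in, fan_out):
    # He/Kaiming init for ReLU layers: std = sqrt(2 / fan_in)
    std = jnp.sqrt(2.0 / fan_in)
    return std * random.normal(key, (fan_in, fan_out))

key = random.PRNGKey(0)
W = kaiming_normal(key, fan_in=784, fan_out=128)  # e.g. an MNIST-sized layer
print(W.std())  # roughly sqrt(2 / 784) ≈ 0.05
```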

example/data_analysis.ipynb (+375)

Large diffs are not rendered by default.

example/data_process.py (+14 −7)
@@ -57,13 +57,20 @@ def one_hot(y: jnp.ndarray, num_class: int):
 TRAIN = None
 TEST = None
 
-shuffle_kernel = random.permutation(key, (X_train.shape[0]))
-X_train = X_train[shuffle_kernel][:TRAIN]
-y_train = y_train[shuffle_kernel][:TRAIN]
-shuffle_kernel = random.permutation(key, (X_test.shape[0]))
-X_test = X_test[shuffle_kernel][:TEST]
-y_test = y_test[shuffle_kernel][:TEST]
-
+Shuffle = False
+
+if Shuffle:
+    shuffle_kernel = random.permutation(key, (X_train.shape[0]))
+    X_train = X_train[shuffle_kernel][:TRAIN]
+    y_train = y_train[shuffle_kernel][:TRAIN]
+    shuffle_kernel = random.permutation(key, (X_test.shape[0]))
+    X_test = X_test[shuffle_kernel][:TEST]
+    y_test = y_test[shuffle_kernel][:TEST]
+else:
+    X_train = X_train[:TRAIN]
+    y_train = y_train[:TRAIN]
+    X_test = X_test[:TEST]
+    y_test = y_test[:TEST]
 
 # X_train = jnp.transpose(X_train, (2, 0, 1))
 # X_test = jnp.transpose(X_test, (2, 0, 1))
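As background for the hunk above (not part of the diff): `jax.random.permutation` called with an integer returns a shuffled index array, so indexing X and y with the same permutation keeps sample/label pairs aligned, and slicing with a `None` bound (as with the `TRAIN`/`TEST` constants) is a no-op. A small self-contained sketch with dummy data:

```python
import jax.numpy as jnp
from jax import random

key = random.PRNGKey(0)
X = jnp.arange(10).reshape(5, 2)   # 5 dummy samples
y = jnp.arange(5)                  # matching labels

# permutation of an int n returns a shuffled arange(n)
idx = random.permutation(key, X.shape[0])
X_shuf, y_shuf = X[idx], y[idx]    # same index keeps (x, y) pairs aligned

# slicing with None keeps everything: arr[:None] is the whole array
assert X_shuf[:None].shape == X_shuf.shape
```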

example/main.py (+49 −29)
@@ -5,26 +5,42 @@
 from plugins.minitorch.nn import Rnn, Dense, Model
 from plugins.minitorch.optimizer import Adam
 from plugins.minitorch.initer import Initer
-from plugins.minitorch.utils import softmax
+from plugins.minitorch.utils import softmax, cross_entropy_loss, l2_regularization
+from plugins.minitorch.loss import CrossEntropyLoss
 
 from data_process import X_train, X_test, y_train, y_test
 
 key = random.PRNGKey(0)
 
 
-class LSTM(Model):
+class MyLoss(CrossEntropyLoss):
+    def __init__(self, f):
+        super(MyLoss, self).__init__(f)
+
+    def get_loss(self, train):
+        loss_function = lambda params, x, y_true: cross_entropy_loss(y_true, self.f(x, params, train)) + l2_regularization(params, 0.01)
+        return loss_function
+
+    def get_embed_loss(self, x, y_true, train):
+        embed_loss_function = lambda params: cross_entropy_loss(y_true, self.f(x, params, train)) + l2_regularization(params, 0.01)
+        return embed_loss_function
+
+
+class SplitLSTM(Model):
     def __init__(self, lr, epoches, batch_size):
         super().__init__(lr=lr, epoches=epoches)
 
         self.config = {
             'lstm:0': Rnn.get_lstm(128, 9, 64),
             'lstm:1even': Rnn.get_lstm(64, 64, 32),
             'lstm:1odd': Rnn.get_lstm(64, 64, 32),
+            'lstm:2': Rnn.get_lstm(64, 64, 64),
             'fc:0': Dense.get_linear(64, 6),
         }
 
         initer = Initer(self.config, key)
         self.optr = Adam(initer(), lr=lr, batch_size=batch_size)
+        self.lossr = MyLoss(self.predict_proba)
 
     def predict_proba(self, x, params, train=True):
         res = jnp.transpose(x, (2, 0, 1))
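As a framework-free illustration of what the `MyLoss` lambdas compute (cross-entropy on predicted probabilities plus an L2 penalty over all parameter leaves): the sketch below assumes the minitorch helpers behave roughly like these plain-jnp versions; the names here are illustrative, not the library's API. Embedding `x`/`y_true` into the closure, as `get_embed_loss` does, leaves a function of `params` only, which is exactly the shape `jax.grad` wants.

```python
import jax
import jax.numpy as jnp

def cross_entropy(y_true, y_proba, eps=1e-8):
    # mean negative log-likelihood for one-hot targets
    return -jnp.mean(jnp.sum(y_true * jnp.log(y_proba + eps), axis=-1))

def l2_penalty(params, lam):
    # sum of squared entries over every leaf of the params pytree
    leaves = jax.tree_util.tree_leaves(params)
    return lam * sum(jnp.sum(leaf ** 2) for leaf in leaves)

def make_embed_loss(predict_proba, x, y_true, lam=0.01):
    # a function of params only, ready for jax.grad
    return lambda params: cross_entropy(y_true, predict_proba(x, params)) + l2_penalty(params, lam)
```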
@@ -36,18 +52,21 @@ def predict_proba(self, x, params, train=True):
         even, _, _ = Rnn.lstm(even, params['lstm:1even'], self.config['lstm:1even'])
         odd, _, _ = Rnn.lstm(odd, params['lstm:1odd'], self.config['lstm:1odd'])
 
-        res = jnp.concatenate((even[-1], odd[-1]), axis=1)
+        res = jnp.concatenate((even, odd), axis=2)
+
+        res, _, _ = Rnn.lstm(res, params['lstm:2'], self.config['lstm:2'])
+        res = res[-1]
 
         res = Dense.linear(res, params['fc:0'])
 
         return softmax(res)
 
 
-epochs = 40
+epochs = 200
 batch_size = 64
-learning_rate = 0.015
+learning_rate = 0.005
 
-model = LSTM(lr=learning_rate, epoches=epochs, batch_size=batch_size)
+model = SplitLSTM(lr=learning_rate, epoches=epochs, batch_size=batch_size)
 acc, loss, tacc, tloss = model.fit(
     x_train=X_train,
     y_train_proba=y_train,
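On the changed concatenation (not part of the diff): assuming time-major LSTM outputs of shape (T, N, H), the old code joined only the last timesteps along the feature axis, while the new code joins the full sequences along axis 2 and passes them through the extra `lstm:2` before taking the last step. A pure-jnp shape check under those assumed shapes:

```python
import jax.numpy as jnp

T, N, H = 64, 32, 32                                  # assumed: timesteps, batch, hidden size
even = jnp.zeros((T, N, H))
odd = jnp.zeros((T, N, H))

old = jnp.concatenate((even[-1], odd[-1]), axis=1)    # (N, 2H): last steps only
new = jnp.concatenate((even, odd), axis=2)            # (T, N, 2H): whole sequences

print(old.shape, new.shape)                           # (32, 64) (64, 32, 64)
last = new[-1]                                        # (N, 2H), the step lstm:2 would summarize
```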
@@ -56,33 +75,34 @@ def predict_proba(self, x, params, train=True):
 )
 
 
-fig, ax1 = plt.subplots()
+def plot_curve(acc, tacc, loss, tloss, epochs):
+    fig, ax1 = plt.subplots()
 
-plt.rcParams['font.family'] = 'Noto Serif SC'
-plt.rcParams['font.sans-serif'] = ['Noto Serif SC']
+    plt.rcParams['font.family'] = 'Noto Serif SC'
+    plt.rcParams['font.sans-serif'] = ['Noto Serif SC']
 
-color = 'tab:red'
-ax1.set_xlabel('Epochs')
-ax1.set_ylabel('Accuracy', color=color)
-ax1.plot(range(epochs), acc, color=color, label='Train Accuracy', linestyle='-')
-ax1.plot(range(epochs), tacc, color=color, label='Test Accuracy', linestyle='--')
-ax1.tick_params(axis='y', labelcolor=color)
+    color = 'tab:red'
+    ax1.set_xlabel('Epochs')
+    ax1.set_ylabel('Accuracy', color=color)
+    ax1.plot(range(epochs), acc, color=color, label='Train Accuracy', linestyle='-')
+    ax1.plot(range(epochs), tacc, color=color, label='Test Accuracy', linestyle='--')
+    ax1.tick_params(axis='y', labelcolor=color)
 
-ax2 = ax1.twinx()
+    ax2 = ax1.twinx()
 
-color = 'tab:blue'
-ax2.set_ylabel('Loss', color=color)
-ax2.plot(range(epochs), loss, color=color, label='Train Loss', linestyle='-')
-ax2.plot(range(epochs), tloss, color=color, label='Test Loss', linestyle='--')
-ax2.tick_params(axis='y', labelcolor=color)
+    color = 'tab:blue'
+    ax2.set_ylabel('Loss', color=color)
+    ax2.plot(range(epochs), loss, color=color, label='Train Loss', linestyle='-')
+    ax2.plot(range(epochs), tloss, color=color, label='Test Loss', linestyle='--')
+    ax2.tick_params(axis='y', labelcolor=color)
 
-handles1, labels1 = ax1.get_legend_handles_labels()
-handles2, labels2 = ax2.get_legend_handles_labels()
-ax1.legend(handles1 + handles2, labels1 + labels2, loc='lower right')
+    handles1, labels1 = ax1.get_legend_handles_labels()
+    handles2, labels2 = ax2.get_legend_handles_labels()
+    ax1.legend(handles1 + handles2, labels1 + labels2, loc='lower right')
 
-plt.title('Training and Testing Accuracy and Loss over Epochs')
-fig.tight_layout()
-plt.show()
+    plt.title('Training and Testing Accuracy and Loss over Epochs')
+    fig.tight_layout()
+    plt.show()
 
-print(f'final train, test acc : {acc[-1]}, {tacc[-1]}')
-print(f'final train, test loss: {loss[-1]}, {tloss[-1]}')
+    print(f'final train, test acc : {acc[-1]}, {tacc[-1]}')
+    print(f'final train, test loss: {loss[-1]}, {tloss[-1]}')

example/plot.py

Whitespace-only changes.

example/rebuild_date.ipynb (+425)

Large diffs are not rendered by default.

notebook_docs/grad.ipynb (+3 −1)
@@ -158,7 +158,9 @@
 "Here is a very simple MLP case. As you can see, we get a gradient dict of trainable parameters we inited before. And then you can apply this result to GD algorithms like SGD, Adam... easy right? \n",
 "But this is also not what we want. This kind of initalization and optimization is very complex. So we can apply Pipeline Pattern to make it more easy to manage this procedure for users: \n",
 "\n",
-"![pipeline.svg](../assets/notebook_docs/minitorch.svg)\n",
+"<p align=\"center\">\n",
+" <img src=\"../assets/notebook_docs/minitorch.svg\" alt=\"Overview of framework\", width=\"50%\">\n",
+"</p>\n",
 "\n",
 "<p align=\"center\">\n",
 "Overview of Framework\n",
