diff --git a/assignment-2/submission/18307130074/img/basic_test.png b/assignment-2/submission/18307130074/img/basic_test.png new file mode 100644 index 0000000000000000000000000000000000000000..09a77bfd30f6ba7bf950b8413637d9fe163ebe53 Binary files /dev/null and b/assignment-2/submission/18307130074/img/basic_test.png differ diff --git a/assignment-2/submission/18307130074/img/get_torch_initialization.png b/assignment-2/submission/18307130074/img/get_torch_initialization.png new file mode 100644 index 0000000000000000000000000000000000000000..5346f4e9b753d2375e9db0d9674101836a32989c Binary files /dev/null and b/assignment-2/submission/18307130074/img/get_torch_initialization.png differ diff --git a/assignment-2/submission/18307130074/img/mini_batch.png b/assignment-2/submission/18307130074/img/mini_batch.png new file mode 100644 index 0000000000000000000000000000000000000000..45a8913a0f8fc0dd86aab705aa6fb87d7d796fdc Binary files /dev/null and b/assignment-2/submission/18307130074/img/mini_batch.png differ diff --git a/assignment-2/submission/18307130074/img/research.png b/assignment-2/submission/18307130074/img/research.png new file mode 100644 index 0000000000000000000000000000000000000000..77f8f897fd02871f3a93a3636bc374c0232b6341 Binary files /dev/null and b/assignment-2/submission/18307130074/img/research.png differ diff --git a/assignment-2/submission/18307130074/numpy_fnn.py b/assignment-2/submission/18307130074/numpy_fnn.py new file mode 100644 index 0000000000000000000000000000000000000000..11820682b35a685487f8246c31a6026ed2190c00 --- /dev/null +++ b/assignment-2/submission/18307130074/numpy_fnn.py @@ -0,0 +1,171 @@ +import numpy as np + + +class NumpyOp: + + def __init__(self): + self.memory = {} + self.epsilon = 1e-12 + + +class Matmul(NumpyOp): + + def forward(self, x, W): + """ + x: shape(N, d) + w: shape(d, d') + """ + self.memory['x'] = x + self.memory['W'] = W + h = np.matmul(x, W) + return h + + def backward(self, grad_y): + """ + grad_y: shape(N, d') + """ + + grad_x = np.matmul(grad_y, self.memory['W'].T) + grad_W = np.matmul(self.memory['x'].T, grad_y) + + return grad_x, grad_W + + +class Relu(NumpyOp): + + def forward(self, x): + self.memory['x'] = x + return np.where(x > 0, x, np.zeros_like(x)) + + def backward(self, grad_y): + """ + grad_y: same shape as x + """ + + grad_x = np.where(self.memory['x'] > 0, 1, 0) * grad_y + + return grad_x + + +class Log(NumpyOp): + + def forward(self, x): + """ + x: shape(N, c) + """ + + out = np.log(x + self.epsilon) + self.memory['x'] = x + + return out + + def backward(self, grad_y): + """ + grad_y: same shape as x + """ + + grad_x = np.reciprocal(self.memory['x'] + self.epsilon) * grad_y + + return grad_x + + +class Softmax(NumpyOp): + """ + softmax over last dimension + """ + + def forward(self, x): + """ + x: shape(N, c) + """ + + r = np.exp(x) + s = np.sum(r, axis=1).reshape(-1, 1) + out = (r / s).astype('float64') + self.memory['out'] = out + + return out + + def backward(self, grad_y): + """ + grad_y: same shape as x + """ + + out = self.memory['out'] + Matrix = [] + for i in range(out.shape[0]): + row = out[i] + Jacob = np.diag(row) - np.outer(row, row) + Matrix.append(Jacob) + Matrix = np.array(Matrix) + grad_x = np.squeeze(np.matmul(grad_y[:,np.newaxis,:], Matrix), axis=1) + + return grad_x + + +class NumpyLoss: + + def __init__(self): + self.target = None + + def get_loss(self, pred, target): + self.target = target + return (-pred * target).sum(axis=1).mean() + + def backward(self): + return -self.target / 
self.target.shape[0]
+
+
+class NumpyModel:
+    def __init__(self):
+        self.W1 = np.random.normal(size=(28 * 28, 256))
+        self.W2 = np.random.normal(size=(256, 64))
+        self.W3 = np.random.normal(size=(64, 10))
+
+        # operators used in both forward and backward
+        self.matmul_1 = Matmul()
+        self.relu_1 = Relu()
+        self.matmul_2 = Matmul()
+        self.relu_2 = Relu()
+        self.matmul_3 = Matmul()
+        self.softmax = Softmax()
+        self.log = Log()
+
+        # variables updated in backward: softmax_grad, log_grad, etc. hold the gradients propagated
+        # back through each operator (partial derivatives of the loss w.r.t. the operator's input)
+        self.x1_grad, self.W1_grad = None, None
+        self.relu_1_grad = None
+        self.x2_grad, self.W2_grad = None, None
+        self.relu_2_grad = None
+        self.x3_grad, self.W3_grad = None, None
+        self.softmax_grad = None
+        self.log_grad = None
+
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+
+        z1 = self.matmul_1.forward(x, self.W1)
+        x2 = self.relu_1.forward(z1)
+        z2 = self.matmul_2.forward(x2, self.W2)
+        x3 = self.relu_2.forward(z2)
+        z3 = self.matmul_3.forward(x3, self.W3)
+        out = self.softmax.forward(z3)
+        x = self.log.forward(out)
+
+        return x
+
+    def backward(self, y):
+
+        self.log_grad = self.log.backward(y)
+        self.softmax_grad = self.softmax.backward(self.log_grad)
+        self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+        self.relu_2_grad = self.relu_2.backward(self.x3_grad)
+        self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+        self.relu_1_grad = self.relu_1.backward(self.x2_grad)
+        self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+
+    def optimize(self, learning_rate):
+        self.W1 -= learning_rate * self.W1_grad
+        self.W2 -= learning_rate * self.W2_grad
+        self.W3 -= learning_rate * self.W3_grad
diff --git a/assignment-2/submission/18307130074/numpy_mnist.py b/assignment-2/submission/18307130074/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..689df447bfdcf31afa5bfc4d9328e77177412dba
--- /dev/null
+++ b/assignment-2/submission/18307130074/numpy_mnist.py
@@ -0,0 +1,83 @@
+import numpy as np
+from numpy_fnn import NumpyModel, NumpyLoss
+
+import os
+os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
+
+# from utils import download_mnist, batch, mini_batch, get_torch_initialization, plot_curve, one_hot
+from utils import download_mnist, batch, plot_curve, one_hot
+
+def mini_batch(dataset, batch_size=128):
+
+    data = []
+    label = []
+
+    for each in dataset:
+        data.append(np.array(each[0]))
+        label.append(each[1])
+
+    label = np.array(label)
+    data = np.array(data)
+
+    num = data.shape[0]
+    i = np.arange(num)
+    np.random.shuffle(i)
+
+    label_ = label[i]
+    data_ = data[i]
+
+    res = []
+    for id in range(num // batch_size):
+        batch_data = data_[id * batch_size: (id + 1) * batch_size]
+        batch_label = label_[id * batch_size: (id + 1) * batch_size]
+        res.append((batch_data, batch_label))
+
+    return res
+
+
+def get_torch_initialization():
+
+    def parameters(in_features, out_features, param = 5**0.5):
+        bound = (6 / (1 + param * param) / in_features ) ** 0.5
+        return np.random.uniform(-bound, bound, (in_features, out_features))
+
+    W1 = parameters(28 * 28, 256)
+    W2 = parameters(256, 64)
+    W3 = parameters(64, 10)
+    return W1, W2, W3
+
+
+def numpy_run():
+    train_dataset, test_dataset = download_mnist()
+
+    model = NumpyModel()
+    numpy_loss = NumpyLoss()
+    model.W1, model.W2, model.W3 = get_torch_initialization()
+
+    train_loss = []
+
+    epoch_number = 3
+    learning_rate = 0.1
+
+    for epoch in range(epoch_number):
+
+        for x, y in mini_batch(train_dataset):
+            y = one_hot(y)
+
+            y_pred = model.forward(x)
+            loss = numpy_loss.get_loss(y_pred, y)
+
+            model.backward(numpy_loss.backward())
+            model.optimize(learning_rate)
+
+            train_loss.append(loss.item())
+
+        x, y = batch(test_dataset)[0]
+        accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
+        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+
+    plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+    numpy_run()
diff --git a/assignment-2/submission/18307130074/readme.md b/assignment-2/submission/18307130074/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..71c3f6f1d990e3eff9e30a6ae8a33d2fbfbfe19b
--- /dev/null
+++ b/assignment-2/submission/18307130074/readme.md
@@ -0,0 +1,371 @@
+# Assignment 2: FNN
+
+18307130074 姜博天 (Topic 1)
+
+## 1. Backward-pass derivations for the operators in numpy_fnn.py
+
+Throughout this section, y denotes the gradient flowing into an operator during backpropagation, i.e. the partial derivative of the loss with respect to that operator's output.
+
+### 1. Matmul
+
+The forward pass computes h = xW, where x has shape (N, d) and W has shape (d, d'), so h has shape (N, d').
+
+The incoming gradient y has shape (N, d'):
+$$
+y_{ij} = \frac{\partial Loss}{\partial h_{ij}}
+$$
+Gradient with respect to W:
+$$
+\begin{aligned}
+    \frac{\partial Loss}{\partial W_{pq}}
+    &=\sum_{i \leqslant N,\ j \leqslant d'} \frac{\partial Loss}{\partial h_{ij}} \times \frac{\partial h_{ij}}{\partial W_{pq}}\\\\
+    &=\sum_{i \leqslant N} y_{iq} \times \frac{\partial h_{iq}}{\partial W_{pq}}\\\\
+    &=\sum_{i \leqslant N} y_{iq} \times x_{ip}\\\\
+    &=\sum_{i \leqslant N} (x^T)_{pi} \times y_{iq}
+\end{aligned}
+$$
+Therefore
+$$
+\frac{\partial Loss}{\partial W} = x^T \times y
+$$
+and, by the same argument, the gradient with respect to x is
+$$
+\frac{\partial Loss}{\partial x} = y \times W^T
+$$
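+
+As a quick sanity check (not part of the submitted code), both formulas can be compared against a numerical gradient. The sketch below assumes only the `Matmul` operator from `numpy_fnn.py`; `numerical_grad` is a hypothetical helper written here for illustration.
+
+```python
+import numpy as np
+from numpy_fnn import Matmul
+
+
+def numerical_grad(f, a, eps=1e-6):
+    # central finite differences of the scalar function f w.r.t. the array a
+    g = np.zeros_like(a)
+    it = np.nditer(a, flags=['multi_index'])
+    while not it.finished:
+        idx = it.multi_index
+        old = a[idx]
+        a[idx] = old + eps
+        f_plus = f()
+        a[idx] = old - eps
+        f_minus = f()
+        a[idx] = old
+        g[idx] = (f_plus - f_minus) / (2 * eps)
+        it.iternext()
+    return g
+
+
+x = np.random.normal(size=(5, 4))
+W = np.random.normal(size=(4, 3))
+
+op = Matmul()
+out = op.forward(x, W)
+# loss = sum(x @ W), so the upstream gradient is all ones
+grad_x, grad_W = op.backward(np.ones_like(out))
+
+num_grad_W = numerical_grad(lambda: np.matmul(x, W).sum(), W)
+num_grad_x = numerical_grad(lambda: np.matmul(x, W).sum(), x)
+
+print(np.abs(grad_W - num_grad_W).max())  # expected to be around 1e-9 or smaller
+print(np.abs(grad_x - num_grad_x).max())
+```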
+
+### 2. ReLU
+
+The forward pass maps x of shape (N, d) to out of shape (N, d):
+$$
+out_{ij} = \begin{cases}
+    0 & x_{ij} \leq 0 \\\\
+    x_{ij} & x_{ij} > 0
+  \end{cases}
+$$
+Gradient with respect to x:
+$$
+\begin{aligned}
+    \frac{\partial Loss}{\partial x_{ij}}
+    &=\frac{\partial Loss}{\partial out_{ij}} \times \frac{\partial out_{ij}}{\partial x_{ij}}\\\\
+    &=y_{ij} \times \frac{\partial out_{ij}}{\partial x_{ij}}
+\end{aligned}
+$$
+where
+$$
+\frac{\partial out_{ij}}{\partial x_{ij}} = \begin{cases}
+    0 & x_{ij} \leq 0 \\\\
+    1 & x_{ij} > 0
+  \end{cases}
+$$
+so the gradient is the element-wise product
+$$
+\frac{\partial Loss}{\partial x} = y \odot x', \quad
+x'_{ij} = \begin{cases}
+    0 & x_{ij} \leq 0 \\\\
+    1 & x_{ij} > 0
+  \end{cases}
+$$
+
+### 3. Log
+
+The forward pass maps x of shape (N, d) to out of shape (N, d):
+$$
+out_{ij} = \log x_{ij}
+$$
+Gradient with respect to x:
+$$
+\begin{aligned}
+    \frac{\partial Loss}{\partial x_{ij}}
+    &=\frac{\partial Loss}{\partial out_{ij}} \times \frac{\partial out_{ij}}{\partial x_{ij}}\\\\
+    &=y_{ij} \times \frac{\partial out_{ij}}{\partial x_{ij}}
+\end{aligned}
+$$
+where
+$$
+\frac{\partial out_{ij}}{\partial x_{ij}} = \frac{1}{x_{ij}}
+$$
+so, element-wise,
+$$
+\frac{\partial Loss}{\partial x} = y \odot \frac{1}{x}
+$$
+(The implementation adds a small epsilon to x in both forward and backward to avoid log(0) and division by zero.)
+
+### 4. Softmax
+
+The forward pass maps x of shape (N, d) to out of shape (N, d):
+$$
+out_{ij} = \frac{e^{x_{ij}}}{\sum_{k=1}^d e^{x_{ik}}}
+$$
+Each row of out depends only on the corresponding row of x, so we first consider a single row and then extend the result to the whole matrix.
+$$
+\frac{\partial Loss}{\partial x_i}
+=\sum_{j=1}^d \frac{\partial Loss}{\partial out_j} \times \frac{\partial out_j}{\partial x_i}
+=\sum_{j=1}^d y_j \times \frac{\partial out_j}{\partial x_i}
+$$
+with
+$$
+\frac{\partial out_i}{\partial x_j} =
+  \begin{cases}
+    out_i \times (1 - out_i) & i = j\\\\
+    -out_i \times out_j & i \neq j
+  \end{cases}
+$$
+Writing a row as
+$$
+softmax([x_1, x_2, \dots, x_d]) = [out_1, out_2, \dots, out_d]
+$$
+the gradient of that row is the row vector of upstream gradients times the Jacobian:
+$$
+\frac{\partial Loss}{\partial x} = \left[\frac{\partial Loss}{\partial out_1}, \frac{\partial Loss}{\partial out_2}, \dots, \frac{\partial Loss}{\partial out_d}\right] \times
+\begin{bmatrix}
+\frac{\partial out_1}{\partial x_1} & \frac{\partial out_1}{\partial x_2} & \dots & \frac{\partial out_1}{\partial x_d}\\\\
+\frac{\partial out_2}{\partial x_1} & \frac{\partial out_2}{\partial x_2} & \dots & \frac{\partial out_2}{\partial x_d}\\\\
+\dots & \dots & \dots & \dots\\\\
+\frac{\partial out_d}{\partial x_1} & \frac{\partial out_d}{\partial x_2} & \dots & \frac{\partial out_d}{\partial x_d}\\\\
+\end{bmatrix}
+$$
+So a [1, d] vector has to be multiplied by a [d, d] Jacobian to obtain the gradient of one row.
+
+For an [N, d] input, the upstream gradient can be viewed as an [N, 1, d] tensor by inserting an axis and batch-multiplied with an [N, d, d] stack of per-row Jacobians. The result is again [N, 1, d] and is squeezed back to [N, d] with numpy before being returned.
+
+The code is as follows:
+
+```python
+out = self.memory['out']
+Matrix = []
+for i in range(out.shape[0]):
+    row = out[i]
+    Jacob = np.diag(row) - np.outer(row, row)
+    Matrix.append(Jacob)
+Matrix = np.array(Matrix)
+grad_x = np.squeeze(np.matmul(grad_y[:,np.newaxis,:], Matrix), axis=1)
+
+return grad_x
+```
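+
+The per-row Jacobian never has to be materialized explicitly, though. From the case analysis above, $\sum_{j} y_j \frac{\partial out_j}{\partial x_i} = out_i \times (y_i - \sum_k y_k\,out_k)$, so an equivalent backward pass needs only element-wise operations and a row sum. This is only an optional sketch of that reformulation, not what the submitted `Softmax.backward` does:
+
+```python
+import numpy as np
+
+
+def softmax_backward_vectorized(grad_y, out):
+    # grad_y, out: arrays of shape (N, d); out is the softmax output saved in forward
+    # s[i] = sum_k grad_y[i, k] * out[i, k], kept as a column for broadcasting
+    s = np.sum(grad_y * out, axis=1, keepdims=True)
+    return out * (grad_y - s)
+```
+
+On random inputs this matches the Jacobian-based loop above up to floating-point error, while avoiding the [N, d, d] intermediate tensor.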
+
+## 2. Model training and testing
+
+### 1. Tests with the functions already provided in utils.py
+
+Basic test:
+
+| epoch | accuracy |
+| ----- | -------- |
+| 0     | 0.9409   |
+| 1     | 0.9653   |
+| 2     | 0.9694   |
+
+![basic_test](img/basic_test.png)
+
+**Comparison experiments with different epoch counts and learning rates**
+
+| epoch | accuracy with learning_rate = 0.05 | accuracy with learning_rate = 0.1 | accuracy with learning_rate = 0.2 |
+| ----- | ---------------------------------- | --------------------------------- | --------------------------------- |
+| 0     | 0.9266                             | 0.9468                            | 0.9602                            |
+| 1     | 0.9482                             | 0.9638                            | 0.9708                            |
+| 2     | 0.9571                             | 0.9701                            | 0.9752                            |
+| 4     | 0.9692                             | 0.9755                            | 0.9785                            |
+| 9     | 0.9787                             | 0.9792                            | 0.9811                            |
+| 14    | 0.9793                             | 0.9811                            | 0.9835                            |
+| 19    | 0.9811                             | 0.9811                            | 0.9835                            |
+| 29    | 0.9820                             | 0.9806                            | 0.9838                            |
+| 39    | 0.9823                             | 0.9809                            | 0.9832                            |
+| 49    | 0.9822                             | 0.9809                            | 0.9835                            |
+| 69    | 0.9826                             | 0.9810                            | 0.9836                            |
+| 99    | 0.9826                             | 0.9811                            | 0.9836                            |
+
+Because the full set of runs is large, only part of the data is shown here. For each learning rate, the highest accuracy reached and the epoch at which it was first reached are listed in the following table.
+
+| learning_rate | epoch | accuracy |
+| ------------- | ----- | -------- |
+| 0.05          | 44    | 0.9827   |
+| 0.1           | 23    | 0.9815   |
+| 0.2           | 16    | 0.9839   |
+
+As the learning rate increases, the epoch at which the maximum accuracy is first reached drops, and the accuracy in the first few epochs is also higher. In addition, no matter which learning rate is used, the accuracy keeps oscillating, and the oscillation already starts before epoch 10.
+
+**Choosing the number of epochs and the learning rate**
+
+Too many epochs waste GPU time and encourage overfitting, while too few may stop training before the best result is reached, so it is worth picking reasonable values for both hyperparameters before the experiments start. The literature suggests that the best learning rate is not a constant but a function of the epoch: start with a relatively large learning rate and halve it every few epochs. There is no equally fixed rule for the number of epochs; the usual practice is to watch the loss and pick the epoch count at which it bottoms out, often around 10. Below is one run with 10 epochs, an initial learning rate of 0.2, and the learning rate halved every 2 epochs.
+
+| epoch | accuracy |
+| ----- | -------- |
+| 0     | 0.9573   |
+| 1     | 0.9694   |
+| 2     | 0.9759   |
+| 3     | 0.9791   |
+| 4     | 0.9789   |
+| 5     | 0.9789   |
+| 6     | 0.9809   |
+| 7     | 0.9812   |
+| 8     | 0.9807   |
+| 9     | 0.9812   |
+
+![research](img/research.png)
+
+Compared with the fixed-learning-rate runs above, the decayed schedule gives a slight but not dramatic improvement.
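+
+The decay schedule used for that run is simple enough to write down. The sketch below shows one way it could be wired into the training loop, reusing the submission's own helpers; the halving interval and initial rate are the values quoted above, and evaluation/plotting are omitted.
+
+```python
+from numpy_fnn import NumpyModel, NumpyLoss
+from numpy_mnist import mini_batch, get_torch_initialization
+from utils import download_mnist, one_hot
+
+train_dataset, _ = download_mnist()
+model = NumpyModel()
+numpy_loss = NumpyLoss()
+model.W1, model.W2, model.W3 = get_torch_initialization()
+
+epoch_number = 10
+initial_learning_rate = 0.2
+halve_every = 2  # halve the learning rate every 2 epochs
+
+for epoch in range(epoch_number):
+    # 0.2 for epochs 0-1, 0.1 for epochs 2-3, 0.05 for epochs 4-5, ...
+    learning_rate = initial_learning_rate * (0.5 ** (epoch // halve_every))
+
+    for x, y in mini_batch(train_dataset):
+        y = one_hot(y)
+        y_pred = model.forward(x)
+        # get_loss also stores the target, which backward() needs
+        loss = numpy_loss.get_loss(y_pred, y)
+        model.backward(numpy_loss.backward())
+        model.optimize(learning_rate)
+```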
+
+### 2. Reimplementing mini_batch
+
+The mini_batch provided in utils.py is only a single line:
+
+```python
+def mini_batch(dataset, batch_size=128, numpy=False):
+    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
+```
+
+Reading the DataLoader class in the torch.utils.data source shows that, given a dataset and a batch_size, it shuffles the dataset and splits it into batches of batch_size samples. Knowing that, the function is easy to reimplement with numpy alone: shuffle an index array with numpy.random.shuffle and slice the reordered data into batches.
+
+```python
+def mini_batch(dataset, batch_size=128):
+
+    data = []
+    label = []
+
+    for each in dataset:
+        data.append(np.array(each[0]))
+        label.append(each[1])
+
+    label = np.array(label)
+    data = np.array(data)
+
+    num = data.shape[0]
+    i = np.arange(num)
+    np.random.shuffle(i)
+
+    label_ = label[i]
+    data_ = data[i]
+
+    res = []
+    for id in range(num // batch_size):
+        batch_data = data_[id * batch_size: (id + 1) * batch_size]
+        batch_label = label_[id * batch_size: (id + 1) * batch_size]
+        res.append((batch_data, batch_label))
+
+    return res
+```
+
+### 3. Basic test with the reimplemented mini_batch
+
+| epoch | accuracy |
+| ----- | -------- |
+| 0     | 0.9476   |
+| 1     | 0.9640   |
+| 2     | 0.9715   |
+
+The results are close to those obtained with the mini_batch from utils.py.
+
+![mini_batch](img/mini_batch.png)
+
+## 3. PyTorch weight initialization
+
+First, look at the source of torch.nn.Linear:
+
+```python
+def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None:
+    super(Linear, self).__init__()
+    self.in_features = in_features
+    self.out_features = out_features
+    self.weight = Parameter(torch.Tensor(out_features, in_features))
+    if bias:
+        self.bias = Parameter(torch.Tensor(out_features))
+    else:
+        self.register_parameter('bias', None)
+    self.reset_parameters()
+
+def reset_parameters(self) -> None:
+    init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+    if self.bias is not None:
+        fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
+        bound = 1 / math.sqrt(fan_in)
+        init.uniform_(self.bias, -bound, bound)
+```
+
+After self.weight and self.bias are defined, reset_parameters is called, and the key step in that function is init.kaiming_uniform_. Its source is:
+
+```python
+def kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
+    fan = _calculate_correct_fan(tensor, mode)
+    gain = calculate_gain(nonlinearity, a)
+    std = gain / math.sqrt(fan)
+    bound = math.sqrt(3.0) * std
+    with torch.no_grad():
+        return tensor.uniform_(-bound, bound)
+```
+
+The source of calculate_gain is:
+
+```python
+def calculate_gain(nonlinearity, param=None):
+    linear_fns = ['linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', 'conv_transpose2d', 'conv_transpose3d']
+    if nonlinearity in linear_fns or nonlinearity == 'sigmoid':
+        return 1
+    elif nonlinearity == 'tanh':
+        return 5.0 / 3
+    elif nonlinearity == 'relu':
+        return math.sqrt(2.0)
+    elif nonlinearity == 'leaky_relu':
+        if param is None:
+            negative_slope = 0.01
+        elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float):
+            negative_slope = param
+        else:
+            raise ValueError("negative_slope {} not a valid number".format(param))
+        return math.sqrt(2.0 / (1 + negative_slope ** 2))
+    elif nonlinearity == 'selu':
+        return 3.0 / 4
+    else:
+        raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))
+```
+
+| nonlinearity | gain                       |
+| ------------ | -------------------------- |
+| ReLU         | sqrt(2)                    |
+| Leaky_ReLU   | sqrt(2 / (1 + param ** 2)) |
+
+Since kaiming_uniform_ defaults to leaky_relu,
+$$
+bound = \sqrt{\frac{2}{1 + param^2}} \times \sqrt{\frac{3}{fan\_in}}
+$$
+and Linear passes param = math.sqrt(5), so the code follows directly:
+
+```python
+def get_torch_initialization():
+
+    def parameters(in_features, out_features, param = 5**0.5):
+        bound = (6 / (1 + param * param) / in_features ) ** 0.5
+        return np.random.uniform(-bound, bound, (in_features, out_features))
+
+    W1 = parameters(28 * 28, 256)
+    W2 = parameters(256, 64)
+    W3 = parameters(64, 10)
+    return W1, W2, W3
+```
+
+This is the uniform form of Kaiming initialization, which is exactly how PyTorch initializes these parameters.
+
+Kaiming initialization also has a normal-distribution form. It differs only slightly from the uniform one: instead of a bound, it computes a standard deviation
+$$
+std = \sqrt{\frac{2}{fan\_in \times (1 + param^2)}}
+$$
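+
+For completeness, a normal-distribution counterpart of the `parameters` helper above could look like the sketch below. This variant is not used in the submission, and the name `parameters_normal` is just illustrative; it only restates the std formula in code.
+
+```python
+import numpy as np
+
+
+def parameters_normal(in_features, out_features, param=5 ** 0.5):
+    # Kaiming normal: std = gain / sqrt(fan_in) with gain = sqrt(2 / (1 + param^2))
+    std = (2 / ((1 + param * param) * in_features)) ** 0.5
+    return np.random.normal(0.0, std, (in_features, out_features))
+
+
+W1 = parameters_normal(28 * 28, 256)
+W2 = parameters_normal(256, 64)
+W3 = parameters_normal(64, 10)
+```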
+
+Running the basic test with the reimplemented get_torch_initialization gives the following results:
+
+| epoch | accuracy |
+| ----- | -------- |
+| 0     | 0.9421   |
+| 1     | 0.9658   |
+| 2     | 0.9742   |
+
+![get_torch_initialization](img/get_torch_initialization.png)
\ No newline at end of file
diff --git a/assignment-2/submission/18307130074/tester_demo.py b/assignment-2/submission/18307130074/tester_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..e63107a79414018a4600e9795bba41ff82826c9a
--- /dev/null
+++ b/assignment-2/submission/18307130074/tester_demo.py
@@ -0,0 +1,183 @@
+import numpy as np
+import torch
+from torch import matmul as torch_matmul, relu as torch_relu, softmax as torch_softmax, log as torch_log
+
+from numpy_fnn import Matmul, Relu, Softmax, Log, NumpyModel, NumpyLoss
+from torch_mnist import TorchModel
+from utils import get_torch_initialization, one_hot
+
+err_epsilon = 1e-6
+err_p = 0.4
+
+import os
+os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
+def check_result(numpy_result, torch_result=None):
+    if isinstance(numpy_result, list) and torch_result is None:
+        flag = True
+        for (n, t) in numpy_result:
+            flag = flag and check_result(n, t)
+        return flag
+    # print((torch.from_numpy(numpy_result) - torch_result).abs().mean().item())
+    T = (torch_result * torch.from_numpy(numpy_result) < 0).sum().item()
+    direction = T / torch_result.numel() < err_p
+    return direction and ((torch.from_numpy(numpy_result) - torch_result).abs().mean() < err_epsilon).item()
+
+
+def case_1():
+    x = np.random.normal(size=[5, 6])
+    W = np.random.normal(size=[6, 4])
+
+    numpy_matmul = Matmul()
+    numpy_out = numpy_matmul.forward(x, W)
+    numpy_x_grad, numpy_W_grad = numpy_matmul.backward(np.ones_like(numpy_out))
+
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+    torch_W = torch.from_numpy(W).clone().requires_grad_()
+
+    torch_out = torch_matmul(torch_x, torch_W)
+    torch_out.sum().backward()
+
+    return check_result([
+        (numpy_out, torch_out),
+        (numpy_x_grad, torch_x.grad),
+        (numpy_W_grad, torch_W.grad)
+    ])
+
+
+def case_2():
+    x = np.random.normal(size=[5, 6])
+
+    numpy_relu = Relu()
+    numpy_out = numpy_relu.forward(x)
+    numpy_x_grad = numpy_relu.backward(np.ones_like(numpy_out))
+
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+    torch_out = torch_relu(torch_x)
+    torch_out.sum().backward()
+
+    return check_result([
+        (numpy_out, torch_out),
+        (numpy_x_grad, torch_x.grad),
+    ])
+
+
+def case_3():
+    x = np.random.uniform(low=0.0, high=1.0, size=[3, 4])
+
+    numpy_log = Log()
+    numpy_out = numpy_log.forward(x)
+    numpy_x_grad = numpy_log.backward(np.ones_like(numpy_out))
+
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+    torch_out = torch_log(torch_x)
+    torch_out.sum().backward()
+
+    return check_result([
+        (numpy_out, torch_out),
+
+        (numpy_x_grad, torch_x.grad),
+    ])
+
+
+def case_4():
+    x = np.random.normal(size=[4, 5])
+
+    numpy_softmax = Softmax()
+    numpy_out = numpy_softmax.forward(x)
+
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+    torch_out = torch_softmax(torch_x, 1)
+
+    return check_result(numpy_out, torch_out)
+
+
+def case_5():
+    x = np.random.normal(size=[20, 25])
+
+    numpy_softmax = Softmax()
+    numpy_out = numpy_softmax.forward(x)
+    numpy_x_grad = 
numpy_softmax.backward(np.ones_like(numpy_out)) + + torch_x = torch.from_numpy(x).clone().requires_grad_() + + torch_out = torch_softmax(torch_x, 1) + torch_out.sum().backward() + + return check_result([ + (numpy_out, torch_out), + (numpy_x_grad, torch_x.grad), + ]) + + +def test_model(): + try: + numpy_loss = NumpyLoss() + numpy_model = NumpyModel() + torch_model = TorchModel() + torch_model.W1.data, torch_model.W2.data, torch_model.W3.data = get_torch_initialization(numpy=False) + numpy_model.W1 = torch_model.W1.detach().clone().numpy() + numpy_model.W2 = torch_model.W2.detach().clone().numpy() + numpy_model.W3 = torch_model.W3.detach().clone().numpy() + + x = torch.randn((10000, 28, 28)) + y = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 0] * 1000) + + y = one_hot(y, numpy=False) + x2 = x.numpy() + y_pred = torch_model.forward(x) + loss = (-y_pred * y).sum(dim=1).mean() + loss.backward() + + y_pred_numpy = numpy_model.forward(x2) + numpy_loss.get_loss(y_pred_numpy, y.numpy()) + + check_flag_1 = check_result(y_pred_numpy, y_pred) + print("+ {:12} {}/{}".format("forward", 10 * check_flag_1, 10)) + except: + print("[Runtime Error in forward]") + print("+ {:12} {}/{}".format("forward", 0, 10)) + return 0 + + try: + + numpy_model.backward(numpy_loss.backward()) + + check_flag_2 = [ + check_result(numpy_model.log_grad, torch_model.log_input.grad), + check_result(numpy_model.softmax_grad, torch_model.softmax_input.grad), + check_result(numpy_model.W3_grad, torch_model.W3.grad), + check_result(numpy_model.W2_grad, torch_model.W2.grad), + check_result(numpy_model.W1_grad, torch_model.W1.grad) + ] + check_flag_2 = sum(check_flag_2) >= 4 + print("+ {:12} {}/{}".format("backward", 20 * check_flag_2, 20)) + except: + print("[Runtime Error in backward]") + print("+ {:12} {}/{}".format("backward", 0, 20)) + check_flag_2 = False + + return 10 * check_flag_1 + 20 * check_flag_2 + + +if __name__ == "__main__": + testcases = [ + ["matmul", case_1, 5], + ["relu", case_2, 5], + ["log", case_3, 5], + ["softmax_1", case_4, 5], + ["softmax_2", case_5, 10], + ] + score = 0 + for case in testcases: + try: + res = case[2] if case[1]() else 0 + except: + print("[Runtime Error in {}]".format(case[0])) + res = 0 + score += res + print("+ {:12} {}/{}".format(case[0], res, case[2])) + score += test_model() + print("{:14} {}/60".format("FINAL SCORE", score)) diff --git a/assignment-2/submission/18307130074/torch_mnist.py b/assignment-2/submission/18307130074/torch_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..6d3e214c7606e3d43dac4b94554f942508afffb3 --- /dev/null +++ b/assignment-2/submission/18307130074/torch_mnist.py @@ -0,0 +1,73 @@ +import torch +from utils import mini_batch, batch, download_mnist, get_torch_initialization, one_hot, plot_curve + + +class TorchModel: + + def __init__(self): + self.W1 = torch.randn((28 * 28, 256), requires_grad=True) + self.W2 = torch.randn((256, 64), requires_grad=True) + self.W3 = torch.randn((64, 10), requires_grad=True) + self.softmax_input = None + self.log_input = None + + def forward(self, x): + x = x.reshape(-1, 28 * 28) + x = torch.relu(torch.matmul(x, self.W1)) + x = torch.relu(torch.matmul(x, self.W2)) + x = torch.matmul(x, self.W3) + + self.softmax_input = x + self.softmax_input.retain_grad() + + x = torch.softmax(x, 1) + + self.log_input = x + self.log_input.retain_grad() + + x = torch.log(x) + + return x + + def optimize(self, learning_rate): + with torch.no_grad(): + self.W1 -= learning_rate * self.W1.grad + self.W2 -= learning_rate * 
self.W2.grad + self.W3 -= learning_rate * self.W3.grad + + self.W1.grad = None + self.W2.grad = None + self.W3.grad = None + + +def torch_run(): + train_dataset, test_dataset = download_mnist() + + model = TorchModel() + model.W1.data, model.W2.data, model.W3.data = get_torch_initialization(numpy=False) + + train_loss = [] + + epoch_number = 3 + learning_rate = 0.1 + + for epoch in range(epoch_number): + for x, y in mini_batch(train_dataset, numpy=False): + y = one_hot(y, numpy=False) + + y_pred = model.forward(x) + loss = (-y_pred * y).sum(dim=1).mean() + loss.backward() + model.optimize(learning_rate) + + train_loss.append(loss.item()) + + x, y = batch(test_dataset, numpy=False)[0] + accuracy = model.forward(x).argmax(dim=1).eq(y).float().mean().item() + print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy)) + + plot_curve(train_loss) + + +if __name__ == "__main__": + torch_run() diff --git a/assignment-2/submission/18307130074/utils.py b/assignment-2/submission/18307130074/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..709220cfa7a924d914ec1c098c505f864bcd4cfc --- /dev/null +++ b/assignment-2/submission/18307130074/utils.py @@ -0,0 +1,71 @@ +import torch +import numpy as np +from matplotlib import pyplot as plt + + +def plot_curve(data): + plt.plot(range(len(data)), data, color='blue') + plt.legend(['loss_value'], loc='upper right') + plt.xlabel('step') + plt.ylabel('value') + plt.show() + + +def download_mnist(): + from torchvision import datasets, transforms + + transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(mean=(0.1307,), std=(0.3081,)) + ]) + + train_dataset = datasets.MNIST(root="./data/", transform=transform, train=True, download=True) + test_dataset = datasets.MNIST(root="./data/", transform=transform, train=False, download=True) + + return train_dataset, test_dataset + + +def one_hot(y, numpy=True): + if numpy: + y_ = np.zeros((y.shape[0], 10)) + y_[np.arange(y.shape[0], dtype=np.int32), y] = 1 + return y_ + else: + y_ = torch.zeros((y.shape[0], 10)) + y_[torch.arange(y.shape[0], dtype=torch.long), y] = 1 + return y_ + + +def batch(dataset, numpy=True): + data = [] + label = [] + for each in dataset: + data.append(each[0]) + label.append(each[1]) + data = torch.stack(data) + label = torch.LongTensor(label) + if numpy: + return [(data.numpy(), label.numpy())] + else: + return [(data, label)] + + +def mini_batch(dataset, batch_size=128, numpy=False): + return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True) + + +def get_torch_initialization(numpy=True): + fc1 = torch.nn.Linear(28 * 28, 256) + fc2 = torch.nn.Linear(256, 64) + fc3 = torch.nn.Linear(64, 10) + + if numpy: + W1 = fc1.weight.T.detach().clone().numpy() + W2 = fc2.weight.T.detach().clone().numpy() + W3 = fc3.weight.T.detach().clone().numpy() + else: + W1 = fc1.weight.T.detach().clone().data + W2 = fc2.weight.T.detach().clone().data + W3 = fc3.weight.T.detach().clone().data + + return W1, W2, W3