diff --git a/assignment-2/submission/16307130040/README.md b/assignment-2/submission/16307130040/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..75a75c96dba24c1fe572704318d3646659659437
--- /dev/null
+++ b/assignment-2/submission/16307130040/README.md
@@ -0,0 +1,148 @@
+# Lab Report 2
+
+### 1. Experimental Results
+
+After replacing the mini_batch function, the model trains without issue:
+
+
+
+```shell
+[0] Accuracy: 0.9453
+[1] Accuracy: 0.9656
+[2] Accuracy: 0.9689
+```
+
+### 2. Replacing mini_batch
+
+```python
+def mini_batch(dataset, batch_size=128, numpy=True):
+    # Collect the whole dataset into numpy arrays, as utils.batch does
+    data = []
+    label = []
+    for each in dataset:
+        data.append(np.array(each[0]))
+        label.append(each[1])
+    data = np.array(data)
+    label = np.array(label)
+
+    # Shuffle samples and labels with the same random permutation
+    m = data.shape[0]
+    permutation = np.random.permutation(m)
+    data = data[permutation]
+    label = label[permutation]
+
+    # Slice the shuffled arrays into batches (the incomplete tail batch is dropped)
+    n = m // batch_size
+    mini_batches = []
+    for i in range(n):
+        mini_batches.append([data[i * batch_size:(i + 1) * batch_size],
+                             label[i * batch_size:(i + 1) * batch_size]])
+
+    return mini_batches
+```
+
+Overall this follows the batch function in utils.py. The first half is the same as batch: the samples and labels in dataset are collected into data and label. The elements of data and label are then shuffled with the same random permutation, each mini-batch of data and labels is packed into a list and appended to one big list, and that list is returned. A minimal usage example is sketched below.
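+
+The sketch below shows how this replacement is consumed by the training loop (mirroring `numpy_run` in `numpy_mnist.py` of this submission); the shapes in the comments are what the torchvision MNIST dataset would produce and are illustrative only:
+
+```python
+train_dataset, _ = download_mnist()          # from utils
+batches = mini_batch(train_dataset, batch_size=128)
+x, y = batches[0]
+print(len(batches), x.shape, y.shape)        # e.g. 468 batches, (128, 1, 28, 28), (128,)
+
+for x, y in mini_batch(train_dataset):       # same iteration pattern as numpy_run
+    pass
+```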
+
+### 3. Derivation of the Backpropagation Formulas
+
+1. matmul
+
+Let the output y be an l-dimensional vector.
+
+**dx:** for each element x_i of x, its partial derivatives with respect to y_1, y_2, ..., y_l are w_i1, w_i2, ..., w_il.
+
+dL/dx_i = (dL/dy_1)·w_i1 + (dL/dy_2)·w_i2 + ... + (dL/dy_l)·w_il
+
+Therefore dx = dy · W^T.
+
+**dW:** for each element w_ij of W, its partial derivative with respect to y_j is x_i.
+
+dL/dw_ij = (dL/dy_j)·x_i
+
+Therefore dW = x^T · dy. With multiple samples, the per-sample contributions are averaged over the batch.
+
+```python
+def backward(self, grad_y):
+    """
+    grad_y: shape(N, d')
+    """
+    # dL/dx = dL/dy @ W^T,  dL/dW = x^T @ dL/dy
+    grad_x = np.matmul(grad_y, self.memory['W'].T)
+    grad_W = np.matmul(self.memory['x'].T, grad_y)
+
+    return grad_x, grad_W
+```
+
+
+
+2. Relu
+
+```python
+def backward(self, grad_y):
+    """
+    grad_y: same shape as x
+    """
+    # Pass the gradient through where the input was positive, zero elsewhere
+    x = self.memory['x']
+    grad_x = grad_y.copy()
+    grad_x[x <= 0] = 0
+    return grad_x
+```
+
+When x_i <= 0, dy_i/dx_i = 0, so dL/dx_i = 0.
+
+When x_i > 0, dy_i/dx_i = 1, so dL/dx_i = dL/dy_i.
+
+So, as shown above, where x_i is greater than 0 the corresponding position of dx simply copies dy; where x_i is less than or equal to 0 the corresponding position of dx is set to 0.
+
+3. Log
+
+```python
+def backward(self, grad_y):
+    """
+    grad_y: same shape as x
+    """
+    # Clamp the saved input to avoid dividing by (almost) zero,
+    # without modifying the cached tensor in place
+    x = np.maximum(self.memory['x'], self.epsilon)
+    grad_x = grad_y * (1 / x)
+
+    return grad_x
+```
+
+dy_i/dx_i = 1/x_i, so dL/dx_i = (dL/dy_i)·(1/x_i)
+
+Therefore dx = dy·(1/x), element-wise.
+
+4. Softmax
+
+
+
+Let x and y both be l-dimensional vectors.
+
+For dy_j/dx_i: if i = j, then dy_j/dx_i = y_j - (y_j)^2; if i != j, then dy_j/dx_i = -y_i·y_j.
+
+Let D = diag(y) - y^T·y (the outer product of y with itself); then dy_j/dx_i = D_ij.
+
+In that case, dL/dx_i = (dL/dy_1)·D_i1 + (dL/dy_2)·D_i2 + ... + (dL/dy_l)·D_il
+
+Therefore dx = dy · D.
+
+```python
+def backward(self, grad_y):
+    """
+    grad_y: same shape as x
+    """
+    y = self.memory['y']
+    grad_x = []
+    # Build the Jacobian D = diag(y) - y y^T for each sample and apply it
+    for grad_y1, y1 in zip(grad_y, y):
+        D = np.diag(y1) - np.outer(y1, y1)
+        grad_x1 = np.dot(grad_y1, D)
+        grad_x.append(grad_x1)
+    grad_x = np.array(grad_x)
+    return grad_x
+```
+
+Note that in the actual implementation a batch contains more than one sample, so dx has to be produced sample by sample; a vectorized alternative is sketched below.
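+
+The per-sample loop can optionally be collapsed into pure array operations. The sketch below is an equivalent vectorized formulation (not part of the submitted code) and checks itself against the loop-based version:
+
+```python
+import numpy as np
+
+
+def softmax_backward_vectorized(grad_y, y):
+    # grad_x[n, j] = sum_i grad_y[n, i] * (diag(y[n]) - outer(y[n], y[n]))[i, j]
+    #              = y[n, j] * (grad_y[n, j] - sum_i grad_y[n, i] * y[n, i])
+    return y * (grad_y - np.sum(grad_y * y, axis=1, keepdims=True))
+
+
+rng = np.random.default_rng(0)
+x = rng.normal(size=(4, 10))
+y = np.exp(x) / np.exp(x).sum(axis=1, keepdims=True)
+grad_y = rng.normal(size=(4, 10))
+
+loop = np.array([np.dot(g, np.diag(r) - np.outer(r, r)) for g, r in zip(grad_y, y)])
+assert np.allclose(loop, softmax_backward_vectorized(grad_y, y))
+```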
+
diff --git a/assignment-2/submission/16307130040/img/Figure_1.png b/assignment-2/submission/16307130040/img/Figure_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..dab6049f889917dcbf2e93d6203b3a6579908777
Binary files /dev/null and b/assignment-2/submission/16307130040/img/Figure_1.png differ
diff --git a/assignment-2/submission/16307130040/img/matmul.jpg b/assignment-2/submission/16307130040/img/matmul.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..dd071796bbe85141e48275be4c38358eefa4112f
Binary files /dev/null and b/assignment-2/submission/16307130040/img/matmul.jpg differ
diff --git a/assignment-2/submission/16307130040/img/softmax.jpg b/assignment-2/submission/16307130040/img/softmax.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2daa913899b5f8401693ffb777ff3e27ed24cf09
Binary files /dev/null and b/assignment-2/submission/16307130040/img/softmax.jpg differ
diff --git a/assignment-2/submission/16307130040/numpy_fnn.py b/assignment-2/submission/16307130040/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..277f81a3f11fb44523777ef4bddcb998454bcdc3
--- /dev/null
+++ b/assignment-2/submission/16307130040/numpy_fnn.py
@@ -0,0 +1,184 @@
+import numpy as np
+
+
+class NumpyOp:
+
+ def __init__(self):
+ self.memory = {}
+ self.epsilon = 1e-12
+
+
+class Matmul(NumpyOp):
+
+ def forward(self, x, W):
+ """
+ x: shape(N, d)
+ w: shape(d, d')
+ """
+ self.memory['x'] = x
+ self.memory['W'] = W
+ h = np.matmul(x, W)
+ return h
+
+ def backward(self, grad_y):
+ """
+ grad_y: shape(N, d')
+ """
+
+        grad_x = np.matmul(grad_y, self.memory['W'].T)
+        grad_W = np.matmul(self.memory['x'].T, grad_y)
+
+        return grad_x, grad_W
+
+
+class Relu(NumpyOp):
+
+ def forward(self, x):
+ self.memory['x'] = x
+ return np.where(x > 0, x, np.zeros_like(x))
+
+ def backward(self, grad_y):
+ """
+ grad_y: same shape as x
+ """
+
+        x = self.memory['x']
+        grad_x = grad_y.copy()
+        grad_x[x <= 0] = 0
+
+ return grad_x
+
+
+class Log(NumpyOp):
+
+ def forward(self, x):
+ """
+ x: shape(N, c)
+ """
+
+ out = np.log(x + self.epsilon)
+ self.memory['x'] = x
+
+ return out
+
+ def backward(self, grad_y):
+ """
+ grad_y: same shape as x
+ """
+
+        # clamp without modifying the cached input in place
+        x = np.maximum(self.memory['x'], self.epsilon)
+        grad_x = grad_y * (1 / x)
+
+ return grad_x
+
+
+class Softmax(NumpyOp):
+ """
+ softmax over last dimension
+ """
+
+ def forward(self, x):
+ """
+ x: shape(N, c)
+ """
+
+ shift_x = x - np.max(x, axis=1).reshape(-1, 1)
+ y = np.exp(shift_x) / np.sum(np.exp(shift_x), axis=1).reshape(-1, 1)
+ self.memory['y'] = y
+
+ return y
+
+ def backward(self, grad_y):
+ """
+ grad_y: same shape as x
+ """
+
+        y = self.memory['y']
+        grad_x = []
+        # per-sample Jacobian: D = diag(y) - y y^T
+        for grad_y1, y1 in zip(grad_y, y):
+            D = np.diag(y1) - np.outer(y1, y1)
+            grad_x1 = np.dot(grad_y1, D)
+            grad_x.append(grad_x1)
+        grad_x = np.array(grad_x)
+
+ return grad_x
+
+
+class NumpyLoss:
+
+ def __init__(self):
+ self.target = None
+
+ def get_loss(self, pred, target):
+ self.target = target
+ return (-pred * target).sum(axis=1).mean()
+
+ def backward(self):
+ return -self.target / self.target.shape[0]
+
+
+class NumpyModel:
+ def __init__(self):
+ self.W1 = np.random.normal(size=(28 * 28, 256))
+ self.W2 = np.random.normal(size=(256, 64))
+ self.W3 = np.random.normal(size=(64, 10))
+
+        # The following operators are used in forward and backward
+ self.matmul_1 = Matmul()
+ self.relu_1 = Relu()
+ self.matmul_2 = Matmul()
+ self.relu_2 = Relu()
+ self.matmul_3 = Matmul()
+ self.softmax = Softmax()
+ self.log = Log()
+
+        # The following variables are updated in backward. softmax_grad, log_grad, etc. hold each operator's backward gradient (the partial derivative of the loss w.r.t. that operator's input)
+ self.x1_grad, self.W1_grad = None, None
+ self.relu_1_grad = None
+ self.x2_grad, self.W2_grad = None, None
+ self.relu_2_grad = None
+ self.x3_grad, self.W3_grad = None, None
+ self.softmax_grad = None
+ self.log_grad = None
+
+ def forward(self, x):
+ x = x.reshape(-1, 28 * 28)
+
+        x = self.matmul_1.forward(x, self.W1)
+        x = self.relu_1.forward(x)
+        x = self.matmul_2.forward(x, self.W2)
+        x = self.relu_2.forward(x)
+        x = self.matmul_3.forward(x, self.W3)
+
+ x = self.softmax.forward(x)
+ x = self.log.forward(x)
+
+ return x
+
+ def backward(self, y):
+
+ y = self.log.backward(y)
+ self.log_grad = y
+ y = self.softmax.backward(y)
+ self.softmax_grad = y
+ y, self.W3_grad = self.matmul_3.backward(y)
+ self.x3_grad = y
+ y = self.relu_2.backward(y)
+ y, self.W2_grad = self.matmul_2.backward(y)
+ self.x2_grad = y
+ y = self.relu_1.backward(y)
+ y, self.W1_grad = self.matmul_1.backward(y)
+ self.x1_grad = y
+
+
+ def optimize(self, learning_rate):
+ self.W1 -= learning_rate * self.W1_grad
+ self.W2 -= learning_rate * self.W2_grad
+ self.W3 -= learning_rate * self.W3_grad
diff --git a/assignment-2/submission/16307130040/numpy_mnist.py b/assignment-2/submission/16307130040/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..a688f7c64114bf150ffff2b903dfc74688bda4ad
--- /dev/null
+++ b/assignment-2/submission/16307130040/numpy_mnist.py
@@ -0,0 +1,59 @@
+import numpy as np
+from numpy_fnn import NumpyModel, NumpyLoss
+from utils import download_mnist, batch, get_torch_initialization, plot_curve, one_hot
+
+def mini_batch(dataset, batch_size=128, numpy=True):
+    # Collect the dataset into numpy arrays, shuffle, then slice into batches
+    data = []
+    label = []
+    for each in dataset:
+        data.append(np.array(each[0]))
+        label.append(each[1])
+    data = np.array(data)
+    label = np.array(label)
+
+    m = data.shape[0]
+    permutation = np.random.permutation(m)
+    data = data[permutation]
+    label = label[permutation]
+
+    # Note: the incomplete tail batch (fewer than batch_size samples) is dropped
+    n = m // batch_size
+    mini_batches = []
+    for i in range(n):
+        mini_batches.append([data[i * batch_size:(i + 1) * batch_size],
+                             label[i * batch_size:(i + 1) * batch_size]])
+
+    return mini_batches
+
+def numpy_run():
+ train_dataset, test_dataset = download_mnist()
+
+ model = NumpyModel()
+ numpy_loss = NumpyLoss()
+ model.W1, model.W2, model.W3 = get_torch_initialization()
+
+ train_loss = []
+
+ epoch_number = 3
+ learning_rate = 0.1
+
+ for epoch in range(epoch_number):
+ for x, y in mini_batch(train_dataset):
+ y = one_hot(y)
+
+ y_pred = model.forward(x)
+ loss = numpy_loss.get_loss(y_pred, y)
+
+ model.backward(numpy_loss.backward())
+ model.optimize(learning_rate)
+
+ train_loss.append(loss.item())
+
+ x, y = batch(test_dataset)[0]
+ accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
+ print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+
+ plot_curve(train_loss)
+
+
+
+if __name__ == "__main__":
+ numpy_run()
diff --git a/assignment-2/submission/16307130040/torch_mnist.py b/assignment-2/submission/16307130040/torch_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a5649bbfa750b3520b4b895de7260c3aa8ea7cd
--- /dev/null
+++ b/assignment-2/submission/16307130040/torch_mnist.py
@@ -0,0 +1,64 @@
+import torch
+from utils import mini_batch, batch, download_mnist, get_torch_initialization, one_hot, plot_curve
+
+
+class TorchModel:
+
+ def __init__(self):
+ self.W1 = torch.randn((28 * 28, 256), requires_grad=True)
+ self.W2 = torch.randn((256, 64), requires_grad=True)
+ self.W3 = torch.randn((64, 10), requires_grad=True)
+
+ def forward(self, x):
+ x = x.reshape(-1, 28 * 28)
+ x = torch.relu(torch.matmul(x, self.W1))
+ x = torch.relu(torch.matmul(x, self.W2))
+ x = torch.matmul(x, self.W3)
+ self.softmax = torch.softmax(x, 1)
+ self.log = torch.log(self.softmax)
+ self.softmax.retain_grad() # for test only
+ self.log.retain_grad() # for test only
+ return self.log
+
+ def optimize(self, learning_rate):
+ with torch.no_grad():
+ self.W1 -= learning_rate * self.W1.grad
+ self.W2 -= learning_rate * self.W2.grad
+ self.W3 -= learning_rate * self.W3.grad
+
+ self.W1.grad = None
+ self.W2.grad = None
+ self.W3.grad = None
+
+
+def torch_run():
+ train_dataset, test_dataset = download_mnist()
+
+ model = TorchModel()
+ model.W1.data, model.W2.data, model.W3.data = get_torch_initialization(numpy=False)
+
+ train_loss = []
+
+ epoch_number = 3
+ learning_rate = 0.1
+
+ for epoch in range(epoch_number):
+ for x, y in mini_batch(train_dataset, numpy=False):
+ y = one_hot(y, numpy=False)
+
+ y_pred = model.forward(x)
+ loss = (-y_pred * y).sum(dim=1).mean()
+ loss.backward()
+ model.optimize(learning_rate)
+
+ train_loss.append(loss.item())
+
+ x, y = batch(test_dataset, numpy=False)[0]
+ accuracy = model.forward(x).argmax(dim=1).eq(y).float().mean().item()
+ print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+
+ plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+ torch_run()
diff --git a/assignment-2/submission/17307130331/README.md b/assignment-2/submission/17307130331/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..abd8de5834bacc838e1b813905da469a8d9168c3
--- /dev/null
+++ b/assignment-2/submission/17307130331/README.md
@@ -0,0 +1,343 @@
+# Lab Report
+
+陈疏桐 17307130331
+
+In this assignment I implemented the forward and backward computation of the four operators Matmul, Log, Softmax, and Relu in numpy, built the classification model from these four operators, passed the automatic tests, and implemented the mini_batch function. I then trained and tested on the MNIST dataset with different learning rates and batch sizes and discuss how the learning rate and batch size affect training. Finally, I also implemented the Momentum, RMSProp, and Adam optimizers and compared them with plain gradient descent.
+
+## Backward Passes of the Operators and Their Implementation
+### Matmul
+
+Matmul is matrix multiplication; in the model it plays the role of a linear layer in pytorch. Its forward formula is:
+
+$$ \mathrm{Y} = \mathrm{X}\mathrm{W} $$
+
+where $\mathrm{X}$ is the input matrix of shape $N \times d$, $\mathrm{W}$ is a matrix of shape $d \times d'$, and $\mathrm{Y}$ is the output matrix of shape $N\times d'$. The Matmul operator is equivalent to a fully connected linear layer with input dimension $d$ and output dimension $d'$.
+
+Taking the partial derivative of Matmul with respect to each of its inputs gives
+
+$$ \frac{\partial \mathrm{Y}}{\partial \mathrm{X}} = \frac{\partial \mathrm{X}\mathrm{W}}{\partial \mathrm{X}} = \mathrm{W}^T$$
+
+$$ \frac{\partial \mathrm{Y}}{\partial \mathrm{W}} = \frac{\partial \mathrm{X}\mathrm{W}}{\partial \mathrm{W}} = \mathrm{X}^T $$
+
+Then, by the chain rule, the backward formulas are:
+
+$$ \triangledown{\mathrm{X}} = \triangledown{\mathrm{Y}} \times \mathrm{W}^T $$
+$$ \triangledown{\mathrm{W}} = \mathrm{X}^T \times \triangledown{\mathrm{Y}} $$
+
+### Relu
+
+The Relu formula for each element of the input is:
+
+$$ \mathrm{Y}_{ij}=
+\begin{cases}
+\mathrm{X}_{ij} & \mathrm{X}_{ij} \ge 0 \\\\
+0 & \text{otherwise}
+\end{cases}
+$$
+
+
+Each output $\mathrm{Y}_{ij}$ depends only on the input $\mathrm{X}_{ij}$. So the derivative of each element of $\mathrm{X}$ also depends only on the corresponding output:
+
+$$ \frac{\partial \mathrm{Y}_{ij}}{\partial \mathrm{X}_{ij}} =
+\begin{cases}
+1 & \mathrm{X}_{ij} \ge 0 \\\\
+0 & \text{otherwise}
+\end{cases}$$
+
+Therefore, by the chain rule, the gradient of the input is:
+
+$$ \triangledown{\mathrm{X}_{ij}} = \triangledown{\mathrm{Y}_{ij}} \times \frac{\partial \mathrm{Y}_{ij}}{\partial \mathrm{X}_{ij}}$$
+
+### Log
+
+The Log formula is:
+
+$$ \mathrm{Y}_{ij} = \log(\mathrm{X}_{ij} + \epsilon)$$
+
+$$ \frac{\partial \mathrm{Y}_{ij}}{\partial \mathrm{X}_{ij}} = \frac{1}{(\mathrm{X}_{ij} + \epsilon)} $$
+
+Similarly, the backward formula is:
+
+$$ \triangledown{\mathrm{X}_{ij}} = \triangledown{\mathrm{Y}_{ij}} \times \frac{\partial \mathrm{Y}_{ij}}{\partial \mathrm{X}_{ij}}$$
+
+### Softmax
+
+Softmax operates over the last dimension of the input $\mathrm{X}$. The forward formula is:
+
+$$ \mathrm{Y}_{ij} = \frac{\exp^{\mathrm{X}_{ij}}}{\sum_{k} \exp ^ {\mathrm{X}_{ik}}}$$
+
+From the formula, each row of the Softmax output is computed independently of the inputs in other rows, while within a row every output depends on every input element. Taking row $k$ as an example, the derivative of an output element with respect to an input element is:
+
+$$\frac{\partial Y_{ki}}{\partial X_{kj}} = \begin{cases}
+\frac{\exp ^ {X_{kj}} \times (\sum_{t \ne j}{\exp ^ {X_{kt}}}) }{(\sum_{t}{\exp ^ {X_{kt}}})^2} = Y_{kj}(1-Y_{kj}) & i = j \\\\
+-\frac{\exp^{X_{ki} }\exp^{X_{kj} }}{(\sum_t\exp^{X_{kt}})^2}=-Y_{ki} \times Y_{kj} & i\ne j
+\end{cases}$$
+
+This gives, for each output row $\mathrm{Y}_{k}$ and input row $\mathrm{X}_{k}$, a Jacobian matrix $\mathrm{J}_{k}$ with $\mathrm{J_{k}}_{ij} = \frac{\partial \mathrm{Y}_{ki}}{\partial \mathrm{X}_{kj}}$.
+
+The gradient with respect to a single input $\mathrm{X}_{kj}$ receives a contribution from every element of the corresponding output row, i.e. all of the terms $\frac{\partial \mathrm{Y}_{ki}}{\partial \mathrm{X}_{kj}}$ are involved.
+
+Therefore, by the chain rule, the backward formula is:
+$$ \triangledown \mathrm{X}_{kj} = \sum_{i} {\frac{\partial \mathrm{Y}_{ki}}{\partial \mathrm{X}_{kj}} \times \triangledown \mathrm{Y}_{ki}}$$
+
+which is equivalent to:
+
+$$ \triangledown \mathrm{X}_{k} = \triangledown \mathrm{Y}_{k} \times \mathrm{J}_{k} $$
+
+In the implementation, `numpy`'s `matmul` can multiply over the last two dimensions, and the resulting per-row products are stacked to give the final result; a sketch is given below.
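+
+A minimal sketch of this batched formulation (it mirrors the `Softmax.backward` added in this submission's `numpy_fnn.py`; names are the same as there):
+
+```
+def backward(self, grad_y):
+    o = self.memory['out']                                      # softmax outputs, shape (N, c)
+    Jacob = np.array([np.diag(r) - np.outer(r, r) for r in o])  # shape (N, c, c)
+    grad_y = grad_y[:, np.newaxis, :]                           # shape (N, 1, c)
+    return np.matmul(grad_y, Jacob).squeeze(1)                  # batched matmul over the last two dims
+```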
+
+
+## Model Construction and Training
+### Model Construction
+
+Following `torch_model` in `torch_mnist.py`, building the `numpy` model only requires swapping in the operators we implemented:
+```
+def forward(self, x):
+ x = x.reshape(-1, 28 * 28)
+
+ x = self.relu_1.forward(self.matmul_1.forward(x, self.W1))
+ x = self.relu_2.forward(self.matmul_2.forward(x, self.W2))
+
+ x = self.matmul_3.forward(x, self.W3)
+
+ x = self.softmax.forward(x)
+ x = self.log.forward(x)
+
+ return x
+```
+
+The computation graph of the model is:
+
+
+From the computation graph, the chain rule can be applied to derive how to compute the gradients of each leaf variable ($\mathrm{W}_{1}, \mathrm{W}_{2}, \mathrm{W}_{3}, \mathrm{X}$) and of the intermediate variables.
+
+The backward computation graph is:
+
+
+The gradients can then be computed according to this graph:
+```
+def backward(self, y):
+ self.log_grad = self.log.backward(y)
+ self.softmax_grad = self.softmax.backward(self.log_grad)
+ self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+ self.relu_2_grad = self.relu_2.backward(self.x3_grad)
+ self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+ self.relu_1_grad = self.relu_1.backward(self.x2_grad)
+ self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+```
+
+### MiniBatch
+
+The `mini_batch` helper in `utils` directly calls `pytorch`'s `DataLoader`. `DataLoader` is responsible for reading samples from a dataset and combining them into batches. Simply using `DataLoader` makes it easy to prefetch data in parallel with multiple workers, speeding up training while saving code. `DataLoader` also accepts a custom `Sampler` to draw samples from the dataset in different ways, and a `BatchSampler` to combine the drawn samples into batches in a custom way; this makes it possible to zero-pad data within a batch, control the ratio of positive and negative samples per batch, and so on.
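+
+For reference, the `utils.py` version (a copy is included in this submission) is just a thin wrapper:
+
+```
+def mini_batch(dataset, batch_size=128, numpy=False):
+    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
+```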
+
+Here, we mimic the default behaviour of `DataLoader` to implement the `mini_batch` method:
+```
+def mini_batch(dataset, batch_size=128):
+    data = np.array([each[0].numpy() for each in dataset])  # convert the raw samples to numpy arrays first
+ label = np.array([each[1] for each in dataset])
+
+ data_size = data.shape[0]
+ idx = np.array([i for i in range(data_size)])
+    np.random.shuffle(idx)  # shuffle the order
+
+    return [(data[idx[i: i+batch_size]], label[idx[i:i+batch_size]]) for i in range(0, data_size, batch_size)]  # plays the role of DataLoader's BatchSampler, but evaluated all at once
+```
+
+### Model Training
+
+After building the model and setting `epoch=10`, `learning_rate=0.1`, `batch_size=128`, training begins. Each step fits one batch of data: the forward pass computes the output, the loss is computed from the output, `loss.backward` is called to obtain the derivative of the loss with respect to the model output, then the model's `backward` performs the backward computation, and finally the model's `optimize` updates the parameters. A condensed sketch of one such step is given below.
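+
+A condensed sketch of this training step (hyper-parameters are the ones listed above; note that the submitted `numpy_mnist.py` replaces `model.optimize` with the Adam optimizer described later):
+
+```
+model = NumpyModel()
+numpy_loss = NumpyLoss()
+model.W1, model.W2, model.W3 = get_torch_initialization()
+
+for epoch in range(10):
+    for x, y in mini_batch(train_dataset, batch_size=128):
+        y = one_hot(y)
+        y_pred = model.forward(x)              # forward pass
+        loss = numpy_loss.get_loss(y_pred, y)  # scalar loss
+        model.backward(numpy_loss.backward())  # gradient of the loss w.r.t. the output, then backprop
+        model.optimize(0.1)                    # plain gradient-descent update
+```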
+
+Training curve:
+
+
+The test accuracy at each epoch is:
+```
+[0] Test Accuracy: 0.9437
+[1] Test Accuracy: 0.9651
+[2] Test Accuracy: 0.9684
+[3] Test Accuracy: 0.9730
+[4] Test Accuracy: 0.9755
+[5] Test Accuracy: 0.9775
+[6] Test Accuracy: 0.9778
+[7] Test Accuracy: 0.9766
+[8] Test Accuracy: 0.9768
+[9] Test Accuracy: 0.9781
+```
+
+Adjusting `learning_rate` to 0.2 and retraining:
+
+
+The test accuracy at each epoch is:
+```
+[0] Test Accuracy: 0.9621
+[1] Test Accuracy: 0.9703
+[2] Test Accuracy: 0.9753
+[3] Test Accuracy: 0.9740
+[4] Test Accuracy: 0.9787
+[5] Test Accuracy: 0.9756
+[6] Test Accuracy: 0.9807
+[7] Test Accuracy: 0.9795
+[8] Test Accuracy: 0.9814
+[9] Test Accuracy: 0.9825
+```
+
+After slightly raising the learning rate, the parameter updates are larger early in training, the loss drops faster, and the model converges earlier. For the same number of iterations, this model reaches a higher test accuracy.
+
+Raising `learning_rate` to 0.3 and retraining:
+
+
+```
+[0] Test Accuracy: 0.9554
+[1] Test Accuracy: 0.9715
+[2] Test Accuracy: 0.9744
+[3] Test Accuracy: 0.9756
+[4] Test Accuracy: 0.9782
+[5] Test Accuracy: 0.9795
+[6] Test Accuracy: 0.9801
+[7] Test Accuracy: 0.9816
+[8] Test Accuracy: 0.9828
+[9] Test Accuracy: 0.9778
+```
+
+After increasing the learning rate to 0.3, the loss drops at roughly the same speed as in the previous run early on; but later in training the overly large learning rate makes the weights move around the local minimum in steps too large to settle into it, so the loss oscillates and struggles to converge. In this run the test accuracy first rises to 0.9828 and then falls back.
+
+Therefore, for a batch size of 128, 0.2 can be considered a fairly suitable learning rate.
+
+Next, keeping the learning rate at 0.2 and changing batch_size to 256, we retrain:
+
+```
+[0] Test Accuracy: 0.9453
+[1] Test Accuracy: 0.9621
+[2] Test Accuracy: 0.9657
+[3] Test Accuracy: 0.9629
+[4] Test Accuracy: 0.9733
+[5] Test Accuracy: 0.9766
+[6] Test Accuracy: 0.9721
+[7] Test Accuracy: 0.9768
+[8] Test Accuracy: 0.9724
+[9] Test Accuracy: 0.9775
+```
+
+With a larger batch_size the parameters are updated once per batch, so updates happen less often and convergence slows somewhat; but comparing the loss curve of this run with the previous ones, the oscillation amplitude is smaller.
+
+Reducing batch_size to 64 and rerunning the experiment:
+
+```
+[0] Test Accuracy: 0.9526
+[1] Test Accuracy: 0.9674
+[2] Test Accuracy: 0.9719
+[3] Test Accuracy: 0.9759
+[4] Test Accuracy: 0.9750
+[5] Test Accuracy: 0.9748
+[6] Test Accuracy: 0.9772
+[7] Test Accuracy: 0.9791
+[8] Test Accuracy: 0.9820
+[9] Test Accuracy: 0.9823
+```
+
+The loss drops faster, but its oscillation amplitude becomes larger.
+
+Summary: within a certain range, increasing the learning rate speeds up convergence; shrinking batch_size also speeds up convergence somewhat, but increases the oscillation. A learning rate that is too large makes the loss oscillate late in training and fail to converge; one that is too small makes the loss drop too slowly and may even leave the model stuck in a local minimum, missing a better optimum.
+
+## Other Optimization Methods
+
+### Momentum
+
+In plain gradient descent each parameter update depends only on the gradient of the current batch, which may let a few unusual inputs dominate the update direction. Momentum introduces a velocity term, so that the current update depends not only on the current gradient but also on previous gradients, preserving the recent trend for a while. The momentum update is:
+
+$$
+\begin{align}
+& v = \alpha v - \gamma \frac{\partial L}{\partial W} \\\\
+& W = W + v
+\end{align}
+$$
+
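+This rule is implemented as the `momentum()` method added to `NumpyModel` in `numpy_fnn.py`; condensed to one weight matrix (`v1` is the velocity buffer, initialized to zeros), it reads:
+
+```
+def momentum(self, learning_rate, alpha=0.9):
+    # v <- alpha * v - lr * grad ;  W <- W + v
+    self.v1 = self.v1 * alpha - learning_rate * self.W1_grad
+    self.W1 += self.v1
+    # ... W2 and W3 follow the same pattern
+```
+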
+With the learning rate set to 0.02 and batch_size to 128, we continue the experiments:
+
+```
+[0] Test Accuracy: 0.9586
+[1] Test Accuracy: 0.9717
+[2] Test Accuracy: 0.9743
+[3] Test Accuracy: 0.9769
+[4] Test Accuracy: 0.9778
+[5] Test Accuracy: 0.9786
+[6] Test Accuracy: 0.9782
+[7] Test Accuracy: 0.9809
+[8] Test Accuracy: 0.9790
+[9] Test Accuracy: 0.9818
+```
+
+Compared with plain gradient descent, momentum does not necessarily end up with a better result. With momentum, when the current gradient agrees with the accumulated velocity the parameters get a larger update, so the loss drops faster; early in training the velocity mostly keeps accumulating, and with too large a learning rate it can easily overflow. Momentum therefore suits a learning rate about an order of magnitude smaller than plain gradient descent. And when the gradient direction is wrong, the accumulated velocity delays the correction, so the minimum can be overshot.
+
+### RMSProp
+
+RMSProp introduces adaptive learning-rate scaling. Early in training the learning rate should be high so the loss can drop quickly, but as the iterations accumulate the learning rate should keep shrinking so the model can converge properly. The basic idea is to adapt based on the gradients: the larger the gradients have been, the faster the learning rate decays; later, when the gradients shrink, the decay slows down.
+
+To keep the learning rate from decaying too fast early on, RMSProp also uses an exponential moving average to slowly forget the old gradient history. The update is:
+
+$$
+\begin{align}
+& h = \rho h + (1-\rho) \frac{\partial L}{\partial W} \odot \frac{\partial L}{\partial W} \\\\
+& W = W - \gamma \frac{1}{\sqrt{\delta + h}} \frac{\partial L}{\partial W}
+\end{align}$$
+
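+The corresponding `RMSProp()` method added to `NumpyModel` (again condensed to one weight matrix; `v1` plays the role of the running average $h$, and the `weight_decay` argument is the decay rate $\rho$):
+
+```
+def RMSProp(self, learning_rate, weight_decay=0.99):
+    # h <- rho * h + (1 - rho) * grad^2 ;  W <- W - lr * grad / sqrt(h + delta)
+    self.v1 = self.v1 * weight_decay + (1 - weight_decay) * self.W1_grad * self.W1_grad
+    self.W1 = self.W1 - learning_rate * self.W1_grad / np.sqrt(self.v1 + 1e-7)
+    # ... W2 and W3 follow the same pattern
+```
+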
+Setting the learning rate to 0.001 and `weight_decay` to 0.01, we train and test:
+
+
+```
+[0] Test Accuracy: 0.9663
+[1] Test Accuracy: 0.9701
+[2] Test Accuracy: 0.9758
+[3] Test Accuracy: 0.9701
+[4] Test Accuracy: 0.9748
+[5] Test Accuracy: 0.9813
+[6] Test Accuracy: 0.9813
+[7] Test Accuracy: 0.9819
+[8] Test Accuracy: 0.9822
+[9] Test Accuracy: 0.9808
+```
+
+In the middle of training the loss oscillates less than with plain gradient descent. Early on the model converges faster, but later there is no clear advantage over plain gradient descent.
+
+### Adam
+
+Adam combines momentum with adaptive learning rates. Adam first computes first- and second-moment estimates of the gradient, corresponding to the momentum part and the adaptive part respectively:
+
+$$
+\begin{align}
+& \mathrm{m} = \beta_1 \mathrm{m} + (1-\beta_1) \frac{\partial L}{\partial W} \\\\
+& \mathrm{v} = \beta_2 \mathrm{v} + (1-\beta_2) \frac{\partial L}{\partial W} \odot \frac{\partial L}{\partial W}
+\end{align}
+$$
+
+and then applies bias correction:
+
+$$
+\begin{align}
+& \mathrm{\hat{m}} = \frac{\mathrm{m}}{1-\beta_1 ^ t }\\\\
+& \mathrm{\hat{v}} = \frac{\mathrm{v}}{1-\beta_2 ^ t}
+\end{align}
+$$
+
+Finally, the parameter update is:
+$$ W = W - \gamma \frac{\mathrm{\hat m}}{\sqrt{\mathrm{\hat v}+ \delta}}$$
+
+
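+This is implemented as a small per-parameter `Adam` optimizer class in `numpy_mnist.py`; the core of its `optimize` step is:
+
+```
+def optimize(self, grad):
+    self.iter += 1
+    self.m = self.beta1 * self.m + (1 - self.beta1) * grad
+    self.v = self.beta2 * self.v + (1 - self.beta2) * grad * grad
+    m_hat = self.m / (1 - self.beta1 ** self.iter)   # bias-corrected first moment
+    v_hat = self.v / (1 - self.beta2 ** self.iter)   # bias-corrected second moment
+    self.param -= self.lr * m_hat / (v_hat ** 0.5 + 1e-8)
+```
+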
+Setting the learning rate to 0.001 and batch_size to 128, training gives:
+
+```
+[0] Test Accuracy: 0.9611
+[1] Test Accuracy: 0.9701
+[2] Test Accuracy: 0.9735
+[3] Test Accuracy: 0.9752
+[4] Test Accuracy: 0.9787
+[5] Test Accuracy: 0.9788
+[6] Test Accuracy: 0.9763
+[7] Test Accuracy: 0.9790
+[8] Test Accuracy: 0.9752
+[9] Test Accuracy: 0.9806
+
+```
+
+Compared with plain gradient descent, the loss oscillates slightly less and drops slightly faster early on, but the final convergence speed is about the same.
\ No newline at end of file
diff --git a/assignment-2/submission/17307130331/img/backgraph.png b/assignment-2/submission/17307130331/img/backgraph.png
new file mode 100644
index 0000000000000000000000000000000000000000..c4a70b28e869708641bd01dba83730ed62ab9c4d
Binary files /dev/null and b/assignment-2/submission/17307130331/img/backgraph.png differ
diff --git a/assignment-2/submission/17307130331/img/compu_graph.png b/assignment-2/submission/17307130331/img/compu_graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..74f02ff1b4c4795c99600fb2e358d23a170f11c1
Binary files /dev/null and b/assignment-2/submission/17307130331/img/compu_graph.png differ
diff --git a/assignment-2/submission/17307130331/img/momentum.png b/assignment-2/submission/17307130331/img/momentum.png
new file mode 100644
index 0000000000000000000000000000000000000000..152bfe4eda8bf98cb271e9e3af3801f223273ec2
Binary files /dev/null and b/assignment-2/submission/17307130331/img/momentum.png differ
diff --git a/assignment-2/submission/17307130331/img/rmsprop.png b/assignment-2/submission/17307130331/img/rmsprop.png
new file mode 100644
index 0000000000000000000000000000000000000000..d4c9f6d651ea0dcac312c3a7dcb38266a477679c
Binary files /dev/null and b/assignment-2/submission/17307130331/img/rmsprop.png differ
diff --git a/assignment-2/submission/17307130331/img/train.png b/assignment-2/submission/17307130331/img/train.png
new file mode 100644
index 0000000000000000000000000000000000000000..618816332b78c4f0498444a42dd2a5028df91ef1
Binary files /dev/null and b/assignment-2/submission/17307130331/img/train.png differ
diff --git a/assignment-2/submission/17307130331/img/train02.png b/assignment-2/submission/17307130331/img/train02.png
new file mode 100644
index 0000000000000000000000000000000000000000..a2cbc7b9ccbf2f28955902b86881d7a640f50fa7
Binary files /dev/null and b/assignment-2/submission/17307130331/img/train02.png differ
diff --git a/assignment-2/submission/17307130331/img/train03.png b/assignment-2/submission/17307130331/img/train03.png
new file mode 100644
index 0000000000000000000000000000000000000000..41dd8fd9060e6774b983375f3b025ee6335b9f66
Binary files /dev/null and b/assignment-2/submission/17307130331/img/train03.png differ
diff --git a/assignment-2/submission/17307130331/img/train10.png b/assignment-2/submission/17307130331/img/train10.png
new file mode 100644
index 0000000000000000000000000000000000000000..a2056ba0d21f8f40fc0279e532fd6b9f1ff79cef
Binary files /dev/null and b/assignment-2/submission/17307130331/img/train10.png differ
diff --git a/assignment-2/submission/17307130331/img/train256.png b/assignment-2/submission/17307130331/img/train256.png
new file mode 100644
index 0000000000000000000000000000000000000000..81aa1b2bcc7f708607f8c402f9f41d579793f9e1
Binary files /dev/null and b/assignment-2/submission/17307130331/img/train256.png differ
diff --git a/assignment-2/submission/17307130331/img/train64.png b/assignment-2/submission/17307130331/img/train64.png
new file mode 100644
index 0000000000000000000000000000000000000000..8f34749c6fda428437ff3fe11292b0213eca0d7a
Binary files /dev/null and b/assignment-2/submission/17307130331/img/train64.png differ
diff --git a/assignment-2/submission/17307130331/img/train_adam.png b/assignment-2/submission/17307130331/img/train_adam.png
new file mode 100644
index 0000000000000000000000000000000000000000..eefa8b27deb6485f895033add750f018fd14e293
Binary files /dev/null and b/assignment-2/submission/17307130331/img/train_adam.png differ
diff --git a/assignment-2/submission/17307130331/img/trainloss.png b/assignment-2/submission/17307130331/img/trainloss.png
new file mode 100644
index 0000000000000000000000000000000000000000..b845297f03d5d6e6ae2b026b25554519a77f471b
Binary files /dev/null and b/assignment-2/submission/17307130331/img/trainloss.png differ
diff --git a/assignment-2/submission/17307130331/numpy_fnn.py b/assignment-2/submission/17307130331/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b32d95b7825b4787f5d226ac058c0039aee4bba
--- /dev/null
+++ b/assignment-2/submission/17307130331/numpy_fnn.py
@@ -0,0 +1,208 @@
+import numpy as np
+
+
+class NumpyOp:
+
+ def __init__(self):
+ self.memory = {}
+ self.epsilon = 1e-12
+
+
+class Matmul(NumpyOp):
+
+ def forward(self, x, W):
+ """
+ x: shape(N, d)
+ w: shape(d, d')
+ """
+ self.memory['x'] = x
+ self.memory['W'] = W
+ h = np.matmul(x, W)
+ return h
+
+ def backward(self, grad_y):
+ """
+ grad_y: shape(N, d')
+ """
+
+ ####################
+ # code 1 #
+ grad_W = np.matmul(self.memory['x'].T, grad_y)
+ grad_x = np.matmul(grad_y, self.memory['W'].T)
+ ####################
+
+ return grad_x, grad_W
+
+
+class Relu(NumpyOp):
+
+ def forward(self, x):
+ self.memory['x'] = x
+ return np.where(x > 0, x, np.zeros_like(x))
+
+ def backward(self, grad_y):
+ """
+ grad_y: same shape as x
+ """
+
+ ####################
+ # code 2 #
+ ####################
+        grad_x = np.where(self.memory['x'] > 0, np.ones_like(self.memory['x']), np.zeros_like(self.memory['x'])) * grad_y  # element-wise product
+
+ return grad_x
+
+
+class Log(NumpyOp):
+
+ def forward(self, x):
+ """
+ x: shape(N, c)
+ """
+
+ out = np.log(x + self.epsilon)
+ self.memory['x'] = x
+
+ return out
+
+ def backward(self, grad_y):
+ """
+ grad_y: same shape as x
+ """
+
+ ####################
+ # code 3 #
+ ####################
+ grad_x = (1/(self.memory['x'] + self.epsilon)) * grad_y
+ return grad_x
+
+
+class Softmax(NumpyOp):
+ """
+ softmax over last dimension
+ """
+
+ def forward(self, x):
+ """
+ x: shape(N, c)
+ """
+
+ ####################
+ # code 4 #
+ ####################
+ exp_x = np.exp(x)
+ out = exp_x/np.sum(exp_x, axis=1, keepdims=True)
+ self.memory['x'] = x
+ self.memory['out'] = out
+ return out
+
+ def backward(self, grad_y):
+ """
+ grad_y: same shape as x
+ """
+ o = self.memory['out']
+ Jacob = np.array([np.diag(r) - np.outer(r, r) for r in o])
+        # J[i, j] = -o_i * o_j   for i != j
+        # J[i, i] = o_i * (1 - o_i)
+ grad_y = grad_y[:, np.newaxis, :]
+ grad_x = np.matmul(grad_y, Jacob).squeeze(1)
+ return grad_x
+
+
+class NumpyLoss:
+
+ def __init__(self):
+ self.target = None
+
+ def get_loss(self, pred, target):
+ self.target = target
+ return (-pred * target).sum(axis=1).mean()
+
+ def backward(self):
+ return -self.target / self.target.shape[0]
+
+
+class NumpyModel:
+ def __init__(self):
+ self.W1 = np.random.normal(size=(28 * 28, 256))
+ self.W2 = np.random.normal(size=(256, 64))
+ self.W3 = np.random.normal(size=(64, 10))
+
+        # The following operators are used in forward and backward
+ self.matmul_1 = Matmul()
+ self.relu_1 = Relu()
+ self.matmul_2 = Matmul()
+ self.relu_2 = Relu()
+ self.matmul_3 = Matmul()
+ self.softmax = Softmax()
+ self.log = Log()
+
+        # The following variables are updated in backward. softmax_grad, log_grad, etc. hold each operator's backward gradient (the partial derivative of the loss w.r.t. that operator's input)
+ self.x1_grad, self.W1_grad = None, None
+ self.relu_1_grad = None
+ self.x2_grad, self.W2_grad = None, None
+ self.relu_2_grad = None
+ self.x3_grad, self.W3_grad = None, None
+ self.softmax_grad = None
+ self.log_grad = None
+
+        # The following buffers are used by momentum / RMSProp
+ self.v1 = np.zeros_like(self.W1)
+ self.v2 = np.zeros_like(self.W2)
+ self.v3 = np.zeros_like(self.W3)
+
+
+ def forward(self, x):
+ x = x.reshape(-1, 28 * 28)
+
+ x = self.relu_1.forward(self.matmul_1.forward(x, self.W1))
+ x = self.relu_2.forward(self.matmul_2.forward(x, self.W2))
+
+ x = self.matmul_3.forward(x, self.W3)
+
+ x = self.softmax.forward(x)
+ x = self.log.forward(x)
+
+ return x
+
+ def backward(self, y):
+ self.log_grad = self.log.backward(y)
+ self.softmax_grad = self.softmax.backward(self.log_grad)
+ self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+ self.relu_2_grad = self.relu_2.backward(self.x3_grad)
+ self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+ self.relu_1_grad = self.relu_1.backward(self.x2_grad)
+ self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+
+
+ def optimize(self, learning_rate):
+ self.W1 -= learning_rate * self.W1_grad
+ self.W2 -= learning_rate * self.W2_grad
+ self.W3 -= learning_rate * self.W3_grad
+
+ def momentum(self, learning_rate, alpha=0.9):
+ self.v1 = self.v1 * alpha - learning_rate * self.W1_grad
+ self.v2 = self.v2 * alpha - learning_rate * self.W2_grad
+ self.v3 = self.v3 * alpha - learning_rate * self.W3_grad
+
+ self.W1 += self.v1
+ self.W2 += self.v2
+ self.W3 += self.v3
+
+ def RMSProp(self, learning_rate, weight_decay = 0.99):
+ self.v1 = self.v1 * weight_decay + (1-weight_decay) * self.W1_grad * self.W1_grad
+ self.v2 = self.v2 * weight_decay + (1-weight_decay) * self.W2_grad * self.W2_grad
+ self.v3 = self.v3 * weight_decay + (1-weight_decay) * self.W3_grad * self.W3_grad
+
+ self.W1 = self.W1 - learning_rate * self.W1_grad / np.sqrt( self.v1 + 1e-7)
+ self.W2 = self.W2 - learning_rate * self.W2_grad / np.sqrt( self.v2 + 1e-7)
+ self.W3 = self.W3 - learning_rate * self.W3_grad / np.sqrt( self.v3 + 1e-7)
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/17307130331/numpy_mnist.py b/assignment-2/submission/17307130331/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..4187f01eeebbbcd6ab48bfacf8dedc37085e46e2
--- /dev/null
+++ b/assignment-2/submission/17307130331/numpy_mnist.py
@@ -0,0 +1,70 @@
+import numpy as np
+from numpy_fnn import NumpyModel, NumpyLoss
+from utils import download_mnist, batch, get_torch_initialization, plot_curve, one_hot
+
+def mini_batch(dataset, batch_size=128):
+ data = np.array([each[0].numpy() for each in dataset])
+ label = np.array([each[1] for each in dataset])
+
+ data_size = data.shape[0]
+ idx = np.array([i for i in range(data_size)])
+ np.random.shuffle(idx)
+
+ return [(data[idx[i: i+batch_size]], label[idx[i:i+batch_size]]) for i in range(0, data_size, batch_size)]
+
+class Adam():
+ def __init__(self, param, learning_rate=0.001, beta_1=0.9, beta_2=0.999):
+ self.param = param
+ self.iter = 0
+ self.m = 0
+ self.v = 0
+ self.beta1 = beta_1
+ self.beta2 = beta_2
+ self.lr = learning_rate
+ def optimize(self, grad):
+ self.iter+=1
+ self.m = self.beta1 * self.m + (1 - self.beta1) * grad
+ self.v = self.beta2 * self.v + (1 - self.beta2) * grad * grad
+ m_hat = self.m / (1 - self.beta1 ** self.iter)
+ v_hat = self.v / (1 - self.beta2 ** self.iter)
+ self.param -= self.lr * m_hat / (v_hat ** 0.5 + 1e-8)
+ return self.param
+
+def numpy_run():
+ train_dataset, test_dataset = download_mnist()
+
+ model = NumpyModel()
+ numpy_loss = NumpyLoss()
+ model.W1, model.W2, model.W3 = get_torch_initialization()
+
+ W1_opt, W2_opt, W3_opt = Adam(model.W1), Adam(model.W2), Adam(model.W3)
+
+ train_loss = []
+
+ epoch_number = 10
+ learning_rate = 0.0015
+
+ for epoch in range(epoch_number):
+ for x, y in mini_batch(train_dataset, batch_size=128):
+ y = one_hot(y)
+
+ y_pred = model.forward(x)
+ loss = numpy_loss.get_loss(y_pred, y)
+
+ model.backward(numpy_loss.backward())
+ #model.Adam(learning_rate)
+ W1_opt.optimize(model.W1_grad)
+ W2_opt.optimize(model.W2_grad)
+ W3_opt.optimize(model.W3_grad)
+
+ train_loss.append(loss.item())
+
+ x, y = batch(test_dataset)[0]
+ accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
+ print('[{}] Test Accuracy: {:.4f}'.format(epoch, accuracy))
+
+ plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+ numpy_run()
diff --git a/assignment-2/submission/17307130331/tester_demo.py b/assignment-2/submission/17307130331/tester_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..515b86c1240eebad83287461548530c944f23bc8
--- /dev/null
+++ b/assignment-2/submission/17307130331/tester_demo.py
@@ -0,0 +1,182 @@
+import numpy as np
+import torch
+from torch import matmul as torch_matmul, relu as torch_relu, softmax as torch_softmax, log as torch_log
+
+from numpy_fnn import Matmul, Relu, Softmax, Log, NumpyModel, NumpyLoss
+from torch_mnist import TorchModel
+from utils import get_torch_initialization, one_hot
+
+err_epsilon = 1e-6
+err_p = 0.4
+
+
+def check_result(numpy_result, torch_result=None):
+ if isinstance(numpy_result, list) and torch_result is None:
+ flag = True
+ for (n, t) in numpy_result:
+ flag = flag and check_result(n, t)
+ return flag
+ # print((torch.from_numpy(numpy_result) - torch_result).abs().mean().item())
+ T = (torch_result * torch.from_numpy(numpy_result) < 0).sum().item()
+ direction = T / torch_result.numel() < err_p
+ return direction and ((torch.from_numpy(numpy_result) - torch_result).abs().mean() < err_epsilon).item()
+
+
+def case_1():
+ x = np.random.normal(size=[5, 6])
+ W = np.random.normal(size=[6, 4])
+
+ numpy_matmul = Matmul()
+ numpy_out = numpy_matmul.forward(x, W)
+ numpy_x_grad, numpy_W_grad = numpy_matmul.backward(np.ones_like(numpy_out))
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+ torch_W = torch.from_numpy(W).clone().requires_grad_()
+
+ torch_out = torch_matmul(torch_x, torch_W)
+ torch_out.sum().backward()
+
+ return check_result([
+ (numpy_out, torch_out),
+ (numpy_x_grad, torch_x.grad),
+ (numpy_W_grad, torch_W.grad)
+ ])
+
+
+def case_2():
+ x = np.random.normal(size=[5, 6])
+
+ numpy_relu = Relu()
+ numpy_out = numpy_relu.forward(x)
+ numpy_x_grad = numpy_relu.backward(np.ones_like(numpy_out))
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+ torch_out = torch_relu(torch_x)
+ torch_out.sum().backward()
+
+ return check_result([
+ (numpy_out, torch_out),
+ (numpy_x_grad, torch_x.grad),
+ ])
+
+
+def case_3():
+ x = np.random.uniform(low=0.0, high=1.0, size=[3, 4])
+
+ numpy_log = Log()
+ numpy_out = numpy_log.forward(x)
+ numpy_x_grad = numpy_log.backward(np.ones_like(numpy_out))
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+ torch_out = torch_log(torch_x)
+ torch_out.sum().backward()
+
+ return check_result([
+ (numpy_out, torch_out),
+
+ (numpy_x_grad, torch_x.grad),
+ ])
+
+
+def case_4():
+ x = np.random.normal(size=[4, 5])
+
+ numpy_softmax = Softmax()
+ numpy_out = numpy_softmax.forward(x)
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+ torch_out = torch_softmax(torch_x, 1)
+
+ return check_result(numpy_out, torch_out)
+
+
+def case_5():
+ x = np.random.normal(size=[20, 25])
+
+ numpy_softmax = Softmax()
+ numpy_out = numpy_softmax.forward(x)
+ numpy_x_grad = numpy_softmax.backward(np.ones_like(numpy_out))
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+ torch_out = torch_softmax(torch_x, 1)
+ torch_out.sum().backward()
+
+ return check_result([
+ (numpy_out, torch_out),
+ (numpy_x_grad, torch_x.grad),
+ ])
+
+
+def test_model():
+ try:
+ numpy_loss = NumpyLoss()
+ numpy_model = NumpyModel()
+ torch_model = TorchModel()
+ torch_model.W1.data, torch_model.W2.data, torch_model.W3.data = get_torch_initialization(numpy=False)
+ numpy_model.W1 = torch_model.W1.detach().clone().numpy()
+ numpy_model.W2 = torch_model.W2.detach().clone().numpy()
+ numpy_model.W3 = torch_model.W3.detach().clone().numpy()
+
+ x = torch.randn((10000, 28, 28))
+ y = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 0] * 1000)
+
+ y = one_hot(y, numpy=False)
+ x2 = x.numpy()
+ y_pred = torch_model.forward(x)
+ loss = (-y_pred * y).sum(dim=1).mean()
+ loss.backward()
+
+ y_pred_numpy = numpy_model.forward(x2)
+ numpy_loss.get_loss(y_pred_numpy, y.numpy())
+
+ check_flag_1 = check_result(y_pred_numpy, y_pred)
+ print("+ {:12} {}/{}".format("forward", 10 * check_flag_1, 10))
+ except:
+ print("[Runtime Error in forward]")
+ print("+ {:12} {}/{}".format("forward", 0, 10))
+ return 0
+
+ try:
+
+ numpy_model.backward(numpy_loss.backward())
+
+ check_flag_2 = [
+ check_result(numpy_model.log_grad, torch_model.log_input.grad),
+ check_result(numpy_model.softmax_grad, torch_model.softmax_input.grad),
+ check_result(numpy_model.W3_grad, torch_model.W3.grad),
+ check_result(numpy_model.W2_grad, torch_model.W2.grad),
+ check_result(numpy_model.W1_grad, torch_model.W1.grad)
+ ]
+ check_flag_2 = sum(check_flag_2) >= 4
+ print("+ {:12} {}/{}".format("backward", 20 * check_flag_2, 20))
+ except:
+ print("[Runtime Error in backward]")
+ print("+ {:12} {}/{}".format("backward", 0, 20))
+ check_flag_2 = False
+
+ return 10 * check_flag_1 + 20 * check_flag_2
+
+
+if __name__ == "__main__":
+ testcases = [
+ ["matmul", case_1, 5],
+ ["relu", case_2, 5],
+ ["log", case_3, 5],
+ ["softmax_1", case_4, 5],
+ ["softmax_2", case_5, 10],
+ ]
+ score = 0
+ for case in testcases:
+ try:
+ res = case[2] if case[1]() else 0
+ except:
+ print("[Runtime Error in {}]".format(case[0]))
+ res = 0
+ score += res
+ print("+ {:12} {}/{}".format(case[0], res, case[2]))
+ score += test_model()
+ print("{:14} {}/60".format("FINAL SCORE", score))
diff --git a/assignment-2/submission/17307130331/torch_mnist.py b/assignment-2/submission/17307130331/torch_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d3e214c7606e3d43dac4b94554f942508afffb3
--- /dev/null
+++ b/assignment-2/submission/17307130331/torch_mnist.py
@@ -0,0 +1,73 @@
+import torch
+from utils import mini_batch, batch, download_mnist, get_torch_initialization, one_hot, plot_curve
+
+
+class TorchModel:
+
+ def __init__(self):
+ self.W1 = torch.randn((28 * 28, 256), requires_grad=True)
+ self.W2 = torch.randn((256, 64), requires_grad=True)
+ self.W3 = torch.randn((64, 10), requires_grad=True)
+ self.softmax_input = None
+ self.log_input = None
+
+ def forward(self, x):
+ x = x.reshape(-1, 28 * 28)
+ x = torch.relu(torch.matmul(x, self.W1))
+ x = torch.relu(torch.matmul(x, self.W2))
+ x = torch.matmul(x, self.W3)
+
+ self.softmax_input = x
+ self.softmax_input.retain_grad()
+
+ x = torch.softmax(x, 1)
+
+ self.log_input = x
+ self.log_input.retain_grad()
+
+ x = torch.log(x)
+
+ return x
+
+ def optimize(self, learning_rate):
+ with torch.no_grad():
+ self.W1 -= learning_rate * self.W1.grad
+ self.W2 -= learning_rate * self.W2.grad
+ self.W3 -= learning_rate * self.W3.grad
+
+ self.W1.grad = None
+ self.W2.grad = None
+ self.W3.grad = None
+
+
+def torch_run():
+ train_dataset, test_dataset = download_mnist()
+
+ model = TorchModel()
+ model.W1.data, model.W2.data, model.W3.data = get_torch_initialization(numpy=False)
+
+ train_loss = []
+
+ epoch_number = 3
+ learning_rate = 0.1
+
+ for epoch in range(epoch_number):
+ for x, y in mini_batch(train_dataset, numpy=False):
+ y = one_hot(y, numpy=False)
+
+ y_pred = model.forward(x)
+ loss = (-y_pred * y).sum(dim=1).mean()
+ loss.backward()
+ model.optimize(learning_rate)
+
+ train_loss.append(loss.item())
+
+ x, y = batch(test_dataset, numpy=False)[0]
+ accuracy = model.forward(x).argmax(dim=1).eq(y).float().mean().item()
+ print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+
+ plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+ torch_run()
diff --git a/assignment-2/submission/17307130331/utils.py b/assignment-2/submission/17307130331/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..709220cfa7a924d914ec1c098c505f864bcd4cfc
--- /dev/null
+++ b/assignment-2/submission/17307130331/utils.py
@@ -0,0 +1,71 @@
+import torch
+import numpy as np
+from matplotlib import pyplot as plt
+
+
+def plot_curve(data):
+ plt.plot(range(len(data)), data, color='blue')
+ plt.legend(['loss_value'], loc='upper right')
+ plt.xlabel('step')
+ plt.ylabel('value')
+ plt.show()
+
+
+def download_mnist():
+ from torchvision import datasets, transforms
+
+ transform = transforms.Compose([
+ transforms.ToTensor(),
+ transforms.Normalize(mean=(0.1307,), std=(0.3081,))
+ ])
+
+ train_dataset = datasets.MNIST(root="./data/", transform=transform, train=True, download=True)
+ test_dataset = datasets.MNIST(root="./data/", transform=transform, train=False, download=True)
+
+ return train_dataset, test_dataset
+
+
+def one_hot(y, numpy=True):
+ if numpy:
+ y_ = np.zeros((y.shape[0], 10))
+ y_[np.arange(y.shape[0], dtype=np.int32), y] = 1
+ return y_
+ else:
+ y_ = torch.zeros((y.shape[0], 10))
+ y_[torch.arange(y.shape[0], dtype=torch.long), y] = 1
+ return y_
+
+
+def batch(dataset, numpy=True):
+ data = []
+ label = []
+ for each in dataset:
+ data.append(each[0])
+ label.append(each[1])
+ data = torch.stack(data)
+ label = torch.LongTensor(label)
+ if numpy:
+ return [(data.numpy(), label.numpy())]
+ else:
+ return [(data, label)]
+
+
+def mini_batch(dataset, batch_size=128, numpy=False):
+ return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+
+def get_torch_initialization(numpy=True):
+ fc1 = torch.nn.Linear(28 * 28, 256)
+ fc2 = torch.nn.Linear(256, 64)
+ fc3 = torch.nn.Linear(64, 10)
+
+ if numpy:
+ W1 = fc1.weight.T.detach().clone().numpy()
+ W2 = fc2.weight.T.detach().clone().numpy()
+ W3 = fc3.weight.T.detach().clone().numpy()
+ else:
+ W1 = fc1.weight.T.detach().clone().data
+ W2 = fc2.weight.T.detach().clone().data
+ W3 = fc3.weight.T.detach().clone().data
+
+ return W1, W2, W3
diff --git a/assignment-2/submission/18307130090/README.md b/assignment-2/submission/18307130090/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..647eb99d08956f5fea84c6aa563ab3e1576cfcc6
--- /dev/null
+++ b/assignment-2/submission/18307130090/README.md
@@ -0,0 +1,276 @@
+# PRML-2021 Assignment2
+
+Name: 夏海淞
+
+Student ID: 18307130090
+
+## Overview
+
+In this assignment I implemented a simple feed-forward neural network with `NumPy`, including the backward passes of the operators in `numpy_fnn.py` and the construction of the feed-forward model. To verify the model I trained and tested it on the MNIST dataset. In addition, I implemented the `Momentum` and `Adam` optimizers and compared their performance.
+
+## Backward Passes of the Operators
+
+### `Matmul`
+
+The formula for `Matmul` is:
+$$
+Y=X\times W
+$$
+where $Y,X,W$ are matrices of shape $n\times d'$, $n\times d$, and $d\times d'$ respectively.
+
+By formulas (B.20) and (B.21) in [Neural Networks and Deep Learning, Xipeng Qiu](https://nndl.github.io/nndl-book.pdf), we have
+$$
+\frac{\partial Y}{\partial W}=\frac{\partial(X\times W)}{\partial W}=X^T\\\\
+\frac{\partial Y}{\partial X}=\frac{\partial(X\times W)}{\partial X}=W^T
+$$
+Combining the chain rule with the rules of matrix calculus gives
+$$
+\nabla_X=\nabla_Y\times W^T\\\\
+\nabla_W=X^T\times \nabla_Y
+$$
+
+### `Relu`
+
+The formula for `Relu` is:
+$$
+Y_{ij}=\begin{cases}
+X_{ij}&X_{ij}\ge0\\\\
+0&\text{otherwise}
+\end{cases}
+$$
+Therefore
+$$
+\frac{\partial Y_{ij}}{\partial X_{ij}}=\begin{cases}
+1&X_{ij}>0\\\\
+0&\text{otherwise}
+\end{cases}
+$$
+Combining this with the chain rule gives the backward formula: $\nabla_{Xij}=\nabla_{Yij}\cdot\frac{\partial Y_{ij}}{\partial X_{ij}}$
+
+### `Log`
+
+The formula for `Log` is
+$$
+Y_{ij}=\ln(X_{ij}+\epsilon),\epsilon=10^{-12}
+$$
+Therefore
+$$
+\frac{\partial Y_{ij}}{\partial X_{ij}}=\frac1{X_{ij}+\epsilon}
+$$
+Combining this with the chain rule gives the backward formula: $\nabla_{Xij}=\nabla_{Yij}\cdot\frac{\partial Y_{ij}}{\partial {X_{ij}}}$
+
+### `Softmax`
+
+The formula for `Softmax` is
+$$
+Y_{ij}=\frac{\exp\{X_{ij} \}}{\sum_{k=1}^c\exp\{X_{ik} \}}
+$$
+where $Y,X$ are both $N\times c$ matrices. It is easy to see that `Softmax` operates on each row of $X$ independently. Therefore, for row components $X_k,Y_k$ of $X,Y$, we have
+$$
+\frac{\partial Y_{ki}}{\partial X_{kj}}=\begin{cases}
+\frac{\exp\{X_{kj} \}(\sum_t\exp\{X_{kt}\})-\exp\{2X_{ki}\}}{(\sum_t\exp\{X_{kt}\})^2}=Y_{ki}(1-Y_{ki})&i=j\\\\
+-\frac{\exp\{X_{ki} \}\exp\{X_{kj} \}}{(\sum_t\exp\{X_{kt}\})^2}=-Y_{ki}Y_{kj}&i\not=j
+\end{cases}
+$$
+This gives the Jacobian matrix of $Y_k$ with respect to $X_k$, satisfying $J_{ij}=\frac{\partial Y_{ki}}{\partial X_{kj}}$. Combining with the chain rule, we get
+$$
+\nabla_X=\nabla_Y\times J
+$$
+Stacking the row components back together gives the final result of the backward pass; a quick numerical check against `PyTorch` is sketched below.
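+
+A small numerical self-check of this backward pass (in the spirit of `tester_demo.py`; it assumes the `Softmax` operator from `numpy_fnn.py` is importable):
+
+```python
+import numpy as np
+import torch
+
+from numpy_fnn import Softmax
+
+x = np.random.normal(size=[20, 25])
+
+numpy_softmax = Softmax()
+numpy_out = numpy_softmax.forward(x)
+numpy_x_grad = numpy_softmax.backward(np.ones_like(numpy_out))
+
+torch_x = torch.from_numpy(x).clone().requires_grad_()
+torch.softmax(torch_x, 1).sum().backward()
+
+# Both differences should be close to zero (floating-point error only)
+print(np.abs(numpy_out - torch.softmax(torch.from_numpy(x), 1).numpy()).max())
+print(np.abs(numpy_x_grad - torch_x.grad.numpy()).max())
+```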
+
+## Model Construction and Training
+
+### Model Construction
+
+#### `forward`
+
+Following the model in `TorchModel` in `torch_mnist.py`, the forward pass is built with the following code:
+
+```python
+def forward(self, x):
+ x = x.reshape(-1, 28 * 28)
+
+ x = self.relu_1.forward(self.matmul_1.forward(x, self.W1))
+ x = self.relu_2.forward(self.matmul_2.forward(x, self.W2))
+ x = self.matmul_3.forward(x, self.W3)
+
+ x = self.log.forward(self.softmax.forward(x))
+
+ return x
+```
+
+The computation graph of the model is as follows:
+
+
+
+#### `backward`
+
+Following the model's computation graph in reverse order, we simply call each operator's backward method in turn.
+
+```python
+def backward(self, y):
+ self.log_grad = self.log.backward(y)
+ self.softmax_grad = self.softmax.backward(self.log_grad)
+ self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+ self.relu_2_grad = self.relu_2.backward(self.x3_grad)
+ self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+ self.relu_1_grad = self.relu_1.backward(self.x2_grad)
+ self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+
+ return self.x1_grad
+```
+
+#### `mini_batch`
+
+The purpose of `mini_batch` is to speed up training while still achieving good optimization. Full-batch training computes the average loss over the whole dataset and then backpropagates the corresponding gradient; when the training set is large this severely slows training down. The purely stochastic approach instead computes the loss and gradient for one sample at a time, so the dataset size no longer affects the speed of each update, but the randomness of individual samples may keep the parameters from converging to the optimum and leave them oscillating around it. A compromise is to split the dataset into batches, which speeds up training while keeping good convergence behaviour.
+
+In this assignment, following `mini_batch` in `utils.py`, I re-implemented the `mini_batch` method in `numpy_mnist.py`:
+
+```python
+def mini_batch(dataset, batch_size=128):
+ data = np.array([np.array(each[0]) for each in dataset])
+ label = np.array([each[1] for each in dataset])
+
+ size = data.shape[0]
+ index = np.arange(size)
+ np.random.shuffle(index)
+
+ return [(data[index[i:i + batch_size]], label[index[i:i + batch_size]]) for i in range(0, size, batch_size)]
+```
+
+### Model Training
+
+With `learning_rate=0.1`, `batch_size=128`, and `epoch_number=10`, the training results are as follows:
+
+```
+[0] Accuracy: 0.9486
+[1] Accuracy: 0.9643
+[2] Accuracy: 0.9724
+[3] Accuracy: 0.9738
+[4] Accuracy: 0.9781
+[5] Accuracy: 0.9768
+[6] Accuracy: 0.9796
+[7] Accuracy: 0.9802
+[8] Accuracy: 0.9800
+[9] Accuracy: 0.9796
+```
+
+
+
+Shrinking the batch size to `batch_size=64`, the training results are as follows:
+
+```
+[0] Accuracy: 0.9597
+[1] Accuracy: 0.9715
+[2] Accuracy: 0.9739
+[3] Accuracy: 0.9771
+[4] Accuracy: 0.9775
+[5] Accuracy: 0.9803
+[6] Accuracy: 0.9808
+[7] Accuracy: 0.9805
+[8] Accuracy: 0.9805
+[9] Accuracy: 0.9716
+```
+
+
+
+Lowering the learning rate to `learning_rate=0.01`, the training results are as follows:
+
+```
+[0] Accuracy: 0.8758
+[1] Accuracy: 0.9028
+[2] Accuracy: 0.9143
+[3] Accuracy: 0.9234
+[4] Accuracy: 0.9298
+[5] Accuracy: 0.9350
+[6] Accuracy: 0.9397
+[7] Accuracy: 0.9434
+[8] Accuracy: 0.9459
+[9] Accuracy: 0.9501
+```
+
+
+
+From these results we can draw the following conclusion:
+
+With an otherwise suitable learning rate and batch size, the parameters converge more slowly as the learning rate decreases, and they oscillate more as the batch size decreases.
+
+## Improvements to Gradient Descent
+
+Plain gradient descent can be written as:
+$$
+w_{t+1}=w_t-\eta\cdot\nabla f(w_t)
+$$
+Although gradient descent is widely used as an optimizer, it still has some drawbacks, mainly:
+
+- the update direction is determined entirely by the current gradient, so with too high a learning rate the parameters may oscillate around the optimum;
+- the learning rate cannot change with training progress, so convergence is slow early in training and may fail altogether later on.
+
+Many improved variants of gradient descent address these drawbacks; `Momentum` and `Adam` are two of the most typical.
+
+### `Momentum`
+
+To address the problem that "the update direction is determined entirely by the current gradient", `Momentum` introduces the notion of momentum.
+
+By analogy with the physical world, when a ball rolls downhill its direction of motion depends not only on how steep the current position is but also on its current speed, i.e. on how steep the previous positions were. In the `Momentum` algorithm the parameter update therefore depends not on the current gradient alone but on an exponential moving average of the gradients over time:
+$$
+m_t=\beta\cdot m_{t-1}+(1-\beta)\cdot\nabla f(w_t)\\\\
+w_{t+1}=w_t-\eta\cdot m_t
+$$
+The exponential moving average acts as "inertia" in the parameter updates. When the update direction is correct, `Momentum` helps speed up training and damp oscillations; when the update direction is wrong, `Momentum` cannot adjust the direction in time and some performance is lost.
+
+The training results with `Momentum` are as follows:
+
+```
+[0] Accuracy: 0.9444
+[1] Accuracy: 0.9627
+[2] Accuracy: 0.9681
+[3] Accuracy: 0.9731
+[4] Accuracy: 0.9765
+[5] Accuracy: 0.9755
+[6] Accuracy: 0.9768
+[7] Accuracy: 0.9790
+[8] Accuracy: 0.9794
+[9] Accuracy: 0.9819
+```
+
+
+
+There is no clear advantage over plain gradient descent here.
+
+### `Adam`
+
+To address the problem that "the learning rate cannot change with training progress", `Adam` builds on `Momentum` and introduces a second-order moment.
+
+The idea behind `Adam` is that a neural network has a large number of parameters that are updated at different frequencies. For frequently updated parameters we would like to lower the learning rate to improve the chance of convergence, while for the others we would like to raise it to speed convergence up. Since a parameter's update frequency can also change over time, the learning rate should adapt dynamically as well.
+
+Because the size of a parameter's updates is directly tied to its current gradient, the sum of squared historical gradients is used to measure how frequently the parameter is updated. If this sum is large, the parameter has been updated frequently and its learning rate should be lowered. Gradient descent is then rewritten as:
+$$
+m_t=\beta\cdot m_{t-1}+(1-\beta)\cdot\nabla f(w_t)\\\\
+V_t=V_{t-1}+\nabla^2f(w_t)\\\\
+w_{t+1}=w_t-\frac\eta{\sqrt{V_t}}\cdot m_t
+$$
+However, since $V_t$ increases monotonically with $t$, the learning rate may become too small late in training and the parameters may never converge to the optimum. $V_t$ is therefore also replaced by an exponential moving average, which avoids this problem:
+$$
+m_t=\beta_1\cdot m_{t-1}+(1-\beta_1)\cdot\nabla f(w_t)\\\\
+V_t=\beta_2\cdot V_{t-1}+(1-\beta_2)\cdot(\nabla f(w_t))^2\\\\
+w_{t+1}=w_t-\frac\eta{\sqrt{V_t}}\cdot m_t
+$$
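+
+A condensed view of the `Adam()` update implemented inside `NumpyModel.optimize` in `numpy_fnn.py` (one weight matrix shown; `W1_grad_mean` and `W1_grad_square_mean` hold the running estimates $m_t$ and $V_t$):
+
+```python
+# m <- beta1 * m + (1 - beta1) * grad ;  V <- beta2 * V + (1 - beta2) * grad^2
+self.W1_grad_mean = self.beta_1 * self.W1_grad_mean + (1 - self.beta_1) * self.W1_grad
+self.W1_grad_square_mean = self.beta_2 * self.W1_grad_square_mean + (1 - self.beta_2) * np.square(self.W1_grad)
+# W <- W - lr * m / (sqrt(V) + epsilon)
+self.W1 -= learning_rate * self.W1_grad_mean / (np.sqrt(self.W1_grad_square_mean) + self.epsilon)
+```
+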
+The training results with `Adam` are as follows:
+
+```
+[0] Accuracy: 0.9657
+[1] Accuracy: 0.9724
+[2] Accuracy: 0.9759
+[3] Accuracy: 0.9769
+[4] Accuracy: 0.9788
+[5] Accuracy: 0.9778
+[6] Accuracy: 0.9775
+[7] Accuracy: 0.9759
+[8] Accuracy: 0.9786
+[9] Accuracy: 0.9779
+```
+
+
+
+Compared with plain gradient descent, the loss oscillates somewhat less, while the convergence speed is about the same as the plain method.
\ No newline at end of file
diff --git a/assignment-2/submission/18307130090/img/Adam.png b/assignment-2/submission/18307130090/img/Adam.png
new file mode 100644
index 0000000000000000000000000000000000000000..fe0326ebad52ad9356bdd7410834d9d61e9e5152
Binary files /dev/null and b/assignment-2/submission/18307130090/img/Adam.png differ
diff --git a/assignment-2/submission/18307130090/img/SGDM.png b/assignment-2/submission/18307130090/img/SGDM.png
new file mode 100644
index 0000000000000000000000000000000000000000..ba7ad91c5569f2605e7944afe3803863b8072b46
Binary files /dev/null and b/assignment-2/submission/18307130090/img/SGDM.png differ
diff --git a/assignment-2/submission/18307130090/img/SGD_batch_size.png b/assignment-2/submission/18307130090/img/SGD_batch_size.png
new file mode 100644
index 0000000000000000000000000000000000000000..328c4cc7bf90ef75a09f8c97ee8e9134d44a33dd
Binary files /dev/null and b/assignment-2/submission/18307130090/img/SGD_batch_size.png differ
diff --git a/assignment-2/submission/18307130090/img/SGD_learning_rate.png b/assignment-2/submission/18307130090/img/SGD_learning_rate.png
new file mode 100644
index 0000000000000000000000000000000000000000..7bca928d1aa569b08dad43d761da1b6e27e02942
Binary files /dev/null and b/assignment-2/submission/18307130090/img/SGD_learning_rate.png differ
diff --git a/assignment-2/submission/18307130090/img/SGD_normal.png b/assignment-2/submission/18307130090/img/SGD_normal.png
new file mode 100644
index 0000000000000000000000000000000000000000..e6f3933e1bf979fa7b3b643d8f7fe823610109e9
Binary files /dev/null and b/assignment-2/submission/18307130090/img/SGD_normal.png differ
diff --git a/assignment-2/submission/18307130090/img/fnn_model.png b/assignment-2/submission/18307130090/img/fnn_model.png
new file mode 100644
index 0000000000000000000000000000000000000000..29ed50732a88ed1ca38a1cb3c6e82099a3d3e087
Binary files /dev/null and b/assignment-2/submission/18307130090/img/fnn_model.png differ
diff --git a/assignment-2/submission/18307130090/numpy_fnn.py b/assignment-2/submission/18307130090/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..7010cad4609f7ae31b8bdc0b19cedc005c5b950c
--- /dev/null
+++ b/assignment-2/submission/18307130090/numpy_fnn.py
@@ -0,0 +1,239 @@
+import numpy as np
+
+
+class NumpyOp:
+
+ def __init__(self):
+ self.memory = {}
+ self.epsilon = 1e-12
+
+
+class Matmul(NumpyOp):
+
+ def forward(self, x, W):
+ """
+ x: shape(N, d)
+ w: shape(d, d')
+ """
+ self.memory['x'] = x
+ self.memory['W'] = W
+ h = np.matmul(x, W)
+ return h
+
+ def backward(self, grad_y):
+ """
+ grad_y: shape(N, d')
+ """
+ x, W = self.memory['x'], self.memory['W']
+ grad_x = np.matmul(grad_y, W.T)
+ grad_W = np.matmul(x.T, grad_y)
+
+ return grad_x, grad_W
+
+
+class Relu(NumpyOp):
+
+ def forward(self, x):
+ self.memory['x'] = x
+ return np.where(x > 0, x, np.zeros_like(x))
+
+ def backward(self, grad_y):
+ """
+ grad_y: same shape as x
+ """
+ x = self.memory['x']
+ grad_x = grad_y * np.where(x > 0, np.ones_like(x), np.zeros_like(x))
+
+ return grad_x
+
+
+class Log(NumpyOp):
+
+ def forward(self, x):
+ """
+ x: shape(N, c)
+ """
+
+ out = np.log(x + self.epsilon)
+ self.memory['x'] = x
+
+ return out
+
+ def backward(self, grad_y):
+ """
+ grad_y: same shape as x
+ """
+ x = self.memory['x']
+ grad_x = grad_y * np.reciprocal(x + self.epsilon)
+
+ return grad_x
+
+
+class Softmax(NumpyOp):
+ """
+ softmax over last dimension
+ """
+
+ def forward(self, x):
+ """
+ x: shape(N, c)
+ """
+        exp_x = np.exp(x - x.max(axis=1, keepdims=True))  # subtract the per-row max for numerical stability
+ exp_sum = np.sum(exp_x, axis=1, keepdims=True)
+ out = exp_x / exp_sum
+ self.memory['x'] = x
+ self.memory['out'] = out
+
+ return out
+
+ def backward(self, grad_y):
+ """
+ grad_y: same shape as x
+ """
+ sm = self.memory['out']
+ Jacobs = np.array([np.diag(r) - np.outer(r, r) for r in sm])
+
+ grad_y = grad_y[:, np.newaxis, :]
+ grad_x = np.matmul(grad_y, Jacobs).squeeze(axis=1)
+
+ return grad_x
+
+
+class NumpyLoss:
+
+ def __init__(self):
+ self.target = None
+
+ def get_loss(self, pred, target):
+ self.target = target
+ return (-pred * target).sum(axis=1).mean()
+
+ def backward(self):
+ return -self.target / self.target.shape[0]
+
+
+class NumpyModel:
+ def __init__(self):
+ self.W1 = np.random.normal(size=(28 * 28, 256))
+ self.W2 = np.random.normal(size=(256, 64))
+ self.W3 = np.random.normal(size=(64, 10))
+
+        # The following operators are used in forward and backward
+ self.matmul_1 = Matmul()
+ self.relu_1 = Relu()
+ self.matmul_2 = Matmul()
+ self.relu_2 = Relu()
+ self.matmul_3 = Matmul()
+ self.softmax = Softmax()
+ self.log = Log()
+
+        # The following variables are updated in backward. softmax_grad, log_grad, etc. hold each operator's backward gradient (the partial derivative of the loss w.r.t. that operator's input)
+ self.x1_grad, self.W1_grad = None, None
+ self.relu_1_grad = None
+ self.x2_grad, self.W2_grad = None, None
+ self.relu_2_grad = None
+ self.x3_grad, self.W3_grad = None, None
+ self.softmax_grad = None
+ self.log_grad = None
+
+ self.beta_1 = 0.9
+ self.beta_2 = 0.999
+ self.epsilon = 1e-8
+ self.is_first = True
+
+ self.W1_grad_mean = None
+ self.W2_grad_mean = None
+ self.W3_grad_mean = None
+
+ self.W1_grad_square_mean = None
+ self.W2_grad_square_mean = None
+ self.W3_grad_square_mean = None
+
+ def forward(self, x):
+ x = x.reshape(-1, 28 * 28)
+
+ x = self.relu_1.forward(self.matmul_1.forward(x, self.W1))
+ x = self.relu_2.forward(self.matmul_2.forward(x, self.W2))
+ x = self.matmul_3.forward(x, self.W3)
+
+ x = self.log.forward(self.softmax.forward(x))
+
+ return x
+
+ def backward(self, y):
+ self.log_grad = self.log.backward(y)
+ self.softmax_grad = self.softmax.backward(self.log_grad)
+ self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+ self.relu_2_grad = self.relu_2.backward(self.x3_grad)
+ self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+ self.relu_1_grad = self.relu_1.backward(self.x2_grad)
+ self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+
+ return self.x1_grad
+
+ def optimize(self, learning_rate):
+ def SGD():
+ self.W1 -= learning_rate * self.W1_grad
+ self.W2 -= learning_rate * self.W2_grad
+ self.W3 -= learning_rate * self.W3_grad
+
+ def SGDM():
+ if self.is_first:
+ self.is_first = False
+
+ self.W1_grad_mean = self.W1_grad
+ self.W2_grad_mean = self.W2_grad
+ self.W3_grad_mean = self.W3_grad
+ else:
+ self.W1_grad_mean = self.beta_1 * self.W1_grad_mean + (1 - self.beta_1) * self.W1_grad
+ self.W2_grad_mean = self.beta_1 * self.W2_grad_mean + (1 - self.beta_1) * self.W2_grad
+ self.W3_grad_mean = self.beta_1 * self.W3_grad_mean + (1 - self.beta_1) * self.W3_grad
+
+ delta_1 = learning_rate * self.W1_grad_mean
+ delta_2 = learning_rate * self.W2_grad_mean
+ delta_3 = learning_rate * self.W3_grad_mean
+
+ self.W1 -= delta_1
+ self.W2 -= delta_2
+ self.W3 -= delta_3
+
+ def Adam(learning_rate=0.001):
+ if self.is_first:
+ self.is_first = False
+ self.W1_grad_mean = self.W1_grad
+ self.W2_grad_mean = self.W2_grad
+ self.W3_grad_mean = self.W3_grad
+
+ self.W1_grad_square_mean = np.square(self.W1_grad)
+ self.W2_grad_square_mean = np.square(self.W2_grad)
+ self.W3_grad_square_mean = np.square(self.W3_grad)
+
+ self.W1 -= learning_rate * self.W1_grad_mean
+ self.W2 -= learning_rate * self.W2_grad_mean
+ self.W3 -= learning_rate * self.W3_grad_mean
+ else:
+ self.W1_grad_mean = self.beta_1 * self.W1_grad_mean + (1 - self.beta_1) * self.W1_grad
+ self.W2_grad_mean = self.beta_1 * self.W2_grad_mean + (1 - self.beta_1) * self.W2_grad
+ self.W3_grad_mean = self.beta_1 * self.W3_grad_mean + (1 - self.beta_1) * self.W3_grad
+
+ self.W1_grad_square_mean = self.beta_2 * self.W1_grad_square_mean + (1 - self.beta_2) * np.square(
+ self.W1_grad)
+ self.W2_grad_square_mean = self.beta_2 * self.W2_grad_square_mean + (1 - self.beta_2) * np.square(
+ self.W2_grad)
+ self.W3_grad_square_mean = self.beta_2 * self.W3_grad_square_mean + (1 - self.beta_2) * np.square(
+ self.W3_grad)
+
+ delta_1 = learning_rate * self.W1_grad_mean * np.reciprocal(
+ np.sqrt(self.W1_grad_square_mean) + np.full_like(self.W1_grad_square_mean, self.epsilon))
+ delta_2 = learning_rate * self.W2_grad_mean * np.reciprocal(
+ np.sqrt(self.W2_grad_square_mean) + np.full_like(self.W2_grad_square_mean, self.epsilon))
+ delta_3 = learning_rate * self.W3_grad_mean * np.reciprocal(
+ np.sqrt(self.W3_grad_square_mean) + np.full_like(self.W3_grad_square_mean, self.epsilon))
+
+ self.W1 -= delta_1
+ self.W2 -= delta_2
+ self.W3 -= delta_3
+
+ # SGD()
+ # SGDM()
+ Adam()
diff --git a/assignment-2/submission/18307130090/numpy_mnist.py b/assignment-2/submission/18307130090/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d67f25824dabdc5791ae5cc96655affe8315e72
--- /dev/null
+++ b/assignment-2/submission/18307130090/numpy_mnist.py
@@ -0,0 +1,50 @@
+import numpy as np
+
+from numpy_fnn import NumpyModel, NumpyLoss
+from utils import download_mnist, batch, get_torch_initialization, plot_curve, one_hot
+
+
+def mini_batch(dataset, batch_size=128):
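+ # Stack the dataset into numpy arrays, shuffle an index array, and slice it
+ # into consecutive batches of batch_size (the last batch may be smaller).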
+ data = np.array([np.array(each[0]) for each in dataset])
+ label = np.array([each[1] for each in dataset])
+
+ size = data.shape[0]
+ index = np.arange(size)
+ np.random.shuffle(index)
+
+ return [(data[index[i:i + batch_size]], label[index[i:i + batch_size]]) for i in range(0, size, batch_size)]
+
+
+def numpy_run():
+ train_dataset, test_dataset = download_mnist()
+
+ model = NumpyModel()
+ numpy_loss = NumpyLoss()
+ model.W1, model.W2, model.W3 = get_torch_initialization()
+
+ train_loss = []
+
+ epoch_number = 10
+ learning_rate = 0.1
+
+ for epoch in range(epoch_number):
+ for x, y in mini_batch(train_dataset):
+ y = one_hot(y)
+
+ y_pred = model.forward(x)
+ loss = numpy_loss.get_loss(y_pred, y)
+
+ model.backward(numpy_loss.backward())
+ model.optimize(learning_rate)
+
+ train_loss.append(loss.item())
+
+ x, y = batch(test_dataset)[0]
+ accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
+ print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+
+ plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+ numpy_run()
diff --git a/assignment-2/submission/18307130090/tester_demo.py b/assignment-2/submission/18307130090/tester_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..504b3eef50a6df4d0aa433113136add50835e420
--- /dev/null
+++ b/assignment-2/submission/18307130090/tester_demo.py
@@ -0,0 +1,182 @@
+import numpy as np
+import torch
+from torch import matmul as torch_matmul, relu as torch_relu, softmax as torch_softmax, log as torch_log
+
+from numpy_fnn import Matmul, Relu, Softmax, Log, NumpyModel, NumpyLoss
+from torch_mnist import TorchModel
+from utils import get_torch_initialization, one_hot
+
+err_epsilon = 1e-6
+err_p = 0.4
+
+
+def check_result(numpy_result, torch_result=None):
+ if isinstance(numpy_result, list) and torch_result is None:
+ flag = True
+ for (n, t) in numpy_result:
+ flag = flag and check_result(n, t)
+ return flag
+ # print((torch.from_numpy(numpy_result) - torch_result).abs().mean().item())
+ T = (torch_result * torch.from_numpy(numpy_result) < 0).sum().item()
+ direction = T / torch_result.numel() < err_p
+ return direction and ((torch.from_numpy(numpy_result) - torch_result).abs().mean() < err_epsilon).item()
+
+
+def case_1():
+ x = np.random.normal(size=[5, 6])
+ W = np.random.normal(size=[6, 4])
+
+ numpy_matmul = Matmul()
+ numpy_out = numpy_matmul.forward(x, W)
+ numpy_x_grad, numpy_W_grad = numpy_matmul.backward(np.ones_like(numpy_out))
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+ torch_W = torch.from_numpy(W).clone().requires_grad_()
+
+ torch_out = torch_matmul(torch_x, torch_W)
+ torch_out.sum().backward()
+
+ return check_result([
+ (numpy_out, torch_out),
+ (numpy_x_grad, torch_x.grad),
+ (numpy_W_grad, torch_W.grad)
+ ])
+
+
+def case_2():
+ x = np.random.normal(size=[5, 6])
+
+ numpy_relu = Relu()
+ numpy_out = numpy_relu.forward(x)
+ numpy_x_grad = numpy_relu.backward(np.ones_like(numpy_out))
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+ torch_out = torch_relu(torch_x)
+ torch_out.sum().backward()
+
+ return check_result([
+ (numpy_out, torch_out),
+ (numpy_x_grad, torch_x.grad),
+ ])
+
+
+def case_3():
+ x = np.random.uniform(low=0.0, high=1.0, size=[3, 4])
+
+ numpy_log = Log()
+ numpy_out = numpy_log.forward(x)
+ numpy_x_grad = numpy_log.backward(np.ones_like(numpy_out))
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+ torch_out = torch_log(torch_x)
+ torch_out.sum().backward()
+
+ return check_result([
+ (numpy_out, torch_out),
+
+ (numpy_x_grad, torch_x.grad),
+ ])
+
+
+def case_4():
+ x = np.random.normal(size=[4, 5])
+
+ numpy_softmax = Softmax()
+ numpy_out = numpy_softmax.forward(x)
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+ torch_out = torch_softmax(torch_x, 1)
+
+ return check_result(numpy_out, torch_out)
+
+
+def case_5():
+ x = np.random.normal(size=[20, 25])
+
+ numpy_softmax = Softmax()
+ numpy_out = numpy_softmax.forward(x)
+ numpy_x_grad = numpy_softmax.backward(np.ones_like(numpy_out))
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+ torch_out = torch_softmax(torch_x, 1)
+ torch_out.sum().backward()
+
+ return check_result([
+ (numpy_out, torch_out),
+ (numpy_x_grad, torch_x.grad),
+ ])
+
+
+def test_model():
+ try:
+ numpy_loss = NumpyLoss()
+ numpy_model = NumpyModel()
+ torch_model = TorchModel()
+ torch_model.W1.data, torch_model.W2.data, torch_model.W3.data = get_torch_initialization(numpy=False)
+ numpy_model.W1 = torch_model.W1.detach().clone().numpy()
+ numpy_model.W2 = torch_model.W2.detach().clone().numpy()
+ numpy_model.W3 = torch_model.W3.detach().clone().numpy()
+
+ x = torch.randn((10000, 28, 28))
+ y = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 0] * 1000)
+
+ y = one_hot(y, numpy=False)
+ x2 = x.numpy()
+ y_pred = torch_model.forward(x)
+ loss = (-y_pred * y).sum(dim=1).mean()
+ loss.backward()
+
+ y_pred_numpy = numpy_model.forward(x2)
+ numpy_loss.get_loss(y_pred_numpy, y.numpy())
+
+ check_flag_1 = check_result(y_pred_numpy, y_pred)
+ print("+ {:12} {}/{}".format("forward", 10 * check_flag_1, 10))
+ except:
+ print("[Runtime Error in forward]")
+ print("+ {:12} {}/{}".format("forward", 0, 10))
+ return 0
+
+ try:
+
+ numpy_model.backward(numpy_loss.backward())
+
+ check_flag_2 = [
+ check_result(numpy_model.log_grad, torch_model.log_input.grad),
+ check_result(numpy_model.softmax_grad, torch_model.softmax_input.grad),
+ check_result(numpy_model.W3_grad, torch_model.W3.grad),
+ check_result(numpy_model.W2_grad, torch_model.W2.grad),
+ check_result(numpy_model.W1_grad, torch_model.W1.grad)
+ ]
+ check_flag_2 = sum(check_flag_2) >= 4
+ print("+ {:12} {}/{}".format("backward", 20 * check_flag_2, 20))
+ except:
+ print("[Runtime Error in backward]")
+ print("+ {:12} {}/{}".format("backward", 0, 20))
+ check_flag_2 = False
+
+ return 10 * check_flag_1 + 20 * check_flag_2
+
+
+if __name__ == "__main__":
+ testcases = [
+ ["matmul", case_1, 5],
+ ["relu", case_2, 5],
+ ["log", case_3, 5],
+ ["softmax_1", case_4, 5],
+ ["softmax_2", case_5, 10],
+ ]
+ score = 0
+ for case in testcases:
+ try:
+ res = case[2] if case[1]() else 0
+ except:
+ print("[Runtime Error in {}]".format(case[0]))
+ res = 0
+ score += res
+ print("+ {:12} {}/{}".format(case[0], res, case[2]))
+ score += test_model()
+ print("{:14} {}/60".format("FINAL SCORE", score))
diff --git a/assignment-2/submission/18307130090/torch_mnist.py b/assignment-2/submission/18307130090/torch_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d3e214c7606e3d43dac4b94554f942508afffb3
--- /dev/null
+++ b/assignment-2/submission/18307130090/torch_mnist.py
@@ -0,0 +1,73 @@
+import torch
+from utils import mini_batch, batch, download_mnist, get_torch_initialization, one_hot, plot_curve
+
+
+class TorchModel:
+
+ def __init__(self):
+ self.W1 = torch.randn((28 * 28, 256), requires_grad=True)
+ self.W2 = torch.randn((256, 64), requires_grad=True)
+ self.W3 = torch.randn((64, 10), requires_grad=True)
+ self.softmax_input = None
+ self.log_input = None
+
+ def forward(self, x):
+ x = x.reshape(-1, 28 * 28)
+ x = torch.relu(torch.matmul(x, self.W1))
+ x = torch.relu(torch.matmul(x, self.W2))
+ x = torch.matmul(x, self.W3)
+
+ self.softmax_input = x
+ self.softmax_input.retain_grad()
+
+ x = torch.softmax(x, 1)
+
+ self.log_input = x
+ self.log_input.retain_grad()
+
+ x = torch.log(x)
+
+ return x
+
+ def optimize(self, learning_rate):
+ with torch.no_grad():
+ self.W1 -= learning_rate * self.W1.grad
+ self.W2 -= learning_rate * self.W2.grad
+ self.W3 -= learning_rate * self.W3.grad
+
+ self.W1.grad = None
+ self.W2.grad = None
+ self.W3.grad = None
+
+
+def torch_run():
+ train_dataset, test_dataset = download_mnist()
+
+ model = TorchModel()
+ model.W1.data, model.W2.data, model.W3.data = get_torch_initialization(numpy=False)
+
+ train_loss = []
+
+ epoch_number = 3
+ learning_rate = 0.1
+
+ for epoch in range(epoch_number):
+ for x, y in mini_batch(train_dataset, numpy=False):
+ y = one_hot(y, numpy=False)
+
+ y_pred = model.forward(x)
+ loss = (-y_pred * y).sum(dim=1).mean()
+ loss.backward()
+ model.optimize(learning_rate)
+
+ train_loss.append(loss.item())
+
+ x, y = batch(test_dataset, numpy=False)[0]
+ accuracy = model.forward(x).argmax(dim=1).eq(y).float().mean().item()
+ print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+
+ plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+ torch_run()
diff --git a/assignment-2/submission/18307130090/utils.py b/assignment-2/submission/18307130090/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..709220cfa7a924d914ec1c098c505f864bcd4cfc
--- /dev/null
+++ b/assignment-2/submission/18307130090/utils.py
@@ -0,0 +1,71 @@
+import torch
+import numpy as np
+from matplotlib import pyplot as plt
+
+
+def plot_curve(data):
+ plt.plot(range(len(data)), data, color='blue')
+ plt.legend(['loss_value'], loc='upper right')
+ plt.xlabel('step')
+ plt.ylabel('value')
+ plt.show()
+
+
+def download_mnist():
+ from torchvision import datasets, transforms
+
+ transform = transforms.Compose([
+ transforms.ToTensor(),
+ transforms.Normalize(mean=(0.1307,), std=(0.3081,))
+ ])
+
+ train_dataset = datasets.MNIST(root="./data/", transform=transform, train=True, download=True)
+ test_dataset = datasets.MNIST(root="./data/", transform=transform, train=False, download=True)
+
+ return train_dataset, test_dataset
+
+
+def one_hot(y, numpy=True):
+ if numpy:
+ y_ = np.zeros((y.shape[0], 10))
+ y_[np.arange(y.shape[0], dtype=np.int32), y] = 1
+ return y_
+ else:
+ y_ = torch.zeros((y.shape[0], 10))
+ y_[torch.arange(y.shape[0], dtype=torch.long), y] = 1
+ return y_
+
+
+def batch(dataset, numpy=True):
+ data = []
+ label = []
+ for each in dataset:
+ data.append(each[0])
+ label.append(each[1])
+ data = torch.stack(data)
+ label = torch.LongTensor(label)
+ if numpy:
+ return [(data.numpy(), label.numpy())]
+ else:
+ return [(data, label)]
+
+
+def mini_batch(dataset, batch_size=128, numpy=False):
+ return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+
+def get_torch_initialization(numpy=True):
+ fc1 = torch.nn.Linear(28 * 28, 256)
+ fc2 = torch.nn.Linear(256, 64)
+ fc3 = torch.nn.Linear(64, 10)
+
+ if numpy:
+ W1 = fc1.weight.T.detach().clone().numpy()
+ W2 = fc2.weight.T.detach().clone().numpy()
+ W3 = fc3.weight.T.detach().clone().numpy()
+ else:
+ W1 = fc1.weight.T.detach().clone().data
+ W2 = fc2.weight.T.detach().clone().data
+ W3 = fc3.weight.T.detach().clone().data
+
+ return W1, W2, W3
diff --git a/assignment-2/submission/18307130104/README.md b/assignment-2/submission/18307130104/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d1d38cfc70c1a72658e9d0fa1cf8569687ab9e45
--- /dev/null
+++ b/assignment-2/submission/18307130104/README.md
@@ -0,0 +1,179 @@
+18307130104
+
+# Course Report
+
+This is the course report for prml assignment-2. My code is in the code 1 ~ code 7 sections of numpy_fnn.py and in the numpy == True branch of the mini_batch function in utils.py.
+
+In assignment-2 I implemented the backward pass of every operator in numpy_fnn.py and built a simple feed-forward neural network (both the forward and the backward pass). I also modified mini_batch so that, when numpy == True, shuffling and batching the dataset no longer relies on torch's DataLoader.
+
+## Model Implementation
+
+To distinguish matrix multiplication (np.matmul) from element-wise multiplication (\*), $\times$ denotes matrix multiplication below and \* denotes element-wise multiplication.
+
+### Backward pass of the Matmul operator
+
+The Matmul operator takes an input X and a weight W and outputs $$[Y] = [X] \times [W]$$
+
+For an element $$Y_{ij}$$ of Y we have $$Y_{ij}=\sum_{k}X_{ik} * W_{kj}$$
+
+When computing grad_x, grad_y is already known, and the chain rule gives $gradx_{ij}=\sum_{k}\frac{\partial Y_{ik}}{\partial X_{ij}} * grady_{ik}$
+
+From the formula for $Y_{ij}$ it follows that $\frac{\partial Y_{ik}}{\partial X_{ij}}=W_{jk}$
+
+Hence $gradx_{ij}=\sum_k W_{jk} * grady_{ik}$
+
+So $[gradx] = [grady] \times [W^T]$
+
+Similarly, $[gradW]=[X^T]\times [grady]$
+
+A quick check confirms that the matrix shapes are consistent with the rules of matrix multiplication.
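+
+The two formulas can also be sanity-checked numerically. Below is a minimal sketch (not part of the submitted code; the names, shapes, and the choice $L=\sum_{ij}Y_{ij}$, so that every entry of grady is 1, are purely illustrative) comparing $[grady]\times[W^T]$ with a finite-difference approximation:
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+x = rng.normal(size=(5, 6))
+W = rng.normal(size=(6, 4))
+grad_y = np.ones((5, 4))                  # pretend dL/dY = 1 everywhere, i.e. L = Y.sum()
+
+grad_x = grad_y @ W.T                     # [grady] x [W^T]
+grad_W = x.T @ grad_y                     # [X^T] x [grady]
+
+# finite-difference check of grad_x
+eps = 1e-6
+num_grad_x = np.zeros_like(x)
+for i in range(x.shape[0]):
+    for j in range(x.shape[1]):
+        xp, xm = x.copy(), x.copy()
+        xp[i, j] += eps
+        xm[i, j] -= eps
+        num_grad_x[i, j] = ((xp @ W).sum() - (xm @ W).sum()) / (2 * eps)
+
+print(np.abs(grad_x - num_grad_x).max())  # should be ~0 up to floating-point error
+```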
+
+### Backward pass of the Relu operator
+
+The relu function is defined as
+
+$relu(x) = \begin{cases}0 & x < 0 \\\\ x & otherwise \end{cases}$
+
+Differentiating gives
+
+$relu^{'}(x) = \begin{cases}0 & x < 0 \\\\ 1 & otherwise \end{cases}$
+
+Hence
+
+$[relugrad]=[grady]* [relu^{'}]$
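+
+A minimal sketch of this rule (illustrative only, not the submitted code):
+
+```python
+import numpy as np
+
+def relu_backward(grad_y, x):
+    # dL/dx = dL/dy where x > 0, and 0 elsewhere
+    return grad_y * (x > 0)
+
+x = np.array([[-1.0, 2.0], [0.5, -3.0]])
+print(relu_backward(np.ones_like(x), x))   # [[0. 1.] [1. 0.]]
+```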
+
+### Backward pass of the Log operator
+
+$log(x) = \ln x$
+
+It follows that
+
+$log^{'}(x)=\frac 1 x$
+
+Hence
+
+$[loggrad]=[grady]* [log^{'}]$
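+
+A minimal sketch of this rule; the epsilon guard mirrors the one used in the submitted forward pass and is only illustrative:
+
+```python
+import numpy as np
+
+EPS = 1e-12  # illustrative constant; the submission keeps it as self.epsilon
+
+def log_backward(grad_y, x):
+    # dL/dx_i = dL/dy_i * 1 / x_i, guarded against division by zero
+    return grad_y / (x + EPS)
+
+x = np.array([[0.5, 0.25]])
+print(log_backward(np.ones_like(x), x))   # approximately [[2. 4.]]
+```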
+
+### Backward pass of the softmax operator
+
+$softmax(x_i) = \frac {e^{x_i}}{\sum_j e^{x_j}}$
+
+In the implementation, each row corresponds to one data point, so softmax is applied row by row to give every data point its own class distribution.
+
+Combining the softmax operator with the cross-entropy loss makes the gradient of the loss very simple, but unfortunately the required network keeps the backward passes of the two operators separate, so there is no shortcut and the computation has to be carried out step by step.
+
+For convenience, let $a_i = softmax(x_i)$.
+
+Now consider the derivative of $a_i$ with respect to $x_j$.
+
+$a_i = \frac{e^{x_i}}{\sum_k e^{x_k}}$
+
+$\frac {\partial a_i}{\partial x_j}=\frac{\partial}{\partial x_j}(\frac{e^{x_i}}{\sum_k e^{x_k}})$
+
+We distinguish the cases i == j and i != j.
+
+If i == j, then $\frac{\partial}{\partial x_j}(\frac{e^{x_i}}{\sum_k e^{x_k}})=\frac{e^{x_i}(\sum_k e^{x_k})-e^{x_i}e^{x_i}}{(\sum_k e^{x_k})^2}=a_i(1-a_i)$
+
+If i != j, then $\frac{\partial}{\partial x_j}(\frac{e^{x_i}}{\sum_k e^{x_k}})=-\frac{e^{x_i}e^{x_j}}{(\sum_k e^{x_k})^2}=-a_ia_j$
+
+Combining this with grady gives
+
+$gradx_{ij}=\sum_k \frac{\partial a_{ik}}{\partial x_{ij}} \, grady_{ik}$
+
+Since this gradient requires a case split, I did not find a way to compute it directly with numpy functions, so the result is first built as a list and then converted to an ndarray before returning.
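+
+For reference, the case split can be folded into a single expression: per row, $gradx_j = a_j\,(grady_j - \sum_k grady_k\,a_k)$, which is the same as multiplying $grady$ by the Jacobian $D = diag(a) - a\,a^T$. A minimal sketch of this alternative (not the submitted code) is:
+
+```python
+import numpy as np
+
+def softmax_backward_vec(grad_y, a):
+    # a: softmax output, shape (N, c); grad_y: dL/da with the same shape
+    # per row: dL/dx_j = a_j * (dL/da_j - sum_k dL/da_k * a_k)
+    return a * (grad_y - np.sum(grad_y * a, axis=1, keepdims=True))
+
+# tiny check against the explicit per-row Jacobian D = diag(a) - outer(a, a)
+a = np.array([[0.2, 0.3, 0.5]])
+g = np.array([[1.0, 0.0, -1.0]])
+D = np.diag(a[0]) - np.outer(a[0], a[0])
+print(np.allclose(softmax_backward_vec(g, a), g @ D))   # True
+```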
+
+### Model forward pass
+
+The output of each layer is fed into the next layer, and the final output is the log of the softmax result, which makes the cross-entropy loss easy to compute. As the analysis in "Model backward pass" below shows, this design also keeps the input to the backward pass very simple.
+
+### Model backward pass
+
+The backward pass of the model takes a matrix whose rows are one-hot vectors indicating the class of each sample. The starter code divided every element of this matrix by the total size of the matrix, but after some experiments it turned out that dividing by the number of training samples is what gives the correct result. ~~Also, although the test passed, the output of the softmax layer differed from the torch result while the outputs of the later layers were correct, so I concluded that my understanding of the softmax layer differs somewhat from torch's implementation.~~
+
+After the test code was updated, the output is close to the torch result and can be considered correct.
+
+Next, we derive the input fed to the Log layer during back-propagation.
+
+The cross-entropy loss has the form
+
+$Loss = -\sum_k t_k*\ln a_k$
+
+where $t_k$ indicates whether the sample belongs to class k, $a_k$ is the output of the softmax layer, and the output of the Log layer is $\ln a_k$, so $\frac{\partial Loss}{\partial \ln a_k}=-t_k$
+
+Therefore, the negated one-hot matrix T is used as the backward input to the Log layer, and the result is then propagated backwards, layer by layer, as the input to each preceding layer.
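+
+A minimal sketch of this starting gradient (the division by the number of rows N corresponds to the mean over the batch taken in the loss; names are illustrative):
+
+```python
+import numpy as np
+
+def loss_backward(target):
+    # target: one-hot matrix, shape (N, c)
+    # Loss = mean_n( -sum_k t_k * log a_k ), so dLoss/d(log a) = -T / N
+    return -target / target.shape[0]
+
+T = np.eye(10)[[3, 1]]            # two samples, classes 3 and 1
+print(loss_backward(T)[0, 3])     # -0.5
+```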
+
+## Model Training
+
+The accuracy over training epochs is shown below.
+
+learning_rate = 0.1 mini_batch = 128
+
+> [0] Accuracy: 0.9403
+> [1] Accuracy: 0.9641
+> [2] Accuracy: 0.9716
+> [3] Accuracy: 0.9751
+> [4] Accuracy: 0.9772
+> [5] Accuracy: 0.9782
+> [6] Accuracy: 0.9745
+> [7] Accuracy: 0.9807
+> [8] Accuracy: 0.9790
+> [9] Accuracy: 0.9811
+
+The loss over the training steps is shown in the figure below.
+
+
+
+As the results show, the accuracy rises steadily during training; after about 6 epochs it is basically stable, with only slight fluctuations.
+
+learning_rate = 0.1 mini_batch = 32
+
+> [0] Accuracy: 0.9646
+> [1] Accuracy: 0.9726
+> [2] Accuracy: 0.9768
+> [3] Accuracy: 0.9788
+> [4] Accuracy: 0.9792
+> [5] Accuracy: 0.9770
+> [6] Accuracy: 0.9820
+> [7] Accuracy: 0.9808
+> [8] Accuracy: 0.9822
+> [9] Accuracy: 0.9835
+
+
+
+With the mini_batch size reduced from 128 to 32, the loss fluctuates noticeably more from step to step.
+
+learning_rate = 0.2 mini_batch = 128
+
+> [0] Accuracy: 0.9295
+> [1] Accuracy: 0.9688
+> [2] Accuracy: 0.9753
+> [3] Accuracy: 0.9734
+> [4] Accuracy: 0.9793
+> [5] Accuracy: 0.9777
+> [6] Accuracy: 0.9792
+> [7] Accuracy: 0.9807
+> [8] Accuracy: 0.9821
+> [9] Accuracy: 0.9815
+
+
+
+Although the learning rate was increased, the loss does not fluctuate much more and the model still performs very well.
+
+learning_rate = 0.05 mini_batch = 128
+
+> [0] Accuracy: 0.9310
+> [1] Accuracy: 0.9504
+> [2] Accuracy: 0.9601
+> [3] Accuracy: 0.9661
+> [4] Accuracy: 0.9691
+> [5] Accuracy: 0.9728
+> [6] Accuracy: 0.9749
+> [7] Accuracy: 0.9761
+> [8] Accuracy: 0.9768
+> [9] Accuracy: 0.9752
+
+
+
+With a lower learning rate the accuracy grows more slowly, but after a few epochs the result is roughly the same as with the higher learning rate.
+
+Overall, the final accuracy is determined mainly by the learning capacity of the model itself; changing the learning rate and the mini_batch size within a reasonable range has little effect on the result. Training with mini-batches does help reduce the fluctuation of the loss during training.
\ No newline at end of file
diff --git a/assignment-2/submission/18307130104/img/result-1.png b/assignment-2/submission/18307130104/img/result-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..11c6fba6be9d6f58a463830a5d8c006ad64af963
Binary files /dev/null and b/assignment-2/submission/18307130104/img/result-1.png differ
diff --git a/assignment-2/submission/18307130104/img/result-2.png b/assignment-2/submission/18307130104/img/result-2.png
new file mode 100644
index 0000000000000000000000000000000000000000..3f9aa1a2ed643f738f7d9ff59ea1923891048166
Binary files /dev/null and b/assignment-2/submission/18307130104/img/result-2.png differ
diff --git a/assignment-2/submission/18307130104/img/result-3.png b/assignment-2/submission/18307130104/img/result-3.png
new file mode 100644
index 0000000000000000000000000000000000000000..1e7d29f9f43741b83d6ac43ecf4b6c448c8c1141
Binary files /dev/null and b/assignment-2/submission/18307130104/img/result-3.png differ
diff --git a/assignment-2/submission/18307130104/img/result-4.png b/assignment-2/submission/18307130104/img/result-4.png
new file mode 100644
index 0000000000000000000000000000000000000000..2a1f550db001bdcc1d3a3b9501dba56a13028e8e
Binary files /dev/null and b/assignment-2/submission/18307130104/img/result-4.png differ
diff --git a/assignment-2/submission/18307130104/img/result-5.png b/assignment-2/submission/18307130104/img/result-5.png
new file mode 100644
index 0000000000000000000000000000000000000000..7ee7df630e01d83559e9f316a937df107e98248d
Binary files /dev/null and b/assignment-2/submission/18307130104/img/result-5.png differ
diff --git a/assignment-2/submission/18307130104/img/result.png b/assignment-2/submission/18307130104/img/result.png
new file mode 100644
index 0000000000000000000000000000000000000000..0039ef8029c07eeb75caa2efd42c13aeba61ce5a
Binary files /dev/null and b/assignment-2/submission/18307130104/img/result.png differ
diff --git a/assignment-2/submission/18307130104/numpy_fnn.py b/assignment-2/submission/18307130104/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba780e9edb71ec687ddf7d295973be810848ce79
--- /dev/null
+++ b/assignment-2/submission/18307130104/numpy_fnn.py
@@ -0,0 +1,214 @@
+import numpy as np
+
+
+class NumpyOp:
+
+ def __init__(self):
+ self.memory = {}
+ self.epsilon = 1e-12
+
+
+class Matmul(NumpyOp):
+
+ def forward(self, x, W):
+ """
+ x: shape(N, d)
+ w: shape(d, d')
+ """
+ self.memory['x'] = x
+ self.memory['W'] = W
+ h = np.matmul(x, W)
+ return h
+
+ def backward(self, grad_y):
+ """
+ grad_y: shape(N, d')
+ """
+
+ ####################
+ # code 1 #
+ ####################
+ grad_x = np.matmul(grad_y, self.memory['W'].T)
+ grad_W = np.matmul(self.memory['x'].T, grad_y)
+
+ return grad_x, grad_W
+
+
+class Relu(NumpyOp):
+
+ def forward(self, x):
+ self.memory['x'] = x
+ return np.where(x > 0, x, np.zeros_like(x))
+
+ def backward(self, grad_y):
+ """
+ grad_y: same shape as x
+ """
+
+ ####################
+ # code 2 #
+ ####################
+ grad_x = grad_y * np.where(self.memory['x'] > 0, np.ones_like(self.memory['x']), np.zeros_like(self.memory['x']))
+
+ return grad_x
+
+
+class Log(NumpyOp):
+
+ def forward(self, x):
+ """
+ x: shape(N, c)
+ """
+
+ out = np.log(x + self.epsilon)
+ self.memory['x'] = x
+
+ return out
+
+ def backward(self, grad_y):
+ """
+ grad_y: same shape as x
+ """
+
+ ####################
+ # code 3 #
+ ####################
+ grad_x = grad_y * np.reciprocal(self.memory['x'] + self.epsilon)
+
+ return grad_x
+
+
+class Softmax(NumpyOp):
+ """
+ softmax over last dimension
+ """
+
+ def forward(self, x):
+ """
+ x: shape(N, c)
+ """
+
+ ####################
+ # code 4 #
+ ####################
+ self.memory['x'] = x
+ expx = np.exp(x)
+ sumx = np.sum(expx, axis = 1, keepdims = True)
+ return (expx / sumx)
+
+ def backward(self, grad_y):
+ """
+ grad_y: same shape as x
+ """
+
+ ####################
+ # code 5 #
+ ####################
+
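+ # Explicit case split over output indices: for each sample i,
+ # dL/dx[i][j] = sum_k grad_y[i][k] * softx[i][k] * ((j == k) - softx[i][j])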
+ x = self.memory['x']
+ softx = self.forward(x)
+ # print(sumx.shape)
+ [n, m] = x.shape
+ out = []
+ # print(grad_y)
+ for i in range(n):
+ out.append([])
+ for j in range(m):
+ out[i].append(0)
+ for k in range(m):
+ if j == k:
+ # print(softx[i][k], grad_y[i][k])
+ out[i][j] += (1 - softx[i][k]) * softx[i][k] * grad_y[i][k]
+ else:
+ out[i][j] += -softx[i][j] * softx[i][k] * grad_y[i][k]
+ grad_x = np.array(out)
+
+ return grad_x
+
+
+class NumpyLoss:
+
+ def __init__(self):
+ self.target = None
+
+ def get_loss(self, pred, target):
+ self.target = target
+ return (-pred * target).sum(axis=1).mean()
+
+ def backward(self):
+ return -self.target / self.target.shape[0]
+
+
+class NumpyModel:
+ def __init__(self):
+ self.W1 = np.random.normal(size=(28 * 28, 256))
+ self.W2 = np.random.normal(size=(256, 64))
+ self.W3 = np.random.normal(size=(64, 10))
+
+ # The following operators are used in forward and backward
+ self.matmul_1 = Matmul()
+ self.relu_1 = Relu()
+ self.matmul_2 = Matmul()
+ self.relu_2 = Relu()
+ self.matmul_3 = Matmul()
+ self.softmax = Softmax()
+ self.log = Log()
+
+ # The following variables are updated in backward(); softmax_grad, log_grad, etc. are the gradients of each operator's backward pass (the partial derivative of the loss w.r.t. that operator's input)
+ self.x1_grad, self.W1_grad = None, None
+ self.relu_1_grad = None
+ self.x2_grad, self.W2_grad = None, None
+ self.relu_2_grad = None
+ self.x3_grad, self.W3_grad = None, None
+ self.softmax_grad = None
+ self.log_grad = None
+
+ def forward(self, x):
+ x = x.reshape(-1, 28 * 28)
+
+ ####################
+ # code 6 #
+ ####################
+ x = self.matmul_1.forward(x, self.W1)
+ x = self.relu_1.forward(x)
+ x = self.matmul_2.forward(x, self.W2)
+ x = self.relu_2.forward(x)
+ x = self.matmul_3.forward(x, self.W3)
+ x = self.softmax.forward(x)
+ # print(x)
+ x = self.log.forward(x)
+
+ return x
+
+ def backward(self, y):
+
+ ####################
+ # code 7 #
+ ####################
+
+ y = self.log.backward(y)
+ self.log_grad = y
+
+ y = self.softmax.backward(y)
+ self.softmax_grad = y
+
+ y, self.W3_grad = self.matmul_3.backward(y)
+ self.x3_grad = y
+
+ y = self.relu_2.backward(y)
+ self.relu_2_grad = y
+
+ y, self.W2_grad = self.matmul_2.backward(y)
+ self.x2_grad = y
+
+ y = self.relu_1.backward(y)
+ self.relu_1_grad = y
+
+ y, self.W1_grad = self.matmul_1.backward(y)
+ self.x1_grad = y
+ return y
+
+ def optimize(self, learning_rate):
+ self.W1 -= learning_rate * self.W1_grad
+ self.W2 -= learning_rate * self.W2_grad
+ self.W3 -= learning_rate * self.W3_grad
diff --git a/assignment-2/submission/18307130104/numpy_mnist.py b/assignment-2/submission/18307130104/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f7aaadd84d701b578d384df3d4976f5c76a5dfa
--- /dev/null
+++ b/assignment-2/submission/18307130104/numpy_mnist.py
@@ -0,0 +1,38 @@
+import numpy as np
+from numpy_fnn import NumpyModel, NumpyLoss
+from utils import download_mnist, batch, mini_batch, get_torch_initialization, plot_curve, one_hot
+
+
+def numpy_run():
+ train_dataset, test_dataset = download_mnist()
+
+ model = NumpyModel()
+ numpy_loss = NumpyLoss()
+ model.W1, model.W2, model.W3 = get_torch_initialization()
+
+ train_loss = []
+
+ epoch_number = 3
+ learning_rate = 0.1
+
+ for epoch in range(epoch_number):
+ for x, y in mini_batch(train_dataset, 128, True):
+ y = one_hot(y)
+
+ y_pred = model.forward(x)
+ loss = numpy_loss.get_loss(y_pred, y)
+
+ model.backward(numpy_loss.backward())
+ model.optimize(learning_rate)
+
+ train_loss.append(loss.item())
+
+ x, y = batch(test_dataset)[0]
+ accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
+ print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+
+ plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+ numpy_run()
diff --git a/assignment-2/submission/18307130104/tester_demo.py b/assignment-2/submission/18307130104/tester_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..df4bb27bc0d8b9f28f5abd09faff7635d8347792
--- /dev/null
+++ b/assignment-2/submission/18307130104/tester_demo.py
@@ -0,0 +1,183 @@
+import numpy as np
+import torch
+from torch import matmul as torch_matmul, relu as torch_relu, softmax as torch_softmax, log as torch_log
+
+from numpy_fnn import Matmul, Relu, Softmax, Log, NumpyModel, NumpyLoss
+from torch_mnist import TorchModel
+from utils import get_torch_initialization, one_hot
+
+err_epsilon = 1e-6
+err_p = 0.4
+
+
+def check_result(numpy_result, torch_result=None):
+ if isinstance(numpy_result, list) and torch_result is None:
+ flag = True
+ for (n, t) in numpy_result:
+ flag = flag and check_result(n, t)
+ return flag
+ # print((torch.from_numpy(numpy_result) - torch_result).abs().mean().item())
+ T = (torch_result * torch.from_numpy(numpy_result) < 0).sum().item()
+ direction = T / torch_result.numel() < err_p
+
+ return direction and ((torch.from_numpy(numpy_result) - torch_result).abs().mean() < err_epsilon).item()
+
+
+def case_1():
+ x = np.random.normal(size=[5, 6])
+ W = np.random.normal(size=[6, 4])
+
+ numpy_matmul = Matmul()
+ numpy_out = numpy_matmul.forward(x, W)
+ numpy_x_grad, numpy_W_grad = numpy_matmul.backward(np.ones_like(numpy_out))
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+ torch_W = torch.from_numpy(W).clone().requires_grad_()
+
+ torch_out = torch_matmul(torch_x, torch_W)
+ torch_out.sum().backward()
+
+ return check_result([
+ (numpy_out, torch_out),
+ (numpy_x_grad, torch_x.grad),
+ (numpy_W_grad, torch_W.grad)
+ ])
+
+
+def case_2():
+ x = np.random.normal(size=[5, 6])
+
+ numpy_relu = Relu()
+ numpy_out = numpy_relu.forward(x)
+ numpy_x_grad = numpy_relu.backward(np.ones_like(numpy_out))
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+ torch_out = torch_relu(torch_x)
+ torch_out.sum().backward()
+
+ return check_result([
+ (numpy_out, torch_out),
+ (numpy_x_grad, torch_x.grad),
+ ])
+
+
+def case_3():
+ x = np.random.uniform(low=0.0, high=1.0, size=[3, 4])
+
+ numpy_log = Log()
+ numpy_out = numpy_log.forward(x)
+ numpy_x_grad = numpy_log.backward(np.ones_like(numpy_out))
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+ torch_out = torch_log(torch_x)
+ torch_out.sum().backward()
+
+ return check_result([
+ (numpy_out, torch_out),
+
+ (numpy_x_grad, torch_x.grad),
+ ])
+
+
+def case_4():
+ x = np.random.normal(size=[4, 5])
+
+ numpy_softmax = Softmax()
+ numpy_out = numpy_softmax.forward(x)
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+ torch_out = torch_softmax(torch_x, 1)
+
+ return check_result(numpy_out, torch_out)
+
+
+def case_5():
+ x = np.random.normal(size=[20, 25])
+
+ numpy_softmax = Softmax()
+ numpy_out = numpy_softmax.forward(x)
+ numpy_x_grad = numpy_softmax.backward(np.ones_like(numpy_out))
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+ torch_out = torch_softmax(torch_x, 1)
+ torch_out.sum().backward()
+
+ return check_result([
+ (numpy_out, torch_out),
+ (numpy_x_grad, torch_x.grad),
+ ])
+
+
+def test_model():
+ try:
+ numpy_loss = NumpyLoss()
+ numpy_model = NumpyModel()
+ torch_model = TorchModel()
+ torch_model.W1.data, torch_model.W2.data, torch_model.W3.data = get_torch_initialization(numpy=False)
+ numpy_model.W1 = torch_model.W1.detach().clone().numpy()
+ numpy_model.W2 = torch_model.W2.detach().clone().numpy()
+ numpy_model.W3 = torch_model.W3.detach().clone().numpy()
+
+ x = torch.randn((10000, 28, 28))
+ y = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 0] * 1000)
+
+ y = one_hot(y, numpy=False)
+ x2 = x.numpy()
+ y_pred = torch_model.forward(x)
+ loss = (-y_pred * y).sum(dim=1).mean()
+ loss.backward()
+
+ y_pred_numpy = numpy_model.forward(x2)
+ numpy_loss.get_loss(y_pred_numpy, y.numpy())
+
+ check_flag_1 = check_result(y_pred_numpy, y_pred)
+ print("+ {:12} {}/{}".format("forward", 10 * check_flag_1, 10))
+ except:
+ print("[Runtime Error in forward]")
+ print("+ {:12} {}/{}".format("forward", 0, 10))
+ return 0
+
+ try:
+
+ numpy_model.backward(numpy_loss.backward())
+
+ check_flag_2 = [
+ check_result(numpy_model.log_grad, torch_model.log_input.grad),
+ check_result(numpy_model.softmax_grad, torch_model.softmax_input.grad),
+ check_result(numpy_model.W3_grad, torch_model.W3.grad),
+ check_result(numpy_model.W2_grad, torch_model.W2.grad),
+ check_result(numpy_model.W1_grad, torch_model.W1.grad)
+ ]
+ check_flag_2 = sum(check_flag_2) >= 4
+ print("+ {:12} {}/{}".format("backward", 20 * check_flag_2, 20))
+ except:
+ print("[Runtime Error in backward]")
+ print("+ {:12} {}/{}".format("backward", 0, 20))
+ check_flag_2 = False
+
+ return 10 * check_flag_1 + 20 * check_flag_2
+
+
+if __name__ == "__main__":
+ testcases = [
+ ["matmul", case_1, 5],
+ ["relu", case_2, 5],
+ ["log", case_3, 5],
+ ["softmax_1", case_4, 5],
+ ["softmax_2", case_5, 10],
+ ]
+ score = 0
+ for case in testcases:
+ try:
+ res = case[2] if case[1]() else 0
+ except:
+ print("[Runtime Error in {}]".format(case[0]))
+ res = 0
+ score += res
+ print("+ {:12} {}/{}".format(case[0], res, case[2]))
+ score += test_model()
+ print("{:14} {}/60".format("FINAL SCORE", score))
diff --git a/assignment-2/submission/18307130104/torch_mnist.py b/assignment-2/submission/18307130104/torch_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d3e214c7606e3d43dac4b94554f942508afffb3
--- /dev/null
+++ b/assignment-2/submission/18307130104/torch_mnist.py
@@ -0,0 +1,73 @@
+import torch
+from utils import mini_batch, batch, download_mnist, get_torch_initialization, one_hot, plot_curve
+
+
+class TorchModel:
+
+ def __init__(self):
+ self.W1 = torch.randn((28 * 28, 256), requires_grad=True)
+ self.W2 = torch.randn((256, 64), requires_grad=True)
+ self.W3 = torch.randn((64, 10), requires_grad=True)
+ self.softmax_input = None
+ self.log_input = None
+
+ def forward(self, x):
+ x = x.reshape(-1, 28 * 28)
+ x = torch.relu(torch.matmul(x, self.W1))
+ x = torch.relu(torch.matmul(x, self.W2))
+ x = torch.matmul(x, self.W3)
+
+ self.softmax_input = x
+ self.softmax_input.retain_grad()
+
+ x = torch.softmax(x, 1)
+
+ self.log_input = x
+ self.log_input.retain_grad()
+
+ x = torch.log(x)
+
+ return x
+
+ def optimize(self, learning_rate):
+ with torch.no_grad():
+ self.W1 -= learning_rate * self.W1.grad
+ self.W2 -= learning_rate * self.W2.grad
+ self.W3 -= learning_rate * self.W3.grad
+
+ self.W1.grad = None
+ self.W2.grad = None
+ self.W3.grad = None
+
+
+def torch_run():
+ train_dataset, test_dataset = download_mnist()
+
+ model = TorchModel()
+ model.W1.data, model.W2.data, model.W3.data = get_torch_initialization(numpy=False)
+
+ train_loss = []
+
+ epoch_number = 3
+ learning_rate = 0.1
+
+ for epoch in range(epoch_number):
+ for x, y in mini_batch(train_dataset, numpy=False):
+ y = one_hot(y, numpy=False)
+
+ y_pred = model.forward(x)
+ loss = (-y_pred * y).sum(dim=1).mean()
+ loss.backward()
+ model.optimize(learning_rate)
+
+ train_loss.append(loss.item())
+
+ x, y = batch(test_dataset, numpy=False)[0]
+ accuracy = model.forward(x).argmax(dim=1).eq(y).float().mean().item()
+ print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+
+ plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+ torch_run()
diff --git a/assignment-2/submission/18307130104/utils.py b/assignment-2/submission/18307130104/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..274566a51dc9718158d63b6aa59546381d939223
--- /dev/null
+++ b/assignment-2/submission/18307130104/utils.py
@@ -0,0 +1,83 @@
+import torch
+import numpy as np
+from matplotlib import pyplot as plt
+
+def plot_curve(data):
+ plt.plot(range(len(data)), data, color='blue')
+ plt.legend(['loss_value'], loc='upper right')
+ plt.xlabel('step')
+ plt.ylabel('value')
+ plt.xlim(-100,5000)
+ plt.savefig('./img/result.png')
+ plt.show()
+ plt.close()
+
+
+def download_mnist():
+ from torchvision import datasets, transforms
+
+ transform = transforms.Compose([
+ transforms.ToTensor(),
+ transforms.Normalize(mean=(0.1307,), std=(0.3081,))
+ ])
+
+ train_dataset = datasets.MNIST(root="./data/", transform=transform, train=True, download=True)
+ test_dataset = datasets.MNIST(root="./data/", transform=transform, train=False, download=True)
+
+ return train_dataset, test_dataset
+
+
+def one_hot(y, numpy=True):
+ if numpy:
+ y_ = np.zeros((y.shape[0], 10))
+ y_[np.arange(y.shape[0], dtype=np.int32), y] = 1
+ return y_
+ else:
+ y_ = torch.zeros((y.shape[0], 10))
+ y_[torch.arange(y.shape[0], dtype=torch.long), y] = 1
+ return y_
+
+
+def batch(dataset, numpy=True):
+ data = []
+ label = []
+ for each in dataset:
+ data.append(each[0])
+ label.append(each[1])
+ data = torch.stack(data)
+ label = torch.LongTensor(label)
+ if numpy:
+ return [(data.numpy(), label.numpy())]
+ else:
+ return [(data, label)]
+
+
+def mini_batch(dataset, batch_size=128, numpy=False):
+ if numpy:
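+ # numpy == True: shuffle the (data, label) pairs with random.shuffle, then
+ # group them into python lists of length batch_size before converting to ndarrays.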
+ import random
+ datas = [(each[0].numpy(), each[1]) for each in dataset]
+ random.shuffle(datas)
+ datat = [each[0] for each in datas]
+ labelt = [each[1] for each in datas]
+ data = [np.array(datat[i: i + batch_size]) for i in range(0, len(datat), batch_size)]
+ label = [np.array(labelt[i: i + batch_size]) for i in range(0, len(datat), batch_size)]
+ return zip(data, label)
+ else:
+ return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+
+def get_torch_initialization(numpy=True):
+ fc1 = torch.nn.Linear(28 * 28, 256)
+ fc2 = torch.nn.Linear(256, 64)
+ fc3 = torch.nn.Linear(64, 10)
+
+ if numpy:
+ W1 = fc1.weight.T.detach().clone().numpy()
+ W2 = fc2.weight.T.detach().clone().numpy()
+ W3 = fc3.weight.T.detach().clone().numpy()
+ else:
+ W1 = fc1.weight.T.detach().clone().data
+ W2 = fc2.weight.T.detach().clone().data
+ W3 = fc3.weight.T.detach().clone().data
+
+ return W1, W2, W3
diff --git a/assignment-2/submission/18307130116/README.md b/assignment-2/submission/18307130116/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..60d6a7aaf412e4f028a1124ff7cc63b243e2c2d7
--- /dev/null
+++ b/assignment-2/submission/18307130116/README.md
@@ -0,0 +1,160 @@
+# FNN Implementation
+
+[toc]
+
+## Model Implementation
+
+The implementation of each operator is described in the [derivation of operator derivatives](##算子导数推导) section; the network structure is shown in the figure below.
+
+
+
+Following the model in the figure above, the operators are chained together in order, and during back-propagation the gradient is passed back layer by layer starting from the loss. There is nothing particularly difficult here; the final model computes the function
+
+$log(softmax(W_3\sigma(W_2\sigma(W_1X))))$
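+
+As a reading aid, here is a minimal numpy sketch of this composition, written with the row-vector convention used in the code (products appear as $xW$ rather than $Wx$); all names are illustrative:
+
+```python
+import numpy as np
+
+def fnn_forward(x, W1, W2, W3):
+    # log(softmax(relu(relu(x W1) W2) W3)), computed row by row
+    relu = lambda z: np.maximum(z, 0.0)
+    h = relu(relu(x @ W1) @ W2) @ W3
+    a = np.exp(h - h.max(axis=1, keepdims=True))   # numerically stable softmax
+    a /= a.sum(axis=1, keepdims=True)
+    return np.log(a)
+```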
+
+## Model Training
+
+Running the provided `numpy_mnist.py` for three epochs, the accuracy and the loss evolve as follows
+
+| epoch | Accuracy |
+| ----- | -------- |
+| 0 | 94.49% |
+| 1 | 96.47% |
+| 2 | 96.58% |
+
+
+
+### Effect of learning rate and number of epochs
+
+I observed that once the loss drops into a certain range it starts to oscillate, presumably because the learning rate is too large near the optimum. To obtain better performance I reduced the learning rate and increased the number of epochs, and I also ran a comparison that only increases the number of epochs while keeping the learning rate unchanged. The results are shown below, where the row labelled i is the median over epochs [(i-1)\*5, i\*5) and 20 is the final result.
+
+| epoch | Accuracy(learning_rate = 0.1) | Accuracy(learning_rate = 0.05) | Accuracy(learning_rate = 0.1+0.05) |
+| ----- | ----------------------------- | ------------------------------ | ---------------------------------- |
+| 0 | 97.27% | 95.85% | 96.59% |
+| 5 | 97.93% | 97.85% | 97.91% |
+| 10 | 98.03% | 98.03% | 98.18% |
+| 15 | 98.12% | 98.09% | 98.18% |
+| 20 | 98.12% | 98.19% | 98.18% |
+
+