diff --git a/assignment-2/submission/18307130252/README.md b/assignment-2/submission/18307130252/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..95e07a57b00f09445e88e39cfb8cc8a281d410d0
--- /dev/null
+++ b/assignment-2/submission/18307130252/README.md
@@ -0,0 +1,343 @@
+# Assignment-2 FNN
+
+- [Assignment-2 FNN](#assignment-2-fnn)
+  - [1. Backpropagation Formulas for the FNN Operators](#1-backpropagation-formulas-for-the-fnn-operators)
+    - [1.1 Matmul](#11-matmul)
+    - [1.2 Relu](#12-relu)
+    - [1.3 Log](#13-log)
+    - [1.4 Softmax](#14-softmax)
+  - [2. Model Training and Testing](#2-model-training-and-testing)
+    - [2.1 Forward Pass](#21-forward-pass)
+    - [2.2 Backward Pass](#22-backward-pass)
+    - [2.3 Mini-Batch Gradient Descent](#23-mini-batch-gradient-descent)
+    - [2.4 Experiments](#24-experiments)
+  - [3. Optimization Methods](#3-optimization-methods)
+    - [3.1 Momentum](#31-momentum)
+    - [3.2 Adam](#32-adam)
+    - [3.3 Experiments](#33-experiments)
+  - [4. Parameter Initialization](#4-parameter-initialization)
+    - [4.1 Xavier Initialization](#41-xavier-initialization)
+    - [4.2 Kaiming/He Initialization](#42-kaiminghe-initialization)
+    - [4.3 Experiments](#43-experiments)
+  - [5. References](#5-references)
+
+## 1. Backpropagation Formulas for the FNN Operators
+
+### 1.1 Matmul
+
+$$
+Y_{ij} = \Sigma_k X_{ik}\cdot W_{kj}
+$$
+
+$$
+\begin{split}
+  \frac{\partial L}{\partial X_{ij}} &= \Sigma_k\frac{\partial L}{\partial Y_{ik}} \frac{\partial Y_{ik}}{\partial X_{ij}}\\\\
+  &= \Sigma_k gradY_{ik} \cdot W_{jk}\\\\
+  &= \Sigma_k gradY_{ik}\cdot W^T_{kj}\\\\
+  \\\\
+  \frac{\partial L}{\partial W_{ij}} &= \Sigma_k\frac{\partial L}{\partial Y_{kj}} \frac{\partial Y_{kj}}{\partial W_{ij}}\\\\
+  &= \Sigma_k gradY_{kj}\cdot X_{ki}\\\\
+  &= \Sigma_k X^T_{ik}\cdot gradY_{kj}\\\\
+\end{split}
+$$
+
+That is, $\frac{\partial L}{\partial X} = gradY \cdot W^T$ and $\frac{\partial L}{\partial W} = X^T \cdot gradY$.
+
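+These two formulas can be sanity-checked numerically. The snippet below is only an illustration (it is not part of the submitted code; the loss and shapes are chosen arbitrarily) and compares the analytic gradient of one entry with a central finite difference:
+
+```python
+import numpy as np
+
+def loss(X, W):
+    # arbitrary scalar loss L = sum(X @ W), so that gradY is a matrix of ones
+    return np.matmul(X, W).sum()
+
+X = np.random.randn(5, 6)
+W = np.random.randn(6, 4)
+grad_Y = np.ones((5, 4))
+
+grad_X = np.matmul(grad_Y, W.T)   # dL/dX = gradY · W^T
+grad_W = np.matmul(X.T, grad_Y)   # dL/dW = X^T · gradY
+
+# central finite difference for a single entry, e.g. X[0, 0]
+eps = 1e-6
+dX = np.zeros_like(X)
+dX[0, 0] = eps
+numeric = (loss(X + dX, W) - loss(X - dX, W)) / (2 * eps)
+print(abs(numeric - grad_X[0, 0]))  # should be ~1e-10
+```
+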
+### 1.2 Relu
+
+$$
+Y_{ij} = ReLU(X_{ij}) =
+  \begin{cases}
+    X_{ij} & X_{ij} \geq 0 \\\\
+    0 & X_{ij} \lt 0
+  \end{cases}
+= max(0, X_{ij})
+$$
+
+$$
+\begin{split}
+  \frac{\partial L}{\partial X_{ij}} &= \frac{\partial L}{\partial Y_{ij}} \frac{\partial Y_{ij}}{\partial X_{ij}} \\\\
+  &= gradY_{ij} \frac{\partial ReLU(X_{ij})}{\partial X_{ij}} \\\\
+  &=
+  \begin{cases}
+    gradY_{ij} \cdot \frac{\partial X_{ij}}{\partial X_{ij}} & X_{ij} \geq 0 \\\\
+    gradY_{ij} \cdot \frac{\partial \space 0}{\partial X_{ij}} & X_{ij} \lt 0
+  \end{cases} \\\\
+  &=
+  \begin{cases}
+    gradY_{ij} & X_{ij} \geq 0 \\\\
+    0 & X_{ij} \lt 0
+  \end{cases}
+\end{split}
+$$
+
+Let $$M_{ij} = \begin{cases} 1 & X_{ij} \geq 0\\\\ 0 & X_{ij} \lt 0\end{cases}$$; then $\frac{\partial L}{\partial X} = gradY * M$, where $*$ denotes the element-wise (Hadamard) product.
+
+
+### 1.3 Log
+
+$$
+Y_{ij} = log(X_{ij} + \epsilon)
+$$
+
+$$
+\begin{split}
+\frac{\partial L}{\partial X_{ij}} &= \frac{\partial L}{\partial Y_{ij}} \frac{\partial Y_{ij}}{\partial X_{ij}}\\
+&= gradY_{ij} \cdot \frac {\partial log(X_{ij} + \epsilon)}{\partial X_{ij}} \\
+&= gradY_{ij} \cdot \frac {1}{X_{ij} + \epsilon}
+\end{split}
+$$
+
+Let $$M_{ij} = \frac{1}{X_{ij} + \epsilon}$$; then $\frac{\partial L}{\partial X} = gradY * M$ (again an element-wise product).
+
+
+### 1.4 Softmax
+
+$$
+Y_{ij} = softmax(X)_{ij} = \frac {exp(X_{ij})}{\Sigma_k exp(X_{ik})}
+$$
+
+Softmax is shift-invariant: $softmax(X+c)_i=\frac{e^{x_i+c}}{\Sigma_k e^{x_k + c}} = \frac{e^{x_i}}{\Sigma_k e^{x_k}} = softmax(X)_i$. Implementations therefore usually subtract the row-wise maximum from $x$ first to prevent overflow.
+
+$$
+\begin{split}
+  \frac{\partial Y_{ij}}{\partial X_{ik}} &= \frac{\partial }{\partial X_{ik}} \frac{e^{X_{ij}}}{\Sigma_p e^{X_{ip}}}\\\\
+  \\\\
+  \text{For } j = k:\quad
+  \frac{\partial Y_{ij}}{\partial X_{ij}} &= \frac{\partial}{\partial X_{ij}}\frac{e^{X_{ij}}}{\Sigma_p e^{X_{ip}}} \\\\
+  &= \frac{(e^{X_{ij}})'\cdot\Sigma_p e^{X_{ip}}- e^{X_{ij}}\cdot(\Sigma_p e^{X_{ip}})'}{(\Sigma_p e^{X_{ip}})^2} \\\\
+  &= \frac{e^{X_{ij}}\cdot(\Sigma_p e^{X_{ip}}- e^{X_{ij}})}{(\Sigma_p e^{X_{ip}})^2} \\\\
+  &= Y_{ij} - Y_{ij}^2 = Y_{ik} - Y_{ij}Y_{ik} \\\\
+  \\\\
+  \text{For } j \neq k:\quad
+  \frac{\partial Y_{ij}}{\partial X_{ik}} &= \frac{\partial}{\partial X_{ik}}\frac{e^{X_{ij}}}{\Sigma_p e^{X_{ip}}} \\\\
+  &= \frac{0\cdot\Sigma_p e^{X_{ip}} - e^{X_{ij}}\cdot e^{X_{ik}}}{(\Sigma_p e^{X_{ip}})^2} \\\\
+  &= -\frac{e^{X_{ij}}\cdot e^{X_{ik}}}{(\Sigma_p e^{X_{ip}})^2} \\\\
+  &= -Y_{ij}Y_{ik}\\\\
+\end{split}
+$$
+
+$$
+\begin{split}
+  \frac{\partial L}{\partial X_{ij}} &= \Sigma_k \frac{\partial L}{\partial Y_{ik}}\frac{\partial Y_{ik}}{\partial X_{ij}}\\\\
+  &= \Sigma_k gradY_{ik} \cdot
+  \begin{cases}
+    Y_{ik} - Y_{ij}Y_{ik} & & j = k\\\\
+    -Y_{ij}Y_{ik} & & j \neq k
+  \end{cases}\\\\
+  &= gradY_{ij} \cdot Y_{ij} - \Sigma_k \space gradY_{ik} \cdot Y_{ij} \cdot Y_{ik}\\\\
+  &= gradY_{ij} \cdot Y_{ij} - Y_{ij} \cdot \Sigma_k \space gradY_{ik} \cdot Y_{ik}
+\end{split}
+$$
+
+Writing $C = A*B$ for the element-wise product $C_{ij} = A_{ij} \cdot B_{ij}$ and $S_i = \Sigma_k gradY_{ik} \cdot Y_{ik}$, this becomes $\frac{\partial L}{\partial X} = gradY * Y - S * Y$, where the row sum $S_i$ is broadcast along row $i$.
+
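+In vectorized form the softmax backward pass is a single line. The sketch below is illustrative only (the Softmax operator in `numpy_fnn.py` implements the same formula) and checks it against the explicit Jacobian $J_{jk} = Y_j(\delta_{jk} - Y_k)$ of one row:
+
+```python
+import numpy as np
+
+def softmax(x):
+    x = x - x.max(axis=1, keepdims=True)      # shift-invariance: avoids overflow
+    e = np.exp(x)
+    return e / e.sum(axis=1, keepdims=True)
+
+def softmax_backward(grad_y, y):
+    # dL/dX = gradY * Y - S * Y, with S_i = sum_k gradY_ik * Y_ik broadcast per row
+    s = (grad_y * y).sum(axis=1, keepdims=True)
+    return grad_y * y - s * y
+
+x = np.random.randn(4, 5)
+y = softmax(x)
+grad_y = np.random.randn(4, 5)
+
+J = np.diag(y[0]) - np.outer(y[0], y[0])      # Jacobian of row 0
+print(np.allclose(grad_y[0] @ J, softmax_backward(grad_y, y)[0]))  # True
+```
+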
+## 2. Model Training and Testing
+
+### 2.1 Forward Pass
+
+![image-20210501201313443](img/neural-network-model.png)
+
+$$
+\begin{split}
+layer1 &= X \cdot W1 \\\\
+out1 &= ReLU(layer1)\\\\
+layer2 &= out1 \cdot W2\\\\
+out2 &= ReLU(layer2)\\\\
+layer3 &= out2 \cdot W3\\\\
+out3 &= softmax(layer3)\\\\
+out &= log(out3)
+\end{split}
+$$
+
+
+### 2.2 Backward Pass
+
+Going from the output back to the input, the gradients are computed operator by operator with the formulas derived above:
+
+```python
+self.log_grad = self.log.backward(y)
+self.softmax_grad = self.softmax.backward(self.log_grad)
+self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+self.relu_2_grad = self.relu_2.backward(self.x3_grad)
+self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+self.relu_1_grad = self.relu_1.backward(self.x2_grad)
+self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+```
+
+
+### 2.3 Mini-Batch Gradient Descent
+
+The training sets used for deep neural networks are usually large; computing the gradient over the entire training set in every iteration costs a lot of computation, and large training sets also contain a great deal of redundancy. Deep networks are therefore normally trained with **mini-batch** gradient descent.
+
+With mini-batches, every update is computed on a small part of the data rather than on the whole dataset. This reduces the memory needed for training and makes otherwise intractable models trainable.
+
+Before each training epoch the data is shuffled randomly and split into mini-batches of the preset batch_size (all of equal size except possibly the last one), and gradient descent is performed on each mini-batch in turn.
+
+This is implemented in `numpy_mnist.py/mini_batch()`.
+
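+For reference, the core of that function reduces to the following (a condensed version of the implementation included later in this diff):
+
+```python
+import numpy as np
+
+def mini_batch(dataset, batch_size=128):
+    # shuffle the (data, label) pairs, then slice them into batches
+    dataset = list(dataset)
+    np.random.shuffle(dataset)
+    batches = []
+    for start in range(0, len(dataset), batch_size):
+        chunk = dataset[start:start + batch_size]
+        data = np.array([np.array(x[0]) for x in chunk])
+        label = np.array([x[1] for x in chunk])
+        batches.append((data, label))
+    return batches
+```
+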
+### 2.4 Experiments
+
+This experiment can be reproduced with `python numpy_mnist.py --mode=lr`.
+
+The learning rate is set to 0.1, 0.01 and 0.001 in turn, and its effect on mini-batch training is examined both per training step and per epoch.
+
+![lr: loss per step](img/exp_lr_step_loss.png)
+
+![lr: accuracy per epoch](img/exp_lr_epoch_acc.png)
+
+These runs show that on MNIST with mini-batch stochastic gradient descent, a larger learning rate (within this range) makes the model converge faster.
+
+Fixing the learning rate at $lr = 0.1$, the batch size is then varied to see how it affects training (`python numpy_mnist.py --mode=batchsize`):
+
+![batch size: loss per step](img/exp_batchsize_step_loss.png)
+
+![batch size: accuracy per epoch](img/exp_batchsize_epoch_acc.png)
+
+Per training step, a larger batch size gives a more pronounced loss decrease, so the model converges in fewer steps.
+
+Per epoch, however, a smaller batch size tends to reach a higher accuracy after the same number of epochs.
+
+
+## 3. Optimization Methods
+
+### 3.1 Momentum
+
+Plain SGD sometimes descends very slowly and can get stuck in a local optimum. Momentum speeds up learning and works well for gradients with high curvature, for small but consistent gradients, and for noisy gradients.
+
+The main idea is to introduce a new variable $v$ that accumulates an exponentially decaying moving average of past gradients, thereby accelerating learning:
+
+$$
+\begin{split}
+V_{dW} &= \beta V_{dW} &+ \space (1 - \beta)dW \\\\
+V_{db} &= \beta V_{db} &+ \space (1 - \beta)db \\\\
+W &= W &- \space \alpha V_{dW} \\\\
+b &= b &- \space \alpha V_{db}
+\end{split}
+$$
+
+The larger $\beta$ is, the more the accumulated past gradients influence the current update direction.
+
+Common values of $\beta$ are 0.5, 0.9 and 0.99; the experiments here use $\beta = 0.9$.
+
+
+### 3.2 Adam
+
+Adam is essentially a combination of Momentum and RMSprop: it uses momentum as the update direction and adapts the learning rate at the same time.
+
+Its update is:
+
+$$
+\begin{split}
+M_t &= \beta_1 M_{t-1} + (1 - \beta_1) g_t \\\\
+G_t &= \beta_2 G_{t-1} + (1 - \beta_2) g_t^2 \\\\
+\end{split}
+$$
+
+$t$ is incremented by 1 at every iteration, with $M_0 = 0, G_0 = 0$. In early iterations $M_t$ and $G_t$ are smaller than the true first and second moments; when $\beta_1$ and $\beta_2$ are close to 1 this bias is large, so a correction is applied:
+
+$$
+\begin{split}
+\hat M_t &= \frac{M_t}{1-\beta_1^t} \\\\
+\hat G_t &= \frac{G_t}{1-\beta_2^t} \\\\
+\Delta\theta_t &= -\frac{\alpha}{\sqrt{\hat G_t + \epsilon}} \hat M_t
+\end{split}
+$$
+
+Typical values are $\alpha = 0.001$, $\beta_1 = 0.9$ and $\beta_2 = 0.999$.
+
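+For a single weight matrix the two update rules translate directly into numpy. The helpers below are only a sketch of what `NumpyModel.optimize()` (later in this diff) does for each of W1, W2 and W3; the function names and default arguments are illustrative:
+
+```python
+import numpy as np
+
+def momentum_step(W, g, V, lr=0.001, beta=0.9):
+    # V accumulates an exponentially decaying average of past gradients
+    V = beta * V + (1 - beta) * g
+    return W - lr * V, V
+
+def adam_step(W, g, M, G, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
+    # first/second moment estimates with bias correction (t starts at 1)
+    M = beta1 * M + (1 - beta1) * g
+    G = beta2 * G + (1 - beta2) * np.square(g)
+    M_hat = M / (1 - beta1 ** t)
+    G_hat = G / (1 - beta2 ** t)
+    return W - lr * M_hat / (np.sqrt(G_hat) + eps), M, G
+```
+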
+### 3.3 Experiments
+
+With epoch = 20 and batch_size = 128, the figures below compare the plain method, Momentum and Adam under commonly used hyper-parameters.
+
+The hyper-parameters of the three methods are:
+
+| method | hyper-parameters |
+| ------------ | ------------------------------------------------------------ |
+| **origin** | $learning\\_rate = 0.1$ |
+| **momentum** | $learning\\_rate = 0.001, \space \beta = 0.9$ |
+| **adam** | $learning\\_rate = 0.001, \space \beta_1 = 0.9 , \beta_2 = 0.999$ |
+
+![optimizer: loss per step](img/exp_optimizer_step_loss.png)
+
+![optimizer: accuracy per epoch](img/exp_optimizer_epoch_acc.png)
+
+Under these common settings the three methods converge at almost the same speed, and the accuracies they reach are also quite close.
+
+Adam's accuracy curve is less stable than Momentum's or the baseline's, but it runs at a much smaller learning rate. Comparing with how the baseline converges at different learning rates (see the earlier experiment), Adam converges much faster at a small learning rate, and after enough epochs it reaches the highest accuracy of the three.
+
+
+## 4. Parameter Initialization
+
+How the weights are initialized matters a great deal when training a neural network.
+
+If the weights are too small, the variance of the signal shrinks every time it passes through a layer and eventually becomes very small, which hurts training.
+
+If the weights are too large, the variance of the signal grows layer by layer, eventually causing exploding or vanishing gradients.
+
+When initializing a deep network we should therefore keep the input and output variance of each layer as equal as possible, adapting the initialization variance to the layer size.
+
+### 4.1 Xavier Initialization
+
+Consider a neuron $a^{(l)}$ in layer $l$ that receives the outputs $a_i^{(l-1)}$, $1 \leq i \leq M_{l-1}$, of the $M_{l-1}$ neurons in the previous layer, so that $a^{(l)} = f(\Sigma_{i=1}^{M_{l-1}}w_i^{(l)}a_i^{(l-1)})$.
+
+Following the textbook, to keep the signal from being amplified or attenuated in either the forward or the backward pass, we can set $var(w_i^{(l)}) = \frac{2}{M_{l-1} + M_l}$.
+
+Given this target variance, the parameters can be drawn from a Gaussian or a uniform distribution, which yields the Xavier initialization:
+
+Uniform distribution ~ $U(-a, a)$:
+$$
+a = \sqrt{\frac{6}{in\\_feature + out\\_feature}}
+$$
+Normal distribution ~ $N(0, std)$:
+$$
+std = \sqrt{\frac{2}{in\\_feature + out\\_feature}}
+$$
+
+
+### 4.2 Kaiming/He Initialization
+
+Xavier initialization performs poorly with the ReLU activation, so Kaiming (He) initialization is usually used instead. Its idea: assume that in each layer half of the neurons are activated and the other half output 0; since only about half of the inputs are active, the initialization variance has to be doubled relative to the linear case to keep the output variance unchanged.
+
+This gives the Kaiming initialization:
+
+Uniform distribution ~ $U(-bound, bound)$:
+$$
+bound = \sqrt{\frac{6}{(1 + a^2) \cdot in\\_feature}}
+$$
+Normal distribution ~ $N(0, std)$:
+$$
+std = \sqrt{\frac{2}{(1 + a^2) \cdot in\\_feature}}
+$$
+$a$ is the negative slope of the activation function of the following layer; for ReLU, $a = 0$.
+
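+The uniform variants of both schemes are easy to write down in numpy. `kaiming_uniform` below matches the helper defined in `numpy_mnist.py`; `xavier_uniform` is an extra sketch added here for comparison and is not used by the submitted code:
+
+```python
+import numpy as np
+
+def xavier_uniform(in_features, out_features):
+    # U(-a, a) with a = sqrt(6 / (in_features + out_features))
+    a = (6.0 / (in_features + out_features)) ** 0.5
+    return np.random.uniform(low=-a, high=a, size=(in_features, out_features))
+
+def kaiming_uniform(in_features, out_features, a=0):
+    # U(-bound, bound) with bound = sqrt(6 / ((1 + a^2) * in_features))
+    bound = (6.0 / ((1 + a * a) * in_features)) ** 0.5
+    return np.random.uniform(low=-bound, high=bound, size=(in_features, out_features))
+
+W1 = kaiming_uniform(28 * 28, 256)  # same shape as the first layer of the model
+```
+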
+### 4.3 Experiments
+
+The original `get_torch_initialization` is re-implemented in numpy. Running `python numpy_mnist.py` shows the results for epoch = 3 and learning rate = 0.1. Since this is just a numpy re-implementation of the PyTorch initialization, the results are essentially the same as with the earlier PyTorch version.
+
+```
+[0] Accuracy: 0.9566
+[1] Accuracy: 0.9621
+[2] Accuracy: 0.9689
+```
+
+
+## 5. References
+
+[1] [Understanding Xavier Initialization In Deep Neural Networks](https://prateekvjoshi.com/2016/03/29/understanding-xavier-initialization-in-deep-neural-networks/)
+
+[2] [pytorch系列 -- 9 pytorch nn.init 中实现的初始化函数 uniform, normal, const, Xavier, He initialization](https://blog.csdn.net/dss_dssssd/article/details/83959474)
+
+[3] ["深度学习中优化方法——momentum、Nesterov Momentum、AdaGrad、Adadelta、RMSprop、Adam"](https://blog.csdn.net/u012328159/article/details/80311892)
+
+[4] [《神经网络与深度学习》第7章](https://nndl.github.io/nndl-book.pdf)
diff --git a/assignment-2/submission/18307130252/img/exp_batchsize_epoch_acc.png b/assignment-2/submission/18307130252/img/exp_batchsize_epoch_acc.png
new file mode 100644
index 0000000000000000000000000000000000000000..43153ac2a1d544d2ae884351907ad73d9476d01f
Binary files /dev/null and b/assignment-2/submission/18307130252/img/exp_batchsize_epoch_acc.png differ
diff --git a/assignment-2/submission/18307130252/img/exp_batchsize_step_loss.png b/assignment-2/submission/18307130252/img/exp_batchsize_step_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..35dbe62aad8b1417c7327faba3c06786e5b0c13d
Binary files /dev/null and b/assignment-2/submission/18307130252/img/exp_batchsize_step_loss.png differ
diff --git a/assignment-2/submission/18307130252/img/exp_lr_epoch_acc.png b/assignment-2/submission/18307130252/img/exp_lr_epoch_acc.png
new file mode 100644
index 0000000000000000000000000000000000000000..1e9cebf9794b3132a38be097eab7fc453bc9fa65
Binary files /dev/null and b/assignment-2/submission/18307130252/img/exp_lr_epoch_acc.png differ
diff --git a/assignment-2/submission/18307130252/img/exp_lr_step_loss.png b/assignment-2/submission/18307130252/img/exp_lr_step_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..c40fac0b50695f86d573f38ea6d8ffa621878f3f
Binary files /dev/null and b/assignment-2/submission/18307130252/img/exp_lr_step_loss.png differ
diff --git a/assignment-2/submission/18307130252/img/exp_optimizer_epoch_acc.png b/assignment-2/submission/18307130252/img/exp_optimizer_epoch_acc.png
new file mode 100644
index 0000000000000000000000000000000000000000..7383f72a5819dc861960312a36a1fac6c8534527
Binary files /dev/null and b/assignment-2/submission/18307130252/img/exp_optimizer_epoch_acc.png differ
diff --git a/assignment-2/submission/18307130252/img/exp_optimizer_step_loss.png b/assignment-2/submission/18307130252/img/exp_optimizer_step_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..46b5abc8ec1d1e5736675f5b4c0c899f10fd4732
Binary files /dev/null and b/assignment-2/submission/18307130252/img/exp_optimizer_step_loss.png differ
diff --git a/assignment-2/submission/18307130252/img/kaiming.png b/assignment-2/submission/18307130252/img/kaiming.png
new file mode 100644
index 0000000000000000000000000000000000000000..bcba1c34fa020c3d758e4baee89f955ad5582c2a
Binary files /dev/null and b/assignment-2/submission/18307130252/img/kaiming.png differ
diff --git a/assignment-2/submission/18307130252/img/neural-network-model.png b/assignment-2/submission/18307130252/img/neural-network-model.png
new file mode 100644
index 0000000000000000000000000000000000000000..42ce1d87a9c23ffe229e6873a319b7faa3566fc6
Binary files /dev/null and b/assignment-2/submission/18307130252/img/neural-network-model.png differ
diff --git a/assignment-2/submission/18307130252/numpy_fnn.py b/assignment-2/submission/18307130252/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a57553f11c76bb0bdbf20c27a79b064a65f0128
--- /dev/null
+++ b/assignment-2/submission/18307130252/numpy_fnn.py
@@ -0,0 +1,238 @@
+import numpy as np
+
+
+class NumpyOp:
+
+    def __init__(self):
+        self.memory = {}
+        self.epsilon = 1e-12
+
+
+class Matmul(NumpyOp):
+
+    def forward(self, x, W):
+        """
+        x: shape(N, d)
+        W: shape(d, d')
+        """
+        self.memory['x'] = x
+        self.memory['W'] = W
+        h = np.matmul(x, W)
+        return h
+
+    def backward(self, grad_y):
+        """
+        grad_y: shape(N, d')
+        """
+
+        ####################
+        #      code 1      #
+        ####################
+        # dL/dX = gradY · W^T,  dL/dW = X^T · gradY
+        grad_x = np.matmul(grad_y, self.memory['W'].transpose())
+        grad_W = np.matmul(self.memory['x'].transpose(), grad_y)
+
+        return grad_x, grad_W
+
+
+class Relu(NumpyOp):
+
+    def forward(self, x):
+        self.memory['x'] = x
+        return np.where(x > 0, x, np.zeros_like(x))
+
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+
+        ####################
+        #      code 2      #
+        ####################
+        # pass the gradient through only where the input was positive
+        grad_x = np.where(self.memory['x'] > 0, np.ones_like(self.memory['x']), np.zeros_like(self.memory['x']))
+        grad_x *= grad_y
+
+        return grad_x
+
+
+class Log(NumpyOp):
+
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+
+        out = np.log(x + self.epsilon)
+        self.memory['x'] = x
+
+        return out
+
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+
+        ####################
+        #      code 3      #
+        ####################
+        grad_x = np.reciprocal(self.memory['x'] + self.epsilon)
+        grad_x *= grad_y
+
+        return grad_x
+
+
+class Softmax(NumpyOp):
+    """
+    softmax over last dimension
+    """
+
+    def forward(self, x):
+        """
+        x: shape(N, c)
+        """
+
+        ####################
+        #      code 4      #
+        ####################
+
+        # subtract the row-wise max to prevent overflow (softmax is shift-invariant);
+        # `x = x - ...` rather than `x -= ...` so the caller's array is not modified
+        x = x - x.max(axis = 1).reshape(x.shape[0], -1)
+        out = np.exp(x) / np.sum(np.exp(x), axis = 1).reshape(-1, 1)
+        self.memory['x'] = x
+
+        return out
+
+    def backward(self, grad_y):
+        """
+        grad_y: same shape as x
+        """
+
+        ####################
+        #      code 5      #
+        ####################
+        # dL/dX = gradY * Y - S * Y, with S the row-wise sum of gradY * Y
+        out = np.exp(self.memory['x']) / np.sum(np.exp(self.memory['x']), axis = 1).reshape(-1, 1)
+        grad_x = grad_y * out - np.sum(grad_y * out, axis = 1).reshape(-1, 1) * out
+
+        return grad_x
+
+
+class NumpyLoss:
+
+    def __init__(self):
+        self.target = None
+
+    def get_loss(self, pred, target):
+        self.target = target
+        return (-pred * target).sum(axis=1).mean()
+
+    def backward(self):
+        return -self.target / self.target.shape[0]
+
+
+class NumpyModel:
+    def __init__(self):
+        self.W1 = np.random.normal(size=(28 * 28, 256))
+        self.W2 = np.random.normal(size=(256, 64))
+        self.W3 = np.random.normal(size=(64, 10))
+
+        # operators used in forward and backward
+        self.matmul_1 = Matmul()
+        self.relu_1 = Relu()
+        self.matmul_2 = Matmul()
+        self.relu_2 = Relu()
+        self.matmul_3 = Matmul()
+        self.softmax = Softmax()
+        self.log = Log()
+
+        # updated in backward(); softmax_grad, log_grad, etc. are the gradients of the
+        # loss with respect to each operator's input
+        self.x1_grad, self.W1_grad = None, None
+        self.relu_1_grad = None
+        self.x2_grad, self.W2_grad = None, None
+        self.relu_2_grad = None
+        self.x3_grad, self.W3_grad = None, None
+        self.softmax_grad = None
+        self.log_grad = None
+
+        # state used and updated by the Momentum optimizer
+        self.momentum_V1 = 0
+        self.momentum_V2 = 0
+        self.momentum_V3 = 0
+        self.momentum_Beta = 0.9  # common values: 0.5, 0.9, 0.99; 0.9 is the usual choice
+
+        # state used and updated by the Adam optimizer
+        self.adam_V1 = 0
+        self.adam_S1 = 0
+        self.adam_V2 = 0
+        self.adam_S2 = 0
+        self.adam_V3 = 0
+        self.adam_S3 = 0
+        self.adam_t = 0
+        self.adam_epsilon = 1e-8
+        self.adam_Beta1 = 0.9
+        self.adam_Beta2 = 0.999  # common default: 0.999
+
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+
+        ####################
+        #      code 6      #
+        ####################
+        layer1 = self.matmul_1.forward(x, self.W1)
+        out1 = self.relu_1.forward(layer1)
+        layer2 = self.matmul_2.forward(out1, self.W2)
+        out2 = self.relu_2.forward(layer2)
+        layer3 = self.matmul_3.forward(out2, self.W3)
+        out3 = self.softmax.forward(layer3)
+        out = self.log.forward(out3)
+
+        return out
+
+    def backward(self, y):
+
+        ####################
+        #      code 7      #
+        ####################
+        self.log_grad = self.log.backward(y)
+        self.softmax_grad = self.softmax.backward(self.log_grad)
+        self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+        self.relu_2_grad = self.relu_2.backward(self.x3_grad)
+        self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+        self.relu_1_grad = self.relu_1.backward(self.x2_grad)
+        self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+
+    def optimize(self, learning_rate, method = "None"):
+        if method == "None":
+            # plain gradient descent
+            self.W1 -= learning_rate * self.W1_grad
+            self.W2 -= learning_rate * self.W2_grad
+            self.W3 -= learning_rate * self.W3_grad
+
+        elif method == "momentum":
+            # V = beta * V + (1 - beta) * grad;  W -= lr * V
+            self.momentum_V1 = self.momentum_Beta * self.momentum_V1 + (1 - self.momentum_Beta) * self.W1_grad
+            self.W1 -= learning_rate * self.momentum_V1
+            self.momentum_V2 = self.momentum_Beta * self.momentum_V2 + (1 - self.momentum_Beta) * self.W2_grad
+            self.W2 -= learning_rate * self.momentum_V2
+            self.momentum_V3 = self.momentum_Beta * self.momentum_V3 + (1 - self.momentum_Beta) * self.W3_grad
+            self.W3 -= learning_rate * self.momentum_V3
+
+        elif method == "adam":
+            # first/second moment estimates with bias correction
+            self.adam_t += 1
+            self.adam_V1 = self.adam_Beta1 * self.adam_V1 + (1 - self.adam_Beta1) * self.W1_grad
+            self.adam_S1 = self.adam_Beta2 * self.adam_S1 + (1 - self.adam_Beta2) * np.square(self.W1_grad)
+            v_corrected = self.adam_V1 / (1 - np.power(self.adam_Beta1, self.adam_t))
+            s_corrected = self.adam_S1 / (1 - np.power(self.adam_Beta2, self.adam_t))
+            self.W1 -= learning_rate * v_corrected / (np.sqrt(s_corrected) + self.adam_epsilon)
+
+            self.adam_V2 = self.adam_Beta1 * self.adam_V2 + (1 - self.adam_Beta1) * self.W2_grad
+            self.adam_S2 = self.adam_Beta2 * self.adam_S2 + (1 - self.adam_Beta2) * np.square(self.W2_grad)
+            v_corrected = self.adam_V2 / (1 - np.power(self.adam_Beta1, self.adam_t))
+            s_corrected = self.adam_S2 / (1 - np.power(self.adam_Beta2, self.adam_t))
+            self.W2 -= learning_rate * v_corrected / (np.sqrt(s_corrected) + self.adam_epsilon)
+
+            self.adam_V3 = self.adam_Beta1 * self.adam_V3 + (1 - self.adam_Beta1) * self.W3_grad
+            self.adam_S3 = self.adam_Beta2 * self.adam_S3 + (1 - self.adam_Beta2) * np.square(self.W3_grad)
+            v_corrected = self.adam_V3 / (1 - np.power(self.adam_Beta1, self.adam_t))
+            s_corrected = self.adam_S3 / (1 - np.power(self.adam_Beta2, self.adam_t))
+            self.W3 -= learning_rate * v_corrected / (np.sqrt(s_corrected) + self.adam_epsilon)
+
+        else:
+            print("The optimize method has not been implemented yet.")
\ No newline at end of file
diff --git a/assignment-2/submission/18307130252/numpy_mnist.py b/assignment-2/submission/18307130252/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..40966a21e16baae4dbf26cf21454679f136b6212
--- /dev/null
+++ b/assignment-2/submission/18307130252/numpy_mnist.py
@@ -0,0 +1,238 @@
+import numpy as np
+from numpy_fnn import NumpyModel, NumpyLoss
+import torch
+from utils import download_mnist, batch, plot_curve, one_hot
+import argparse
+from matplotlib import pyplot as plt
+ +def mini_batch(dataset, batch_size=128, numpy=False): + # modify dataset into LIST type and shuffle it + dataset = list(dataset) + np.random.shuffle(dataset) + + # divide the dataset into different batches + res = [] + dataset_size = len(dataset) + for start_index in range(0, dataset_size, batch_size): + end_index = min(start_index + batch_size, dataset_size) + # retrieve the data and label of the same batch separately + data = np.array([np.array(x[0]) for x in dataset[start_index:end_index]]) + label = np.array([x[1] for x in dataset[start_index:end_index]]) + res.append((data, label)) + + return res + +def kaiming_uniform(in_features, out_features, a = 0): + bound = 6.0 / (1 + a * a) / in_features + bound = bound ** 0.5 + W = np.random.uniform(low = -bound, high = bound, size = (in_features, out_features)) + return W + +def get_torch_initialization(numpy=True): + W1 = kaiming_uniform(28 * 28, 256) + W2 = kaiming_uniform(256, 64) + W3 = kaiming_uniform(64, 10) + + return W1, W2, W3 + + +def numpy_run(parameter_list = [(128, 0.1, "None")], epoch_number = 3): + train_dataset, test_dataset = download_mnist() + + step_loss = [] + epoch_acc = [] + + for batch_size, learning_rate, method in parameter_list: + model = NumpyModel() + numpy_loss = NumpyLoss() + model.W1, model.W2, model.W3 = get_torch_initialization() + + train_loss = [] + train_acc = [] + + for epoch in range(epoch_number): + for x, y in mini_batch(train_dataset, batch_size=batch_size): + y = one_hot(y) + + y_pred = model.forward(x) + loss = numpy_loss.get_loss(y_pred, y) + + model.backward(numpy_loss.backward()) + model.optimize(learning_rate, method=method) + + train_loss.append(loss.item()) + + x, y = batch(test_dataset)[0] + accuracy = np.mean((model.forward(x).argmax(axis=1) == y)) + train_acc.append(accuracy) + print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy)) + + step_loss.append(train_loss) + epoch_acc.append(train_acc) + + return step_loss, epoch_acc + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument('--mode', type=str, default="None", help='mode') + + args = parser.parse_args() + + print("----- mode: " + args.mode + " -----") + + if args.mode == "None": + step_loss, epoch_acc = numpy_run() + plot_curve(step_loss[0]) + + elif args.mode == "lr": + # batch_size, learning_rate, optimizer + parameter_list = [ + (128, 0.1, "None"), + (128, 0.01, "None"), + (128, 0.001, "None") + ] + + step_loss, epoch_acc = numpy_run( + parameter_list = parameter_list, + epoch_number = 30 + ) + + plt.xlabel('step') + plt.ylabel('loss') + plt.plot(step_loss[0], color = 'r', label="lr=0.1") + plt.plot(step_loss[1], color = 'g', label="lr=0.01") + plt.plot(step_loss[2], color = 'b', label="lr=0.001") + plt.legend() + plt.savefig("exp_lr_step_loss.jpg") + plt.close() + + plt.xlabel('epoch') + plt.ylabel('acc') + plt.plot(epoch_acc[0], color = 'r', label="lr=0.1") + plt.plot(epoch_acc[1], color = 'g', label="lr=0.01") + plt.plot(epoch_acc[2], color = 'b', label="lr=0.001") + plt.legend() + plt.savefig("exp_lr_epoch_acc.jpg") + plt.close() + + elif args.mode == "batchsize": + # batch_size, learning_rate, optimizer + parameter_list = [ + (32, 0.1, "None"), + (64, 0.1, "None"), + (128, 0.1, "None") + ] + + step_loss, epoch_acc = numpy_run( + parameter_list = parameter_list, + epoch_number = 30 + ) + + plt.xlabel('step') + plt.ylabel('loss') + plt.plot(step_loss[0], color = 'r', label="batch_size=32") + plt.plot(step_loss[1], color = 'g', label="batch_size=64") + plt.plot(step_loss[2], color = 'b', 
label="batch_size=128") + plt.legend() + plt.savefig("exp_batchsize_step_loss.jpg") + plt.close() + + plt.xlabel('epoch') + plt.ylabel('acc') + plt.plot(epoch_acc[0], color = 'r', label="batch_size=32") + plt.plot(epoch_acc[1], color = 'g', label="batch_size=64") + plt.plot(epoch_acc[2], color = 'b', label="batch_size=128") + plt.legend() + plt.savefig("exp_batchsize_epoch_acc.jpg") + plt.close() + + elif args.mode == "optimizer": + # batch_size, learning_rate, optimizer + parameter_list = [ + (128, 0.001, "adam"), + (128, 0.1, "momentum"), + (128, 0.1, "None") + ] + + step_loss, epoch_acc = numpy_run( + parameter_list = parameter_list, + epoch_number = 30 + ) + + plt.xlabel('step') + plt.ylabel('loss') + plt.plot(step_loss[2], color = 'r', label="origin") + plt.plot(step_loss[1], color = 'g', label="momentum") + plt.plot(step_loss[0], color = 'b', label="adam") + plt.legend() + plt.savefig("exp_optimizer_step_loss.jpg") + plt.close() + + plt.xlabel('epoch') + plt.ylabel('acc') + plt.plot(epoch_acc[2], color = 'r', label="origin") + plt.plot(epoch_acc[1], color = 'g', label="momentum") + plt.plot(epoch_acc[0], color = 'b', label="adam") + plt.legend() + plt.savefig("exp_optimizer_epoch_acc.jpg") + plt.close() + + # batch_size, learning_rate, optimizer + # parameter_list = [ + # (128, 0.1, "adam"), + # (128, 0.01, "adam"), + # (128, 0.001, "adam") + # ] + + # step_loss, epoch_acc = numpy_run( + # parameter_list = parameter_list, + # epoch_number = 30 + # ) + + # plt.xlabel('step') + # plt.ylabel('loss') + # plt.plot(step_loss[0], color = 'r', label="lr=0.1") + # plt.plot(step_loss[1], color = 'g', label="lr=0.01") + # plt.plot(step_loss[2], color = 'b', label="lr=0.001") + # plt.legend() + # plt.savefig("exp_optimizer_adam_step_loss.jpg") + # plt.close() + + # plt.xlabel('epoch') + # plt.ylabel('acc') + # plt.plot(epoch_acc[0], color = 'r', label="lr=0.1") + # plt.plot(epoch_acc[1], color = 'g', label="lr=0.01") + # plt.plot(epoch_acc[2], color = 'b', label="lr=0.001") + # plt.legend() + # plt.savefig("exp_optimizer_adam_epoch_acc.jpg") + # plt.close() + + # parameter_list = [ + # (128, 0.1, "adam"), + # (128, 0.01, "adam"), + # (128, 0.001, "adam") + # ] + + # step_loss, epoch_acc = numpy_run( + # parameter_list = parameter_list, + # epoch_number = 30 + # ) + + # plt.xlabel('step') + # plt.ylabel('loss') + # plt.plot(step_loss[0], color = 'r', label="lr=0.1") + # plt.plot(step_loss[1], color = 'g', label="lr=0.01") + # plt.plot(step_loss[2], color = 'b', label="lr=0.001") + # plt.legend() + # plt.savefig("exp_optimizer_momentum_step_loss.jpg") + # plt.close() + + # plt.xlabel('epoch') + # plt.ylabel('acc') + # plt.plot(epoch_acc[0], color = 'r', label="lr=0.1") + # plt.plot(epoch_acc[1], color = 'g', label="lr=0.01") + # plt.plot(epoch_acc[2], color = 'b', label="lr=0.001") + # plt.legend() + # plt.savefig("exp_optimizer_momentum_epoch_acc.jpg") + # plt.close() \ No newline at end of file diff --git a/assignment-2/submission/18307130252/tester_demo.py b/assignment-2/submission/18307130252/tester_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..504b3eef50a6df4d0aa433113136add50835e420 --- /dev/null +++ b/assignment-2/submission/18307130252/tester_demo.py @@ -0,0 +1,182 @@ +import numpy as np +import torch +from torch import matmul as torch_matmul, relu as torch_relu, softmax as torch_softmax, log as torch_log + +from numpy_fnn import Matmul, Relu, Softmax, Log, NumpyModel, NumpyLoss +from torch_mnist import TorchModel +from utils import get_torch_initialization, 
one_hot + +err_epsilon = 1e-6 +err_p = 0.4 + + +def check_result(numpy_result, torch_result=None): + if isinstance(numpy_result, list) and torch_result is None: + flag = True + for (n, t) in numpy_result: + flag = flag and check_result(n, t) + return flag + # print((torch.from_numpy(numpy_result) - torch_result).abs().mean().item()) + T = (torch_result * torch.from_numpy(numpy_result) < 0).sum().item() + direction = T / torch_result.numel() < err_p + return direction and ((torch.from_numpy(numpy_result) - torch_result).abs().mean() < err_epsilon).item() + + +def case_1(): + x = np.random.normal(size=[5, 6]) + W = np.random.normal(size=[6, 4]) + + numpy_matmul = Matmul() + numpy_out = numpy_matmul.forward(x, W) + numpy_x_grad, numpy_W_grad = numpy_matmul.backward(np.ones_like(numpy_out)) + + torch_x = torch.from_numpy(x).clone().requires_grad_() + torch_W = torch.from_numpy(W).clone().requires_grad_() + + torch_out = torch_matmul(torch_x, torch_W) + torch_out.sum().backward() + + return check_result([ + (numpy_out, torch_out), + (numpy_x_grad, torch_x.grad), + (numpy_W_grad, torch_W.grad) + ]) + + +def case_2(): + x = np.random.normal(size=[5, 6]) + + numpy_relu = Relu() + numpy_out = numpy_relu.forward(x) + numpy_x_grad = numpy_relu.backward(np.ones_like(numpy_out)) + + torch_x = torch.from_numpy(x).clone().requires_grad_() + + torch_out = torch_relu(torch_x) + torch_out.sum().backward() + + return check_result([ + (numpy_out, torch_out), + (numpy_x_grad, torch_x.grad), + ]) + + +def case_3(): + x = np.random.uniform(low=0.0, high=1.0, size=[3, 4]) + + numpy_log = Log() + numpy_out = numpy_log.forward(x) + numpy_x_grad = numpy_log.backward(np.ones_like(numpy_out)) + + torch_x = torch.from_numpy(x).clone().requires_grad_() + + torch_out = torch_log(torch_x) + torch_out.sum().backward() + + return check_result([ + (numpy_out, torch_out), + + (numpy_x_grad, torch_x.grad), + ]) + + +def case_4(): + x = np.random.normal(size=[4, 5]) + + numpy_softmax = Softmax() + numpy_out = numpy_softmax.forward(x) + + torch_x = torch.from_numpy(x).clone().requires_grad_() + + torch_out = torch_softmax(torch_x, 1) + + return check_result(numpy_out, torch_out) + + +def case_5(): + x = np.random.normal(size=[20, 25]) + + numpy_softmax = Softmax() + numpy_out = numpy_softmax.forward(x) + numpy_x_grad = numpy_softmax.backward(np.ones_like(numpy_out)) + + torch_x = torch.from_numpy(x).clone().requires_grad_() + + torch_out = torch_softmax(torch_x, 1) + torch_out.sum().backward() + + return check_result([ + (numpy_out, torch_out), + (numpy_x_grad, torch_x.grad), + ]) + + +def test_model(): + try: + numpy_loss = NumpyLoss() + numpy_model = NumpyModel() + torch_model = TorchModel() + torch_model.W1.data, torch_model.W2.data, torch_model.W3.data = get_torch_initialization(numpy=False) + numpy_model.W1 = torch_model.W1.detach().clone().numpy() + numpy_model.W2 = torch_model.W2.detach().clone().numpy() + numpy_model.W3 = torch_model.W3.detach().clone().numpy() + + x = torch.randn((10000, 28, 28)) + y = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 0] * 1000) + + y = one_hot(y, numpy=False) + x2 = x.numpy() + y_pred = torch_model.forward(x) + loss = (-y_pred * y).sum(dim=1).mean() + loss.backward() + + y_pred_numpy = numpy_model.forward(x2) + numpy_loss.get_loss(y_pred_numpy, y.numpy()) + + check_flag_1 = check_result(y_pred_numpy, y_pred) + print("+ {:12} {}/{}".format("forward", 10 * check_flag_1, 10)) + except: + print("[Runtime Error in forward]") + print("+ {:12} {}/{}".format("forward", 0, 10)) + return 0 + + try: + + 
numpy_model.backward(numpy_loss.backward()) + + check_flag_2 = [ + check_result(numpy_model.log_grad, torch_model.log_input.grad), + check_result(numpy_model.softmax_grad, torch_model.softmax_input.grad), + check_result(numpy_model.W3_grad, torch_model.W3.grad), + check_result(numpy_model.W2_grad, torch_model.W2.grad), + check_result(numpy_model.W1_grad, torch_model.W1.grad) + ] + check_flag_2 = sum(check_flag_2) >= 4 + print("+ {:12} {}/{}".format("backward", 20 * check_flag_2, 20)) + except: + print("[Runtime Error in backward]") + print("+ {:12} {}/{}".format("backward", 0, 20)) + check_flag_2 = False + + return 10 * check_flag_1 + 20 * check_flag_2 + + +if __name__ == "__main__": + testcases = [ + ["matmul", case_1, 5], + ["relu", case_2, 5], + ["log", case_3, 5], + ["softmax_1", case_4, 5], + ["softmax_2", case_5, 10], + ] + score = 0 + for case in testcases: + try: + res = case[2] if case[1]() else 0 + except: + print("[Runtime Error in {}]".format(case[0])) + res = 0 + score += res + print("+ {:12} {}/{}".format(case[0], res, case[2])) + score += test_model() + print("{:14} {}/60".format("FINAL SCORE", score)) diff --git a/assignment-2/submission/18307130252/torch_mnist.py b/assignment-2/submission/18307130252/torch_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..6d3e214c7606e3d43dac4b94554f942508afffb3 --- /dev/null +++ b/assignment-2/submission/18307130252/torch_mnist.py @@ -0,0 +1,73 @@ +import torch +from utils import mini_batch, batch, download_mnist, get_torch_initialization, one_hot, plot_curve + + +class TorchModel: + + def __init__(self): + self.W1 = torch.randn((28 * 28, 256), requires_grad=True) + self.W2 = torch.randn((256, 64), requires_grad=True) + self.W3 = torch.randn((64, 10), requires_grad=True) + self.softmax_input = None + self.log_input = None + + def forward(self, x): + x = x.reshape(-1, 28 * 28) + x = torch.relu(torch.matmul(x, self.W1)) + x = torch.relu(torch.matmul(x, self.W2)) + x = torch.matmul(x, self.W3) + + self.softmax_input = x + self.softmax_input.retain_grad() + + x = torch.softmax(x, 1) + + self.log_input = x + self.log_input.retain_grad() + + x = torch.log(x) + + return x + + def optimize(self, learning_rate): + with torch.no_grad(): + self.W1 -= learning_rate * self.W1.grad + self.W2 -= learning_rate * self.W2.grad + self.W3 -= learning_rate * self.W3.grad + + self.W1.grad = None + self.W2.grad = None + self.W3.grad = None + + +def torch_run(): + train_dataset, test_dataset = download_mnist() + + model = TorchModel() + model.W1.data, model.W2.data, model.W3.data = get_torch_initialization(numpy=False) + + train_loss = [] + + epoch_number = 3 + learning_rate = 0.1 + + for epoch in range(epoch_number): + for x, y in mini_batch(train_dataset, numpy=False): + y = one_hot(y, numpy=False) + + y_pred = model.forward(x) + loss = (-y_pred * y).sum(dim=1).mean() + loss.backward() + model.optimize(learning_rate) + + train_loss.append(loss.item()) + + x, y = batch(test_dataset, numpy=False)[0] + accuracy = model.forward(x).argmax(dim=1).eq(y).float().mean().item() + print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy)) + + plot_curve(train_loss) + + +if __name__ == "__main__": + torch_run() diff --git a/assignment-2/submission/18307130252/utils.py b/assignment-2/submission/18307130252/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..709220cfa7a924d914ec1c098c505f864bcd4cfc --- /dev/null +++ b/assignment-2/submission/18307130252/utils.py @@ -0,0 +1,71 @@ +import torch +import numpy as np +from 
matplotlib import pyplot as plt + + +def plot_curve(data): + plt.plot(range(len(data)), data, color='blue') + plt.legend(['loss_value'], loc='upper right') + plt.xlabel('step') + plt.ylabel('value') + plt.show() + + +def download_mnist(): + from torchvision import datasets, transforms + + transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(mean=(0.1307,), std=(0.3081,)) + ]) + + train_dataset = datasets.MNIST(root="./data/", transform=transform, train=True, download=True) + test_dataset = datasets.MNIST(root="./data/", transform=transform, train=False, download=True) + + return train_dataset, test_dataset + + +def one_hot(y, numpy=True): + if numpy: + y_ = np.zeros((y.shape[0], 10)) + y_[np.arange(y.shape[0], dtype=np.int32), y] = 1 + return y_ + else: + y_ = torch.zeros((y.shape[0], 10)) + y_[torch.arange(y.shape[0], dtype=torch.long), y] = 1 + return y_ + + +def batch(dataset, numpy=True): + data = [] + label = [] + for each in dataset: + data.append(each[0]) + label.append(each[1]) + data = torch.stack(data) + label = torch.LongTensor(label) + if numpy: + return [(data.numpy(), label.numpy())] + else: + return [(data, label)] + + +def mini_batch(dataset, batch_size=128, numpy=False): + return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True) + + +def get_torch_initialization(numpy=True): + fc1 = torch.nn.Linear(28 * 28, 256) + fc2 = torch.nn.Linear(256, 64) + fc3 = torch.nn.Linear(64, 10) + + if numpy: + W1 = fc1.weight.T.detach().clone().numpy() + W2 = fc2.weight.T.detach().clone().numpy() + W3 = fc3.weight.T.detach().clone().numpy() + else: + W1 = fc1.weight.T.detach().clone().data + W2 = fc2.weight.T.detach().clone().data + W3 = fc3.weight.T.detach().clone().data + + return W1, W2, W3