diff --git a/assignment-2/submission/18307130154/README.md b/assignment-2/submission/18307130154/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..51cea514496f5e1d5ba2fae3e49da09fe4afb6ca
--- /dev/null
+++ b/assignment-2/submission/18307130154/README.md
@@ -0,0 +1,488 @@
# Assignment 2 (Topic 1) Report

## Overview

This assignment implements several simple PyTorch-style operators, including the forward computation and the backward pass, and records the derivation of the backward formulas. A simple model is then built on top of these operators and tested on the MNIST handwritten-digit dataset.

**As an extension, I looked into PyTorch's weight-initialization methods (Xavier initialization and Kaiming initialization) and implemented a NumPy replacement for the `utils` helpers (originally a separate `numpyutils` package, now merged into `numpy_mnist`).**

## Operators and derivations

### Matmul

This operator computes the product of two matrices.

**Derivation**

Let the starting node of backpropagation (the scalar loss) be **L**; the same convention is used below.

Let the two input matrices of the forward pass be **P (m × k)** and **Q (k × n)**, the output matrix be **O (m × n)**, and the gradient passed in during the backward pass be **G (m × n)**.

Then
$$
G_{ij} = \frac{\partial L}{\partial O_{ij}}
$$
and the same convention applies to the operators below.

**Gradient of Q, GQ (k × n)**

First,
$$
\begin{aligned}
  GQ_{ts}&= \frac{\partial L}{\partial Q_{ts}} \\\\
  &=\sum_{i \leqslant m,\ j\leqslant n} \frac{\partial L}{\partial O_{ij}} \times \frac{\partial O_{ij}}{\partial Q_{ts}}\\\\
  &=\sum_{i \leqslant m,\ j = s} G_{ij} \times \frac{\partial O_{ij}}{\partial Q_{ts}}
  &(\text{all other } \frac{\partial O_{ij}}{\partial Q_{ts}} = 0)\\\\
  &=\sum_{i \leqslant m} G_{is} \times P_{it}\\\\
  &=\sum_{i \leqslant m} P^T_{ti} \times G_{is}
\end{aligned}
$$
so, written as a matrix product,
$$
GQ = P^T \times G
$$
Similarly, **the gradient of P** is
$$
GP = G \times Q^T
$$

### Relu

Let the input be **X** and the output **Y**. Each element of the ReLU output depends **only on the element at the same position of the input**; denote the corresponding pair of elements by **x** and **y**.

Then
$$
y = relu(x)=
  \begin{cases}
  0& x \leq 0\\\\
  x& x > 0
  \end{cases}
$$
and
$$
\frac{\partial Y_{ts}}{\partial X_{ij}} =
  \begin{cases}
  1& t = i \text{ and } s = j \text{ and } X_{ij} > 0 \\\\
  0& \text{otherwise}
  \end{cases}
$$

Therefore
$$
\frac{\partial L}{\partial X_{ij}} = \frac{\partial L}{\partial Y_{ij}} \times \frac{\partial Y_{ij}}{\partial X_{ij}}
$$
Let M be the mask matrix of X, whose elements are 1 where the corresponding element of X is positive and 0 otherwise. The expression above then becomes, in matrix form,
$$
GX = GY * M
$$
where $*$ denotes the element-wise (Hadamard) product.

### Log

Let the input be **X** and the output **Y**. Each element of the Log output again depends **only on the element at the same position of the input**; denote the pair by **x** and **y**.

Then
$$
\frac{dy}{dx} = \frac{1}{x}
$$
As with ReLU, define a matrix M of the same shape as X with
$$
M_{ij} = \frac{1}{X_{ij}}
$$
so that
$$
GX = GY * M
$$

### Softmax

Let the input be **X** and the output **Y**, where the first dimension of **X** is the batch dimension, so each element of the Softmax output depends only on the elements **in the same row** of the input. For convenience, first consider a batch size of 1, i.e. X (1 × n), and write $X_i$ for the element in the first row and i-th column of X.

The output Y is also a 1 × n matrix, indexed in the same way. The forward computation is
$$
Y_i = \frac{e^{X_i}}{\sum_{k=1}^n e^{X_k}}
$$
and the gradient is
$$
\frac{\partial Y_i}{\partial X_j} =
  \begin{cases}
  Y_i \times (1 - Y_i) & i = j\\\\
  -Y_i \times Y_j & i \neq j
  \end{cases}
$$

From this we obtain the Jacobian matrix **J (n × n)** of the vector Y with respect to the vector X, defined by
$$
J_{ij} = \frac{\partial Y_i}{\partial X_j}
$$
Then
$$
\begin{aligned}
GX_{i} &= \frac{\partial L}{\partial X_i} \\\\
  &=\sum_{k=1}^{n}\frac{\partial L}{\partial Y_k} \times \frac{\partial Y_k}{\partial X_i}\\\\
  &=\sum_{k=1}^{n} GY_k \times J_{ki}\\\\
  &=\sum_{k=1}^{n} GY_{1k} \times J_{ki}\\\\
  &=GY \times J
\end{aligned}
$$
where GY is the gradient of Y, i.e. the input of this layer's backward pass.

This gives the backward formula when the first dimension of X is 1. When the batch size is larger than 1, it suffices to add a leading dimension, extend X, Y, GY and J accordingly, and use

```python
numpy.matmul()
```

which performs the batched matrix multiplication automatically and yields GX.
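Before moving on to the model, the derivations above can be sanity-checked with a finite-difference test. The following is only a minimal sketch (the test sizes are arbitrary; it assumes the `Matmul` operator implemented in `numpy_fnn.py` of this submission):

```python
import numpy as np
from numpy_fnn import Matmul

def check_matmul_grad(eps=1e-6):
    np.random.seed(0)
    x, W = np.random.randn(4, 3), np.random.randn(3, 5)
    op = Matmul()
    out = op.forward(x, W)
    grad_y = np.random.randn(*out.shape)      # pretend upstream gradient G
    grad_x, grad_W = op.backward(grad_y)      # analytic: G @ W^T and x^T @ G

    # numerical gradient of L = sum(O * G) with respect to one entry of W
    i, j = 1, 2
    W2 = W.copy()
    W2[i, j] += eps
    numeric = ((np.matmul(x, W2) - out) * grad_y).sum() / eps
    print(abs(numeric - grad_W[i, j]))        # should be ~0 up to floating-point error

check_matmul_grad()
```

The same pattern works for Relu, Log and Softmax by perturbing one entry of the input instead of the weight.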
## Model training and testing

### Building the model

The model is first built following `torch_mnist`.

**Forward pass**

```python
x = self.matmul_1.forward(x, self.W1)
x = self.relu_1.forward(x)
x = self.matmul_2.forward(x, self.W2)
x = self.relu_2.forward(x)
x = self.matmul_3.forward(x, self.W3)
x = self.softmax.forward(x)
x = self.log.forward(x)
```

~~**Backward pass**~~ (the test cases were changed later and the model changed with them; the final version is given below)

One thing to note here: torch's backward pass starts from a **scalar** (the loss), but our model has no final layer producing a scalar, so the gradient of the topmost layer has to be computed by hand. The scalar loss used by the torch test case is:

```python
loss = (-y_pred * y).sum(dim=1).mean()
```

Because the loss averages over the batch, the gradient fed into the topmost layer (the top-level gradient matrix) is **-y / y.shape[0]**. However, the model's backward function already contained this snippet:

```python
for size in y.shape:
    y /= size
```

Compared to the required gradient, this y has the opposite sign and is divided by an extra factor of y.shape[1], so at the beginning of the backward pass I fold the compensation into the top-level gradient. The final code 7:

```python
####################
#      code 7      #
####################

#mulgrade = mulgrade3
#x3_grade = mulgrade2
#x2_grade = mulgrade1
#x1_grade = input_grad

y *= (-y.shape[1])
self.log_grad = y
self.softmax_grad = self.log.backward(self.log_grad)

mulgrade = self.softmax.backward(self.softmax_grad)
self.relu_2_grad,self.W3_grad = self.matmul_3.backward(mulgrade)

self.x3_grad = self.relu_2.backward(self.relu_2_grad)
self.relu_1_grad,self.W2_grad = self.matmul_2.backward(self.x3_grad)

self.x2_grad = self.relu_1.backward(self.relu_1_grad)
self.x1_grad,self.W1_grad = self.matmul_1.backward(self.x2_grad)
```

**Backward pass, version 2**

Here each `_grad` attribute stores the gradient of the corresponding layer's input; the code is:

```python
self.log_grad = self.log.backward(y)
self.softmax_grad = self.softmax.backward(self.log_grad)

mulgrade3,self.W3_grad = self.matmul_3.backward(self.softmax_grad)
self.relu_2_grad = self.relu_2.backward(mulgrade3)

mulgrade2,self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
self.relu_1_grad = self.relu_1.backward(mulgrade2)

self.x1_grad,self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
```

### Implementing mini_batch with numpy

Shuffle the dataset and split it according to batch_size:

```python
def mini_batch(dataset, batch_size=128, numpy=False):
    data = []
    label = []
    for x in dataset:
        data.append(np.array(x[0]))
        label.append(x[1])
    data = np.array(data)
    label = np.array(label)

    # shuffle the indices
    siz = data.shape[0]
    ind = np.arange(siz)
    np.random.shuffle(ind)

    # split into batches
    res = []
    con = 0
    while con + batch_size <= siz:
        data_batch = data[ind[con:con + batch_size]]
        label_batch = label[ind[con:con + batch_size]]
        res.append((data_batch,label_batch))
        con += batch_size

    return res
```
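A quick usage sketch (shapes are for MNIST; it assumes the `download_mnist` helper from `numpy_mnist.py`). Note that trailing samples that do not fill a whole batch are dropped, which keeps every batch the same size:

```python
train_dataset, _ = download_mnist()
batches = mini_batch(train_dataset, batch_size=128)

x, y = batches[0]
print(x.shape, y.shape)   # (128, 1, 28, 28) (128,)
print(len(batches))       # 60000 // 128 = 468; the last 96 samples are dropped
```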
### Training and testing

This part of the code was provided by the TAs and uses the MNIST handwritten-digit dataset. After downloading the dataset, each epoch reads the data batch by batch according to batch_size and runs one forward pass, one backward pass and one optimization step. The main part:

```python
for epoch in range(epoch_number):
    for x, y in mini_batch(train_dataset):

        y = one_hot(y)

        # y_pred = model.forward(x.numpy())
        y_pred = model.forward(x)
        loss = (-y_pred * y).sum(axis=1).mean()
        model.backward(y)
        model.optimize(learning_rate)

        train_loss.append(loss.item())

    x, y = batch(test_dataset)[0]
    accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
    print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
```

### Test results

**Loss curve**

![image-20210425011755375](img/image-20210425011755375.png)

**Accuracy after each epoch (3 epochs in total)**

```
[0] Accuracy: 0.9459
[1] Accuracy: 0.9635
[2] Accuracy: 0.9713
```

## Extension: PyTorch weight initialization

### Conclusion

The conclusion up front: the default initialization of PyTorch's linear layers is **Kaiming** initialization, proposed by the computer-vision researcher **Kaiming He (何恺明)**. My investigation covers:

* Why use Kaiming initialization?
* The formulas of Xavier initialization, on which Kaiming initialization builds
* The formulas of Kaiming initialization
* A simple NumPy implementation of Kaiming initialization

### Why use Kaiming initialization?

**A fixed distribution?**

When thinking about how to initialize the weight matrices, the first idea is that the initial weights should be random. The natural candidates are the **uniform or normal distribution**, so what if we used a **fixed distribution that ignores the model**, for example the standard normal distribution (mean 0, variance 1)? If the model itself is not taken into account, a fixed distribution causes the following problems:

* If the absolute values of the weights are too small, the variance of the signal shrinks at every layer of a deep network; by the time it reaches the output layer, the influence of the input has become negligible. Training quality suffers, and vanishing gradients may occur. (Details omitted; see https://zhuanlan.zhihu.com/p/25631496)
* If the absolute values of the weights are too large, the variance of the signal grows with depth by the same argument, which leads to exploding or vanishing gradients.

As an example, consider a network with several sigmoid layers in the middle (a function whose derivative goes to 0 on both sides):

* If the initial weights are too small in absolute value, the variance of the signal becomes too small as depth grows. For small inputs the sigmoid is nearly linear, so the deep model loses the benefit of nonlinearity. (**model quality**)
* If the initial weights are too large in absolute value, the variance of the signal becomes too large as depth grows. Large sigmoid inputs mean the activation saturates and the gradient approaches zero. (**vanishing gradients**)

### Xavier initialization

The problems above suggest choosing the random distribution (its mean and variance) according to the model itself (its dimensions and size). **Xavier initialization** was designed for this: it keeps the **variance of the signal unchanged** as it passes through a layer. PyTorch implements this through a gain value; the following function returns the gain for a given layer type:

```python
torch.nn.init.calculate_gain(nonlinearity, param=None)
```

Gain table (image taken from https://blog.csdn.net/winycg/article/details/86649832)

![gain table](img/20190125144412278.png)

Xavier initialization can use a uniform distribution **U(-a, a)** with
$$
a = gain \times \sqrt[]{\frac{6}{fan\_in+fan\_out}}
$$
or a normal distribution **N(0, std)** with
$$
std = gain \times \sqrt[]{\frac{2}{fan\_in+fan\_out}}
$$
where fan_in and fan_out are the numbers of input and output neurons; for a fully connected layer these are simply the input and output feature counts.

### Kaiming initialization

Xavier initialization does not work well with ReLU layers, mainly because ReLU maps negative values to 0, which changes the overall variance. **Kaiming He** therefore improved on it and proposed Kaiming initialization, at first mainly for computer vision and convolutional networks.

The uniform variant of Kaiming initialization uses **U(-bound, bound)** with (the meaning of a is explained below)
$$
bound = \sqrt[]{\frac{6}{(1 + a ^2) \times fan\_in}}
$$
In PyTorch this formula is again implemented with the gain as an intermediate value, i.e.
$$
bound = gain \times \sqrt[]{\frac{3}{ fan\_in}}
$$
where
$$
gain = \sqrt{\frac{2}{1 + a^2}}
$$
The normal variant of Kaiming initialization uses **N(0, std)** with
$$
std = \sqrt[]{\frac{2}{(1 + a ^2) \times fan\_in}}
$$
A word on the meaning of a. The source code describes it as

```
the negative slope of the rectifier used after this layer
```

In short, a is the negative slope of the (leaky) rectifier that follows the layer: the rectifier "flattens" part of the negative inputs to 0, and a enters the formula to balance the effect of this flattening on the variance.

### The initialization we use

Looking at the get_torch_initialization function we currently use, the weights come from the default initialization of PyTorch linear layers:

```python
fc1 = torch.nn.Linear(28 * 28, 256)
```

Inside the Linear class,

```python
self.reset_parameters()
```

performs the random initialization, and that method in turn calls

```python
init.kaiming_uniform_(self.weight, a=math.sqrt(5))
```

which is exactly the Kaiming uniform initialization described above. The body of this function matches the earlier formulas (with gain as the intermediate value):

```python
fan = _calculate_correct_fan(tensor, mode)
gain = calculate_gain(nonlinearity, a)
std = gain / math.sqrt(fan)
bound = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation
with torch.no_grad():
    return tensor.uniform_(-bound, bound)
```

~~The parameter a is set to 5.~~

The parameter a is set to √5.
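To double-check that the formula above matches what PyTorch actually does, here is a small comparison sketch (it assumes a PyTorch installation; the 784 × 256 layer size mirrors W1 in this submission):

```python
import math
import torch

fan_in, fan_out, a = 28 * 28, 256, math.sqrt(5)

# torch.nn.Linear stores its weight as (out_features, in_features)
W = torch.empty(fan_out, fan_in)
torch.nn.init.kaiming_uniform_(W, a=a)

bound = math.sqrt(6.0 / ((1 + a * a) * fan_in))   # the formula used above
print(bound)                                      # 1/28 ≈ 0.0357 for a = sqrt(5)
print(W.abs().max().item())                       # just below the same bound
print(W.std().item(), bound / math.sqrt(3))       # std of U(-b, b) is b / sqrt(3)
```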
### ~~Implementing get_torch_initialization with numpy~~ (corrected)

For simplicity I did not replicate PyTorch's layered packaging of the initialization code, which mainly exists to offer several different initialization schemes. I implemented get_torch_initialization directly with NumPy, following the linear layer's default scheme (the Kaiming uniform formula) with a = 5:

```python
def get_torch_initialization(numpy = True):

    a = 5

    def Kaiming_uniform(fan_in,fan_out,a):
        bound = 6.0 / (1 + a * a) / fan_in
        bound = bound ** 0.5
        W = np.random.uniform(low=-bound, high=bound, size=(fan_in,fan_out))
        return W

    W1 = Kaiming_uniform(28 * 28, 256, a)
    W2 = Kaiming_uniform(256, 64, a)
    W3 = Kaiming_uniform(64, 10, a)
    return W1,W2,W3
```

Along the way I converted the other utility functions (including the mini_batch above) to NumPy versions, ~~placed in numpyutils~~ now all placed in numpy_mnist, so numpy_mnist can run without the torch package. The one exception is download_mnist, which still needs torchvision to download the dataset.

### ~~Testing~~ (corrected)

After swapping in the new utilities, rerunning numpy_mnist gives essentially the same accuracy as before:

```
[0] Accuracy: 0.9340
[1] Accuracy: 0.9584
[2] Accuracy: 0.9684
```

## April 27: correction of the initialization method

The previously submitted version used the same initialization as the Linear layer's default. Two problems with that came to light today (special thanks to 彭润宇 for pointing them out):

* PyTorch's default linear-layer initialization assumes the nonlinearity is **Leaky ReLU** and sets the default value of a to **√5**, not 5. My formulas above used 5, which degrades the results badly.
* As Kaiming He's paper states, a is the negative slope of the leaky ReLU. Since our model uses plain ReLU, a should be 0 to match the intent of Kaiming initialization.

This correction addresses both problems and adds a short study of the choice of a.

### The fix

The corrected get_torch_initialization takes a as a parameter with default value 0, i.e. Kaiming initialization for ReLU layers:

```python
def get_torch_initialization(numpy = True,a = 0):
    def Kaiming_uniform(fan_in,fan_out,a):
        bound = 6.0 / (1 + a * a) / fan_in
        bound = bound ** 0.5
        W = np.random.uniform(low=-bound, high=bound, size=(fan_in,fan_out))
        return W

    W1 = Kaiming_uniform(28 * 28, 256, a)
    W2 = Kaiming_uniform(256, 64, a)
    W3 = Kaiming_uniform(64, 10, a)
    return W1,W2,W3
```

### Testing the choice of a

PyTorch's decision to assume Leaky ReLU for Linear layers and set a to √5 is thought-provoking. To compare the effect of different values of a, I trained on the original dataset with a ranging from 0 to 6 in steps of 0.3, recording the accuracy after the 1st, 2nd and 3rd epochs. The results were inconclusive: the choice of initialization had very little visible effect on the accuracy after 3 epochs, and the differences were small even after the first epoch. Plausible reasons include:

* Our model and data do not suffer from **vanishing gradients** or **dead neurons**
* Randomness of the batches and too few test runs

The results are kept in img. For our model I nevertheless follow the rule from Kaiming He's paper and use a = 0 for ReLU layers.

### An open question

The value of a in PyTorch's default linear-layer initialization is puzzling. According to Kaiming He, a should be the **negative slope** of the Leaky ReLU, a positive number smaller than 1 (and that is how the lower-level PyTorch source uses it, as shown below):

![image-20210427212809776](img/image-20210427212809776.png)

Yet the Linear layer sets the default to √5:

```python
init.kaiming_uniform_(self.weight, a=math.sqrt(5))
```

These two are in conflict: the default linear-layer initialization substitutes a = $\sqrt{5}$ into
$$
bound = \sqrt[]{\frac{6}{(1 + a ^2) \times fan\_in}}
$$
and therefore produces a rather small bound.

Several people have raised this question, and I have not seen a convincing explanation. One of the discussions:

https://github.com/pytorch/pytorch/issues/15314

I think this may be an ambiguity, or even a mistake, in this version of PyTorch.
diff --git a/assignment-2/submission/18307130154/img/20190125144412278.png b/assignment-2/submission/18307130154/img/20190125144412278.png
new file mode 100644
index 0000000000000000000000000000000000000000..fcbc3a2982c4162900790d4e3d479717765b743f
Binary files /dev/null and b/assignment-2/submission/18307130154/img/20190125144412278.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210425011755375.png b/assignment-2/submission/18307130154/img/image-20210425011755375.png
new file mode 100644
index 0000000000000000000000000000000000000000..62a58dedaff524c0d49407a1103b4ac0d7e8d022
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210425011755375.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210425230057530.png b/assignment-2/submission/18307130154/img/image-20210425230057530.png
new file mode 100644
index 0000000000000000000000000000000000000000..7779533c9222baca603aab11e54f32d58054bb90
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210425230057530.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210425230119977.png b/assignment-2/submission/18307130154/img/image-20210425230119977.png
new file mode 100644
index 0000000000000000000000000000000000000000..70f10047ed945ea6ac69f36d9a80195e11de4967
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210425230119977.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210427200512951.png b/assignment-2/submission/18307130154/img/image-20210427200512951.png
new file mode 100644
index 0000000000000000000000000000000000000000..43189faca346fc18e2938d53d39691aea37c954e
Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427200512951.png differ
diff --git a/assignment-2/submission/18307130154/img/image-20210427203245993.png b/assignment-2/submission/18307130154/img/image-20210427203245993.png
new file mode 100644
index
0000000000000000000000000000000000000000..52cfc7d3907638f1502a6a89866f44a6af6b73bd Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427203245993.png differ diff --git a/assignment-2/submission/18307130154/img/image-20210427203300617.png b/assignment-2/submission/18307130154/img/image-20210427203300617.png new file mode 100644 index 0000000000000000000000000000000000000000..24b35eed4c9f022a11991135806034b706dec21c Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427203300617.png differ diff --git a/assignment-2/submission/18307130154/img/image-20210427203337433.png b/assignment-2/submission/18307130154/img/image-20210427203337433.png new file mode 100644 index 0000000000000000000000000000000000000000..912b1ca130c033a9ba33e0f0b30254843241c5bc Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427203337433.png differ diff --git a/assignment-2/submission/18307130154/img/image-20210427205224362.png b/assignment-2/submission/18307130154/img/image-20210427205224362.png new file mode 100644 index 0000000000000000000000000000000000000000..1bb5da48837686d89da73925b935accbe5454c17 Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427205224362.png differ diff --git a/assignment-2/submission/18307130154/img/image-20210427205245840.png b/assignment-2/submission/18307130154/img/image-20210427205245840.png new file mode 100644 index 0000000000000000000000000000000000000000..4ec5e96e75e7987a6d12d4977a49205c03ca923a Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427205245840.png differ diff --git a/assignment-2/submission/18307130154/img/image-20210427205308848.png b/assignment-2/submission/18307130154/img/image-20210427205308848.png new file mode 100644 index 0000000000000000000000000000000000000000..060021006b29d7907d064146375f28b30079459e Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427205308848.png differ diff --git a/assignment-2/submission/18307130154/img/image-20210427212809776.png b/assignment-2/submission/18307130154/img/image-20210427212809776.png new file mode 100644 index 0000000000000000000000000000000000000000..d0e834c5023e6ce211c264c0c386a97af8e21172 Binary files /dev/null and b/assignment-2/submission/18307130154/img/image-20210427212809776.png differ diff --git a/assignment-2/submission/18307130154/numpy_fnn.py b/assignment-2/submission/18307130154/numpy_fnn.py new file mode 100644 index 0000000000000000000000000000000000000000..9eb8a954dc83f8c8125a655602af0cac7933c4de --- /dev/null +++ b/assignment-2/submission/18307130154/numpy_fnn.py @@ -0,0 +1,215 @@ +import numpy as np + + +class NumpyOp: + + def __init__(self): + self.memory = {} + self.epsilon = 1e-12 + + +class Matmul(NumpyOp): + + def forward(self, x, W): + """ + x: shape(N, d) + w: shape(d, d') + """ + self.memory['x'] = x + self.memory['W'] = W + h = np.matmul(x, W) + return h + + def backward(self, grad_y): + """ + grad_y: shape(N, d') + """ + + #################### + # code 1 # + #################### + x = self.memory['x'] + W = self.memory['W'] + + grad_W = np.matmul(x.T,grad_y) + grad_x = np.matmul(grad_y,W.T) + return grad_x, grad_W + + +class Relu(NumpyOp): + + def forward(self, x): + self.memory['x'] = x + return np.where(x > 0, x, np.zeros_like(x)) + + def backward(self, grad_y): + """ + grad_y: same shape as x + """ + + #################### + # code 2 # + #################### + x = self.memory['x'] + x1 = np.where(x > 0, 1, 0) + grad_x 
= x1 * grad_y + return grad_x + + +class Log(NumpyOp): + + def forward(self, x): + """ + x: shape(N, c) + """ + + out = np.log(x + self.epsilon) + self.memory['x'] = x + + return out + + def backward(self, grad_y): + """ + grad_y: same shape as x + """ + + #################### + # code 3 # + #################### + x = self.memory['x'] + grad_x = 1/(x + self.epsilon) + grad_x = grad_x * grad_y + return grad_x + + +class Softmax(NumpyOp): + """ + softmax over last dimension + """ + + def forward(self, x): + """ + x: shape(N, c) + """ + + #################### + # code 4 # + #################### + self.memory['x'] = x + ex = np.exp(x) + rowsum = np.sum(ex,axis=1) + rowsum = rowsum[:,np.newaxis] + softmax = ex / rowsum + self.memory['softmax'] = softmax + return softmax + + + def backward(self, grad_y): + """ + grad_y: same shape as x + """ + + #################### + # code 5 # + #################### + sm = self.memory['softmax'] + Jacobs = [] + for i in range(sm.shape[0]): + r = sm[i] + #对每一行求雅各比矩阵(因为导数只与本行有关) + J = np.diag(r) - np.outer(r, r) + Jacobs.append(J) + Jacobs = np.array(Jacobs) + + grad_y = grad_y[:,np.newaxis,:] + grad_x = np.matmul(grad_y,Jacobs) + grad_x = np.squeeze(grad_x,axis=1) + + return grad_x + +class NumpyLoss: + + def __init__(self): + self.target = None + + def get_loss(self, pred, target): + self.target = target + return (-pred * target).sum(axis=1).mean() + + def backward(self): + return -self.target / self.target.shape[0] + + +class NumpyModel: + def __init__(self): + self.W1 = np.random.normal(size=(28 * 28, 256)) + self.W2 = np.random.normal(size=(256, 64)) + self.W3 = np.random.normal(size=(64, 10)) + + + # 以下算子会在 forward 和 backward 中使用 + self.matmul_1 = Matmul() + self.relu_1 = Relu() + self.matmul_2 = Matmul() + self.relu_2 = Relu() + self.matmul_3 = Matmul() + self.softmax = Softmax() + self.log = Log() + + # 以下变量需要在 backward 中更新 + self.x1_grad, self.W1_grad = None, None + self.relu_1_grad = None + self.x2_grad, self.W2_grad = None, None + self.relu_2_grad = None + self.x3_grad, self.W3_grad = None, None + self.softmax_grad = None + self.log_grad = None + + + def forward(self, x): + x = x.reshape(-1, 28 * 28) + + #################### + # code 6 # + #################### + x = self.matmul_1.forward(x, self.W1) + x = self.relu_1.forward(x) + x = self.matmul_2.forward(x, self.W2) + x = self.relu_2.forward(x) + x = self.matmul_3.forward(x, self.W3) + x = self.softmax.forward(x) + x = self.log.forward(x) + + return x + + def backward(self, y): + + + + #################### + # code 7 # + #################### + + self.log_grad = self.log.backward(y) + self.softmax_grad = self.softmax.backward(self.log_grad) + + mulgrade3,self.W3_grad = self.matmul_3.backward(self.softmax_grad) + self.relu_2_grad = self.relu_2.backward(mulgrade3) + + mulgrade2,self.W2_grad = self.matmul_2.backward(self.relu_2_grad) + self.relu_1_grad = self.relu_1.backward(mulgrade2) + + self.x1_grad,self.W1_grad = self.matmul_1.backward(self.relu_1_grad) + + + + pass + + def optimize(self, learning_rate): + self.W1 -= learning_rate * self.W1_grad + self.W2 -= learning_rate * self.W2_grad + self.W3 -= learning_rate * self.W3_grad + + + + \ No newline at end of file diff --git a/assignment-2/submission/18307130154/numpy_mnist.py b/assignment-2/submission/18307130154/numpy_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..1abc1e73eef32967faa94c5f1d93f20f8ae96d2d --- /dev/null +++ b/assignment-2/submission/18307130154/numpy_mnist.py @@ -0,0 +1,112 @@ +from numpy_fnn import 
NumpyModel, NumpyLoss + +import numpy as np +from matplotlib import pyplot as plt + +def get_torch_initialization(numpy = True,a=0): + + + def Kaiming_uniform(fan_in,fan_out,a): + bound = 6.0 / (1 + a * a) / fan_in + bound = bound ** 0.5 + W = np.random.uniform(low=-bound, high=bound, size=(fan_in,fan_out)) + return W + + W1 = Kaiming_uniform(28 * 28, 256, a) + W2 = Kaiming_uniform(256, 64, a) + W3 = Kaiming_uniform(64, 10, a) + return W1,W2,W3 + +def plot_curve(data): + plt.plot(range(len(data)), data, color='blue') + plt.legend(['loss_value'], loc='upper right') + plt.xlabel('step') + plt.ylabel('value') + plt.show() + +def mini_batch(dataset, batch_size=128, numpy=False): + data = [] + label = [] + for x in dataset: + data.append(np.array(x[0])) + label.append(x[1]) + data = np.array(data) + label = np.array(label) + + #索引随机打乱 + siz = data.shape[0] + ind = np.arange(siz) + np.random.shuffle(ind) + + #划分batch + res = [] + con = 0 + while con + batch_size <= siz: + data_batch = data[ind[con:con + batch_size]] + label_batch = label[ind[con:con + batch_size]] + res.append((data_batch,label_batch)) + con += batch_size + + return res + +def batch(dataset, numpy=True): + data = [] + label = [] + for x in dataset: + data.append(np.array(x[0])) + label.append(x[1]) + data = np.array(data) + label = np.array(label) + return [(data, label)] + +def one_hot(y, numpy=True): + y_ = np.zeros((y.shape[0], 10)) + y_[np.arange(y.shape[0], dtype=np.int32), y] = 1 + return y_ + +def download_mnist(): + from torchvision import datasets, transforms + + transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(mean=(0.1307,), std=(0.3081,)) + ]) + + train_dataset = datasets.MNIST(root="./data/", transform=transform, train=True, download=True) + test_dataset = datasets.MNIST(root="./data/", transform=transform, train=False, download=True) + + return train_dataset, test_dataset + +def numpy_run(): + train_dataset, test_dataset = download_mnist() + + model = NumpyModel() + numpy_loss = NumpyLoss() + model.W1, model.W2, model.W3 = get_torch_initialization() + + train_loss = [] + + epoch_number = 3 + learning_rate = 0.1 + + for epoch in range(epoch_number): + for x, y in mini_batch(train_dataset): + y = one_hot(y) + + y_pred = model.forward(x) + loss = numpy_loss.get_loss(y_pred, y) + + model.backward(numpy_loss.backward()) + model.optimize(learning_rate) + + train_loss.append(loss.item()) + + x, y = batch(test_dataset)[0] + accuracy = np.mean((model.forward(x).argmax(axis=1) == y)) + print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy)) + + plot_curve(train_loss) + + +if __name__ == "__main__": + numpy_run()
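
# --- Illustration only (not part of the original submission) ---
# A minimal sketch of the `a`-sweep experiment described in the README: re-run a
# single training epoch for a few values of the negative-slope parameter `a` and
# report test accuracy. The values and the single-epoch budget are arbitrary.
def sweep_a(values=(0.0, 1.0, 5 ** 0.5), learning_rate=0.1):
    train_dataset, test_dataset = download_mnist()
    for a in values:
        model = NumpyModel()
        numpy_loss = NumpyLoss()
        model.W1, model.W2, model.W3 = get_torch_initialization(a=a)
        for x, y in mini_batch(train_dataset):
            y = one_hot(y)
            y_pred = model.forward(x)
            numpy_loss.get_loss(y_pred, y)
            model.backward(numpy_loss.backward())
            model.optimize(learning_rate)
        x, y = batch(test_dataset)[0]
        accuracy = np.mean(model.forward(x).argmax(axis=1) == y)
        print('a = {:.2f}  Accuracy after 1 epoch: {:.4f}'.format(a, accuracy))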