diff --git a/assignment-2/submission/19307130062/README.md b/assignment-2/submission/19307130062/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..02a93eb112820dff16d39f05398e6a648fea011b
--- /dev/null
+++ b/assignment-2/submission/19307130062/README.md
@@ -0,0 +1,523 @@
# Assignment 2. Feedforward Neural Networks

- **Name: 高庆麾**
- **Student ID: 19307130062**



## Part 1: Derivation of the Gradient Formulas

### Matmul

Consider $Y = XW$, where $Y \in \mathbb{R}^{n\times d_2},\ X \in \mathbb{R}^{n \times d_1},\ W \in \mathbb{R}^{d_1 \times d_2}$.

Let the loss function be $\mathcal L(\boldsymbol y,\ \boldsymbol {\hat y})$, and suppose $\Delta_Y$ (the gradient of the loss with respect to $Y$) is known. We want $\Delta_X$ and $\Delta_W$.

The derivation is as follows:

#### Derivation of $\Delta_X$

Consider the partial derivatives contributed by each entry of $Y$ with respect to $X$, namely $\frac{\partial Y_{ij}}{\partial X}$.

Since $Y_{ij} = \sum_{k = 1}^{d_1}X_{ik}W_{kj}$, the entries of $X$ are independent of one another, and
$$
\frac{\partial Y_{ij}}{\partial X} =
\begin{bmatrix}
\frac{\partial Y_{ij}}{\partial X_{11}} & \frac{\partial Y_{ij}}{\partial X_{12}} & \cdots & \frac{\partial Y_{ij}}{\partial X_{1d_1}} \\
\frac{\partial Y_{ij}}{\partial X_{21}} & \frac{\partial Y_{ij}}{\partial X_{22}} & \cdots & \frac{\partial Y_{ij}}{\partial X_{2d_1}} \\
\vdots & \vdots & \ddots & \vdots \\
\frac{\partial Y_{ij}}{\partial X_{n1}} & \frac{\partial Y_{ij}}{\partial X_{n2}} & \cdots & \frac{\partial Y_{ij}}{\partial X_{nd_1}} \\
\end{bmatrix}
$$
we have $\left[\frac{\partial Y_{ij}}{\partial X}\right]_{ik} = W_{kj}$ for $k \in [1,\ d_1] \cap \mathbb{Z}$, and all other entries are $0$.

Since $\Delta_Y$, i.e. $\frac{\partial{\mathcal L}}{\partial Y_{ij}}$, is known,
$$
\frac{\partial{\mathcal L}}{\partial X_{ij}} = \sum_{s = 1}^n\sum_{t = 1}^{d_2} \frac{\partial{\mathcal L}}{\partial Y_{st}}\frac{\partial{Y_{st}}}{\partial X_{ij}} = \sum_{s = 1}^n\sum_{t = 1}^{d_2} \frac{\partial{\mathcal L}}{\partial Y_{st}}\left[\frac{\partial{Y_{st}}}{\partial X}\right]_{ij} = \sum_{t = 1}^{d_2} \frac{\partial{\mathcal L}}{\partial Y_{it}}\left[\frac{\partial{Y_{it}}}{\partial X}\right]_{ij} = \sum_{t = 1}^{d_2} \frac{\partial{\mathcal L}}{\partial Y_{it}}W_{jt} = \sum_{t = 1}^{d_2} \frac{\partial{\mathcal L}}{\partial Y_{it}}W^T_{tj}
$$
that is,
$$
\frac{\partial{\mathcal L}}{\partial X} = \frac{\partial{\mathcal L}}{\partial Y}W^T
$$
and therefore
$$
\Delta_X = \Delta_YW^T
$$

#### Derivation of $\Delta_W$

For $\Delta_W$ we proceed in the same way: $\left[\frac{\partial Y_{ij}}{\partial W}\right]_{kj} = X_{ik}$ for $k \in [1,\ d_1] \cap \mathbb{Z}$, and all other entries are $0$. Hence
$$
\frac{\partial{\mathcal L}}{\partial W_{ij}} = \sum_{s = 1}^{n}\sum_{t = 1}^{d_2} \frac{\partial{\mathcal L}}{\partial Y_{st}}\left[\frac{\partial{Y_{st}}}{\partial W}\right]_{ij} = \sum_{s = 1}^{n} \frac{\partial{\mathcal L}}{\partial Y_{sj}}\left[\frac{\partial{Y_{sj}}}{\partial W}\right]_{ij} = \sum_{s = 1}^{n} \frac{\partial{\mathcal L}}{\partial Y_{sj}}X_{si} = \sum_{s = 1}^{n} X_{is}^T\frac{\partial{\mathcal L}}{\partial Y_{sj}}
$$
that is,
$$
\frac{\partial{\mathcal L}}{\partial W} = X^T\frac{\partial{\mathcal L}}{\partial Y}
$$
and therefore
$$
\Delta_W = X^T\Delta_Y
$$
$\square$

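These two formulas are easy to sanity-check numerically. The sketch below is my own addition (not part of the submitted code): it picks the linear loss $\mathcal L = \sum_{ij} G_{ij}Y_{ij}$ for a fixed random matrix $G$, so that $\Delta_Y = G$, and compares the closed-form gradients above against central finite differences.

```python
import numpy as np

rng = np.random.default_rng(0)
n, d1, d2 = 4, 5, 3
X = rng.normal(size=(n, d1))
W = rng.normal(size=(d1, d2))
G = rng.normal(size=(n, d2))      # plays the role of Delta_Y


def loss(X, W):
    return np.sum(G * (X @ W))    # scalar loss with dL/dY = G


# closed-form gradients from the derivation above
dX = G @ W.T                      # Delta_X = Delta_Y W^T
dW = X.T @ G                      # Delta_W = X^T Delta_Y


def num_grad(f, A, eps=1e-6):
    """Central finite differences of the scalar f() with respect to the array A (perturbed in place)."""
    g = np.zeros_like(A)
    for idx in np.ndindex(*A.shape):
        A[idx] += eps
        fp = f()
        A[idx] -= 2 * eps
        fm = f()
        A[idx] += eps
        g[idx] = (fp - fm) / (2 * eps)
    return g


assert np.allclose(dX, num_grad(lambda: loss(X, W), X))
assert np.allclose(dW, num_grad(lambda: loss(X, W), W))
```
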
### ReLU

Consider $Y = \mathrm{ReLU}(X)$, where $Y,\ X \in \mathbb{R}^{n\times m}$.

Let the loss function be $\mathcal L(\boldsymbol y,\ \boldsymbol {\hat y})$, and suppose $\Delta_Y$ (the gradient of the loss with respect to $Y$) is known. We want $\Delta_X$.

The derivation is as follows:

#### Derivation of $\Delta_X$

The method is the same as above; the only thing to note is that $\mathrm{ReLU}$ is an element-wise function.

Consider the derivative contributed by each entry of $Y$ with respect to $X$, namely $\frac{\mathrm{d}Y_{ij}}{\mathrm{d} X}$.

Since $Y_{ij} = \mathrm{ReLU}(X_{ij})$, we have $\left[\frac{\mathrm{d}Y_{ij}}{\mathrm{d}X}\right]_{ij} = \mathrm{ReLU}'(X_{ij})$, and all other entries are $0$.

Clearly
$$
\mathrm{ReLU}'(x) = \begin{cases}
0, & x < 0 \\
1, & x > 0 \\
\mathrm{Undefined}, & x = 0
\end{cases}
$$
Of course, at $x = 0$, which in floating-point arithmetic is a borderline case caused by rounding, the derivative can be taken to be either $0$ or $1$.

Then
$$
\frac{\partial{\mathcal L}}{\partial X_{ij}} = \sum_{s = 1}^n\sum_{t = 1}^{m} \frac{\partial{\mathcal L}}{\partial Y_{st}}\frac{\mathrm{d}{Y_{st}}}{\mathrm{d} X_{ij}} = \frac{\partial{\mathcal L}}{\partial Y_{ij}}\left[\frac{\partial{Y_{ij}}}{\partial X}\right]_{ij} =\frac{\partial{\mathcal L}}{\partial Y_{ij}}\mathrm{ReLU}'(X_{ij})
$$
that is (here $\odot$ denotes the Hadamard, i.e. element-wise, product),
$$
\frac{\partial{\mathcal L}}{\partial X} = \frac{\partial{\mathcal L}}{\partial Y}\odot\mathrm{ReLU}'(X)
$$
and therefore
$$
\Delta_X = \Delta_Y\odot\mathrm{ReLU}'(X)
$$
$\square$



### Log

Consider $Y = \mathrm{Log}(X)$, where $Y,\ X \in \mathbb{R}^{n\times m}$.

Let the loss function be $\mathcal L(\boldsymbol y,\ \boldsymbol {\hat y})$, and suppose $\Delta_Y$ (the gradient of the loss with respect to $Y$) is known. We want $\Delta_X$.

The derivation is as follows:

#### Derivation of $\Delta_X$

The method is the same as above; again, note that $\mathrm{Log}$ is an element-wise function.

Consider the derivative contributed by each entry of $Y$ with respect to $X$, namely $\frac{\mathrm{d}Y_{ij}}{\mathrm{d} X}$.

Since $Y_{ij} = \mathrm{Log}(X_{ij})$, we have $\left[\frac{\mathrm{d}Y_{ij}}{\mathrm{d}X}\right]_{ij} = \mathrm{Log}'(X_{ij}) = \frac{1}{X_{ij}}$, and all other entries are $0$.

Then
$$
\frac{\partial{\mathcal L}}{\partial X_{ij}} = \sum_{s = 1}^n\sum_{t = 1}^{m} \frac{\partial{\mathcal L}}{\partial Y_{st}}\frac{\mathrm{d}{Y_{st}}}{\mathrm{d} X_{ij}} = \frac{\partial{\mathcal L}}{\partial Y_{ij}}\left[\frac{\partial{Y_{ij}}}{\partial X}\right]_{ij} =\frac{\partial{\mathcal L}}{\partial Y_{ij}}\frac{1}{X_{ij}}
$$
that is (where $\frac{1}{X}$ denotes the element-wise reciprocal of $X$),
$$
\frac{\partial{\mathcal L}}{\partial X} = \frac{\partial{\mathcal L}}{\partial Y}\odot\frac{1}{X}
$$
and therefore
$$
\Delta_X = \Delta_Y\odot\frac{1}{X}
$$
$\square$

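Both of these element-wise rules are one-liners in NumPy. A minimal sketch (the function names and argument order here are my own choice, not the assignment's interface):

```python
import numpy as np

def relu_backward(grad_y, x):
    # Delta_X = Delta_Y ⊙ ReLU'(X); the derivative at exactly x == 0 is taken to be 0 here
    return grad_y * (x > 0)

def log_backward(grad_y, x):
    # Delta_X = Delta_Y ⊙ (1 / X); assumes x > 0, as is the case when Log follows Softmax
    return grad_y / x
```

Choosing `(x > 0)` rather than `(x >= 0)` only changes the behavior on the measure-zero set $x = 0$, consistent with the remark above.
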
### Softmax

Consider $\boldsymbol y = \mathrm{Softmax}(\boldsymbol x)$, where $\boldsymbol y,\ \boldsymbol x \in \mathbb{R}^{1 \times c}$.

Let the loss function be $\mathcal L(\boldsymbol y,\ \boldsymbol {\hat y})$, and suppose $\Delta_{\boldsymbol y}$ (the gradient of the loss with respect to $\boldsymbol y$) is known. We want an expression for $\boldsymbol y$ (the forward computation) as well as $\Delta_{\boldsymbol x}$.

The derivation is as follows:

#### Derivation of $\boldsymbol y$ (forward computation)

By the definition of $\mathrm{Softmax}$,
$$
\boldsymbol y_i = \frac{e^{\boldsymbol x_i}}{\sum_{k = 1}^ce^{\boldsymbol x_k}}
$$

#### Derivation of $\Delta_{\boldsymbol x}$

Since
$$
\boldsymbol y_i = \frac{e^{\boldsymbol x_i}}{\sum_{k = 1}^ce^{\boldsymbol x_k}}
$$
and
$$
\frac{\partial \boldsymbol y}{\partial\boldsymbol x} =
\begin{bmatrix}
\frac{\partial \boldsymbol y_1}{\partial \boldsymbol x_1} & \frac{\partial \boldsymbol y_1}{\partial \boldsymbol x_2} & \cdots & \frac{\partial \boldsymbol y_1}{\partial \boldsymbol x_c} \\
\frac{\partial \boldsymbol y_2}{\partial \boldsymbol x_1} & \frac{\partial \boldsymbol y_2}{\partial \boldsymbol x_2} & \cdots & \frac{\partial \boldsymbol y_2}{\partial \boldsymbol x_c} \\
\vdots & \vdots & \ddots & \vdots \\
\frac{\partial \boldsymbol y_c}{\partial \boldsymbol x_1} & \frac{\partial \boldsymbol y_c}{\partial \boldsymbol x_2} & \cdots & \frac{\partial \boldsymbol y_c}{\partial \boldsymbol x_c} \\
\end{bmatrix}
$$
we have, for $i = j$,
$$
\left[\frac{\partial \boldsymbol y}{\partial\boldsymbol x}\right]_{ii} = \frac{\partial \boldsymbol y_i}{\partial \boldsymbol x_i} = \frac{\partial\left( \frac{e^{\boldsymbol x_i}}{\sum_{k = 1}^ce^{\boldsymbol x_k}}\right)}{\partial \boldsymbol x_i} = \frac{e^{\boldsymbol x_i}\left(\sum_{k = 1}^ce^{\boldsymbol x_k}\right) - e^{\boldsymbol x_i}e^{\boldsymbol x_i}}{\left(\sum_{k = 1}^ce^{\boldsymbol x_k}\right)^2} = \frac{e^{\boldsymbol x_i}}{\sum_{k = 1}^ce^{\boldsymbol x_k}}\cdot\frac{\left(\sum_{k = 1}^ce^{\boldsymbol x_k}\right) - e^{\boldsymbol x_i}}{\sum_{k = 1}^ce^{\boldsymbol x_k}} = \boldsymbol y_i(1 - \boldsymbol y_i)
$$
and, for $i \neq j$,
$$
\left[\frac{\partial \boldsymbol y}{\partial\boldsymbol x}\right]_{ij} = \frac{\partial \boldsymbol y_i}{\partial \boldsymbol x_j} = \frac{\partial\left( \frac{e^{\boldsymbol x_i}}{\sum_{k = 1}^ce^{\boldsymbol x_k}}\right)}{\partial \boldsymbol x_j} = \frac{-e^{\boldsymbol x_i}e^{\boldsymbol x_j}}{\left(\sum_{k = 1}^ce^{\boldsymbol x_k}\right)^2} = -\boldsymbol y_i\boldsymbol y_j
$$
Then
$$
\frac{\partial{\mathcal L}}{\partial\boldsymbol x_{j}} = \sum_{i = 1}^c\frac{\partial{\mathcal L}}{\partial\boldsymbol y_{i}}\frac{\partial\boldsymbol y_i}{\partial \boldsymbol x_j} = \sum_{i = 1}^c\frac{\partial{\mathcal L}}{\partial\boldsymbol y_{i}}\left[\frac{\partial\boldsymbol y}{\partial \boldsymbol x}\right]_{ij}
$$
that is,
$$
\frac{\partial{\mathcal L}}{\partial\boldsymbol x} = \frac{\partial{\mathcal L}}{\partial \boldsymbol y}\left[\frac{\partial\boldsymbol y}{\partial \boldsymbol x}\right]
$$
and therefore (with $\left[\frac{\partial\boldsymbol y}{\partial \boldsymbol x}\right]$ as computed above)
$$
\Delta_{\boldsymbol x} = \Delta_{\boldsymbol y}\left[\frac{\partial\boldsymbol y}{\partial \boldsymbol x}\right]
$$
Since the code works with a matrix whose rows are such vectors, we can apply the computation above to each row separately; the rows do not interact, which completes the backward pass of $\mathrm{Softmax}$.

$\square$

*Note the corresponding detail in the code: for a numpy array `A`, `A[i]` is one-dimensional (a flat sequence of numbers rather than a row vector), so to use it as a genuine row vector it has to be converted to the `matrix` type, which restores the explicit row dimension.

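The row-by-row Jacobian product can also be written without an explicit loop or `np.matrix`. Substituting the Jacobian entries derived above into the chain rule gives $\frac{\partial\mathcal L}{\partial\boldsymbol x_j} = \boldsymbol y_j\left(\frac{\partial\mathcal L}{\partial\boldsymbol y_j} - \sum_i \frac{\partial\mathcal L}{\partial\boldsymbol y_i}\boldsymbol y_i\right)$, which vectorizes as follows (a sketch of an equivalent formulation, not the submitted implementation):

```python
import numpy as np

def softmax_backward(grad_y, y):
    """grad_y, y: shape (N, c); y is the softmax output cached from the forward pass."""
    # Row-wise: Delta_x = y ⊙ (Delta_y − <Delta_y, y>)
    s = np.sum(grad_y * y, axis=1, keepdims=True)
    return y * (grad_y - s)

# quick check against the explicit Jacobian of a single row
rng = np.random.default_rng(0)
x = rng.normal(size=(1, 4))
y = np.exp(x) / np.exp(x).sum()
J = np.diag(y[0]) - np.outer(y[0], y[0])   # [dy/dx]_{ij} as derived above
g = rng.normal(size=(1, 4))
assert np.allclose(softmax_backward(g, y), g @ J)
```
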
### FNN

Since the whole FNN is built from the basic layers derived above, its forward computation is obtained simply by composing those layers, like nested function calls, following the model structure.

Let $F$ denote the function computed by the FNN. Then:

#### Derivation of $F$ (forward computation)

$F(X) = \mathrm{Log}(\mathrm{Softmax}(\mathrm{ReLU}(\mathrm{ReLU}(X\cdot W_1)\cdot W_2)\cdot W_3))$

where $X \in \mathbb{R}^{n \times 784},\ W_1 \in \mathbb{R}^{784\times 256},\ W_2 \in \mathbb{R}^{256\times 64},\ W_3 \in \mathbb{R}^{64\times 10},\ F(X) \in \mathbb{R}^{n \times 10}$, and $n$ is the number of samples.

#### Derivation of the FNN backward pass

From the code, the loss defined by the model is $\mathcal L(\boldsymbol y,\ \boldsymbol {\hat y}) = -\boldsymbol {\hat y}\boldsymbol y^T$ (where $\boldsymbol {\hat y}$ is the prediction vector), and over the whole dataset it is

$\mathcal L(Y,\ \hat Y) = -\frac{1}{n}\sum_{i = 1}^n{\hat Y_{i:}}Y_{i:}^T = -\frac{1}{n}\sum_{i = 1}^n\sum_{j = 1}^d{\hat Y_{ij}}Y_{ij}$

So we need to compute $\Delta_{\hat Y}$, the gradient of this loss with respect to $\hat Y$, where $\hat Y = F(X)$.

The derivation is as follows:

#### Derivation of $\Delta_{\hat Y}$

From the definition of $\mathcal L$ above, it follows immediately that
$$
\frac{\partial\mathcal L}{\partial\hat Y_{ij}} = -\frac{1}{n}Y_{ij}
$$
This gives the gradient matrix $\Delta_{\hat Y}$ that enters the backward pass; the gradients at all other layers are then obtained by propagating backwards layer by layer.

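As a cross-check of the $\mathrm{Log}\circ\mathrm{Softmax}$ tail of the network: pushing the seed $\Delta_{\hat Y} = -\frac{1}{n}Y$ back through the Log and Softmax rules derived earlier should reproduce the familiar softmax-cross-entropy gradient $\frac{1}{n}(\mathrm{Softmax}(X) - Y)$ for one-hot targets. The sketch below is my own verification and is not part of the submitted files:

```python
import numpy as np

rng = np.random.default_rng(0)
n, c = 5, 10
x = rng.normal(size=(n, c))
Y = np.eye(c)[rng.integers(0, c, size=n)]           # one-hot targets

p = np.exp(x - x.max(axis=1, keepdims=True))
p /= p.sum(axis=1, keepdims=True)                   # softmax output

g = -Y / n                                          # Delta_Yhat from the derivation above
g = g / p                                           # backward through Log
g = p * (g - np.sum(g * p, axis=1, keepdims=True))  # backward through Softmax

assert np.allclose(g, (p - Y) / n)                  # the familiar cross-entropy gradient
```
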
### Summary

**Here I would like to summarize my understanding of gradients and derivatives, and what I learned from deriving the backpropagation formulas in my own, hopefully concise, way.**

There are many ways to compute the gradients for backpropagation; for example, the trace technique[^1] and the vectorization technique[^2] are fairly general methods for high-dimensional differentiation. After a brief attempt, however, I found them far too cumbersome to apply. In fact, to compute gradients both correctly and simply, all we need is to be clear about what a gradient (or derivative) actually means.

*Below I may use "gradient" and "derivative" somewhat interchangeably; in this write-up they mean essentially the same thing.

Take a very general relation $A = f(B)$, where $A$ and $B$ can be tensors of arbitrary shape and size. With a little linear algebra one sees that this is just a compact way of writing many multivariate functional relations: if we take it apart, each entry $a$ of $A$ is simply a **multivariate function of all the entries of $B$**.

Next, why do we compute gradients in a neural network at all? Only because we want to optimize the network iteratively by gradient descent (the descent direction is the direction in which the loss decreases fastest, and the loss is generally a scalar, since otherwise its values could not be compared). To do so we must compute the gradient of every parameter and move each parameter a little along the descent direction, so that the loss of the whole network on a given dataset improves.

We can therefore think of each parameter's movement as contributing something to the final value of the loss. The chain rule tells us that the correct way to compute this contribution is to account for **every path of influence** from that parameter to the loss[^3]. Backpropagation (BP) exploits exactly this idea. If you are familiar with dynamic programming (DP) — say from operations research or competitive programming — you will recognize BP as a **rather natural and simple** DP over a directed acyclic graph (DAG) (much of the credit, of course, goes to the proof of the chain rule), because the computation graph of a neural network is necessarily a DAG: starting from any node and following the computation edges, we always end up at the loss.

This way, when computing the gradient from $B$ to $\mathcal L$ we no longer need to enumerate all paths of influence and accumulate all of their contributions; we only need the paths from $B$ to its image $A$ (the output of the relation that takes $B$ as part of its input), and then "splice" them with the previously computed gradient from $A$ to $\mathcal L$ — which in practice is just a matrix (or tensor) product — to obtain the total contribution of $B$. As for the order of $A$ and $B$: because the computation graph is a DAG, a valid order for the two (indeed, for all parameters) can be determined by a topological sort.

Returning to the general relation $A = f(B)$: what are all the valid paths from $B$ to $A$? This brings us back to the entry $a$ extracted from $A$ at the beginning (a multivariate function of all entries of $B$). "All valid paths" simply means considering the gradient of every such entry of $A$ (i.e., every such multivariate function) with respect to every entry of $B$; the contribution of an entry of $B$ is then the sum, weighted by the gradient of $\mathcal L$ with respect to each entry of $A$, of the partial derivative of that entry of $A$ with respect to the given entry of $B$. This is the "path splicing" mentioned above (it is also a weighted sum, and understood this way its meaning is quite transparent).

Thus, for the simple cases we meet in everyday practice we do not need heavy general-purpose machinery. As above, it suffices to compute, for every entry of $A$, its partial derivatives with respect to the entries of $B$, which reduces the problem to the partial derivatives of a scalar-valued multivariate function with respect to its inputs — an elementary exercise (it hardly even requires university-level mathematics). "Simple cases" here means that these multivariate functions all have a fairly uniform form, so working out a few of them already covers the general pattern. Finally, imitating the "path splicing" (i.e. the chain rule) and rewriting the result as a matrix product immediately yields the gradient. All the backpropagation derivations for the common layers in this report are based on this view, and to me it is sufficiently concise and easy to understand.





[^1]: Based on the identity $\mathrm{d}f = \mathrm{tr}\left(\frac{\partial f}{\partial X}^T \mathrm{d}X\right)$, used for differentiating a scalar with respect to a matrix. Starting from the scalar to be differentiated, wrap it in a trace and use trace identities to reach this form; the transpose of the factor to the left of $\mathrm{d}X$ inside the trace is the desired derivative.
[^2]: Based on the identity $\mathrm{vec}(\mathrm{d}F) = \frac{\partial F}{\partial X}^T \mathrm{vec}(\mathrm{d}X)$, used for differentiating a matrix with respect to a matrix. Similarly, starting from the matrix to be differentiated, apply vectorization and use vectorization identities to reach this form; the transpose of the factor to the left of $\mathrm{vec}(\mathrm{d}X)$ is the desired derivative.
[^3]: As an example, consider $f = f(x,\ y,\ z),\ x = g(u),\ y = h(u),\ z = t(u)$. If there are no obstacles regarding differentiability, then to obtain $\frac{\partial f}{\partial u}$ we must compute $\frac{\partial f}{\partial x}\frac{\partial x}{\partial u} + \frac{\partial f}{\partial y}\frac{\partial y}{\partial u} + \frac{\partial f}{\partial z}\frac{\partial z}{\partial u}$, i.e. account for every path of influence from $u$ to $f$.





## Part 2: Training and Testing the Model (earlier version)

When propagating the gradient back from the loss, I found that no `backward` interface was provided for the loss function, so I looked at its concrete form, did a short derivation, and found that it suffices to add a minus sign on top of the original averaging — after which all tests passed. For some reason, though, the initial accuracy was not very high, only a little over $80\%$; it only improved noticeably after training for several more epochs.

### Tuning the Learning Rate

#### Conservative Attempts

With the code in `numpy_fnn.py` filled in according to the derivations above, I ran `numpy_mnist.py` and tried different learning rates, with the following results:

(The actual order was: first the original configuration $\alpha = 0.1$; then, since the loss fluctuated a lot late in training, I suspected the instability came from too large a learning rate and tried the smaller $\alpha = 0.05$, which turned out clearly worse; so I gradually increased the learning rate instead and observed the effect. In the end I put the numbers and the loss curves side by side, which makes the comparison clearer and saves space.)

| Epoch | Accuracy ($\alpha = 0.05$) | Accuracy ($\alpha = 0.1$) | Accuracy ($\alpha = 0.15$) | Accuracy ($\alpha = 0.3$) |
| :---: | :------------------------: | :-----------------------: | :------------------------: | :-----------------------: |
| $0$ | $80.18\%$ | $87.50\%$ | $89.20\%$ | $91.06\%$ |
| $1$ | $87.25\%$ | $90.46\%$ | $91.22\%$ | $93.54\%$ |
| $2$ | $89.23\%$ | $91.65\%$ | $92.46\%$ | $94.64\%$ |
| $3$ | $90.40\%$ | $92.21\%$ | $93.19\%$ | $95.42\%$ |
| $4$ | $91.03\%$ | $92.98\%$ | $93.67\%$ | $96.02\%$ |
| $5$ | $91.61\%$ | $93.23\%$ | $94.45\%$ | $96.38\%$ |

(The four figures above show the training-loss curves for $\alpha = 0.05,\ 0.1,\ 0.15,\ 0.3$ from left to right; the loss clearly falls at visibly different speeds.)

These particular values of $\alpha$ were also chosen to compare training results at equal $\mathrm{epoch}\times \alpha$:

| condition | accuracy |
| :----------------------------------: | :-------: |
| $\alpha = 0.05,\ \mathrm{epoch} = 6$ | $91.61\%$ |
| $\alpha = 0.1,\ \mathrm{epoch} = 3$ | $91.65\%$ |
| $\alpha = 0.15,\ \mathrm{epoch} = 2$ | $91.22\%$ |
| $\alpha = 0.3,\ \mathrm{epoch} = 1$ | $91.06\%$ |

So, while keeping the final accuracy roughly unchanged, one can **moderately increase** the learning rate and reduce the number of training epochs, improving training efficiency.

Since we said "moderately" — when does it stop being moderate? That is explored in the next section.

#### An Excessively Large $\alpha$

Clearly nothing should be pushed to the extreme; we cannot keep raising the learning rate regardless of the consequences. Too large a learning rate easily makes training unstable and causes serious numerical problems, so this section explores how large an $\alpha$ is too large.

| Epoch | Accuracy ($\alpha = 1.0$) | Accuracy ($\alpha = 5.0$) | Accuracy ($\alpha = 8.0$) |
| :---: | :-----------------------: | :-----------------------: | :-----------------------: |
| $0$ | $94.64\%$ | $96.36\%$ | $9.80\%$ |
| $1$ | $96.28\%$ | $96.99\%$ | $9.80\%$ |
| $2$ | $97.08\%$ | $97.14\%$ | $9.80\%$ |
| $3$ | $97.39\%$ | $97.73\%$ | $9.80\%$ |
| $4$ | $97.57\%$ | $97.52\%$ | $9.80\%$ |
| $5$ | $97.82\%$ | $97.84\%$ | $9.80\%$ |

(The three figures above show the training-loss curves for $\alpha = 1.0,\ 5.0,\ 8.0$.)

As we can see, $\alpha = 1.0$ still works very well, at $\alpha = 5.0$ the loss already shows relatively large fluctuations, and at $\alpha = 8.0$ numerical problems appear during training and optimization fails.

#### Summary

- Within a reasonable range of $\alpha$, the accuracy at equal epoch $\times\ \alpha$ is roughly the same. Overall, a learning rate of $0.1$ is indeed not well suited to this model — the iterations are too slow.
- As $\alpha$ increases, the loss also decreases noticeably faster: the near-linear part of the loss curve becomes visibly steeper with larger $\alpha$.
- Raising $\alpha$ within reason improves accuracy markedly; for example, with $\alpha = 1.0$ the accuracy at the 6th epoch reaches almost $98\%$ (still clearly behind the training efficiency of the torch model, though).





## Part 2: Training and Testing the Model (revised version)

I then heard that `numpy_fnn.py` had been revised quite a bit, so I pulled the update and found that the loss function now has a `backward`, whose result is passed into the model's `backward`; nothing needs to be handled manually any more, and backpropagation can simply start from `ReLU`. This time the test accuracy on MNIST is much better: it starts above $90\%$ and improves significantly after a few epochs. This part therefore redoes the earlier experiments and updates the numbers.

### Tuning the Learning Rate

#### Conservative Attempts

With the code in `numpy_fnn.py` filled in according to the derivations above, I ran `numpy_mnist.py` and tried different learning rates, with the following results:

| Epoch | Accuracy ($\alpha = 0.05$) | Accuracy ($\alpha = 0.1$) | Accuracy ($\alpha = 0.15$) | Accuracy ($\alpha = 0.3$) |
| :---: | :------------------------: | :-----------------------: | :------------------------: | :-----------------------: |
| $0$ | $92.51\%$ | $94.56\%$ | $95.47\%$ | $96.09\%$ |
| $1$ | $94.96\%$ | $96.12\%$ | $95.89\%$ | $95.97\%$ |
| $2$ | $96.15\%$ | $96.96\%$ | $97.27\%$ | $96.94\%$ |
| $3$ | $96.28\%$ | $97.28\%$ | $97.52\%$ | $97.78\%$ |
| $4$ | $96.93\%$ | $97.23\%$ | $97.54\%$ | $98.05\%$ |
| $5$ | $97.14\%$ | $97.55\%$ | $98.08\%$ | $97.91\%$ |

(The four figures above show the training-loss curves for $\alpha = 0.05,\ 0.1,\ 0.15,\ 0.3$ from left to right; the loss clearly falls at visibly different speeds.)

These particular values of $\alpha$ were again chosen to compare training results at equal $\mathrm{epoch}\times \alpha$:

| condition | accuracy |
| :----------------------------------: | :-------: |
| $\alpha = 0.05,\ \mathrm{epoch} = 6$ | $97.1\%$ |
| $\alpha = 0.1,\ \mathrm{epoch} = 3$ | $96.96\%$ |
| $\alpha = 0.15,\ \mathrm{epoch} = 2$ | $95.89\%$ |
| $\alpha = 0.3,\ \mathrm{epoch} = 1$ | $96.09\%$ |

#### An Excessively Large $\alpha$

In the previous section we already saw that with a larger learning rate the training curve fluctuates noticeably and the final accuracy drops somewhat. What happens with an even larger learning rate?

| Epoch | Accuracy ($\alpha = 1.0$) |
| :---: | :-----------------------: |
| $0$ | $9.80\%$ |
| $1$ | $9.80\%$ |
| $2$ | $9.80\%$ |
| $3$ | $9.80\%$ |
| $4$ | $9.80\%$ |
| $5$ | $9.80\%$ |

(The figures above correspond to $\alpha = 1.0,\ 5.0,\ 8.0$.)

Unlike before, at $\alpha = 1.0$ the model already runs into numerical problems and optimization fails.



## Part 3: A Custom mini_batch

Writing the mini_batch function by hand in numpy is straightforward; the algorithm breaks down into the following steps (annotated in the code):

```python
def mini_batch(dataset, batch_size = 128, numpy = False):
    if batch_size <= 0 or not isinstance(batch_size, int):
        return None
    # 1. Check that the given batch_size is valid (a positive integer); return None otherwise

    data, label = batch(dataset)[0]
    # 2. Use the batch helper to convert the torchvision MNIST dataset into numpy arrays

    datanum = len(data)
    idx = np.arange(datanum)
    np.random.shuffle(idx)
    data, label = data[idx], label[idx]
    # 3. Randomly shuffle data and label together: shuffle an index array first, then use it to
    #    slice both data and label, so that data and label stay correctly paired

    batchnum = (datanum - 1) // batch_size + 1  # number of batches: ceiling of datanum / batch_size
    batches = []
    # 4. Compute the number of batches and initialize the batches list

    for i in range(batchnum):
        batches.append((data[i * batch_size: min(datanum, (i + 1) * batch_size)], label[i * batch_size: min(datanum, (i + 1) * batch_size)]))
    # 5. Slice out the data and label subsets of the i-th batch and append them to the batches list

    return batches
```

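The same batching can also be expressed with NumPy's splitting utilities. The sketch below is an equivalent alternative I add here for illustration (it is not part of the submission and assumes the same `batch` helper imported from `utils`):

```python
import numpy as np

def mini_batch_split(dataset, batch_size=128):
    # Equivalent batching via np.split; relies on the same `batch` helper as above.
    data, label = batch(dataset)[0]
    idx = np.random.permutation(len(data))          # shuffle data and label together
    data, label = data[idx], label[idx]
    sections = np.arange(batch_size, len(data), batch_size)
    return list(zip(np.split(data, sections), np.split(label, sections)))
```

The last batch may be smaller than `batch_size`, exactly as in the loop-based version above.
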
## Part 4: Further Exploration

### Comparing Other Gradient-Based Optimization Methods

Ten optimization methods are implemented in the code (numpy_fnn.py and numpy_mnist.py):

- Momentum
- Nesterov Accelerated Gradient (NAG), also known as Nesterov momentum
- Adaptive Moment Estimation (Adam)
- Inverse Time Decay
- Exponential Decay
- Natural Exponential Decay
- Cosine Decay
- Adaptive Gradient Algorithm (AdaGrad)
- RMSprop
- AdaDelta

Together with the version without any of these methods, this gives eleven comparison experiments in total (the Momentum and Adam update rules are sketched after this list for reference).

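To make the comparison concrete, here is a small sketch of two of the rules in the list. The Momentum step matches the rule used for $W_1$–$W_3$ in numpy_fnn.py ($\rho = 0.9$); the Adam step shown is the standard element-wise version, whereas the submitted variant accumulates a single scalar sum of squared gradients per weight matrix, so this is an illustration rather than the exact submitted code.

```python
import numpy as np

def momentum_step(W, grad, delta, lr=0.1, rho=0.9):
    # delta <- rho * delta - lr * grad;  W <- W + delta   (as in numpy_fnn.py)
    delta = rho * delta - lr * grad
    return W + delta, delta

def adam_step(W, grad, m, v, t, lr=0.1, beta1=0.9, beta2=0.99, eps=1e-7):
    # standard element-wise Adam with bias correction at step t (t >= 1)
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad * grad
    m_hat = m / (1 - beta1 ** t)
    v_hat = v / (1 - beta2 ** t)
    return W - lr * m_hat / (np.sqrt(v_hat) + eps), m, v
```
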
#### $\mathrm{epoch} = 3,\ \alpha = 0.1$

Here we set $\mathrm{epoch} = 3,\ \alpha = 0.1$; for the methods that adjust the learning rate, $\alpha$ is of course not fixed, and the value given is only the initial one.

| Epoch | None | Momentum | Nesterov | Adam |
| :---: | :-------: | :-------: | :-------: | :-------: |
| $0$ | $92.15\%$ | $96.72\%$ | $96.42\%$ | $95.43\%$ |
| $1$ | $96.21\%$ | $97.00\%$ | $96.88\%$ | $96.87\%$ |
| $2$ | $96.81\%$ | $97.54\%$ | $97.09\%$ | $97.55\%$ |

| Epoch | Inverse Time Decay | Exponential Decay | Natural Exponential Decay | Cosine Decay |
| :---: | :----------------: | :---------------: | :-----------------------: | :----------: |
| $0$ | $86.71\%$ | $81.10\%$ | $81.05\%$ | $94.17\%$ |
| $1$ | $87.93\%$ | $81.10\%$ | $81.05\%$ | $95.91\%$ |
| $2$ | $88.49\%$ | $81.10\%$ | $81.05\%$ | $96.08\%$ |

| Epoch | AdaGrad | RMSprop | AdaDelta |
| :---: | :-------: | :-------: | :-------: |
| $0$ | $95.04\%$ | $95.47\%$ | $76.42\%$ |
| $1$ | $96.28\%$ | $96.45\%$ | $86.86\%$ |
| $2$ | $97.24\%$ | $97.43\%$ | $89.22\%$ |

![5.1](img/5.2.png)

*I actually forgot to include RMSprop in this plot, but the comparison is still quite rich...



#### $\mathrm{epoch} = 3,\ \alpha = 0.05$

To bring out the differences between the methods more clearly, the learning rate $\alpha$ is lowered to $0.05$.

| Epoch | None | Momentum | Nesterov | Adam |
| :---: | :-------: | :-------: | :-------: | :-------: |
| $0$ | $92.47\%$ | $96.20\%$ | $96.65\%$ | $93.67\%$ |
| $1$ | $94.43\%$ | $97.38\%$ | $97.13\%$ | $96.11\%$ |
| $2$ | $95.67\%$ | $97.70\%$ | $97.70\%$ | $96.69\%$ |

| Epoch | Inverse Time Decay | Exponential Decay | Natural Exponential Decay | Cosine Decay |
| :---: | :----------------: | :---------------: | :-----------------------: | :----------: |
| $0$ | $77.18\%$ | $67.31\%$ | $64.09\%$ | $92.47\%$ |
| $1$ | $80.42\%$ | $67.31\%$ | $64.09\%$ | $94.07\%$ |
| $2$ | $81.90\%$ | $67.31\%$ | $64.09\%$ | $94.35\%$ |

| Epoch | AdaGrad | RMSprop | AdaDelta |
| :---: | :-------: | :-------: | :-------: |
| $0$ | $93.29\%$ | $93.74\%$ | $75.09\%$ |
| $1$ | $95.47\%$ | $95.17\%$ | $87.09\%$ |
| $2$ | $96.49\%$ | $96.71\%$ | $89.76\%$ |

![5.3](img/5.3.png)

*RMSprop is still missing from this plot...



#### $\mathrm{epoch} = 3,\ \alpha = 0.01$

To bring out the differences even more, the learning rate $\alpha$ is lowered further to $0.01$, and RMSprop is added to the plot this time.

| Epoch | None | Momentum | Nesterov | Adam |
| :---: | :-------: | :-------: | :-------: | :-------: |
| $0$ | $87.89\%$ | $94.37\%$ | $94.51\%$ | $89.81\%$ |
| $1$ | $90.47\%$ | $96.32\%$ | $95.82\%$ | $91.76\%$ |
| $2$ | $91.98\%$ | $97.12\%$ | $96.85\%$ | $92.96\%$ |

| Epoch | Inverse Time Decay | Exponential Decay | Natural Exponential Decay | Cosine Decay |
| :---: | :----------------: | :---------------: | :-----------------------: | :----------: |
| $0$ | $36.83\%$ | $21.94\%$ | $29.31\%$ | $87.22\%$ |
| $1$ | $45.39\%$ | $21.94\%$ | $29.31\%$ | $89.67\%$ |
| $2$ | $50.02\%$ | $21.94\%$ | $29.31\%$ | $89.85\%$ |

| Epoch | AdaGrad | RMSprop | AdaDelta |
| :---: | :-------: | :-------: | :-------: |
| $0$ | $89.44\%$ | $89.56\%$ | $77.58\%$ |
| $1$ | $91.83\%$ | $91.85\%$ | $86.84\%$ |
| $2$ | $92.76\%$ | $92.90\%$ | $89.26\%$ |

![5.4](img/5.4.png)

#### Summary

- Momentum and Nesterov perform well throughout and their results are close to each other. This is probably because the model in this assignment is fairly simple and the two methods do broadly similar things, so Nesterov's correction to the gradient update does not show. Adam also performs well when $\alpha$ is relatively large.
- For the methods that decay $\alpha$ in some way, the initial $\alpha$ must be large enough to work well: if $\alpha$ is too small to begin with, by the end of the decay the updates are no longer large enough to change the parameters meaningfully and training stalls (as with Exponential Decay and Natural Exponential Decay).
- AdaGrad and RMSprop behave similarly, again probably because the model here is simple, so the advantage of RMSprop's exponentially decaying moving average does not show; both are also affected by the shrinking effective learning rate.



### Weight Initialization

To be completed...
\ No newline at end of file
diff --git a/assignment-2/submission/19307130062/img/1.png b/assignment-2/submission/19307130062/img/1.png
new file mode 100644
index 0000000000000000000000000000000000000000..76e24ca85c7ce4d93a478588b5bb4c56712f55f8
Binary files /dev/null and b/assignment-2/submission/19307130062/img/1.png differ
diff --git a/assignment-2/submission/19307130062/img/2.2.png b/assignment-2/submission/19307130062/img/2.2.png
new file mode 100644
index 0000000000000000000000000000000000000000..dcd73246ee7c55075b77254e662b29c73b388479
Binary files /dev/null and b/assignment-2/submission/19307130062/img/2.2.png differ
diff --git a/assignment-2/submission/19307130062/img/2.3.png b/assignment-2/submission/19307130062/img/2.3.png
new file mode 100644
index 0000000000000000000000000000000000000000..52dc358d56961c0b88b78c4be37f09d6cf0ff73f
Binary files /dev/null and b/assignment-2/submission/19307130062/img/2.3.png differ
diff --git a/assignment-2/submission/19307130062/img/2.4.png b/assignment-2/submission/19307130062/img/2.4.png
new file mode 100644
index 0000000000000000000000000000000000000000..b1887ce4f8daf51fb7f0b83c31e8fc7af2edb333
Binary files /dev/null and b/assignment-2/submission/19307130062/img/2.4.png differ
diff --git a/assignment-2/submission/19307130062/img/2.5.png b/assignment-2/submission/19307130062/img/2.5.png
new file mode 100644
index 0000000000000000000000000000000000000000..0bfb996933564302164fef6ef8f81ccff226b261
Binary files /dev/null and b/assignment-2/submission/19307130062/img/2.5.png differ
diff --git a/assignment-2/submission/19307130062/img/2.8.png b/assignment-2/submission/19307130062/img/2.8.png
new file mode 100644
index 0000000000000000000000000000000000000000..25fbb117e3043af4ee3c0971e0df2f86e235002f
Binary files /dev/null and b/assignment-2/submission/19307130062/img/2.8.png differ
diff --git a/assignment-2/submission/19307130062/img/2.png b/assignment-2/submission/19307130062/img/2.png
new file mode 100644
index 0000000000000000000000000000000000000000..50493ce463016089ef27bd53da94fb037f1a11da
Binary files /dev/null and
b/assignment-2/submission/19307130062/img/2.png differ diff --git a/assignment-2/submission/19307130062/img/3.png b/assignment-2/submission/19307130062/img/3.png new file mode 100644 index 0000000000000000000000000000000000000000..ff604f17eb61b62e9f093adf749a1f9e300f183b Binary files /dev/null and b/assignment-2/submission/19307130062/img/3.png differ diff --git a/assignment-2/submission/19307130062/img/4.1.png b/assignment-2/submission/19307130062/img/4.1.png new file mode 100644 index 0000000000000000000000000000000000000000..06c1a99c134d02ef2198b12330e1fae207a0c88a Binary files /dev/null and b/assignment-2/submission/19307130062/img/4.1.png differ diff --git a/assignment-2/submission/19307130062/img/4.png b/assignment-2/submission/19307130062/img/4.png new file mode 100644 index 0000000000000000000000000000000000000000..517a871080f592d5bdf8b0d16ce2fb1851c0b0a8 Binary files /dev/null and b/assignment-2/submission/19307130062/img/4.png differ diff --git a/assignment-2/submission/19307130062/img/5.1.png b/assignment-2/submission/19307130062/img/5.1.png new file mode 100644 index 0000000000000000000000000000000000000000..3649baecd02c1fa828f201b1220fcf81a2d11131 Binary files /dev/null and b/assignment-2/submission/19307130062/img/5.1.png differ diff --git a/assignment-2/submission/19307130062/img/5.2.png b/assignment-2/submission/19307130062/img/5.2.png new file mode 100644 index 0000000000000000000000000000000000000000..7706ab7fefd7672d262ef8e8781bc97d3d749401 Binary files /dev/null and b/assignment-2/submission/19307130062/img/5.2.png differ diff --git a/assignment-2/submission/19307130062/img/5.3.png b/assignment-2/submission/19307130062/img/5.3.png new file mode 100644 index 0000000000000000000000000000000000000000..3f8c5876a1a72ebdbccf057b776ec5b9be91f296 Binary files /dev/null and b/assignment-2/submission/19307130062/img/5.3.png differ diff --git a/assignment-2/submission/19307130062/img/5.4.png b/assignment-2/submission/19307130062/img/5.4.png new file mode 100644 index 0000000000000000000000000000000000000000..e8ca3f58c0149ffd66c0510e30da28c4f2751ff4 Binary files /dev/null and b/assignment-2/submission/19307130062/img/5.4.png differ diff --git a/assignment-2/submission/19307130062/img/5.png b/assignment-2/submission/19307130062/img/5.png new file mode 100644 index 0000000000000000000000000000000000000000..d1698795bf9cd724b32984a43585b0ffcece7dad Binary files /dev/null and b/assignment-2/submission/19307130062/img/5.png differ diff --git a/assignment-2/submission/19307130062/img/6.png b/assignment-2/submission/19307130062/img/6.png new file mode 100644 index 0000000000000000000000000000000000000000..f3246e39870e14e383f18c231d30c5f22e2c637d Binary files /dev/null and b/assignment-2/submission/19307130062/img/6.png differ diff --git a/assignment-2/submission/19307130062/img/7.png b/assignment-2/submission/19307130062/img/7.png new file mode 100644 index 0000000000000000000000000000000000000000..b4fbff12bdc4a177d7fd555580ac7312fabf1d06 Binary files /dev/null and b/assignment-2/submission/19307130062/img/7.png differ diff --git a/assignment-2/submission/19307130062/img/8.png b/assignment-2/submission/19307130062/img/8.png new file mode 100644 index 0000000000000000000000000000000000000000..c8568213cf5f72bfe57b28c7bdb9e2a8cc2bcfe4 Binary files /dev/null and b/assignment-2/submission/19307130062/img/8.png differ diff --git a/assignment-2/submission/19307130062/numpy_fnn.py b/assignment-2/submission/19307130062/numpy_fnn.py new file mode 100644 index 
0000000000000000000000000000000000000000..c5c95457e32cddf3ea2cb9effb780331e632d2bd --- /dev/null +++ b/assignment-2/submission/19307130062/numpy_fnn.py @@ -0,0 +1,340 @@ +import numpy as np + +class NumpyOp: + + def __init__(self): + self.memory = {} + self.epsilon = 1e-12 + + +class Matmul(NumpyOp): + + def forward(self, x, W): + """ + x: shape(N, d) + w: shape(d, d') + """ + self.memory['x'] = x + self.memory['W'] = W + h = np.matmul(x, W) + return h + + def backward(self, grad_y): + """ + grad_y: shape(N, d') + """ + + # code1 + grad_x = np.matmul(grad_y, self.memory['W'].T) + grad_W = np.matmul(self.memory['x'].T, grad_y) + + return grad_x, grad_W + + +class Relu(NumpyOp): + + def forward(self, x): + self.memory['x'] = x + return np.where(x > 0, x, np.zeros_like(x)) + + def backward(self, grad_y): + """ + grad_y: same shape as x + """ + + # code2 + grad_x = grad_y * (self.memory['x'] >= 0) + + return grad_x + + +class Log(NumpyOp): + + def forward(self, x): + """ + x: shape(N, c) + """ + + out = np.log(x + self.epsilon) + self.memory['x'] = x + + return out + + def backward(self, grad_y): + """ + grad_y: same shape as x + """ + + # code3 + grad_x = grad_y * (1.0 / self.memory['x']) + + return grad_x + + +class Softmax(NumpyOp): + """ + softmax over last dimension + """ + + def forward(self, x): + """ + x: shape(N, c) + """ + + # code4 + mx = x.max(axis = 1).reshape(x.shape[0], -1) # 防止上溢和下溢 + ex = np.exp(x - mx) + out = (ex.T / (ex.sum(axis = 1))).T + self.memory['x'] = x + self.memory['y'] = out + + return out + + def backward(self, grad_y): + """ + grad_y: same shape as x + """ + + # code5 + x = self.memory['x'] + y = self.memory['y'] + grad_x = np.zeros(x.shape) + for i in range(x.shape[0]): + grad_x[i] = np.matmul(grad_y[i], -np.matmul(np.matrix(y[i]).T, np.matrix(y[i])) + np.diag(np.array(y[i]))) + return grad_x + +class NumpyLoss: + + def __init__(self): + self.target = None + + def get_loss(self, pred, target): + self.target = target + return (-pred * target).sum(axis=1).mean() + + def backward(self): + return -self.target / self.target.shape[0] + +class NumpyModel: + def __init__(self, learning_rate = 0.1, update_type = None, iter_times = 1407): + self.W1 = np.random.normal(size=(28 * 28, 256)) + self.W2 = np.random.normal(size=(256, 64)) + self.W3 = np.random.normal(size=(64, 10)) + + + # 以下算子会在 forward 和 backward 中使用 + self.matmul_1 = Matmul() + self.relu_1 = Relu() + self.matmul_2 = Matmul() + self.relu_2 = Relu() + self.matmul_3 = Matmul() + self.softmax = Softmax() + self.log = Log() + + # 以下变量需要在 backward 中更新。 softmax_grad, log_grad 等为算子反向传播的梯度( loss 关于算子输入的偏导) + self.x1_grad, self.W1_grad = None, None + self.relu_1_grad = None + self.x2_grad, self.W2_grad = None, None + self.relu_2_grad = None + self.x3_grad, self.W3_grad = None, None + self.softmax_grad = None + self.log_grad = None + + # 以下变量指定了梯度回传所用的优化方法,并完成了有关的初始化 + self.update_type = update_type + self.learning_rate = learning_rate + self.iter_times = iter_times + if update_type == 'Momentum': + self.rho = 0.9 + self.W1_delta = np.zeros(self.W1.shape) + self.W2_delta = np.zeros(self.W2.shape) + self.W3_delta = np.zeros(self.W3.shape) + + elif update_type == 'Nesterov': + self.rho = 0.9 + self.W1_delta = np.zeros(self.W1.shape) + self.W2_delta = np.zeros(self.W2.shape) + self.W3_delta = np.zeros(self.W3.shape) + + elif update_type == 'Adam': + self.epsilon = 1e-7 + self.beta1, self.beta2 = 0.9, 0.99 + self.M1 = np.zeros(self.W1.shape) + self.M2 = np.zeros(self.W2.shape) + self.M3 = np.zeros(self.W3.shape) + self.G1, 
self.G2, self.G3 = .0, .0, .0 + + elif update_type == 'Inverse Time Decay': + self.beta = 0.1 + self.t = 0 + + elif update_type == 'Exponential Decay': + self.beta = 0.96 + self.t = 0 + + elif update_type == 'Natural Exponential Decay': + self.beta = 0.04 + self.t = 0 + + elif update_type == 'Cosine Decay': + self.t = 0 + + elif update_type == 'AdaGrad': + self.epsilon = 1e-7 + + elif update_type == 'RMSprop': + self.beta = 0.9 + self.epsilon = 1e-7 + self.G1, self.G2, self.G3 = .0, .0, .0 + + elif update_type == 'AdaDelta': + self.beta = 0.9 + self.epsilon = 1e-7 + self.W1_delta = np.zeros(self.W1.shape) + self.W2_delta = np.zeros(self.W2.shape) + self.W3_delta = np.zeros(self.W3.shape) + self.X1, self.X2, self.X3 = .0, .0, .0 + self.G1, self.G2, self.G3 = .0, .0, .0 + + def forward(self, x): + x = x.reshape(-1, 28 * 28) + + # code6 + if self.update_type == 'Nesterov': + # 在前向传播之前进行 Nesterov 算法的第一阶段 + self.W1_delta = self.rho * self.W1_delta + self.W2_delta = self.rho * self.W2_delta + self.W3_delta = self.rho * self.W3_delta + self.W1 += self.W1_delta + self.W2 += self.W2_delta + self.W3 += self.W3_delta + + x = self.matmul_1.forward(x, self.W1) + x = self.relu_1.forward(x) + x = self.matmul_2.forward(x, self.W2) + x = self.relu_2.forward(x) + x = self.matmul_3.forward(x, self.W3) + x = self.softmax.forward(x) + x = self.log.forward(x) + + return x + + def backward(self, y): + # for size in y.shape: + # y /= size + + + # code7 + #self.log_grad = self.log.backward(-y) + self.log_grad = self.log.backward(y) + self.softmax_grad = self.softmax.backward(self.log_grad) + self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad) + self.relu_2_grad = self.relu_2.backward(self.x3_grad) + self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad) + self.relu_1_grad = self.relu_1.backward(self.x2_grad) + self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad) + + + + def optimize(self, learning_rate): + if not self.update_type: + self.W1 -= learning_rate * self.W1_grad + self.W2 -= learning_rate * self.W2_grad + self.W3 -= learning_rate * self.W3_grad + + elif self.update_type == 'Momentum': + self.W1_delta = self.rho * self.W1_delta - learning_rate * self.W1_grad + self.W2_delta = self.rho * self.W2_delta - learning_rate * self.W2_grad + self.W3_delta = self.rho * self.W3_delta - learning_rate * self.W3_grad + self.W1 += self.W1_delta + self.W2 += self.W2_delta + self.W3 += self.W3_delta + + elif self.update_type == 'Nesterov': + # 在参数更新时进行 Nesterov 第二阶段 + self.W1_delta -= learning_rate * self.W1_grad + self.W2_delta -= learning_rate * self.W2_grad + self.W3_delta -= learning_rate * self.W3_grad + self.W1 -= learning_rate * self.W1_grad + self.W2 -= learning_rate * self.W2_grad + self.W3 -= learning_rate * self.W3_grad + + elif self.update_type == 'Adam': + self.M1 = self.beta1 * self.M1 + (1 - self.beta1) * self.W1_grad + self.G1 = self.beta2 * self.G1 + (1 - self.beta2) * (self.W1_grad * self.W1_grad).sum() + _M1 = self.M1 / (1 - self.beta1) + _G1 = self.G1 / (1 - self.beta2) + self.W1 -= learning_rate / np.sqrt(_G1 + self.epsilon) * _M1 + + self.M2 = self.beta1 * self.M2 + (1 - self.beta1) * self.W2_grad + self.G2 = self.beta2 * self.G2 + (1 - self.beta2) * (self.W2_grad * self.W2_grad).sum() + _M2 = self.M2 / (1 - self.beta1) + _G2 = self.G2 / (1 - self.beta2) + self.W2 -= learning_rate / np.sqrt(_G2 + self.epsilon) * _M2 + + self.M3 = self.beta1 * self.M3 + (1 - self.beta1) * self.W3_grad + self.G3 = self.beta2 * self.G3 + (1 - self.beta2) * (self.W3_grad 
* self.W3_grad).sum() + _M3 = self.M3 / (1 - self.beta1) + _G3 = self.G3 / (1 - self.beta2) + self.W3 -= learning_rate / np.sqrt(_G3 + self.epsilon) * _M3 + + elif self.update_type == 'Inverse Time Decay': + learning_rate = self.learning_rate / (1.0 + self.beta * self.t) + self.W1 -= learning_rate * self.W1_grad + self.W2 -= learning_rate * self.W2_grad + self.W3 -= learning_rate * self.W3_grad + self.t += 1 + + elif self.update_type == 'Exponential Decay': + learning_rate = self.learning_rate * pow(self.beta, self.t) + self.W1 -= learning_rate * self.W1_grad + self.W2 -= learning_rate * self.W2_grad + self.W3 -= learning_rate * self.W3_grad + self.t += 1 + + elif self.update_type == 'Natural Exponential Decay': + learning_rate = self.learning_rate * np.exp(-self.beta * self.t) + self.W1 -= learning_rate * self.W1_grad + self.W2 -= learning_rate * self.W2_grad + self.W3 -= learning_rate * self.W3_grad + self.t += 1 + + elif self.update_type == 'Cosine Decay': + learning_rate = self.learning_rate / 2.0 * (1.0 + np.cos(self.t * np.pi / self.iter_times)) + self.W1 -= learning_rate * self.W1_grad + self.W2 -= learning_rate * self.W2_grad + self.W3 -= learning_rate * self.W3_grad + self.t += 1 + + elif self.update_type == 'AdaGrad': + G = (self.W1_grad * self.W1_grad).sum() + self.W1 -= learning_rate / np.sqrt(G + self.epsilon) * self.W1_grad + G = (self.W2_grad * self.W2_grad).sum() + self.W2 -= learning_rate / np.sqrt(G + self.epsilon) * self.W2_grad + G = (self.W3_grad * self.W3_grad).sum() + self.W3 -= learning_rate / np.sqrt(G + self.epsilon) * self.W3_grad + + elif self.update_type == 'RMSprop': + self.G1 = self.beta * self.G1 + (1 - self.beta) * (self.W1_grad * self.W1_grad).sum() + self.W1 -= learning_rate / np.sqrt(self.G1 + self.epsilon) * self.W1_grad + self.G2 = self.beta * self.G2 + (1 - self.beta) * (self.W2_grad * self.W2_grad).sum() + self.W2 -= learning_rate / np.sqrt(self.G2 + self.epsilon) * self.W2_grad + self.G3 = self.beta * self.G3 + (1 - self.beta) * (self.W3_grad * self.W3_grad).sum() + self.W3 -= learning_rate / np.sqrt(self.G3 + self.epsilon) * self.W3_grad + + elif self.update_type == 'AdaDelta': + self.X1 = self.beta * self.X1 + (1 - self.beta) * (self.W1_delta * self.W1_delta).sum() + self.G1 = self.beta * self.G1 + (1 - self.beta) * (self.W1_grad * self.W1_grad).sum() + self.W1_delta = -np.sqrt(self.X1 + self.epsilon) / np.sqrt(self.G1 + self.epsilon) * self.W1_grad + self.W1 += self.W1_delta + + self.X2 = self.beta * self.X2 + (1 - self.beta) * (self.W2_delta * self.W2_delta).sum() + self.G2 = self.beta * self.G2 + (1 - self.beta) * (self.W2_grad * self.W2_grad).sum() + self.W2_delta = -np.sqrt(self.X2 + self.epsilon) / np.sqrt(self.G2 + self.epsilon) * self.W2_grad + self.W2 += self.W2_delta + + self.X3 = self.beta * self.X3 + (1 - self.beta) * (self.W3_delta * self.W3_delta).sum() + self.G3 = self.beta * self.G3 + (1 - self.beta) * (self.W3_grad * self.W3_grad).sum() + self.W3_delta = -np.sqrt(self.X3 + self.epsilon) / np.sqrt(self.G3 + self.epsilon) * self.W3_grad + self.W3 += self.W3_delta + diff --git a/assignment-2/submission/19307130062/numpy_mnist.py b/assignment-2/submission/19307130062/numpy_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..4e2bf7a7f624444ede1e6b317d46b0052f1ac675 --- /dev/null +++ b/assignment-2/submission/19307130062/numpy_mnist.py @@ -0,0 +1,143 @@ +import numpy as np +from numpy_fnn import NumpyModel, NumpyLoss +from utils import download_mnist, batch, mini_batch, get_torch_initialization, plot_curve, 
one_hot + +def mini_batch(dataset, batch_size = 128, numpy = False): + if batch_size <= 0 or not isinstance(batch_size, int): + return None + # 1. 判断传入的 batch_size 是否合法,需要为正整数,不合法返回空 + + data, label = batch(dataset)[0] + # 2. 用 batch 方法将 torchvision 下的 MNIST 数据集转换为 numpy 的 array + + datanum = len(data) + idx = np.arange(datanum) + np.random.shuffle(idx) + data, label = data[idx], label[idx] + # 3. 对 data 和 label 进行 random shuffle,具体来说,可以先对一个指示下标的数组做 random shuffle,然后用这个下标数组配合 slice 机制对 data 和 label 进行对应的 random shuffle,从而防止 data 和 label 错误匹配 + + batchnum = (datanum - 1) // batch_size + 1 # datanum 对 batch_size 下取整 + batches = [] + # 4. 计算 batch 数量,初始化 batches 列表 + + for i in range(batchnum): + batches.append((data[i * batch_size: min(datanum, (i + 1) * batch_size)], label[i * batch_size: min(datanum, (i + 1) * batch_size)])) + # 5. 通过 slice 机制选出第 i 个 batch 对应的 data 和 label 子集,放入 batches 列表中 + return batches + +def numpy_run(): + train_dataset, test_dataset = download_mnist() + + model = NumpyModel() + numpy_loss = NumpyLoss() + model.W1, model.W2, model.W3 = get_torch_initialization() + + train_loss = [] + + epoch_number = 3 + learning_rate = 0.1 + + for epoch in range(epoch_number): + for x, y in mini_batch(train_dataset): + y = one_hot(y) + + # y_pred = model.forward(x.numpy()) # minibatch from pytorch + + y_pred = model.forward(x) # now x is a numpy array, so x.numpy() is not needed + + loss = numpy_loss.get_loss(y_pred, y) + + model.backward(numpy_loss.backward()) + model.optimize(learning_rate) + + train_loss.append(loss.item()) + + x, y = batch(test_dataset)[0] + accuracy = np.mean((model.forward(x).argmax(axis=1) == y)) + print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy)) + + plot_curve(train_loss) + + + +def my_numpy_run(learning_rate = 0.01, epoch_number = 3, update_type = None): + train_dataset, test_dataset = download_mnist() + + model = NumpyModel(learning_rate, update_type, iter_times = 1407) + numpy_loss = NumpyLoss() + model.W1, model.W2, model.W3 = get_torch_initialization() + + train_loss = [] + + + for epoch in range(epoch_number): + for x, y in mini_batch(train_dataset): + y = one_hot(y) + + # y_pred = model.forward(x.numpy()) # minibatch from pytorch + + y_pred = model.forward(x) # now x is a numpy array, so x.numpy() is not needed + + loss = numpy_loss.get_loss(y_pred, y) + + model.backward(numpy_loss.backward()) + model.optimize(learning_rate) + + train_loss.append(loss.item()) + + x, y = batch(test_dataset)[0] + accuracy = np.mean((model.forward(x).argmax(axis=1) == y)) + print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy)) + + return train_loss + # plot_curve(train_loss) + +def multi_test(): + from matplotlib import pyplot as plt + cases = [ None, + 'Momentum', + 'Nesterov', + 'Adam', + 'Inverse Time Decay', + 'Exponential Decay', + 'Natural Exponential Decay', + 'Cosine Decay', + 'AdaGrad', + 'RMSprop', + 'AdaDelta', + ] + + colors = ['#1f77b4', + '#ff7f0e', + '#2ca02c', + '#d62728', + '#9467bd', + '#8c564b', + '#e377c2', + '#7f7f7f', + '#bcbd22', + '#17becf', + '#1a55FF'] + + # Configure rcParams axes.prop_cycle to simultaneously cycle cases and colors. 
+ # mpl.rcParams['axes.prop_cycle'] = cycler(markevery=cases, color=colors) + # Set the plot curve with markers and a title + plt.rcParams['figure.figsize'] = (10.0, 4.0) # 设置figure_size尺寸 + fig = plt.figure() + ax = fig.add_axes([0.1, 0.1, 0.6, 0.75]) + plt.xlabel('step') + plt.ylabel('loss value') + for i in range(len(cases)): + print('Test ' + str(cases[i]) + ' :') + data = my_numpy_run(update_type = cases[i]) + print('-------------\n') + ax.plot(range(len(data)), data, linewidth = 0.5, label = str(cases[i])) + ax.legend(bbox_to_anchor = (1.05, 1), loc = 'upper left', borderaxespad = 0.) + + plt.savefig("5.4.png", format = 'png', dpi = 1000) + + +if __name__ == "__main__": + numpy_run() + # my_numpy_run(learning_rate = 0.05, update_type = 'RMSprop') + # multi_test()