diff --git a/assignment-1/submission/18307130003/source.py b/assignment-1/submission/18307130003/source.py
index 3676e4eed7bb10282f277697d818b8ff2103879e..e6bc70f8a0c3faff84ec5b52c083e95676140fab 100644
--- a/assignment-1/submission/18307130003/source.py
+++ b/assignment-1/submission/18307130003/source.py
@@ -149,6 +149,8 @@ class KNN:
         print(f'best k = {self.k}\n')
+        print(f'best k = {self.k}\n')
+
     def predict(self, test_data: np.ndarray) -> np.ndarray:
         '''
         Predict the label of a point using our model.
diff --git a/assignment-2/submission/18307130003/README.md b/assignment-2/submission/18307130003/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..13d20470331535b0abb6b7355a45db22cf11fb3f
--- /dev/null
+++ b/assignment-2/submission/18307130003/README.md
@@ -0,0 +1,622 @@

# Experiment Report

This assignment completes Topic 1: a feed-forward neural network (FNN) implemented with NumPy and trained on the MNIST dataset.

## Contents

- [Experiment Report](#experiment-report)
  - [Contents](#contents)
  - [Backpropagation of the FNN Operators](#backpropagation-of-the-fnn-operators)
    - [1. Matmul](#1-matmul)
      - [1.1 Formula Derivation](#11-formula-derivation)
      - [1.2 Implementation](#12-implementation)
    - [2. Relu](#2-relu)
      - [2.1 Formula Derivation](#21-formula-derivation)
      - [2.2 Implementation](#22-implementation)
    - [3. Log](#3-log)
      - [3.1 Formula Derivation](#31-formula-derivation)
      - [3.2 Implementation](#32-implementation)
    - [4. Softmax](#4-softmax)
      - [4.1 Formula Derivation](#41-formula-derivation)
      - [4.2 Implementation](#42-implementation)
  - [Implementation of `mini_batch`](#implementation-of-mini_batch)
  - [Experiments and Results](#experiments-and-results)
    - [1. Experiment 1](#1-experiment-1)
      - [1.1 Parameters](#11-parameters)
      - [1.2 Test Accuracy](#12-test-accuracy)
      - [1.3 Loss Values](#13-loss-values)
    - [2. Experiment 2](#2-experiment-2)
      - [2.1 Parameters](#21-parameters)
      - [2.2 Test Accuracy](#22-test-accuracy)
      - [2.3 Loss Values](#23-loss-values)
    - [3. Experiment 3](#3-experiment-3)
      - [3.1 Parameters](#31-parameters)
      - [3.2 Test Accuracy](#32-test-accuracy)
      - [3.3 Loss Values](#33-loss-values)
    - [4. Experiment 4](#4-experiment-4)
      - [4.1 Parameters](#41-parameters)
      - [4.2 Test Accuracy](#42-test-accuracy)
      - [4.3 Loss Values](#43-loss-values)

## Backpropagation of the FNN Operators

### 1. Matmul

#### 1.1 Formula Derivation

Given an $n\times d$ matrix $X$ and a $d\times d'$ matrix $W$, the forward pass of the `Matmul` operator is

![Equation 1.1](./img/equation_1.1.svg)

which outputs an $n\times d'$ matrix $Y$.

For the backward pass of the gradients, we have

![Equation 1.2](./img/equation_1.2.svg)

We use vectorization to evaluate the matrix derivatives:

![Equation 1.3](./img/equation_1.3.svg)

Here $\mathit{vec}(X_{m\times n})$ denotes the vector

![Equation 1.3.1](./img/equation_1.3.1.svg)

$\otimes$ denotes the Kronecker product, and subscripts give the dimensions of matrices and vectors.

Therefore,

![Equation 1.4](./img/equation_1.4.svg)

By a derivation analogous to $(1.3)$, we likewise obtain

![Equation 1.5](./img/equation_1.5.svg)

#### 1.2 Implementation

```python {.line-numbers}
# numpy_fnn.py

class Matmul(NumpyOp):
    '''
    Matrix multiplication unit.
    '''

    def forward(self, x: np.ndarray, w: np.ndarray) -> np.ndarray:
        '''
        Args:
            x: shape(N, d)
            w: shape(d, d')

        Returns:
            shape(N, d')
        '''

        self.memory['x'] = x
        self.memory['w'] = w
        return np.matmul(x, w)

    def backward(self, grad_y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        '''
        Args:
            grad_y: shape(N, d')

        Returns:
            grad_x: shape(N, d)
            grad_w: shape(d, d')
        '''

        x: np.ndarray = self.memory['x']
        w: np.ndarray = self.memory['w']
        grad_x: np.ndarray = np.matmul(grad_y, w.T)
        grad_w: np.ndarray = np.matmul(x.T, grad_y)
        return grad_x, grad_w
```
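As a quick sanity check of the derivation above, the analytic gradients returned by `Matmul.backward` can be compared against central finite differences. The following is a minimal sketch for illustration only; the helper `matmul_gradient_check` and its shapes are ours and not part of the submitted code.

```python {.line-numbers}
# Illustrative check only -- not part of the submission.
import numpy as np

from numpy_fnn import Matmul


def matmul_gradient_check(n=5, d=6, d_out=4, eps=1e-6):
    x = np.random.normal(size=(n, d))
    w = np.random.normal(size=(d, d_out))
    grad_y = np.random.normal(size=(n, d_out))

    op = Matmul()
    op.forward(x, w)
    grad_x, grad_w = op.backward(grad_y)  # analytic gradients from the backward pass

    # Numerical gradient of z = sum(grad_y * (x @ w)) with respect to one entry of w,
    # estimated by central differences.
    w_pos, w_neg = w.copy(), w.copy()
    w_pos[0, 0] += eps
    w_neg[0, 0] -= eps
    numeric = ((grad_y * (x @ w_pos)).sum() - (grad_y * (x @ w_neg)).sum()) / (2 * eps)

    # The difference should be tiny (on the order of 1e-9 or smaller).
    print(abs(grad_w[0, 0] - numeric))


if __name__ == '__main__':
    matmul_gradient_check()
```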
### 2. Relu

#### 2.1 Formula Derivation

Given an $n\times d$ matrix $X$, for each element $X_{ij}$ of $X$ the forward pass of the `Relu` operator is

![Equation 2.1](./img/equation_2.1.svg)

which outputs an $n\times d$ matrix $Y$.

For the backward pass of the gradients, we have

![Equation 2.2](./img/equation_2.2.svg)

Here $\odot$ denotes the Hadamard (element-wise) product.

For each element ${Y'}\_{ij}$ of $\frac{\partial Y}{\partial X}\_{n\times d}$, from $(2.1)$ we have

![Equation 2.3](./img/equation_2.3.svg)

Therefore, for each element ${Z'}\_{ij}$ of $\frac{\partial z}{\partial X}\_{n\times d}$, writing ${Z\_Y}' = \frac{\partial z}{\partial Y}\_{n\times d}$, from $(2.2)$ we have

![Equation 2.4](./img/equation_2.4.svg)

#### 2.2 Implementation

```python {.line-numbers}
# numpy_fnn.py

class Relu(NumpyOp):
    '''
    Rectified Linear Unit.
    '''

    def forward(self, x: np.ndarray) -> np.ndarray:
        '''
        Args:
            x: shape(N, d)

        Returns:
            shape(N, d)
        '''

        self.memory['x'] = x
        return np.where(x > 0, x, 0)

    def backward(self, grad_y: np.ndarray) -> np.ndarray:
        '''
        Args:
            grad_y: shape(N, d)

        Returns:
            shape(N, d)
        '''

        x: np.ndarray = self.memory['x']
        return np.where(x > 0, grad_y, 0)
```

### 3. Log

#### 3.1 Formula Derivation

Given an $n\times d$ matrix $X$, for each element $X_{ij}$ of $X$ the forward pass of the `Log` operator is

![Equation 3.1](./img/equation_3.1.svg)

which outputs an $n\times d$ matrix $Y$.

For the backward pass of the gradients, we have

![Equation 3.2](./img/equation_3.2.svg)

For each element ${Y'}\_{ij}$ of $\frac{\partial Y}{\partial X}\_{n\times d}$, from $(3.1)$ we have

![Equation 3.3](./img/equation_3.3.svg)

Therefore, for each element ${Z'}\_{ij}$ of $\frac{\partial z}{\partial X}\_{n\times d}$, writing ${Z\_Y}' = \frac{\partial z}{\partial Y}\_{n\times d}$, from $(3.2)$ we have

![Equation 3.4](./img/equation_3.4.svg)

#### 3.2 Implementation

To prevent the overflow caused by $\log{X_{ij}} = -\infty$ when $X_{ij} = 0$, we add a small correction $\epsilon = 10^{-12}$ to $X_{ij}$.

```python {.line-numbers}
# numpy_fnn.py

class Log(NumpyOp):
    '''
    Natural logarithm unit.
    '''

    def forward(self, x: np.ndarray) -> np.ndarray:
        '''
        Args:
            x: shape(N, d)

        Returns:
            shape(N, d)
        '''

        self.memory['x'] = x
        return np.log(x + self.epsilon)

    def backward(self, grad_y: np.ndarray) -> np.ndarray:
        '''
        Args:
            grad_y: shape(N, d)

        Returns:
            shape(N, d)
        '''

        x: np.ndarray = self.memory['x']
        return grad_y / x
```

### 4. Softmax

#### 4.1 Formula Derivation

Given an $n\times d$ matrix $X$, for each element $X_{ij}$ of $X$ the forward pass of the `Softmax` operator is

![Equation 4.1](./img/equation_4.1.svg)

which outputs an $n\times d$ matrix $Y$.

For the backward pass of the gradients, we have

![Equation 4.2](./img/equation_4.2.svg)

where each element ${Z'}\_{ij}$ of $\frac{\partial z}{\partial X}\_{n\times d}$ satisfies

![Equation 4.3](./img/equation_4.3.svg)

#### 4.2 Implementation

```python {.line-numbers}
# numpy_fnn.py

class Softmax(NumpyOp):
    '''
    Softmax over last dimension.
    '''

    def forward(self, x: np.ndarray) -> np.ndarray:
        '''
        Args:
            x: shape(N, d)

        Returns:
            shape(N, d)
        '''

        y: np.ndarray = np.exp(x) / np.exp(x).sum(axis=1)[:, None]
        self.memory['y'] = y
        return y

    def backward(self, grad_y: np.ndarray) -> np.ndarray:
        '''
        Args:
            grad_y: shape(N, d)

        Returns:
            shape(N, d)
        '''

        y: np.ndarray = self.memory['y']
        return y * (grad_y - (grad_y * y).sum(axis=1)[:, None])
```
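The vectorized backward pass above can also be verified numerically: for $z = \sum_{ij} G_{ij} Y_{ij}$, the analytic $\frac{\partial z}{\partial X}$ should match a central finite-difference estimate. The sketch below is for illustration only; the helper `softmax_gradient_check` is ours and not part of the submitted code.

```python {.line-numbers}
# Illustrative check only -- not part of the submission.
import numpy as np

from numpy_fnn import Softmax


def softmax_gradient_check(n=4, d=5, eps=1e-6):
    x = np.random.normal(size=(n, d))
    grad_y = np.random.normal(size=(n, d))

    op = Softmax()
    op.forward(x)
    analytic = op.backward(grad_y)  # analytic gradient from the backward pass

    numeric = np.zeros_like(x)
    for i in range(n):
        for j in range(d):
            x_pos, x_neg = x.copy(), x.copy()
            x_pos[i, j] += eps
            x_neg[i, j] -= eps
            # z = sum(grad_y * softmax(x)); estimate dz/dx[i, j] by central differences
            z_pos = (grad_y * Softmax().forward(x_pos)).sum()
            z_neg = (grad_y * Softmax().forward(x_neg)).sum()
            numeric[i, j] = (z_pos - z_neg) / (2 * eps)

    # The maximum absolute difference should be tiny (around 1e-9).
    print(np.abs(analytic - numeric).max())


if __name__ == '__main__':
    softmax_gradient_check()
```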
## Implementation of `mini_batch`

We reimplement the `mini_batch` function with NumPy; it is used for the training below.

```python {.line-numbers}
# numpy_mnist.py

def mini_batch(dataset: List[Tuple[Any, int]], batch_size=128) -> np.ndarray:
    '''
    Split the data and labels from the given dataset into batches.

    Args:
        dataset: the given dataset
        batch_size: the size of retrieved data

    Returns:
        Batches of [data, labels] pair.
    '''

    data: np.ndarray = np.array([np.array(pair[0]) for pair in dataset])
    labels: np.ndarray = np.array([pair[1] for pair in dataset])

    # Shuffle the dataset
    size: int = len(dataset)
    indices: np.ndarray = np.arange(size)
    np.random.shuffle(indices)

    batches: List[Tuple[np.ndarray, np.ndarray]] = []
    for i in range(0, size, batch_size):
        chunk: np.ndarray = indices[i:i+batch_size]
        batches.append((data[chunk], labels[chunk]))
    return batches
```

## Experiments and Results

Run the following command to start training.

```bash
python ./numpy_mnist.py
```

### 1. Experiment 1

#### 1.1 Parameters

```python {.line-numbers}
epoch_number = 3
batch_size = 128
learning_rate = 0.1
```

#### 1.2 Test Accuracy

```text
[0] Accuracy: 0.9485
[1] Accuracy: 0.9647
[2] Accuracy: 0.9715
```

#### 1.3 Loss Values

![Loss value 1](./img/loss_value_1.png)

### 2. Experiment 2

This time we increase the number of training epochs (`epoch_number`) and observe how the test accuracy changes.

#### 2.1 Parameters

```python {.line-numbers}
epoch_number = 10
batch_size = 128
learning_rate = 0.1
```

#### 2.2 Test Accuracy

```text
[0] Accuracy: 0.9496
[1] Accuracy: 0.9600
[2] Accuracy: 0.9675
[3] Accuracy: 0.9761
[4] Accuracy: 0.9755
[5] Accuracy: 0.9775
[6] Accuracy: 0.9791
[7] Accuracy: 0.9795
[8] Accuracy: 0.9781
[9] Accuracy: 0.9810
```

With more training epochs the accuracy keeps improving, but after reaching a certain level it starts to fluctuate and no longer rises noticeably.

#### 2.3 Loss Values

![Loss value 2](./img/loss_value_2.png)

With more training epochs the loss decreases further and its fluctuations become smaller.

### 3. Experiment 3

This time we increase the batch size (`batch_size`) and observe how the test accuracy changes.

#### 3.1 Parameters

```python {.line-numbers}
epoch_number = 10
batch_size = 256
learning_rate = 0.1
```

#### 3.2 Test Accuracy

```text
[0] Accuracy: 0.9253
[1] Accuracy: 0.9477
[2] Accuracy: 0.9522
[3] Accuracy: 0.9651
[4] Accuracy: 0.9680
[5] Accuracy: 0.9731
[6] Accuracy: 0.9755
[7] Accuracy: 0.9766
[8] Accuracy: 0.9766
[9] Accuracy: 0.9790
```

With a larger batch size the accuracy improves more slowly, but the final accuracy shows no obvious difference.

#### 3.3 Loss Values

![Loss value 3](./img/loss_value_3.png)

With a larger batch size the overall fluctuations of the loss become smaller.

### 4. Experiment 4

This time we raise the learning rate (`learning_rate`) and observe how the test accuracy changes.

#### 4.1 Parameters

```python {.line-numbers}
epoch_number = 10
batch_size = 256
learning_rate = 0.5
```

#### 4.2 Test Accuracy

```text
[0] Accuracy: 0.9505
[1] Accuracy: 0.9675
[2] Accuracy: 0.9706
[3] Accuracy: 0.9471
[4] Accuracy: 0.9737
[5] Accuracy: 0.8980
[6] Accuracy: 0.9757
[7] Accuracy: 0.9573
[8] Accuracy: 0.9770
[9] Accuracy: 0.9792
```

With a higher learning rate training proceeds faster, but the accuracy can also fluctuate more during training.

#### 4.3 Loss Values

![Loss value 4](./img/loss_value_4.png)

With a higher learning rate the loss decreases faster, but its overall fluctuations can also be larger.
diff --git a/assignment-2/submission/18307130003/img/equation_1.1.svg b/assignment-2/submission/18307130003/img/equation_1.1.svg
new file mode 100644
index 0000000000000000000000000000000000000000..048e56f154f2f40bd0cdaf0702615807c2145ca0
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_1.1.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_1.2.svg b/assignment-2/submission/18307130003/img/equation_1.2.svg
new file mode 100644
index 0000000000000000000000000000000000000000..33892d4bcf22bcded97604ddc5cf354c18abb576
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_1.2.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_1.3.1.svg b/assignment-2/submission/18307130003/img/equation_1.3.1.svg
new file mode 100644
index 0000000000000000000000000000000000000000..005032ef40fc10a2c05b5a740e7dcde4a78a1c48
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_1.3.1.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_1.3.svg b/assignment-2/submission/18307130003/img/equation_1.3.svg
new file mode 100644
index 0000000000000000000000000000000000000000..68d04c5e681902537b85d4a254b8bf85524089e3
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_1.3.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_1.4.svg b/assignment-2/submission/18307130003/img/equation_1.4.svg
new file mode 100644
index 0000000000000000000000000000000000000000..d655eaae87b6dff86ca1117861409f13019470f2
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_1.4.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_1.5.svg
b/assignment-2/submission/18307130003/img/equation_1.5.svg new file mode 100644 index 0000000000000000000000000000000000000000..54e7cb1aa6e7776fbe1ae1fd3106fb99f2bc1352 --- /dev/null +++ b/assignment-2/submission/18307130003/img/equation_1.5.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/assignment-2/submission/18307130003/img/equation_2.1.svg b/assignment-2/submission/18307130003/img/equation_2.1.svg new file mode 100644 index 0000000000000000000000000000000000000000..eb21ebb6af0ec4fa6dbdaf6e9a01f907cc863aab --- /dev/null +++ b/assignment-2/submission/18307130003/img/equation_2.1.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/assignment-2/submission/18307130003/img/equation_2.2.svg b/assignment-2/submission/18307130003/img/equation_2.2.svg new file mode 100644 index 0000000000000000000000000000000000000000..7edf73de46bde7973651f6e6fb85c7c5ff6eb233 --- /dev/null +++ b/assignment-2/submission/18307130003/img/equation_2.2.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/assignment-2/submission/18307130003/img/equation_2.3.svg b/assignment-2/submission/18307130003/img/equation_2.3.svg new file mode 100644 index 0000000000000000000000000000000000000000..bf68e43b7fc39081e5efec8cb6a7ac267af76b12 --- /dev/null +++ b/assignment-2/submission/18307130003/img/equation_2.3.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/assignment-2/submission/18307130003/img/equation_2.4.svg b/assignment-2/submission/18307130003/img/equation_2.4.svg new file mode 100644 index 0000000000000000000000000000000000000000..d6433c573aa219e7e1aae3d34c4e72243cd974ac --- /dev/null +++ b/assignment-2/submission/18307130003/img/equation_2.4.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/assignment-2/submission/18307130003/img/equation_3.1.svg b/assignment-2/submission/18307130003/img/equation_3.1.svg new file mode 100644 index 0000000000000000000000000000000000000000..391e1937a4e4b7acab50955142dd3625b07599b5 --- /dev/null +++ b/assignment-2/submission/18307130003/img/equation_3.1.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/assignment-2/submission/18307130003/img/equation_3.2.svg b/assignment-2/submission/18307130003/img/equation_3.2.svg new file mode 100644 index 0000000000000000000000000000000000000000..3d623c23089d0e03c44240fcf4432ff36157c86a --- /dev/null +++ b/assignment-2/submission/18307130003/img/equation_3.2.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/assignment-2/submission/18307130003/img/equation_3.3.svg b/assignment-2/submission/18307130003/img/equation_3.3.svg new file mode 100644 index 0000000000000000000000000000000000000000..6f5a88351d0077541fca59541f6002cfe029f90d --- /dev/null +++ b/assignment-2/submission/18307130003/img/equation_3.3.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/assignment-2/submission/18307130003/img/equation_3.4.svg b/assignment-2/submission/18307130003/img/equation_3.4.svg new file mode 100644 index 0000000000000000000000000000000000000000..27682eda729e7c7027f1e7f8a1e946b31916ff64 --- /dev/null +++ b/assignment-2/submission/18307130003/img/equation_3.4.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/assignment-2/submission/18307130003/img/equation_4.1.svg b/assignment-2/submission/18307130003/img/equation_4.1.svg new file mode 100644 index 0000000000000000000000000000000000000000..a4a904174c49236781bed187d439cb9193e0f4d3 --- /dev/null +++ b/assignment-2/submission/18307130003/img/equation_4.1.svg @@ -0,0 +1,2 @@ + + \ No newline 
at end of file diff --git a/assignment-2/submission/18307130003/img/equation_4.2.svg b/assignment-2/submission/18307130003/img/equation_4.2.svg new file mode 100644 index 0000000000000000000000000000000000000000..2256e00b04832055cf7d6c293df80d552f652425 --- /dev/null +++ b/assignment-2/submission/18307130003/img/equation_4.2.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/assignment-2/submission/18307130003/img/equation_4.3.svg b/assignment-2/submission/18307130003/img/equation_4.3.svg new file mode 100644 index 0000000000000000000000000000000000000000..2136ae53dcd7c04ffefcbd1277a7d90fd6d21297 --- /dev/null +++ b/assignment-2/submission/18307130003/img/equation_4.3.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/assignment-2/submission/18307130003/img/loss_value_1.png b/assignment-2/submission/18307130003/img/loss_value_1.png new file mode 100644 index 0000000000000000000000000000000000000000..ac76f374937aadf8ea57663647c3fbf8c8efc8fc Binary files /dev/null and b/assignment-2/submission/18307130003/img/loss_value_1.png differ diff --git a/assignment-2/submission/18307130003/img/loss_value_2.png b/assignment-2/submission/18307130003/img/loss_value_2.png new file mode 100644 index 0000000000000000000000000000000000000000..9b09bfa0b05cc98cc41408c4703ca8b4c87ddf56 Binary files /dev/null and b/assignment-2/submission/18307130003/img/loss_value_2.png differ diff --git a/assignment-2/submission/18307130003/img/loss_value_3.png b/assignment-2/submission/18307130003/img/loss_value_3.png new file mode 100644 index 0000000000000000000000000000000000000000..482bd3f6c977c7acb1a0c6dfbf6acc687d92c47d Binary files /dev/null and b/assignment-2/submission/18307130003/img/loss_value_3.png differ diff --git a/assignment-2/submission/18307130003/img/loss_value_4.png b/assignment-2/submission/18307130003/img/loss_value_4.png new file mode 100644 index 0000000000000000000000000000000000000000..268f540308806eb531e46cddbd64a4ebfaedcdc6 Binary files /dev/null and b/assignment-2/submission/18307130003/img/loss_value_4.png differ diff --git a/assignment-2/submission/18307130003/numpy_fnn.py b/assignment-2/submission/18307130003/numpy_fnn.py new file mode 100644 index 0000000000000000000000000000000000000000..4331007a5835d00d955e95447e4f20f03481c9ae --- /dev/null +++ b/assignment-2/submission/18307130003/numpy_fnn.py @@ -0,0 +1,213 @@ +from typing import Tuple +import numpy as np + + +class NumpyOp: + ''' + The base class for Numpy operations. + ''' + + def __init__(self): + self.memory = {} + self.epsilon = 1e-12 + + +class Matmul(NumpyOp): + ''' + Matrix multiplication unit. + ''' + + def forward(self, x: np.ndarray, w: np.ndarray) -> np.ndarray: + ''' + Args: + x: shape(N, d) + w: shape(d, d') + + Returns: + shape(N, d') + ''' + + self.memory['x'] = x + self.memory['w'] = w + return np.matmul(x, w) + + def backward(self, grad_y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + ''' + Args: + grad_y: shape(N, d') + + Returns: + grad_x: shape(N, d) + grad_w: shape(d, d') + ''' + + x: np.ndarray = self.memory['x'] + w: np.ndarray = self.memory['w'] + grad_x: np.ndarray = np.matmul(grad_y, w.T) + grad_w: np.ndarray = np.matmul(x.T, grad_y) + return grad_x, grad_w + + +class Relu(NumpyOp): + ''' + Rectified Linear Unit. 
+ ''' + + def forward(self, x: np.ndarray) -> np.ndarray: + ''' + Args: + x: shape(N, d) + + Returns: + shape(N, d) + ''' + + self.memory['x'] = x + return np.where(x > 0, x, 0) + + def backward(self, grad_y: np.ndarray) -> np.ndarray: + ''' + Args: + grad_y: shape(N, d) + + Returns: + shape(N, d) + ''' + + x: np.ndarray = self.memory['x'] + return np.where(x > 0, grad_y, 0) + + +class Log(NumpyOp): + ''' + Natural logarithm unit. + ''' + + def forward(self, x: np.ndarray) -> np.ndarray: + ''' + Args: + x: shape(N, d) + + Returns: + shape(N, d) + ''' + + self.memory['x'] = x + return np.log(x + self.epsilon) + + def backward(self, grad_y: np.ndarray) -> np.ndarray: + ''' + Args: + grad_y: shape(N, d) + + Returns: + shape(N, d) + ''' + + x: np.ndarray = self.memory['x'] + return grad_y / x + + +class Softmax(NumpyOp): + ''' + Softmax over last dimension. + ''' + + def forward(self, x: np.ndarray) -> np.ndarray: + ''' + Args: + x: shape(N, d) + + Returns: + shape(N, d) + ''' + + y: np.ndarray = np.exp(x) / np.exp(x).sum(axis=1)[:, None] + self.memory['y'] = y + return y + + def backward(self, grad_y: np.ndarray) -> np.ndarray: + ''' + Args: + grad_y: shape(N, d) + + Returns: + shape(N, d) + ''' + + y: np.ndarray = self.memory['y'] + return y * (grad_y - (grad_y * y).sum(axis=1)[:, None]) + + +class NumpyLoss: + ''' + Loss function. + ''' + + def __init__(self): + self.target: np.ndarray = None + + def get_loss(self, pred: np.ndarray, target: np.ndarray) -> float: + self.target = target + return (-pred * target).sum(axis=1).mean() + + def backward(self) -> np.ndarray: + return -self.target / self.target.shape[0] + + +class NumpyModel: + ''' + An FNN model implemented in NumPy. + ''' + + def __init__(self): + self.W1: np.ndarray = np.random.normal(size=(28 * 28, 256)) + self.W2: np.ndarray = np.random.normal(size=(256, 64)) + self.W3: np.ndarray = np.random.normal(size=(64, 10)) + + self.matmul_1 = Matmul() + self.relu_1 = Relu() + + self.matmul_2 = Matmul() + self.relu_2 = Relu() + + self.matmul_3 = Matmul() + self.softmax_3 = Softmax() + self.log_3 = Log() + + def forward(self, x: np.ndarray) -> np.ndarray: + ''' + Args: + x: shape(N, d) + + Returns: + shape(N, d) + ''' + + x = x.reshape(-1, 28 * 28) + x = self.relu_1.forward(self.matmul_1.forward(x, self.W1)) + x = self.relu_2.forward(self.matmul_2.forward(x, self.W2)) + x = self.softmax_3.forward(self.matmul_3.forward(x, self.W3)) + x = self.log_3.forward(x) + return x + + def backward(self, y: np.ndarray) -> None: + ''' + Args: + y: shape(N, d) + ''' + + self.log_grad = self.log_3.backward(y) + self.softmax_grad = self.softmax_3.backward(self.log_grad) + self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad) + + self.relu_2_grad = self.relu_2.backward(self.x3_grad) + self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad) + + self.relu_1_grad = self.relu_1.backward(self.x2_grad) + self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad) + + def optimize(self, learning_rate: float): + self.W1 -= learning_rate * self.W1_grad + self.W2 -= learning_rate * self.W2_grad + self.W3 -= learning_rate * self.W3_grad diff --git a/assignment-2/submission/18307130003/numpy_mnist.py b/assignment-2/submission/18307130003/numpy_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..f992979ba869396a0885b228240cbae6d83404fc --- /dev/null +++ b/assignment-2/submission/18307130003/numpy_mnist.py @@ -0,0 +1,79 @@ +from typing import Any, List, Tuple +import numpy as np +from numpy_fnn import 
NumpyModel, NumpyLoss +from utils import ( + download_mnist, + batch, + # mini_batch, + get_torch_initialization, + plot_curve, + one_hot, +) + + +def mini_batch(dataset: List[Tuple[Any, int]], batch_size=128) -> np.ndarray: + ''' + Split the data and labels from the given dataset into batches. + + Args: + dataset: the given dataset + batch_size: the size of retrieved data + + Returns: + Batches of [data, labels] pair. + ''' + + data: np.ndarray = np.array([np.array(pair[0]) for pair in dataset]) + labels: np.ndarray = np.array([pair[1] for pair in dataset]) + + # Shuffle the dataset + size: int = len(dataset) + indices: np.ndarray = np.arange(size) + np.random.shuffle(indices) + + batches: List[Tuple[np.ndarray, np.ndarray]] = [] + for i in range(0, size, batch_size): + chunk: np.ndarray = indices[i:i+batch_size] + batches.append((data[chunk], labels[chunk])) + return batches + + +def numpy_run(): + ''' + Train the FNN network on MNIST. + ''' + + train_dataset, test_dataset = download_mnist() + + model = NumpyModel() + numpy_loss = NumpyLoss() + model.W1, model.W2, model.W3 = get_torch_initialization() + + train_loss: List[float] = [] + + epoch_number = 10 + batch_size = 256 + learning_rate = 0.5 + + for epoch in range(epoch_number): + for x, y in mini_batch(train_dataset, batch_size=batch_size): + x: np.ndarray + y: np.ndarray = one_hot(y) + + y_pred = model.forward(x) + loss = numpy_loss.get_loss(y_pred, y) + + model.backward(numpy_loss.backward()) + model.optimize(learning_rate) + + train_loss.append(loss.item()) + + x, y = batch(test_dataset)[0] + accuracy: float = np.mean((model.forward(x).argmax(axis=1) == y)) + print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy)) + + plot_curve(train_loss) + + +if __name__ == "__main__": + numpy_run() diff --git a/assignment-2/submission/18307130003/tester_demo.py b/assignment-2/submission/18307130003/tester_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..68874dece3ca3717b19a722e8489764d089a7aa0 --- /dev/null +++ b/assignment-2/submission/18307130003/tester_demo.py @@ -0,0 +1,191 @@ +import numpy as np +import torch +from torch import ( + matmul as torch_matmul, + relu as torch_relu, + softmax as torch_softmax, + log as torch_log, +) + +from numpy_fnn import Matmul, Relu, Softmax, Log, NumpyModel, NumpyLoss +from torch_mnist import TorchModel +from utils import get_torch_initialization, one_hot + +err_epsilon = 1e-6 +err_p = 0.4 + + +def check_result(numpy_result, torch_result=None): + if isinstance(numpy_result, list) and torch_result is None: + flag = True + for (n, t) in numpy_result: + flag = flag and check_result(n, t) + return flag + T = (torch_result * torch.from_numpy(numpy_result) < 0).sum().item() + direction = T / torch_result.numel() < err_p + result = direction and ( + (torch.from_numpy(numpy_result) - torch_result).abs().mean() < err_epsilon + ).item() + print(result) + return result + + +def case_1(): + x = np.random.normal(size=[5, 6]) + W = np.random.normal(size=[6, 4]) + + numpy_matmul = Matmul() + numpy_out = numpy_matmul.forward(x, W) + numpy_x_grad, numpy_W_grad = numpy_matmul.backward(np.ones_like(numpy_out)) + + torch_x = torch.from_numpy(x).clone().requires_grad_() + torch_W = torch.from_numpy(W).clone().requires_grad_() + + torch_out = torch_matmul(torch_x, torch_W) + torch_out.sum().backward() + + return check_result([ + (numpy_out, torch_out), + (numpy_x_grad, torch_x.grad), + (numpy_W_grad, torch_W.grad), + ]) + + +def case_2(): + x = np.random.normal(size=[5, 6]) + + numpy_relu = Relu() + 
numpy_out = numpy_relu.forward(x) + numpy_x_grad = numpy_relu.backward(np.ones_like(numpy_out)) + + torch_x = torch.from_numpy(x).clone().requires_grad_() + + torch_out = torch_relu(torch_x) + torch_out.sum().backward() + + return check_result([ + (numpy_out, torch_out), + (numpy_x_grad, torch_x.grad), + ]) + + +def case_3(): + x = np.random.uniform(low=0.0, high=1.0, size=[3, 4]) + + numpy_log = Log() + numpy_out = numpy_log.forward(x) + numpy_x_grad = numpy_log.backward(np.ones_like(numpy_out)) + + torch_x = torch.from_numpy(x).clone().requires_grad_() + + torch_out = torch_log(torch_x) + torch_out.sum().backward() + + return check_result([ + (numpy_out, torch_out), + (numpy_x_grad, torch_x.grad), + ]) + + +def case_4(): + x = np.random.normal(size=[4, 5]) + + numpy_softmax = Softmax() + numpy_out = numpy_softmax.forward(x) + + torch_x = torch.from_numpy(x).clone().requires_grad_() + + torch_out = torch_softmax(torch_x, 1) + + return check_result(numpy_out, torch_out) + + +def case_5(): + x = np.random.normal(size=[20, 25]) + + numpy_softmax = Softmax() + numpy_out = numpy_softmax.forward(x) + numpy_x_grad = numpy_softmax.backward(np.ones_like(numpy_out)) + + torch_x = torch.from_numpy(x).clone().requires_grad_() + + torch_out = torch_softmax(torch_x, 1) + torch_out.sum().backward() + + return check_result([ + (numpy_out, torch_out), + (numpy_x_grad, torch_x.grad), + ]) + + +def test_model(): + try: + numpy_loss = NumpyLoss() + numpy_model = NumpyModel() + torch_model = TorchModel() + torch_model.W1.data, torch_model.W2.data, torch_model.W3.data = ( + get_torch_initialization(numpy=False) + ) + numpy_model.W1 = torch_model.W1.detach().clone().numpy() + numpy_model.W2 = torch_model.W2.detach().clone().numpy() + numpy_model.W3 = torch_model.W3.detach().clone().numpy() + + x = torch.randn((10000, 28, 28)) + y = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 0] * 1000) + + y = one_hot(y, numpy=False) + x2 = x.numpy() + y_pred = torch_model.forward(x) + loss = (-y_pred * y).sum(dim=1).mean() + loss.backward() + + y_pred_numpy = numpy_model.forward(x2) + numpy_loss.get_loss(y_pred_numpy, y.numpy()) + + check_flag_1 = check_result(y_pred_numpy, y_pred) + print("+ {:12} {}/{}".format("forward", 10 * check_flag_1, 10)) + except Exception as err: + print("[Runtime Error in forward, error: {}]".format(err)) + print("+ {:12} {}/{}".format("forward", 0, 10)) + return 0 + + try: + numpy_model.backward(numpy_loss.backward()) + + check_flag_2 = [ + check_result(numpy_model.log_grad, torch_model.log_input.grad), + check_result(numpy_model.softmax_grad, + torch_model.softmax_input.grad), + check_result(numpy_model.W3_grad, torch_model.W3.grad), + check_result(numpy_model.W2_grad, torch_model.W2.grad), + check_result(numpy_model.W1_grad, torch_model.W1.grad) + ] + check_flag_2 = sum(check_flag_2) / 5 + print("+ {:12} {}/{}".format("backward", int(20 * check_flag_2), 20)) + except Exception as err: + print("[Runtime Error in backward, error: {}]".format(err)) + print("+ {:12} {}/{}".format("backward", 0, 20)) + check_flag_2 = False + + return int(10 * check_flag_1 + 20 * check_flag_2) + + +if __name__ == "__main__": + testcases = [ + ["matmul", case_1, 5], + ["relu", case_2, 5], + ["log", case_3, 5], + ["softmax_1", case_4, 5], + ["softmax_2", case_5, 10], + ] + score = 0 + for case in testcases: + try: + res = case[2] if case[1]() else 0 + except Exception as err: + print("[Runtime Error in {}, error: {}]".format(case[0], err)) + res = 0 + score += res + print("+ {:12} {}/{}".format(case[0], res, case[2])) + 
score += test_model() + print("{:14} {}/60".format("FINAL SCORE", score)) diff --git a/assignment-2/submission/18307130003/torch_mnist.py b/assignment-2/submission/18307130003/torch_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..dec6c89db4ba96accc0696527a1bf5e0d6da66c0 --- /dev/null +++ b/assignment-2/submission/18307130003/torch_mnist.py @@ -0,0 +1,74 @@ +import torch +from utils import mini_batch, batch, download_mnist, get_torch_initialization, one_hot, plot_curve + + +class TorchModel: + + def __init__(self): + self.W1 = torch.randn((28 * 28, 256), requires_grad=True) + self.W2 = torch.randn((256, 64), requires_grad=True) + self.W3 = torch.randn((64, 10), requires_grad=True) + self.softmax_input = None + self.log_input = None + + def forward(self, x): + x = x.reshape(-1, 28 * 28) + x = torch.relu(torch.matmul(x, self.W1)) + x = torch.relu(torch.matmul(x, self.W2)) + x = torch.matmul(x, self.W3) + + self.softmax_input = x + self.softmax_input.retain_grad() + + x = torch.softmax(x, 1) + + self.log_input = x + self.log_input.retain_grad() + + x = torch.log(x) + + return x + + def optimize(self, learning_rate): + with torch.no_grad(): + self.W1 -= learning_rate * self.W1.grad + self.W2 -= learning_rate * self.W2.grad + self.W3 -= learning_rate * self.W3.grad + + self.W1.grad = None + self.W2.grad = None + self.W3.grad = None + + +def torch_run(): + train_dataset, test_dataset = download_mnist() + + model = TorchModel() + model.W1.data, model.W2.data, model.W3.data = get_torch_initialization( + numpy=False) + + train_loss = [] + + epoch_number = 3 + learning_rate = 0.1 + + for epoch in range(epoch_number): + for x, y in mini_batch(train_dataset, numpy=False): + y = one_hot(y, numpy=False) + + y_pred = model.forward(x) + loss = (-y_pred * y).sum(dim=1).mean() + loss.backward() + model.optimize(learning_rate) + + train_loss.append(loss.item()) + + x, y = batch(test_dataset, numpy=False)[0] + accuracy = model.forward(x).argmax(dim=1).eq(y).float().mean().item() + print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy)) + + plot_curve(train_loss) + + +if __name__ == "__main__": + torch_run() diff --git a/assignment-2/submission/18307130003/utils.py b/assignment-2/submission/18307130003/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f471049774fe2c72cf7e490cc94078ad8e7b593f --- /dev/null +++ b/assignment-2/submission/18307130003/utils.py @@ -0,0 +1,75 @@ +import torch +import numpy as np +from matplotlib import pyplot as plt + + +def plot_curve(data): + plt.plot(range(len(data)), data, color='blue') + plt.legend(['loss_value'], loc='upper right') + plt.xlabel('step') + plt.ylabel('value') + plt.show() + + +def download_mnist(): + from torchvision import datasets, transforms + + transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(mean=(0.1307,), std=(0.3081,)) + ]) + + train_dataset = datasets.MNIST( + root="./data/", transform=transform, train=True, download=True + ) + test_dataset = datasets.MNIST( + root="./data/", transform=transform, train=False, download=True + ) + + return train_dataset, test_dataset + + +def one_hot(y, numpy=True): + if numpy: + y_ = np.zeros((y.shape[0], 10)) + y_[np.arange(y.shape[0], dtype=np.int32), y] = 1 + return y_ + else: + y_ = torch.zeros((y.shape[0], 10)) + y_[torch.arange(y.shape[0], dtype=torch.long), y] = 1 + return y_ + + +def batch(dataset, numpy=True): + data = [] + label = [] + for each in dataset: + data.append(each[0]) + label.append(each[1]) + data = 
torch.stack(data) + label = torch.LongTensor(label) + if numpy: + return [(data.numpy(), label.numpy())] + else: + return [(data, label)] + + +def mini_batch(dataset, batch_size=128, numpy=False): + return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True) + + +def get_torch_initialization(numpy=True): + fc1 = torch.nn.Linear(28 * 28, 256) + fc2 = torch.nn.Linear(256, 64) + fc3 = torch.nn.Linear(64, 10) + + if numpy: + W1 = fc1.weight.T.detach().clone().numpy() + W2 = fc2.weight.T.detach().clone().numpy() + W3 = fc3.weight.T.detach().clone().numpy() + else: + W1 = fc1.weight.T.detach().clone().data + W2 = fc2.weight.T.detach().clone().data + W3 = fc3.weight.T.detach().clone().data + + return W1, W2, W3