diff --git a/assignment-1/submission/18307130003/source.py b/assignment-1/submission/18307130003/source.py
index 3676e4eed7bb10282f277697d818b8ff2103879e..e6bc70f8a0c3faff84ec5b52c083e95676140fab 100644
--- a/assignment-1/submission/18307130003/source.py
+++ b/assignment-1/submission/18307130003/source.py
@@ -149,6 +149,8 @@ class KNN:
print(f'best k = {self.k}\n')
+ print(f'best k = {self.k}\n')
+
def predict(self, test_data: np.ndarray) -> np.ndarray:
'''
Predict the label of a point using our model.
diff --git a/assignment-2/submission/18307130003/README.md b/assignment-2/submission/18307130003/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..13d20470331535b0abb6b7355a45db22cf11fb3f
--- /dev/null
+++ b/assignment-2/submission/18307130003/README.md
@@ -0,0 +1,622 @@
+
+
+# Lab Report
+
+This assignment completes option 1: a feed-forward neural network (FNN) implemented with NumPy and trained on the MNIST dataset.
+
+## Table of Contents
+
+- [Lab Report](#lab-report)
+  - [Table of Contents](#table-of-contents)
+  - [Backpropagation of the FNN Operators](#backpropagation-of-the-fnn-operators)
+    - [1. Matmul](#1-matmul)
+      - [1.1 Derivation](#11-derivation)
+      - [1.2 Implementation](#12-implementation)
+    - [2. Relu](#2-relu)
+      - [2.1 Derivation](#21-derivation)
+      - [2.2 Implementation](#22-implementation)
+    - [3. Log](#3-log)
+      - [3.1 Derivation](#31-derivation)
+      - [3.2 Implementation](#32-implementation)
+    - [4. Softmax](#4-softmax)
+      - [4.1 Derivation](#41-derivation)
+      - [4.2 Implementation](#42-implementation)
+  - [Implementation of the `mini_batch` Function](#implementation-of-the-mini_batch-function)
+  - [Experiments and Results](#experiments-and-results)
+    - [1. Experiment 1](#1-experiment-1)
+      - [1.1 Parameters](#11-parameters)
+      - [1.2 Prediction Accuracy](#12-prediction-accuracy)
+      - [1.3 Loss Values](#13-loss-values)
+    - [2. Experiment 2](#2-experiment-2)
+      - [2.1 Parameters](#21-parameters)
+      - [2.2 Prediction Accuracy](#22-prediction-accuracy)
+      - [2.3 Loss Values](#23-loss-values)
+    - [3. Experiment 3](#3-experiment-3)
+      - [3.1 Parameters](#31-parameters)
+      - [3.2 Prediction Accuracy](#32-prediction-accuracy)
+      - [3.3 Loss Values](#33-loss-values)
+    - [4. Experiment 4](#4-experiment-4)
+      - [4.1 Parameters](#41-parameters)
+      - [4.2 Prediction Accuracy](#42-prediction-accuracy)
+      - [4.3 Loss Values](#43-loss-values)
+
+## Backpropagation of the FNN Operators
+
+### 1. Matmul
+
+#### 1.1 Derivation
+
+Given an $n\times d$ matrix $X$ and a $d\times d'$ matrix $W$, the forward pass of the `Matmul` operator is
+
+$$Y_{n\times d'} = X_{n\times d}\,W_{d\times d'} \tag{1.1}$$
+
+which outputs an $n\times d'$ matrix $Y$.
+
+For the backward pass of the gradient, we have
+
+$$\frac{\partial z}{\partial X} = \frac{\partial z}{\partial Y}\,\frac{\partial Y}{\partial X}, \qquad \frac{\partial z}{\partial W} = \frac{\partial z}{\partial Y}\,\frac{\partial Y}{\partial W} \tag{1.2}$$
+
+We use vectorization to evaluate these matrix derivatives:
+
+$$\mathit{vec}(Y_{n\times d'}) = \mathit{vec}(X_{n\times d}\,W_{d\times d'}) = \left(W^{T}\otimes I_{n}\right)_{nd'\times nd}\mathit{vec}(X)_{nd\times 1} \tag{1.3}$$
+
+Here $\mathit{vec}(X_{m\times n})$ denotes the column-stacked vector
+
+$$\mathit{vec}(X_{m\times n}) = \left(X_{11}, X_{21}, \dots, X_{m1}, X_{12}, \dots, X_{mn}\right)^{T}_{mn\times 1} \tag{1.3.1}$$
+
+$\otimes$ denotes the Kronecker product, and the subscripts indicate the dimensions of the matrices and vectors.
+
+Therefore,
+
+$$\frac{\partial z}{\partial \mathit{vec}(X)} = \left(W^{T}\otimes I_{n}\right)^{T}\frac{\partial z}{\partial \mathit{vec}(Y)} = \mathit{vec}\!\left(\frac{\partial z}{\partial Y}\,W^{T}\right), \qquad \text{i.e.}\qquad \frac{\partial z}{\partial X}_{n\times d} = \frac{\partial z}{\partial Y}_{n\times d'}\,W^{T}_{d'\times d} \tag{1.4}$$
+
+By a derivation analogous to $(1.3)$, we likewise obtain
+
+$$\frac{\partial z}{\partial W}_{d\times d'} = X^{T}_{d\times n}\,\frac{\partial z}{\partial Y}_{n\times d'} \tag{1.5}$$
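+
+The vectorization identity $(1.3)$ can be sanity-checked numerically. The following sketch is illustrative only (not part of the submission) and uses NumPy's column-major `flatten(order='F')` as $\mathit{vec}(\cdot)$:
+
+```python
+import numpy as np
+
+# Check vec(XW) == (W^T ⊗ I_n) vec(X) with column-stacking vec(·).
+n, d, d2 = 4, 3, 5
+X = np.random.normal(size=(n, d))
+W = np.random.normal(size=(d, d2))
+
+
+def vec(M):
+    # column-stacking vec(·)
+    return M.flatten(order='F')
+
+
+lhs = vec(X @ W)                            # vec(XW)
+rhs = np.kron(W.T, np.eye(n)) @ vec(X)      # (W^T ⊗ I_n) vec(X)
+assert np.allclose(lhs, rhs)
+```
+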
+#### 1.2 Implementation
+
+```python {.line-numbers}
+# numpy_fnn.py
+
+class Matmul(NumpyOp):
+ '''
+ Matrix multiplication unit.
+ '''
+
+ def forward(self, x: np.ndarray, w: np.ndarray) -> np.ndarray:
+ '''
+ Args:
+ x: shape(N, d)
+ w: shape(d, d')
+
+ Returns:
+ shape(N, d')
+ '''
+
+ self.memory['x'] = x
+ self.memory['w'] = w
+ return np.matmul(x, w)
+
+ def backward(self, grad_y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+ '''
+ Args:
+ grad_y: shape(N, d')
+
+ Returns:
+ grad_x: shape(N, d)
+ grad_w: shape(d, d')
+ '''
+
+ x: np.ndarray = self.memory['x']
+ w: np.ndarray = self.memory['w']
+ grad_x: np.ndarray = np.matmul(grad_y, w.T)
+ grad_w: np.ndarray = np.matmul(x.T, grad_y)
+ return grad_x, grad_w
+```
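+
+As a quick illustration (not part of the submission), the analytic gradient returned by `backward` can be compared against central finite differences of the scalar $z = \sum_{ij} G_{ij} Y_{ij}$, where $G$ is an arbitrary upstream gradient (so that $\partial z/\partial Y = G$):
+
+```python
+import numpy as np
+
+# Finite-difference check of Matmul.backward (assumes the Matmul class above is defined).
+np.random.seed(0)
+x = np.random.normal(size=(3, 4))
+w = np.random.normal(size=(4, 2))
+
+op = Matmul()
+y = op.forward(x, w)
+grad_y = np.random.normal(size=y.shape)      # arbitrary upstream gradient dz/dY
+grad_x, _ = op.backward(grad_y)
+
+eps = 1e-6
+num_grad_x = np.zeros_like(x)
+for i in range(x.shape[0]):
+    for j in range(x.shape[1]):
+        x_pos, x_neg = x.copy(), x.copy()
+        x_pos[i, j] += eps
+        x_neg[i, j] -= eps
+        # z(X) = sum(grad_y * (X @ W)), so its numerical dz/dX should match grad_x
+        num_grad_x[i, j] = ((grad_y * (x_pos @ w)).sum()
+                            - (grad_y * (x_neg @ w)).sum()) / (2 * eps)
+
+assert np.allclose(grad_x, num_grad_x, atol=1e-5)
+```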
+
+### 2. Relu
+
+#### 2.1 Derivation
+
+Given an $n\times d$ matrix $X$, the forward pass of the `Relu` operator applies, to every element $X_{ij}$,
+
+$$Y_{ij} = \mathrm{Relu}(X_{ij}) = \begin{cases} X_{ij} & X_{ij} > 0 \\ 0 & X_{ij} \le 0 \end{cases} \tag{2.1}$$
+
+which outputs an $n\times d$ matrix $Y$.
+
+For the backward pass of the gradient, we have
+
+$$\frac{\partial z}{\partial X}_{n\times d} = \frac{\partial z}{\partial Y}_{n\times d}\odot\frac{\partial Y}{\partial X}_{n\times d} \tag{2.2}$$
+
+where $\odot$ denotes the Hadamard (element-wise) product.
+
+For each element ${Y'}_{ij}$ of $\frac{\partial Y}{\partial X}_{n\times d}$, equation $(2.1)$ gives
+
+$${Y'}_{ij} = \begin{cases} 1 & X_{ij} > 0 \\ 0 & X_{ij} \le 0 \end{cases} \tag{2.3}$$
+
+Therefore, writing ${Z_Y}' = \frac{\partial z}{\partial Y}_{n\times d}$, each element ${Z'}_{ij}$ of $\frac{\partial z}{\partial X}_{n\times d}$ follows from $(2.2)$ as
+
+$${Z'}_{ij} = \begin{cases} {Z_Y}'_{ij} & X_{ij} > 0 \\ 0 & X_{ij} \le 0 \end{cases} \tag{2.4}$$
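+
+Equation $(2.4)$ is simply an element-wise mask. As a small illustrative sketch (not part of the submission), the `np.where` form used in the implementation below is equivalent to multiplying the upstream gradient by the indicator $X > 0$:
+
+```python
+import numpy as np
+
+# Both lines compute the Relu gradient of (2.4); they agree element-wise.
+x = np.random.normal(size=(3, 4))
+grad_y = np.random.normal(size=(3, 4))
+
+grad_a = np.where(x > 0, grad_y, 0)   # form used in the implementation below
+grad_b = grad_y * (x > 0)             # Hadamard product with the 0/1 indicator
+assert np.allclose(grad_a, grad_b)
+```
+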
+#### 2.2 Implementation
+
+```python {.line-numbers}
+# numpy_fnn.py
+
+class Relu(NumpyOp):
+ '''
+ Rectified Linear Unit.
+ '''
+
+ def forward(self, x: np.ndarray) -> np.ndarray:
+ '''
+ Args:
+ x: shape(N, d)
+
+ Returns:
+ shape(N, d)
+ '''
+
+ self.memory['x'] = x
+ return np.where(x > 0, x, 0)
+
+ def backward(self, grad_y: np.ndarray) -> np.ndarray:
+ '''
+ Args:
+ grad_y: shape(N, d)
+
+ Returns:
+ shape(N, d)
+ '''
+
+ x: np.ndarray = self.memory['x']
+ return np.where(x > 0, grad_y, 0)
+```
+
+### 3. Log
+
+#### 3.1 Derivation
+
+Given an $n\times d$ matrix $X$, the forward pass of the `Log` operator applies, to every element $X_{ij}$,
+
+$$Y_{ij} = \ln X_{ij} \tag{3.1}$$
+
+which outputs an $n\times d$ matrix $Y$.
+
+For the backward pass of the gradient, we have
+
+$$\frac{\partial z}{\partial X}_{n\times d} = \frac{\partial z}{\partial Y}_{n\times d}\odot\frac{\partial Y}{\partial X}_{n\times d} \tag{3.2}$$
+
+For each element ${Y'}_{ij}$ of $\frac{\partial Y}{\partial X}_{n\times d}$, equation $(3.1)$ gives
+
+$${Y'}_{ij} = \frac{1}{X_{ij}} \tag{3.3}$$
+
+Therefore, writing ${Z_Y}' = \frac{\partial z}{\partial Y}_{n\times d}$, each element ${Z'}_{ij}$ of $\frac{\partial z}{\partial X}_{n\times d}$ follows from $(3.2)$ as
+
+$${Z'}_{ij} = \frac{{Z_Y}'_{ij}}{X_{ij}} \tag{3.4}$$
+
+#### 3.2 Implementation
+
+To avoid the overflow caused by $\log{X_{ij}} = -\infty$ when $X_{ij} = 0$, we add a small correction $\epsilon = 10^{-12}$ to $X_{ij}$.
+
+```python {.line-numbers}
+# numpy_fnn.py
+
+class Log(NumpyOp):
+ '''
+ Natural logarithm unit.
+ '''
+
+ def forward(self, x: np.ndarray) -> np.ndarray:
+ '''
+ Args:
+ x: shape(N, d)
+
+ Returns:
+ shape(N, d)
+ '''
+
+ self.memory['x'] = x
+ return np.log(x + self.epsilon)
+
+ def backward(self, grad_y: np.ndarray) -> np.ndarray:
+ '''
+ Args:
+ grad_y: shape(N, d)
+
+ Returns:
+ shape(N, d)
+ '''
+
+ x: np.ndarray = self.memory['x']
+        # Add the same epsilon as the forward pass to avoid division by zero
+        return grad_y / (x + self.epsilon)
+```
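+
+A minimal demonstration of why the $\epsilon$ correction matters (illustrative only, not part of the submission):
+
+```python
+import numpy as np
+
+# Without the correction, log(0) yields -inf (and a divide-by-zero warning);
+# with it, the result is a large but finite value.
+x = np.array([[0.0, 0.5, 1.0]])
+eps = 1e-12
+
+with np.errstate(divide='ignore'):
+    print(np.log(x))        # approx. [-inf, -0.6931, 0.0]
+print(np.log(x + eps))      # approx. [-27.63, -0.6931, 0.0]
+```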
+
+### 4. Softmax
+
+#### 4.1 Derivation
+
+Given an $n\times d$ matrix $X$, the forward pass of the `Softmax` operator computes, for every element $X_{ij}$,
+
+$$Y_{ij} = \frac{e^{X_{ij}}}{\sum_{k=1}^{d} e^{X_{ik}}} \tag{4.1}$$
+
+which outputs an $n\times d$ matrix $Y$.
+
+For the backward pass of the gradient, softmax acts row by row, and within row $i$ its Jacobian is
+
+$$\frac{\partial Y_{ij}}{\partial X_{ik}} = Y_{ij}\left(\mathbb{1}[j = k] - Y_{ik}\right) \tag{4.2}$$
+
+Hence, writing ${Z_Y}' = \frac{\partial z}{\partial Y}_{n\times d}$, each element ${Z'}_{ij}$ of $\frac{\partial z}{\partial X}_{n\times d}$ is
+
+$${Z'}_{ij} = \sum_{k=1}^{d}{Z_Y}'_{ik}\,\frac{\partial Y_{ik}}{\partial X_{ij}} = Y_{ij}\left({Z_Y}'_{ij} - \sum_{k=1}^{d}{Z_Y}'_{ik}\,Y_{ik}\right) \tag{4.3}$$
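+
+The vectorized form of $(4.3)$ used in the implementation below can be checked against an explicit per-row Jacobian product. The following sketch is illustrative only (not part of the submission):
+
+```python
+import numpy as np
+
+# Compare the vectorized softmax gradient with an explicit row-by-row Jacobian product.
+np.random.seed(0)
+x = np.random.normal(size=(4, 5))
+grad_y = np.random.normal(size=(4, 5))
+
+y = np.exp(x) / np.exp(x).sum(axis=1, keepdims=True)
+
+# Vectorized form: Y ⊙ (dz/dY - rowsum(dz/dY ⊙ Y))
+grad_vec = y * (grad_y - (grad_y * y).sum(axis=1, keepdims=True))
+
+# Explicit Jacobian of (4.2): dY_ij/dX_ik = Y_ij (1[j == k] - Y_ik)
+grad_jac = np.empty_like(x)
+for i in range(x.shape[0]):
+    J = np.diag(y[i]) - np.outer(y[i], y[i])
+    grad_jac[i] = J @ grad_y[i]
+
+assert np.allclose(grad_vec, grad_jac)
+```
+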
+#### 4.2 Implementation
+
+```python {.line-numbers}
+# numpy_fnn.py
+
+class Softmax(NumpyOp):
+ '''
+ Softmax over last dimension.
+ '''
+
+ def forward(self, x: np.ndarray) -> np.ndarray:
+ '''
+ Args:
+ x: shape(N, d)
+
+ Returns:
+ shape(N, d)
+ '''
+
+        # Subtract the row-wise maximum before exponentiating for numerical stability
+        e: np.ndarray = np.exp(x - x.max(axis=1, keepdims=True))
+        y: np.ndarray = e / e.sum(axis=1, keepdims=True)
+ self.memory['y'] = y
+ return y
+
+ def backward(self, grad_y: np.ndarray) -> np.ndarray:
+ '''
+ Args:
+ grad_y: shape(N, d)
+
+ Returns:
+ shape(N, d)
+ '''
+
+ y: np.ndarray = self.memory['y']
+ return y * (grad_y - (grad_y * y).sum(axis=1)[:, None])
+```
+
+## Implementation of the `mini_batch` Function
+
+We re-implemented the `mini_batch` function with NumPy; it is used by the training code below.
+
+```python {.line-numbers}
+# numpy_mnist.py
+
+def mini_batch(dataset: List[Tuple[Any, int]], batch_size=128) -> List[Tuple[np.ndarray, np.ndarray]]:
+    '''
+    Split the data and labels from the given dataset into shuffled batches.
+
+    Args:
+        dataset: the given dataset
+        batch_size: the number of samples in each batch
+
+    Returns:
+        A list of (data, labels) batches.
+    '''
+
+ data: np.ndarray = np.array([np.array(pair[0]) for pair in dataset])
+ labels: np.ndarray = np.array([pair[1] for pair in dataset])
+
+ # Shuffle the dataset
+ size: int = len(dataset)
+ indices: np.ndarray = np.arange(size)
+ np.random.shuffle(indices)
+
+ batches: List[Tuple[np.ndarray, np.ndarray]] = []
+ for i in range(0, size, batch_size):
+ chunk: np.ndarray = indices[i:i+batch_size]
+ batches.append((data[chunk], labels[chunk]))
+ return batches
+```
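+
+As a quick sanity check of the batching (illustrative only; a tiny fake dataset stands in for MNIST, and the `mini_batch` function above is assumed to be defined):
+
+```python
+import numpy as np
+
+# Illustrative usage of mini_batch on a small fake dataset of (image, label) pairs.
+fake_dataset = [(np.zeros((28, 28)), i % 10) for i in range(300)]
+
+for data, labels in mini_batch(fake_dataset, batch_size=128):
+    print(data.shape, labels.shape)
+# Expected output (the last batch holds the remainder):
+# (128, 28, 28) (128,)
+# (128, 28, 28) (128,)
+# (44, 28, 28) (44,)
+```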
+
+## Experiments and Results
+
+Run the following command to start training.
+
+```bash
+python ./numpy_mnist.py
+```
+
+### 1. Experiment 1
+
+#### 1.1 Parameters
+
+```python {.line-numbers}
+epoch_number = 3
+batch_size = 128
+learning_rate = 0.1
+```
+
+#### 1.2 Prediction Accuracy
+
+```text
+[0] Accuracy: 0.9485
+[1] Accuracy: 0.9647
+[2] Accuracy: 0.9715
+```
+
+#### 1.3 Loss Values
+
+![Training loss of Experiment 1](./img/loss_value_1.png)
+
+### 2. Experiment 2
+
+This time we increase the number of training epochs (`epoch_number`) and observe how the prediction accuracy changes.
+
+#### 2.1 Parameters
+
+```python {.line-numbers}
+epoch_number = 10
+batch_size = 128
+learning_rate = 0.1
+```
+
+#### 2.2 Prediction Accuracy
+
+```text
+[0] Accuracy: 0.9496
+[1] Accuracy: 0.9600
+[2] Accuracy: 0.9675
+[3] Accuracy: 0.9761
+[4] Accuracy: 0.9755
+[5] Accuracy: 0.9775
+[6] Accuracy: 0.9791
+[7] Accuracy: 0.9795
+[8] Accuracy: 0.9781
+[9] Accuracy: 0.9810
+```
+
+As shown, more training epochs yield higher accuracy, but after a certain point the accuracy starts to fluctuate and no longer improves noticeably.
+
+#### 2.3 Loss Values
+
+![Training loss of Experiment 2](./img/loss_value_2.png)
+
+As shown, with more training epochs the loss keeps decreasing and fluctuates less.
+
+### 3. Experiment 3
+
+This time we increase the batch size (`batch_size`) and observe how the prediction accuracy changes.
+
+#### 3.1 Parameters
+
+```python {.line-numbers}
+epoch_number = 10
+batch_size = 256
+learning_rate = 0.1
+```
+
+#### 3.2 Prediction Accuracy
+
+```text
+[0] Accuracy: 0.9253
+[1] Accuracy: 0.9477
+[2] Accuracy: 0.9522
+[3] Accuracy: 0.9651
+[4] Accuracy: 0.9680
+[5] Accuracy: 0.9731
+[6] Accuracy: 0.9755
+[7] Accuracy: 0.9766
+[8] Accuracy: 0.9766
+[9] Accuracy: 0.9790
+```
+
+As shown, with a larger batch size the accuracy improves more slowly, but the final accuracy shows no obvious change.
+
+#### 3.3 Loss Values
+
+![Training loss of Experiment 3](./img/loss_value_3.png)
+
+As shown, a larger batch size reduces the overall fluctuation of the loss values.
+
+### 4. Experiment 4
+
+This time we increase the learning rate (`learning_rate`) and observe how the prediction accuracy changes.
+
+#### 4.1 Parameters
+
+```python {.line-numbers}
+epoch_number = 10
+batch_size = 256
+learning_rate = 0.5
+```
+
+#### 4.2 Prediction Accuracy
+
+```text
+[0] Accuracy: 0.9505
+[1] Accuracy: 0.9675
+[2] Accuracy: 0.9706
+[3] Accuracy: 0.9471
+[4] Accuracy: 0.9737
+[5] Accuracy: 0.8980
+[6] Accuracy: 0.9757
+[7] Accuracy: 0.9573
+[8] Accuracy: 0.9770
+[9] Accuracy: 0.9792
+```
+
+As shown, a higher learning rate speeds up training, but it can also cause larger fluctuations during training.
+
+#### 4.3 Loss Values
+
+![Training loss of Experiment 4](./img/loss_value_4.png)
+
+As shown, a higher learning rate makes the loss value drop faster, but its overall fluctuation can also be larger.
diff --git a/assignment-2/submission/18307130003/img/equation_1.1.svg b/assignment-2/submission/18307130003/img/equation_1.1.svg
new file mode 100644
index 0000000000000000000000000000000000000000..048e56f154f2f40bd0cdaf0702615807c2145ca0
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_1.1.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_1.2.svg b/assignment-2/submission/18307130003/img/equation_1.2.svg
new file mode 100644
index 0000000000000000000000000000000000000000..33892d4bcf22bcded97604ddc5cf354c18abb576
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_1.2.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_1.3.1.svg b/assignment-2/submission/18307130003/img/equation_1.3.1.svg
new file mode 100644
index 0000000000000000000000000000000000000000..005032ef40fc10a2c05b5a740e7dcde4a78a1c48
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_1.3.1.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_1.3.svg b/assignment-2/submission/18307130003/img/equation_1.3.svg
new file mode 100644
index 0000000000000000000000000000000000000000..68d04c5e681902537b85d4a254b8bf85524089e3
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_1.3.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_1.4.svg b/assignment-2/submission/18307130003/img/equation_1.4.svg
new file mode 100644
index 0000000000000000000000000000000000000000..d655eaae87b6dff86ca1117861409f13019470f2
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_1.4.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_1.5.svg b/assignment-2/submission/18307130003/img/equation_1.5.svg
new file mode 100644
index 0000000000000000000000000000000000000000..54e7cb1aa6e7776fbe1ae1fd3106fb99f2bc1352
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_1.5.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_2.1.svg b/assignment-2/submission/18307130003/img/equation_2.1.svg
new file mode 100644
index 0000000000000000000000000000000000000000..eb21ebb6af0ec4fa6dbdaf6e9a01f907cc863aab
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_2.1.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_2.2.svg b/assignment-2/submission/18307130003/img/equation_2.2.svg
new file mode 100644
index 0000000000000000000000000000000000000000..7edf73de46bde7973651f6e6fb85c7c5ff6eb233
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_2.2.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_2.3.svg b/assignment-2/submission/18307130003/img/equation_2.3.svg
new file mode 100644
index 0000000000000000000000000000000000000000..bf68e43b7fc39081e5efec8cb6a7ac267af76b12
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_2.3.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_2.4.svg b/assignment-2/submission/18307130003/img/equation_2.4.svg
new file mode 100644
index 0000000000000000000000000000000000000000..d6433c573aa219e7e1aae3d34c4e72243cd974ac
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_2.4.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_3.1.svg b/assignment-2/submission/18307130003/img/equation_3.1.svg
new file mode 100644
index 0000000000000000000000000000000000000000..391e1937a4e4b7acab50955142dd3625b07599b5
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_3.1.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_3.2.svg b/assignment-2/submission/18307130003/img/equation_3.2.svg
new file mode 100644
index 0000000000000000000000000000000000000000..3d623c23089d0e03c44240fcf4432ff36157c86a
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_3.2.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_3.3.svg b/assignment-2/submission/18307130003/img/equation_3.3.svg
new file mode 100644
index 0000000000000000000000000000000000000000..6f5a88351d0077541fca59541f6002cfe029f90d
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_3.3.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_3.4.svg b/assignment-2/submission/18307130003/img/equation_3.4.svg
new file mode 100644
index 0000000000000000000000000000000000000000..27682eda729e7c7027f1e7f8a1e946b31916ff64
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_3.4.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_4.1.svg b/assignment-2/submission/18307130003/img/equation_4.1.svg
new file mode 100644
index 0000000000000000000000000000000000000000..a4a904174c49236781bed187d439cb9193e0f4d3
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_4.1.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_4.2.svg b/assignment-2/submission/18307130003/img/equation_4.2.svg
new file mode 100644
index 0000000000000000000000000000000000000000..2256e00b04832055cf7d6c293df80d552f652425
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_4.2.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/equation_4.3.svg b/assignment-2/submission/18307130003/img/equation_4.3.svg
new file mode 100644
index 0000000000000000000000000000000000000000..2136ae53dcd7c04ffefcbd1277a7d90fd6d21297
--- /dev/null
+++ b/assignment-2/submission/18307130003/img/equation_4.3.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/assignment-2/submission/18307130003/img/loss_value_1.png b/assignment-2/submission/18307130003/img/loss_value_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..ac76f374937aadf8ea57663647c3fbf8c8efc8fc
Binary files /dev/null and b/assignment-2/submission/18307130003/img/loss_value_1.png differ
diff --git a/assignment-2/submission/18307130003/img/loss_value_2.png b/assignment-2/submission/18307130003/img/loss_value_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..9b09bfa0b05cc98cc41408c4703ca8b4c87ddf56
Binary files /dev/null and b/assignment-2/submission/18307130003/img/loss_value_2.png differ
diff --git a/assignment-2/submission/18307130003/img/loss_value_3.png b/assignment-2/submission/18307130003/img/loss_value_3.png
new file mode 100644
index 0000000000000000000000000000000000000000..482bd3f6c977c7acb1a0c6dfbf6acc687d92c47d
Binary files /dev/null and b/assignment-2/submission/18307130003/img/loss_value_3.png differ
diff --git a/assignment-2/submission/18307130003/img/loss_value_4.png b/assignment-2/submission/18307130003/img/loss_value_4.png
new file mode 100644
index 0000000000000000000000000000000000000000..268f540308806eb531e46cddbd64a4ebfaedcdc6
Binary files /dev/null and b/assignment-2/submission/18307130003/img/loss_value_4.png differ
diff --git a/assignment-2/submission/18307130003/numpy_fnn.py b/assignment-2/submission/18307130003/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..4331007a5835d00d955e95447e4f20f03481c9ae
--- /dev/null
+++ b/assignment-2/submission/18307130003/numpy_fnn.py
@@ -0,0 +1,213 @@
+from typing import Tuple
+import numpy as np
+
+
+class NumpyOp:
+ '''
+ The base class for Numpy operations.
+ '''
+
+ def __init__(self):
+ self.memory = {}
+ self.epsilon = 1e-12
+
+
+class Matmul(NumpyOp):
+ '''
+ Matrix multiplication unit.
+ '''
+
+ def forward(self, x: np.ndarray, w: np.ndarray) -> np.ndarray:
+ '''
+ Args:
+ x: shape(N, d)
+ w: shape(d, d')
+
+ Returns:
+ shape(N, d')
+ '''
+
+ self.memory['x'] = x
+ self.memory['w'] = w
+ return np.matmul(x, w)
+
+ def backward(self, grad_y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+ '''
+ Args:
+ grad_y: shape(N, d')
+
+ Returns:
+ grad_x: shape(N, d)
+ grad_w: shape(d, d')
+ '''
+
+ x: np.ndarray = self.memory['x']
+ w: np.ndarray = self.memory['w']
+ grad_x: np.ndarray = np.matmul(grad_y, w.T)
+ grad_w: np.ndarray = np.matmul(x.T, grad_y)
+ return grad_x, grad_w
+
+
+class Relu(NumpyOp):
+ '''
+ Rectified Linear Unit.
+ '''
+
+ def forward(self, x: np.ndarray) -> np.ndarray:
+ '''
+ Args:
+ x: shape(N, d)
+
+ Returns:
+ shape(N, d)
+ '''
+
+ self.memory['x'] = x
+ return np.where(x > 0, x, 0)
+
+ def backward(self, grad_y: np.ndarray) -> np.ndarray:
+ '''
+ Args:
+ grad_y: shape(N, d)
+
+ Returns:
+ shape(N, d)
+ '''
+
+ x: np.ndarray = self.memory['x']
+ return np.where(x > 0, grad_y, 0)
+
+
+class Log(NumpyOp):
+ '''
+ Natural logarithm unit.
+ '''
+
+ def forward(self, x: np.ndarray) -> np.ndarray:
+ '''
+ Args:
+ x: shape(N, d)
+
+ Returns:
+ shape(N, d)
+ '''
+
+ self.memory['x'] = x
+ return np.log(x + self.epsilon)
+
+ def backward(self, grad_y: np.ndarray) -> np.ndarray:
+ '''
+ Args:
+ grad_y: shape(N, d)
+
+ Returns:
+ shape(N, d)
+ '''
+
+ x: np.ndarray = self.memory['x']
+        # Add the same epsilon as the forward pass to avoid division by zero
+        return grad_y / (x + self.epsilon)
+
+
+class Softmax(NumpyOp):
+ '''
+ Softmax over last dimension.
+ '''
+
+ def forward(self, x: np.ndarray) -> np.ndarray:
+ '''
+ Args:
+ x: shape(N, d)
+
+ Returns:
+ shape(N, d)
+ '''
+
+        # Subtract the row-wise maximum before exponentiating for numerical stability
+        e: np.ndarray = np.exp(x - x.max(axis=1, keepdims=True))
+        y: np.ndarray = e / e.sum(axis=1, keepdims=True)
+ self.memory['y'] = y
+ return y
+
+ def backward(self, grad_y: np.ndarray) -> np.ndarray:
+ '''
+ Args:
+ grad_y: shape(N, d)
+
+ Returns:
+ shape(N, d)
+ '''
+
+ y: np.ndarray = self.memory['y']
+ return y * (grad_y - (grad_y * y).sum(axis=1)[:, None])
+
+
+class NumpyLoss:
+ '''
+ Loss function.
+ '''
+
+ def __init__(self):
+ self.target: np.ndarray = None
+
+ def get_loss(self, pred: np.ndarray, target: np.ndarray) -> float:
+ self.target = target
+ return (-pred * target).sum(axis=1).mean()
+
+ def backward(self) -> np.ndarray:
+ return -self.target / self.target.shape[0]
+
+
+class NumpyModel:
+ '''
+ An FNN model implemented in NumPy.
+ '''
+
+ def __init__(self):
+ self.W1: np.ndarray = np.random.normal(size=(28 * 28, 256))
+ self.W2: np.ndarray = np.random.normal(size=(256, 64))
+ self.W3: np.ndarray = np.random.normal(size=(64, 10))
+
+ self.matmul_1 = Matmul()
+ self.relu_1 = Relu()
+
+ self.matmul_2 = Matmul()
+ self.relu_2 = Relu()
+
+ self.matmul_3 = Matmul()
+ self.softmax_3 = Softmax()
+ self.log_3 = Log()
+
+ def forward(self, x: np.ndarray) -> np.ndarray:
+ '''
+ Args:
+ x: shape(N, d)
+
+ Returns:
+ shape(N, d)
+ '''
+
+ x = x.reshape(-1, 28 * 28)
+ x = self.relu_1.forward(self.matmul_1.forward(x, self.W1))
+ x = self.relu_2.forward(self.matmul_2.forward(x, self.W2))
+ x = self.softmax_3.forward(self.matmul_3.forward(x, self.W3))
+ x = self.log_3.forward(x)
+ return x
+
+ def backward(self, y: np.ndarray) -> None:
+ '''
+ Args:
+ y: shape(N, d)
+ '''
+
+ self.log_grad = self.log_3.backward(y)
+ self.softmax_grad = self.softmax_3.backward(self.log_grad)
+ self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+
+ self.relu_2_grad = self.relu_2.backward(self.x3_grad)
+ self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+
+ self.relu_1_grad = self.relu_1.backward(self.x2_grad)
+ self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+
+ def optimize(self, learning_rate: float):
+ self.W1 -= learning_rate * self.W1_grad
+ self.W2 -= learning_rate * self.W2_grad
+ self.W3 -= learning_rate * self.W3_grad
diff --git a/assignment-2/submission/18307130003/numpy_mnist.py b/assignment-2/submission/18307130003/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..f992979ba869396a0885b228240cbae6d83404fc
--- /dev/null
+++ b/assignment-2/submission/18307130003/numpy_mnist.py
@@ -0,0 +1,79 @@
+from typing import Any, List, Tuple
+import numpy as np
+from numpy_fnn import NumpyModel, NumpyLoss
+from utils import (
+ download_mnist,
+ batch,
+ # mini_batch,
+ get_torch_initialization,
+ plot_curve,
+ one_hot,
+)
+
+
+def mini_batch(dataset: List[Tuple[Any, int]], batch_size=128) -> List[Tuple[np.ndarray, np.ndarray]]:
+    '''
+    Split the data and labels from the given dataset into shuffled batches.
+
+    Args:
+        dataset: the given dataset
+        batch_size: the number of samples in each batch
+
+    Returns:
+        A list of (data, labels) batches.
+    '''
+
+ data: np.ndarray = np.array([np.array(pair[0]) for pair in dataset])
+ labels: np.ndarray = np.array([pair[1] for pair in dataset])
+
+ # Shuffle the dataset
+ size: int = len(dataset)
+ indices: np.ndarray = np.arange(size)
+ np.random.shuffle(indices)
+
+ batches: List[Tuple[np.ndarray, np.ndarray]] = []
+ for i in range(0, size, batch_size):
+ chunk: np.ndarray = indices[i:i+batch_size]
+ batches.append((data[chunk], labels[chunk]))
+ return batches
+
+
+def numpy_run():
+ '''
+ Train the FNN network on MNIST.
+ '''
+
+ train_dataset, test_dataset = download_mnist()
+
+ model = NumpyModel()
+ numpy_loss = NumpyLoss()
+ model.W1, model.W2, model.W3 = get_torch_initialization()
+
+ train_loss: List[float] = []
+
+ epoch_number = 10
+ batch_size = 256
+ learning_rate = 0.5
+
+ for epoch in range(epoch_number):
+ for x, y in mini_batch(train_dataset, batch_size=batch_size):
+ x: np.ndarray
+ y: np.ndarray = one_hot(y)
+
+ y_pred = model.forward(x)
+ loss = numpy_loss.get_loss(y_pred, y)
+
+ model.backward(numpy_loss.backward())
+ model.optimize(learning_rate)
+
+ train_loss.append(loss.item())
+
+ x, y = batch(test_dataset)[0]
+ accuracy: float = np.mean((model.forward(x).argmax(axis=1) == y))
+ print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+
+ plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+ numpy_run()
diff --git a/assignment-2/submission/18307130003/tester_demo.py b/assignment-2/submission/18307130003/tester_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..68874dece3ca3717b19a722e8489764d089a7aa0
--- /dev/null
+++ b/assignment-2/submission/18307130003/tester_demo.py
@@ -0,0 +1,191 @@
+import numpy as np
+import torch
+from torch import (
+ matmul as torch_matmul,
+ relu as torch_relu,
+ softmax as torch_softmax,
+ log as torch_log,
+)
+
+from numpy_fnn import Matmul, Relu, Softmax, Log, NumpyModel, NumpyLoss
+from torch_mnist import TorchModel
+from utils import get_torch_initialization, one_hot
+
+err_epsilon = 1e-6
+err_p = 0.4
+
+
+def check_result(numpy_result, torch_result=None):
+ if isinstance(numpy_result, list) and torch_result is None:
+ flag = True
+ for (n, t) in numpy_result:
+ flag = flag and check_result(n, t)
+ return flag
+ T = (torch_result * torch.from_numpy(numpy_result) < 0).sum().item()
+ direction = T / torch_result.numel() < err_p
+ result = direction and (
+ (torch.from_numpy(numpy_result) - torch_result).abs().mean() < err_epsilon
+ ).item()
+ print(result)
+ return result
+
+
+def case_1():
+ x = np.random.normal(size=[5, 6])
+ W = np.random.normal(size=[6, 4])
+
+ numpy_matmul = Matmul()
+ numpy_out = numpy_matmul.forward(x, W)
+ numpy_x_grad, numpy_W_grad = numpy_matmul.backward(np.ones_like(numpy_out))
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+ torch_W = torch.from_numpy(W).clone().requires_grad_()
+
+ torch_out = torch_matmul(torch_x, torch_W)
+ torch_out.sum().backward()
+
+ return check_result([
+ (numpy_out, torch_out),
+ (numpy_x_grad, torch_x.grad),
+ (numpy_W_grad, torch_W.grad),
+ ])
+
+
+def case_2():
+ x = np.random.normal(size=[5, 6])
+
+ numpy_relu = Relu()
+ numpy_out = numpy_relu.forward(x)
+ numpy_x_grad = numpy_relu.backward(np.ones_like(numpy_out))
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+ torch_out = torch_relu(torch_x)
+ torch_out.sum().backward()
+
+ return check_result([
+ (numpy_out, torch_out),
+ (numpy_x_grad, torch_x.grad),
+ ])
+
+
+def case_3():
+ x = np.random.uniform(low=0.0, high=1.0, size=[3, 4])
+
+ numpy_log = Log()
+ numpy_out = numpy_log.forward(x)
+ numpy_x_grad = numpy_log.backward(np.ones_like(numpy_out))
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+ torch_out = torch_log(torch_x)
+ torch_out.sum().backward()
+
+ return check_result([
+ (numpy_out, torch_out),
+ (numpy_x_grad, torch_x.grad),
+ ])
+
+
+def case_4():
+ x = np.random.normal(size=[4, 5])
+
+ numpy_softmax = Softmax()
+ numpy_out = numpy_softmax.forward(x)
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+ torch_out = torch_softmax(torch_x, 1)
+
+ return check_result(numpy_out, torch_out)
+
+
+def case_5():
+ x = np.random.normal(size=[20, 25])
+
+ numpy_softmax = Softmax()
+ numpy_out = numpy_softmax.forward(x)
+ numpy_x_grad = numpy_softmax.backward(np.ones_like(numpy_out))
+
+ torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+ torch_out = torch_softmax(torch_x, 1)
+ torch_out.sum().backward()
+
+ return check_result([
+ (numpy_out, torch_out),
+ (numpy_x_grad, torch_x.grad),
+ ])
+
+
+def test_model():
+ try:
+ numpy_loss = NumpyLoss()
+ numpy_model = NumpyModel()
+ torch_model = TorchModel()
+ torch_model.W1.data, torch_model.W2.data, torch_model.W3.data = (
+ get_torch_initialization(numpy=False)
+ )
+ numpy_model.W1 = torch_model.W1.detach().clone().numpy()
+ numpy_model.W2 = torch_model.W2.detach().clone().numpy()
+ numpy_model.W3 = torch_model.W3.detach().clone().numpy()
+
+ x = torch.randn((10000, 28, 28))
+ y = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 0] * 1000)
+
+ y = one_hot(y, numpy=False)
+ x2 = x.numpy()
+ y_pred = torch_model.forward(x)
+ loss = (-y_pred * y).sum(dim=1).mean()
+ loss.backward()
+
+ y_pred_numpy = numpy_model.forward(x2)
+ numpy_loss.get_loss(y_pred_numpy, y.numpy())
+
+ check_flag_1 = check_result(y_pred_numpy, y_pred)
+ print("+ {:12} {}/{}".format("forward", 10 * check_flag_1, 10))
+ except Exception as err:
+ print("[Runtime Error in forward, error: {}]".format(err))
+ print("+ {:12} {}/{}".format("forward", 0, 10))
+ return 0
+
+ try:
+ numpy_model.backward(numpy_loss.backward())
+
+ check_flag_2 = [
+ check_result(numpy_model.log_grad, torch_model.log_input.grad),
+ check_result(numpy_model.softmax_grad,
+ torch_model.softmax_input.grad),
+ check_result(numpy_model.W3_grad, torch_model.W3.grad),
+ check_result(numpy_model.W2_grad, torch_model.W2.grad),
+ check_result(numpy_model.W1_grad, torch_model.W1.grad)
+ ]
+ check_flag_2 = sum(check_flag_2) / 5
+ print("+ {:12} {}/{}".format("backward", int(20 * check_flag_2), 20))
+ except Exception as err:
+ print("[Runtime Error in backward, error: {}]".format(err))
+ print("+ {:12} {}/{}".format("backward", 0, 20))
+ check_flag_2 = False
+
+ return int(10 * check_flag_1 + 20 * check_flag_2)
+
+
+if __name__ == "__main__":
+ testcases = [
+ ["matmul", case_1, 5],
+ ["relu", case_2, 5],
+ ["log", case_3, 5],
+ ["softmax_1", case_4, 5],
+ ["softmax_2", case_5, 10],
+ ]
+ score = 0
+ for case in testcases:
+ try:
+ res = case[2] if case[1]() else 0
+ except Exception as err:
+ print("[Runtime Error in {}, error: {}]".format(case[0], err))
+ res = 0
+ score += res
+ print("+ {:12} {}/{}".format(case[0], res, case[2]))
+ score += test_model()
+ print("{:14} {}/60".format("FINAL SCORE", score))
diff --git a/assignment-2/submission/18307130003/torch_mnist.py b/assignment-2/submission/18307130003/torch_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..dec6c89db4ba96accc0696527a1bf5e0d6da66c0
--- /dev/null
+++ b/assignment-2/submission/18307130003/torch_mnist.py
@@ -0,0 +1,74 @@
+import torch
+from utils import mini_batch, batch, download_mnist, get_torch_initialization, one_hot, plot_curve
+
+
+class TorchModel:
+
+ def __init__(self):
+ self.W1 = torch.randn((28 * 28, 256), requires_grad=True)
+ self.W2 = torch.randn((256, 64), requires_grad=True)
+ self.W3 = torch.randn((64, 10), requires_grad=True)
+ self.softmax_input = None
+ self.log_input = None
+
+ def forward(self, x):
+ x = x.reshape(-1, 28 * 28)
+ x = torch.relu(torch.matmul(x, self.W1))
+ x = torch.relu(torch.matmul(x, self.W2))
+ x = torch.matmul(x, self.W3)
+
+ self.softmax_input = x
+ self.softmax_input.retain_grad()
+
+ x = torch.softmax(x, 1)
+
+ self.log_input = x
+ self.log_input.retain_grad()
+
+ x = torch.log(x)
+
+ return x
+
+ def optimize(self, learning_rate):
+ with torch.no_grad():
+ self.W1 -= learning_rate * self.W1.grad
+ self.W2 -= learning_rate * self.W2.grad
+ self.W3 -= learning_rate * self.W3.grad
+
+ self.W1.grad = None
+ self.W2.grad = None
+ self.W3.grad = None
+
+
+def torch_run():
+ train_dataset, test_dataset = download_mnist()
+
+ model = TorchModel()
+ model.W1.data, model.W2.data, model.W3.data = get_torch_initialization(
+ numpy=False)
+
+ train_loss = []
+
+ epoch_number = 3
+ learning_rate = 0.1
+
+ for epoch in range(epoch_number):
+ for x, y in mini_batch(train_dataset, numpy=False):
+ y = one_hot(y, numpy=False)
+
+ y_pred = model.forward(x)
+ loss = (-y_pred * y).sum(dim=1).mean()
+ loss.backward()
+ model.optimize(learning_rate)
+
+ train_loss.append(loss.item())
+
+ x, y = batch(test_dataset, numpy=False)[0]
+ accuracy = model.forward(x).argmax(dim=1).eq(y).float().mean().item()
+ print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+
+ plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+ torch_run()
diff --git a/assignment-2/submission/18307130003/utils.py b/assignment-2/submission/18307130003/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f471049774fe2c72cf7e490cc94078ad8e7b593f
--- /dev/null
+++ b/assignment-2/submission/18307130003/utils.py
@@ -0,0 +1,75 @@
+import torch
+import numpy as np
+from matplotlib import pyplot as plt
+
+
+def plot_curve(data):
+ plt.plot(range(len(data)), data, color='blue')
+ plt.legend(['loss_value'], loc='upper right')
+ plt.xlabel('step')
+ plt.ylabel('value')
+ plt.show()
+
+
+def download_mnist():
+ from torchvision import datasets, transforms
+
+ transform = transforms.Compose([
+ transforms.ToTensor(),
+ transforms.Normalize(mean=(0.1307,), std=(0.3081,))
+ ])
+
+ train_dataset = datasets.MNIST(
+ root="./data/", transform=transform, train=True, download=True
+ )
+ test_dataset = datasets.MNIST(
+ root="./data/", transform=transform, train=False, download=True
+ )
+
+ return train_dataset, test_dataset
+
+
+def one_hot(y, numpy=True):
+ if numpy:
+ y_ = np.zeros((y.shape[0], 10))
+ y_[np.arange(y.shape[0], dtype=np.int32), y] = 1
+ return y_
+ else:
+ y_ = torch.zeros((y.shape[0], 10))
+ y_[torch.arange(y.shape[0], dtype=torch.long), y] = 1
+ return y_
+
+
+def batch(dataset, numpy=True):
+ data = []
+ label = []
+ for each in dataset:
+ data.append(each[0])
+ label.append(each[1])
+ data = torch.stack(data)
+ label = torch.LongTensor(label)
+ if numpy:
+ return [(data.numpy(), label.numpy())]
+ else:
+ return [(data, label)]
+
+
+def mini_batch(dataset, batch_size=128, numpy=False):
+ return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+
+def get_torch_initialization(numpy=True):
+ fc1 = torch.nn.Linear(28 * 28, 256)
+ fc2 = torch.nn.Linear(256, 64)
+ fc3 = torch.nn.Linear(64, 10)
+
+ if numpy:
+ W1 = fc1.weight.T.detach().clone().numpy()
+ W2 = fc2.weight.T.detach().clone().numpy()
+ W3 = fc3.weight.T.detach().clone().numpy()
+ else:
+ W1 = fc1.weight.T.detach().clone().data
+ W2 = fc2.weight.T.detach().clone().data
+ W3 = fc3.weight.T.detach().clone().data
+
+ return W1, W2, W3