diff --git a/assignment-2/submission/18307130074/img/basic_test.png b/assignment-2/submission/18307130074/img/basic_test.png new file mode 100644 index 0000000000000000000000000000000000000000..09a77bfd30f6ba7bf950b8413637d9fe163ebe53 Binary files /dev/null and b/assignment-2/submission/18307130074/img/basic_test.png differ diff --git a/assignment-2/submission/18307130074/img/get_torch_initialization.png b/assignment-2/submission/18307130074/img/get_torch_initialization.png new file mode 100644 index 0000000000000000000000000000000000000000..5346f4e9b753d2375e9db0d9674101836a32989c Binary files /dev/null and b/assignment-2/submission/18307130074/img/get_torch_initialization.png differ diff --git a/assignment-2/submission/18307130074/img/mini_batch.png b/assignment-2/submission/18307130074/img/mini_batch.png new file mode 100644 index 0000000000000000000000000000000000000000..45a8913a0f8fc0dd86aab705aa6fb87d7d796fdc Binary files /dev/null and b/assignment-2/submission/18307130074/img/mini_batch.png differ diff --git a/assignment-2/submission/18307130074/img/research.png b/assignment-2/submission/18307130074/img/research.png new file mode 100644 index 0000000000000000000000000000000000000000..77f8f897fd02871f3a93a3636bc374c0232b6341 Binary files /dev/null and b/assignment-2/submission/18307130074/img/research.png differ diff --git a/assignment-2/submission/18307130074/numpy_fnn.py b/assignment-2/submission/18307130074/numpy_fnn.py new file mode 100644 index 0000000000000000000000000000000000000000..11820682b35a685487f8246c31a6026ed2190c00 --- /dev/null +++ b/assignment-2/submission/18307130074/numpy_fnn.py @@ -0,0 +1,171 @@ +import numpy as np + + +class NumpyOp: + + def __init__(self): + self.memory = {} + self.epsilon = 1e-12 + + +class Matmul(NumpyOp): + + def forward(self, x, W): + """ + x: shape(N, d) + w: shape(d, d') + """ + self.memory['x'] = x + self.memory['W'] = W + h = np.matmul(x, W) + return h + + def backward(self, grad_y): + """ + grad_y: shape(N, d') + """ + + grad_x = np.matmul(grad_y, self.memory['W'].T) + grad_W = np.matmul(self.memory['x'].T, grad_y) + + return grad_x, grad_W + + +class Relu(NumpyOp): + + def forward(self, x): + self.memory['x'] = x + return np.where(x > 0, x, np.zeros_like(x)) + + def backward(self, grad_y): + """ + grad_y: same shape as x + """ + + grad_x = np.where(self.memory['x'] > 0, 1, 0) * grad_y + + return grad_x + + +class Log(NumpyOp): + + def forward(self, x): + """ + x: shape(N, c) + """ + + out = np.log(x + self.epsilon) + self.memory['x'] = x + + return out + + def backward(self, grad_y): + """ + grad_y: same shape as x + """ + + grad_x = np.reciprocal(self.memory['x'] + self.epsilon) * grad_y + + return grad_x + + +class Softmax(NumpyOp): + """ + softmax over last dimension + """ + + def forward(self, x): + """ + x: shape(N, c) + """ + + r = np.exp(x) + s = np.sum(r, axis=1).reshape(-1, 1) + out = (r / s).astype('float64') + self.memory['out'] = out + + return out + + def backward(self, grad_y): + """ + grad_y: same shape as x + """ + + out = self.memory['out'] + Matrix = [] + for i in range(out.shape[0]): + row = out[i] + Jacob = np.diag(row) - np.outer(row, row) + Matrix.append(Jacob) + Matrix = np.array(Matrix) + grad_x = np.squeeze(np.matmul(grad_y[:,np.newaxis,:], Matrix), axis=1) + + return grad_x + + +class NumpyLoss: + + def __init__(self): + self.target = None + + def get_loss(self, pred, target): + self.target = target + return (-pred * target).sum(axis=1).mean() + + def backward(self): + return -self.target / 
self.target.shape[0]
+
+
+class NumpyModel:
+    def __init__(self):
+        self.W1 = np.random.normal(size=(28 * 28, 256))
+        self.W2 = np.random.normal(size=(256, 64))
+        self.W3 = np.random.normal(size=(64, 10))
+
+        # operators used in both forward and backward
+        self.matmul_1 = Matmul()
+        self.relu_1 = Relu()
+        self.matmul_2 = Matmul()
+        self.relu_2 = Relu()
+        self.matmul_3 = Matmul()
+        self.softmax = Softmax()
+        self.log = Log()
+
+        # variables updated in backward: softmax_grad, log_grad, etc. hold the gradients propagated
+        # back through each operator (partial derivatives of the loss w.r.t. the operator's input)
+        self.x1_grad, self.W1_grad = None, None
+        self.relu_1_grad = None
+        self.x2_grad, self.W2_grad = None, None
+        self.relu_2_grad = None
+        self.x3_grad, self.W3_grad = None, None
+        self.softmax_grad = None
+        self.log_grad = None
+
+    def forward(self, x):
+        x = x.reshape(-1, 28 * 28)
+
+        z1 = self.matmul_1.forward(x, self.W1)
+        x2 = self.relu_1.forward(z1)
+        z2 = self.matmul_2.forward(x2, self.W2)
+        x3 = self.relu_2.forward(z2)
+        z3 = self.matmul_3.forward(x3, self.W3)
+        out = self.softmax.forward(z3)
+        x = self.log.forward(out)
+
+        return x
+
+    def backward(self, y):
+
+        self.log_grad = self.log.backward(y)
+        self.softmax_grad = self.softmax.backward(self.log_grad)
+        self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
+        self.relu_2_grad = self.relu_2.backward(self.x3_grad)
+        self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
+        self.relu_1_grad = self.relu_1.backward(self.x2_grad)
+        self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
+
+    def optimize(self, learning_rate):
+        self.W1 -= learning_rate * self.W1_grad
+        self.W2 -= learning_rate * self.W2_grad
+        self.W3 -= learning_rate * self.W3_grad
diff --git a/assignment-2/submission/18307130074/numpy_mnist.py b/assignment-2/submission/18307130074/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..689df447bfdcf31afa5bfc4d9328e77177412dba
--- /dev/null
+++ b/assignment-2/submission/18307130074/numpy_mnist.py
@@ -0,0 +1,83 @@
+import numpy as np
+from numpy_fnn import NumpyModel, NumpyLoss
+
+import os
+os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
+
+# from utils import download_mnist, batch, mini_batch, get_torch_initialization, plot_curve, one_hot
+from utils import download_mnist, batch, plot_curve, one_hot
+
+def mini_batch(dataset, batch_size=128):
+
+    data = []
+    label = []
+
+    for each in dataset:
+        data.append(np.array(each[0]))
+        label.append(each[1])
+
+    label = np.array(label)
+    data = np.array(data)
+
+    num = data.shape[0]
+    i = np.arange(num)
+    np.random.shuffle(i)
+
+    label_ = label[i]
+    data_ = data[i]
+
+    res = []
+    for id in range(num // batch_size):
+        batch_data = data_[id * batch_size: (id + 1) * batch_size]
+        batch_label = label_[id * batch_size: (id + 1) * batch_size]
+        res.append((batch_data, batch_label))
+
+    return res
+
+
+def get_torch_initialization():
+
+    def parameters(in_features, out_features, param = 5**0.5):
+        bound = (6 / (1 + param * param) / in_features ) ** 0.5
+        return np.random.uniform(-bound, bound, (in_features, out_features))
+
+    W1 = parameters(28 * 28, 256)
+    W2 = parameters(256, 64)
+    W3 = parameters(64, 10)
+    return W1, W2, W3
+
+
+def numpy_run():
+    train_dataset, test_dataset = download_mnist()
+
+    model = NumpyModel()
+    numpy_loss = NumpyLoss()
+    model.W1, model.W2, model.W3 = get_torch_initialization()
+
+    train_loss = []
+
+    epoch_number = 3
+    learning_rate = 0.1
+
+    for epoch in range(epoch_number):
+
+        for x, y in mini_batch(train_dataset):
+            y = one_hot(y)
+
+            y_pred = model.forward(x)
+            loss = numpy_loss.get_loss(y_pred, y)
+
+            model.backward(numpy_loss.backward())
+            model.optimize(learning_rate)
+
+            train_loss.append(loss.item())
+
+        x, y = batch(test_dataset)[0]
+        accuracy = np.mean((model.forward(x).argmax(axis=1) == y))
+        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))
+
+    plot_curve(train_loss)
+
+
+if __name__ == "__main__":
+    numpy_run()
diff --git a/assignment-2/submission/18307130074/readme.md b/assignment-2/submission/18307130074/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..71c3f6f1d990e3eff9e30a6ae8a33d2fbfbfe19b
--- /dev/null
+++ b/assignment-2/submission/18307130074/readme.md
@@ -0,0 +1,371 @@
+# Assignment 2: FNN
+
+18307130074 姜博天 (Topic 1)
+
+## 1. Backward-pass derivations for the operators in numpy_fnn.py
+
+Throughout this section, y denotes the gradient flowing into an operator during backpropagation, i.e. the partial derivative of the loss with respect to that operator's output.
+
+### 1. Matmul
+
+The forward pass computes h = xW, where x has shape (N, d) and W has shape (d, d'), so h has shape (N, d').
+
+The incoming gradient y has shape (N, d'):
+$$
+y_{ij} = \frac{\partial Loss}{\partial h_{ij}}
+$$
+Gradient with respect to W:
+$$
+\begin{aligned}
+    \frac{\partial Loss}{\partial W_{pq}}
+    &=\sum_{i \leqslant N,\ j \leqslant d'} \frac{\partial Loss}{\partial h_{ij}} \times \frac{\partial h_{ij}}{\partial W_{pq}}\\\\
+    &=\sum_{i \leqslant N} y_{iq} \times \frac{\partial h_{iq}}{\partial W_{pq}}\\\\
+    &=\sum_{i \leqslant N} y_{iq} \times x_{ip}\\\\
+    &=\sum_{i \leqslant N} (x^T)_{pi} \times y_{iq}
+\end{aligned}
+$$
+Therefore
+$$
+\frac{\partial Loss}{\partial W} = x^T \times y
+$$
+and, by the same argument, the gradient with respect to x is
+$$
+\frac{\partial Loss}{\partial x} = y \times W^T
+$$
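+
+As a quick sanity check (not part of the submitted code), both formulas can be compared against a numerical gradient. The sketch below assumes only the `Matmul` operator from `numpy_fnn.py`; `numerical_grad` is a hypothetical helper written here for illustration.
+
+```python
+import numpy as np
+from numpy_fnn import Matmul
+
+
+def numerical_grad(f, a, eps=1e-6):
+    # central finite differences of the scalar function f w.r.t. the array a
+    g = np.zeros_like(a)
+    it = np.nditer(a, flags=['multi_index'])
+    while not it.finished:
+        idx = it.multi_index
+        old = a[idx]
+        a[idx] = old + eps
+        f_plus = f()
+        a[idx] = old - eps
+        f_minus = f()
+        a[idx] = old
+        g[idx] = (f_plus - f_minus) / (2 * eps)
+        it.iternext()
+    return g
+
+
+x = np.random.normal(size=(5, 4))
+W = np.random.normal(size=(4, 3))
+
+op = Matmul()
+out = op.forward(x, W)
+# loss = sum(x @ W), so the upstream gradient is all ones
+grad_x, grad_W = op.backward(np.ones_like(out))
+
+num_grad_W = numerical_grad(lambda: np.matmul(x, W).sum(), W)
+num_grad_x = numerical_grad(lambda: np.matmul(x, W).sum(), x)
+
+print(np.abs(grad_W - num_grad_W).max())  # expected to be around 1e-9 or smaller
+print(np.abs(grad_x - num_grad_x).max())
+```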
+
+### 2. ReLU
+
+The forward pass maps x of shape (N, d) to out of shape (N, d):
+$$
+out_{ij} = \begin{cases}
+    0 & x_{ij} \leq 0 \\\\
+    x_{ij} & x_{ij} > 0
+  \end{cases}
+$$
+Gradient with respect to x:
+$$
+\begin{aligned}
+    \frac{\partial Loss}{\partial x_{ij}}
+    &=\frac{\partial Loss}{\partial out_{ij}} \times \frac{\partial out_{ij}}{\partial x_{ij}}\\\\
+    &=y_{ij} \times \frac{\partial out_{ij}}{\partial x_{ij}}
+\end{aligned}
+$$
+where
+$$
+\frac{\partial out_{ij}}{\partial x_{ij}} = \begin{cases}
+    0 & x_{ij} \leq 0 \\\\
+    1 & x_{ij} > 0
+  \end{cases}
+$$
+so the gradient is the element-wise product
+$$
+\frac{\partial Loss}{\partial x} = y \odot x', \quad
+x'_{ij} = \begin{cases}
+    0 & x_{ij} \leq 0 \\\\
+    1 & x_{ij} > 0
+  \end{cases}
+$$
+
+### 3. Log
+
+The forward pass maps x of shape (N, d) to out of shape (N, d):
+$$
+out_{ij} = \log x_{ij}
+$$
+Gradient with respect to x:
+$$
+\begin{aligned}
+    \frac{\partial Loss}{\partial x_{ij}}
+    &=\frac{\partial Loss}{\partial out_{ij}} \times \frac{\partial out_{ij}}{\partial x_{ij}}\\\\
+    &=y_{ij} \times \frac{\partial out_{ij}}{\partial x_{ij}}
+\end{aligned}
+$$
+where
+$$
+\frac{\partial out_{ij}}{\partial x_{ij}} = \frac{1}{x_{ij}}
+$$
+so, element-wise,
+$$
+\frac{\partial Loss}{\partial x} = y \odot \frac{1}{x}
+$$
+(The implementation adds a small epsilon to x in both forward and backward to avoid log(0) and division by zero.)
+
+### 4. Softmax
+
+The forward pass maps x of shape (N, d) to out of shape (N, d):
+$$
+out_{ij} = \frac{e^{x_{ij}}}{\sum_{k=1}^d e^{x_{ik}}}
+$$
+Each row of out depends only on the corresponding row of x, so we first consider a single row and then extend the result to the whole matrix.
+$$
+\frac{\partial Loss}{\partial x_i}
+=\sum_{j=1}^d \frac{\partial Loss}{\partial out_j} \times \frac{\partial out_j}{\partial x_i}
+=\sum_{j=1}^d y_j \times \frac{\partial out_j}{\partial x_i}
+$$
+with
+$$
+\frac{\partial out_i}{\partial x_j} =
+  \begin{cases}
+    out_i \times (1 - out_i) & i = j\\\\
+    -out_i \times out_j & i \neq j
+  \end{cases}
+$$
+Writing a row as
+$$
+softmax([x_1, x_2, \dots, x_d]) = [out_1, out_2, \dots, out_d]
+$$
+the gradient of that row is the row vector of upstream gradients times the Jacobian:
+$$
+\frac{\partial Loss}{\partial x} = \left[\frac{\partial Loss}{\partial out_1}, \frac{\partial Loss}{\partial out_2}, \dots, \frac{\partial Loss}{\partial out_d}\right] \times
+\begin{bmatrix}
+\frac{\partial out_1}{\partial x_1} & \frac{\partial out_1}{\partial x_2} & \dots & \frac{\partial out_1}{\partial x_d}\\\\
+\frac{\partial out_2}{\partial x_1} & \frac{\partial out_2}{\partial x_2} & \dots & \frac{\partial out_2}{\partial x_d}\\\\
+\dots & \dots & \dots & \dots\\\\
+\frac{\partial out_d}{\partial x_1} & \frac{\partial out_d}{\partial x_2} & \dots & \frac{\partial out_d}{\partial x_d}\\\\
+\end{bmatrix}
+$$
+So a [1, d] vector has to be multiplied by a [d, d] Jacobian to obtain the gradient of one row.
+
+For an [N, d] input, the upstream gradient can be viewed as an [N, 1, d] tensor by inserting an axis and batch-multiplied with an [N, d, d] stack of per-row Jacobians. The result is again [N, 1, d] and is squeezed back to [N, d] with numpy before being returned.
+
+The code is as follows:
+
+```python
+out = self.memory['out']
+Matrix = []
+for i in range(out.shape[0]):
+    row = out[i]
+    Jacob = np.diag(row) - np.outer(row, row)
+    Matrix.append(Jacob)
+Matrix = np.array(Matrix)
+grad_x = np.squeeze(np.matmul(grad_y[:,np.newaxis,:], Matrix), axis=1)
+
+return grad_x
+```
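+
+The per-row Jacobian never has to be materialized explicitly, though. From the case analysis above, $\sum_{j} y_j \frac{\partial out_j}{\partial x_i} = out_i \times (y_i - \sum_k y_k\,out_k)$, so an equivalent backward pass needs only element-wise operations and a row sum. This is only an optional sketch of that reformulation, not what the submitted `Softmax.backward` does:
+
+```python
+import numpy as np
+
+
+def softmax_backward_vectorized(grad_y, out):
+    # grad_y, out: arrays of shape (N, d); out is the softmax output saved in forward
+    # s[i] = sum_k grad_y[i, k] * out[i, k], kept as a column for broadcasting
+    s = np.sum(grad_y * out, axis=1, keepdims=True)
+    return out * (grad_y - s)
+```
+
+On random inputs this matches the Jacobian-based loop above up to floating-point error, while avoiding the [N, d, d] intermediate tensor.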
+
+## 2. Model training and testing
+
+### 1. Tests with the functions already provided in utils.py
+
+Basic test:
+
+| epoch | accuracy |
+| ----- | -------- |
+| 0     | 0.9409   |
+| 1     | 0.9653   |
+| 2     | 0.9694   |
+
+![basic_test](img/basic_test.png)
+
+**Comparison experiments with different epoch counts and learning rates**
+
+| epoch | accuracy with learning_rate = 0.05 | accuracy with learning_rate = 0.1 | accuracy with learning_rate = 0.2 |
+| ----- | ---------------------------------- | --------------------------------- | --------------------------------- |
+| 0     | 0.9266                             | 0.9468                            | 0.9602                            |
+| 1     | 0.9482                             | 0.9638                            | 0.9708                            |
+| 2     | 0.9571                             | 0.9701                            | 0.9752                            |
+| 4     | 0.9692                             | 0.9755                            | 0.9785                            |
+| 9     | 0.9787                             | 0.9792                            | 0.9811                            |
+| 14    | 0.9793                             | 0.9811                            | 0.9835                            |
+| 19    | 0.9811                             | 0.9811                            | 0.9835                            |
+| 29    | 0.9820                             | 0.9806                            | 0.9838                            |
+| 39    | 0.9823                             | 0.9809                            | 0.9832                            |
+| 49    | 0.9822                             | 0.9809                            | 0.9835                            |
+| 69    | 0.9826                             | 0.9810                            | 0.9836                            |
+| 99    | 0.9826                             | 0.9811                            | 0.9836                            |
+
+Because the full set of runs is large, only part of the data is shown here. For each learning rate, the highest accuracy reached and the epoch at which it was first reached are listed in the following table.
+
+| learning_rate | epoch | accuracy |
+| ------------- | ----- | -------- |
+| 0.05          | 44    | 0.9827   |
+| 0.1           | 23    | 0.9815   |
+| 0.2           | 16    | 0.9839   |
+
+As the learning rate increases, the epoch at which the maximum accuracy is first reached drops, and the accuracy in the first few epochs is also higher. In addition, no matter which learning rate is used, the accuracy keeps oscillating, and the oscillation already starts before epoch 10.
+
+**Choosing the number of epochs and the learning rate**
+
+Too many epochs waste GPU time and encourage overfitting, while too few may stop training before the best result is reached, so it is worth picking reasonable values for both hyperparameters before the experiments start. The literature suggests that the best learning rate is not a constant but a function of the epoch: start with a relatively large learning rate and halve it every few epochs. There is no equally fixed rule for the number of epochs; the usual practice is to watch the loss and pick the epoch count at which it bottoms out, often around 10. Below is one run with 10 epochs, an initial learning rate of 0.2, and the learning rate halved every 2 epochs.
+
+| epoch | accuracy |
+| ----- | -------- |
+| 0     | 0.9573   |
+| 1     | 0.9694   |
+| 2     | 0.9759   |
+| 3     | 0.9791   |
+| 4     | 0.9789   |
+| 5     | 0.9789   |
+| 6     | 0.9809   |
+| 7     | 0.9812   |
+| 8     | 0.9807   |
+| 9     | 0.9812   |
+
+![research](img/research.png)
+
+Compared with the fixed-learning-rate runs above, the decayed schedule gives a slight but not dramatic improvement.
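+
+The decay schedule used for that run is simple enough to write down. The sketch below shows one way it could be wired into the training loop, reusing the submission's own helpers; the halving interval and initial rate are the values quoted above, and evaluation/plotting are omitted.
+
+```python
+from numpy_fnn import NumpyModel, NumpyLoss
+from numpy_mnist import mini_batch, get_torch_initialization
+from utils import download_mnist, one_hot
+
+train_dataset, _ = download_mnist()
+model = NumpyModel()
+numpy_loss = NumpyLoss()
+model.W1, model.W2, model.W3 = get_torch_initialization()
+
+epoch_number = 10
+initial_learning_rate = 0.2
+halve_every = 2  # halve the learning rate every 2 epochs
+
+for epoch in range(epoch_number):
+    # 0.2 for epochs 0-1, 0.1 for epochs 2-3, 0.05 for epochs 4-5, ...
+    learning_rate = initial_learning_rate * (0.5 ** (epoch // halve_every))
+
+    for x, y in mini_batch(train_dataset):
+        y = one_hot(y)
+        y_pred = model.forward(x)
+        # get_loss also stores the target, which backward() needs
+        loss = numpy_loss.get_loss(y_pred, y)
+        model.backward(numpy_loss.backward())
+        model.optimize(learning_rate)
+```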
+
+### 2. Reimplementing mini_batch
+
+The mini_batch provided in utils.py is only a single line:
+
+```python
+def mini_batch(dataset, batch_size=128, numpy=False):
+    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
+```
+
+Reading the DataLoader class in the torch.utils.data source shows that, given a dataset and a batch_size, it shuffles the dataset and splits it into batches of batch_size samples. Knowing that, the function is easy to reimplement with numpy alone: shuffle an index array with numpy.random.shuffle and slice the reordered data into batches.
+
+```python
+def mini_batch(dataset, batch_size=128):
+
+    data = []
+    label = []
+
+    for each in dataset:
+        data.append(np.array(each[0]))
+        label.append(each[1])
+
+    label = np.array(label)
+    data = np.array(data)
+
+    num = data.shape[0]
+    i = np.arange(num)
+    np.random.shuffle(i)
+
+    label_ = label[i]
+    data_ = data[i]
+
+    res = []
+    for id in range(num // batch_size):
+        batch_data = data_[id * batch_size: (id + 1) * batch_size]
+        batch_label = label_[id * batch_size: (id + 1) * batch_size]
+        res.append((batch_data, batch_label))
+
+    return res
+```
+
+### 3. Basic test with the reimplemented mini_batch
+
+| epoch | accuracy |
+| ----- | -------- |
+| 0     | 0.9476   |
+| 1     | 0.9640   |
+| 2     | 0.9715   |
+
+The results are close to those obtained with the mini_batch from utils.py.
+
+![mini_batch](img/mini_batch.png)
+
+## 3. PyTorch weight initialization
+
+First, look at the source of torch.nn.Linear:
+
+```python
+def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None:
+    super(Linear, self).__init__()
+    self.in_features = in_features
+    self.out_features = out_features
+    self.weight = Parameter(torch.Tensor(out_features, in_features))
+    if bias:
+        self.bias = Parameter(torch.Tensor(out_features))
+    else:
+        self.register_parameter('bias', None)
+    self.reset_parameters()
+
+def reset_parameters(self) -> None:
+    init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+    if self.bias is not None:
+        fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
+        bound = 1 / math.sqrt(fan_in)
+        init.uniform_(self.bias, -bound, bound)
+```
+
+After self.weight and self.bias are defined, reset_parameters is called, and the key step in that function is init.kaiming_uniform_. Its source is:
+
+```python
+def kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
+    fan = _calculate_correct_fan(tensor, mode)
+    gain = calculate_gain(nonlinearity, a)
+    std = gain / math.sqrt(fan)
+    bound = math.sqrt(3.0) * std
+    with torch.no_grad():
+        return tensor.uniform_(-bound, bound)
+```
+
+The source of calculate_gain is:
+
+```python
+def calculate_gain(nonlinearity, param=None):
+    linear_fns = ['linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', 'conv_transpose2d', 'conv_transpose3d']
+    if nonlinearity in linear_fns or nonlinearity == 'sigmoid':
+        return 1
+    elif nonlinearity == 'tanh':
+        return 5.0 / 3
+    elif nonlinearity == 'relu':
+        return math.sqrt(2.0)
+    elif nonlinearity == 'leaky_relu':
+        if param is None:
+            negative_slope = 0.01
+        elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float):
+            negative_slope = param
+        else:
+            raise ValueError("negative_slope {} not a valid number".format(param))
+        return math.sqrt(2.0 / (1 + negative_slope ** 2))
+    elif nonlinearity == 'selu':
+        return 3.0 / 4
+    else:
+        raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))
+```
+
+| nonlinearity | gain                       |
+| ------------ | -------------------------- |
+| ReLU         | sqrt(2)                    |
+| Leaky_ReLU   | sqrt(2 / (1 + param ** 2)) |
+
+Since kaiming_uniform_ defaults to leaky_relu,
+$$
+bound = \sqrt{\frac{2}{1 + param^2}} \times \sqrt{\frac{3}{fan\_in}}
+$$
+and Linear passes param = math.sqrt(5), so the code follows directly:
+
+```python
+def get_torch_initialization():
+
+    def parameters(in_features, out_features, param = 5**0.5):
+        bound = (6 / (1 + param * param) / in_features ) ** 0.5
+        return np.random.uniform(-bound, bound, (in_features, out_features))
+
+    W1 = parameters(28 * 28, 256)
+    W2 = parameters(256, 64)
+    W3 = parameters(64, 10)
+    return W1, W2, W3
+```
+
+This is the uniform form of Kaiming initialization, which is exactly how PyTorch initializes these parameters.
+
+Kaiming initialization also has a normal-distribution form. It differs only slightly from the uniform one: instead of a bound, it computes a standard deviation
+$$
+std = \sqrt{\frac{2}{fan\_in \times (1 + param^2)}}
+$$
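+
+For completeness, a normal-distribution counterpart of the `parameters` helper above could look like the sketch below. This variant is not used in the submission, and the name `parameters_normal` is just illustrative; it only restates the std formula in code.
+
+```python
+import numpy as np
+
+
+def parameters_normal(in_features, out_features, param=5 ** 0.5):
+    # Kaiming normal: std = gain / sqrt(fan_in) with gain = sqrt(2 / (1 + param^2))
+    std = (2 / ((1 + param * param) * in_features)) ** 0.5
+    return np.random.normal(0.0, std, (in_features, out_features))
+
+
+W1 = parameters_normal(28 * 28, 256)
+W2 = parameters_normal(256, 64)
+W3 = parameters_normal(64, 10)
+```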
+
+Running the basic test with the reimplemented get_torch_initialization gives the following results:
+
+| epoch | accuracy |
+| ----- | -------- |
+| 0     | 0.9421   |
+| 1     | 0.9658   |
+| 2     | 0.9742   |
+
+![get_torch_initialization](img/get_torch_initialization.png)
\ No newline at end of file
diff --git a/assignment-2/submission/18307130074/tester_demo.py b/assignment-2/submission/18307130074/tester_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..e63107a79414018a4600e9795bba41ff82826c9a
--- /dev/null
+++ b/assignment-2/submission/18307130074/tester_demo.py
@@ -0,0 +1,183 @@
+import numpy as np
+import torch
+from torch import matmul as torch_matmul, relu as torch_relu, softmax as torch_softmax, log as torch_log
+
+from numpy_fnn import Matmul, Relu, Softmax, Log, NumpyModel, NumpyLoss
+from torch_mnist import TorchModel
+from utils import get_torch_initialization, one_hot
+
+err_epsilon = 1e-6
+err_p = 0.4
+
+import os
+os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
+def check_result(numpy_result, torch_result=None):
+    if isinstance(numpy_result, list) and torch_result is None:
+        flag = True
+        for (n, t) in numpy_result:
+            flag = flag and check_result(n, t)
+        return flag
+    # print((torch.from_numpy(numpy_result) - torch_result).abs().mean().item())
+    T = (torch_result * torch.from_numpy(numpy_result) < 0).sum().item()
+    direction = T / torch_result.numel() < err_p
+    return direction and ((torch.from_numpy(numpy_result) - torch_result).abs().mean() < err_epsilon).item()
+
+
+def case_1():
+    x = np.random.normal(size=[5, 6])
+    W = np.random.normal(size=[6, 4])
+
+    numpy_matmul = Matmul()
+    numpy_out = numpy_matmul.forward(x, W)
+    numpy_x_grad, numpy_W_grad = numpy_matmul.backward(np.ones_like(numpy_out))
+
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+    torch_W = torch.from_numpy(W).clone().requires_grad_()
+
+    torch_out = torch_matmul(torch_x, torch_W)
+    torch_out.sum().backward()
+
+    return check_result([
+        (numpy_out, torch_out),
+        (numpy_x_grad, torch_x.grad),
+        (numpy_W_grad, torch_W.grad)
+    ])
+
+
+def case_2():
+    x = np.random.normal(size=[5, 6])
+
+    numpy_relu = Relu()
+    numpy_out = numpy_relu.forward(x)
+    numpy_x_grad = numpy_relu.backward(np.ones_like(numpy_out))
+
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+    torch_out = torch_relu(torch_x)
+    torch_out.sum().backward()
+
+    return check_result([
+        (numpy_out, torch_out),
+        (numpy_x_grad, torch_x.grad),
+    ])
+
+
+def case_3():
+    x = np.random.uniform(low=0.0, high=1.0, size=[3, 4])
+
+    numpy_log = Log()
+    numpy_out = numpy_log.forward(x)
+    numpy_x_grad = numpy_log.backward(np.ones_like(numpy_out))
+
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+    torch_out = torch_log(torch_x)
+    torch_out.sum().backward()
+
+    return check_result([
+        (numpy_out, torch_out),
+
+        (numpy_x_grad, torch_x.grad),
+    ])
+
+
+def case_4():
+    x = np.random.normal(size=[4, 5])
+
+    numpy_softmax = Softmax()
+    numpy_out = numpy_softmax.forward(x)
+
+    torch_x = torch.from_numpy(x).clone().requires_grad_()
+
+    torch_out = torch_softmax(torch_x, 1)
+
+    return check_result(numpy_out, torch_out)
+
+
+def case_5():
+    x = np.random.normal(size=[20, 25])
+
+    numpy_softmax = Softmax()
+    numpy_out = numpy_softmax.forward(x)
+    numpy_x_grad = 
numpy_softmax.backward(np.ones_like(numpy_out)) + + torch_x = torch.from_numpy(x).clone().requires_grad_() + + torch_out = torch_softmax(torch_x, 1) + torch_out.sum().backward() + + return check_result([ + (numpy_out, torch_out), + (numpy_x_grad, torch_x.grad), + ]) + + +def test_model(): + try: + numpy_loss = NumpyLoss() + numpy_model = NumpyModel() + torch_model = TorchModel() + torch_model.W1.data, torch_model.W2.data, torch_model.W3.data = get_torch_initialization(numpy=False) + numpy_model.W1 = torch_model.W1.detach().clone().numpy() + numpy_model.W2 = torch_model.W2.detach().clone().numpy() + numpy_model.W3 = torch_model.W3.detach().clone().numpy() + + x = torch.randn((10000, 28, 28)) + y = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 0] * 1000) + + y = one_hot(y, numpy=False) + x2 = x.numpy() + y_pred = torch_model.forward(x) + loss = (-y_pred * y).sum(dim=1).mean() + loss.backward() + + y_pred_numpy = numpy_model.forward(x2) + numpy_loss.get_loss(y_pred_numpy, y.numpy()) + + check_flag_1 = check_result(y_pred_numpy, y_pred) + print("+ {:12} {}/{}".format("forward", 10 * check_flag_1, 10)) + except: + print("[Runtime Error in forward]") + print("+ {:12} {}/{}".format("forward", 0, 10)) + return 0 + + try: + + numpy_model.backward(numpy_loss.backward()) + + check_flag_2 = [ + check_result(numpy_model.log_grad, torch_model.log_input.grad), + check_result(numpy_model.softmax_grad, torch_model.softmax_input.grad), + check_result(numpy_model.W3_grad, torch_model.W3.grad), + check_result(numpy_model.W2_grad, torch_model.W2.grad), + check_result(numpy_model.W1_grad, torch_model.W1.grad) + ] + check_flag_2 = sum(check_flag_2) >= 4 + print("+ {:12} {}/{}".format("backward", 20 * check_flag_2, 20)) + except: + print("[Runtime Error in backward]") + print("+ {:12} {}/{}".format("backward", 0, 20)) + check_flag_2 = False + + return 10 * check_flag_1 + 20 * check_flag_2 + + +if __name__ == "__main__": + testcases = [ + ["matmul", case_1, 5], + ["relu", case_2, 5], + ["log", case_3, 5], + ["softmax_1", case_4, 5], + ["softmax_2", case_5, 10], + ] + score = 0 + for case in testcases: + try: + res = case[2] if case[1]() else 0 + except: + print("[Runtime Error in {}]".format(case[0])) + res = 0 + score += res + print("+ {:12} {}/{}".format(case[0], res, case[2])) + score += test_model() + print("{:14} {}/60".format("FINAL SCORE", score)) diff --git a/assignment-2/submission/18307130074/torch_mnist.py b/assignment-2/submission/18307130074/torch_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..6d3e214c7606e3d43dac4b94554f942508afffb3 --- /dev/null +++ b/assignment-2/submission/18307130074/torch_mnist.py @@ -0,0 +1,73 @@ +import torch +from utils import mini_batch, batch, download_mnist, get_torch_initialization, one_hot, plot_curve + + +class TorchModel: + + def __init__(self): + self.W1 = torch.randn((28 * 28, 256), requires_grad=True) + self.W2 = torch.randn((256, 64), requires_grad=True) + self.W3 = torch.randn((64, 10), requires_grad=True) + self.softmax_input = None + self.log_input = None + + def forward(self, x): + x = x.reshape(-1, 28 * 28) + x = torch.relu(torch.matmul(x, self.W1)) + x = torch.relu(torch.matmul(x, self.W2)) + x = torch.matmul(x, self.W3) + + self.softmax_input = x + self.softmax_input.retain_grad() + + x = torch.softmax(x, 1) + + self.log_input = x + self.log_input.retain_grad() + + x = torch.log(x) + + return x + + def optimize(self, learning_rate): + with torch.no_grad(): + self.W1 -= learning_rate * self.W1.grad + self.W2 -= learning_rate * 
self.W2.grad + self.W3 -= learning_rate * self.W3.grad + + self.W1.grad = None + self.W2.grad = None + self.W3.grad = None + + +def torch_run(): + train_dataset, test_dataset = download_mnist() + + model = TorchModel() + model.W1.data, model.W2.data, model.W3.data = get_torch_initialization(numpy=False) + + train_loss = [] + + epoch_number = 3 + learning_rate = 0.1 + + for epoch in range(epoch_number): + for x, y in mini_batch(train_dataset, numpy=False): + y = one_hot(y, numpy=False) + + y_pred = model.forward(x) + loss = (-y_pred * y).sum(dim=1).mean() + loss.backward() + model.optimize(learning_rate) + + train_loss.append(loss.item()) + + x, y = batch(test_dataset, numpy=False)[0] + accuracy = model.forward(x).argmax(dim=1).eq(y).float().mean().item() + print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy)) + + plot_curve(train_loss) + + +if __name__ == "__main__": + torch_run() diff --git a/assignment-2/submission/18307130074/utils.py b/assignment-2/submission/18307130074/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..709220cfa7a924d914ec1c098c505f864bcd4cfc --- /dev/null +++ b/assignment-2/submission/18307130074/utils.py @@ -0,0 +1,71 @@ +import torch +import numpy as np +from matplotlib import pyplot as plt + + +def plot_curve(data): + plt.plot(range(len(data)), data, color='blue') + plt.legend(['loss_value'], loc='upper right') + plt.xlabel('step') + plt.ylabel('value') + plt.show() + + +def download_mnist(): + from torchvision import datasets, transforms + + transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(mean=(0.1307,), std=(0.3081,)) + ]) + + train_dataset = datasets.MNIST(root="./data/", transform=transform, train=True, download=True) + test_dataset = datasets.MNIST(root="./data/", transform=transform, train=False, download=True) + + return train_dataset, test_dataset + + +def one_hot(y, numpy=True): + if numpy: + y_ = np.zeros((y.shape[0], 10)) + y_[np.arange(y.shape[0], dtype=np.int32), y] = 1 + return y_ + else: + y_ = torch.zeros((y.shape[0], 10)) + y_[torch.arange(y.shape[0], dtype=torch.long), y] = 1 + return y_ + + +def batch(dataset, numpy=True): + data = [] + label = [] + for each in dataset: + data.append(each[0]) + label.append(each[1]) + data = torch.stack(data) + label = torch.LongTensor(label) + if numpy: + return [(data.numpy(), label.numpy())] + else: + return [(data, label)] + + +def mini_batch(dataset, batch_size=128, numpy=False): + return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True) + + +def get_torch_initialization(numpy=True): + fc1 = torch.nn.Linear(28 * 28, 256) + fc2 = torch.nn.Linear(256, 64) + fc3 = torch.nn.Linear(64, 10) + + if numpy: + W1 = fc1.weight.T.detach().clone().numpy() + W2 = fc2.weight.T.detach().clone().numpy() + W3 = fc3.weight.T.detach().clone().numpy() + else: + W1 = fc1.weight.T.detach().clone().data + W2 = fc2.weight.T.detach().clone().data + W3 = fc3.weight.T.detach().clone().data + + return W1, W2, W3