diff --git a/assignment-2/submission/18307130130/README.md b/assignment-2/submission/18307130130/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b7fd7f8ae075cfb38e5157e595ddff516b3eb44e
--- /dev/null
+++ b/assignment-2/submission/18307130130/README.md
@@ -0,0 +1,227 @@
# Assignment-2 Report
------李睿琛 18307130130

## Operator Implementation

`Matmul`

Implements the product of two matrices. For
$$
Y = X \times W
$$
and a loss function `Loss = f(Y)`, the chain rule gives:

$$
\frac{\partial L}{\partial W_{ij}}={\sum_{k,l}}\frac{\partial L}{\partial Y_{kl}}\times\frac{\partial Y_{kl}}{\partial W_{ij}}
$$

$$
\frac{\partial Y_{kl}}{\partial W_{ij}}=\frac{\partial {\sum_{s}(X_{ks}\times W_{sl})}}{\partial W_{ij}}=X_{ki}\,{\delta_{lj}}
$$

Substituting the second expression into the first and summing over $k$ gives the matrix form:

$$
\frac{\partial L}{\partial W}=X^T\frac{\partial L}{\partial Y}
$$

Therefore:
$$
grad\_w = X^T \times grad\_y
$$
and, by the same argument,
$$
grad\_x = grad\_y \times W^T
$$

`Relu`

As an activation function, ReLU has biologically plausible properties such as one-sided inhibition and a wide excitation boundary.
$$
Y_{ij}=\begin{cases}
X_{ij}&X_{ij}\ge0\\\\
0&\text{otherwise}
\end{cases}
$$

Differentiating gives:
$$
\frac{\partial Y_{ij}}{\partial X_{mn}}=\begin{cases}1&X_{ij}>0,i=m,j=n\\\\0&\text{otherwise}\end{cases}
$$

`Log`

Computed as:
$$
Y_{ij}=\ln(X_{ij}+\epsilon),\quad\epsilon=10^{-12}
$$

Differentiating gives:
$$
\frac{\partial Y_{ij}}{\partial X_{ij}}=\frac1{X_{ij}+\epsilon}
$$


`Softmax`

Also known as multinomial logistic regression, it is defined as:
$$
Y_{ij}=\frac{\exp\{X_{ij} \}}{\sum_{k=1}^c\exp\{X_{ik} \}}
$$
Differentiating the output element `Y_i` with respect to the input element `X_j` of the same row gives:
$$
\frac{\partial Y_{i}}{\partial X_{j}}=\begin{cases}
Y_{i}(1-Y_{j})&i=j\\\\
-Y_{i}Y_{j}&\text{otherwise}
\end{cases}
$$


## Model Construction

The model is built after the `TorchModel` class in `torch_mnist.py`.

Following the model's computation graph, the **forward pass** is:

```python
x = self.matmul_1.forward(x, self.W1)
x = self.relu_1.forward(x)
x = self.matmul_2.forward(x, self.W2)
x = self.relu_2.forward(x)
x = self.matmul_3.forward(x, self.W3)
x = self.softmax.forward(x)
x = self.log.forward(x)
```

**Backward pass:**

First comes the derivative of the scalar loss with respect to the prediction vector:
$$
\frac{\partial L}{\partial X} =[\frac{\partial L}{\partial x_1},...,\frac{\partial L}{\partial x_n}]^T
$$
Since `loss = (-pred * target).sum(axis=1).mean()`, the gradient with respect to `pred` is `-target / batch_size`:

```python
class NumpyLoss:
    def backward(self):
        return -self.target / self.target.shape[0]
```

Backpropagation then continues along the computation graph:

```python
self.log_grad = self.log.backward(y)
self.softmax_grad = self.softmax.backward(self.log_grad)
self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
self.relu_2_grad = self.relu_2.backward(self.x3_grad)
self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
self.relu_1_grad = self.relu_1.backward(self.x2_grad)
self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)
```
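
The hand-derived gradients above can be sanity-checked numerically. The snippet below is a minimal finite-difference check of the `Matmul` backward pass; `check_matmul_grad`, the sizes, and the probed entry are illustrative choices rather than part of the submission, and the same pattern applies to the other operators.

```python
import numpy as np
from numpy_fnn import Matmul

def check_matmul_grad(N=4, d=5, d_out=3, eps=1e-6):
    # Random input, weight, and upstream gradient.
    x = np.random.randn(N, d)
    W = np.random.randn(d, d_out)
    grad_y = np.random.randn(N, d_out)

    op = Matmul()
    op.forward(x, W)
    grad_x, grad_W = op.backward(grad_y)

    # For L = sum(grad_y * Y) with Y = X W, compare the analytic dL/dW[i, j]
    # against a central finite difference.
    i, j = 1, 2
    W_plus, W_minus = W.copy(), W.copy()
    W_plus[i, j] += eps
    W_minus[i, j] -= eps
    numeric = (np.sum(grad_y * (x @ W_plus)) - np.sum(grad_y * (x @ W_minus))) / (2 * eps)

    print("analytic:", grad_W[i, j], " numeric:", numeric)

check_matmul_grad()
```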

## mini_batch Implementation

Because the dataset is large, training with full-batch gradient descent is slow, which makes it hard to get the most out of the model. Compared with batch gradient descent, mini-batch training splits the dataset further, so many gradient steps are taken within a single pass over the training set and the model reaches a reasonable accuracy sooner. The mini-batches are produced as follows:

```python
# Shuffle the dataset.
sz = data.shape[0]
index = np.arange(sz)
np.random.shuffle(index)

# Split the shuffled indices into chunks of batch_size.
for start in range(0, sz, batch_size):
    ret.append([data[index[start: start + batch_size]], label[index[start: start + batch_size]]])
```

## Effect of Model Hyperparameters

This section examines how the learning rate and the batch size affect convergence speed.

* learning_rate = 0.1, batch_size = 128

```
[0] Accuracy: 0.9436
[1] Accuracy: 0.9610
[2] Accuracy: 0.9710
```

* learning_rate = 0.01, batch_size = 128

```
[0] Accuracy: 0.8730
[1] Accuracy: 0.9047
[2] Accuracy: 0.9142
```

* learning_rate = 0.1, batch_size = 512

```
[0] Accuracy: 0.8233
[1] Accuracy: 0.9244
[2] Accuracy: 0.9296
```

Within a certain range:

As the learning rate decreases, the parameters converge more slowly, so the accuracy after the same number of epochs is lower.

As the batch size increases, fewer updates are made per epoch and the loss curve oscillates less; increasing it further may give the best result in terms of wall-clock time.

## Optimizing the Gradient Descent Algorithm

* SGD. learning_rate = 0.1, batch_size = 512

This is the baseline, i.e. the algorithm before any optimization.

* Momentum. learning_rate = 0.1, batch_size = 512

With SGD, the loss changes rapidly in one direction and slowly in another. Momentum keeps a weighted average of the gradients over recent steps, which helps SGD accelerate and push through such ravines, speeding up convergence. The accumulated momentum lets the update **escape local optima** by carrying velocity through them, and it also **dampens oscillations**, so the parameters move toward the optimum faster.

**Implementation:**

```python
# Momentum update (shown for W1; W2 and W3 are updated in the same way)
self.beta1 = self.momentum * self.beta1 + (1 - self.momentum) * self.W1_grad
self.W1 -= learning_rate * self.beta1
```

Results:

```
[0] Accuracy: 0.9058
[1] Accuracy: 0.9293
[2] Accuracy: 0.9391
```

* Adam. learning_rate = 0.1, batch_size = 512

In contrast to the **fixed** learning rate of stochastic gradient descent, Adam computes first- and second-moment estimates of the gradient to give each parameter its own **adaptive learning rate**.

**Implementation:**

```python
# Adam update (shown for W1; W2 and W3 are updated in the same way)
self.m1 = self.theta1 * self.m1 + (1 - self.theta1) * self.W1_grad
self.v1 = self.theta2 * self.v1 + (1 - self.theta2) * np.square(self.W1_grad)
m_ = self.m1 / (1 - np.power(self.theta1, self.n))
v_ = self.v1 / (1 - np.power(self.theta2, self.n))
self.W1 -= learning_rate * m_ / (np.sqrt(v_) + self.eps)
```

Results:

```
[0] Accuracy: 0.9639
[1] Accuracy: 0.9661
[2] Accuracy: 0.9690
```

Comparing the accuracy after three epochs, the convergence speed ranks: Adam > Momentum > SGD.
\ No newline at end of file
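
As a small numerical illustration of the bias-correction terms in the Adam update above, the sketch below traces the first step for a single, arbitrarily chosen gradient value: the raw moving averages are scaled by $(1-\theta_1)$ and $(1-\theta_2)$, and dividing by the correction factors restores an update on the order of one learning rate.

```python
import numpy as np

theta1, theta2, eps = 0.9, 0.999, 1e-8
g = np.array([0.5])                # arbitrary gradient at step n = 1

m = (1 - theta1) * g               # raw first moment  -> 0.05
v = (1 - theta2) * np.square(g)    # raw second moment -> 0.00025

m_hat = m / (1 - theta1 ** 1)      # bias-corrected -> 0.5  (= g)
v_hat = v / (1 - theta2 ** 1)      # bias-corrected -> 0.25 (= g ** 2)

print(m_hat / (np.sqrt(v_hat) + eps))   # ~[1.0], i.e. a step of about one learning_rate
```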
diff --git a/assignment-2/submission/18307130130/img/Adam.png b/assignment-2/submission/18307130130/img/Adam.png
new file mode 100644
index 0000000000000000000000000000000000000000..cfa321ab862467b331f20692c04e66d4fd692ca0
Binary files /dev/null and b/assignment-2/submission/18307130130/img/Adam.png differ
diff --git a/assignment-2/submission/18307130130/img/Adam_train.png b/assignment-2/submission/18307130130/img/Adam_train.png
new file mode 100644
index 0000000000000000000000000000000000000000..5fc0b816e78b331003a15c00129951be2cfae4f8
Binary files /dev/null and b/assignment-2/submission/18307130130/img/Adam_train.png differ
diff --git a/assignment-2/submission/18307130130/img/Adam_train_512_01.png b/assignment-2/submission/18307130130/img/Adam_train_512_01.png
new file mode 100644
index 0000000000000000000000000000000000000000..2c3bc63fb7fff6797bb16d17a72359f690c133fa
Binary files /dev/null and b/assignment-2/submission/18307130130/img/Adam_train_512_01.png differ
diff --git a/assignment-2/submission/18307130130/img/Momentum.png b/assignment-2/submission/18307130130/img/Momentum.png
new file mode 100644
index 0000000000000000000000000000000000000000..6d548f77cd3d0ed85d5fbdf48439aea0a2d73955
Binary files /dev/null and b/assignment-2/submission/18307130130/img/Momentum.png differ
diff --git a/assignment-2/submission/18307130130/img/Momentum_train.png b/assignment-2/submission/18307130130/img/Momentum_train.png
new file mode 100644
index 0000000000000000000000000000000000000000..7a5d77ef53b9a9b832080deb8feae3b1fafacd2a
Binary files /dev/null and b/assignment-2/submission/18307130130/img/Momentum_train.png differ
diff --git a/assignment-2/submission/18307130130/img/Momentum_train_512_01.png b/assignment-2/submission/18307130130/img/Momentum_train_512_01.png
new file mode 100644
index 0000000000000000000000000000000000000000..b33390f9ed07281e4dccba4e26c77cfea23a7238
Binary files /dev/null and b/assignment-2/submission/18307130130/img/Momentum_train_512_01.png differ
diff --git a/assignment-2/submission/18307130130/img/SGD_train_128_001.png b/assignment-2/submission/18307130130/img/SGD_train_128_001.png
new file mode 100644
index 0000000000000000000000000000000000000000..c581cfa591392a68d65dbda13c82f162f4d81cb4
Binary files /dev/null and b/assignment-2/submission/18307130130/img/SGD_train_128_001.png differ
diff --git a/assignment-2/submission/18307130130/img/SGD_train_128_01.png b/assignment-2/submission/18307130130/img/SGD_train_128_01.png
new file mode 100644
index 0000000000000000000000000000000000000000..385a0c3d97846385dad3cbe0a95e3ac9b5b6a4f9
Binary files /dev/null and b/assignment-2/submission/18307130130/img/SGD_train_128_01.png differ
diff --git a/assignment-2/submission/18307130130/img/SGD_train_512_01.png b/assignment-2/submission/18307130130/img/SGD_train_512_01.png
new file mode 100644
index 0000000000000000000000000000000000000000..e065f65d6fee671ccb2fad718e3a8ce62c7227c5
Binary files /dev/null and b/assignment-2/submission/18307130130/img/SGD_train_512_01.png differ
diff --git a/assignment-2/submission/18307130130/img/model.png b/assignment-2/submission/18307130130/img/model.png
new file mode 100644
index 0000000000000000000000000000000000000000..e673f6710acb8d4c443cf1622f363b3e76aecbb2
Binary files /dev/null and b/assignment-2/submission/18307130130/img/model.png differ
diff --git a/assignment-2/submission/18307130130/numpy_fnn.py b/assignment-2/submission/18307130130/numpy_fnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..5469a9f2ba16e60bf21e5cbb009e2f2bac50263e
--- /dev/null
+++ b/assignment-2/submission/18307130130/numpy_fnn.py
@@ -0,0 +1,262 @@
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import numpy as np


class NumpyOp:

    def __init__(self):
        self.memory = {}
        self.epsilon = 1e-12


class Matmul(NumpyOp):

    def forward(self, x, W):
        """
        x: shape(N, d)
        W: shape(d, d')
        """
        self.memory['x'] = x
        self.memory['W'] = W
        h = np.matmul(x, W)
        return h

    def backward(self, grad_y):
        """
        grad_y: shape(N, d')
        """

        ####################
        #      code 1      #
        ####################

        grad_x = np.matmul(grad_y, self.memory['W'].T)
        grad_W = np.matmul(self.memory['x'].T, grad_y)

        return grad_x, grad_W


class Relu(NumpyOp):

    def forward(self, x):
        self.memory['x'] = x
        return np.where(x > 0, x, np.zeros_like(x))

    def backward(self, grad_y):
        """
        grad_y: same shape as x
        """

        ####################
        #      code 2      #
        ####################

        x = self.memory['x']
        grad_x = grad_y * np.where(x > 0, np.ones_like(x), np.zeros_like(x))

        return grad_x


class Log(NumpyOp):

    def forward(self, x):
        """
        x: shape(N, c)
        """

        out = np.log(x + self.epsilon)
        self.memory['x'] = x

        return out

    def backward(self, grad_y):
        """
        grad_y: same shape as x
        """

        ####################
        #      code 3      #
        ####################

        x = self.memory['x']
        grad_x = grad_y / (x + self.epsilon)

        return grad_x


class Softmax(NumpyOp):
    """
    softmax over last dimension
    """

    def forward(self, x):
        """
        x: shape(N, c)
        """

        ####################
        #      code 4      #
        ####################

        # Subtract the row-wise maximum for numerical stability (avoids overflow in exp).
        row_max = np.max(x, axis=1).reshape(-1, 1)
        x = x - row_max
        x_exp = np.exp(x)
        # Row-wise sum of the exponentials.
        sum_exp = np.sum(x_exp, axis=1, keepdims=True)
        # out: N * c
        out = x_exp / sum_exp
        self.memory['out'] = out

        return out

    def backward(self, grad_y):
        """
        grad_y: same shape as x
        """

        ####################
        #      code 5      #
        ####################

        out = self.memory['out']
        # Jacobs: N * c * c, one c-by-c Jacobian per sample.
        Jacobs = np.array([np.diag(x) - np.outer(x, x) for x in out])

        # (N, 1, c) batch-matmul (N, c, c) = (N, 1, c)
        grad_y = grad_y[:, np.newaxis, :]
        grad_x = np.matmul(grad_y, Jacobs).squeeze(axis=1)

        return grad_x


class NumpyLoss:

    def __init__(self):
        self.target = None

    def get_loss(self, pred, target):
        self.target = target
        return (-pred * target).sum(axis=1).mean()

    def backward(self):
        return -self.target / self.target.shape[0]

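# NumpyModel is a three-layer fully connected classifier for MNIST
# (784 -> 256 -> 64 -> 10): ReLU follows the first two Matmul layers and the
# output goes through Softmax + Log, mirroring TorchModel in torch_mnist.py.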
class NumpyModel:
    def __init__(self):
        self.W1 = np.random.normal(size=(28 * 28, 256))
        self.W2 = np.random.normal(size=(256, 64))
        self.W3 = np.random.normal(size=(64, 10))

        # The operators below are used in both forward() and backward().
        self.matmul_1 = Matmul()
        self.relu_1 = Relu()
        self.matmul_2 = Matmul()
        self.relu_2 = Relu()
        self.matmul_3 = Matmul()
        self.softmax = Softmax()
        self.log = Log()

        # The variables below are updated in backward(). softmax_grad, log_grad, etc. hold the
        # gradients propagated through each operator (the partial derivative of the loss with
        # respect to that operator's input).
        self.x1_grad, self.W1_grad = None, None
        self.relu_1_grad = None
        self.x2_grad, self.W2_grad = None, None
        self.relu_2_grad = None
        self.x3_grad, self.W3_grad = None, None
        self.softmax_grad = None
        self.log_grad = None

        # Momentum parameters
        self.beta1 = np.zeros_like(self.W1)
        self.beta2 = np.zeros_like(self.W2)
        self.beta3 = np.zeros_like(self.W3)
        self.momentum = 0.9

        # Adam parameters
        self.theta1 = 0.9
        self.theta2 = 0.999
        self.eps = 1e-8
        self.m1 = np.zeros_like(self.W1)
        self.v1 = np.zeros_like(self.W1)
        self.m2 = np.zeros_like(self.W2)
        self.v2 = np.zeros_like(self.W2)
        self.m3 = np.zeros_like(self.W3)
        self.v3 = np.zeros_like(self.W3)
        self.n = 0

    def forward(self, x):
        x = x.reshape(-1, 28 * 28)

        ####################
        #      code 6      #
        ####################

        x = self.matmul_1.forward(x, self.W1)
        x = self.relu_1.forward(x)
        x = self.matmul_2.forward(x, self.W2)
        x = self.relu_2.forward(x)
        x = self.matmul_3.forward(x, self.W3)
        x = self.softmax.forward(x)
        x = self.log.forward(x)

        return x

    def backward(self, y):

        ####################
        #      code 7      #
        ####################

        self.log_grad = self.log.backward(y)
        self.softmax_grad = self.softmax.backward(self.log_grad)
        self.x3_grad, self.W3_grad = self.matmul_3.backward(self.softmax_grad)
        self.relu_2_grad = self.relu_2.backward(self.x3_grad)
        self.x2_grad, self.W2_grad = self.matmul_2.backward(self.relu_2_grad)
        self.relu_1_grad = self.relu_1.backward(self.x2_grad)
        self.x1_grad, self.W1_grad = self.matmul_1.backward(self.relu_1_grad)

        return self.x1_grad

    def optimize(self, learning_rate):
        def SGD():
            self.W1 -= learning_rate * self.W1_grad
            self.W2 -= learning_rate * self.W2_grad
            self.W3 -= learning_rate * self.W3_grad

        def Momentum():
            # Exponential moving average of the gradients.
            self.beta1 = self.momentum * self.beta1 + (1 - self.momentum) * self.W1_grad
            self.beta2 = self.momentum * self.beta2 + (1 - self.momentum) * self.W2_grad
            self.beta3 = self.momentum * self.beta3 + (1 - self.momentum) * self.W3_grad
            self.W1 -= learning_rate * self.beta1
            self.W2 -= learning_rate * self.beta2
            self.W3 -= learning_rate * self.beta3

        def Adam():
            self.n += 1

            # First and second moment estimates with bias correction.
            self.m1 = self.theta1 * self.m1 + (1 - self.theta1) * self.W1_grad
            self.v1 = self.theta2 * self.v1 + (1 - self.theta2) * np.square(self.W1_grad)
            m_ = self.m1 / (1 - np.power(self.theta1, self.n))
            v_ = self.v1 / (1 - np.power(self.theta2, self.n))
            self.W1 -= learning_rate * m_ / (np.sqrt(v_) + self.eps)

            self.m2 = self.theta1 * self.m2 + (1 - self.theta1) * self.W2_grad
            self.v2 = self.theta2 * self.v2 + (1 - self.theta2) * np.square(self.W2_grad)
            m_ = self.m2 / (1 - np.power(self.theta1, self.n))
            v_ = self.v2 / (1 - np.power(self.theta2, self.n))
            self.W2 -= learning_rate * m_ / (np.sqrt(v_) + self.eps)

            self.m3 = self.theta1 * self.m3 + (1 - self.theta1) * self.W3_grad
            self.v3 = self.theta2 * self.v3 + (1 - self.theta2) * np.square(self.W3_grad)
            m_ = self.m3 / (1 - np.power(self.theta1, self.n))
            v_ = self.v3 / (1 - np.power(self.theta2, self.n))
            self.W3 -= learning_rate * m_ / (np.sqrt(v_) + self.eps)

        SGD()
        # Momentum()
        # Adam()
diff --git a/assignment-2/submission/18307130130/numpy_mnist.py b/assignment-2/submission/18307130130/numpy_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec9d23563037904e97a1952d4a23a6b6518031bf
--- /dev/null
+++ b/assignment-2/submission/18307130130/numpy_mnist.py
@@ -0,0 +1,51 @@
import numpy as np
from numpy_fnn import NumpyModel, NumpyLoss
from utils import download_mnist, batch, get_torch_initialization, plot_curve, one_hot


def mini_batch(train_dataset, batch_size=128):
    data = np.array([np.array(x[0]) for x in train_dataset])
    label = np.array([np.array(x[1]) for x in train_dataset])

    # Shuffle the sample indices, then slice them into chunks of batch_size.
    sz = data.shape[0]
    index = np.arange(sz)
    np.random.shuffle(index)

    ret = []
    for start in range(0, sz, batch_size):
        ret.append([data[index[start: start + batch_size]], label[index[start: start + batch_size]]])
    return ret


def numpy_run():
    train_dataset, test_dataset = download_mnist()

    model = NumpyModel()
    numpy_loss = NumpyLoss()
    model.W1, model.W2, model.W3 = get_torch_initialization()

    train_loss = []

    epoch_number = 3
    learning_rate = 0.1

    for epoch in range(epoch_number):
        for x, y in mini_batch(train_dataset, batch_size=512):
            y = one_hot(y)
            y_pred = model.forward(x)
            loss = numpy_loss.get_loss(y_pred, y)

            # numpy_loss.backward().shape: batch_size * 10
            model.backward(numpy_loss.backward())
            model.optimize(learning_rate)

            train_loss.append(loss.item())

        x, y = batch(test_dataset)[0]
        accuracy = np.mean(model.forward(x).argmax(axis=1) == y)
        print('[{}] Accuracy: {:.4f}'.format(epoch, accuracy))

    plot_curve(train_loss)


if __name__ == "__main__":
    numpy_run()
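
# Note: NumpyModel.optimize (numpy_fnn.py) currently applies SGD; the Momentum()
# and Adam() updates used for the comparison in the README are defined there as
# well and can be enabled by switching the call at the end of optimize().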