diff --git a/assignment-2/submission/19307130062/README.md b/assignment-2/submission/19307130062/README.md
index 6615e86187728bd84f94ff46c3ef55d6595e1b00..79d593179f6bcd306650008b80a89cf352bc2ba0 100644
--- a/assignment-2/submission/19307130062/README.md
+++ b/assignment-2/submission/19307130062/README.md
@@ -520,4 +520,37 @@ def mini_batch(dataset, batch_size = 128, numpy = False):
 ### Weight Initialization
 
-To be filled in...
\ No newline at end of file
+#### The default initialization of torch.nn.Linear
+
+First, let us check how the `torch.nn.Linear` layers in the original code initialize their parameters. Consulting the PyTorch documentation reveals the following:
+
+![Default initialization of torch.nn.Linear, from the PyTorch documentation](img/Pytorch.png)
+
+Here $k$ denotes the reciprocal of the number of neurons in the previous layer (in_features), while out_features is the number of neurons in the current layer; the parameters are drawn from the uniform distribution $\mathcal U(-\sqrt{k},\ \sqrt{k})$. This is similar in form to He initialization, but the constant differs: the He uniform bound is $\sqrt{6/\text{in\_features}}$ rather than $\sqrt{1/\text{in\_features}}$.
+
+#### Exploring other initialization methods
+
+Four methods were tested; a minimal sketch of their sampling rules follows the list:
+
+- Xavier initialization with uniform sampling
+- He (Kaiming) initialization with uniform sampling
+- Xavier initialization with Gaussian sampling
+- He (Kaiming) initialization with Gaussian sampling
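+
+The sketch below summarizes the four sampling rules under the weight-shape convention `(in_features, out_features)` used in this repository. The helper `init_weight` is hypothetical; the actual implementation lives in `get_torch_initialization` in `numpy_mnist.py`, which additionally applies an extra gain to the output layer:
+
+```python
+import numpy as np
+
+def init_weight(fan_in, fan_out, method):
+    # Xavier targets variance 2 / (fan_in + fan_out); He targets 2 / fan_in.
+    # A uniform U(-r, r) sample has variance r^2 / 3, hence the sqrt(6 / ...) bounds.
+    if method == 'Xavier_Uniform':
+        r = np.sqrt(6.0 / (fan_in + fan_out))
+        return np.random.uniform(-r, r, (fan_in, fan_out))
+    if method == 'HeKaiming_Uniform':
+        r = np.sqrt(6.0 / fan_in)
+        return np.random.uniform(-r, r, (fan_in, fan_out))
+    if method == 'Xavier_Normal':
+        return np.random.normal(0., np.sqrt(2.0 / (fan_in + fan_out)), (fan_in, fan_out))
+    if method == 'HeKaiming_Normal':
+        return np.random.normal(0., np.sqrt(2.0 / fan_in), (fan_in, fan_out))
+    raise ValueError('unknown method: ' + method)
+```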
+
+#### $\mathrm{epoch} = 3,\ \alpha = 0.1$
+
+![Training loss curves under the five initialization schemes](img/6.1.png)
+
+Test accuracy after each training epoch:
+
+| Epoch | Torch | Xavier_Uniform | HeKaiming_Uniform | Xavier_Normal | HeKaiming_Normal |
+| :---: | :-------: | :------------: | :---------------: | :-----------: | :--------------: |
+| $0$ | $94.38\%$ | $95.59\%$ | $95.66\%$ | $95.55\%$ | $95.58\%$ |
+| $1$ | $95.47\%$ | $96.19\%$ | $96.72\%$ | $96.68\%$ | $96.83\%$ |
+| $2$ | $96.91\%$ | $97.20\%$ | $96.81\%$ | $97.12\%$ | $96.97\%$ |
+
+#### Summary
+
+- Xavier and He initialization do have an edge over the PyTorch default here, but in these tests He initialization was occasionally less stable, and choosing its constants required some tuning.
\ No newline at end of file
diff --git a/assignment-2/submission/19307130062/img/6.1.png b/assignment-2/submission/19307130062/img/6.1.png
new file mode 100644
index 0000000000000000000000000000000000000000..6ce20298bc07a084ce60460219d0ec97c6da753f
Binary files /dev/null and b/assignment-2/submission/19307130062/img/6.1.png differ
diff --git a/assignment-2/submission/19307130062/img/6.2.png b/assignment-2/submission/19307130062/img/6.2.png
new file mode 100644
index 0000000000000000000000000000000000000000..37fe7567104f9c5d6aff3c637cf351ab372f4721
Binary files /dev/null and b/assignment-2/submission/19307130062/img/6.2.png differ
diff --git a/assignment-2/submission/19307130062/img/6.3.png b/assignment-2/submission/19307130062/img/6.3.png
new file mode 100644
index 0000000000000000000000000000000000000000..ca5379b9a083780667237d99a41eee14bc4479cb
Binary files /dev/null and b/assignment-2/submission/19307130062/img/6.3.png differ
diff --git a/assignment-2/submission/19307130062/img/Pytorch.png b/assignment-2/submission/19307130062/img/Pytorch.png
new file mode 100644
index 0000000000000000000000000000000000000000..bac6ce64671b021b2420d290d7cfa67bf6519298
Binary files /dev/null and b/assignment-2/submission/19307130062/img/Pytorch.png differ
diff --git a/assignment-2/submission/19307130062/numpy_fnn.py b/assignment-2/submission/19307130062/numpy_fnn.py
index c5c95457e32cddf3ea2cb9effb780331e632d2bd..0f05d097653cfde4eb20a9749462e20d5a190aed 100644
--- a/assignment-2/submission/19307130062/numpy_fnn.py
+++ b/assignment-2/submission/19307130062/numpy_fnn.py
@@ -66,7 +66,7 @@ class Log(NumpyOp):
         """
 
         # code3
-        grad_x = grad_y * (1.0 / self.memory['x'])
+        grad_x = grad_y * (1.0 / (self.memory['x'] + self.epsilon))  # epsilon avoids division by zero
         return grad_x
 
diff --git a/assignment-2/submission/19307130062/numpy_mnist.py b/assignment-2/submission/19307130062/numpy_mnist.py
index 4e2bf7a7f624444ede1e6b317d46b0052f1ac675..d6cf155da24ea5279c4c71dfc25bfec6b4c39fcc 100644
--- a/assignment-2/submission/19307130062/numpy_mnist.py
+++ b/assignment-2/submission/19307130062/numpy_mnist.py
@@ -58,14 +58,67 @@ def numpy_run():
     plot_curve(train_loss)
 
 
+def get_torch_initialization(numpy = True, init_type = 'Torch'):
+    import torch
+
+    if init_type == 'Torch':
+        # PyTorch default: U(-sqrt(k), sqrt(k)) with k = 1 / in_features
+        fc1 = torch.nn.Linear(28 * 28, 256)
+        fc2 = torch.nn.Linear(256, 64)
+        fc3 = torch.nn.Linear(64, 10)
+
+        if numpy:
+            W1 = fc1.weight.T.detach().clone().numpy()
+            W2 = fc2.weight.T.detach().clone().numpy()
+            W3 = fc3.weight.T.detach().clone().numpy()
+        else:
+            W1 = fc1.weight.T.detach().clone().data
+            W2 = fc2.weight.T.detach().clone().data
+            W3 = fc3.weight.T.detach().clone().data
+
+    elif init_type == 'Xavier_Uniform':
+        print('Xavier_Uniform')
+        # U(-r, r) with r = sqrt(6 / (fan_in + fan_out))
+        r = np.sqrt(6.0 / (28 * 28 + 256))
+        W1 = np.random.uniform(-r, r, (28 * 28, 256))
+        r = np.sqrt(6.0 / (256 + 64))
+        W2 = np.random.uniform(-r, r, (256, 64))
+        r = np.sqrt(6.0 / (64 + 10))
+        W3 = np.random.uniform(-r, r, (64, 10))
+
+    elif init_type == 'HeKaiming_Uniform':
+        print('HeKaiming_Uniform')
+        # U(-r, r) with r = sqrt(6 / fan_in); the output layer gets an extra gain of 4
+        r = np.sqrt(6.0 / (28 * 28))
+        W1 = np.random.uniform(-r, r, (28 * 28, 256))
+        r = np.sqrt(6.0 / 256)
+        W2 = np.random.uniform(-r, r, (256, 64))
+        r = 4 * np.sqrt(6.0 / 64)
+        W3 = np.random.uniform(-r, r, (64, 10))
+
+    elif init_type == 'Xavier_Normal':
+        print('Xavier_Normal')
+        # N(0, sigma^2) with sigma^2 = 2 / (fan_in + fan_out)
+        sigma = 2.0 / (28 * 28 + 256)
+        W1 = np.random.normal(0., np.sqrt(sigma), (28 * 28, 256))
+        sigma = 2.0 / (256 + 64)
+        W2 = np.random.normal(0., np.sqrt(sigma), (256, 64))
+        sigma = 2.0 / (64 + 10)
+        W3 = np.random.normal(0., np.sqrt(sigma), (64, 10))
+
+    elif init_type == 'HeKaiming_Normal':
+        print('HeKaiming_Normal')
+        # N(0, sigma^2) with sigma^2 = 2 / fan_in; the output layer's variance
+        # gets an extra gain of 16 (= 4^2)
+        sigma = 2.0 / (28 * 28)
+        W1 = np.random.normal(0., np.sqrt(sigma), (28 * 28, 256))
+        sigma = 2.0 / 256
+        W2 = np.random.normal(0., np.sqrt(sigma), (256, 64))
+        sigma = 16 * 2.0 / 64
+        W3 = np.random.normal(0., np.sqrt(sigma), (64, 10))
+
+    return W1, W2, W3
+
 
-def my_numpy_run(learning_rate = 0.01, epoch_number = 3, update_type = None):
+def my_numpy_run(learning_rate = 0.1, epoch_number = 3, update_type = None, init_type = 'Torch'):
+    print('learning rate = ' + str(learning_rate))
     train_dataset, test_dataset = download_mnist()
     model = NumpyModel(learning_rate, update_type, iter_times = 1407)
     numpy_loss = NumpyLoss()
-    model.W1, model.W2, model.W3 = get_torch_initialization()
+    model.W1, model.W2, model.W3 = get_torch_initialization(init_type = init_type)
 
     train_loss = []
 
@@ -136,8 +189,47 @@ def multi_test():
     plt.savefig("5.4.png", format = 'png', dpi = 1000)
 
 
+def multi_test_2():
+    from matplotlib import pyplot as plt
+    cases = ['HeKaiming_Normal',
+             'HeKaiming_Uniform',
+             'Torch',
+             'Xavier_Uniform',
+             'Xavier_Normal',
+             ]
+
+    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b',
+              '#e377c2', '#7f7f7f', '#bcbd22', '#17becf', '#1a55FF']
+
+    # rcParams axes.prop_cycle could cycle through the colors automatically, e.g.:
+    # mpl.rcParams['axes.prop_cycle'] = cycler(color = colors)
+    plt.rcParams['figure.figsize'] = (10.0, 4.0)  # set the figure size
+    fig = plt.figure()
+    ax = fig.add_axes([0.1, 0.1, 0.6, 0.75])
+    plt.xlabel('step')
+    plt.ylabel('loss value')
+    for i in range(len(cases)):
+        print('Test ' + str(cases[i]) + ' :')
+        data = my_numpy_run(init_type = cases[i])  # per-step training loss
+        print('-------------\n')
+        ax.plot(range(len(data)), data, linewidth = 0.5, label = str(cases[i]))
+    ax.legend(bbox_to_anchor = (1.05, 1), loc = 'upper left', borderaxespad = 0.)
+
+    # note: this reuses the filename written by multi_test(); rename one of the two
+    # if both functions are run
+    plt.savefig("5.4.png", format = 'png', dpi = 1000)
+
 
 if __name__ == "__main__":
     numpy_run()
     # my_numpy_run(learning_rate = 0.05, update_type = 'RMSprop')
     # multi_test()
+    # multi_test_2()
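+    # A hypothetical single-scheme run matching the defaults behind the README
+    # table would be:
+    # my_numpy_run(learning_rate = 0.1, epoch_number = 3, init_type = 'HeKaiming_Normal')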