diff --git a/assignment-2/submission/19307130062/README.md b/assignment-2/submission/19307130062/README.md
index 6615e86187728bd84f94ff46c3ef55d6595e1b00..79d593179f6bcd306650008b80a89cf352bc2ba0 100644
--- a/assignment-2/submission/19307130062/README.md
+++ b/assignment-2/submission/19307130062/README.md
@@ -520,4 +520,37 @@ def mini_batch(dataset, batch_size = 128, numpy = False):
### Weight Initialization
-To be filled in later...
\ No newline at end of file
+#### The initialization scheme of torch.nn.Linear
+
+First, we look into how `torch.nn.Linear` is initialized in the original code. Checking the PyTorch documentation reveals the following:
+
+![torch.nn.Linear default initialization, from the PyTorch documentation](img/Pytorch.png)
+
+Here $k$ is the reciprocal of the number of neurons in the previous layer (`in_features`), and `out_features` is the number of neurons in the current layer; the weights are drawn from the uniform distribution $\mathcal U(-\sqrt{k},\ \sqrt{k})$. This is similar in form to He initialization, but differs by a constant factor.
+
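+As a quick sanity check, this rule is easy to reproduce in NumPy. Below is a minimal sketch of the documented behaviour, using the same `(in_features, out_features)` weight layout as the numpy model; the helper name is ours:
+
+```python
+import numpy as np
+
+def torch_linear_default_init(in_features, out_features):
+    # PyTorch draws Linear weights from U(-sqrt(k), sqrt(k)) with k = 1 / in_features
+    bound = np.sqrt(1.0 / in_features)
+    return np.random.uniform(-bound, bound, (in_features, out_features))
+
+W1 = torch_linear_default_init(28 * 28, 256)  # first layer of the MNIST model
+```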
+
+
+#### Exploring other initialization methods
+
+Several alternative schemes were tested (their sampling rules are summarized after the list):
+
+- Xavier initialization with uniform sampling
+- He (Kaiming) initialization with uniform sampling
+- Xavier initialization with Gaussian sampling
+- He (Kaiming) initialization with Gaussian sampling
+
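+For reference, these are the sampling rules implemented in `numpy_mnist.py`, writing $n_{in}$ and $n_{out}$ for a layer's fan-in and fan-out:
+
+- Xavier (uniform): $W \sim \mathcal U\left(-\sqrt{6 / (n_{in} + n_{out})},\ \sqrt{6 / (n_{in} + n_{out})}\right)$
+- He (uniform): $W \sim \mathcal U\left(-\sqrt{6 / n_{in}},\ \sqrt{6 / n_{in}}\right)$
+- Xavier (normal): $W \sim \mathcal N\left(0,\ 2 / (n_{in} + n_{out})\right)$
+- He (normal): $W \sim \mathcal N\left(0,\ 2 / n_{in}\right)$
+
+(The He variants in the implementation additionally apply a gain of $4$ to the output layer's bound, i.e. a $16\times$ variance in the normal case.)
+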
+#### $\mathrm{epoch} = 3,\ \alpha = 0.1$
+
+Accuracy after each epoch:
+
+| Epoch | Torch | Xavier_Uniform | HeKaiming_Uniform | Xavier_Normal | HeKaiming_Normal |
+| :---: | :-------: | :------------: | :---------------: | :-----------: | :--------------: |
+| $0$ | $94.38\%$ | $95.59\%$ | $95.66\%$ | $95.55\%$ | $95.58\%$ |
+| $1$ | $95.47\%$ | $96.19\%$ | $96.72\%$ | $96.68\%$ | $96.83\%$ |
+| $2$ | $96.91\%$ | $97.20\%$ | $96.81\%$ | $97.12\%$ | $96.97\%$ |
+
+
+
+#### Summary
+
+- Xavier and He initialization do show some advantage over PyTorch's default scheme, but in these tests He initialization was occasionally unstable, and choosing its parameters is not straightforward.
\ No newline at end of file
diff --git a/assignment-2/submission/19307130062/img/6.1.png b/assignment-2/submission/19307130062/img/6.1.png
new file mode 100644
index 0000000000000000000000000000000000000000..6ce20298bc07a084ce60460219d0ec97c6da753f
Binary files /dev/null and b/assignment-2/submission/19307130062/img/6.1.png differ
diff --git a/assignment-2/submission/19307130062/img/6.2.png b/assignment-2/submission/19307130062/img/6.2.png
new file mode 100644
index 0000000000000000000000000000000000000000..37fe7567104f9c5d6aff3c637cf351ab372f4721
Binary files /dev/null and b/assignment-2/submission/19307130062/img/6.2.png differ
diff --git a/assignment-2/submission/19307130062/img/6.3.png b/assignment-2/submission/19307130062/img/6.3.png
new file mode 100644
index 0000000000000000000000000000000000000000..ca5379b9a083780667237d99a41eee14bc4479cb
Binary files /dev/null and b/assignment-2/submission/19307130062/img/6.3.png differ
diff --git a/assignment-2/submission/19307130062/img/Pytorch.png b/assignment-2/submission/19307130062/img/Pytorch.png
new file mode 100644
index 0000000000000000000000000000000000000000..bac6ce64671b021b2420d290d7cfa67bf6519298
Binary files /dev/null and b/assignment-2/submission/19307130062/img/Pytorch.png differ
diff --git a/assignment-2/submission/19307130062/numpy_fnn.py b/assignment-2/submission/19307130062/numpy_fnn.py
index c5c95457e32cddf3ea2cb9effb780331e632d2bd..0f05d097653cfde4eb20a9749462e20d5a190aed 100644
--- a/assignment-2/submission/19307130062/numpy_fnn.py
+++ b/assignment-2/submission/19307130062/numpy_fnn.py
@@ -66,7 +66,7 @@ class Log(NumpyOp):
"""
# code3
- grad_x = grad_y * (1.0 / self.memory['x'])
+        grad_x = grad_y * (1.0 / (self.memory['x'] + self.epsilon))  # epsilon keeps the gradient finite when x == 0
return grad_x
diff --git a/assignment-2/submission/19307130062/numpy_mnist.py b/assignment-2/submission/19307130062/numpy_mnist.py
index 4e2bf7a7f624444ede1e6b317d46b0052f1ac675..d6cf155da24ea5279c4c71dfc25bfec6b4c39fcc 100644
--- a/assignment-2/submission/19307130062/numpy_mnist.py
+++ b/assignment-2/submission/19307130062/numpy_mnist.py
@@ -58,14 +58,67 @@ def numpy_run():
plot_curve(train_loss)
+def get_torch_initialization(numpy = True, init_type = 'Torch'):
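+    # Returns (W1, W2, W3) with shapes (784, 256), (256, 64), (64, 10),
+    # i.e. (in_features, out_features) -- the transpose of torch.nn.Linear's weight.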
+ import torch
+ if init_type == 'Torch':
+ fc1 = torch.nn.Linear(28 * 28, 256)
+ fc2 = torch.nn.Linear(256, 64)
+ fc3 = torch.nn.Linear(64, 10)
+
+ if numpy:
+ W1 = fc1.weight.T.detach().clone().numpy()
+ W2 = fc2.weight.T.detach().clone().numpy()
+ W3 = fc3.weight.T.detach().clone().numpy()
+ else:
+ W1 = fc1.weight.T.detach().clone().data
+ W2 = fc2.weight.T.detach().clone().data
+ W3 = fc3.weight.T.detach().clone().data
+
+ elif init_type == 'Xavier_Uniform':
+ print('Xavier_Uniform')
+ r = np.sqrt(6.0 / (28 * 28 + 256))
+ W1 = np.random.uniform(-r, r, (28 * 28, 256))
+ r = np.sqrt(6.0 / (256 + 64))
+ W2 = np.random.uniform(-r, r, (256, 64))
+ r = np.sqrt(6.0 / (64 + 10))
+ W3 = np.random.uniform(-r, r, (64, 10))
+
+ elif init_type == 'HeKaiming_Uniform':
+ print('HeKaiming_Uniform')
+ r = np.sqrt(6.0 / (28 * 28))
+ W1 = np.random.uniform(-r, r, (28 * 28, 256))
+ r = np.sqrt(6.0 / 256)
+ W2 = np.random.uniform(-r, r, (256, 64))
+        r = 4 * np.sqrt(6.0 / 64)  # extra gain of 4 on the output layer
+ W3 = np.random.uniform(-r, r, (64, 10))
+
+ elif init_type == 'Xavier_Normal':
+ print('Xavier_Normal')
+ sigma = 2.0 / (28 * 28 + 256)
+ W1 = np.random.normal(0., np.sqrt(sigma), (28 * 28, 256))
+ sigma = 2.0 / (256 + 64)
+ W2 = np.random.normal(0., np.sqrt(sigma), (256, 64))
+ sigma = 2.0 / (64 + 10)
+ W3 = np.random.normal(0., np.sqrt(sigma), (64, 10))
+
+ elif init_type == 'HeKaiming_Normal':
+ print('HeKaiming_Normal')
+ sigma = 2.0 / (28 * 28)
+ W1 = np.random.normal(0., np.sqrt(sigma), (28 * 28, 256))
+ sigma = 2.0 / 256
+ W2 = np.random.normal(0., np.sqrt(sigma), (256, 64))
+        sigma = 16 * 2.0 / 64  # gain of 4 squared, matching the uniform variant's output-layer gain
+ W3 = np.random.normal(0., np.sqrt(sigma), (64, 10))
+    else:
+        raise ValueError('Unknown init_type: ' + str(init_type))
+
+    return W1, W2, W3
+
-
-def my_numpy_run(learning_rate = 0.01, epoch_number = 3, update_type = None):
+def my_numpy_run(learning_rate = 0.1, epoch_number = 3, update_type = None, init_type = 'Torch'):
+ print('learning rate = ' + str(learning_rate))
train_dataset, test_dataset = download_mnist()
model = NumpyModel(learning_rate, update_type, iter_times = 1407)
numpy_loss = NumpyLoss()
- model.W1, model.W2, model.W3 = get_torch_initialization()
+ model.W1, model.W2, model.W3 = get_torch_initialization(init_type = init_type)
train_loss = []
@@ -136,8 +189,47 @@ def multi_test():
plt.savefig("5.4.png", format = 'png', dpi = 1000)
+def multi_test_2():
+ from matplotlib import pyplot as plt
+    cases = ['HeKaiming_Normal',
+             'HeKaiming_Uniform',
+             'Torch',
+             'Xavier_Uniform',
+             'Xavier_Normal',
+             ]
+
+ colors = ['#1f77b4',
+ '#ff7f0e',
+ '#2ca02c',
+ '#d62728',
+ '#9467bd',
+ '#8c564b',
+ '#e377c2',
+ '#7f7f7f',
+ '#bcbd22',
+ '#17becf',
+ '#1a55FF']
+
+    # widen the figure and leave room on the right for a legend placed outside the axes
+    plt.rcParams['figure.figsize'] = (10.0, 4.0)
+ fig = plt.figure()
+ ax = fig.add_axes([0.1, 0.1, 0.6, 0.75])
+ plt.xlabel('step')
+ plt.ylabel('loss value')
+ for i in range(len(cases)):
+ print('Test ' + str(cases[i]) + ' :')
+ data = my_numpy_run(init_type = cases[i])
+ print('-------------\n')
+        ax.plot(range(len(data)), data, linewidth = 0.5, color = colors[i], label = str(cases[i]))
+ ax.legend(bbox_to_anchor = (1.05, 1), loc = 'upper left', borderaxespad = 0.)
+
+    plt.savefig("5.4.png", format = 'png', dpi = 1000)  # NOTE: same filename as multi_test(); rename one to avoid overwriting
+
if __name__ == "__main__":
numpy_run()
# my_numpy_run(learning_rate = 0.05, update_type = 'RMSprop')
# multi_test()
+ # multi_test_2()