diff --git a/assignment-1/submission/17307100038/README.md b/assignment-1/submission/17307100038/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7e209eba20b4f93de8019d642a8fdc6979914d38
--- /dev/null
+++ b/assignment-1/submission/17307100038/README.md
@@ -0,0 +1,362 @@
+# 课程报告
+
+## KNN类实现
+
+### fit()函数
+
+fit(X, y,cate = 'euclidean',metric='accuracy',preprocess =None)
+
+X: 训练集
+
+y:训练集标签
+
+cate:距离计算方式,如euclidean、manhattan距离
+
+metric:模型评估方式,如accuracy
+
+preprocess:预处理方式,可选 'Min_Max'(min-max 归一化)、'Z_score'(z-score 标准化),默认 None 表示不处理
+
+
+
+fit函数包含以下功能:
+
+ 1、预处理;
+
+ 2、随机打乱数据集顺序
+
+ 3、以8:2的比例划分train_data,dev_data,训练选出评估结果最优的k值
+
+### predict()函数
+
+predict用于预测测试集样本
+
+### 辅助函数
+
+distance( d1, d2,cate ='euclidean')
+
+d1,d2表示计算距离的点,cate默认为euclidean距离,可以选择manhattan距离
+
+
+
+## 实验1
+
+### Group1:各个类别相差较大,呈较为明显的线性分布
+
+$$
+\Sigma =
+ \left[
+ \begin{matrix}
+ 52 & 0 \\
+ 0 & 22
+ \end{matrix}
+ \right]
+ \Sigma =
+ \left[
+ \begin{matrix}
+ 21.1 & 0 \\
+ 0 & 32.1
+ \end{matrix}
+ \right]
+ \Sigma =
+ \left[
+ \begin{matrix}
+ 10 & 0 \\
+ 0 & 10
+ \end{matrix}
+ \right]
+$$
+
+$$
+\mu =
+ \left[
+ \begin{matrix}
+ 2 &5
+ \end{matrix}
+ \right]
+ \mu =
+ \left[
+ \begin{matrix}
+ 20 & -5
+ \end{matrix}
+ \right]
+ \mu =
+ \left[
+ \begin{matrix}
+ -5 & 22
+ \end{matrix}
+ \right]
+$$
+
+train_data
+
+
+
+测试集
+
+
+
+测试在两种距离下的准确率如下:
+
+| k | distance | acc |
+| ---- | --------- | ------- |
+| 8 | euclidean | 96.250% |
+| 9 | euclidean | 95.625% |
+| 3 | euclidean | 95.833% |
+| 13 | euclidean | 96.458% |
+| 3 | manhattan | 95.417% |
+| 13 | manhattan | 96.250% |
+| 5 | manhattan | 95.625% |
+| 5 | manhattan | 95.625% |
+
+### Group2:各个类别之间相差较大,呈较为明显的分散分布
+
+$$
+\Sigma =
+ \left[
+ \begin{matrix}
+ 52 & 0 \\
+ 0 & 22
+ \end{matrix}
+ \right]
+ \Sigma =
+ \left[
+ \begin{matrix}
+ 21.1 & 0 \\
+ 0 & 32.1
+ \end{matrix}
+ \right]
+ \Sigma =
+ \left[
+ \begin{matrix}
+ 10 & 0 \\
+ 0 & 10
+ \end{matrix}
+ \right]
+$$
+
+$$
+\mu =
+ \left[
+ \begin{matrix}
+ 2 &5
+ \end{matrix}
+ \right]
+ \mu =
+ \left[
+ \begin{matrix}
+ 20 & 16
+ \end{matrix}
+ \right]
+ \mu =
+ \left[
+ \begin{matrix}
+ -5 & 22
+ \end{matrix}
+ \right]
+$$
+
+train_data:
+
+
+
+test_data:
+
+
+
+测试在两种距离下的准确率如下:
+
+| k | distance | acc |
+| ---- | --------- | ------- |
+| 7 | euclidean | 96.875% |
+| 7 | euclidean | 96.875% |
+| 9 | euclidean | 97.083% |
+| 8 | euclidean | 97.083% |
+| 12 | manhattan | 97.708% |
+| 14 | manhattan | 97.500% |
+| 5 | manhattan | 97.083% |
+| 12 | manhattan | 97.708% |
+
+*可见不同群之间的几何分布类型对knn的效果影响不明显*
+
+## 实验2
+
+控制均值不变,将协方差矩阵的各个元素按同一倍数逐步放大(从1倍到2倍)
+$$
+\Sigma =
+ \left[
+ \begin{matrix}
+ 52 & 0 \\
+ 0 & 22
+ \end{matrix}
+ \right]
+ \Sigma =
+ \left[
+ \begin{matrix}
+ 21.1 & 0 \\
+ 0 & 32.1
+ \end{matrix}
+ \right]
+ \Sigma =
+ \left[
+ \begin{matrix}
+ 10 & 0 \\
+ 0 & 10
+ \end{matrix}
+ \right]
+$$
+
+$$
+\mu = \left[
+ \begin{matrix}
+ 2 &5
+ \end{matrix}
+ \right]
+ \mu =
+ \left[
+ \begin{matrix}
+ 20 & 16
+ \end{matrix}
+ \right]
+ \mu =
+ \left[
+ \begin{matrix}
+ -5 & 22
+ \end{matrix}
+ \right]
+$$
+
+得到准确率改变如下图:
+
+
+
+*方差对于KNN的准确率影响显著,随着方差增大,模型准确率下降*
+
+## 实验3
+
+对比采用归一化、标准化前后
+$$
+\Sigma =
+ \left[
+ \begin{matrix}
+ 20 & 0 \\
+ 0 & 1250
+ \end{matrix}
+ \right]
+ \Sigma =
+ \left[
+ \begin{matrix}
+ 25 & 0 \\
+ 0 & 2500
+ \end{matrix}
+ \right]
+ \Sigma =
+ \left[
+ \begin{matrix}
+ 10 & 0 \\
+ 0 & 950
+ \end{matrix}
+ \right]
+$$
+
+$$
+\mu=
+\left[
+ \begin{matrix}
+ 2 &5
+ \end{matrix}
+ \right]
+ \mu =
+ \left[
+ \begin{matrix}
+ 10 & -60
+ \end{matrix}
+ \right]
+ \mu =
+ \left[
+ \begin{matrix}
+ -5 & 72
+ \end{matrix}
+ \right]
+$$
+
+无预处理:
+
+
+
+min_max 归一化:
+
+
+
+Z_score标准化:
+
+
+
+得到对应的准确率如下:
+
+| preprocessing | accuracy |
+| ------------- | -------- |
+| None | 82.917% |
+| min_max | 83.542% |
+| z_score | 84.167% |
+
+缩小各特征之间均值和方差的差距后,重新实验得到如下结果:
+$$
+\Sigma =
+ \left[
+ \begin{matrix}
+ 20 & 0 \\
+ 0 & 750
+ \end{matrix}
+ \right]
+ \Sigma =
+ \left[
+ \begin{matrix}
+ 25 & 0 \\
+ 0 & 1200
+ \end{matrix}
+ \right]
+ \Sigma =
+ \left[
+ \begin{matrix}
+ 10 & 0 \\
+ 0 & 650
+ \end{matrix}
+ \right]
+$$
+
+$$
+\mu=
+\left[
+ \begin{matrix}
+ 2 &5
+ \end{matrix}
+ \right]
+ \mu =
+ \left[
+ \begin{matrix}
+ 10 & -50
+ \end{matrix}
+ \right]
+ \mu =
+ \left[
+ \begin{matrix}
+ -5 & 55
+ \end{matrix}
+ \right]
+$$
+
+| preprocessing | accuracy |
+| ------------- | -------- |
+| None | 90.417% |
+| min_max | 90.625% |
+| z_score | 90.833% |
+
+*标准化、归一化对于KNN模型的准确率有一定提升,数据集各个feature的数量级差别越大,效果越明显*
+
+## 总结
+
+1、KNN模型中不同类别点的几何分布类型对模型预测准确率影响不明显
+
+2、方差对于KNN的准确率影响显著,随着方差增大,模型准确率下降
+
+3、标准化、归一化对于KNN模型的准确率有一定提升,数据集各个feature的数量级差别越大,效果越明显;在数量级相差不大的情况下,性能提升不明显
\ No newline at end of file
diff --git a/assignment-1/submission/17307100038/img/change_cov.png b/assignment-1/submission/17307100038/img/change_cov.png
new file mode 100644
index 0000000000000000000000000000000000000000..90c6e3d31b490ac4e6f2e9a05f21f24bc71627ea
Binary files /dev/null and b/assignment-1/submission/17307100038/img/change_cov.png differ
diff --git a/assignment-1/submission/17307100038/img/data_minmax.png b/assignment-1/submission/17307100038/img/data_minmax.png
new file mode 100644
index 0000000000000000000000000000000000000000..2bf4c70c5448506cd1bb4c074e8a1a9e569c7716
Binary files /dev/null and b/assignment-1/submission/17307100038/img/data_minmax.png differ
diff --git a/assignment-1/submission/17307100038/img/data_original.png b/assignment-1/submission/17307100038/img/data_original.png
new file mode 100644
index 0000000000000000000000000000000000000000..76b9b4aa00c3807e7eb0c973d717e15b8f6ebdc4
Binary files /dev/null and b/assignment-1/submission/17307100038/img/data_original.png differ
diff --git a/assignment-1/submission/17307100038/img/data_zscore.png b/assignment-1/submission/17307100038/img/data_zscore.png
new file mode 100644
index 0000000000000000000000000000000000000000..c79fe49fa23ed2cf8aec87519e4770fd9b3930aa
Binary files /dev/null and b/assignment-1/submission/17307100038/img/data_zscore.png differ
diff --git a/assignment-1/submission/17307100038/img/test_g1.png b/assignment-1/submission/17307100038/img/test_g1.png
new file mode 100644
index 0000000000000000000000000000000000000000..6ba84cf0de903969371c4bb50b7dd8da40b2f1e4
Binary files /dev/null and b/assignment-1/submission/17307100038/img/test_g1.png differ
diff --git a/assignment-1/submission/17307100038/img/test_g2.png b/assignment-1/submission/17307100038/img/test_g2.png
new file mode 100644
index 0000000000000000000000000000000000000000..2155370c1ac0fa5544e7e9e4c9baee3b53fb834e
Binary files /dev/null and b/assignment-1/submission/17307100038/img/test_g2.png differ
diff --git a/assignment-1/submission/17307100038/img/train_g1.png b/assignment-1/submission/17307100038/img/train_g1.png
new file mode 100644
index 0000000000000000000000000000000000000000..1b1c264c47eadb1f85822cf8ab1364ced2405f8d
Binary files /dev/null and b/assignment-1/submission/17307100038/img/train_g1.png differ
diff --git a/assignment-1/submission/17307100038/img/train_g2.png b/assignment-1/submission/17307100038/img/train_g2.png
new file mode 100644
index 0000000000000000000000000000000000000000..5530bce8dde2a7a3787fa58ee3e9a37b45726b02
Binary files /dev/null and b/assignment-1/submission/17307100038/img/train_g2.png differ
diff --git a/assignment-1/submission/17307100038/source.py b/assignment-1/submission/17307100038/source.py
new file mode 100644
index 0000000000000000000000000000000000000000..be07e0492b7b13cde2148ce694ddd252ad0426dc
--- /dev/null
+++ b/assignment-1/submission/17307100038/source.py
@@ -0,0 +1,227 @@
+import sys
+import numpy as np
+import matplotlib.pyplot as plt
+
class KNN:
    """k-nearest-neighbours classifier that picks k on a held-out split.

    ``fit`` optionally rescales the features, shuffles the samples, holds
    out the last 20% as a development set and keeps the k in [2, 14] with
    the best development score. ``predict`` votes among the k nearest
    samples of the whole (rescaled, shuffled) training set.
    """

    def __init__(self):
        self.X = None  # samples stored by fit (rescaled and shuffled)
        self.y = None  # labels aligned with self.X
        self.k = None  # number of neighbours selected by fit
        self.cate = None  # distance formula: 'euclidean' or 'manhattan'
        self.metric = None  # scoring method; only 'accuracy' is implemented
        self.preprocess = None  # 'Min_Max', 'Z_score'; anything else = none
        self.min = None  # per-feature minimum (set for 'Min_Max')
        self.max = None  # per-feature maximum (set for 'Min_Max')
        self.mean = None  # per-feature mean (set for 'Z_score')
        self.std = None  # per-feature standard deviation (set for 'Z_score')

    def _elementwise_cost(self, diff):
        """Return the per-coordinate cost of ``diff`` for ``self.cate``."""
        if self.cate == 'euclidean':
            return np.square(diff)
        if self.cate == 'manhattan':
            return np.abs(diff)
        raise ValueError(
            f"unsupported distance {self.cate!r}; "
            "expected 'euclidean' or 'manhattan'"
        )

    def _neighbour_order(self, samples, query):
        """Return the indices of ``samples`` sorted by distance to ``query``."""
        cost = self._elementwise_cost(samples - query)
        # Flatten the feature axes so 1-D and multi-feature data both work.
        return np.argsort(cost.reshape(len(samples), -1).sum(axis=1))

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent label; ties go to the smallest label."""
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    def _rescale(self, data):
        """Apply the scaling selected by ``self.preprocess`` to ``data``."""
        if self.preprocess == 'Min_Max':  # min-max normalisation
            span = self.max - self.min
            # A constant feature has zero range; divide by 1 instead of 0.
            return (data - self.min) / np.where(span == 0, 1, span)
        if self.preprocess == 'Z_score':  # z-score standardisation
            return (data - self.mean) / np.where(self.std == 0, 1, self.std)
        return data

    def distance(self, d1, d2):
        """Return the distance between two points under ``self.cate``.

        'euclidean' yields the *squared* Euclidean distance (no square
        root is taken), which ranks neighbours identically; 'manhattan'
        yields the sum of absolute differences.

        Raises:
            ValueError: if ``self.cate`` is not a supported formula.
        """
        diff = np.asarray(d1) - np.asarray(d2)
        return np.sum(self._elementwise_cost(diff))

    def score(self, y_pred, test_label):
        """Return the score of ``y_pred`` against ``test_label``.

        Only 'accuracy' is implemented: the fraction of positions where
        prediction and label agree. An empty prediction scores 0.0.

        Raises:
            ValueError: if ``self.metric`` is unsupported or there are
                fewer labels than predictions.
        """
        if self.metric != 'accuracy':
            raise ValueError(
                f"unsupported metric {self.metric!r}; expected 'accuracy'"
            )
        total = len(y_pred)
        if total == 0:
            return 0.0
        if len(test_label) < total:
            raise ValueError("test_label is shorter than y_pred")
        hits = np.asarray(y_pred) == np.asarray(test_label)[:total]
        return float(np.mean(hits))

    def fit(self, X, y, cate='euclidean', metric='accuracy', preprocess=None):
        """Store the training data and select k on a held-out split.

        Args:
            X: training samples, one per row.
            y: labels aligned with ``X``.
            cate: distance formula, 'euclidean' or 'manhattan'.
            metric: scoring method used to compare k values ('accuracy').
            preprocess: 'Min_Max', 'Z_score', or any other value for no
                rescaling.

        Raises:
            ValueError: on an unsupported ``cate``/``metric``, mismatched
                lengths, or fewer than two samples.
        """
        if cate not in ('euclidean', 'manhattan'):
            raise ValueError(
                f"unsupported distance {cate!r}; "
                "expected 'euclidean' or 'manhattan'"
            )
        if metric != 'accuracy':
            raise ValueError(
                f"unsupported metric {metric!r}; expected 'accuracy'"
            )
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(X)
        if n_samples != len(y):
            raise ValueError("X and y must contain the same number of samples")
        if n_samples < 2:
            raise ValueError("at least two samples are required")

        self.cate = cate
        self.metric = metric
        self.preprocess = preprocess

        # 1. Preprocessing: remember the statistics so predict can reuse them.
        if preprocess == 'Min_Max':
            self.min = X.min(axis=0)
            self.max = X.max(axis=0)
        elif preprocess == 'Z_score':
            self.mean = X.mean(axis=0)
            self.std = X.std(axis=0)
        X = self._rescale(X)

        # 2. Shuffle samples and labels with the same permutation.
        order = np.random.permutation(n_samples)
        X = X[order]
        y = y[order]

        # 3. 80/20 split into training and development parts.
        cut = int(n_samples * 0.8)
        train_data, dev_data = X[:cut], X[cut:]
        train_label, dev_label = y[:cut], y[cut:]

        # 4. Rank the neighbours of every dev point once; each candidate k
        #    then only needs a prefix of that ranking.
        max_k = min(14, len(train_data))
        rankings = [
            self._neighbour_order(train_data, point)[:max_k]
            for point in dev_data
        ]

        # Start below any reachable score so a valid k is always selected,
        # even when every candidate scores 0; fall back to k=1 when the
        # training part is too small for k >= 2.
        best_k, best_score = 1, -1.0
        for k in range(2, max_k + 1):
            y_pred = np.array([
                self._majority_label(train_label[ranking[:k]])
                for ranking in rankings
            ])
            score = self.score(y_pred, dev_label)
            if score > best_score:  # strict: ties keep the smaller k
                best_k, best_score = k, score

        # 5. Keep the full (rescaled, shuffled) data set and the chosen k.
        self.X = X
        self.y = y
        self.k = best_k

    def predict(self, test_data):
        """Return the predicted label of every row in ``test_data``.

        The rows are rescaled with the statistics stored by ``fit`` and
        labelled by majority vote among their ``self.k`` nearest samples.

        Raises:
            RuntimeError: if ``fit`` has not been called.
        """
        if self.X is None or self.k is None:
            raise RuntimeError("fit must be called before predict")
        test_data = self._rescale(np.asarray(test_data))
        y_pred = [
            self._majority_label(
                self.y[self._neighbour_order(self.X, point)[:self.k]]
            )
            for point in test_data
        ]
        return np.array(y_pred)
+
+
def generate(path="data.npy"):
    """Sample a three-class 2-D Gaussian data set and save it to ``path``.

    800, 600 and 1000 points are drawn for classes 0, 1 and 2, shuffled
    together, split into 1920 training and 480 test samples and stored as
    a 2x2 object array ``[[train_data, train_label], [test_data,
    test_label]]``.

    Args:
        path: destination file passed to ``np.save``.
    """
    # (mean, covariance, number of samples) for classes 0, 1 and 2.
    class_specs = [
        ((2, 5), [[20, 0], [0, 750]], 800),
        ((10, -60), [[25, 0], [0, 2500]], 600),
        ((-5, 72), [[10, 0], [0, 650]], 1000),
    ]
    data = np.concatenate([
        np.random.multivariate_normal(mean, np.array(cov), (size,))
        for mean, cov, size in class_specs
    ])
    label = np.concatenate([
        np.full((size,), class_id, dtype=int)
        for class_id, (_, _, size) in enumerate(class_specs)
    ])

    # Shuffle samples and labels with the same permutation.
    idx = np.arange(len(data))
    np.random.shuffle(idx)
    data = data[idx]
    label = label[idx]

    train_data, test_data = data[:1920], data[1920:]
    train_label, test_label = label[:1920], label[1920:]

    # The four arrays have different shapes, so build the object array
    # explicitly; handing np.save a ragged nested tuple raises ValueError
    # on NumPy >= 1.24.
    payload = np.empty((2, 2), dtype=object)
    payload[0, 0], payload[0, 1] = train_data, train_label
    payload[1, 0], payload[1, 1] = test_data, test_label
    np.save(path, payload)
+
+
def read(path="data.npy"):
    """Load the data set stored at ``path``.

    Args:
        path: file to load; defaults to "data.npy".

    Returns:
        ``((train_data, train_label), (test_data, test_label))``.
    """
    # NOTE(security): allow_pickle=True unpickles the file, which can run
    # arbitrary code; only load files produced locally by this script.
    stored = np.load(path, allow_pickle=True)
    (train_data, train_label), (test_data, test_label) = stored
    return (train_data, train_label), (test_data, test_label)
+
+
def display(data, label, name):
    """Scatter-plot 2-D points by class, save to ``img/<name>`` and show.

    Args:
        data: points to plot; columns 0 and 1 are used as x and y.
        label: class label of every point.
        name: file name of the figure inside the ``img`` directory.
    """
    import os

    data = np.asarray(data)
    label = np.asarray(label)

    # One scatter call per class present in the data, in sorted label
    # order, so any number of classes works and absent classes are skipped.
    for class_id in np.unique(label):
        points = data[label == class_id]
        plt.scatter(points[:, 0], points[:, 1])

    # savefig does not create missing directories.
    os.makedirs('img', exist_ok=True)
    plt.savefig(f'img/{name}')
    plt.show()
+
# Helpers for testing how changing the covariance affects the result.
def generate_ball(r=1):
    """Sample a three-class 2-D Gaussian data set with scaled covariance.

    Args:
        r: non-negative factor multiplied into every class covariance.

    Returns:
        ``(train_data, train_label, test_data, test_label)`` holding 1920
        training and 480 test samples after shuffling.

    Raises:
        ValueError: if ``r`` is negative, which would not give a valid
            covariance matrix.
    """
    if r < 0:
        raise ValueError("r must be non-negative")

    # (mean, base covariance, number of samples) for classes 0, 1 and 2.
    class_specs = [
        ((2, 5), [[40, 0], [0, 30]], 800),
        ((20, 16), [[25, 0], [0, 35.1]], 600),
        ((-5, 22), [[30, 0], [0, 25]], 1000),
    ]
    data = np.concatenate([
        np.random.multivariate_normal(mean, np.array(cov) * r, (size,))
        for mean, cov, size in class_specs
    ])
    label = np.concatenate([
        np.full((size,), class_id, dtype=int)
        for class_id, (_, _, size) in enumerate(class_specs)
    ])

    # Shuffle samples and labels with the same permutation.
    idx = np.arange(len(data))
    np.random.shuffle(idx)
    data = data[idx]
    label = label[idx]

    train_data, test_data = data[:1920], data[1920:]
    train_label, test_label = label[:1920], label[1920:]
    return train_data, train_label, test_data, test_label
+
def change_cov():
    """Plot test accuracy against the covariance scale for both distances.

    For each of 11 scales from 1.0 to 2.0 a data set is drawn with
    ``generate_ball``, a ``KNN`` model is fitted once per distance formula,
    and the test accuracies are plotted and saved to 'change_cov.png'.
    """
    # linspace fixes the number of points; a float-step arange can gain or
    # lose an element through rounding.
    scales = np.linspace(1.0, 2.0, 11)
    distances = ('euclidean', 'manhattan')
    accuracy = {cate: [] for cate in distances}

    for scale in scales:
        train_data, train_label, test_data, test_label = generate_ball(r=scale)
        # Both distance formulas are evaluated on the same sampled data.
        for cate in distances:
            model = KNN()
            model.fit(train_data, train_label, cate=cate, metric='accuracy')
            res = model.predict(test_data)
            accuracy[cate].append(np.mean(np.equal(res, test_label)))

    plt.plot(scales, accuracy['euclidean'], color='r')
    plt.plot(scales, accuracy['manhattan'], color='b')
    plt.title('accuracy at different cov')
    plt.legend(['euclidean', 'manhattan'])
    plt.savefig('change_cov.png')
+
if __name__ == "__main__":
    # Optional first command-line argument: "g" regenerates the data file,
    # "d" plots the stored data; anything else evaluates the classifier.
    mode = sys.argv[1] if len(sys.argv) > 1 else None

    if mode == "g":
        generate()

    # Every remaining path starts by loading the stored data set. "g" is
    # not "d", so after generating it also runs the evaluation below.
    (train_data, train_label), (test_data, test_label) = read()

    if mode == "d":
        display(train_data, train_label, 'train')
        display(test_data, test_label, 'test')
    else:
        model = KNN()
        # Choose the distance formula and the scoring method.
        model.fit(train_data, train_label, cate='manhattan', metric='accuracy')
        res = model.predict(test_data)
        print("acc =", np.mean(np.equal(res, test_label)))
\ No newline at end of file