diff --git a/assignment-1/submission/17307100038/README.md b/assignment-1/submission/17307100038/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7e209eba20b4f93de8019d642a8fdc6979914d38
--- /dev/null
+++ b/assignment-1/submission/17307100038/README.md
@@ -0,0 +1,362 @@
+# Course Report
+
+## KNN Class Implementation
+
+### fit()
+
+fit(X, y, cate='euclidean', metric='accuracy', preprocess=None)
+
+X: training data
+
+y: training labels
+
+cate: distance metric, e.g. euclidean or manhattan distance
+
+metric: evaluation metric, e.g. accuracy
+
+preprocess: preprocessing mode: min-max normalization ('Min_Max'), z-score standardization ('Z_score'), or None for no preprocessing
+
+fit() does the following:
+
+1. preprocessing;
+2. randomly shuffling the data set;
+3. splitting the data into train_data and dev_data at a ratio of 8:2 and picking the k with the best evaluation score on the dev set.
+
+### predict()
+
+predict() predicts the labels of the test samples.
+
+### Helper functions
+
+distance(d1, d2)
+
+d1 and d2 are the two points whose distance is computed; the metric stored in cate is used, euclidean by default, with manhattan as an alternative.
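+
+As a quick illustration of the interface above, here is a minimal usage sketch (assuming the data file produced by `python source.py g`; the variable names are only examples):
+
+```python
+import numpy as np
+from source import KNN, read
+
+# load the dataset previously saved to data.npy
+(train_data, train_label), (test_data, test_label) = read()
+
+# fit() internally shuffles, splits off a dev set and searches k in [2, 15)
+model = KNN()
+model.fit(train_data, train_label, cate='manhattan',
+          metric='accuracy', preprocess='Z_score')
+
+y_pred = model.predict(test_data)
+print("accuracy =", np.mean(y_pred == test_label))
+```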
+
+## Experiment 1
+
+### Group 1: the classes are far apart and arranged roughly along a line
+
+$$
+\Sigma_1 =
+ \left[
+ \begin{matrix}
+  52 & 0 \\
+  0 & 22
+ \end{matrix}
+ \right]
+\quad
+\Sigma_2 =
+ \left[
+ \begin{matrix}
+  21.1 & 0 \\
+  0 & 32.1
+ \end{matrix}
+ \right]
+\quad
+\Sigma_3 =
+ \left[
+ \begin{matrix}
+  10 & 0 \\
+  0 & 10
+ \end{matrix}
+ \right]
+$$
+
+$$
+\mu_1 =
+ \left[
+ \begin{matrix}
+  2 & 5
+ \end{matrix}
+ \right]
+\quad
+\mu_2 =
+ \left[
+ \begin{matrix}
+  20 & -5
+ \end{matrix}
+ \right]
+\quad
+\mu_3 =
+ \left[
+ \begin{matrix}
+  -5 & 22
+ \end{matrix}
+ \right]
+$$
+
+Training set:
+
+![train_g1](img/train_g1.png)
+
+Test set:
+
+![test_g1](img/test_g1.png)
+
+Accuracy under the two distance metrics:
+
+| k    | distance  | acc     |
+| ---- | --------- | ------- |
+| 8    | euclidean | 96.250% |
+| 9    | euclidean | 95.625% |
+| 3    | euclidean | 95.833% |
+| 13   | euclidean | 96.458% |
+| 3    | manhattan | 95.417% |
+| 13   | manhattan | 96.250% |
+| 5    | manhattan | 95.625% |
+| 5    | manhattan | 95.625% |
+
+### Group 2: the classes are far apart and scattered more evenly
+
+$$
+\Sigma_1 =
+ \left[
+ \begin{matrix}
+  52 & 0 \\
+  0 & 22
+ \end{matrix}
+ \right]
+\quad
+\Sigma_2 =
+ \left[
+ \begin{matrix}
+  21.1 & 0 \\
+  0 & 32.1
+ \end{matrix}
+ \right]
+\quad
+\Sigma_3 =
+ \left[
+ \begin{matrix}
+  10 & 0 \\
+  0 & 10
+ \end{matrix}
+ \right]
+$$
+
+$$
+\mu_1 =
+ \left[
+ \begin{matrix}
+  2 & 5
+ \end{matrix}
+ \right]
+\quad
+\mu_2 =
+ \left[
+ \begin{matrix}
+  20 & 16
+ \end{matrix}
+ \right]
+\quad
+\mu_3 =
+ \left[
+ \begin{matrix}
+  -5 & 22
+ \end{matrix}
+ \right]
+$$
+
+Training set:
+
+![train_g2](img/train_g2.png)
+
+Test set:
+
+![test_g2](img/test_g2.png)
+
+Accuracy under the two distance metrics:
+
+| k    | distance  | acc     |
+| ---- | --------- | ------- |
+| 7    | euclidean | 96.875% |
+| 7    | euclidean | 96.875% |
+| 9    | euclidean | 97.083% |
+| 8    | euclidean | 97.083% |
+| 12   | manhattan | 97.708% |
+| 14   | manhattan | 97.500% |
+| 5    | manhattan | 97.083% |
+| 12   | manhattan | 97.708% |
+
+*The geometric arrangement of the clusters has no obvious effect on KNN accuracy.*
+
+## Experiment 2
+
+Keeping the means fixed, every entry of the covariance matrices is scaled by a factor ranging from 1x up to 2x.
+
+$$
+\Sigma_1 =
+ \left[
+ \begin{matrix}
+  52 & 0 \\
+  0 & 22
+ \end{matrix}
+ \right]
+\quad
+\Sigma_2 =
+ \left[
+ \begin{matrix}
+  21.1 & 0 \\
+  0 & 32.1
+ \end{matrix}
+ \right]
+\quad
+\Sigma_3 =
+ \left[
+ \begin{matrix}
+  10 & 0 \\
+  0 & 10
+ \end{matrix}
+ \right]
+$$
+
+$$
+\mu_1 =
+ \left[
+ \begin{matrix}
+  2 & 5
+ \end{matrix}
+ \right]
+\quad
+\mu_2 =
+ \left[
+ \begin{matrix}
+  20 & 16
+ \end{matrix}
+ \right]
+\quad
+\mu_3 =
+ \left[
+ \begin{matrix}
+  -5 & 22
+ \end{matrix}
+ \right]
+$$
+
+The resulting change in accuracy:
+
+![change_cov](img/change_cov.png)
+
+*The variance has a significant effect on KNN accuracy: as the variance grows, the accuracy drops.*
+
+## Experiment 3
+
+Compare the results with and without normalization / standardization.
+
+$$
+\Sigma_1 =
+ \left[
+ \begin{matrix}
+  20 & 0 \\
+  0 & 1250
+ \end{matrix}
+ \right]
+\quad
+\Sigma_2 =
+ \left[
+ \begin{matrix}
+  25 & 0 \\
+  0 & 2500
+ \end{matrix}
+ \right]
+\quad
+\Sigma_3 =
+ \left[
+ \begin{matrix}
+  10 & 0 \\
+  0 & 950
+ \end{matrix}
+ \right]
+$$
+
+$$
+\mu_1 =
+ \left[
+ \begin{matrix}
+  2 & 5
+ \end{matrix}
+ \right]
+\quad
+\mu_2 =
+ \left[
+ \begin{matrix}
+  10 & -60
+ \end{matrix}
+ \right]
+\quad
+\mu_3 =
+ \left[
+ \begin{matrix}
+  -5 & 72
+ \end{matrix}
+ \right]
+$$
+
+No preprocessing:
+
+![data_original](img/data_original.png)
+
+Min-max normalization:
+
+![data_minmax](img/data_minmax.png)
+
+Z-score standardization:
+
+![data_zscore](img/data_zscore.png)
+
+The corresponding accuracies:
+
+| preprocessing | accuracy |
+| ------------- | -------- |
+| None          | 82.917%  |
+| min_max       | 83.542%  |
+| z_score       | 84.17%   |
+
+Reducing the gap between the means and variances and re-running the experiment gives:
+
+$$
+\Sigma_1 =
+ \left[
+ \begin{matrix}
+  20 & 0 \\
+  0 & 750
+ \end{matrix}
+ \right]
+\quad
+\Sigma_2 =
+ \left[
+ \begin{matrix}
+  25 & 0 \\
+  0 & 1200
+ \end{matrix}
+ \right]
+\quad
+\Sigma_3 =
+ \left[
+ \begin{matrix}
+  10 & 0 \\
+  0 & 650
+ \end{matrix}
+ \right]
+$$
+
+$$
+\mu_1 =
+ \left[
+ \begin{matrix}
+  2 & 5
+ \end{matrix}
+ \right]
+\quad
+\mu_2 =
+ \left[
+ \begin{matrix}
+  10 & -50
+ \end{matrix}
+ \right]
+\quad
+\mu_3 =
+ \left[
+ \begin{matrix}
+  -5 & 55
+ \end{matrix}
+ \right]
+$$
+
+| preprocessing | accuracy |
+| ------------- | -------- |
+| None          | 90.417%  |
+| min_max       | 90.625%  |
+| z_score       | 90.833%  |
+
+*Standardization and normalization give a modest accuracy improvement; the larger the difference in scale between features, the more pronounced the effect.*
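+
+For reference, the two preprocessing schemes compared above amount to the following feature-wise transformations (a minimal NumPy sketch, not the submitted implementation; the statistics are computed on the training set and reused on the test set, as the KNN class does):
+
+```python
+import numpy as np
+
+def min_max(train_X, test_X):
+    # min-max normalization: rescale every feature to [0, 1]
+    lo, hi = train_X.min(axis=0), train_X.max(axis=0)
+    return (train_X - lo) / (hi - lo), (test_X - lo) / (hi - lo)
+
+def z_score(train_X, test_X):
+    # z-score standardization: zero mean and unit variance per feature
+    mu, sigma = train_X.mean(axis=0), train_X.std(axis=0)
+    return (train_X - mu) / sigma, (test_X - mu) / sigma
+```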
+
+## Summary
+
+1. The geometric arrangement of the different classes has little effect on KNN prediction accuracy.
+
+2. The variance has a significant effect on accuracy: as the variance grows, the accuracy drops.
+
+3. Standardization and normalization give a modest accuracy improvement; the larger the difference in scale between features, the more pronounced the effect, while the gain is negligible when the feature scales are similar.
\ No newline at end of file
diff --git a/assignment-1/submission/17307100038/img/change_cov.png b/assignment-1/submission/17307100038/img/change_cov.png
new file mode 100644
index 0000000000000000000000000000000000000000..90c6e3d31b490ac4e6f2e9a05f21f24bc71627ea
Binary files /dev/null and b/assignment-1/submission/17307100038/img/change_cov.png differ
diff --git a/assignment-1/submission/17307100038/img/data_minmax.png b/assignment-1/submission/17307100038/img/data_minmax.png
new file mode 100644
index 0000000000000000000000000000000000000000..2bf4c70c5448506cd1bb4c074e8a1a9e569c7716
Binary files /dev/null and b/assignment-1/submission/17307100038/img/data_minmax.png differ
diff --git a/assignment-1/submission/17307100038/img/data_original.png b/assignment-1/submission/17307100038/img/data_original.png
new file mode 100644
index 0000000000000000000000000000000000000000..76b9b4aa00c3807e7eb0c973d717e15b8f6ebdc4
Binary files /dev/null and b/assignment-1/submission/17307100038/img/data_original.png differ
diff --git a/assignment-1/submission/17307100038/img/data_zscore.png b/assignment-1/submission/17307100038/img/data_zscore.png
new file mode 100644
index 0000000000000000000000000000000000000000..c79fe49fa23ed2cf8aec87519e4770fd9b3930aa
Binary files /dev/null and b/assignment-1/submission/17307100038/img/data_zscore.png differ
diff --git a/assignment-1/submission/17307100038/img/test_g1.png b/assignment-1/submission/17307100038/img/test_g1.png
new file mode 100644
index 0000000000000000000000000000000000000000..6ba84cf0de903969371c4bb50b7dd8da40b2f1e4
Binary files /dev/null and b/assignment-1/submission/17307100038/img/test_g1.png differ
diff --git a/assignment-1/submission/17307100038/img/test_g2.png b/assignment-1/submission/17307100038/img/test_g2.png
new file mode 100644
index 0000000000000000000000000000000000000000..2155370c1ac0fa5544e7e9e4c9baee3b53fb834e
Binary files /dev/null and b/assignment-1/submission/17307100038/img/test_g2.png differ
diff --git a/assignment-1/submission/17307100038/img/train_g1.png b/assignment-1/submission/17307100038/img/train_g1.png
new file mode 100644
index 0000000000000000000000000000000000000000..1b1c264c47eadb1f85822cf8ab1364ced2405f8d
Binary files /dev/null and b/assignment-1/submission/17307100038/img/train_g1.png differ
diff --git a/assignment-1/submission/17307100038/img/train_g2.png b/assignment-1/submission/17307100038/img/train_g2.png
new file mode 100644
index 0000000000000000000000000000000000000000..5530bce8dde2a7a3787fa58ee3e9a37b45726b02
Binary files /dev/null and b/assignment-1/submission/17307100038/img/train_g2.png differ
diff --git a/assignment-1/submission/17307100038/source.py b/assignment-1/submission/17307100038/source.py
new file mode 100644
index 0000000000000000000000000000000000000000..be07e0492b7b13cde2148ce694ddd252ad0426dc
--- /dev/null
+++ b/assignment-1/submission/17307100038/source.py
@@ -0,0 +1,227 @@
+import sys
+import numpy as np
+import matplotlib.pyplot as plt
+
+class KNN:
+    def __init__(self):
+        self.X = None
+        self.y = None
+        self.k = None
+        self.cate = None        # distance metric
+        self.metric = None      # evaluation metric, e.g. accuracy
+        self.preprocess = None
+        self.min = None
+        self.max = None
+        self.mean = None
+        self.std = None
+
+    def distance(self, d1, d2):
+        '''Distance between two points, e.g. euclidean or manhattan.'''
+        if self.cate == 'euclidean':
+            # squared euclidean distance; the sqrt is omitted because it does not change the neighbour ranking
+            dist = np.sum(np.square(d1 - d2))
+        elif self.cate == 'manhattan':
+            dist = np.sum(np.abs(d1 - d2))
+        return dist
+
+    def score(self, y_pred, test_label):
+        '''Score the predictions, e.g. accuracy (macro_f1 / micro_f1 could be added here).'''
+        if self.metric == 'accuracy':
+            cnt = 0
+            for i in range(len(y_pred)):
+                if y_pred[i] == test_label[i]:
+                    cnt += 1
+            score = cnt / len(y_pred)
+        return score
+
+    def fit(self, X, y, cate='euclidean', metric='accuracy', preprocess=None):
+        '''Preprocess the data, select k and store the training set.'''
+        self.cate = cate
+        self.metric = metric
+        self.preprocess = preprocess
+
+        # 1. preprocessing
+        if preprocess == 'Min_Max':    # min-max normalization
+            self.min = X.min(axis=0)
+            self.max = X.max(axis=0)
+            X = (X - self.min) / (self.max - self.min)
+        elif preprocess == 'Z_score':  # z-score standardization
+            self.mean = X.mean(axis=0)
+            self.std = X.std(axis=0)
+            X = (X - self.mean) / self.std
+        else:
+            X = X
+
+        # 2. shuffle the data
+        random_index = np.random.permutation(len(X))
+        X = X[random_index]
+        y = y[random_index]
+
+        # 3. split into train_data and dev_data
+        N = X.shape[0]
+        cut = int(N * 0.8)  # guard against a non-integer split point
+        train_data, dev_data = X[:cut, ], X[cut:, ]
+        train_label, dev_label = y[:cut, ], y[cut:, ]
+
+        # 4. select k
+        max_score = 0
+        max_score_K = 0
+        for k in range(2, 15):
+            # For each k, compute the dev accuracy:
+            # 1. for every dev point, compute its distance to every training point
+            # 2. sort the distances and take the k nearest indices
+            # 3. predict the dev point's label by majority vote
+            # 4. compute the accuracy
+            y_pred = []
+            for i in range(len(dev_data)):
+                dist_arr = [self.distance(dev_data[i], train_data[j]) for j in range(len(train_data))]  # distances from this dev point to every training point
+                sorted_index = np.argsort(dist_arr)      # indices sorted by distance
+                first_k_index = sorted_index[:k]         # indices of the k nearest neighbours
+                first_k_label = train_label[first_k_index]
+                y_pred.append(np.argmax(np.bincount(first_k_label)))  # majority vote
+            y_pred = np.array(y_pred)
+            score = self.score(y_pred, dev_label)
+
+            if score > max_score:
+                max_score, max_score_K = score, k
+
+        # 5. store the parameters
+        self.X = X
+        self.y = y
+        self.k = max_score_K
+        # print('k:%d' % self.k)
+
+    def predict(self, test_data):
+        # preprocessing
+        if self.preprocess == 'Min_Max':    # min-max normalization
+            test_data = (test_data - self.min) / (self.max - self.min)
+        elif self.preprocess == 'Z_score':  # z-score standardization
+            test_data = (test_data - self.mean) / self.std
+        else:
+            test_data = test_data
+
+        y_pred = []
+        for i in range(len(test_data)):
+            dist_arr = [self.distance(test_data[i], self.X[j]) for j in range(len(self.X))]
+            first_k_index = np.argsort(dist_arr)[:self.k]
+            first_k_label = self.y[first_k_index]
+            y_pred.append(np.argmax(np.bincount(first_k_label)))
+        return np.array(y_pred)
+
+
+def generate():
+    mean = (2, 5)
+    cov = np.array([[20, 0], [0, 750]])
+    x = np.random.multivariate_normal(mean, cov, (800,))
+
+    mean = (10, -60)
+    cov = np.array([[25, 0], [0, 2500]])
+    y = np.random.multivariate_normal(mean, cov, (600,))
+
+    mean = (-5, 72)
+    cov = np.array([[10, 0], [0, 650]])
+    z = np.random.multivariate_normal(mean, cov, (1000,))
+
+    idx = np.arange(2400)
+    np.random.shuffle(idx)
+    data = np.concatenate([x, y, z])
+    label = np.concatenate([
+        np.zeros((800,), dtype=int),
+        np.ones((600,), dtype=int),
+        np.ones((1000,), dtype=int) * 2
+    ])
+    data = data[idx]
+    label = label[idx]
+
+    train_data, test_data = data[:1920, ], data[1920:, ]
+    train_label, test_label = label[:1920, ], label[1920:, ]
+    np.save("data.npy", (
+        (train_data, train_label), (test_data, test_label)
+    ))
+
+
+def read():
+    (train_data, train_label), (test_data, test_label) = np.load("data.npy", allow_pickle=True)
+    return (train_data, train_label), (test_data, test_label)
+
+
+def display(data, label, name):
+    # group the points by class label
+    datas = [[], [], []]
+    for i in range(len(data)):
+        datas[label[i]].append(data[i])
+
+    for each in datas:
+        each = np.array(each)
+        plt.scatter(each[:, 0], each[:, 1])
+    # save and show once, after all three classes have been plotted
+    plt.savefig(f'img/{name}')
+    plt.show()
+
+
+# Test the effect of scaling the covariance on the accuracy.
+def generate_ball(r=1):
+    mean = (2, 5)
+    cov = np.array([[40, 0], [0, 30]])
+    x = np.random.multivariate_normal(mean, cov * r, (800,))
+
+    mean = (20, 16)
+    cov = np.array([[25, 0], [0, 35.1]])
+    y = np.random.multivariate_normal(mean, cov * r, (600,))
+
+    mean = (-5, 22)
+    cov = np.array([[30, 0], [0, 25]])
+    z = np.random.multivariate_normal(mean, cov * r, (1000,))
+
+    idx = np.arange(2400)
+    np.random.shuffle(idx)
+    data = np.concatenate([x, y, z])
+    label = np.concatenate([
+        np.zeros((800,), dtype=int),
+        np.ones((600,), dtype=int),
+        np.ones((1000,), dtype=int) * 2
+    ])
+    data = data[idx]
+    label = label[idx]
+
+    train_data, test_data = data[:1920, ], data[1920:, ]
+    train_label, test_label = label[:1920, ], label[1920:, ]
+    return train_data, train_label, test_data, test_label
+
+
+def change_cov():
+    acc_1 = []
+    acc_2 = []
+    for each in np.arange(1, 2.1, 0.1):
+        train_data, train_label, test_data, test_label = generate_ball(r=each)
+        # euclidean
+        model = KNN()
+        model.fit(train_data, train_label, cate='euclidean', metric='accuracy')
+        res = model.predict(test_data)
+        acc1 = np.mean(np.equal(res, test_label))
+        acc_1.append(acc1)
+        # manhattan
+        model = KNN()
+        model.fit(train_data, train_label, cate='manhattan', metric='accuracy')
+        res = model.predict(test_data)
+        acc2 = np.mean(np.equal(res, test_label))
+        acc_2.append(acc2)
+    plt.plot(np.arange(1, 2.1, 0.1), acc_1, color='r')
+    plt.plot(np.arange(1, 2.1, 0.1), acc_2, color='b')
+    plt.title('accuracy at different cov')
+    plt.legend(['euclidean', 'manhattan'])
+    plt.savefig('change_cov.png')
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1 and sys.argv[1] == "g":
+        generate()
+    if len(sys.argv) > 1 and sys.argv[1] == "d":
+        (train_data, train_label), (test_data, test_label) = read()
+        display(train_data, train_label, 'train')
+        display(test_data, test_label, 'test')
+    else:
+        (train_data, train_label), (test_data, test_label) = read()
+
+        model = KNN()
+        # choose the distance metric and the evaluation metric
+        model.fit(train_data, train_label, cate='manhattan', metric='accuracy')
+        res = model.predict(test_data)
+        print("acc =", np.mean(np.equal(res, test_label)))
\ No newline at end of file
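
As an optional follow-up to the per-point Python loops in `fit()` and `predict()` above (not part of the submitted patch), here is a hedged sketch of a vectorized alternative; `pairwise_distances` and `knn_predict` are hypothetical helper names:

```python
import numpy as np

def pairwise_distances(A, B, cate='euclidean'):
    """Distance matrix between the rows of A (n, d) and the rows of B (m, d)."""
    diff = A[:, None, :] - B[None, :, :]       # shape (n, m, d)
    if cate == 'euclidean':
        return np.sum(diff ** 2, axis=-1)      # squared distances; the ranking is unchanged
    return np.sum(np.abs(diff), axis=-1)       # manhattan

def knn_predict(train_X, train_y, test_X, k, cate='euclidean'):
    dist = pairwise_distances(test_X, train_X, cate)
    nearest = np.argsort(dist, axis=1)[:, :k]  # indices of the k nearest training points
    votes = train_y[nearest]                   # neighbour labels, shape (n_test, k)
    return np.array([np.bincount(row).argmax() for row in votes])
```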