diff --git a/assignment-1/submission/19307130211/README.md b/assignment-1/submission/19307130211/README.md new file mode 100755 index 0000000000000000000000000000000000000000..9d46bef52b53b558ad6ec716ee26dce570e77a87 --- /dev/null +++ b/assignment-1/submission/19307130211/README.md @@ -0,0 +1,213 @@ + + +[TOC] + +我只使用了numpy和matplotlib两个库实现了KNN,所以代码应该可以完成额外的任务。 + +#### 一、模型的实现 + +对于单个测试数据的knn分类网络的实现: + +```python +inX=test_data[i] +#计算欧氏距离 +diffMat=np.tile(inX,(dataSetSize,1))-tra_data +sqDifMat=diffMat**2 +sqDistances=sqDifMat.sum(axis=1) +distances=sqDistances**0.5 +#对计算得到的距离对应的序号进行排序 +sortedDistIndicies=distances.argsort() +#统计最近的k个点中各个类别的数量 +classCount={} +for j in range(k): + Label = tra_label[sortedDistIndicies[j]] + classCount[Label] = classCount.get(Label,0) + 1 +#得到对于类别的预测 +max_count=0 +for key,value in classCount.items(): +if value >max_count : + max_count = value + pred_label=key +``` + +对于KNN fit() 部分的实现: + +将数据集按照8:2的比例进行划分后,对K进行拟合,多次重复在训练集上进行单次的knn分类预测,选出正确率最高的K值。 + +~~~python +def fit(self, train_data, train_label): + self.train_data=train_data + self.train_label=train_label + N=train_data.shape[0] + cut=int(N*0.8) + + tra_data, test_data = train_data[:cut,], train_data[cut:,] + tra_label, test_label = train_label[:cut,], train_label[cut:,] + + dataSetSize=tra_data.shape[0] + test_number=test_data.shape[0] + + best_k=0 + max_score=0 + if N<6 : + k_range=N + else : + k_range=20 + + for k in range(2,k_range): + total_correct=0 + for i in range(0,test_number): + inX=test_data[i] + #计算欧氏距离 + diffMat=np.tile(inX,(dataSetSize,1))-tra_data + sqDifMat=diffMat**2 + sqDistances=sqDifMat.sum(axis=1) + distances=sqDistances**0.5 + #对计算得到的距离对应的序号进行排序 + sortedDistIndicies=distances.argsort() + #统计最近的k个点中各个类别的数量 + classCount={} + for j in range(k): + Label = tra_label[sortedDistIndicies[j]] + classCount[Label] = classCount.get(Label,0) + 1 + #得到对于类别的预测 + max_count=0 + for key,value in classCount.items(): + if value >max_count : + max_count = value + pred_label=key 
+ + if pred_label ==test_label[i]: + total_correct=total_correct+1 + #选择正确率最高的K值 + score=total_correct*1.0/test_number + if score>max_score: + max_score=score + best_k=k + print("Best K: %d"%(best_k)) + self.k=best_k +~~~ + +#### 二、实验数据的生成 + +通过 np.random.multivariate_normal() 生成数据集 + +~~~python + mean = (1, 1) + cov = np.array([[10, 0], [0, 10]]) + x1 = np.random.multivariate_normal(mean, cov, (400,)) + + mean = (1, 12) + x2 = np.random.multivariate_normal(mean, cov, (400,)) + + mean = (12, 1) + x3 = np.random.multivariate_normal(mean, cov, (400,)) + + mean = (12, 12) + x4 = np.random.multivariate_normal(mean, cov, (400,)) +~~~ + +生成的训练集:![Figure_1](img/Figure_1.png) + +生成的测试集:![Figure_2](img/Figure_2.png) + +训练结果: + +| K | 准确率 | +| ---- | :------- | +| 10 | 0.884375 | + +#### 三、探究实验 + +##### 1. 修改mean参数 + +首先修改成如下形式,使得四类点分布得更开 + +~~~python +mean = (1, 1) mean = (1, 15) mean = (15, 1) mean = (15, 15) +~~~ + +得到的训练集:![Figure_3](img/Figure_3.png) + +得到的测试集:![Figure_4](img/Figure_4.png) + +训练结果: + +| K | 准确率 | +| ---- | -------- | +| 11 | 0.978125 | + +可以看到不同的数据分布更开后,准确率大大提升。 + + + +修改成以下的形式后,不同的数据更加集中: + +~~~python +mean = (1, 1) mean = (1, 8) mean = (8, 1) mean = (8, 8) +~~~ + +生成的训练集:![Figure_5](img/Figure_5.png) + +生成的测试集:![Figure_6](img/Figure_6.png) + +训练结果: + +| K | 准确率 | +| ---- | ------ | +| 18 | 0.7 | + +可以看到准确率大大降低。 + +结论:KNN分类算法的准确性依赖于数据的分布,当不同种类的数据越集中(重合的部分越多时),效果越差; + +反之,重合部分越少,分类结果越好。 + +##### 2.对cov参数的修改 + +修改cov,使得方差更大,同类数据更离散: + +~~~python +cov = np.array([[30, 0], [0, 30]]) +~~~ + +生成的训练集:![Figure_7](img/Figure_7.png) + +生成的测试集:![Figure_8](img/Figure_8.png) + +测试结果: + +| K | 准确率 | +| ---- | -------- | +| 18 | 0.690625 | + +可以看到数据更离散后,准确率下降 + + + +再次修改cov,减少方差,使得同类数据更集中 + +```python +cov = np.array([[5, 0], [0, 5]]) +``` + +生成的训练集:![Figure_9](img/Figure_9.png) + +生成的测试集:![Figure_10](img/Figure_10.png) + +训练结果: + +| K | 准确率 | +| ---- | ------ | +| 5 | 0.9875 | + +可以看到同类数据更集中,准确率越高。 + +##### 探讨与结论 + 
+可以和实验一得到相同的结论,不同类数据间分布的重合区域越小,KNN分类方法的准确率越高;重合区域越大,准确率越低。 + + + +同时在上述实验中观察到的另一有趣现象是准确率越低时,K的最佳取值偏大;准确率越高时,K的取值偏小。但是在修改参数cov来控制正确率,希望得到准确率和K的关系时,却发现最佳K的选取还是具有一定的随机性,没有找到一般规律。 + diff --git a/assignment-1/submission/19307130211/img/Figure_1.png b/assignment-1/submission/19307130211/img/Figure_1.png new file mode 100755 index 0000000000000000000000000000000000000000..e4a5d05eb4f6ccb904e8fd207fded8cf0b68b43c Binary files /dev/null and b/assignment-1/submission/19307130211/img/Figure_1.png differ diff --git a/assignment-1/submission/19307130211/img/Figure_10.png b/assignment-1/submission/19307130211/img/Figure_10.png new file mode 100755 index 0000000000000000000000000000000000000000..c9002a2fdce20748784f39dfaac3a0e1ca956f06 Binary files /dev/null and b/assignment-1/submission/19307130211/img/Figure_10.png differ diff --git a/assignment-1/submission/19307130211/img/Figure_2.png b/assignment-1/submission/19307130211/img/Figure_2.png new file mode 100755 index 0000000000000000000000000000000000000000..c4cb6a263cd63039ad370eca82607f50666592b7 Binary files /dev/null and b/assignment-1/submission/19307130211/img/Figure_2.png differ diff --git a/assignment-1/submission/19307130211/img/Figure_3.png b/assignment-1/submission/19307130211/img/Figure_3.png new file mode 100755 index 0000000000000000000000000000000000000000..24fc664d1ea502c02c96a4deca581dc42fc465cd Binary files /dev/null and b/assignment-1/submission/19307130211/img/Figure_3.png differ diff --git a/assignment-1/submission/19307130211/img/Figure_4.png b/assignment-1/submission/19307130211/img/Figure_4.png new file mode 100755 index 0000000000000000000000000000000000000000..1a3fa01d1aa274fb14dca65e961793de8f2174ca Binary files /dev/null and b/assignment-1/submission/19307130211/img/Figure_4.png differ diff --git a/assignment-1/submission/19307130211/img/Figure_5.png b/assignment-1/submission/19307130211/img/Figure_5.png new file mode 100755 index 
"""k-nearest-neighbours classifier plus helpers to generate, load and plot toy data."""

from collections import Counter

import numpy as np

try:
    import matplotlib.pyplot as plt
except ImportError:  # plotting is optional; the classifier itself only needs numpy
    plt = None


# Cluster centres used by generate() when no means are supplied.
DEFAULT_MEANS = ((1, 1), (1, 12), (12, 1), (12, 12))


def _nearest_labels(queries, data, labels, k):
    """Return, per query row, the labels of its ``k`` nearest rows of ``data``.

    Distances are Euclidean. Each returned list is ordered nearest-first and
    has ``min(k, len(data))`` entries. A stable sort is used so that points at
    equal distance keep their original order, making results deterministic.
    """
    nearest = []
    for point in queries:
        distances = np.sqrt(((data - point) ** 2).sum(axis=1))
        order = np.argsort(distances, kind="stable")[:k]
        nearest.append(labels[order].tolist())
    return nearest


def _majority(votes):
    """Return the most frequent label; ties go to the label encountered first."""
    return Counter(votes).most_common(1)[0][0]


class KNN:
    """k-nearest-neighbours classifier that selects ``k`` on a hold-out split."""

    def __init__(self):
        self.train_data = None
        self.train_label = None
        self.k = None

    def fit(self, train_data, train_label):
        """Store the training set and choose ``k`` by hold-out validation.

        The first 80% of the rows are used as reference points and the last
        20% as a validation split. Candidate values start at ``k = 2`` and go
        up to 19 (or ``N - 1`` when there are fewer than 6 samples), capped at
        the number of reference points so a neighbour index can never run past
        the split. The smallest ``k`` with the highest validation accuracy is
        kept in ``self.k``; if no candidate exists (tiny data sets) ``k``
        falls back to 1.

        Args:
            train_data: array-like of samples; assumed to be 2-D,
                ``(n_samples, n_features)``.
            train_label: array-like of labels, one per sample.
        """
        train_data = np.asarray(train_data)
        train_label = np.asarray(train_label)
        self.train_data = train_data
        self.train_label = train_label

        total = train_data.shape[0]
        cut = int(total * 0.8)
        fit_data, val_data = train_data[:cut], train_data[cut:]
        fit_label, val_label = train_label[:cut], train_label[cut:]

        k_limit = total if total < 6 else 20
        # Never ask for more neighbours than there are reference points.
        k_limit = min(k_limit, cut + 1)
        candidates = range(2, k_limit)

        best_k = 1  # fallback when the data set is too small to validate
        if candidates and val_data.shape[0]:
            # Distances do not depend on k: rank the neighbours once for the
            # largest candidate and reuse the prefix for every smaller k.
            neighbours = _nearest_labels(
                val_data, fit_data, fit_label, candidates[-1]
            )
            truth = val_label.tolist()
            best_score = -1.0
            for k in candidates:
                hits = sum(
                    _majority(votes[:k]) == actual
                    for votes, actual in zip(neighbours, truth)
                )
                score = hits / len(truth)
                if score > best_score:  # strict: the smallest best k wins
                    best_score, best_k = score, k

        print("Best K: %d" % (best_k))
        self.k = best_k

    def predict(self, test_data):
        """Predict a label for every row of ``test_data``.

        Uses the full training set stored by :meth:`fit` and a majority vote
        among the ``self.k`` nearest neighbours (ties go to the label of the
        closest tied neighbour).

        Args:
            test_data: array-like of samples with the same feature layout as
                the training data.

        Returns:
            1-D ``numpy.ndarray`` of predicted labels, in the dtype of the
            training labels.

        Raises:
            RuntimeError: if called before :meth:`fit`.
            ValueError: if there is no training data to vote with.
        """
        if getattr(self, "k", None) is None:
            raise RuntimeError("fit() must be called before predict()")

        test_data = np.asarray(test_data)
        train_data = np.asarray(self.train_data)
        train_label = np.asarray(self.train_label)

        if test_data.shape[0] == 0:
            return np.array([], dtype=train_label.dtype)
        if train_data.shape[0] == 0:
            raise ValueError("cannot predict without training data")

        k = max(1, min(self.k, train_data.shape[0]))
        neighbours = _nearest_labels(test_data, train_data, train_label, k)
        return np.array(
            [_majority(votes) for votes in neighbours], dtype=train_label.dtype
        )


def generate(means=DEFAULT_MEANS, cov=None, n_per_class=400, train_ratio=0.8,
             path="data.npy"):
    """Sample one Gaussian cluster per class, shuffle, split and save to disk.

    Args:
        means: sequence of cluster centres; class ``i`` is drawn around
            ``means[i]``.
        cov: covariance matrix shared by all clusters; defaults to
            ``[[10, 0], [0, 10]]``.
        n_per_class: number of samples drawn for each class.
        train_ratio: fraction of the shuffled data kept as the training set.
        path: destination ``.npy`` file.
    """
    if cov is None:
        cov = np.array([[10, 0], [0, 10]])

    clusters = [
        np.random.multivariate_normal(mean, cov, (n_per_class,))
        for mean in means
    ]
    X = np.concatenate(clusters)
    Y = np.repeat(np.arange(len(means)), n_per_class)

    total = X.shape[0]
    shuffled_indices = np.random.permutation(total)
    data = X[shuffled_indices]
    label = Y[shuffled_indices]

    cut = int(total * train_ratio)
    train_data, test_data = data[:cut], data[cut:]
    train_label, test_label = label[:cut], label[cut:]

    # The four arrays have different shapes. Build the object array explicitly:
    # letting np.save convert a ragged nested tuple raises ValueError on
    # NumPy >= 1.24.
    bundle = np.empty((2, 2), dtype=object)
    bundle[0, 0], bundle[0, 1] = train_data, train_label
    bundle[1, 0], bundle[1, 1] = test_data, test_label
    np.save(path, bundle)


def read(path="data.npy"):
    """Load the ``(train_data, train_label), (test_data, test_label)`` bundle.

    NOTE: the file is an object array, so loading requires
    ``allow_pickle=True``. Only load files you produced yourself; unpickling
    untrusted data can execute arbitrary code.
    """
    (train_data, train_label), (test_data, test_label) = np.load(
        path, allow_pickle=True
    )
    return (train_data, train_label), (test_data, test_label)


def display(data, label, name):
    """Scatter-plot the first two features of ``data``, one colour per class.

    Args:
        data: array-like of samples with at least two feature columns.
        label: array-like of class labels, one per sample.
        name: identifier of the plot; currently unused.

    Raises:
        ImportError: if matplotlib is not installed.
    """
    if plt is None:
        raise ImportError("matplotlib is required for display()")

    data = np.asarray(data)
    label = np.asarray(label)
    # Iterate over the classes actually present instead of assuming four.
    for cls in np.unique(label):
        points = data[label == cls]
        plt.scatter(points[:, 0], points[:, 1])
    plt.show()


if __name__ == '__main__':
    generate()
    (train_data, train_label), (test_data, test_label) = read()
    display(train_data, train_label, 'train')
    display(test_data, test_label, 'test')

    model = KNN()
    model.fit(train_data, train_label)
    res = model.predict(test_data)
    print("acc =", np.mean(np.equal(res, test_label)))