diff --git a/assignment-1/submission/18340246016/README.md b/assignment-1/submission/18340246016/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c998960bf2dcbd812771d18b990d086ef231b740 --- /dev/null +++ b/assignment-1/submission/18340246016/README.md @@ -0,0 +1,485 @@ + **1. KNN实现过程** + +*1.1 辅助方法的实现* + +``` +# 这里我们用train_test_split实现训练集与验证集以给定的比例划分与打乱 +def train_test_split(self,x,y,rate): + shuffled_indexes = np.random.permutation(len(x)) + test_size = int(len(x) * rate) + train_index = shuffled_indexes[test_size:] + test_index = shuffled_indexes[:test_size] + return x[train_index], x[test_index], y[train_index], y[test_index] +``` + + +``` +# 用distance方法计算两组向量之间的欧式距离 +def distance(self,v1,v2): + + weight_array = (v1-v2)**2 + weight_array_sum = np.sum(weight_array) + return weight_array_sum**(0.5) +``` + + + +*1.2 fit方法的实现* + + +``` +# 我们用fit方法实现 1. 对训练数据的归一化 2. 训练数据内部subdivide为训练集与测试集,取最优K值 3. 将训练数据赋值self.data +def fit(self, train_data, train_label): + + # 归一化 + mu = np.mean(train_data, axis=0) + sigma = np.std(train_data, axis=0) + train_data = (train_data - mu) / sigma + + # 划分训练集/验证集 with rate =0.3 + X_train, X_test, Y_train, Y_test = self.train_test_split(train_data,train_label,0.3) + + # 对于不同的K[1-0.5*len(train)],计算验证集到训练集的欧氏距离 + best_k=0 + k_candi=0; + for k in range(1,int(0.5*len(X_train))+1): + + true_couter=0 + for test_counter in range(0,len(X_test)): + pos_vec_list=[] + + for train_counter in range(0,len(X_train)): + pos_vec = np.array([self.distance(X_test[test_counter],X_train[train_counter]),Y_train[train_counter]]) + pos_vec_list.append(pos_vec) + + #对距离list根据距离排序 + pos_vec_list = np.array(pos_vec_list) + pos_vec_list_sorted = pos_vec_list[np.lexsort(pos_vec_list[:,::-1].T)] + #k-近邻结果列表 + result_list = pos_vec_list_sorted[:k][:,1] + + + + #test预测结果 + label = int(result_list[np.argmax(result_list)]) + + + #检验本次test在给定k下是否正确 + if (label == Y_test[test_counter] ): + true_couter=true_couter+1 + + + #最优K取值 + if 
(true_couter >= best_k): + best_k = true_couter + k_candi = k + + self.k = k_candi + self.train_data = train_data + self.train_labels = train_label + return self.k +``` + + +*1.3 predict方法的实现* + +``` + # 根据fit方法带入的数据data以及训练结果K,预测test_label + def predict(self, test_data): + test_label=[] + result_list=[] + + # 归一化原始训练数据 + mu = np.mean(test_data, axis=0) + sigma = np.std(test_data, axis=0) + test_data = (test_data - mu) / sigma + + for i in range (0,len(test_data)): + pos_vec_list=[] + for m in range(0,len(self.train_data)): + pos_vec = np.array([self.distance(self.train_data[m],test_data[i]),self.train_labels[m]]) + pos_vec_list.append(pos_vec) + + + # KNN结果列表 + pos_vec_list = np.array(pos_vec_list) + pos_vec_list_sorted = pos_vec_list[np.lexsort(pos_vec_list[:,::-1].T)] + + result_list = pos_vec_list_sorted[:(self.k)][:,1] + test_label.append(result_list[np.argmax(result_list)]) + + return test_label +``` + + + + **2. 高斯分布探究试验** + +*2.0 生成以及展示函数* + + +``` +def generate (amount_1,amount_2,amount_3): + + mean = (2, 2) + cov = np.array([[1,0], [0, 1]]) + x = np.random.multivariate_normal(mean, cov, (amount_1,)) + + mean = (4, 6) + cov = np.array([[2, 0], [0, 2]]) + y = np.random.multivariate_normal(mean, cov, (amount_2,)) + + mean = (10, 10) + cov = np.array([[2,1],[1,3]]) + z = np.random.multivariate_normal(mean, cov, (amount_3,)) + + + data = np.concatenate([x,y,z]) + + label = np.concatenate([ + np.zeros((amount_1,),dtype=int), + np.ones((amount_2,),dtype=int), + np.ones((amount_3,),dtype=int)*2 + ]) + + return model.train_test_split(data,label,0.2) + +``` + + +``` +# 展示函数 +def display(x,y): + type1_x = []; type1_y = [] + type2_x = []; type2_y = [] + type3_x = []; type3_y = [] + + plt.figure(figsize=(8,6)) + + for i in range(0,len(x)): + if(y[i]==0): + type1_x.append(x[i][0]) + type1_y.append(x[i][1]) + if(y[i]==1): + type2_x.append(x[i][0]) + type2_y.append(x[i][1]) + if(y[i]==2): + type3_x.append(x[i][0]) + type3_y.append(x[i][1]) + + fig = plt.figure(figsize = 
(10, 6)) + ax = fig.add_subplot(111) + + type1 = ax.scatter(type1_x, type1_y, s = 30, c = 'brown') + type2 = ax.scatter(type2_x, type2_y, s = 30, c = 'lime') + type3 = ax.scatter(type3_x, type3_y, s = 30, c = "darkviolet") + + + + ax.legend((type1, type2, type3), ("A", "B", "C"), loc = 0) + + plt.show() + +``` + + + +***2.1 均值集中 + xy分布分散 = 对应的k*** + +$$ +\begin{array}{l} +\Sigma=\left[\begin{array}{cc} +4 & 0 \\\\ +0 & 2 +\end{array}\right] +\mu=\left[\begin{array}{ll} +10 & 10 +\end{array}\right] +\end{array} +$$ + + +$$ +\begin{array}{l} +\Sigma=\left[\begin{array}{cc} +2 & 0 \\\\ +0 & 4 +\end{array}\right] +\mu=\left[\begin{array}{ll} +8 & 8 +\end{array}\right] +\end{array} +$$ + +$$ +\begin{array}{l} +\Sigma=\left[\begin{array}{cc} +8 & 1 \\\\ +1 & 6 +\end{array}\right] +\mu=\left[\begin{array}{ll} +12 & 12 +\end{array}\right] +\end{array} +$$ + + +*训练集分布* + +![输入图片说明](https://images.gitee.com/uploads/images/2021/0401/111859_687c068b_8823823.png "屏幕截图.png") + +*测试集分布* + +![输入图片说明](https://images.gitee.com/uploads/images/2021/0401/111906_d0e2e134_8823823.png "屏幕截图.png") + +*K与准确率结果* +| k | 准确率 | +| ---------------- | ------ | +| 4 | 62.5% | + +***2.2 mean集中 + xy集中 = 对应的k*** + +$$ +\begin{array}{l} +\Sigma=\left[\begin{array}{cc} +1 & 0 \\\\ +0 & 1 +\end{array}\right] +\mu=\left[\begin{array}{ll} +10 & 10 +\end{array}\right] +\end{array} +$$ + + +$$ +\begin{array}{l} +\Sigma=\left[\begin{array}{cc} +2 & 0 \\\\ +0 & 1 +\end{array}\right] +\mu=\left[\begin{array}{ll} +8 & 8 +\end{array}\right] +\end{array} +$$ + +$$ +\begin{array}{l} +\Sigma=\left[\begin{array}{cc} +2 & 1 \\\\ +1 & 1 +\end{array}\right] +\mu=\left[\begin{array}{ll} +12 & 12 +\end{array}\right] +\end{array} +$$ + +*训练集分布* + +![训练集分布](https://images.gitee.com/uploads/images/2021/0401/105433_e7ec4619_8823823.png "屏幕截图.png") + +*测试集分布* + +![测试集分布](https://images.gitee.com/uploads/images/2021/0401/105459_56af4a90_8823823.png "屏幕截图.png") + +*K与准确率结果* +| k | 准确率 | +| ---------------- | ------ | +| 1 | 
78.75% | + + +***2.3 mean分散 + xy分散 = 对应的k*** + + +$$ +\begin{array}{l} +\Sigma=\left[\begin{array}{cc} +4 & 0 \\\\ +0 & 2 +\end{array}\right] +\mu=\left[\begin{array}{ll} +2 & 2 +\end{array}\right] +\end{array} +$$ + + +$$ +\begin{array}{l} +\Sigma=\left[\begin{array}{cc} +2 & 0 \\\\ +0 & 4 +\end{array}\right] +\mu=\left[\begin{array}{ll} +4 & 6 +\end{array}\right] +\end{array} +$$ + +$$ +\begin{array}{l} +\Sigma=\left[\begin{array}{cc} +8 & 1 \\\\ +1 & 6 +\end{array}\right] +\mu=\left[\begin{array}{ll} +10 & 10 +\end{array}\right] +\end{array} +$$ + + +*训练集分布* + +![输入图片说明](https://images.gitee.com/uploads/images/2021/0401/112426_09535d36_8823823.png "屏幕截图.png") + +*测试集分布* + +![输入图片说明](https://images.gitee.com/uploads/images/2021/0401/112437_53a32eec_8823823.png "屏幕截图.png") + +*K与准确率结果* + + +| k | 准确率 | +| ---------------- | ------ | +| 2 | 86.25% | + + +***2.4 mean分散 + xy集中 = 对应的k*** + + + + +$$ +\begin{array}{l} +\Sigma=\left[\begin{array}{cc} +1 & 0 \\\\ +0 & 1 +\end{array}\right] +\mu=\left[\begin{array}{ll} +2 & 2 +\end{array}\right] +\end{array} +$$ + + +$$ +\begin{array}{l} +\Sigma=\left[\begin{array}{cc} +2 & 0 \\\\ +0 & 2 +\end{array}\right] +\mu=\left[\begin{array}{ll} +4 & 6 +\end{array}\right] +\end{array} +$$ + +$$ +\begin{array}{l} +\Sigma=\left[\begin{array}{cc} +2 & 1 \\\\ +1 & 3 +\end{array}\right] +\mu=\left[\begin{array}{ll} +10 & 10 +\end{array}\right] +\end{array} +$$ + + +*训练集分布* + +![输入图片说明](https://images.gitee.com/uploads/images/2021/0401/112759_754208e4_8823823.png "屏幕截图.png") + +*测试集分布* + +![输入图片说明](https://images.gitee.com/uploads/images/2021/0401/112810_ca9d4230_8823823.png "屏幕截图.png") + +*K与准确率结果* + + +| k | 准确率 | +| ---------------- | ------ | +| 1 | 95.625% | + + +***2.5 有关高斯分布的结论*** +1. 与直观体会到的类似,模型准确率与高斯分布均值离散程度正相关,与xy方差负相关 +2. 模型在均值分散,方差集中的情况下表现最好,在均值集中,方差分散的情况下表现最差 +3. 最佳K的取值与准确率无直接联系,准确性更多取决于分布情况 + + + +**3. 
K值** + +***3.1 不同量下的best_k*** + +(这里以2.4高斯分布为例) + +``` +# 准备数据集维度 +amount_list = [[10,10,10],[50,50,50],[100,100,100],[150,50,200],[200,200,200],[250,300,400]] +k_list=[] +aug_amount = [30,150,300,400,600,950] + +# 在不同数据集量下训练最佳k值 +for i in range (0,len(amount_list)): + model = KNN() + X_train, X_test, Y_train, Y_test = generate(amount_list[i][0],amount_list[i][1],amount_list[i][2]) + k_list.append(model.fit(X_train, Y_train)) + +# 画图 +l1=plt.plot(aug_amount,k_list,'r--',label='Best-K-Value') +plt.title('The relationship between # elements and |K|') +plt.xlabel('Total # of elements') +plt.ylabel('K-value') +plt.legend() + +``` +![输入图片说明](https://images.gitee.com/uploads/images/2021/0401/120033_088f8aa7_8823823.png "屏幕截图.png") + + +1. 最佳K的取值与样本数量没有明显关系 +2. 最佳K会趋向于一个较小的值,所以在fit时考虑遍历[1,10],无需遍历更大的K(导致额外无用运算) + + +***3.2 不同K下的acc*** + + +``` +# 因为我实现的KNN函数没有K参数(学习最优K),这里采用sklearn库中的KNeighborsClassifier来做本次探究 +from sklearn.datasets import load_iris +from sklearn.model_selection import train_test_split +from sklearn.neighbors import KNeighborsClassifier + +# 导入sklearn iris数据集并划分训练集/测试集 +iris = load_iris() +X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size = 0.25, random_state = 33) + +# 在不同k值下计算accuracy +acc_list=[] + +for i in range (1,100): + + + knc = KNeighborsClassifier(n_neighbors=i) + knc.fit(X_train, y_train) + y_predict = knc.predict(X_test) + + acc_list.append(knc.score(X_test, y_test)) + +# 画图 +l1=plt.plot(range(1,100),acc_list,'green',label='Accuracy') +plt.title('The relationship between K-choice and Accuracy') +plt.xlabel('K-Value') +plt.ylabel('Accuracy') +plt.legend() + +``` +![输入图片说明](https://images.gitee.com/uploads/images/2021/0401/122104_30809a7a_8823823.png "屏幕截图.png") + +1. 同3.1结论[2],最佳K的取值会聚焦于一个较小的值 +2. 
iris数据集大小为150,可以看到超过0.5*len 之后准确率锁定在50%(几乎约等于瞎猜) diff --git a/assignment-1/submission/18340246016/img/.keep b/assignment-1/submission/18340246016/img/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/assignment-1/submission/18340246016/img/105433_e7ec4619_8823823.png b/assignment-1/submission/18340246016/img/105433_e7ec4619_8823823.png new file mode 100644 index 0000000000000000000000000000000000000000..2b3ebe23a4c696d374084525296b704bc1e94461 Binary files /dev/null and b/assignment-1/submission/18340246016/img/105433_e7ec4619_8823823.png differ diff --git a/assignment-1/submission/18340246016/img/105459_56af4a90_8823823.png b/assignment-1/submission/18340246016/img/105459_56af4a90_8823823.png new file mode 100644 index 0000000000000000000000000000000000000000..8ecab29162d04ce0e20d91129786ace0133752dd Binary files /dev/null and b/assignment-1/submission/18340246016/img/105459_56af4a90_8823823.png differ diff --git a/assignment-1/submission/18340246016/img/111859_687c068b_8823823.png b/assignment-1/submission/18340246016/img/111859_687c068b_8823823.png new file mode 100644 index 0000000000000000000000000000000000000000..38182b7678184b3e73071c3a0b77496698405b6c Binary files /dev/null and b/assignment-1/submission/18340246016/img/111859_687c068b_8823823.png differ diff --git a/assignment-1/submission/18340246016/img/111906_d0e2e134_8823823.png b/assignment-1/submission/18340246016/img/111906_d0e2e134_8823823.png new file mode 100644 index 0000000000000000000000000000000000000000..b5fd60ce1e7218c3359d9739ae304ead9ee6e16b Binary files /dev/null and b/assignment-1/submission/18340246016/img/111906_d0e2e134_8823823.png differ diff --git a/assignment-1/submission/18340246016/img/112426_09535d36_8823823.png b/assignment-1/submission/18340246016/img/112426_09535d36_8823823.png new file mode 100644 index 0000000000000000000000000000000000000000..a1db91fe434924aafbc734b235647d73fb4229dd Binary files 
/dev/null and b/assignment-1/submission/18340246016/img/112426_09535d36_8823823.png differ diff --git a/assignment-1/submission/18340246016/img/112437_53a32eec_8823823.png b/assignment-1/submission/18340246016/img/112437_53a32eec_8823823.png new file mode 100644 index 0000000000000000000000000000000000000000..7731028adc12f28c461efc7240b6e5fa95c12702 Binary files /dev/null and b/assignment-1/submission/18340246016/img/112437_53a32eec_8823823.png differ diff --git a/assignment-1/submission/18340246016/img/112759_754208e4_8823823.png b/assignment-1/submission/18340246016/img/112759_754208e4_8823823.png new file mode 100644 index 0000000000000000000000000000000000000000..913e3801971de18d4f609181dad97957b03a142b Binary files /dev/null and b/assignment-1/submission/18340246016/img/112759_754208e4_8823823.png differ diff --git a/assignment-1/submission/18340246016/img/112810_ca9d4230_8823823.png b/assignment-1/submission/18340246016/img/112810_ca9d4230_8823823.png new file mode 100644 index 0000000000000000000000000000000000000000..34ea7339a55877d026ae45bc7d32ee349f4b6533 Binary files /dev/null and b/assignment-1/submission/18340246016/img/112810_ca9d4230_8823823.png differ diff --git a/assignment-1/submission/18340246016/img/120033_088f8aa7_8823823.png b/assignment-1/submission/18340246016/img/120033_088f8aa7_8823823.png new file mode 100644 index 0000000000000000000000000000000000000000..48293c8c963882364c7642e833597be03e206027 Binary files /dev/null and b/assignment-1/submission/18340246016/img/120033_088f8aa7_8823823.png differ diff --git a/assignment-1/submission/18340246016/img/122104_30809a7a_8823823.png b/assignment-1/submission/18340246016/img/122104_30809a7a_8823823.png new file mode 100644 index 0000000000000000000000000000000000000000..0343da404b4c87d03aebfde3016b60e6b55cb836 Binary files /dev/null and b/assignment-1/submission/18340246016/img/122104_30809a7a_8823823.png differ diff --git a/assignment-1/submission/18340246016/source.py 
import numpy as np


class KNN:
    """k-nearest-neighbours classifier with z-score feature normalisation.

    ``fit`` standardises the training data, holds out 30% of it as a
    validation set to pick ``k``, and keeps the standardised data.
    ``predict`` standardises queries with the statistics learned in ``fit``
    and labels each query by majority vote among its ``k`` nearest
    training samples (Euclidean distance).
    """

    def __init__(self):
        self.train_data = None    # standardised training samples, set by fit
        self.train_labels = None  # labels aligned with train_data
        self.k = None             # number of neighbours chosen by fit
        self.mu = None            # per-feature mean of the raw training data
        self.sigma = None         # per-feature std (zeros replaced by 1)

    def train_test_split(self, x, y, rate):
        """Shuffle ``x``/``y`` together and split off a fraction ``rate``.

        Args:
            x: samples, indexable along the first axis.
            y: labels aligned with ``x``.
            rate: fraction of samples placed in the held-out part; the
                held-out size is ``int(len(x) * rate)``.

        Returns:
            ``(x_train, x_test, y_train, y_test)`` as numpy arrays.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        shuffled = np.random.permutation(len(x))
        test_size = int(len(x) * rate)
        test_index = shuffled[:test_size]
        train_index = shuffled[test_size:]
        return x[train_index], x[test_index], y[train_index], y[test_index]

    def distance(self, v1, v2):
        """Return the Euclidean distance between vectors ``v1`` and ``v2``."""
        diff = np.asarray(v1) - np.asarray(v2)
        return np.sum(diff ** 2) ** 0.5

    @staticmethod
    def _sorted_neighbour_labels(points, labels, query):
        """Return ``labels`` ordered from the nearest point to the farthest."""
        dists = np.sqrt(np.sum((points - query) ** 2, axis=1))
        # A stable sort keeps equidistant points in their original order.
        return labels[np.argsort(dists, kind="stable")]

    @staticmethod
    def _majority(labels):
        """Return the most frequent label; ties go to the smallest label."""
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    def fit(self, train_data, train_label, max_k=19):
        """Standardise the data, choose ``k`` on a validation split, store it.

        Args:
            train_data: 2-D array-like of shape (n_samples, n_features).
            train_label: 1-D array-like of n_samples labels.
            max_k: largest ``k`` tried (default 19, the range the original
                implementation searched); capped at the size of the
                internal training split.

        Returns:
            The selected ``k``.

        Raises:
            ValueError: on empty input, mismatched lengths or ``max_k < 1``.
        """
        train_data = np.asarray(train_data, dtype=float)
        train_label = np.asarray(train_label)
        if len(train_data) == 0:
            raise ValueError("train_data must contain at least one sample")
        if len(train_data) != len(train_label):
            raise ValueError("train_data and train_label differ in length")
        if max_k < 1:
            raise ValueError("max_k must be at least 1")

        mu = np.mean(train_data, axis=0)
        sigma = np.std(train_data, axis=0)
        # A constant feature has zero std; dividing by 1 leaves it at 0
        # instead of producing NaN/inf.
        sigma = np.where(sigma == 0, 1.0, sigma)
        train_data = (train_data - mu) / sigma

        x_train, x_val, y_train, y_val = self.train_test_split(
            train_data, train_label, 0.3
        )

        # Neighbour order does not depend on k, so rank once per validation
        # sample and reuse the ranking for every candidate k.
        ranked = [
            self._sorted_neighbour_labels(x_train, y_train, query)
            for query in x_val
        ]

        best_hits = 0
        best_k = 0
        for k in range(1, min(max_k, len(x_train)) + 1):
            hits = int(sum(
                self._majority(order[:k]) == truth
                for order, truth in zip(ranked, y_val)
            ))
            # ">=" keeps the original tie-breaking: the largest k among
            # equally accurate candidates wins.
            if hits >= best_hits:
                best_hits = hits
                best_k = k

        self.k = best_k
        self.mu = mu
        self.sigma = sigma
        self.train_data = train_data
        self.train_labels = train_label
        return self.k

    def predict(self, test_data):
        """Predict a label for every row of ``test_data``.

        Queries are standardised with the mean/std computed in ``fit`` so
        they live in the same feature space as the stored training data.

        Args:
            test_data: 2-D array-like of shape (n_queries, n_features).

        Returns:
            A list with one predicted label per query.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.train_data is None:
            raise RuntimeError("fit must be called before predict")

        test_data = (np.asarray(test_data, dtype=float) - self.mu) / self.sigma
        return [
            self._majority(
                self._sorted_neighbour_labels(
                    self.train_data, self.train_labels, query
                )[:self.k]
            )
            for query in test_data
        ]


def generate(amount_1, amount_2, amount_3):
    """Sample three 2-D Gaussian classes and split them 80/20.

    Class 0 ~ N((2, 2), I), class 1 ~ N((4, 6), 2I) and
    class 2 ~ N((10, 10), [[2, 1], [1, 3]]).

    Args:
        amount_1: number of class-0 samples.
        amount_2: number of class-1 samples.
        amount_3: number of class-2 samples.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` from ``KNN.train_test_split``
        with a held-out rate of 0.2.
    """
    mean = (2, 2)
    cov = np.array([[1, 0], [0, 1]])
    x = np.random.multivariate_normal(mean, cov, (amount_1,))

    mean = (4, 6)
    cov = np.array([[2, 0], [0, 2]])
    y = np.random.multivariate_normal(mean, cov, (amount_2,))

    mean = (10, 10)
    cov = np.array([[2, 1], [1, 3]])
    z = np.random.multivariate_normal(mean, cov, (amount_3,))

    data = np.concatenate([x, y, z])
    label = np.concatenate([
        np.zeros((amount_1,), dtype=int),
        np.ones((amount_2,), dtype=int),
        np.ones((amount_3,), dtype=int) * 2,
    ])

    # The split helper keeps no state, so use a throwaway instance rather
    # than relying on a global ``model`` defined by the calling script.
    return KNN().train_test_split(data, label, 0.2)


def display(x, y):
    """Scatter-plot 2-D samples ``x`` coloured by their label ``y`` (0/1/2).

    NOTE(review): relies on ``plt`` (presumably ``matplotlib.pyplot``) being
    available at module scope; this file does not import it - confirm how
    the module is meant to be run.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(111)

    handles = []
    for cls, colour in ((0, 'brown'), (1, 'lime'), (2, "darkviolet")):
        points = x[y == cls]
        handles.append(ax.scatter(points[:, 0], points[:, 1], s=30, c=colour))

    ax.legend(tuple(handles), ("A", "B", "C"), loc=0)

    plt.show()