diff --git a/assignment-3/submission/16307130040/README.md b/assignment-3/submission/16307130040/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dfaae6f5b35787f02947646ab7d719e8290fc268 --- /dev/null +++ b/assignment-3/submission/16307130040/README.md @@ -0,0 +1,271 @@ +# 实验报告3 + + + +## 1,KMeans 和 GaussianMixture 聚类算法的实现 + + + +#### 1,KMeans的实现 + + + +首先,是KMeans的实现。在fit之前要初始化,我的做法是随机选择数据中的n_clusters个点作为初始的聚簇中心。 + +训练的实现较为简单,在limit变量规定的次数中,反复进行以下两步: + + + +1,为数据的每一个点寻找最近的聚簇中心,并将它归于聚簇中。最终,输出一个表示每个点所属的聚簇的数组idx。这一步由方法FindClosestCenters(self,train_data)实现。 + + + +2,将每个聚簇中所有点的坐标进行平均,最终得出新的聚簇中心。这一步由方法ComputeNewCenters(self,idx,train_data)实现。 + +其中,聚簇中心存储在类的成员变量centers中。 + + + +预测的时候,使用的逻辑与FindClosestCenters相似。不同的是,输入的数据由训练数据变成了测试数据。 + + + +#### 2,GaussianMixture的实现 + + + +之后,是 GaussianMixture 的实现。初始的方法与Kmeans相同。 + + + +在训练中,GaussianMixture类使用了以下数据结构: + +*self.mean*:每个聚簇的平均值,格式为k*n。 +*self.var*:每个聚簇的方差,格式为k*n。在计算概率的时候,会将方差展开为对角矩阵。 +*self.W*:每个数据属于不同的聚簇的可能性,格式为m*k。W[i][j]为第i个数据属于第j个聚簇的概率。 +*self.Pi*:对随机的一个数据,它属于各个聚簇的可能性构成的数组。可以由W计算得出,但是为了计算方便,使用了独立的Pi变量。 + + + +在训练中,GaussianMixture类会在limit变量规定的次数中,反复进行以下两步: + + + +##### 1,E步 + +在E步中,我们会更新聚簇的分布。使用贝叶斯公式来重新计算每个数据属于各个聚簇的概率: + +![](./img/1.PNG) + +之后,再更新pi变量: + +![](./img/2.PNG) + +代码如下: + +```python + def step_E(self,train_data): + #update W matrix + m, n = train_data.shape + new_W=np.zeros((m,self.k)) + + for i in range(self.k): + for j in range(m): + new_W[j,i]=self.Pi[i]*self.gauss_prob(train_data[j],self.mean[i],np.diag(self.var[i])) + + for j in range(m): + sum=new_W[j].sum() + if sum != 0: + self.W[j]=new_W[j]/new_W[j].sum() + self.Pi = self.W.sum(axis=0) / self.W.sum() +``` + +这里用到了高斯分布的概率公式。这里使用了numpy进行实现,并在计算前提前考虑方差为0的情况: + +```python + def gauss_prob(self,x,mean,var): + #calculate the probability of gauss distribution by np + if not np.any(var): + n=var.shape[0] + var=np.diag(np.ones(n)/100) + + y = np.exp((-1 / 2) * (x - mean).T.dot(np.linalg.inv(var)).dot(x - mean)) / np.sqrt( +
np.power(2 * np.pi, len(x)) * np.linalg.det(var) + ) + return y +``` + + + +##### 2,M步 + +在M步中,更新每个聚簇的均值和方差。具体说来,实现方法是基于W矩阵,计算数据坐标的均值和方差的均值: + +![](.\img\3.png) + +![](.\img\4.png) + +以下是代码: + +```python + def step_M(self,train_data): + m, n = train_data.shape + + #update the mean and var of each cluster + new_mean=np.zeros((self.k,n)) + new_var=np.zeros((self.k,n)) + for i in range(self.k): + new_mean[i]=np.average(train_data,axis=0,weights=np.array(self.W[:,i])) + new_var[i]=np.average((train_data-new_mean[i])**2, axis=0, weights=np.array(self.W[:, i])) + self.mean=new_mean + self.var=new_var +``` + + + +预测的时候,将测试数据代入每一个聚簇的高斯概率公式中进行计算,得出该数据点属于各类的概率。最终,选取概率最大的聚簇。 + + + +## 2,基础实验 + +基础实验中,我生成了如下2700个数据,属于三个二维高斯分布模型: + +```python + mean = (1, 2) + cov = np.array([[80, 0], [0, 22]]) + data_x = np.random.multivariate_normal(mean, cov, (800,)) + + mean = (16, -5) + cov = np.array([[40, 0], [0, 32.1]]) + data_y = np.random.multivariate_normal(mean, cov, (900,)) + + mean = (-6, 10) + cov = np.array([[20, 10], [10, 20]]) + data_z = np.random.multivariate_normal(mean, cov, (1000,)) +``` + +之后,混合,贴上标签,并洗匀之后,将其中80%作为训练数据,20%作为测试数据。 + +这是训练数据: + +![](.\img\test1_1.png) + +这是测试数据: + +![](.\img\test1_2.png) + +然后,通过KMeans模型进行聚类操作: + +![](.\img\test1_3.png) + +之后,用GaussianMixture模型进行聚类: + +![](.\img\test1_4.png) + +通过上方的准确率可以看出,高斯混合模型比KMeans的表现要好。 + +因为聚类模型采用的标签和数据原来的标签很有可能不一致,所以如果简单地进行一一比对的话,是不会得出准确的准确率的。所以,使用这样的方法来计算准确率:将聚类模型采用的标签不断地进行轮换,再一一比较,计算出每一个轮换中的准确率,并将最高的准确率作为这个模型的准确率。 + + + +```python +def calculate_accu(label_predict,label_test): + #when calculate the accu ,try all probables of label combination + rotates=np.array([[0,1,2],[0,2,1],[1,0,2],[1,2,0],[2,0,1],[2,1,0]]) + label_predict_rotated=[] + for rotate in rotates: + label_predict_temp=np.array(label_predict) + label_predict_temp=np.where(label_predict_temp != 0, label_predict_temp, -1) + label_predict_temp=np.where(label_predict_temp != 1, label_predict_temp, -2) + label_predict_temp=np.where(label_predict_temp != 
2, label_predict_temp, -3) + + label_predict_temp=np.where(label_predict_temp != -1, label_predict_temp, rotate[0]) + label_predict_temp=np.where(label_predict_temp != -2, label_predict_temp, rotate[1]) + label_predict_temp=np.where(label_predict_temp != -3, label_predict_temp, rotate[2]) + label_predict_rotated.append(label_predict_temp) + + correct_max = np.count_nonzero((label_predict == label_test)) + for label_predict_temp in label_predict_rotated: + correct = np.count_nonzero((label_predict_temp == label_test)) + if correct>correct_max: + correct_max=correct + + accurate= correct_max/len(label_test) + return accurate +``` + +该实验所有的代码在test1.py中。 + +## 3, 自动选择聚簇数量的实验 + + + +首先说明选择聚簇数的思路。每当选择聚簇数,并训练数据之后,计算出每个聚簇的直径的平均数。一般来说,随着聚簇的数量增加,平均直径会减小。但是,在达到某一个值之后,直径的下降会明显缓慢,有时反而会微量地增加。这个临界值就是最佳的聚簇数。 + +该实验中,生成了如下2700个数据,属于三个二维高斯分布模型: + +```python + mean = (-10, 2) + cov = np.array([[6, 0], [0, 12]]) + data_x = np.random.multivariate_normal(mean, cov, (800,)) + + mean = (-10, -5) + cov = np.array([[14, 0], [0, 14]]) + data_y = np.random.multivariate_normal(mean, cov, (900,)) + + mean = (0, 0) + cov = np.array([[15, 14], [14, 15]]) + data_z = np.random.multivariate_normal(mean, cov, (1000,)) + +``` + +之后,混合,贴上标签,并洗匀之后,将其中80%作为训练数据,20%作为测试数据。 + +这是训练数据: + +![](.\img\test2_1.png) + +这是测试数据: + +![](.\img\test2_2.png) + +之后,用自动选择聚簇数的KMeans模型进行聚类: + +![](.\img\test2_3.png) + +这里模型自动认为这组数据可以分为三类。 + + + +不过,当三个类靠得更近一些的时候,会发生一些不同的情况: + +```python + mean = (2, 2) + cov = np.array([[6, 0], [0, 12]]) + data_x = np.random.multivariate_normal(mean, cov, (800,)) + + mean = (-5, -5) + cov = np.array([[7, 2], [2, 7]]) + data_y = np.random.multivariate_normal(mean, cov, (900,)) + + mean = (0, 0) + cov = np.array([[5, 0], [0, 5]]) + data_z = np.random.multivariate_normal(mean, cov, (1000,)) +``` + +![](.\img\test2_4.png) + +![](.\img\test2_5.png) + +此时,模型将数据分为了四个聚簇: + +![](.\img\test2_7.png) + + + +在观察后,发现这种分类方式的确更为合适。 + + + +该实验所有的代码在test2.py中。 \ No newline at end of file diff --git 
a/assignment-3/submission/16307130040/img/1.PNG b/assignment-3/submission/16307130040/img/1.PNG new file mode 100644 index 0000000000000000000000000000000000000000..ba4e5cfd1bf0c264ad2bade06bd601e7c7c29f34 Binary files /dev/null and b/assignment-3/submission/16307130040/img/1.PNG differ diff --git a/assignment-3/submission/16307130040/img/2.PNG b/assignment-3/submission/16307130040/img/2.PNG new file mode 100644 index 0000000000000000000000000000000000000000..6d08cca9311064e3c4eef0f332e89a55c734e2ae Binary files /dev/null and b/assignment-3/submission/16307130040/img/2.PNG differ diff --git a/assignment-3/submission/16307130040/img/3.PNG b/assignment-3/submission/16307130040/img/3.PNG new file mode 100644 index 0000000000000000000000000000000000000000..8ae5b5c31d972a554f4f1b8431b5296b76481c7f Binary files /dev/null and b/assignment-3/submission/16307130040/img/3.PNG differ diff --git a/assignment-3/submission/16307130040/img/4.PNG b/assignment-3/submission/16307130040/img/4.PNG new file mode 100644 index 0000000000000000000000000000000000000000..6c5aac59293cf8f48fdfedb635fbfddaa83d52aa Binary files /dev/null and b/assignment-3/submission/16307130040/img/4.PNG differ diff --git a/assignment-3/submission/16307130040/img/test1_1.png b/assignment-3/submission/16307130040/img/test1_1.png new file mode 100644 index 0000000000000000000000000000000000000000..d584f2519c65b8174b97eed171408275c91694ed Binary files /dev/null and b/assignment-3/submission/16307130040/img/test1_1.png differ diff --git a/assignment-3/submission/16307130040/img/test1_2.png b/assignment-3/submission/16307130040/img/test1_2.png new file mode 100644 index 0000000000000000000000000000000000000000..1196ccae6d9aefd14e5a000692aa12456fcfbbea Binary files /dev/null and b/assignment-3/submission/16307130040/img/test1_2.png differ diff --git a/assignment-3/submission/16307130040/img/test1_3.png b/assignment-3/submission/16307130040/img/test1_3.png new file mode 100644 index 
0000000000000000000000000000000000000000..9e2270b9996c24d06342387d753c76e2c7e8bde2 Binary files /dev/null and b/assignment-3/submission/16307130040/img/test1_3.png differ diff --git a/assignment-3/submission/16307130040/img/test1_4.png b/assignment-3/submission/16307130040/img/test1_4.png new file mode 100644 index 0000000000000000000000000000000000000000..a93fbb5b131e476033aef945880543ddc4a739db Binary files /dev/null and b/assignment-3/submission/16307130040/img/test1_4.png differ diff --git a/assignment-3/submission/16307130040/img/test2_1.png b/assignment-3/submission/16307130040/img/test2_1.png new file mode 100644 index 0000000000000000000000000000000000000000..7298a4bc4f355326ce96ba5b4aa74725d08f7a9e Binary files /dev/null and b/assignment-3/submission/16307130040/img/test2_1.png differ diff --git a/assignment-3/submission/16307130040/img/test2_2.png b/assignment-3/submission/16307130040/img/test2_2.png new file mode 100644 index 0000000000000000000000000000000000000000..c954d088696d3bee2203fbc292f898685053b336 Binary files /dev/null and b/assignment-3/submission/16307130040/img/test2_2.png differ diff --git a/assignment-3/submission/16307130040/img/test2_3.png b/assignment-3/submission/16307130040/img/test2_3.png new file mode 100644 index 0000000000000000000000000000000000000000..b6d27ac32507dbb22615c2b91e8ab6b0aa05c938 Binary files /dev/null and b/assignment-3/submission/16307130040/img/test2_3.png differ diff --git a/assignment-3/submission/16307130040/img/test2_4.png b/assignment-3/submission/16307130040/img/test2_4.png new file mode 100644 index 0000000000000000000000000000000000000000..83c567792cb6df19bbce2ef15432f631989d1b14 Binary files /dev/null and b/assignment-3/submission/16307130040/img/test2_4.png differ diff --git a/assignment-3/submission/16307130040/img/test2_5.png b/assignment-3/submission/16307130040/img/test2_5.png new file mode 100644 index 0000000000000000000000000000000000000000..8a5c30ad7292e8fbe2727eea18054b9bb1cf7c52 Binary files 
import numpy as np

# Variance substituted for an exactly-zero variance so that Gaussian densities
# stay finite (same value the original all-zero fallback used).
_ZERO_VAR_FALLBACK = 0.01


def _pick_initial_points(data, k):
    """Return ``k`` rows of ``data`` to seed cluster centres / means.

    Seeds are drawn without replacement from the distinct rows of ``data`` so
    that two clusters never start on the same coordinates. Only when there are
    fewer distinct rows than ``k`` does sampling fall back to replacement.

    Raises:
        ValueError: if ``data`` has no rows.
    """
    if data.shape[0] == 0:
        raise ValueError("cannot initialise clusters from an empty data set")
    candidates = np.unique(data, axis=0)
    with_replacement = candidates.shape[0] < k
    chosen = np.random.choice(candidates.shape[0], k, replace=with_replacement)
    return candidates[chosen].astype(float)


def _mean_squared_diameter(data, idx, k):
    """Average, over non-empty clusters, of the largest squared pairwise distance.

    ``idx[i]`` is the cluster index of ``data[i]``. Empty clusters are skipped.
    The inner distance computation is vectorised, so only one Python-level loop
    over the members of each cluster remains.
    """
    diameters = []
    for cluster in range(k):
        members = data[idx == cluster]
        if members.shape[0] == 0:
            continue
        widest = 0.0
        for point in members:
            reach = ((members - point) ** 2).sum(axis=1).max()
            if reach > widest:
                widest = reach
        diameters.append(widest)
    return np.average(diameters)


class KMeans:
    """Lloyd's k-means clustering seeded with randomly chosen data points.

    Attributes:
        k: number of clusters.
        limit: maximum number of assignment/update rounds run by ``fit``.
        centers: ``(k, n)`` array of cluster centres after ``fit``.
        labels: cluster index of every training point after ``fit``.
    """

    def __init__(self, n_clusters):
        self.limit = 50
        self.labels = []
        self.centers = []
        self.k = n_clusters

    def FindClosestCenters(self, train_data):
        """Return, as a float array, the index of the nearest centre per row.

        Ties are resolved in favour of the lowest centre index.
        """
        data = np.asarray(train_data, dtype=float)
        centers = np.asarray(self.centers, dtype=float)
        # (m, 1, n) - (1, k, n) -> (m, k) squared Euclidean distances.
        sq_dist = ((data[:, np.newaxis, :] - centers[np.newaxis, :, :]) ** 2).sum(axis=2)
        return sq_dist.argmin(axis=1).astype(float)

    def ComputeNewCenters(self, idx, train_data):
        """Move every centre to the mean of the points assigned to it.

        A centre with no assigned points is left where it is.
        """
        data = np.asarray(train_data, dtype=float)
        idx = np.asarray(idx)
        for cluster in range(self.k):
            members = data[idx == cluster]
            if members.shape[0] != 0:
                self.centers[cluster] = members.mean(axis=0)

    def ComputeTheDiameter(self, train_data):
        """Return the mean squared diameter of the clusters of ``train_data``.

        The "diameter" of a cluster is the largest *squared* Euclidean distance
        between two of its members; empty clusters are ignored.
        """
        data = np.asarray(train_data, dtype=float)
        return _mean_squared_diameter(data, self.FindClosestCenters(data), self.k)

    def fit(self, train_data):
        """Cluster ``train_data`` (shape ``(m, n)``) into ``self.k`` groups.

        Runs at most ``self.limit`` rounds and stops early once the centres no
        longer move.

        Raises:
            ValueError: if ``train_data`` has no rows.
        """
        data = np.asarray(train_data, dtype=float)
        self.centers = _pick_initial_points(data, self.k)

        for _ in range(self.limit):
            previous = np.array(self.centers, dtype=float)
            self.ComputeNewCenters(self.FindClosestCenters(data), data)
            if np.array_equal(previous, self.centers):
                break  # converged: another round would change nothing
        self.labels = self.FindClosestCenters(data)

    def predict(self, test_data):
        """Return, as a float array, the nearest-centre index for each test row."""
        return self.FindClosestCenters(test_data)


class GaussianMixture:
    """Gaussian mixture with diagonal covariances, trained by EM.

    Attributes:
        k: number of components.
        mean: ``(k, n)`` component means.
        var: ``(k, n)`` per-dimension component variances.
        W: ``(m, k)`` responsibilities; ``W[i, j]`` is the probability that
            training point ``i`` belongs to component ``j``.
        Pi: ``(k,)`` mixing weights.
        limit: number of EM rounds run by ``fit``.
    """

    def __init__(self, n_clusters):
        self.k = n_clusters
        self.mean = []
        self.var = []
        self.W = []
        self.Pi = []
        self.limit = 5

    def gauss_prob(self, x, mean, var):
        """Return the multivariate normal density of ``x``.

        ``var`` is a covariance matrix. Diagonal entries that are exactly zero
        are replaced by ``_ZERO_VAR_FALLBACK`` so a degenerate dimension does
        not make the matrix singular (an all-zero matrix therefore becomes
        ``0.01 * I``, as before).
        """
        x = np.asarray(x, dtype=float)
        mean = np.asarray(mean, dtype=float)
        cov = np.array(var, dtype=float)  # copy: the caller's matrix is not modified
        zero_diag = np.flatnonzero(np.diag(cov) == 0)
        cov[zero_diag, zero_diag] = _ZERO_VAR_FALLBACK

        diff = x - mean
        exponent = -0.5 * diff.dot(np.linalg.solve(cov, diff))
        norm = np.sqrt(np.power(2 * np.pi, len(x)) * np.linalg.det(cov))
        return np.exp(exponent) / norm

    def _log_density(self, data):
        """Return the ``(m, k)`` log-density of every row under every component."""
        mean = np.asarray(self.mean, dtype=float)
        var = np.asarray(self.var, dtype=float)
        var = np.where(var == 0, _ZERO_VAR_FALLBACK, var)
        diff = data[:, np.newaxis, :] - mean[np.newaxis, :, :]
        mahalanobis = (diff ** 2 / var[np.newaxis, :, :]).sum(axis=2)
        log_norm = np.log(2 * np.pi * var).sum(axis=1)
        return -0.5 * (mahalanobis + log_norm)

    def step_E(self, train_data):
        """E-step: recompute responsibilities ``W`` and mixing weights ``Pi``.

        Works in log space so points far from every component still receive
        meaningful responsibilities instead of an all-zero row. A row whose
        weights are all zero even after rescaling keeps its previous values.
        """
        data = np.asarray(train_data, dtype=float)
        m = data.shape[0]
        with np.errstate(divide="ignore"):  # log(0) for an extinct component is fine
            log_w = np.log(np.asarray(self.Pi, dtype=float)) + self._log_density(data)

        row_max = log_w.max(axis=1, keepdims=True)
        row_max = np.where(np.isfinite(row_max), row_max, 0.0)
        weights = np.exp(log_w - row_max)
        totals = weights.sum(axis=1)

        previous = np.asarray(self.W, dtype=float)
        if previous.shape == (m, self.k):
            new_W = previous.copy()
        else:
            new_W = np.ones((m, self.k)) / self.k
        alive = totals > 0
        new_W[alive] = weights[alive] / totals[alive, np.newaxis]

        self.W = new_W
        self.Pi = self.W.sum(axis=0) / self.W.sum()

    def step_M(self, train_data):
        """M-step: re-estimate each component's mean and per-dimension variance.

        A component whose total responsibility is zero keeps its previous
        parameters instead of triggering a division by zero.
        """
        data = np.asarray(train_data, dtype=float)
        n = data.shape[1]
        weights = np.asarray(self.W, dtype=float)

        new_mean = np.zeros((self.k, n))
        new_var = np.zeros((self.k, n))
        old_mean = np.asarray(self.mean, dtype=float)
        old_var = np.asarray(self.var, dtype=float)
        if old_mean.shape == new_mean.shape:
            new_mean[:] = old_mean
        if old_var.shape == new_var.shape:
            new_var[:] = old_var

        for i in range(self.k):
            w = weights[:, i]
            total = w.sum()
            if total <= 0:
                continue
            new_mean[i] = w.dot(data) / total
            new_var[i] = w.dot((data - new_mean[i]) ** 2) / total
        self.mean = new_mean
        self.var = new_var

    def ComputeTheDiameter(self, train_data):
        """Return the mean squared diameter of the predicted clusters.

        Same definition as ``KMeans.ComputeTheDiameter``.
        """
        data = np.asarray(train_data, dtype=float)
        return _mean_squared_diameter(data, self.predict(data), self.k)

    def fit(self, train_data):
        """Fit the mixture to ``train_data`` (shape ``(m, n)``) with EM.

        Means start on randomly chosen distinct data points, variances at 1 and
        responsibilities uniform; ``self.limit`` E/M rounds follow.

        Raises:
            ValueError: if ``train_data`` has no rows.
        """
        data = np.asarray(train_data, dtype=float)
        m, n = data.shape
        self.mean = _pick_initial_points(data, self.k)
        self.var = np.ones((self.k, n))
        self.W = np.ones((m, self.k)) / self.k
        self.Pi = self.W.sum(axis=0) / self.W.sum()

        for _ in range(self.limit):
            self.step_E(data)
            self.step_M(data)

    def predict(self, test_data):
        """Return, as a float array, the component with the highest density per row.

        NOTE(review): as in the original implementation, the mixing weights
        ``Pi`` are not applied here; the choice is by component density only.
        Ties go to the lowest component index.
        """
        data = np.asarray(test_data, dtype=float)
        return self._log_density(data).argmax(axis=1).astype(float)


class ClusteringAlgorithm:
    """K-means that picks the number of clusters with a diameter "elbow" rule.

    Cluster counts 2, 3, ... are tried in order. As soon as adding one more
    cluster no longer shrinks the mean squared diameter to ``threshold`` times
    the previous value, the previous count and model are kept.
    """

    def __init__(self, max_clusters=30, threshold=0.80):
        """
        Args:
            max_clusters: largest cluster count to try (at least 2).
            threshold: a new diameter above ``threshold * previous`` counts as
                "no sharp decrease" and stops the search.

        Raises:
            ValueError: if ``max_clusters`` is smaller than 2.
        """
        if max_clusters < 2:
            raise ValueError("max_clusters must be at least 2")
        self.max_clusters = max_clusters
        self.threshold = threshold
        self.k = 2
        self.model_Kmeans = KMeans(2)

    def fit(self, train_data):
        """Select the cluster count for ``train_data`` and keep the fitted model.

        If the diameter keeps dropping sharply all the way to ``max_clusters``,
        the last fitted model is selected, so ``predict`` always has a fitted
        model to use.
        """
        best_k = None
        best_model = None
        best_diameter = None

        for k in range(2, self.max_clusters + 1):
            # A zero diameter cannot be improved on; stop before fitting more.
            if best_model is not None and best_diameter == 0:
                break
            model = KMeans(k)
            model.fit(train_data)
            diameter = model.ComputeTheDiameter(train_data)
            if best_model is not None and diameter > best_diameter * self.threshold:
                break  # no sharp decrease: keep the previous model
            best_k, best_model, best_diameter = k, model, diameter

        self.k = best_k
        self.model_Kmeans = best_model

    def predict(self, test_data):
        """Return ``(selected cluster count, per-row cluster indices)``."""
        return self.k, self.model_Kmeans.predict(test_data)
calculate_accu(label_predict,label_test): + #when calculate the accu ,try all probables of label combination + rotates=np.array([[0,1,2],[0,2,1],[1,0,2],[1,2,0],[2,0,1],[2,1,0]]) + label_predict_rotated=[] + for rotate in rotates: + label_predict_temp=np.array(label_predict) + label_predict_temp=np.where(label_predict_temp != 0, label_predict_temp, -1) + label_predict_temp=np.where(label_predict_temp != 1, label_predict_temp, -2) + label_predict_temp=np.where(label_predict_temp != 2, label_predict_temp, -3) + + label_predict_temp=np.where(label_predict_temp != -1, label_predict_temp, rotate[0]) + label_predict_temp=np.where(label_predict_temp != -2, label_predict_temp, rotate[1]) + label_predict_temp=np.where(label_predict_temp != -3, label_predict_temp, rotate[2]) + label_predict_rotated.append(label_predict_temp) + + correct_max = np.count_nonzero((label_predict == label_test)) + for label_predict_temp in label_predict_rotated: + correct = np.count_nonzero((label_predict_temp == label_test)) + if correct>correct_max: + correct_max=correct + + accurate= correct_max/len(label_test) + return accurate + + +def test1(): + mean = (1, 2) + cov = np.array([[80, 0], [0, 22]]) + data_x = np.random.multivariate_normal(mean, cov, (800,)) + + mean = (16, -5) + cov = np.array([[40, 0], [0, 32.1]]) + data_y = np.random.multivariate_normal(mean, cov, (900,)) + + mean = (-6, 10) + cov = np.array([[20, 10], [10, 20]]) + data_z = np.random.multivariate_normal(mean, cov, (1000,)) + + data, label = shuffle(data_x,data_y,data_z) + data_train,data_test,label_train,label_test=train_test_split(data,label) + + display(data_train,label_train,'train',0) + display(data_test,label_test,'test',0) + + model_KMeans=KMeans(3) + model_KMeans.fit(data_train) + label_predict_KMeans=np.array(model_KMeans.predict(data_test),dtype=np.int32) + accurate_Kmeans=calculate_accu(label_predict_KMeans,label_test) + display(data_test,label_predict_KMeans,'predict_Kmeans',accurate_Kmeans) + + + 
model_GaussianMixture=GaussianMixture(3) + model_GaussianMixture.fit(data_train) + label_predict_GaussianMixture=np.array(model_GaussianMixture.predict(data_test),dtype=np.int32) + accurate_GaussianMixture=calculate_accu(label_predict_GaussianMixture,label_test) + display(data_test,label_predict_GaussianMixture,'predict_GaussianMixture',accurate_GaussianMixture) + + + + + + +if __name__ == "__main__": + test1() \ No newline at end of file diff --git a/assignment-3/submission/16307130040/test2.py b/assignment-3/submission/16307130040/test2.py new file mode 100644 index 0000000000000000000000000000000000000000..18425d22d0d57e911490f9c3eff39e6bc37ecc25 --- /dev/null +++ b/assignment-3/submission/16307130040/test2.py @@ -0,0 +1,86 @@ +import numpy as np +import matplotlib.pyplot as plt +from source import KMeans, GaussianMixture,ClusteringAlgorithm + + +def train_test_split(data, label): + offset = int(len(data) * 0.8) + data_train = data[:offset] + data_test = data[offset:] + label_train = label[:offset] + label_test = label[offset:] + + + return np.array(data_train), np.array(data_test), np.array(label_train), np.array(label_test) + +def display(data, label,name,n_clusters): + # the display function of test2,which is vaild in different numbers of clusters + datas =[[] for i in range(n_clusters)] + + colors=['b','r','y','xkcd:neon purple','xkcd:deep green','xkcd:reddish pink','aliceblue','xkcd:electric green','xkcd:royal'] + for i in range(len(data)): + datas[label[i]].append(data[i]) + for i,each in enumerate(datas): + each = np.array(each) + plt.scatter(each[:, 0], each[:, 1],c=colors[i]) + #if not accu==0: + # plt.text(-10, 20, 'accurate=%f'%accu) + plt.title(name) + plt.show() + + +def shuffle(*datas): + data = np.concatenate(datas) + label = np.concatenate([ + np.ones((d.shape[0],), dtype=int)*i + for (i, d) in enumerate(datas) + ]) + N = data.shape[0] + idx = np.arange(N) + np.random.shuffle(idx) + data = data[idx] + label = label[idx] + return data, label + + + 
+ +def test2(): + mean = (2, 2) + cov = np.array([[6, 0], [0, 12]]) + data_x = np.random.multivariate_normal(mean, cov, (800,)) + + mean = (-5, -5) + cov = np.array([[7, 2], [2, 7]]) + data_y = np.random.multivariate_normal(mean, cov, (900,)) + + mean = (0, 0) + cov = np.array([[5, 0], [0, 5]]) + data_z = np.random.multivariate_normal(mean, cov, (1000,)) + + data, label = shuffle(data_x,data_y,data_z) + data_train,data_test,label_train,label_test=train_test_split(data,label) + + display(data_train,label_train,'train',3) + display(data_test,label_test,'test',3) + + model_KMeans=ClusteringAlgorithm() + model_KMeans.fit(data_train) + n_clusters, label_predict_KMeans = model_KMeans.predict(data_test) + label_predict_KMeans=np.array(label_predict_KMeans,dtype=np.int32) + display(data_test,label_predict_KMeans,'predict_Kmeans',n_clusters) + + + + + + + + + + + + + +if __name__ == "__main__": + test2() \ No newline at end of file