diff --git a/assignment-2/handout-1/numpy_fnn.py b/assignment-2/handout-1/numpy_fnn.py index c2f5bf9d301a40d3390cd35eef2a9b1d0c2c4249..b04cde615c3da3644a52b0ba8ae29dd342eb7f24 100644 --- a/assignment-2/handout-1/numpy_fnn.py +++ b/assignment-2/handout-1/numpy_fnn.py @@ -160,3 +160,5 @@ class NumpyModel: self.W1 -= learning_rate * self.W1_grad self.W2 -= learning_rate * self.W2_grad self.W3 -= learning_rate * self.W3_grad + + \ No newline at end of file diff --git a/assignment-3/submission/19307130211/README.md b/assignment-3/submission/19307130211/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1f1d0089f07be6f29b98605b429ff69a84f3198c --- /dev/null +++ b/assignment-3/submission/19307130211/README.md @@ -0,0 +1,153 @@
+# Assignment-3
+
+陈洋 19307130211
+
+## Model Implementation
+
+### KMeans
+
+KMeans first picks one center for every cluster and then iterates over the following two steps until convergence:
+
+* Assignment step: compute the distance from each point to every cluster center and assign the point to the nearest center.
+
+  ![image-20210614150634927](img/image-20210614150634927.png)
+
+* Update step: recompute the mean of every cluster and use it as the new cluster center.
+
+  ![image-20210614150906892](img/image-20210614150906892.png)
+
+The algorithm stops once no point changes its label between two iterations.
+
+Plain KMeans chooses the initial centers at random, which can hurt both the convergence speed and the final result, so the implementation also uses KMeans++.
+
+KMeans++ first picks one random sample as the first cluster center. It then computes, for every training point, the distance to its nearest already-chosen center, and selects the next center by roulette-wheel sampling, so that points farther away from the current centers are more likely to be chosen. (A minimal sketch of this selection rule is given after the figures of Experiment 1 below.)
+
+The experiments section compares the different ways of initializing the cluster centers.
+
+### GMM
+
+The Gaussian mixture model is estimated with the EM algorithm, so training alternates between an E-step and an M-step.
+
+* E-step: with the parameters *μ, σ* fixed, compute the posterior distribution $p(z^{(n)}|x^{(n)})$:
+
+  ![image-20210614153344247](img/image-20210614153344247.png)
+
+* M-step: with $γ_{nk}$ fixed, turn parameter estimation into an optimization problem; setting the partial derivatives to zero gives the update formulas for $π, μ, σ$:
+
+  ![image-20210614153814100](img/image-20210614153814100.png)
+
+  where ![image-20210614153834743](img/image-20210614153834743.png)
+
+The GMM implementation supports two ways of choosing the initial Gaussian means: the first draws them at random from the range of the training data, and the second uses the cluster centers produced by the KMeans implementation above. The two options are compared in the experiments section.
+
+## Basic Experiments
+
+### Test results
+
+![image-20210614193106909](img/image-20210614193106909.png)
+
+### 1. Comparing different initializations of the KMeans centers
+
+The three figures below show the original data, KMeans with randomly initialized centers, and KMeans++ (100 iterations). KMeans++ clearly matches the original grouping better.
+
+<!-- figures: original data / KMeans with random initialization / KMeans++ (100 iterations) -->
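+
+For reference, the roulette-wheel seeding described in the model section can be sketched as follows. This is a condensed, illustrative version of `KMeans.init_centers` from `source.py`; the function name `kmeanspp_init` and the variable names are not part of the submission.
+
+~~~Python
+import numpy as np
+
+def kmeanspp_init(train_data, K):
+    N = train_data.shape[0]
+    centers = [train_data[np.random.randint(N)]]   # first center: one random sample
+    for _ in range(1, K):
+        # distance from every point to its nearest already-chosen center
+        dists = np.array([np.min(np.linalg.norm(row - np.array(centers), axis=1))
+                          for row in train_data])
+        # roulette wheel: points farther from the chosen centers are more likely to be picked
+        r = dists.sum() * np.random.rand()
+        j = int(np.searchsorted(np.cumsum(dists), r))
+        centers.append(train_data[j])
+    return np.array(centers)
+~~~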
+
+### 2. Comparing different initializations of the GMM means
+
+The figures below show the original data, a GMM whose means are initialized at random, and a GMM whose means are initialized with KMeans (run for 50 and 10 EM iterations, respectively). A minimal usage sketch of the two options follows the figures.
+
+<!-- figures: original data / GMM with randomly initialized means / GMM with KMeans-initialized means -->
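+
+The two initializations compared here correspond to the `init_params` argument of the `GaussianMixture` class in `source.py`. A minimal usage sketch, assuming `train_data` holds the three-cluster dataset shown above and using the iteration counts from the caption:
+
+~~~Python
+from source import GaussianMixture
+
+gmm_random = GaussianMixture(3, max_iter=50, init_params="random")   # means drawn at random
+gmm_random.fit(train_data)
+
+gmm_kmeans = GaussianMixture(3, max_iter=10, init_params="kmeans")   # means taken from KMeans centers
+gmm_kmeans.fit(train_data)
+
+labels = gmm_kmeans.predict(train_data)
+~~~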
+
+The GMM initialized with KMeans puts a number of points into the green cluster that matches the original data much more closely. The likely reason is that KMeans already optimizes the mixture to some extent: even with very few EM iterations, the KMeans-initialized model can still reach a fairly good result.
+
+### 3. Comparing KMeans and GMM
+
+For a fair comparison, the GMM here uses randomly initialized means.
+
+The figures below show the original data, KMeans++, and the randomly initialized GMM (the code driving this comparison is sketched after the figures):
+
+<!-- figures: original data / KMeans++ / randomly initialized GMM (dataset 1) -->
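+
+This comparison corresponds to the commented-out block at the end of the `__main__` section of `source.py`; uncommented, it reads:
+
+~~~Python
+model1 = KMeans(cluster, init_type="kmeans++")
+model1.fit(train_data)
+res1 = model1.predict(test_data)
+
+model2 = GaussianMixture(cluster, init_params='random')
+model2.fit(train_data)
+res2 = model2.predict(test_data)
+
+display(train_data, res1, cluster=cluster)
+display(train_data, res2, cluster=cluster)
+~~~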
+
+KMeans partitions purely by distance, which causes the roughly even split visible in the lower half of the second figure, while the clusters produced by the GMM follow the original data much more closely; the GMM is clearly the better fit here.
+
+<!-- figures: KMeans++ / randomly initialized GMM on the self-generated dataset -->
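+
+The self-generated dataset used here is presumably `data_2()` from `source.py` (it has the five-cluster structure described below): one broad 750-point Gaussian at the origin surrounded by four tighter 150-point Gaussians. Reproduced from `source.py` for reference:
+
+~~~Python
+def data_2():
+    mean = (0, 0)
+    cov = np.array([[40, 0], [0, 40]])
+    x = np.random.multivariate_normal(mean, cov, (750,))
+
+    mean = (0, 15)
+    cov = np.array([[10, 0], [0, 10]])
+    y = np.random.multivariate_normal(mean, cov, (150,))
+
+    mean = (20, 0)
+    cov = np.array([[10, 0], [0, 10]])
+    z = np.random.multivariate_normal(mean, cov, (150,))
+
+    mean = (0, -20)
+    cov = np.array([[10, 0], [0, 10]])
+    w = np.random.multivariate_normal(mean, cov, (150,))
+
+    mean = (-20, 0)
+    cov = np.array([[10, 0], [0, 10]])
+    t = np.random.multivariate_normal(mean, cov, (150,))
+    data, labels = shuffle(x, y, z, w, t)
+    return (data, data), labels, 5
+~~~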
+
+This is even more obvious on the self-generated dataset: because the data is generated with a fairly regular structure, the KMeans result looks like the points have simply been split evenly into five blocks, whereas the size of every GMM cluster is much closer to the true data.
+
+In summary, the GMM clusters better than KMeans, but training also exposes its major drawback: it takes far longer than KMeans.
+
+## Experiment: automatically choosing the number of clusters
+
+Following the TA's hint, the elbow method is combined with KMeans; once the basic idea of the elbow method is clear, the remaining work is locating the elbow.
+
+In short: fix a maximum K, run the KMeans implemented above for every k starting from k = 1, and record the within-cluster sum of squared errors (SSE, here implemented as the sum of each point's distance to its nearest center) for each k. Then compute the SSE difference between neighbouring values of K and the rate of change between neighbouring differences, i.e. rate_i = (ΔSSE_i − ΔSSE_{i+1}) / ΔSSE_{i+1}; the K with the largest rate of change is the elbow we are looking for.
+
+~~~Python
+'''Compute the SSE for every K'''
+SSE_list = []
+for i in range(1, self.Max_K):
+    KMeans_model = KMeans(i)
+    KMeans_model.fit(train_data)
+    cents = KMeans_model.cluster_center
+    distances = []
+    for row in train_data:
+        distances.append(np.min(np.linalg.norm(row - cents, axis=1)))  # distance to the nearest cluster center
+    SSE_list.append(sum(distances))
+
+'''Compute the SSE difference between neighbouring K'''
+self.SSE = SSE_list
+diff_SSE = []
+for i in range(self.Max_K-2):
+    diff_SSE.append(abs(SSE_list[i]-SSE_list[i+1]))
+
+'''Pick K'''
+Max_change_rate = 0
+best_k = 2
+for i in range(self.Max_K-3):
+    if float((diff_SSE[i] - diff_SSE[i+1])/diff_SSE[i+1]) > Max_change_rate:
+        Max_change_rate = float(
+            (diff_SSE[i] - diff_SSE[i+1])/diff_SSE[i+1])
+        best_k = i+2
+~~~
+
+Final results (the driver code used for these runs is sketched after the figures):
+
+<!-- figures: automatic k selection on the self-generated dataset -->
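+
+For reference, these runs are driven as in the `__main__` block of `source.py` (shown here with `data_1()` as committed; the self-generated dataset can be substituted):
+
+~~~Python
+(train_data, test_data), labels, cluster = data_1()
+
+model = ClusteringAlgorithm(Max_k=10)
+model.fit(train_data)    # computes the SSE for k = 1 .. Max_k-1 and plots the curve
+print(model.k)           # automatically selected number of clusters
+res = model.predict(test_data)
+display(test_data, res, cluster=model.k)
+~~~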
+
+This way of locating the elbow works fairly well on the self-generated dataset and produces the expected result.
+
+<!-- figures: automatic k selection on data_1 (the provided dataset) -->
+
+On data_1 provided by the TA, however, the method does not perform as well. The likely reason is that the two point sets in the lower part of dataset 1 overlap heavily, so an SSE-based criterion easily treats them as a single cluster.
\ No newline at end of file diff --git a/assignment-3/submission/19307130211/img/Auto_k.png b/assignment-3/submission/19307130211/img/Auto_k.png new file mode 100644 index 0000000000000000000000000000000000000000..d7bb522ecf432a7861b9099f4786a166fcc3cca1 Binary files /dev/null and b/assignment-3/submission/19307130211/img/Auto_k.png differ diff --git a/assignment-3/submission/19307130211/img/Auto_k1.png b/assignment-3/submission/19307130211/img/Auto_k1.png new file mode 100644 index 0000000000000000000000000000000000000000..28d31033c0647168c5e3f472fdf66c1ee4c5555d Binary files /dev/null and b/assignment-3/submission/19307130211/img/Auto_k1.png differ diff --git a/assignment-3/submission/19307130211/img/GMM_kmeans.png b/assignment-3/submission/19307130211/img/GMM_kmeans.png new file mode 100644 index 0000000000000000000000000000000000000000..71527b2301804d812d71375d7bee3a906dd99fe8 Binary files /dev/null and b/assignment-3/submission/19307130211/img/GMM_kmeans.png differ diff --git a/assignment-3/submission/19307130211/img/GMM_kmeans1.png b/assignment-3/submission/19307130211/img/GMM_kmeans1.png new file mode 100644 index 0000000000000000000000000000000000000000..8e3f0ca5d93b6a18223d2e28acedc71248d3b2de Binary files /dev/null and b/assignment-3/submission/19307130211/img/GMM_kmeans1.png differ diff --git a/assignment-3/submission/19307130211/img/GMM_random.png b/assignment-3/submission/19307130211/img/GMM_random.png new file mode 100644 index 0000000000000000000000000000000000000000..97b1f3a97d97ead29c91f1cbe6839bcb7b5e799f Binary files /dev/null and b/assignment-3/submission/19307130211/img/GMM_random.png differ diff --git a/assignment-3/submission/19307130211/img/GMM_random1.png b/assignment-3/submission/19307130211/img/GMM_random1.png new file mode 100644 index 0000000000000000000000000000000000000000..6cd9d6c50fad540c7653656cf855be6a8005df90 Binary files /dev/null and b/assignment-3/submission/19307130211/img/GMM_random1.png differ diff --git a/assignment-3/submission/19307130211/img/GMM_random2.png b/assignment-3/submission/19307130211/img/GMM_random2.png new file mode 100644 index 0000000000000000000000000000000000000000..295a880d4d044f92631717ba77bb47b327c4c7f1 Binary files /dev/null and b/assignment-3/submission/19307130211/img/GMM_random2.png differ diff --git a/assignment-3/submission/19307130211/img/GMM_random3.png b/assignment-3/submission/19307130211/img/GMM_random3.png new file mode 100644 index 0000000000000000000000000000000000000000..e3c08efcc39c45f74399eb77ad9dbbb291110547 Binary files /dev/null and b/assignment-3/submission/19307130211/img/GMM_random3.png differ diff --git a/assignment-3/submission/19307130211/img/SSE.png b/assignment-3/submission/19307130211/img/SSE.png new file mode 100644 index 0000000000000000000000000000000000000000..bd40fc0902afaf09ee43513eb35480b91971bf2b Binary files /dev/null and b/assignment-3/submission/19307130211/img/SSE.png differ diff --git a/assignment-3/submission/19307130211/img/SSE1.png b/assignment-3/submission/19307130211/img/SSE1.png new file mode 100644 index 0000000000000000000000000000000000000000..6caf16d8f1cf87ddbf29df63a1fe11bd90e85d99 Binary files /dev/null and b/assignment-3/submission/19307130211/img/SSE1.png differ diff --git a/assignment-3/submission/19307130211/img/data_1.png b/assignment-3/submission/19307130211/img/data_1.png new file mode 100644 index 
0000000000000000000000000000000000000000..7590e55c89137a47feda39bc30bd89909034a93b Binary files /dev/null and b/assignment-3/submission/19307130211/img/data_1.png differ diff --git a/assignment-3/submission/19307130211/img/data_2.png b/assignment-3/submission/19307130211/img/data_2.png new file mode 100644 index 0000000000000000000000000000000000000000..cc4036d76db6c11fb59a6f586f9af6da703a8936 Binary files /dev/null and b/assignment-3/submission/19307130211/img/data_2.png differ diff --git a/assignment-3/submission/19307130211/img/image-20210614150634927.png b/assignment-3/submission/19307130211/img/image-20210614150634927.png new file mode 100644 index 0000000000000000000000000000000000000000..8a7ebbb018612591701ffaaef2ce01aa5ba88a9a Binary files /dev/null and b/assignment-3/submission/19307130211/img/image-20210614150634927.png differ diff --git a/assignment-3/submission/19307130211/img/image-20210614150906892.png b/assignment-3/submission/19307130211/img/image-20210614150906892.png new file mode 100644 index 0000000000000000000000000000000000000000..e70f6138b10005cdd3a9f36bc4ca5ffd2ed29229 Binary files /dev/null and b/assignment-3/submission/19307130211/img/image-20210614150906892.png differ diff --git a/assignment-3/submission/19307130211/img/image-20210614153344247.png b/assignment-3/submission/19307130211/img/image-20210614153344247.png new file mode 100644 index 0000000000000000000000000000000000000000..2443e3de7eee32a4b6f08cf1c6209f2fe6d4dfca Binary files /dev/null and b/assignment-3/submission/19307130211/img/image-20210614153344247.png differ diff --git a/assignment-3/submission/19307130211/img/image-20210614153814100.png b/assignment-3/submission/19307130211/img/image-20210614153814100.png new file mode 100644 index 0000000000000000000000000000000000000000..74fca619802b05565985d10567204057ddcf87e7 Binary files /dev/null and b/assignment-3/submission/19307130211/img/image-20210614153814100.png differ diff --git a/assignment-3/submission/19307130211/img/image-20210614153834743.png b/assignment-3/submission/19307130211/img/image-20210614153834743.png new file mode 100644 index 0000000000000000000000000000000000000000..ba87cc9c6eb5242686a148f1f79b542dc0798894 Binary files /dev/null and b/assignment-3/submission/19307130211/img/image-20210614153834743.png differ diff --git a/assignment-3/submission/19307130211/img/image-20210614193106909.png b/assignment-3/submission/19307130211/img/image-20210614193106909.png new file mode 100644 index 0000000000000000000000000000000000000000..8c3960f9a734045c943b5047bc855d1a43207d17 Binary files /dev/null and b/assignment-3/submission/19307130211/img/image-20210614193106909.png differ diff --git a/assignment-3/submission/19307130211/img/kmeans++.png b/assignment-3/submission/19307130211/img/kmeans++.png new file mode 100644 index 0000000000000000000000000000000000000000..fe500fcf67e862f77ed1ccee24020ba0a6554f19 Binary files /dev/null and b/assignment-3/submission/19307130211/img/kmeans++.png differ diff --git a/assignment-3/submission/19307130211/img/kmeans++_1.png b/assignment-3/submission/19307130211/img/kmeans++_1.png new file mode 100644 index 0000000000000000000000000000000000000000..031afae5180dca2d6c04d6a6b11708662ff4c56f Binary files /dev/null and b/assignment-3/submission/19307130211/img/kmeans++_1.png differ diff --git a/assignment-3/submission/19307130211/img/kmeans++_2.png b/assignment-3/submission/19307130211/img/kmeans++_2.png new file mode 100644 index 0000000000000000000000000000000000000000..6c58edbb5ebf05ab2c00ffbeecf985f29b560810 
Binary files /dev/null and b/assignment-3/submission/19307130211/img/kmeans++_2.png differ diff --git a/assignment-3/submission/19307130211/img/kmeans_random.png b/assignment-3/submission/19307130211/img/kmeans_random.png new file mode 100644 index 0000000000000000000000000000000000000000..4a001eb0f46a5a21f387b303a8f8bb5872198c47 Binary files /dev/null and b/assignment-3/submission/19307130211/img/kmeans_random.png differ diff --git a/assignment-3/submission/19307130211/source.py b/assignment-3/submission/19307130211/source.py new file mode 100644 index 0000000000000000000000000000000000000000..c2f593bfb99ca7f6c428b26c9ca9bb3ff9581ad5 --- /dev/null +++ b/assignment-3/submission/19307130211/source.py @@ -0,0 +1,357 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+class KMeans:
+
+    def __init__(self, n_clusters, init_type="kmeans++", max_iter=200):
+        '''
+        Model initialization; stores the number of clusters k.
+        '''
+        self.k = n_clusters
+        self.cluster_center = None
+        self.init_type = init_type
+        self.max_iter = max_iter
+
+    def init_centers(self, train_data):
+        K = self.k
+        N, D = train_data.shape
+        temp = train_data.tolist()
+
+        if self.init_type == 'random':
+            # Center initialization: pick K random samples
+            random_list = np.random.choice(N, K, replace=False).tolist()
+            self.cluster_center = train_data[random_list, :]
+
+        else:
+            # Center initialization: K-means++, keep the centers as far apart as possible
+            cluster_centers = []
+
+            # Pick one random point of train_data as the first center
+            cluster_centers.append(temp[np.random.randint(N)])
+            cluster_centers = np.array(cluster_centers)
+
+            for i in range(1, K):
+                distances = []
+                for row in train_data:
+                    distances.append(np.min(np.linalg.norm(
+                        row - cluster_centers, axis=1)))  # distance to the nearest chosen center
+
+                # Roulette-wheel selection of the next center
+                total = sum(distances)*np.random.rand()
+                for j in range(N):
+                    total -= distances[j]
+                    if total < 0:
+                        cluster_centers = np.append(
+                            cluster_centers, [train_data[j]], axis=0)
+                        break
+
+            self.cluster_center = cluster_centers
+
+    def fit(self, train_data):
+        '''
+        Fit the model on the training set.
+        '''
+        K = self.k
+        N = train_data.shape[0]
+        self.init_centers(train_data)
+        cluster_centers = self.cluster_center
+
+        labels = np.ones(N)
+
+        for _ in range(self.max_iter):
+            get_new_label = []
+            # Assignment step:
+            for i in range(N):
+                distances = (np.linalg.norm(
+                    train_data[i]-cluster_centers, axis=1))
+                label = np.argsort(distances)[0]
+                get_new_label.append(label)
+
+            labels_new = np.array(get_new_label)
+
+            if (sum(np.abs(labels-labels_new))) == 0:
+                break
+            else:
+                # Update step:
+                labels = labels_new
+                for i in range(K):
+                    cluster_centers[i] = np.mean(
+                        train_data[labels == i], axis=0)
+
+        self.cluster_center = cluster_centers
+        return labels_new
+
+    def predict(self, test_data):
+        # Note: predict re-runs fit on the given data and returns the resulting labels.
+        return self.fit(test_data)
+
+
+def multi_norm(x, mu, sigma):
+    '''
+    Evaluate a multivariate Gaussian density.
+    x: input sample [d1,d2,d3,.....,dn]
+    mu: mean of the Gaussian
+    sigma: covariance matrix
+    '''
+    det = np.linalg.det(sigma)
+    inv = np.matrix(np.linalg.inv(sigma))
+    x_mu = np.matrix(x - mu).T
+
+    const_item = 1 / (((2 * np.pi) ** (len(x) / 2))
+                      * (det ** (0.5)))  # constant factor of the Gaussian density
+    exp_item = -0.5 * x_mu.T * inv * x_mu  # exponent of the Gaussian density
+
+    return float(const_item * (np.exp(exp_item)+1e-8))
+
+
+class GaussianMixture:
+
+    def __init__(self, n_clusters, max_iter=200, init_params="kmeans"):
+        '''
+        Model initialization; stores the number of clusters k.
+        '''
+        self.k = n_clusters
+        self.max_iter = max_iter
+        self.init_method = init_params
+
+        self.pi = None
+        self.means = None
+        self.covs = None
+
+    def init_params(self, train_data):
+        k = self.k
+        N, d = train_data.shape
+        self.covs = np.empty((k, d, d))
+        for i in range(k):
+            self.covs[i] = np.eye(d)*np.random.rand(1)*k
+
+        if self.init_method == "random":
+            # Randomly initialize the GMM parameters
+
+            self.pi = np.random.rand(k)
+            self.pi = self.pi/self.pi.sum()
+            self.means = np.random.rand(k, d) * np.max(train_data, axis=0)
+
+        else:
+            # Initialize the GMM parameters with KMeans
+
+            kmeans_model = KMeans(k)
+            kmeans_model.fit(train_data)
+            KM_label = kmeans_model.predict(train_data)
+
+            self.means = kmeans_model.cluster_center
+
+            self.pi = []
+            for i in range(k):
+                temp = np.zeros(KM_label.shape)
+                temp[KM_label == i] = 1
+                self.pi.append(np.sum(temp)/N)
+
+    def fit(self, train_data):
+        '''
+        Fit the model on the training set.
+        '''
+        k = self.k
+        N, d = train_data.shape
+
+        # self.means = np.random.rand(k, d) * np.max(train_data, axis=0)
+        # self.covs = np.empty((k, d, d))
+        # for i in range(k):
+        #     self.covs[i] = np.eye(d)*np.random.rand(1)*k
+        self.init_params(train_data)
+
+        for _ in range(self.max_iter):
+
+            # E-step: with the parameters fixed, compute the posterior γ_nk that x_n belongs to component k
+            posterior = []
+            for i in range(N):
+                q_i = []
+                for j in range(k):
+                    postprob = multi_norm(
+                        train_data[i], self.means[j], self.covs[j])
+                    q_i.append(self.pi[j]*postprob)
+
+                posterior.append(q_i)
+            posterior = np.array(posterior)
+
+            posterior = posterior / posterior.sum(axis=1, keepdims=True)
+
+            # M-step: with γ_nk fixed, compute the updated π_k, means and covs
+            pi_hat = posterior.sum(axis=0)
+            means_hat = np.tensordot(posterior, train_data, axes=[0, 0])
+
+            covs_hat = np.empty(self.covs.shape)
+            for i in range(k):
+                tmp = train_data - self.means[i]
+                covs_hat[i] = np.dot(tmp.T*posterior[:, i], tmp) / pi_hat[i]
+
+            self.covs = covs_hat
+            self.means = means_hat/pi_hat.reshape(-1, 1)
+            self.pi = pi_hat/N
+
+    def predict(self, test_data):
+        posterior = []
+        for i in range(test_data.shape[0]):
+            q_i = []
+            for j in range(self.k):
+                postprob = multi_norm(
+                    test_data[i], self.means[j], self.covs[j])
+                q_i.append(self.pi[j]*postprob)
+            posterior.append(q_i)
+
+        posterior = np.array(posterior)
+        labels = np.argmax(posterior, axis=1)
+
+        return labels
+
+
+class ClusteringAlgorithm:
+
+    def __init__(self, Max_k=10):
+        self.Max_K = Max_k
+        self.k = 2
+        self.Model = None
+        self.SSE = None
+
+    def fit(self, train_data):
+        # Compute the SSE for every K
+        SSE_list = []
+        for i in range(1, self.Max_K):
+            KMeans_model = KMeans(i)
+            KMeans_model.fit(train_data)
+            cents = KMeans_model.cluster_center
+            distances = []
+            for row in train_data:
+                distances.append(np.min(np.linalg.norm(
+                    row - cents, axis=1)))  # distance to the nearest cluster center
+            SSE_list.append(sum(distances))
+
+        # Compute the SSE difference between neighbouring K
+        self.SSE = SSE_list
+        diff_SSE = []
+        for i in range(self.Max_K-2):
+            diff_SSE.append(abs(SSE_list[i]-SSE_list[i+1]))
+        # Pick best_k
+        Max_change_rate = 0
+        best_k = 2
+        for i in range(self.Max_K-3):
+            if float((diff_SSE[i] - diff_SSE[i+1])/diff_SSE[i+1]) > Max_change_rate:
+                Max_change_rate = float(
+                    (diff_SSE[i] - diff_SSE[i+1])/diff_SSE[i+1])
+                best_k = i+2
+
+        self.k = best_k
+        self.Model = KMeans(self.k)
+
+        # Plot the SSE curve
+        X = range(1, self.Max_K)
+        plt.xlabel('k')
+        plt.ylabel('SSE')
+        plt.plot(X, SSE_list, 'o-')
+        plt.show()
+
+    def predict(self, test_data):
+        return self.Model.predict(test_data)
+
+########################
+#   Experiment code    #
+########################
+
+
+def shuffle(*datas):
+    data = np.concatenate(datas)
+    label = np.concatenate([
+        np.ones((d.shape[0],), dtype=int)*i
+        for (i, d) in enumerate(datas)
+    ])
+    N = data.shape[0]
+    idx = np.arange(N)
+    np.random.shuffle(idx)
+    data = data[idx]
+    label = label[idx]
+    return data, label
+
+
+def data_1():
+    mean = (1, 2)
+    cov = np.array([[73, 0], [0, 22]])
+    x = np.random.multivariate_normal(mean, cov, (800,))
+
+    mean = (16, -5)
+    cov = np.array([[21.2, 0], [0, 32.1]])
+    y = 
np.random.multivariate_normal(mean, cov, (200,)) + + mean = (10, 22) + cov = np.array([[10, 5], [5, 10]]) + z = np.random.multivariate_normal(mean, cov, (1000,)) + + data, label = shuffle(x, y, z) + return (data, data), label, 3 + + +def data_2(): + mean = (0, 0) + cov = np.array([[40, 0], [0, 40]]) + x = np.random.multivariate_normal(mean, cov, (750,)) + + mean = (0, 15) + cov = np.array([[10, 0], [0, 10]]) + y = np.random.multivariate_normal(mean, cov, (150,)) + + mean = (20, 0) + cov = np.array([[10, 0], [0, 10]]) + z = np.random.multivariate_normal(mean, cov, (150,)) + + mean = (0, -20) + cov = np.array([[10, 0], [0, 10]]) + w = np.random.multivariate_normal(mean, cov, (150,)) + + mean = (-20, 0) + cov = np.array([[10, 0], [0, 10]]) + t = np.random.multivariate_normal(mean, cov, (150,)) + data, labels = shuffle(x, y, z, w, t) + return (data, data), labels, 5 + +#绘图函数 +def display(data, label, cluster): + datas = [] + for k in range(cluster): + data_k = [] + for i in range(len(data)): + if label[i] == k: + data_k .append(data[i]) + datas.append(data_k) + + for each in datas: + each = np.array(each) + plt.scatter(each[:, 0], each[:, 1]) + plt.show() + + +def print_params(model): + print("pi:\n", model.pi, "\n") + print("means:\n", model.means, "\n") + print("covs:\n", model.covs, "\n") + + +if __name__ == '__main__': + (train_data, test_data), labels, cluster = data_1() + + model = ClusteringAlgorithm(Max_k=10) + model.fit(train_data) + res1 = model.predict(test_data) + print(model.k) + display(test_data, labels, cluster=cluster) + display(test_data, res1, cluster=model.k) + + # model1 = KMeans(cluster,init_type="kmeans++") + # model1.fit(train_data) + # res1 = model1.predict(test_data) + + # model2 = GaussianMixture(cluster,init_params='random') + # model2.fit(train_data) + # res2 = model2.predict(test_data) + + # display(train_data, res1, cluster=cluster) + # display(train_data, res2, cluster=cluster) diff --git a/assignment-3/submission/19307130211/tester_demo.py b/assignment-3/submission/19307130211/tester_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..19ec0e8091691d4aaaa6b53dbb695fde9e826d89 --- /dev/null +++ b/assignment-3/submission/19307130211/tester_demo.py @@ -0,0 +1,117 @@ +import numpy as np +import sys + +from source import KMeans, GaussianMixture + + +def shuffle(*datas): + data = np.concatenate(datas) + label = np.concatenate([ + np.ones((d.shape[0],), dtype=int)*i + for (i, d) in enumerate(datas) + ]) + N = data.shape[0] + idx = np.arange(N) + np.random.shuffle(idx) + data = data[idx] + label = label[idx] + return data, label + + +def data_1(): + mean = (1, 2) + cov = np.array([[73, 0], [0, 22]]) + x = np.random.multivariate_normal(mean, cov, (800,)) + + mean = (16, -5) + cov = np.array([[21.2, 0], [0, 32.1]]) + y = np.random.multivariate_normal(mean, cov, (200,)) + + mean = (10, 22) + cov = np.array([[10, 5], [5, 10]]) + z = np.random.multivariate_normal(mean, cov, (1000,)) + + data, _ = shuffle(x, y, z) + return (data, data), 3 + + +def data_2(): + train_data = np.array([ + [23, 12, 173, 2134], + [99, -12, -126, -31], + [55, -145, -123, -342], + ]) + return (train_data, train_data), 2 + + +def data_3(): + train_data = np.array([ + [23], + [-2999], + [-2955], + ]) + return (train_data, train_data), 2 + + +def test_with_n_clusters(data_fuction, algorithm_class): + (train_data, test_data), n_clusters = data_fuction() + model = algorithm_class(n_clusters) + model.fit(train_data) + res = model.predict(test_data) + assert len( + res.shape) == 1 
and res.shape[0] == test_data.shape[0], "shape of result is wrong" + return res + + +def testcase_1_1(): + test_with_n_clusters(data_1, KMeans) + return True + + +def testcase_1_2(): + res = test_with_n_clusters(data_2, KMeans) + return res[0] != res[1] and res[1] == res[2] + + +def testcase_2_1(): + test_with_n_clusters(data_1, GaussianMixture) + return True + + +def testcase_2_2(): + res = test_with_n_clusters(data_3, GaussianMixture) + return res[0] != res[1] and res[1] == res[2] + + +def test_all(err_report=False): + testcases = [ + ["KMeans-1", testcase_1_1, 4], + ["KMeans-2", testcase_1_2, 4], + # ["KMeans-3", testcase_1_3, 4], + # ["KMeans-4", testcase_1_4, 4], + # ["KMeans-5", testcase_1_5, 4], + ["GMM-1", testcase_2_1, 4], + ["GMM-2", testcase_2_2, 4], + # ["GMM-3", testcase_2_3, 4], + # ["GMM-4", testcase_2_4, 4], + # ["GMM-5", testcase_2_5, 4], + ] + sum_score = sum([case[2] for case in testcases]) + score = 0 + for case in testcases: + try: + res = case[2] if case[1]() else 0 + except Exception as e: + if err_report: + print("Error [{}] occurs in {}".format(str(e), case[0])) + res = 0 + score += res + print("+ {:14} {}/{}".format(case[0], res, case[2])) + print("{:16} {}/{}".format("FINAL SCORE", score, sum_score)) + + +if __name__ == "__main__": + if len(sys.argv) > 1 and sys.argv[1] == "--report": + test_all(True) + else: + test_all()