diff --git a/assignment-3/submission/18307130252/depict.py b/assignment-3/submission/18307130252/depict.py
new file mode 100644
index 0000000000000000000000000000000000000000..abd67f60454965961ff33cb57686280d9d8a113b
--- /dev/null
+++ b/assignment-3/submission/18307130252/depict.py
@@ -0,0 +1,89 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+# Palette used to colour the clusters in the scatter plots.
+color = [
+    "red",
+    "blue",
+    "lightsteelblue",
+    "mediumpurple",
+    "gold",
+    "lightgreen",
+    "olive",
+    "sandybrown",
+    "green",
+    "lightskyblue"
+]
+
+def test_color():
+    # Preview the palette: one horizontal band of dots per colour.
+    n = len(color)
+    for i in range(n):
+        for j in range(20):
+            plt.scatter(j, i, c = color[i], alpha = 0.3)
+    plt.show()
+
+def scatter_graph(filename, data, label, target = None):
+    # 2D scatter plot of the samples, coloured by cluster label;
+    # optional centres in `target` are drawn as black crosses.
+    n = len(label)
+    for i in range(n):
+        plt.scatter(data[i][0], data[i][1], c = color[int(label[i])], alpha = 0.3)
+    if target is not None:
+        for i in range(target.shape[0]):
+            plt.scatter(target[i][0], target[i][1], marker = 'x', c = 'black')
+    plt.savefig(filename)
+    plt.close()
+
+def scatter_graph_3D(filename, data, label, target = None):
+    # 3D variant of scatter_graph; cycles through the palette when
+    # there are more labels than colours.
+    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the '3d' projection on old matplotlib
+    n = len(label)
+    fig = plt.figure()
+    ax = fig.add_subplot(projection = '3d')
+    for i in range(n):
+        ax.scatter(data[i][0], data[i][1], data[i][2], c = color[int(label[i]) % len(color)], alpha = 0.3)
+    if target is not None:
+        for i in range(target.shape[0]):
+            ax.scatter(target[i][0], target[i][1], target[i][2], marker = 'x', c = 'black')
+    plt.savefig(filename)
+    plt.close()
+
+def curve_graph(filename, x, data, x_label = None, y_label = None, target = None, label = None):
+    # Plot one curve per row of `data` over the shared x values;
+    # `target` marks highlighted points, `label` names the curves.
+    if x_label is not None:
+        plt.xlabel(x_label)
+    if y_label is not None:
+        plt.ylabel(y_label)
+    sz = len(data)
+    for i in range(sz):
+        if label is None:
+            plt.plot(x, data[i], color = color[i], marker = 'o')
+        else:
+            plt.plot(x, data[i], color = color[i], marker = 'o', label = label[i])
+    if target is not None:
+        for i in range(sz):
+            plt.plot(target[i][0], target[i][1], color = 'black', marker = 'o')
+    plt.xticks(x, x)
+    if label is not None:
+        plt.legend()
+    plt.savefig(filename)
+    plt.close()
+
+def generate(mean, cov, num):
+    # Sample num[i] points from the i-th Gaussian and return the
+    # shuffled samples together with their source labels.
+    n, dim = mean.shape
+    data = []
+
+    for i in range(n):
+        data.append(np.random.multivariate_normal(mean[i], cov[i], num[i]))
+
+    X = np.concatenate(data, axis = 0)
+    Y = np.concatenate([[i] * num[i] for i in range(n)], axis = 0)
+
+    idx = np.random.permutation(sum(num))
+    X = X[idx]
+    Y = Y[idx]
+
+    return X, Y
diff --git a/assignment-3/submission/18307130252/img/0GaussianMixture_origin.png b/assignment-3/submission/18307130252/img/0GaussianMixture_origin.png
new file mode 100644
index 0000000000000000000000000000000000000000..d5fadab97340a9abc55afbe153212c26a54cef19
Binary files /dev/null and b/assignment-3/submission/18307130252/img/0GaussianMixture_origin.png differ
diff --git a/assignment-3/submission/18307130252/img/3D_GMM.png b/assignment-3/submission/18307130252/img/3D_GMM.png
new file mode 100644
index 0000000000000000000000000000000000000000..adde8141342caa218a78292db20f3a3282c68d0a
Binary files /dev/null and b/assignment-3/submission/18307130252/img/3D_GMM.png differ
diff --git a/assignment-3/submission/18307130252/img/3D_KMeans.png b/assignment-3/submission/18307130252/img/3D_KMeans.png
new file mode 100644
index 0000000000000000000000000000000000000000..9670cdd3d4e461b5f354948baaf1fdc455b37ab4
Binary files /dev/null and b/assignment-3/submission/18307130252/img/3D_KMeans.png differ
diff --git a/assignment-3/submission/18307130252/img/3D_origin.png b/assignment-3/submission/18307130252/img/3D_origin.png
new file mode 100644
index 0000000000000000000000000000000000000000..0a1f3c4c391943bb48318fc31fce02e609023dfa
Binary files /dev/null and b/assignment-3/submission/18307130252/img/3D_origin.png differ
diff --git a/assignment-3/submission/18307130252/img/3D_silhouette.png b/assignment-3/submission/18307130252/img/3D_silhouette.png
new file mode 100644
index 0000000000000000000000000000000000000000..eeccb70370b251500d277651036b5c28197deb25
Binary files /dev/null and b/assignment-3/submission/18307130252/img/3D_silhouette.png differ
diff --git a/assignment-3/submission/18307130252/img/ClusteringAlgorithm_result.png b/assignment-3/submission/18307130252/img/ClusteringAlgorithm_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..303875bdf74ce2a640e7221599871c8730a9ab6a
Binary files /dev/null and b/assignment-3/submission/18307130252/img/ClusteringAlgorithm_result.png differ
diff --git a/assignment-3/submission/18307130252/img/GMM_initialization.png b/assignment-3/submission/18307130252/img/GMM_initialization.png
new file mode 100644
index 0000000000000000000000000000000000000000..65db4ef46ee199dcad4987a64c3c90a400a00397
Binary files /dev/null and b/assignment-3/submission/18307130252/img/GMM_initialization.png differ
diff --git a/assignment-3/submission/18307130252/img/GMM_origin.png b/assignment-3/submission/18307130252/img/GMM_origin.png
new file mode 100644
index 0000000000000000000000000000000000000000..0b7b34b15d785a157d12d6d917304775753a89e0
Binary files /dev/null and b/assignment-3/submission/18307130252/img/GMM_origin.png differ
diff --git a/assignment-3/submission/18307130252/img/GaussianMixture.png b/assignment-3/submission/18307130252/img/GaussianMixture.png
new file mode 100644
index 0000000000000000000000000000000000000000..61ca8862dc81dc1474f9524c4c1248eb5726e3f1
Binary files /dev/null and b/assignment-3/submission/18307130252/img/GaussianMixture.png differ
diff --git a/assignment-3/submission/18307130252/img/KMeans.png b/assignment-3/submission/18307130252/img/KMeans.png
new file mode 100644
index 0000000000000000000000000000000000000000..904099e3dc8c1380d69b5efe9002ceaf7c0e2f81
Binary files /dev/null and b/assignment-3/submission/18307130252/img/KMeans.png differ
diff --git a/assignment-3/submission/18307130252/img/KMeans_bisecting.png b/assignment-3/submission/18307130252/img/KMeans_bisecting.png
new file mode 100644
index 0000000000000000000000000000000000000000..39d42cf680bcf99a5831d5241a9217934a71a0da
Binary files /dev/null and b/assignment-3/submission/18307130252/img/KMeans_bisecting.png differ
diff --git a/assignment-3/submission/18307130252/img/KMeans_bisecting_curve.png b/assignment-3/submission/18307130252/img/KMeans_bisecting_curve.png
new file mode 100644
index 0000000000000000000000000000000000000000..2b7b56d50bec6f7d90c2b64149cdaf2169675113
Binary files /dev/null and b/assignment-3/submission/18307130252/img/KMeans_bisecting_curve.png differ
diff --git a/assignment-3/submission/18307130252/img/KMeans_initialization.png b/assignment-3/submission/18307130252/img/KMeans_initialization.png
new file mode 100644
index 0000000000000000000000000000000000000000..81aa6babfdc00e59a2df9c0fda1fab573936ba92
Binary files /dev/null and b/assignment-3/submission/18307130252/img/KMeans_initialization.png differ
diff --git a/assignment-3/submission/18307130252/img/KMeans_origin.png b/assignment-3/submission/18307130252/img/KMeans_origin.png
new file mode 100644
index 0000000000000000000000000000000000000000..6496313bbaab425f5108903b54bdbb8c0f1b20a5
Binary files /dev/null and b/assignment-3/submission/18307130252/img/KMeans_origin.png differ
diff --git a/assignment-3/submission/18307130252/img/cluster.png b/assignment-3/submission/18307130252/img/cluster.png
new file mode 100644
index 0000000000000000000000000000000000000000..3b745b1d6957d39bbca114b2a3ef6da14c93bbd1
Binary files /dev/null and b/assignment-3/submission/18307130252/img/cluster.png differ
diff --git a/assignment-3/submission/18307130252/img/cluster_origin.png b/assignment-3/submission/18307130252/img/cluster_origin.png
new file mode 100644
index 0000000000000000000000000000000000000000..3b745b1d6957d39bbca114b2a3ef6da14c93bbd1
Binary files /dev/null and b/assignment-3/submission/18307130252/img/cluster_origin.png differ
diff --git a/assignment-3/submission/18307130252/img/elbow_result.png b/assignment-3/submission/18307130252/img/elbow_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..1a9ec5be84955ca0f2d658f01b8eede8a4ef6386
Binary files /dev/null and b/assignment-3/submission/18307130252/img/elbow_result.png differ
diff --git a/assignment-3/submission/18307130252/img/elbow_single_curve.png b/assignment-3/submission/18307130252/img/elbow_single_curve.png
new file mode 100644
index 0000000000000000000000000000000000000000..35d4d20221dcc49c43fa882cba0440d8260e0145
Binary files /dev/null and b/assignment-3/submission/18307130252/img/elbow_single_curve.png differ
diff --git a/assignment-3/submission/18307130252/img/silhouette_result.png b/assignment-3/submission/18307130252/img/silhouette_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..4a477601665eb7d14cfe6d35c763282178ae0858
Binary files /dev/null and b/assignment-3/submission/18307130252/img/silhouette_result.png differ
diff --git a/assignment-3/submission/18307130252/img/silhouette_score.png b/assignment-3/submission/18307130252/img/silhouette_score.png
new file mode 100644
index 0000000000000000000000000000000000000000..0924e361d3424f51e4362425657dd960601ab359
Binary files /dev/null and b/assignment-3/submission/18307130252/img/silhouette_score.png differ
diff --git a/assignment-3/submission/18307130252/img/silhouette_single_curve.png b/assignment-3/submission/18307130252/img/silhouette_single_curve.png
new file mode 100644
index 0000000000000000000000000000000000000000..e79e5cfa1f6456becfcbd1b80b0083a24b900f30
Binary files /dev/null and b/assignment-3/submission/18307130252/img/silhouette_single_curve.png differ
diff --git a/assignment-3/submission/18307130252/readme.md b/assignment-3/submission/18307130252/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..d2d8ff76039a1835cffa3cf55b1ed7795d9f6629
--- /dev/null
+++ b/assignment-3/submission/18307130252/readme.md
@@ -0,0 +1,193 @@
+# Assignment-3 Clustering Models
+
+[toc]
+
+## 1. Model Implementation
+
+### 1.1 K-means in Brief
+
+K-means assigns each sample to a cluster by estimating the centroid of every cluster. Suppose there are $n$ samples to be partitioned into $K$ clusters; the procedure is:
+
+1. Initialize $K$ cluster centers (see 2.1).
+2. Given the current $K$ centers, update each sample's cluster assignment: each sample $i$ finds the nearest center $center_k$ and is marked as a member of cluster $k$, i.e. $label_i = \arg\min_k \lVert center_k - sample_i \rVert^2$.
+3. Given the current assignments, update the positions of the $K$ centers: each $center_k$ moves to the centroid of all samples currently assigned to cluster $k$, i.e. $center_k = \frac{\sum_{i=1}^n [label_i = k] \cdot sample_i}{\sum_{i=1}^n [label_i = k]}$.
+4. Repeat steps 2 and 3 until the cluster centers stabilize. A vectorized sketch of one such round is given below.
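+
+As a complement to the list above, here is a minimal vectorized NumPy sketch of one assignment/update round. It is an illustration only, not the submitted implementation in `source.py` (which uses explicit loops); the names `X`, `centers`, `kmeans_round` and the random test data are invented for the example:
+
+```python
+import numpy as np
+
+def kmeans_round(X, centers):
+    # Step 2 (assignment): dist[i, k] = squared distance from sample i to center k.
+    dist = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
+    label = dist.argmin(axis=1)
+    # Step 3 (update): move each center to the centroid of its samples;
+    # a center that lost all its samples stays where it is.
+    new_centers = np.array([
+        X[label == k].mean(axis=0) if np.any(label == k) else centers[k]
+        for k in range(len(centers))
+    ])
+    return label, new_centers
+
+X = np.random.randn(100, 2)
+centers = X[np.random.choice(len(X), 3, replace=False)]
+for _ in range(10):
+    label, centers = kmeans_round(X, centers)
+```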
+
+### 1.2 GMM in Brief
+
+The essence of a GMM is to estimate, for each cluster, the parameters of a Gaussian distribution. With the estimated parameters, the probability that each point belongs to each distribution is computed, and the most probable distribution becomes the point's cluster label. Suppose there are $n$ samples to be partitioned into $K$ clusters; the parameters to estimate are $\pi_k, \mu_k, \sigma_k \ (1 \le k \le K)$, and the procedure is:
+
+1. Initialize the $K$ sets of parameters $\pi_k, \mu_k, \sigma_k \ (1 \le k \le K)$ (see 2.2).
+
+2. Given the current $K$ sets of parameters, update the posterior probability that sample $i$ was generated by the $k$-th Gaussian: $\gamma_{ik} = \frac{\pi_k \mathcal{N}(sample_i; \mu_k, \sigma_k)}{\sum_{j=1}^{K} \pi_j \mathcal{N}(sample_i; \mu_j, \sigma_j)}$.
+
+3. Given the current posteriors, update the $K$ sets of parameters:
+   $$
+   \begin{split}
+   N_k &= \sum_{i=1}^N \gamma_{ik}\\
+   \pi_k &= \frac{N_k}{\sum_{j=1}^K N_j}\\
+   \mu_k &= \frac{1}{N_k}\sum_{i=1}^N \gamma_{ik} \cdot sample_i\\
+   \sigma_k^2 &= \frac{1}{N_k}\sum_{i=1}^N \gamma_{ik}(sample_i - \mu_k)^2
+   \end{split}
+   $$
+
+4. Repeat steps 2 and 3 until the log marginal likelihood $\sum_{i=1}^N \log p(sample_i)$ converges. A sketch of one EM round is given below.
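+
+The corresponding EM round can be written compactly in NumPy. The sketch below assumes axis-aligned (diagonal-covariance) Gaussians for brevity, whereas `source.py` estimates full covariance matrices; all names here (`gmm_round`, `pi`, `mu`, `var`) are invented for the example:
+
+```python
+import numpy as np
+
+def gmm_round(X, pi, mu, var):
+    n, d = X.shape
+    # Weighted densities p[i, k] = pi_k * N(x_i; mu_k, var_k), via logs for stability.
+    diff = X[:, None, :] - mu[None, :, :]                               # (n, K, d)
+    log_p = (-0.5 * (diff ** 2 / var[None]).sum(axis=2)
+             - 0.5 * np.log(2 * np.pi * var).sum(axis=1) + np.log(pi))  # (n, K)
+    p = np.exp(log_p)
+    # E step: posterior responsibilities gamma_{ik}.
+    gamma = p / p.sum(axis=1, keepdims=True)
+    # M step: N_k, pi_k, mu_k and the variances, as in the formulas above.
+    N = gamma.sum(axis=0)
+    pi = N / n
+    mu = (gamma[:, :, None] * X[:, None, :]).sum(axis=0) / N[:, None]
+    var = (gamma[:, :, None] * (X[:, None, :] - mu[None]) ** 2).sum(axis=0) / N[:, None]
+    # Log marginal likelihood, the convergence criterion of step 4.
+    log_lik = np.log(p.sum(axis=1)).sum()
+    return pi, mu, var, log_lik
+
+X = np.random.randn(200, 2)
+pi, mu, var = np.full(3, 1 / 3), X[:3].copy(), np.ones((3, 2))
+for _ in range(20):
+    pi, mu, var, log_lik = gmm_round(X, pi, mu, var)
+```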
+
+### 1.3 Comparing the Two Models
+
+Both models can be decomposed from the perspective of the EM algorithm: the E step estimates the expected values of the latent variables, and the M step updates the remaining parameters to optimize the objective.
+
+In the GMM, the E step estimates the latent variables $\gamma_{ik}$, the M step updates the parameters $\pi_k, \mu_k, \sigma_k^2$, and the objective is $ELBO(\gamma, D; \pi, \mu, \sigma) = \sum_{i=1}^N \sum_{k=1}^K \gamma_{ik}\left(\frac{-(x_i - \mu_k)^2}{2\sigma_k^2} - \log \sigma_k + \log \pi_k\right) + C$.
+
+In K-Means, the E step estimates the latent variables $label_i$, the M step updates the parameters $center_k$, and the objective is the distortion function $J(label, center) = \sum_{i=1}^N \lVert x_i - center_{label_i} \rVert^2$.
+
+From this angle, GMM is a generative model built on the EM algorithm, while K-Means is a discriminative model built on it.
+
+Since neither the GMM nor the K-Means objective is convex, in many cases both only reach a local rather than a global optimum. The parameter initialization method therefore has a large influence on the converged result.
+
+One implementation caveat: the GMM involves evaluating Gaussian density functions and is more demanding on numerical precision. Operations that introduce precision error, such as square roots, should use `x ** 0.5` instead of `sqrt(x)`; otherwise precision underflow was observed in our tests.
+
+
+
+## 2. Parameter Initialization
+
+### 2.0 Data Standardization
+
+Since the input may contain few samples and may be very spread out, the data is standardized before any model computation to avoid precision overflow:
+$$
+data = \frac{data - mean}{std}
+$$
+
+
+### 2.1 K-means Initialization
+
+Three K-means initialization methods were tried in this experiment (a sketch of the third follows the list):
+
+1. Random initialization: randomly pick $K$ samples as the initial centers.
+2. Furthest-point initialization: randomly pick one sample as the first center, then iteratively add the sample furthest from the already chosen set.
+3. KMeans++: randomly pick one sample as the first center, then iteratively add new points, where samples further from the chosen set have a higher probability of being selected, i.e. method 2 with probabilistic selection added.
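+
+For reference, the third method can be sketched in a few lines. This is a standalone illustration assuming the textbook $D(x)^2$ weighting (selection probability proportional to the squared distance to the nearest chosen center); `kmeans_pp_init` and `rng` are names invented for the example:
+
+```python
+import numpy as np
+
+def kmeans_pp_init(X, K, rng=np.random.default_rng()):
+    # First center: a uniformly random sample.
+    centers = [X[rng.integers(len(X))]]
+    for _ in range(K - 1):
+        # Squared distance from every sample to its nearest chosen center.
+        d2 = ((X[:, None, :] - np.array(centers)[None]) ** 2).sum(axis=2).min(axis=1)
+        # Far samples are proportionally more likely to become the next center.
+        centers.append(X[rng.choice(len(X), p=d2 / d2.sum())])
+    return np.array(centers)
+```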
+
+The original dataset:
+
+
+
+The experiment was repeated 10 times; part of the visualization results are listed above. The left column is always furthest-point initialization, the middle column KMeans++ initialization, and the right column random initialization.
+
+From the data in the figure below, all three initialization methods show large variance, and it can hardly be said that one is better than another. However, since furthest-point initialization oscillates the least and is the most stable, it is adopted in the experiments that follow.
+
+
+
+Since the objective usually converges to a local rather than a global optimum, such results are entirely expected.
+
+
+
+### 2.2 GMM Initialization
+
+Three GMM initialization methods were tried in this experiment:
+
+1. Random initialization
+2. Furthest-point initialization
+3. Using the K-Means result as the initial parameters
+
+
+
+Again repeated 10 times, with only part of the visualizations listed here. The left column is always furthest-point initialization, the middle column KMeans initialization, and the right column random initialization. The GMM separates the two clearly distinct distributions in the upper right well in every case; the main difficulty lies with the clusters in the lower left, which none of the visualized runs recovers well. By comparison, however, only KMeans initialization separates the leftmost patch into 3 classes; the other two methods mix it together.
+
+
+
+Comparing accuracies, although the three initialization methods each win under some random seeds, KMeans initialization is clearly far ahead of the other two in both average accuracy and stability. Also, even though the GMM may not partition the data as evenly as KMeans (under the GMM some classes keep only a few scattered points while others merge several of the original distributions, so KMeans can sometimes look closer to the original dataset), the numbers show that the GMM beats KMeans in both accuracy and stability.
+
+
+
+## 3. Automatically Choosing the Number of Clusters K
+
+This experiment discusses how to choose the number of clusters K automatically, using the K-means algorithm as the base model.
+
+### 3.1 Elbow Method
+
+When building the model, an upper bound on the number of clusters can be set via `max_n_clusters = xxx`; if unset, it defaults to 20.
+
+The Elbow Method enumerates the number of clusters $K \in [1..max\_n\_clusters]$ and records the SSE for each K: $SSE = \sum_{k=1}^K \sum_{x \in label_k} \lVert x - center_k \rVert^2$, i.e. the sum of squared distances from every sample to its cluster center.
+
+Running this on the original dataset and plotting the metric yields the following curve:
+
+
+
+The blue point in the figure is the elbow, which corresponds to the desired number of clusters. A sketch of this selection rule is given below.
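+
+One concrete way to pick the elbow, mirroring the heuristic used in `source.py`, is to look at how sharply the per-step SSE drop collapses and choose the K just before the collapse. A small sketch with made-up `sse` values:
+
+```python
+import numpy as np
+
+def pick_elbow(sse):
+    # sse[j] is the SSE obtained with K = j + 1 clusters.
+    drop = -np.diff(sse)              # drop[j]: SSE gain of going from K to K + 1
+    ratio = drop[:-1] / drop[1:]      # how sharply the gains collapse
+    return int(np.argmax(ratio)) + 2  # map the index back to a K value
+
+sse = [1000.0, 420.0, 180.0, 90.0, 82.0, 76.0, 72.0]
+print(pick_elbow(sse))  # -> 4: beyond K = 4 the gains flatten out
+```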
+
+Using this rule, the original data and the resulting clustering are:
+
+
+
+
+
+Since the correct number of clusters is found, the result is essentially the same as before, with an accuracy of 91.08%.
+
+
+
+### 3.2 Silhouette Coefficient
+
+For every sample $i$, the silhouette coefficient is defined as
+$$
+s_i = \frac{b-a}{\max(a,b)} \in [-1, 1]
+$$
+where the cohesion $a$ is the average distance from sample $i$ to the other samples in its own cluster, and the separation $b$ is the average distance from sample $i$ to all samples in the nearest cluster.
+
+The nearest cluster is defined as
+$$
+C_j = \arg\min_{C_k} \frac{1}{|C_k|} \sum_{p \in C_k} \lVert p - X_i \rVert^2
+$$
+i.e. the distance from sample $X_i$ to a cluster $C_k$ is measured by the average distance from $X_i$ to all samples in $C_k$, and the nearest cluster is the closest cluster other than $X_i$'s own.
+
+For a fixed number of clusters $K$, the overall silhouette coefficient of the clustering is the mean of the per-sample silhouette coefficients, so it also lies in $[-1, 1]$. Under this metric we want samples within a cluster to be close together and samples across clusters to be far apart, so the larger the mean silhouette coefficient, the better the clustering. A sketch of the computation follows.
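+
+A direct (quadratic-time) sketch of this computation, independent of the `ClusteringAlgorithm` implementation; the function name and variables are invented for the example:
+
+```python
+import numpy as np
+
+def mean_silhouette(X, label):
+    ks = np.unique(label)
+    # Pairwise Euclidean distances between all samples.
+    dist = np.sqrt(((X[:, None, :] - X[None, :, :]) ** 2).sum(axis=2))
+    s = []
+    for i in range(len(X)):
+        total = np.array([dist[i, label == k].sum() for k in ks])
+        count = np.array([(label == k).sum() for k in ks])
+        own = int(np.where(ks == label[i])[0][0])
+        count[own] -= 1            # exclude the sample itself from its own cluster
+        if count[own] == 0:        # convention: s_i = 0 for singleton clusters
+            s.append(0.0)
+            continue
+        avg = total / count
+        a = avg[own]                     # cohesion
+        b = np.min(np.delete(avg, own))  # separation: nearest other cluster
+        s.append((b - a) / max(a, b))
+    return float(np.mean(s))
+```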
+
+Tested on the same dataset as before, the mean silhouette coefficient and the clustering are as follows:
+
+
+
+
+
+Considering that the light-blue and dark-green patches in the middle of the original dataset (the yellow patch in the current clustering) can indeed be regarded as one distribution, while the other distributions are classified essentially as in the original dataset, the final accuracy reaches 94.14%, higher than the elbow method. Moreover, over multiple runs this method proved more stable than the Elbow Method.
+
+## 4. Further Experiments
+
+### 4.1 Bisecting KMeans
+
+Bisecting KMeans is another refinement of the KMeans algorithm. It works as follows:
+
+1. Treat the whole dataset as one cluster and compute its center.
+2. Enumerate every existing cluster and try to split it in two with the KMeans algorithm; commit the split that minimizes the total SSE over all clusters after splitting. Repeat until the desired number of clusters is reached.
+
+The accuracy curves over 10 runs are:
+
+
+
+Clearly this variant performs poorly on this dataset, far worse than the earlier approaches. However, comparing the final clusterings of the 10 runs, only the following two outcomes occur:
+
+
+
+Compared with the earlier approaches, the clusterings produced by bisecting KMeans are very stable, which is inseparable from its working principle: new centers are obtained by iterative bisection.
+
+
+
+### 4.2 Clustering 3-Dimensional Data
+
+In the 3-D clustering test the number of clusters is set to 4 for ease of display; the original dataset is shown below.
+
+
+
+KMeans uses furthest-point initialization, the GMM uses KMeans initialization, and the automatic cluster selection uses the silhouette method.
+
+The test results are as follows:
+
+
+
+KMeans clustering result
+
+
+
+GMM clustering result
+
+
+
+Silhouette-based automatic clustering result
+
+In the higher-dimensional case, clustering with a known number of clusters remains very effective, but automatically determining the number of clusters is less satisfactory: the sample set with only 4 clusters is split into 9 classes.
\ No newline at end of file
diff --git a/assignment-3/submission/18307130252/source.py b/assignment-3/submission/18307130252/source.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c126ea84a3abdebe4aca6701d5138f48c9380c3
--- /dev/null
+++ b/assignment-3/submission/18307130252/source.py
@@ -0,0 +1,670 @@
+import numpy as np
+import math
+from depict import scatter_graph, curve_graph, generate
+
+def normalize(data):
+    """
+    parameter:
+        data: data to standardize
+    return:
+        the standardized data
+    """
+    stats = (data.mean(axis=0), data.std(axis=0))
+    return (data - stats[0]) / stats[1]
+
+class KMeans:
+
+    def __init__(self, n_clusters):
+        """
+        parameter:
+            n_clusters: number of clusters
+        """
+        self.n_clusters = n_clusters
+        self.means = None
+
+    def distance(self, X, Y):
+        """
+        parameter:
+            X, Y: two n-dimensional vectors
+        return:
+            the Euclidean distance between X and Y
+        """
+        return np.sum((X - Y) ** 2) ** 0.5
+
+    def initialize(self, train_data, initializer):
+        """
+        Initialize the n_clusters centers.
+        parameter:
+            train_data: training data
+            initializer: initialization method
+        """
+        n, dim = train_data.shape
+        if initializer == "furthest":  # pick one random sample, then iteratively add the sample furthest from the chosen set as a new center
+            self.means = np.zeros((self.n_clusters, dim))
+            idx = np.random.randint(n)
+            self.means[0] = train_data[idx]
+            for i in range(1, self.n_clusters):
+                min_dis = []  # min_dis[j]: distance from sample j to its nearest chosen center
+                for p in train_data:
+                    min_dis.append(min([self.distance(p, self.means[k]) for k in range(i)]))
+                self.means[i] = train_data[np.argmax(min_dis)]  # the furthest sample becomes the new center
+        if initializer == "KMeansPlusPlus":  # k-means++: like the above, but the new center is drawn at random, with samples further from the chosen set more likely to be picked
+            self.means = np.zeros((self.n_clusters, dim))
+            idx = np.random.randint(n)
+            self.means[0] = train_data[idx]
+            for i in range(1, self.n_clusters):
+                prob = []  # prob[j]: probability that sample j becomes the new center
+                for p in train_data:  # k-means++ weights by the squared distance to the nearest chosen center
+                    prob.append(min([self.distance(p, self.means[k]) for k in range(i)]) ** 2)
+                prob = np.array(prob)
+                prob /= sum(prob)  # normalize the squared distances into a probability distribution
+                self.means[i] = train_data[np.random.choice(range(n), p = prob.ravel())]  # draw the new center from that distribution
+        if initializer == "random":  # pick n_clusters distinct random samples as the initial centers
+            idx = np.random.permutation(n)
+            self.means = np.array(train_data[idx[:self.n_clusters]])
+
+    def fit(self, train_data, max_iter = 100, initializer = "furthest"):
+        """
+        parameter:
+            train_data: training data
+            max_iter: maximum number of iterations, 100 by default; stops early once the assignments converge
+            initializer: initialization method, one of
+                "random": random initialization
+                "furthest": furthest-point method (default)
+                "KMeansPlusPlus": KMeans++ method
+        """
+        # number of samples and their dimensionality
+        n, dim = train_data.shape
+
+        # standardization preprocessing
+        train_data = normalize(train_data)
+
+        # shuffle the training data
+        idx = np.random.permutation(n)
+        train_data = train_data[idx]
+
+        # initialize n_clusters cluster centers
+        self.initialize(train_data, initializer)
+        label = np.zeros(n)
+        sse = []
+        while True:
+            isChange = False
+            points = [[] for i in range(self.n_clusters)]  # points[i]: samples of the i-th cluster
+            for i in range(n):
+                minDis = self.distance(self.means[0], train_data[i])
+                minIndex = 0
+                for j in range(1, self.n_clusters):  # find the nearest center for this sample
+                    nowDis = self.distance(self.means[j], train_data[i])
+                    if nowDis < minDis:
+                        minIndex = j
+                        minDis = nowDis
+                if label[i] != minIndex:  # update the sample's cluster assignment
+                    isChange = True
+                    label[i] = minIndex
+                points[minIndex].append(train_data[i])
+            ans = 0
+            for i in range(self.n_clusters):  # update the centers from the current assignments
+                self.means[i] = np.sum(points[i], axis = 0) / len(points[i])
+                for j in range(len(points[i])):
+                    ans += self.distance(self.means[i], points[i][j])  # total within-cluster distance this round
+            sse.append(ans)
+            if not isChange:
+                break
+            max_iter -= 1
+            if max_iter <= 0:
+                break
+
+    def predict(self, test_data):
+        """
+        parameter:
+            test_data: test data
+        return:
+            the label array for test_data (numpy format)
+        """
+        n, dim = test_data.shape
+        test_data = normalize(test_data)  # standardization preprocessing
+        label = np.zeros(n)
+        for i in range(n):
+            minDis = self.distance(self.means[0], test_data[i])
+            minIndex = 0
+            for j in range(1, self.n_clusters):
+                nowDis = self.distance(self.means[j], test_data[i])
+                if nowDis < minDis:
+                    minIndex = j
+                    minDis = nowDis
+            label[i] = minIndex
+        return label
+
+def bisecting_kmeans(train_data, n_clusters):
+    """
+    Determine the cluster centers via bisecting KMeans.
+    parameter:
+        train_data: training data
+        n_clusters: number of clusters
+    return:
+        the label array for train_data (numpy format)
+    """
+    # number of samples and their dimensionality
+    n, dim = train_data.shape
+
+    # standardization preprocessing
+    train_data = normalize(train_data)
+    origin_data = train_data
+
+    # shuffle the training data
+    idx = np.random.permutation(n)
+    train_data = train_data[idx]
+
+    # treat the whole dataset as one cluster and compute the first center
+    center = [train_data.mean(axis = 0)]
+    cluster_sse = [sum(np.sum((x - center[0]) ** 2) for x in train_data)]
+    label = np.zeros(n)
+    cnt = 0
+
+    while len(center) < n_clusters:
+        min_sse = float('inf')
+        chosen_cluster = 0
+        new_center = np.zeros((2, dim))
+        new_sse = []
+        for i in range(len(center)):  # enumerate every existing cluster and try to split it
+            cur = [k for k, x in enumerate(label) if x == i]
+            cur_points = train_data[cur]
+            model = KMeans(2)
+            model.fit(cur_points)
+            res = model.predict(cur_points)  # split the current cluster in two with KMeans
+
+            # recompute the two centers and the new SSE in the outer coordinate
+            # space (KMeans normalizes its input again internally, so its means
+            # live in a different space than train_data)
+            side = [cur_points[res == 0], cur_points[res == 1]]
+            split_center = np.array([s.mean(axis = 0) for s in side])
+            split_sse = [np.sum((side[t] - split_center[t]) ** 2) for t in range(2)]
+
+            sse_other = sum(cluster_sse) - cluster_sse[i]
+
+            if sse_other + sum(split_sse) < min_sse:  # keep the split with the smallest total SSE
+                min_sse = sse_other + sum(split_sse)
+                chosen_cluster = i
+                new_center = split_center
+                new_sse = split_sse
+                new_cluster = [cur[k] for k, x in enumerate(res) if x == 1]
+
+        # commit the best split: update the per-cluster points and labels
+        center[chosen_cluster] = new_center[0]
+        center.append(new_center[1])
+        cluster_sse[chosen_cluster] = new_sse[0]
+        cluster_sse.append(new_sse[1])
+        cnt += 1
+        for idx in new_cluster:
+            label[idx] = cnt
+
+    label = np.zeros(n)
+    for i in range(n):
+        minDis = np.sum((center[0] - origin_data[i]) ** 2)
+        minIndex = 0
+        for j in range(1, n_clusters):
+            nowDis = np.sum((center[j] - origin_data[i]) ** 2)
+            if nowDis < minDis:
+                minIndex = j
+                minDis = nowDis
+        label[i] = minIndex
+    return label
+
+class GaussianMixture:
+
+    def __init__(self, n_clusters, eps = 1e-30):
+        """
+        parameter:
+            n_clusters: number of clusters
+            eps: precision tolerance, 1e-30 by default
+        """
+        self.means = None
+        self.pi = None
+        self.sigma = None
+        self.n_clusters = n_clusters
+        self.eps = eps  # precision tolerance
+        self.train_data = None
+
+    def gaussian(self, x, mean, cov):
+        """
+        parameter:
+            x, mean, cov: parameters of the Gaussian density
+        return:
+            N(x | mean, cov)
+        """
+        cov = np.asmatrix(cov)
+        delta = np.asmatrix(x - mean)
+        dim = delta.shape[1]
+        num = - (0.5 * delta * cov.I * delta.T + self.eps)
+        # normalization constant of a dim-dimensional Gaussian: (2*pi)^(dim/2) * |cov|^0.5
+        res = math.exp(num) / (((2.0 * np.pi) ** (dim / 2.0)) * (np.linalg.det(cov) ** 0.5))
+        return res
+
+    def distance(self, X, Y):
+        """
+        parameter:
+            X, Y: two n-dimensional vectors
+        return:
+            the Euclidean distance between X and Y
+        """
+        return np.sum((X - Y) ** 2) ** 0.5
+
+    def initialize(self, train_data, initializer):
+        """
+        Initialize the parameters of the n_clusters distributions.
+        parameter:
+            train_data: training data
+            initializer: initialization method
+        """
+        n, dim = train_data.shape
+        if initializer == "furthest":  # pick one random sample, then iteratively add the sample furthest from the chosen set as an initial center
+            self.means = np.zeros((self.n_clusters, dim))
+            idx = np.random.randint(n)
+            self.means[0] = train_data[idx]
+            for i in range(1, self.n_clusters):
+                min_dis = []
+                for p in train_data:
+                    min_dis.append(min([self.distance(p, self.means[k]) for k in range(i)]))
+                self.means[i] = train_data[np.argmax(min_dis)]
+            self.pi = np.array([1 / self.n_clusters for i in range(self.n_clusters)])
+            self.sigma = np.array([np.identity(dim) for i in range(self.n_clusters)])
+        if initializer == "random":  # pick n_clusters random samples as the initial distribution centers
+            idx = np.random.permutation(n)
+            self.means = np.array(train_data[idx[:self.n_clusters]])
+            self.pi = np.array([1 / self.n_clusters for i in range(self.n_clusters)])
+            self.sigma = np.array([np.identity(dim) for i in range(self.n_clusters)])
+        if initializer == "KMeans":  # run the GMM on top of a KMeans clustering
+            # get a cluster assignment from KMeans
+            model = KMeans(self.n_clusters)
+            model.fit(train_data)
+            label = model.predict(train_data)
+
+            # initialize the parameters by mimicking the M step of the GMM
+
+            # each distribution is centered at one of the n_clusters KMeans centers
+            self.means = model.means
+            # pi_k = (samples in cluster k) / (total samples)
+            N = np.zeros(self.n_clusters)
+            for i in range(n):
+                N[int(label[i])] += 1
+            sum_N = np.sum(N)
+            self.pi = N / sum_N
+            # covariance of each distribution, following the M-step formula
+            # (each sample only contributes to the covariance of its own cluster)
+            self.sigma = np.zeros((self.n_clusters, dim, dim))
+            for j in range(n):
+                k = int(label[j])
+                delta = np.asmatrix(train_data[j] - self.means[k])
+                self.sigma[k] += delta.T * delta
+            self.sigma = np.asarray([self.sigma[k] / N[k] for k in range(self.n_clusters)])
+
+    def fit(self, train_data, max_iter = 30, initializer = "KMeans"):
+        """
+        parameter:
+            train_data: training data
+            max_iter: number of iterations, 30 by default; stops early once the log marginal likelihood converges
+            initializer: initialization method, one of
+                "random" -- random initialization
+                "furthest" -- furthest-point method
+                "KMeans" -- KMeans initialization (default)
+        """
+        # number of samples and their dimensionality
+        n, dim = train_data.shape
+
+        # standardization preprocessing
+        train_data = normalize(train_data)
+
+        # shuffle the training data
+        idx = np.random.permutation(n)
+        train_data = train_data[idx]
+
+        # initialize the parameters with the requested method
+        self.initialize(train_data, initializer)
+
+        last_func = -1
+
+        for i in range(max_iter):
+            # E step: with means and sigma fixed, compute the posterior p(z|x)
+            gamma = np.zeros((n, self.n_clusters))
+            p = np.zeros((n, self.n_clusters))
+
+            for j in range(n):
+                p[j] = [self.eps + self.pi[k] * self.gaussian(train_data[j], self.means[k], self.sigma[k]) for k in range(self.n_clusters)]
+
+            for j in range(n):
+                gamma[j] = p[j] / sum(p[j])
+
+            # M step: with the posterior fixed, update the parameters means and sigma
+
+            # update N_k = sum(gamma[n, k])   n = 1..N
+            N = np.zeros(self.n_clusters)
+            for j in range(n):
+                N = np.asarray([N[k] + gamma[j, k] for k in range(self.n_clusters)])
+            sum_N = np.sum(N)
+
+            # update pi_k = N_k / N
+            self.pi = N / sum_N
+
+            # update mean_k = sum(gamma[n, k] * x[n]) / N_k   n = 1..N
+            self.means = np.zeros((self.n_clusters, dim))
+            for j in range(n):
+                self.means = np.asarray([self.means[k] + gamma[j][k] * train_data[j] for k in range(self.n_clusters)])
+            self.means = np.asarray([self.means[k] / N[k] for k in range(self.n_clusters)])
+
+            # update sigma_k^2 = sum(gamma[n, k] * (x[n] - mean_k)^2) / N_k   n = 1..N
+            self.sigma = np.zeros((self.n_clusters, dim, dim))
+            for j in range(n):
+                for k in range(self.n_clusters):
+                    delta = np.asmatrix(train_data[j] - self.means[k])
+                    self.sigma[k] += gamma[j][k] * (delta.T * delta)
+            self.sigma = np.asarray([self.sigma[k] / N[k] for k in range(self.n_clusters)])
+
+            # stop once the log marginal likelihood has converged
+            now_func = np.mean(np.log(np.sum(p, axis = 1)))
+            if abs(now_func - last_func) / abs(last_func) < self.eps:
+                break
+            last_func = now_func
+
+    def predict(self, test_data):
+        """
+        parameter:
+            test_data: test data
+        return:
+            the label array for test_data (numpy format)
+        """
+        n = test_data.shape[0]
+        test_data = normalize(test_data)  # standardization preprocessing
+        label = np.zeros(n)
+        for i in range(n):
+            # probability of the sample under each distribution
+            p = [self.pi[k] * self.gaussian(test_data[i], self.means[k], self.sigma[k]) for k in range(self.n_clusters)]
+            # the most likely distribution becomes the sample's cluster
+            label[i] = np.argmax(np.array(p))
+        return label
ClusteringAlgorithm(fit_method="elbow") + # a, b = model.fit(X) + # res = model.predict(X) + # silhouette_acc.append(cal_acc(res, Y, 7)) + # silhouette_Y.append(a) + # silhouette_target.append(b) + + # curve_graph("./img/silhouette_curve.png", range(2, 20), silhouette_Y, silhouette_target) + # curve_graph("./img/elbow_curve.png", range(1, 20), elbow_Y, elbow_target) + \ No newline at end of file