diff --git a/assignment-3/handout/source.py b/assignment-3/handout/source.py index 43aa5bb327e9a34a31b8c553af7cd9104acdd220..5cdb02368d31afb3cdf967cc26eec7b132e4b9f7 100644 --- a/assignment-3/handout/source.py +++ b/assignment-3/handout/source.py @@ -1,34 +1,37 @@ import numpy as np + class KMeans: def __init__(self, n_clusters): pass - + def fit(self, train_data): pass - + def predict(self, test_data): pass + class GaussianMixture: def __init__(self, n_clusters): pass - + def fit(self, train_data): pass - + def predict(self, test_data): pass + class ClusteringAlgorithm: def __init__(self): pass - + def fit(self, train_data): pass - + def predict(self, test_data): - pass + pass \ No newline at end of file diff --git a/assignment-3/handout/tester_demo.py b/assignment-3/handout/test_demo.py similarity index 99% rename from assignment-3/handout/tester_demo.py rename to assignment-3/handout/test_demo.py index 19ec0e8091691d4aaaa6b53dbb695fde9e826d89..a569dddc62de7f598dcc986179caedd842e44b77 100644 --- a/assignment-3/handout/tester_demo.py +++ b/assignment-3/handout/test_demo.py @@ -114,4 +114,4 @@ if __name__ == "__main__": if len(sys.argv) > 1 and sys.argv[1] == "--report": test_all(True) else: - test_all() + test_all() \ No newline at end of file diff --git a/assignment-3/submission/18307130154/README.md b/assignment-3/submission/18307130154/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ebae932a419bb8fca8b4591b403d8cfc81123282 --- /dev/null +++ b/assignment-3/submission/18307130154/README.md @@ -0,0 +1,260 @@ +# 作业3报告 + +## 概述 + +本次作业实现Kmeans算法和GMM算法(高斯混合模型)进行聚类,并实现方法对聚成的类数量进行选择。聚类算法均支持任意维度的输入,我自己构造了数据进行聚类的测试。报告中将推导高维GMM聚类的迭代公式,以及其中使用的迭代方法——EM方法(Expectation-Maximum)。 + +## Kmeans + +Kmeans算法: + +* 假设要聚成K类,随机选择K个初始类中心 +* 每一轮迭代中,将每个点放在最近的类中心所在类;对每个类,重新计算类中心,普遍使用的方法是所有点取平均值 +* 终止条件:我设置的是所有类中心在前后两轮迭代中没有变化,则终止算法。 + +Kmeans是一种简单的EM方法实例,K-means 算法迭代步骤中的**每次确认中心点以后重新进行标记**对应 EM 算法中的 E 步**求当前参数条件下的 Expectation**。而**根据标记重新求中心点**对应EM算法中的 M 
步**求似然函数最大化时(损失函数最小时)对应的参数**。从EM层面理解,可以得出结论:Kmeans一定会收敛到局部最优解。Kmeans算法简单、高效、收敛快,但存在下列问题: + +* 要求数据分布是凸的,且是球形分布。对于非球形分布的问题,需要采用核函数映射到高维。 +* 对异常值敏感 +* 对初始值的选取敏感,可能收敛到局部最优解 + +### 异常值问题 + +针对上面问题的第二个做了简单的改进,为了克服异常值(考虑严重离群点)的影响,在计算新的类中心时,**如果一个点偏离本来的类中心太远(实践中,采用簇内平均距离的20倍作为阈值),就不纳入平均值的计算中**。只有在簇中数据不很少时才采用这种方法(实践中设置为多于50个)。 + +### 克服初始值选取的影响 + +初始值选取对Kmeans的收敛结果影响很大,不好的初始值容易导致其收敛到局部最优解。在实现中,我重复3次Kmeans聚类,并使用下面会介绍的**轮廓系数**对收敛结果进行评估,选取最好的作为聚类结果。理论上,这样可以降低不好的初始值造成局部最优解的可能。 + +## GMM + +高斯混合模型,是用多个高斯分布的加权来拟合数据分布,本次作业主要用它来进行聚类,当然,每个高斯分布就代表最后聚成的一类。 + +GMM的概率密度函数: +$$ +P(x|\theta) = \sum_{i=1}^{K}p_k \times N(x|\theta_k) +$$ +其中,$p_k$是隐变量,表示数据服从某一种高斯分布的先验; $\theta_k$表示第k种高斯分布的参数,具体包括**均值$\mu_k$和协方差$\Sigma_k$** + +与Kmeans相比,GMM的特点包括: + +* Kmeans更倾向将数据聚成球形,GMM由于引入协方差矩阵,对椭圆分布更加灵敏,更贴合现实 +* GMM模型大,运算慢,依然无法解决非球形分布的数据聚类,需要使用核函数映射到高维。 + +### GMM迭代 + +GMM聚类的思路是对参数 {$p_1,p_2...,p_K,\mu_1,\mu_2,...,\mu_K,\sigma_1,\sigma_2,...,\sigma_K$} 进行最大似然估计,其中样本包括: + +* 观测数据,也就是训练集,记作 X +* 隐变量,这是采用了EM的思想,数据本身只是一个样本点,无关乎到底“属于”哪个高斯分布;但我们隐式的给每一条样本赋一个随机变量 z, 用来表示它“属于”哪个分布。这个隐变量无法观测,但是EM算法的第一步通过隐变量的后验概率求均值,来“估计”它所在的分布。 + +既然只是进行最大似然估计,为什么要通过迭代的方式?可以看出,如果直接通过拉格朗日乘数法这样的解析方法计算参数,很难直接得到解析解,于是,通过迭代收敛的EM方法正好适合GMM聚类。 + +### E步 + +EM算法分为两步 + +* E:对当前观测样本中的隐变量通过后验求期望 +* M:最大似然估计参数 + +这里先进行第一步 E,形式化的,把EM的优化目标列出来: +$$ +\theta^{(t+1)} = argmax E_{z|x,\theta^{(t)}}[logP(x,z|\theta^{(t)})] +$$ +设共K个高斯分布,观测样本数为N, 下面对argmax中的式子进行转化 +$$ +Q(\theta) = \sum_{z_1z_2...z_N}(log\prod_{i=1}^NP(x_i,z_i|\theta) \prod_{i=1}^{n}P(z_i|x_i,\theta^{(t)})) \\\\ += \sum_{z_1z_2...z_N}(\sum_{i=1}^NlogP(x_i,z_i|\theta) \prod_{i=1}^{n}P(z_i|x_i,\theta^{(t)})) \\\\ += \sum_{i=1}^N(\sum_{z_1z_2...z_N}logP(x_i,z_i|\theta) \prod_{i=1}^{n}P(z_i|x_i,\theta^{(t)})) \\\\ += \sum_{i=1}^N(\sum_{z_1z_2...z_N}logP(x_i,z_i|\theta) \times P(z_i|x_i,\theta^{(t)}) \prod_{j=1,j\neq i}^{n}P(z_j|x_j,\theta^{(t)})) \\\\ += \sum_{i=1}^N(\sum_{z_i}logP(x_i,z_i|\theta) \times P(z_i|x_i,\theta^{(t)}) \sum_{z_1...z_{i-1} z_{i+1}...z_K} \prod_{j=1,j\neq 
i}^{n}P(z_j|x_j,\theta^{(t)})) +$$ +注意到,$\sum_{z_2...z_N} \prod_{j=2}^{N}P(z_j|x_j,\theta^{(t)})$这部分实际上是对$z_2到z_N$的每一种组合求全概率,所以为1, +$$ +上式 = \sum_{i=1}^N(\sum_{z_i}logP(x_i,z_i|\theta) \times P(z_i|x_i,\theta^{(t)})) \\\\ += \sum_{i=1}^N(\sum_{k=1}^KlogP(x_i,z_i|\theta) \times P(z_i=k|x_i,\theta^{(t)})) \\\\ += \sum_{i=1}^N(\sum_{k=1}^Klog(p_k \times N(x_i | \mu_k,\Sigma_k)) \times P(z_i=k|x_i,\theta^{(t)})) +$$ +其中$p_k,\mu_k,\Sigma_k$是需要优化的参数,值得注意的是,$P(z_i|x_i,\theta^{(t)})$中的$\theta^{(t)}$是迭代上一轮的参数,属于先验,不在优化的范围内。 + +### M步 + +然后我们对参数进行优化,这里只详细推导了参数p的计算过程,一方面限于篇幅,另一方面参数p与其他两类参数不同,属于条件优化问题,而其他两者通过矩阵求导容易得到。 + +我们形式化的定义这个优化问题: +$$ +p = \{p_1,p_2,...,p_K\}\\\\ +\max_p \sum_{i=1}^N(\sum_{k=1}^Klog(p_k \times N(x_i | \mu_k,\Sigma_k)) \times P(z_i=k|x_i,\theta^{(t)}))\\\\ +s.t. \sum_{i=1}^Kp_i = 1 +$$ +于是可以使用拉格朗日乘数法进行优化: +$$ +L(p,\lambda) =\sum_{i=1}^N(\sum_{k=1}^Klog(p_k \times N(x_i | \mu_k,\Sigma_k)) \times P(z_i = k|x_i,\theta^{(t)})) + \lambda(\sum_{k=1}^Kp_k - 1)\\\\ +\frac{\partial L}{\partial p_k} = \sum_{i=1}^N\frac{1}{p_k}P(z_i=k|x_i,\theta^{(t)}) + \lambda\\\\ +结合\frac{\partial L}{\partial p_k} = 0 和 \frac{\partial L}{\partial \lambda} = 0 +解得 \\ +p_k^{(t+1)} = \frac{1}{N} \sum_{i=1}^NP(z_i=k|x_i,\theta^{(t)}) +$$ +再把P中的概率写开,这里实际上就是贝叶斯估计中利用全概率公式求后验 +$$ +P(z_i=k|x_i,\theta^{(t)}) = \frac{P(x_i,z_i=k)}{P(x_i)} \\\\ +=\frac{p_k^{(t)}N(x_i|\mu_k^{(t)},\Sigma_k^{(t)})}{\sum_{j=1}^Kp_j^{(t)}N(x_i|\mu_j^{(t)},\Sigma_j^{(t)})} +$$ +之后我们令$r_{ik}=P(z_i=k|x_i,\theta^{(t)})$,即第i个样本是第k类的后验,它可由迭代前的先验得出。此时我们有: +$$ +p_k^{(t+1)} = \frac{1}{N} \sum_{i=1}^NP(z_i=k|x_i,\theta^{(t)}) = \frac{1}{N}\sum_{i=1}^Nr_{ik} +$$ +同样的方法,我们还可以得到其他两个参数的最大似然估计: +$$ +令中间变量m_k=\sum_{i=1}^Nr_{ik}\\\\ +\mu_k = \frac{1}{m_k}\sum_{i=1}^{N}r_{ik}x_i\\\\ +\Sigma_k = \frac{1}{m_k}\sum_{i=1}^Nr_{ik}(x_i-\mu_k)^T(x_i-\mu_k) +$$ + +这两个变量的详细迭代公式我将手写的过程放在img中,由于相对繁杂且难看,不放在报告里了。 + +### 更新值 + +这样,我们得到了GMM中EM每一轮的迭代公式。迭代的初值设定: + +* 随机选取K个样本作为均值$\mu$ +* 取0.2倍单位阵作为协方差矩阵 +* 取$\frac{1}{K}$ 作为先验 P + 
+默认迭代次数为50次。 + +## 自动选择聚簇数量 + +设计模型自动选择最优的聚簇数量。我使用的评价标准是轮廓系数(Silhouette Coefficient)。我们希望得到的簇中,簇内尽量紧密,簇间尽量远离,轮廓系数是类的密集与分散程度的评价指标。轮廓系数的计算公式是: +$$ +S(i) = \frac{b(i)-a(i)}{max\{b(i),a(i)\}}\\\\ +Silhouette Coefficient = \frac{1}{N}\sum_1^NS(i) +$$ +a代表同簇样本到彼此间距离的均值,b代表样本到除自身所在簇外的最近簇的样本的均值,轮廓系数是所有样本S(i)的均值。S(i)取值在[-1, 1]之间。 如果S(i)接近1,代表样本所在簇合理;若S(i)接近-1代表S(i)更应该分到其他簇中。 + +### 循环结束标准 + +通过循环,依次尝试2至最大10000范围内的所有聚簇数量,寻找轮廓系数最高的聚簇数量。考虑效率因素,我设置这样的循环结束标准:如果连续5次尝试得到的轮廓系数比当前最大的轮廓系数小,则终止循环。由于轮廓系数曲线一般为一个上凸函数,有理由相信轮廓系数将不会再超过此时的最大值。 + +## 实验 + +我对不同分布的二维和三维数据进行了实验。 + +### 二维,分散的样本点 + +这组数据由4个高斯分布生成,输入的聚簇数量为4,整体上较为分散,相关系数均为0,即圆形分布。 + +#### Kmeans + +前面介绍到,Kmeans对初始类中心选择敏感。在我对**克服初始值选取的影响**这一项进行优化前,大约5次中会有一次收敛到局部最优解,出现这样的簇聚结果,此时的轮廓系数=0.4993806937643633,迭代次数为15: + +![image-20210613111556776](img/image-20210613111556776.png) + +通过重复3次聚类,并根据轮廓系数筛选最优的方法,在此数据上重复20次运行,均得到预期结果,轮廓系数=0.7005718095084397,迭代次数3~13: + +![image-20210613112058224](img/image-20210613112058224.png) + +#### GMM + +使用GMM聚类的方法,这种分散的数据也能很好的收敛。不过进行50轮迭代,运行时间相对较长。 + +![image-20210613112305460](img/image-20210613112305460.png) + +### 二维,较密集、椭圆形分布的数据 + +这组数据分布类之间较密集,且为椭圆形分布(协方差不为0)。同前面一样,依然是4类。 + +#### Kmeans + +整体上Kmeans表现良好,但可以观察到,Kmeans更倾向于分成圆形的分布: + +![image-20210613113041196](img/image-20210613113041196.png) + +这也导致了,在有些时候,Kmeans的收敛结果和高斯分布的生成方式出现区别: + +![image-20210613113138408](img/image-20210613113138408.png) + +#### GMM + +相比之下,GMM表现更好,收敛结果基本和椭圆形数据的生成方式一致: + +![image-20210613113402912](img/image-20210613113402912.png) + +### 二维,更大椭率的分布 + +这组数据在上一组的基础上增大了椭圆的椭率。 + +#### Kmeans + +可以看到,由于倾向圆形分布,下图画圈的几个位置的样本点Kmeans聚类和预期不符。 + +![image-20210613113650673](img/image-20210613113650673.png) + +#### GMM + +GMM表现较优,但也存在问题:GMM的簇聚结果倾向于样本点多的高斯分布(先验概率P大),这也导致了下图中蓝色“挤走”部分应该是红色的区域。 + +![image-20210613113936850](img/image-20210613113936850.png) + +### 三维,分散的数据 + +下面这组数据是三维高斯分布的数据,由3个高斯分布生成,聚为3类。整体上比较分散易于聚类。 + +#### Kmeans + +![image-20210613114833598](img/image-20210613114833598.png) + +#### GMM + 
+![image-20210613114938144](img/image-20210613114938144.png) + +### 三维,密集的数据 + +将类间的密集程度提高 + +#### Kmeans + +![image-20210613115409576](img/image-20210613115409576.png) + +#### GMM + +![image-20210613115339403](img/image-20210613115339403.png) + +### 总结 + +| | 优点 | 缺点 | +| ------ | -------------------------------------- | -------------------------------------- | +| Kmeans | 效率高,原理简单 | 不适用于椭球形,对初始中心、异常值敏感 | +| GMM | 适合椭球形,结果为概率分布而非确定的类 | 速度慢,倾向于先验概率高的类 | + +### 自动选择聚簇结果的实验 + +算法原理前面已经介绍过,这里放一张轮廓系数的计算概念图: + +![image-20210613120125395](img/image-20210613120125395.png) + +通过循环,依次尝试2至最大10000范围内的所有聚簇数量,寻找轮廓系数最高的聚簇数量。考虑效率因素,我设置这样的循环结束标准:如果连续5次尝试得到的轮廓系数比当前最大的轮廓系数小,则终止循环。 + +这种选择方式与具体使用的聚类方法无关,因此我的ClusteringAlgorithm可以支持Kmeans和GMM两种聚类算法,可以通过入参type进行调整,默认为Kmeans。 + +#### 在4类的二维数据上进行测试 + +由于左上角高斯分布生成的点数相对很少,最终的选择结果是聚成3类: + +![image-20210613120550093](img/image-20210613120550093.png) + +如果增大左上角高斯分布生成的样本数,将聚成4类: + +![image-20210613120930011](img/image-20210613120930011.png) + +#### 在3类的三维数据上进行测试 + +聚成3类: + +![image-20210613121057160](img/image-20210613121057160.png) + +轮廓系数变化: + +![image-20210613121126405](img/image-20210613121126405.png) + +由于蓝色类样本数很少,可以看到k=2 和 k=3时轮廓系数差别不大,非常合理。 \ No newline at end of file diff --git a/assignment-3/submission/18307130154/img/image-20210613111539321.png b/assignment-3/submission/18307130154/img/image-20210613111539321.png new file mode 100644 index 0000000000000000000000000000000000000000..0aa0311894443f2c0b8d9790f956695a9f5e8ba5 Binary files /dev/null and b/assignment-3/submission/18307130154/img/image-20210613111539321.png differ diff --git a/assignment-3/submission/18307130154/img/image-20210613111556776.png b/assignment-3/submission/18307130154/img/image-20210613111556776.png new file mode 100644 index 0000000000000000000000000000000000000000..0aa0311894443f2c0b8d9790f956695a9f5e8ba5 Binary files /dev/null and b/assignment-3/submission/18307130154/img/image-20210613111556776.png differ diff --git 
a/assignment-3/submission/18307130154/img/image-20210613112058224.png b/assignment-3/submission/18307130154/img/image-20210613112058224.png new file mode 100644 index 0000000000000000000000000000000000000000..8c71eccb0312e69b35f86cc50854d39ce5379dad Binary files /dev/null and b/assignment-3/submission/18307130154/img/image-20210613112058224.png differ diff --git a/assignment-3/submission/18307130154/img/image-20210613112305460.png b/assignment-3/submission/18307130154/img/image-20210613112305460.png new file mode 100644 index 0000000000000000000000000000000000000000..8cf8189b58f2684723acf42bc6692d78d23e564a Binary files /dev/null and b/assignment-3/submission/18307130154/img/image-20210613112305460.png differ diff --git a/assignment-3/submission/18307130154/img/image-20210613112633098.png b/assignment-3/submission/18307130154/img/image-20210613112633098.png new file mode 100644 index 0000000000000000000000000000000000000000..628ef61e3e547f6fd47c6643e684745b00c240de Binary files /dev/null and b/assignment-3/submission/18307130154/img/image-20210613112633098.png differ diff --git a/assignment-3/submission/18307130154/img/image-20210613113041196.png b/assignment-3/submission/18307130154/img/image-20210613113041196.png new file mode 100644 index 0000000000000000000000000000000000000000..351a7cfcfab1e9d8513f6c41fda9f7dad487e8c6 Binary files /dev/null and b/assignment-3/submission/18307130154/img/image-20210613113041196.png differ diff --git a/assignment-3/submission/18307130154/img/image-20210613113138408.png b/assignment-3/submission/18307130154/img/image-20210613113138408.png new file mode 100644 index 0000000000000000000000000000000000000000..039df8332aeafb75ee0ba5a07e0cc3b6abe9a61a Binary files /dev/null and b/assignment-3/submission/18307130154/img/image-20210613113138408.png differ diff --git a/assignment-3/submission/18307130154/img/image-20210613113402912.png b/assignment-3/submission/18307130154/img/image-20210613113402912.png new file mode 100644 index 
0000000000000000000000000000000000000000..f2660df20d4685e583a26dd6afdb1a33fc8be8cf Binary files /dev/null and b/assignment-3/submission/18307130154/img/image-20210613113402912.png differ diff --git a/assignment-3/submission/18307130154/img/image-20210613113650673.png b/assignment-3/submission/18307130154/img/image-20210613113650673.png new file mode 100644 index 0000000000000000000000000000000000000000..44fc42b9a40bc138349aa5d72062d6601c3e9d34 Binary files /dev/null and b/assignment-3/submission/18307130154/img/image-20210613113650673.png differ diff --git a/assignment-3/submission/18307130154/img/image-20210613113936850.png b/assignment-3/submission/18307130154/img/image-20210613113936850.png new file mode 100644 index 0000000000000000000000000000000000000000..36903f2864bc33bec487da16fe1c9535b68fe543 Binary files /dev/null and b/assignment-3/submission/18307130154/img/image-20210613113936850.png differ diff --git a/assignment-3/submission/18307130154/img/image-20210613114833598.png b/assignment-3/submission/18307130154/img/image-20210613114833598.png new file mode 100644 index 0000000000000000000000000000000000000000..d05322a5bf0dced08fd06cc0559b36c8e35b55d0 Binary files /dev/null and b/assignment-3/submission/18307130154/img/image-20210613114833598.png differ diff --git a/assignment-3/submission/18307130154/img/image-20210613114938144.png b/assignment-3/submission/18307130154/img/image-20210613114938144.png new file mode 100644 index 0000000000000000000000000000000000000000..0640e906c621496323efd3535ebf4cbd39fbb3d8 Binary files /dev/null and b/assignment-3/submission/18307130154/img/image-20210613114938144.png differ diff --git a/assignment-3/submission/18307130154/img/image-20210613115339403.png b/assignment-3/submission/18307130154/img/image-20210613115339403.png new file mode 100644 index 0000000000000000000000000000000000000000..d1ff60cf4efd7487075483800f9da41333c86621 Binary files /dev/null and 
b/assignment-3/submission/18307130154/img/image-20210613115339403.png differ diff --git a/assignment-3/submission/18307130154/img/image-20210613115409576.png b/assignment-3/submission/18307130154/img/image-20210613115409576.png new file mode 100644 index 0000000000000000000000000000000000000000..96ed977c290efb0ffdcfbdc4dd9e24be0cfe4a35 Binary files /dev/null and b/assignment-3/submission/18307130154/img/image-20210613115409576.png differ diff --git a/assignment-3/submission/18307130154/img/image-20210613120125395.png b/assignment-3/submission/18307130154/img/image-20210613120125395.png new file mode 100644 index 0000000000000000000000000000000000000000..40f1e00fea0990242f9c9e92948b8872ede51f14 Binary files /dev/null and b/assignment-3/submission/18307130154/img/image-20210613120125395.png differ diff --git a/assignment-3/submission/18307130154/img/image-20210613120550093.png b/assignment-3/submission/18307130154/img/image-20210613120550093.png new file mode 100644 index 0000000000000000000000000000000000000000..75198836d7f00de49ad3e57a5f8af5581e77bb97 Binary files /dev/null and b/assignment-3/submission/18307130154/img/image-20210613120550093.png differ diff --git a/assignment-3/submission/18307130154/img/image-20210613120930011.png b/assignment-3/submission/18307130154/img/image-20210613120930011.png new file mode 100644 index 0000000000000000000000000000000000000000..7e94d3871b3421b88053279f08c3aea57ed6f909 Binary files /dev/null and b/assignment-3/submission/18307130154/img/image-20210613120930011.png differ diff --git a/assignment-3/submission/18307130154/img/image-20210613121057160.png b/assignment-3/submission/18307130154/img/image-20210613121057160.png new file mode 100644 index 0000000000000000000000000000000000000000..e567d19ae2d5ab5b2ace1b96b72ccab10db4531e Binary files /dev/null and b/assignment-3/submission/18307130154/img/image-20210613121057160.png differ diff --git a/assignment-3/submission/18307130154/img/image-20210613121126405.png 
b/assignment-3/submission/18307130154/img/image-20210613121126405.png new file mode 100644 index 0000000000000000000000000000000000000000..b5a5c7d0d02054a21dd2ca4c3b3f22cd08037100 Binary files /dev/null and b/assignment-3/submission/18307130154/img/image-20210613121126405.png differ diff --git "a/assignment-3/submission/18307130154/img/\346\216\250\345\257\2741.png" "b/assignment-3/submission/18307130154/img/\346\216\250\345\257\2741.png" new file mode 100644 index 0000000000000000000000000000000000000000..826e255393e037b194b690a09020bd7c525adc4f Binary files /dev/null and "b/assignment-3/submission/18307130154/img/\346\216\250\345\257\2741.png" differ diff --git "a/assignment-3/submission/18307130154/img/\346\216\250\345\257\2742.png" "b/assignment-3/submission/18307130154/img/\346\216\250\345\257\2742.png" new file mode 100644 index 0000000000000000000000000000000000000000..f95832bd757bd189a6be369bc2c29d0438bf8d7b Binary files /dev/null and "b/assignment-3/submission/18307130154/img/\346\216\250\345\257\2742.png" differ diff --git a/assignment-3/submission/18307130154/source.py b/assignment-3/submission/18307130154/source.py new file mode 100644 index 0000000000000000000000000000000000000000..29436a0f9537daf32808f50c4ceda9264a559857 --- /dev/null +++ b/assignment-3/submission/18307130154/source.py @@ -0,0 +1,366 @@ +import numpy as np + +class KMeans: + + def __init__(self, n_clusters): + self.k = n_clusters # 聚成的类数 + self.centers = None # 类中心(均值) + + def fit(self, train_data): + train_data = train_data.copy() + if train_data.shape[0] <= self.k: + self.centers = train_data + return + + def cluster(train_data, cens): + ''' + 一轮迭代 + parameters: + cens: 上一轮的类中心 + return: + 新的类中心 + ''' + clusters = [[] for _ in range(self.k)] + for i in range(train_data.shape[0]): # 对每个点,选择最近的类中心作为这个点的类 + data = train_data[i] + distances = np.sum((data - cens) ** 2,axis=1) + minind = np.argmin(distances) + clusters[minind].append(i) + + ncens = [] + for i in range(self.k): # 
计算每个类中点的平均值作为新的类中心 + data = train_data[clusters[i]] + #删除异常值 + if data.shape[0] > 50: + distances = (np.sum((data - cens[i])** 2,axis=1)) ** 0.5 + avgdis = np.mean(distances) + ind = np.where(distances < avgdis * 20) + data = data[ind] + cen = np.average(data,axis=0) + ncens.append(cen) + return np.array(ncens) + + def eval(train_data): + np.random.shuffle(train_data) + cens = train_data[0:self.k] # 随机选择k个数据作为初始的类中心 + iter = 0 + while 1: + iter += 1 + ncens = cluster(train_data,cens) + if (ncens == cens).all(): # 迭代终止条件:前后两轮类中心不变,此后簇聚结果也不再会随迭代改变 + # print("迭代次数"+str(iter)) + break + cens = ncens + self.centers = cens + kind = self.predict(train_data) + evaluator = ClusteringAlgorithm() + distancs = evaluator._cal_distance(train_data) + score = evaluator.cal_Silhouette_Coefficient(train_data,kind,self.k,distancs) + # print(score) + recore_cens = cens + return recore_cens,score + + #为了克服初始类中心选取对簇聚结果的影响,重复若干次实验,选择轮廓系数最大的 + maxscore = -2 + for i in range(3): + cens,score = eval(train_data) + if score > maxscore: + self.centers = cens + + #对测试集聚类,如果draw=true, 将绘制聚类结果图(二维可用) + def predict(self, test_data,draw=False): + ans = [] + for i in range(test_data.shape[0]): + data = test_data[i] + # print(data) + distances = np.sum((self.centers - data) ** 2,axis=1) + cenid = np.argmin(distances) + # print(cenid) + ans.append(cenid) + if draw: + if test_data.shape[1] == 2: + self._drawing_dim2(np.array(ans),test_data) + else: + self._drawing_dim3(np.array(ans),test_data) + return np.array(ans) + + def _drawing_dim2(self,kind,X): + colors = ['red','green','blue','yellow'] + import matplotlib.pyplot as plt + for i in range(self.k): + ind = np.where(kind==i) + plt.scatter(X[ind][:,0], X[ind][:,1], c=colors[i%(len(colors))], alpha=0.4) + plt.show() + + def _drawing_dim3(self,kind,X): + from mpl_toolkits.mplot3d import Axes3D + import matplotlib.pyplot as plt + fig = plt.figure() + ax = Axes3D(fig) + colors = ['red','green','blue','yellow'] + for i in range(self.k): + ind = 
np.where(kind==i) + ax.scatter(X[ind][:,0], X[ind][:,1],X[ind][:,2], c=colors[i%(len(colors))], alpha=0.4) + + ax.legend(loc='best') + plt.show() + +class GaussianMixture: + + def __init__(self, n_clusters,iteration=50): + # pass + self.K = n_clusters + self.miu = self.sigma = self.P = None # GMM中各个高斯分布的sigma和miu,以及隐变量P + ''' + K为隐变量维度,即聚成的类数; D为维度 + miu: K * D 为均值 + sigma: K * D * D 为协方差矩阵 + P: 1 * K + ''' + self.itertion = iteration # 迭代次数,默认50轮 + + def _N(self,x, miu, sigma): + # 计算多维高斯分布的联合概率密度函数 + ''' + x,miu,sigma是np.array 1*D + 要转化为列正态向量 np.matrix + ''' + x = (np.matrix(x)).T + miu = (np.matrix(miu)).T + sigma = np.matrix(sigma) + D = x.shape[0] + t1 = 1/((2 * np.pi) ** (D/2) * (np.linalg.det(sigma))**0.5) + t2 = -0.5 * np.matmul(np.matmul((x - miu).T,sigma.I),(x - miu)) + return t1 * np.exp(t2) + np.exp(-200) + + def _cal_r(self,X,miu,sigma,Pt): + # 计算迭代时的中间变量 R(n * K) + # 其中 Rik 表示第 i 个数据属于第 k 个高斯分布的概率 + n = X.shape[0] + ret = np.zeros((n,self.K)) + for i in range(n): + x = X[i] + sum = 0 + for k in range(self.K): + t = Pt[k] * self._N(x,miu[k],sigma[k]) + sum += t + ret[i][k] = t + for k in range(self.K): + ret[i][k] /= sum + return ret + + def _iterate(self,X): + ''' + 一轮迭代 + 需要更新的参数有: + 设K为隐变量维度,即聚成的类数; D为维度 + miu: K * D 为均值 + sigma: K * D * D 为协方差矩阵 + P: 1 * K + ''' + miu = self.miu; sigma = self.sigma; Pt = self.P + n = X.shape[0] + D = X.shape[1] + R = self._cal_r(X,miu,sigma,Pt) + m = np.sum(R,axis=0) + nP = m / n #新的P + temp = np.matmul(R.T , X) + nmiu = temp / m[:,np.newaxis] #新的miu + + nsigma = np.zeros((self.K,D,D)) + for k in range(self.K): + temp = np.zeros((D,D)) + for i in range(n): + t = X[i] - miu[k] + t = np.matrix(t) + temp += R[i][k] * np.matmul(t.T,t) + temp /= m[k] + nsigma[k] = temp + + self.P = nP + self.sigma = nsigma + self.miu = nmiu + + def fit(self, train_data): + # 对训练集进行聚类,将进行 self.itertion 次迭代 + n = train_data.shape[0] + D = train_data.shape[1] + train_data = train_data.copy() + ind = np.arange(n) + np.random.shuffle(ind) + 
self.miu = train_data[ind[:self.K]] + self.sigma = np.full((self.K,D,D), fill_value=np.diag(np.full(D,0.2))) + self.P = np.ones(self.K) / self.K + for iter in range(self.itertion): + self._iterate(train_data) + # kind = self.predict(train_data) + # self._drawing_dim2(kind,train_data) + + + def _drawing_dim2(self,kind,X): + colors = ['red','green','blue','yellow'] + import matplotlib.pyplot as plt + for i in range(self.K): + ind = np.where(kind==i) + plt.scatter(X[ind][:,0], X[ind][:,1], c=colors[i%(len(colors))], alpha=0.4) + plt.show() + + def _drawing_dim3(self,kind,X): + from mpl_toolkits.mplot3d import Axes3D + import matplotlib.pyplot as plt + fig = plt.figure() + ax = Axes3D(fig) + colors = ['red','green','blue','yellow'] + for i in range(self.K): + ind = np.where(kind==i) + ax.scatter(X[ind][:,0], X[ind][:,1],X[ind][:,2], c=colors[i%(len(colors))], alpha=0.4) + + ax.legend(loc='best') + plt.show() + + def predict(self, test_data, draw=False): + #预测,设置draw=true 将绘制聚类结果 + R = self._cal_r(test_data,self.miu,self.sigma,self.P) + kind = np.argmax(R,axis=1) + if draw: + if test_data.shape[1] == 2: + self._drawing_dim2(kind,test_data) + else: + self._drawing_dim3(kind,test_data) + return kind + +class ClusteringAlgorithm: + + def __init__(self,type=1): + if type==0: + self.ModelClass = GaussianMixture + elif type==1: + self.ModelClass = KMeans + + def _cal_distance(self,X): + # 计算所有点两两之间的距离,返回距离矩阵 + n = X.shape[0] + distance = np.zeros((n,n)) + for i in range(n): + distance[i] = (np.sum((X[i] - X)**2,axis=1) )**0.5 + return distance + + def _cal_avgdistance(self,indk,ind,distance): + # 计算ind点到indk中所有点的平均距离 + if indk[0].shape[0] < 2: + return 0 + indk = np.delete(indk,np.where(indk==ind)) + avg = np.mean(distance[ind,indk]) + return avg + + def cal_Silhouette_Coefficient(self,X,kind,K,distance): + # 计算整个簇聚结果的轮廓系数 + n = X.shape[0] + inds = [] + for k in range(K): + inds.append(np.where(kind==k)) + S = [] + for i in range(n): # 对每个点求 S(i) + ai = 
self._cal_avgdistance(inds[kind[i]],i,distance) + bi = np.inf + for k in range(K): + if k == kind[i]: + continue + bi = min(bi,self._cal_avgdistance(inds[k],i,distance)) + S.append((bi-ai)/max(bi,ai)) + return np.mean(np.array(S)) + + + def fit(self, train_data): + distance = self._cal_distance(train_data) + print("计算距离结束") + fl = 0 + maxscore = -10 + # 通过循环,依次尝试2至最大10000范围内的所有聚簇数量,寻找轮廓系数最高的聚簇数量。考虑效率因素, + # 我设置这样的循环结束标准:如果连续5次尝试得到的轮廓系数比当前最大的轮廓系数小,则终止循环。 + for n_clusters in range(2,10000): + print("K=" + str(n_clusters)) + model = self.ModelClass(n_clusters) + model.fit(train_data) + kind = model.predict(train_data) + score = self.cal_Silhouette_Coefficient(train_data,kind,n_clusters,distance) + if score <= maxscore: + fl += 1 + else: + self.model = model + maxscore = score # 选择最佳的聚类结果(轮廓系数最高) + fl = 0 + if fl >= 5: + break + print("轮廓系数:" + str(score)) + + def predict(self, test_data, draw=False): + kind = self.model.predict(test_data,draw) + return kind + +def shuffle(*datas): + data = np.concatenate(datas) + label = np.concatenate([ + np.ones((d.shape[0],), dtype=int)*i + for (i, d) in enumerate(datas) + ]) + N = data.shape[0] + idx = np.arange(N) + np.random.shuffle(idx) + data = data[idx] + label = label[idx] + return data, label + +def data_1(): + sigma = 40 + cold = 1.5 + mean = (-10,-10) + cov = np.array([[sigma, sigma/cold], [sigma/cold, sigma]]) + x = np.random.multivariate_normal(mean, cov, (800,)) + + mean = (-10, 10) + cov = np.array([[sigma,sigma/cold], [sigma/cold, sigma]]) + y = np.random.multivariate_normal(mean, cov, (800,)) + + mean = (10, 0) + cov = np.array([[sigma, sigma/cold], [sigma/cold, sigma]]) + z = np.random.multivariate_normal(mean, cov, (1000,)) + + mean = (10, -20) + cov = np.array([[sigma, sigma/cold], [sigma/cold, sigma]]) + w = np.random.multivariate_normal(mean, cov, (1000,)) + + data, _ = shuffle(x, y, z,w) + return (data, data), 4 + +def data_2(): + sigma = 14 + mean = (10,0,0) + cov = np.array([[sigma, 0,0], [0, 
sigma,0],[0,0,sigma]]) + x = np.random.multivariate_normal(mean, cov, (800,)) + + mean = (0,10,0) + cov = np.array([[sigma, 0,0], [0, sigma,0],[0,0,sigma]]) + y = np.random.multivariate_normal(mean, cov, (200,)) + + mean = (0,0,10) + cov = np.array([[sigma, 0,0], [0, sigma,0],[0,0,sigma]]) + z = np.random.multivariate_normal(mean, cov, (1000,)) + + data, _ = shuffle(x, y, z) + return (data, data), 3 + +if __name__=="__main__": + # (data,_),K = data_2() + # # print(data.shape) + + # model = KMeans(K) + # model.fit(data) + # model.predict(data,draw=True) + + (data,_),K = data_2() + # print(data.shape) + + model = ClusteringAlgorithm(type=1) + model.fit(data) + model.predict(data,draw=True)