diff --git a/assignment-3/handout/source.py b/assignment-3/handout/source.py index 43aa5bb327e9a34a31b8c553af7cd9104acdd220..9a0a6d3a653cac055c1316c7c06a910ddef7eb4a 100644 --- a/assignment-3/handout/source.py +++ b/assignment-3/handout/source.py @@ -31,4 +31,4 @@ class ClusteringAlgorithm: pass def predict(self, test_data): - pass + pass \ No newline at end of file diff --git a/assignment-3/handout/test_demo.py b/assignment-3/handout/test_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..a569dddc62de7f598dcc986179caedd842e44b77 --- /dev/null +++ b/assignment-3/handout/test_demo.py @@ -0,0 +1,117 @@ +import numpy as np +import sys + +from source import KMeans, GaussianMixture + + +def shuffle(*datas): + data = np.concatenate(datas) + label = np.concatenate([ + np.ones((d.shape[0],), dtype=int)*i + for (i, d) in enumerate(datas) + ]) + N = data.shape[0] + idx = np.arange(N) + np.random.shuffle(idx) + data = data[idx] + label = label[idx] + return data, label + + +def data_1(): + mean = (1, 2) + cov = np.array([[73, 0], [0, 22]]) + x = np.random.multivariate_normal(mean, cov, (800,)) + + mean = (16, -5) + cov = np.array([[21.2, 0], [0, 32.1]]) + y = np.random.multivariate_normal(mean, cov, (200,)) + + mean = (10, 22) + cov = np.array([[10, 5], [5, 10]]) + z = np.random.multivariate_normal(mean, cov, (1000,)) + + data, _ = shuffle(x, y, z) + return (data, data), 3 + + +def data_2(): + train_data = np.array([ + [23, 12, 173, 2134], + [99, -12, -126, -31], + [55, -145, -123, -342], + ]) + return (train_data, train_data), 2 + + +def data_3(): + train_data = np.array([ + [23], + [-2999], + [-2955], + ]) + return (train_data, train_data), 2 + + +def test_with_n_clusters(data_fuction, algorithm_class): + (train_data, test_data), n_clusters = data_fuction() + model = algorithm_class(n_clusters) + model.fit(train_data) + res = model.predict(test_data) + assert len( + res.shape) == 1 and res.shape[0] == test_data.shape[0], "shape of result is wrong" + return res + + +def testcase_1_1(): + test_with_n_clusters(data_1, KMeans) + return True + + +def testcase_1_2(): + res = test_with_n_clusters(data_2, KMeans) + return res[0] != res[1] and res[1] == res[2] + + +def testcase_2_1(): + test_with_n_clusters(data_1, GaussianMixture) + return True + + +def testcase_2_2(): + res = test_with_n_clusters(data_3, GaussianMixture) + return res[0] != res[1] and res[1] == res[2] + + +def test_all(err_report=False): + testcases = [ + ["KMeans-1", testcase_1_1, 4], + ["KMeans-2", testcase_1_2, 4], + # ["KMeans-3", testcase_1_3, 4], + # ["KMeans-4", testcase_1_4, 4], + # ["KMeans-5", testcase_1_5, 4], + ["GMM-1", testcase_2_1, 4], + ["GMM-2", testcase_2_2, 4], + # ["GMM-3", testcase_2_3, 4], + # ["GMM-4", testcase_2_4, 4], + # ["GMM-5", testcase_2_5, 4], + ] + sum_score = sum([case[2] for case in testcases]) + score = 0 + for case in testcases: + try: + res = case[2] if case[1]() else 0 + except Exception as e: + if err_report: + print("Error [{}] occurs in {}".format(str(e), case[0])) + res = 0 + score += res + print("+ {:14} {}/{}".format(case[0], res, case[2])) + print("{:16} {}/{}".format("FINAL SCORE", score, sum_score)) + + +if __name__ == "__main__": + if len(sys.argv) > 1 and sys.argv[1] == "--report": + test_all(True) + else: + test_all() \ No newline at end of file diff --git a/assignment-3/submission/16300110008/README.md b/assignment-3/submission/16300110008/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f57d4441c80b8ece7c186bf7d13cfad7f52353ac --- /dev/null +++ b/assignment-3/submission/16300110008/README.md @@ -0,0 +1,681 @@ + + +# 课程报告 + +这是一个有关numpy实现K-Means以及GMM的实验报告,代码保存在source.py中,实验结果展示使用了matplotlib库。 + +## 可视化聚类分析 + +### 数据生成 + +数据生成采用了`numpy.random.multivariate_normal()`方法,可以在给定参数的情况下生成多元二维高斯分布,示例如下: + + + +下面实验中所用到的数据均由此方法产生。 + +### K-Means + +#### 原理简介 + +K-Means算法是一类原型聚类方法,此类方法假定据类结构可以通过一组原型刻画,一般来说,算法先对原型进行初始化,之后对原型进行迭代更新求解,不同的原型聚类方法在原型表示、求解方式上有所不同。 + +在原型表示方式上,K-Means利用均值向量作为簇的原型;在求解方式上,给定样本集 $D=\{\pmb x_1,\pmb x_2,...,\pmb x_m\}$ ,假设算法得到的一个簇划分为 $C=\{C_1,C_2,...,C_k\}$ ,此划分应当满足平方误差最小,即使 +$$ +E=\sum^k_{i=1}\sum_{\pmb x \in C_i} || \pmb x - \pmb \mu_i ||^2_2 +\tag{1} +$$ +最小,其中 $\pmb \mu_i$ 是簇 $C_i$ 的均值向量。显然,想要找到这样的簇结构需要考察样本集的所有可能的划分,而这则是一个NP难问题,因此往往采用贪心策略,使用迭代优化来近似求解,当迭代更新后,聚类结果不变或者均值向量不变时停止迭代。 + +#### 数据预处理 + +考虑到输入数据中数据的分布可能比较分散,在计算均值向量及距离时很有可能出现因为数据太大而溢出的现象。因此在进行聚类前,笔者对数据的进行了标准化处理,代码如下: + +```python +def normalization(X): + """ + 对数据进行标准化处理,使其均值为0,方差为1 + :param X: 输入数据 + :return: 标准化后的数据 + """ + mean = np.mean(X, axis=0) + std = np.std(X, axis=0) + X = (X - mean) / std + return X +``` + +#### 初始化对聚类的影响分析 + +笔者使用了随机选取均值向量的方法确定各个簇的原型,这一做法在分布较为均衡的样本上表现很好,基本可以得到良好的聚类结果,但是如果样本分布十分不均衡时,随机选取的均值向量属于容量较少的类的可能就会降低,从而影响聚类结果,下图中笔者在两组实验中使用了相同的均值和协方差均值作为分布的参数,调整了生成数据时每个分布产生的样本数量,随机选取向量的代码如下: + +```python +idx = np.arange(num) # num是样本的数量 +mean_vec_idx = np.random.choice(idx, self.n_clusters, replace=False) # self.n_clusters是簇的个数,replace=False表示抽样不重复 +mean_vec = train_data[mean_vec_idx] # 初始均值向量 +``` + +参数如下: + +
+ 表1 参数取值 +
+ +| 生成簇名 | 均值 | 协方差矩阵 | 组1数量 | 组2数量 | 组3数量 | +| -------- | ---------------------- | ------------------------------------------------------------ | ------- | ------- | ------- | +| C1 | $\pmb \mu_1= [50, 22]$ | $\pmb \Sigma_1=\left[\begin{array}{cc}10 & 5 \\\\5 & 10\end{array}\right]$ | 100 | 100 | 100 | +| C2 | $\pmb \mu_2= [16, -5]$ | $\pmb \Sigma_2=\left[\begin{array}{cc}21.2 & 0 \\\\0 & 32.1\end{array}\right]$ | 100 | 100 | 1000 | +| C3 | $\pmb \mu_3= [10, 20]$ | $\pmb \Sigma_3=\left[\begin{array}{cc}73 & 0 \\\\0 & 22\end{array}\right]$ | 100 | 1000 | 10000 | + +
+ 表2 聚类过程与结果 +
+ + + + + + + + + + + + + + + + + + + + + + +
GenerationFirst IterationResult
+ +实验结果表明,当原始簇的数量比较均衡时,即使初始向量分布比较集中,也能够经过逐次迭代最终返回一个比较合理的簇分化;但是当原始簇的数量分布不均衡时,初始向量对最终的聚类结果会产生很大的影响,比如导致数量较少的原始簇被并入数量较大的原始簇中,尽管这两个簇可能在分布上差距较大;当各簇之间数量悬殊过大时,即使能够生成理想的初始向量,聚类结果也难以令人满意。对于后一个问题,GMM模型可以很好地解决,这里先不予分析;对于前一个问题,则需要对初始均值向量的选取做一定的限定 + +一种避免初始向量分布集中的做法是迭代挑选均值向量,使得新的均值向量距离已有向量足够远。K-Means++采用了以依据距离确定选取均值向量概率的做法,使得距离簇中心越远的样本被选为新中心的概率越大,基本思路如下: + +1. 随机选取第一个均值向量 +2. 计算每个样本到最近的均值向量的距离 +3. 以距离为概率选取新的均值向量 +4. 重复2、3,直到选出k个均值向量 + +代码如下: + +```python +def initialize(self): + """ + 初始化均值向量,KMeans++方法 + :return: 初始化后的均值向量 + """ + # n, d, k分别是样本数量,维度以及簇数 + n = self.data.shape[0] + d = self.data.shape[1] + k = self.n_clusters + # 随机选取第一个均值向量 + idx = np.random.randint(0, n) + # 初始化均值向量存储矩阵 + mean_vec = np.zeros((k, d)) + mean_vec[0] = self.data[idx].reshape(1, -1) + # 初始化每个样本到均值向量的距离存储矩阵 + dist = np.zeros((n, k)) + # 将每个样本到第一个向量的距离填入矩阵中 + dist[:, 0] = np.linalg.norm(self.data - mean_vec[0].reshape(1, -1), axis=1) + # 迭代选取剩余的向量 + for i in range(1, k): + # 保留每个样本到距离其最近的类中心的距离,距离都保存在dist矩阵的前i列中 + dist_min = np.min(dist[:, :i], axis=1) + # 在保留的距离中选取距离最大的一个作为新的均值向量 + new_vec_idx = np.argmax(dist_min) + # 将此步产生的向量并入 + mean_vec[i] = self.data[new_vec_idx].reshape(1, -1) + # 计算每个样本到新向量的距离,并存入距离矩阵 + dist[:, i] = np.linalg.norm(self.data - mean_vec[i], axis=1) + return mean_vec +``` + +对于上述实验中的组2,在更换了初始化方式后,聚类结果直观上有了很大的改善: + +
+ 表3 KMeans++方法初始化的组2 +
+ + + + + + + + + + + + +
GenerationFirst IterationResult
+ +### GMM + +#### 原理简介 + +GMM也是一类原型聚类向量,但与KMeans不同的是,其采用概率模型而非原型向量来刻画聚类结构。下面对GMM模型的原理进行梳理,推导过程参考周志华《机器学习》。 + +GMM算法假设样本由高斯混合分布生成,高斯混合分布的定义如下: +$$ +\begin {aligned} +P(\pmb x) &= +\frac 1 {(2\pi)^{\frac n 2} |\pmb \Sigma|^{\frac 1 2}} +e^{-\frac 1 2 +(\pmb x-\pmb \mu)^T |\pmb \Sigma|^{-1}(\pmb x-\pmb \mu)} +\\\\ +P_{M}(\pmb x) &= \sum^k_{i=1}\alpha_i \cdot P(\pmb x|\pmb \mu_i, \pmb \Sigma_i) +\end{aligned} +\tag{2} +$$ +生成的过程为根据混合系数的先验分布选择高斯混合成分,再根据被选择的混合成分的概率密度函数进行采样生成响应的样本。 + +根据贝叶斯定理可以计算样本属于每个混合成分的后验概率: +$$ +\begin{aligned} +P_M(z_j=i|\pmb x_j) +&= \frac{P(z_j=i) \cdot P_M(\pmb x_j|z_j=i)}{P_M(\pmb x_j)}\\\\ +&=\frac {\alpha_i \cdot P(\pmb x_j|\pmb \mu_i,\pmb \Sigma_i)} {\sum^k_{l=1}\alpha_l \cdot P(\pmb x_j|\pmb \mu_l,\pmb \Sigma_l)} +\\\\ +&=\gamma_{ji} +\end{aligned} +\tag{3} +$$ +其中,$z_j=i$ 表示第j个样本属于第i个高斯混合成分,将此后验概率记为 $\gamma_{ji}$ 。依据此后验概率分布,可以将样本划归其对应后验概率最大的混合成分。 + +可以使用极大似然法对模型参数进行估计: +$$ +\begin{aligned} +LL(D)&=\ln \biggl(\prod^m_{j=1} P_M(\pmb x_j) \biggl)\\\\ +&=\sum^m_{j=1} \ln +\biggl( +\sum^k_{i=1}\alpha_i \cdot P(\pmb x_j|\pmb \mu_i, \pmb \Sigma_i) +\biggl) +\end{aligned} +\tag{4} +$$ +那么寻找一个最优的簇划分的问题则转变为最大化(4)式的问题。 + +对(4)式对参数 $\pmb \mu_i, \pmb \Sigma_i$ 的偏导进行推导: + +对 $\pmb \mu_i$ : +$$ +\begin{aligned} +&\frac{\partial LL(D)}{\partial \pmb \mu_i}=0 \\\\ +\iff& +\frac{\partial}{\partial \pmb \mu_i} +\sum^m_{j=1} \ln +\biggl( +\sum^k_{i=1}\alpha_i \cdot P(\pmb x_j|\pmb \mu_i, \pmb \Sigma_i) +\biggl) +=0 \\\\ +\iff& +\sum^m_{j=1} +\frac{\partial}{\partial \pmb \mu_i} +\ln +\biggl( +\sum^k_{i=1}\alpha_i \cdot P(\pmb x_j|\pmb \mu_i, \pmb \Sigma_i) +\biggl) +=0 \\\\ +\iff& +\sum^m_{j=1} +\frac {1}{\sum^k_{i=1}\alpha_i \cdot P(\pmb x_j|\pmb \mu_i, \pmb \Sigma_i)} +\frac{\partial}{\partial \pmb \mu_i} +\biggl( +\sum^k_{i=1}\alpha_i \cdot P(\pmb x_j|\pmb \mu_i, \pmb \Sigma_i) +\biggl) +=0 \\\\ +\iff& +\sum^m_{j=1} +\frac {\alpha_i}{\sum^k_{i=1}\alpha_i \cdot P(\pmb x_j|\pmb \mu_i, \pmb \Sigma_i)} +\frac{\partial}{\partial \pmb \mu_i} +\biggl( +P(\pmb x_j|\pmb \mu_i, \pmb \Sigma_i) +\biggl) +=0 +\end{aligned} +\tag{5} +$$ + +其中 +$$ +\begin{aligned} +\frac{\partial}{\partial \pmb \mu_i} +\biggl( +P(\pmb x_j|\pmb \mu_i, \pmb \Sigma_i) +\biggl) +&=\frac{\partial}{\partial \pmb \mu_i} +\biggl( +\frac 1 {(2\pi)^{\frac n 2} |\pmb \Sigma_i|^{\frac 1 2}} +e^{-\frac 1 2 +(\pmb x_j-\pmb \mu_i)^T \pmb \Sigma_i^{-1}(\pmb x_j-\pmb \mu_i)} +\biggl) +\\\\ +&= +\frac 1 {(2\pi)^{\frac n 2} |\pmb \Sigma_i|^{\frac 1 2}} +e^{-\frac 1 2 +(\pmb x-\pmb \mu_i)^T \pmb \Sigma_i^{-1}(\pmb x-\pmb \mu_i)} +\\\\ +&\ \cdot\frac{\partial}{\partial \pmb \mu_i} +\biggl( +-\frac 1 2 +(\pmb x_j-\pmb \mu_i)^T \pmb \Sigma_i^{-1}(\pmb x_j-\pmb \mu_i) +\biggl) +\\\\ +&= +P(\pmb x_j|\pmb \mu_i, \pmb \Sigma_i)\\\\ +& \cdot +\biggl( +-\frac 1 2 +\biggl) +\frac{\partial}{\partial \pmb \mu_i} +\biggl( +(\pmb x_j-\pmb \mu_i)^T \pmb \Sigma_i^{-1}(\pmb x_j-\pmb \mu_i) +\biggl) +\end{aligned} +\tag{6} +$$ + +根据二次型矩阵求导公式 +$$ +\frac +{\partial x^TAx} +{\partial x} +=2Ax +\tag{7} +$$ + +有 +$$ +\frac{\partial}{\partial \pmb \mu_i} +\biggl( +(\pmb x_j-\pmb \mu_i)^T \pmb \Sigma_i^{-1}(\pmb x_j-\pmb \mu_i) +\biggl) +=-2 +\pmb \Sigma_i^{-1} +(\pmb x_j-\pmb \mu_i) +\tag{8} +$$ + +将(6)、(8)带回(5),有 +$$ +\sum^m_{j=1} +\frac +{\alpha_i \cdot P(\pmb x_j|\pmb \mu_i, \pmb \Sigma_i)}{\sum^k_{i=1}\alpha_i \cdot P(\pmb x_j|\pmb \mu_i, \pmb \Sigma_i)} +\pmb \Sigma_i^{-1} +(\pmb x_j-\pmb \mu_i) +=0 +\tag{9} +$$ + +因为 $|\pmb \Sigma_i|^{-1}$ 左边是一个标量,因此可以在两边左乘 $|\pmb \Sigma_i|$ ,则有 +$$ +\sum^m_{j=1} +\frac +{\alpha_i \cdot P(\pmb x_j|\pmb \mu_i, \pmb \Sigma_i)}{\sum^k_{i=1}\alpha_i \cdot P(\pmb x_j|\pmb \mu_i, \pmb \Sigma_i)} +(\pmb x_j-\pmb \mu_i) +=0 +\tag{10} +$$ + +由式(3)有 +$$ +\pmb \mu_i = +\frac +{\sum^m_{j=1} \gamma_{ji} \pmb x_j} +{\sum^m_{j=1} \gamma_{ji}} +\tag{11} +$$ + +也就是均值向量可以通过样本的加权平均来估计,样本权重是每个样本属于该成分的后验概率。 +同理可以求出协方差矩阵(参考《机器学习》): +$$ +\pmb \Sigma_i +=\frac +{\sum^m_{j=1}\gamma_{ji}(\pmb x_j-\pmb \mu_i)(\pmb x_j-\pmb \mu_i)^T} +{\sum^m_{j=1}\gamma_{ji}} +\tag{12} +$$ + +又因为优化LL(D)建立在混合系数之和为1的条件下,因此使用拉格朗日乘子法: +$$ +LL(D)+\lambda +\biggl( +\sum^k_{i=1}\alpha_i-1 +\biggl) +\tag{13} +$$ + +对混合系数求导: +$$ +\sum^m_{j=1} +\frac +{P(\pmb x_j|\pmb \mu_i, \pmb \Sigma_i)}{\sum^k_{l=1}\alpha_l \cdot P(\pmb x_l|\pmb \mu_i, \pmb \Sigma_l)} ++\lambda +=0 +\tag{14} +$$ + +对此式在两边乘以αi,并且对所有成分进行求和,有 +$$ +\begin{aligned} +&\sum^k_{i=1} +\sum^m_{j=1} +\frac +{\alpha_i P(\pmb x_j|\pmb \mu_i, \pmb \Sigma_i)}{\sum^k_{l=1}\alpha_l \cdot P(\pmb x_l|\pmb \mu_i, \pmb \Sigma_l)} ++\sum^k_{i=1}\lambda \alpha_i +=0\\\\ +\iff +& +\sum^m_{j=1} +\sum^k_{i=1} +\frac +{\alpha_i P(\pmb x_j|\pmb \mu_i, \pmb \Sigma_i)}{\sum^k_{l=1}\alpha_l \cdot P(\pmb x_l|\pmb \mu_i, \pmb \Sigma_l)} ++\sum^k_{i=1}\lambda \alpha_i +=0\\\\ +\iff +& +\sum^m_{j=1} +\sum^k_{i=1} +\gamma_{ji} ++\sum^k_{i=1}\lambda \alpha_i +=0\\\\ +\iff +& +\sum^m_{j=1} +\sum^k_{i=1} +P_M(z_j=i|\pmb x_j) ++\sum^k_{i=1}\lambda \alpha_i +=0\\\\ +\iff +& +\sum^m_{j=1} +1 ++\lambda \sum^k_{i=1} \alpha_i +=0\\\\ +\iff +& +m ++\lambda +=0\\\\ +\iff +& \lambda = -m +\end{aligned} +\tag{15} +$$ + +所以有 +$$ +\begin{aligned} +&\sum^m_{j=1} +\frac +{\alpha_i P(\pmb x_j|\pmb \mu_i, \pmb \Sigma_i)}{\sum^k_{l=1}\alpha_l \cdot P(\pmb x_l|\pmb \mu_i, \pmb \Sigma_l)} ++\lambda \alpha_i +=0\\\\ +\iff +&\sum^m_{j=1} +\gamma_{ji} +-m \alpha_i +=0\\\\ +\iff +& +\alpha_i= +\frac 1 m +\sum^m_{j=1} +\gamma_{ji} +\end{aligned} +\tag{16} +$$ + +即每个高斯混合成分的混合系数由样本属于该成分的平均后验概率确定。 + +因为我们无法观测到样本属于哪一个混合成分,因此可以使用EM算法求解, + +- E步:先根据当前参数计算每个样本属于每个高斯成分的后验概率 $\gamma_{ji}$ +- M步:再根据(11),(12),(16)更新模型参数 $\mu,\Sigma,\alpha$ + +《机器学习》P210 + +聚类代码参见 [source.py](source.py) 。 + +#### 初始化与计算稳定性 + +初始化方式上,笔者将设定混合系数的初始值相等,方差为样本方差,均值通过随机抽取的方产生。在计算过程中,笔者发现,如果样本中某两个特征i与j之间线性相关程度很高,会导致计算协方差矩阵逆矩阵的过程中产生奇异矩阵。为了提升计算稳定性,笔者在求取后验概率时,对于奇异矩阵使用伪逆作为替代,并且使用了平滑手段,代码如下: + +```python +def compute_posterior_prob(weight, mu, cov_mat, x): + """ + 计算每个样本的后验概率 + :param weight: 混合系数 + :param mu: 均值 + :param cov_mat: 协方差矩阵 + :param x: 样本数据 + :return: 后验概率 + """ + d = mu.shape[0] + # 用来盛放协方差矩阵的逆矩阵 + inv_cov_mat = np.zeros_like(cov_mat) + for i in range(d): + # 如果遇到奇异矩阵则使用伪逆替代 + try: + inv_cov_mat[i] = np.linalg.inv(cov_mat[i]) + except np.linalg.LinAlgError: + print("遭遇奇异矩阵,使用伪逆替代") + inv_cov_mat[i] = np.linalg.pinv(cov_mat[i]) + # 下面开始计算正态分布概率密度 + ## 指数部分,使用了增加维度的方式提升运算速度 + exponential = - np.diagonal( + np.matmul(np.matmul(x - mu[:, np.newaxis, :], inv_cov_mat), + (x - mu[:, np.newaxis, :]).transpose(0, 2, 1)), axis1=1, axis2=2 + ) / 2 + ## 求行列式可能溢出,进行平滑,满足下式:cov_det = sign * np.exp(logdet),因为协方差矩阵的行列式不为赋值,所以就等于后半部分 + sign, logdet = np.linalg.slogdet(cov_mat) + ## 计算没有加权的概率密度,np.exp(logdet / 2)是开了根号的行列式 + phi = np.divide(np.exp(exponential), np.power(2 * np.pi, d / 2) + * np.exp(logdet / 2).reshape(-1, 1) + epsilon).T + ## 实际上,计算概率密度的过程中很容易出现上溢或下溢的现象,对其进行平滑, + ## 分别取最小或最大值作为平滑标准,因为平滑因子在分子分母上斗湖出现,所以不会对结果产生影响 + if np.abs(np.min(phi)) < 1e-10: + phi += epsilon + factor = np.power(np.min(phi), -0.5) + numer = phi * factor * weight.T + elif np.abs(np.max(phi)) > 1e8: + factor = np.power(np.max(phi), 0.5) + numer = phi * factor * weight.T + else: + numer = phi * weight.T + # 计算归一化操作需要的分母 + denom = np.sum(numer, axis=1).reshape(-1, 1) + # 计算平滑操作后每一行是不是和为1 + res = numer / (denom + epsilon) + assert np.abs( + np.sum( + np.sum( + res, + axis=1, + keepdims=True) - + np.ones((res.shape[0], 1)))) < 1e-3 + # 返回后验概率和分母,用作后续计算 + return res, denom +``` + +笔者同时设置了最大迭代次数与极大似然函数值作为度量,当达到最大迭代次数或者极大似然函数值更新幅度较小时,停止迭代。 + +#### 聚类过程与结果探究 + +GMM模型能够很好地对初始化不理想的模型进行聚类,对于1.2.3中的组2,即便初始化的均值向量比较集中,GMM可以进行很好地分类。为了让实验具有对照的作用,均值初始化方式选择了无约束的随机初始化。 + +
+ 表4 GMM聚类的组2 +
+ + + + + + + + + + + + + + +
+
+
初始分布
+
+
+
Ietration=1
+
+
+
Ietration=7
+
+
+
Ietration=13
+
+
+
Ietration=21
+
+
+
Ietration=39
+
+不难看出,即使初始向量分布非常集中,经过一定次数的迭代最终也可以得到比较理想的聚类结果,而且另一方面GMM会得到一个数据分布不连续的集合,这对于一些非连续的分布也会有很好的作用。 + +对于分布不均衡的数据,KMeans会受到非常大的影响,这是因为KMean可以看作高斯混合聚类在混合成份方差相等,且每个样本仅指派给一个混合成分时的特例。GMM由于考虑了不同成分进行混合的比例,并且考虑了各成分的方差,因此在面对不均衡样本时有更强的适应力,GMM在组3上的表现如下: + +
+ 表5 GMM聚类的组3 +
+ + + + + + + + + + + + + + +
+
+
初始分布
+
+
+
Ietration=1
+
+
+
Ietration=7
+
+
+
Ietration=13
+
+
+
Ietration=21
+
+
+
Ietration=39
+
+ +从结果上看,GMM对于样本不均衡的模型也有很好的表现。 + +## 自动选择聚簇数量 + +这一节实验主要探究使用了Elbow Method配合K-Means 算法进行自动选择聚类簇数的实验,详细代码见`ClusteringAlgorithm`类。算法主要思想是从一个较小的初始值到一个较大的初始值之间选取一些簇数取值c进行聚类,并考察聚类结果在误差平方和SSE上的表现,当k等于样本个数时,SSE等于0。当c小于真实簇数时,c每增大一点都会给SSE带来比较大的降幅,而当k大于真实簇数时,k的增大对SSE带来的影响幅度较小。SSE计算公式如下: +$$ +SSE = +\sum^{|C|} _ {i=1} +\sum_{p \in C_i} +|p-m_i|^2 +\tag{17} +$$ + +代码: + +```python +def compute_SSE(self): + """ + 计算误差平方和SSE + :return: + """ + SSE = 0 + for c in self.cluster.keys(): + member = self.cluster[c] # 获取类内成员的下标 + if len(member) == 0: # 空类则跳过 + continue + # 累计SSE,考虑到直接相加在样本数量较大的情况下很容易溢出,对其进行缩放,self.const = 1e7 + SSE += np.sum(np.square(self.data[member] - self.mean_vec[c])) / self.const + return SSE +``` + +初始值的选定上采用了启发式的方法,c的取值从2开始,当样本总数量大于1000时,c的最大值定位样本数的开方,否则则定为样本数。为了使得采样点更加密集,步长上统一设置为1。得到SSE散点图后,寻找转折点的方法参考了知乎文章([求曲线转折点坐标——浮生一记](https://zhuanlan.zhihu.com/p/109348995)),做法是分析每个点左右定长窗口内的点的方差,寻找左右方差之比最大的点,笔者选择了不同大小的窗口并进行加权平均,代码见`fit()`方法。笔者使用一组数据作为说明, + + + + + + + + + +
+
+
初始分布
+
+
+
SSE散点图
+
+
+
窗口方差比率
+
+
+
聚类结果
+
+ +从结果上来看,找到的c值就是SSE曲线的转折点,说明此方法有一定的效果。 + +当样本数量较多、簇数较多时,此方法效果较好,当样本点数量较少或是簇类较少时,此方法表现较差;当簇比较集中时效果较好,簇比较分散时效果较差。为了简化实验,笔者采用了随机生成协方差矩阵和均值向量的方法来产生数据,并设定每个初始簇的样本数量相同,代码如下: + +```python +data_lst = [] +for i in range(15): + mean = np.random.rand(2) * 100 + cov = generate_positive_cov(2) + np.eye(2) * 10 + data_ = np.random.multivariate_normal(mean, cov, (1000,)) + data_lst.append(data_) +``` + +下表中记录了一些自动选择簇数量的数据,其中(100/cluster)表示每个簇的样本数量,C1_t表示在C1(100/cluster)的基础上将均值扩大十倍的结果,这一操作能够使得簇间间隔更大。 + +
+ 表6 自动聚类结果 +
+ +| num of clusters | C1(100/cluster) | C1_t | C2(1000/cluster) | C2_t | +| --------------- | --------------- | ---- | ---------------- | ---- | +| 5 | 4 | 3 | 4 | 3 | +| 15 | 4 | 12 | 15 | 12 | +| 30 | 26 | 28 | 15 | 27 | + +实际上,当簇内元素比较离散时,不同的簇很容易重叠到一起,这将会消弭不同簇之间的差别,从而影响自动选择出来的簇数量。 + +六、代码使用方法 + +```python +python source.py --auto_c # Elbow Method配合K-Means +python source.py --kmeans # KMeans展示 +python source.py --gaussian # GMM展示 +``` + + + diff --git a/assignment-3/submission/16300110008/img/Figure_1.png b/assignment-3/submission/16300110008/img/Figure_1.png new file mode 100644 index 0000000000000000000000000000000000000000..b3914a47d6eba1951c8a738f3bb7ed085e0900e3 Binary files /dev/null and b/assignment-3/submission/16300110008/img/Figure_1.png differ diff --git a/assignment-3/submission/16300110008/img/Figure_2.png b/assignment-3/submission/16300110008/img/Figure_2.png new file mode 100644 index 0000000000000000000000000000000000000000..21e8ccf70fa5e8444a78445eff975bc1ccb78e84 Binary files /dev/null and b/assignment-3/submission/16300110008/img/Figure_2.png differ diff --git a/assignment-3/submission/16300110008/img/Figure_3.png b/assignment-3/submission/16300110008/img/Figure_3.png new file mode 100644 index 0000000000000000000000000000000000000000..1da5f7b248a9af39b380be3042ff9f3aeb8badfd Binary files /dev/null and b/assignment-3/submission/16300110008/img/Figure_3.png differ diff --git a/assignment-3/submission/16300110008/img/Figure_4.png b/assignment-3/submission/16300110008/img/Figure_4.png new file mode 100644 index 0000000000000000000000000000000000000000..1c3e34a5fa38000e0f39c2b161dcac2302c127d0 Binary files /dev/null and b/assignment-3/submission/16300110008/img/Figure_4.png differ diff --git a/assignment-3/submission/16300110008/img/data_generation.png b/assignment-3/submission/16300110008/img/data_generation.png new file mode 100644 index 0000000000000000000000000000000000000000..4b9af853ee5fb62189c930b6dfa550fe5160ff0b Binary files /dev/null and b/assignment-3/submission/16300110008/img/data_generation.png differ diff --git a/assignment-3/submission/16300110008/img/gmm1.png b/assignment-3/submission/16300110008/img/gmm1.png new file mode 100644 index 0000000000000000000000000000000000000000..9693df7c5883521976e98f06b34a113b4df4a4be Binary files /dev/null and b/assignment-3/submission/16300110008/img/gmm1.png differ diff --git a/assignment-3/submission/16300110008/img/init11.png b/assignment-3/submission/16300110008/img/init11.png new file mode 100644 index 0000000000000000000000000000000000000000..75ee53c5b004ad86ffa7a4c935e2d9559e5a0bdb Binary files /dev/null and b/assignment-3/submission/16300110008/img/init11.png differ diff --git a/assignment-3/submission/16300110008/img/init12.png b/assignment-3/submission/16300110008/img/init12.png new file mode 100644 index 0000000000000000000000000000000000000000..a5a5807d4fa1380d3db4661cf7de47a29d58d9d6 Binary files /dev/null and b/assignment-3/submission/16300110008/img/init12.png differ diff --git a/assignment-3/submission/16300110008/img/init13.png b/assignment-3/submission/16300110008/img/init13.png new file mode 100644 index 0000000000000000000000000000000000000000..8134d6e4affb53455fd540c5c14da6c281d987e3 Binary files /dev/null and b/assignment-3/submission/16300110008/img/init13.png differ diff --git a/assignment-3/submission/16300110008/img/init21.png b/assignment-3/submission/16300110008/img/init21.png new file mode 100644 index 0000000000000000000000000000000000000000..e8e70a3a80d29c9ac4ff3a556016596f943edbdf Binary files /dev/null and b/assignment-3/submission/16300110008/img/init21.png differ diff --git a/assignment-3/submission/16300110008/img/init211.png b/assignment-3/submission/16300110008/img/init211.png new file mode 100644 index 0000000000000000000000000000000000000000..37b57481a8b49a4ab28798fff75fe343c6713b3f Binary files /dev/null and b/assignment-3/submission/16300110008/img/init211.png differ diff --git a/assignment-3/submission/16300110008/img/init212.png b/assignment-3/submission/16300110008/img/init212.png new file mode 100644 index 0000000000000000000000000000000000000000..833f22c1c911381958434c018c43a80dd293b931 Binary files /dev/null and b/assignment-3/submission/16300110008/img/init212.png differ diff --git a/assignment-3/submission/16300110008/img/init213.png b/assignment-3/submission/16300110008/img/init213.png new file mode 100644 index 0000000000000000000000000000000000000000..14a29ff80e005fb14550bb37f47c7734a0118f04 Binary files /dev/null and b/assignment-3/submission/16300110008/img/init213.png differ diff --git a/assignment-3/submission/16300110008/img/init214.png b/assignment-3/submission/16300110008/img/init214.png new file mode 100644 index 0000000000000000000000000000000000000000..65be5214102d58ef44de439b8815710ab9ed0d0d Binary files /dev/null and b/assignment-3/submission/16300110008/img/init214.png differ diff --git a/assignment-3/submission/16300110008/img/init215.png b/assignment-3/submission/16300110008/img/init215.png new file mode 100644 index 0000000000000000000000000000000000000000..794e539775090fa261b09ebb535b800bee81989e Binary files /dev/null and b/assignment-3/submission/16300110008/img/init215.png differ diff --git a/assignment-3/submission/16300110008/img/init216.png b/assignment-3/submission/16300110008/img/init216.png new file mode 100644 index 0000000000000000000000000000000000000000..7892531101c2106af0a81c92424a98507245bae5 Binary files /dev/null and b/assignment-3/submission/16300110008/img/init216.png differ diff --git a/assignment-3/submission/16300110008/img/init217.png b/assignment-3/submission/16300110008/img/init217.png new file mode 100644 index 0000000000000000000000000000000000000000..a778506ae215129a0bfa9a059e44d99462ff0a55 Binary files /dev/null and b/assignment-3/submission/16300110008/img/init217.png differ diff --git a/assignment-3/submission/16300110008/img/init22.png b/assignment-3/submission/16300110008/img/init22.png new file mode 100644 index 0000000000000000000000000000000000000000..833f22c1c911381958434c018c43a80dd293b931 Binary files /dev/null and b/assignment-3/submission/16300110008/img/init22.png differ diff --git a/assignment-3/submission/16300110008/img/init221.png b/assignment-3/submission/16300110008/img/init221.png new file mode 100644 index 0000000000000000000000000000000000000000..f38b36f52942029f4702faf4d107552ab5a44c16 Binary files /dev/null and b/assignment-3/submission/16300110008/img/init221.png differ diff --git a/assignment-3/submission/16300110008/img/init222.png b/assignment-3/submission/16300110008/img/init222.png new file mode 100644 index 0000000000000000000000000000000000000000..bd4fc995635f0d0d570d7d97a1ecd2d726b3a512 Binary files /dev/null and b/assignment-3/submission/16300110008/img/init222.png differ diff --git a/assignment-3/submission/16300110008/img/init223.png b/assignment-3/submission/16300110008/img/init223.png new file mode 100644 index 0000000000000000000000000000000000000000..bdbc6a397a42503b8e7343d088542803c02c7412 Binary files /dev/null and b/assignment-3/submission/16300110008/img/init223.png differ diff --git a/assignment-3/submission/16300110008/img/init224.png b/assignment-3/submission/16300110008/img/init224.png new file mode 100644 index 0000000000000000000000000000000000000000..ebb22762e3126cccb43ad265d1569b1dbfe1b782 Binary files /dev/null and b/assignment-3/submission/16300110008/img/init224.png differ diff --git a/assignment-3/submission/16300110008/img/init225.png b/assignment-3/submission/16300110008/img/init225.png new file mode 100644 index 0000000000000000000000000000000000000000..44ce93164f58bbba052d3999c31cd513c43b76bd Binary files /dev/null and b/assignment-3/submission/16300110008/img/init225.png differ diff --git a/assignment-3/submission/16300110008/img/init226.png b/assignment-3/submission/16300110008/img/init226.png new file mode 100644 index 0000000000000000000000000000000000000000..043aab0f31c18c2337ed9db2b99ab41637ff0228 Binary files /dev/null and b/assignment-3/submission/16300110008/img/init226.png differ diff --git a/assignment-3/submission/16300110008/img/init227.png b/assignment-3/submission/16300110008/img/init227.png new file mode 100644 index 0000000000000000000000000000000000000000..0ed91bf45b78ee412782a81adaa7cce8aa607700 Binary files /dev/null and b/assignment-3/submission/16300110008/img/init227.png differ diff --git a/assignment-3/submission/16300110008/img/init23.png b/assignment-3/submission/16300110008/img/init23.png new file mode 100644 index 0000000000000000000000000000000000000000..ccb52af3ca794431d519e31e2fbe40d2e9b39758 Binary files /dev/null and b/assignment-3/submission/16300110008/img/init23.png differ diff --git a/assignment-3/submission/16300110008/img/init31.png b/assignment-3/submission/16300110008/img/init31.png new file mode 100644 index 0000000000000000000000000000000000000000..cd07e315bbaddcd3e2fc726be7494239e82428c3 Binary files /dev/null and b/assignment-3/submission/16300110008/img/init31.png differ diff --git a/assignment-3/submission/16300110008/img/init32.png b/assignment-3/submission/16300110008/img/init32.png new file mode 100644 index 0000000000000000000000000000000000000000..bd4fc995635f0d0d570d7d97a1ecd2d726b3a512 Binary files /dev/null and b/assignment-3/submission/16300110008/img/init32.png differ diff --git a/assignment-3/submission/16300110008/img/init33.png b/assignment-3/submission/16300110008/img/init33.png new file mode 100644 index 0000000000000000000000000000000000000000..6c71a32c3b389a8c1435e5abdaa5ed7ba179dd9f Binary files /dev/null and b/assignment-3/submission/16300110008/img/init33.png differ diff --git a/assignment-3/submission/16300110008/img/init41.png b/assignment-3/submission/16300110008/img/init41.png new file mode 100644 index 0000000000000000000000000000000000000000..2b11688945b674e86e1d65a8abf3b75fdc959cb3 Binary files /dev/null and b/assignment-3/submission/16300110008/img/init41.png differ diff --git a/assignment-3/submission/16300110008/img/init42.png b/assignment-3/submission/16300110008/img/init42.png new file mode 100644 index 0000000000000000000000000000000000000000..a5bcceea6511e1780a7b56389f8a9ca7816a0a16 Binary files /dev/null and b/assignment-3/submission/16300110008/img/init42.png differ diff --git a/assignment-3/submission/16300110008/source.py b/assignment-3/submission/16300110008/source.py new file mode 100644 index 0000000000000000000000000000000000000000..79c30349623c9d884331f5590285787e0d66fa1f --- /dev/null +++ b/assignment-3/submission/16300110008/source.py @@ -0,0 +1,691 @@ +import numpy as np +import matplotlib.pyplot as plt + + +epsilon = 1e-6 + +plt.style.use('seaborn') + +def normalization(X): + """ + 对数据进行标准化处理,使其均值为0,方差为1 + :param X: 输入数据 + :return: 标准化后的数据 + """ + mean = np.mean(X, axis=0) + std = np.std(X, axis=0) + X = (X - mean) / std + return X + + +def compute_dis(center_vec, target_vec): + """ + 计算向量与类中心的距离 + :param center_vec: 类中心,tensor,k * d + :param target_vec: 待计算向量,n * d + :return: 距离,k * n,第i行第j列位置的元素表示第j个向量到第i类中心的距离 + """ + # 这里的计算方法是将center_vec扩充为n * 1 * d维 可以利用广播机制直接计算 + # target_vec到所有类中心的距离,并利用norm方法计算距离,可以充分利用numpy + # 的并行计算优势 + return np.linalg.norm(center_vec[:, np.newaxis, :] - target_vec, axis=-1) + + +def scatter_show(model, iteration=None): + """ + 绘图函数 + :param iteration: 迭代次数 + :param model: 使用的模型 + :return: None + """ + # 获取数据与簇情况 + data = model.data + label = model.cluster + mean_vec = model.mean_vec + # 为每个簇绘图 + color = ['#FFB6C1', '#00BFFF', '#DA70D6'] + for c in label: + idx = label[c] + # plt.scatter(data[idx, 0], data[idx, 1], c=color[c], s=20) + plt.scatter(data[idx, 0], data[idx, 1], s=20) + # 将簇中心画出 + plt.scatter(mean_vec[:, 0], mean_vec[:, 1], marker='+', c='#000000', s=1000) + + if iteration is not None: + plt.title(f"n clusters = {model.n_clusters}\n" + f"iteration = {iteration}") + else: + plt.title(f"n clusters = {model.n_clusters}") + + plt.show() + + +class KMeans: + def __init__(self, n_clusters): + """ + 初始化 + :param n_clusters: 待聚类的簇数 + """ + self.n_clusters = n_clusters + # 簇 + self.cluster = { + c: None for c in range(n_clusters) + } + # 当簇中心变动距离的均值小于阈值时,停止迭代 + self.epsilon = 1e-1 + # 用来缓存训练数据与簇中心 + self.data = None + self.mean_vec = None + + def reset_dict(self): + """ + 重置簇 + :return: None + """ + self.cluster = { + c: None for c in range(self.n_clusters) + } + self.mean_vec = None + + def initialize(self): + """ + 初始化均值向量,KMeans++方法 + :return: 初始化后的均值向量 + """ + # n, d, k分别是样本数量,维度以及簇数 + n = self.data.shape[0] + d = self.data.shape[1] + k = self.n_clusters + # 随机选取第一个均值向量 + idx = np.random.randint(0, n) + # 初始化均值向量存储矩阵 + mean_vec = np.zeros((k, d)) + mean_vec[0] = self.data[idx].reshape(1, -1) + # 初始化每个样本到均值向量的距离存储矩阵 + dist = np.zeros((n, k)) + # 将每个样本到第一个向量的距离填入矩阵中 + dist[:, 0] = np.linalg.norm(self.data - mean_vec[0].reshape(1, -1), axis=1) + # 迭代选取剩余的向量 + for i in range(1, k): + # 保留每个样本到距离其最近的类中心的距离,这些距离保存在dist矩阵的前i列 + dist_min = np.min(dist[:, :i], axis=1) + # 在保留的距离中选取距离最大的一个作为新的均值向量 + new_vec_idx = np.argmax(dist_min) + # 将此步产生的向量并入 + mean_vec[i] = self.data[new_vec_idx].reshape(1, -1) + # 计算每个样本到新向量的距离,并存入距离矩阵 + dist[:, i] = np.linalg.norm(self.data - mean_vec[i], axis=1) + return mean_vec + + + def fit(self, train_data, is_plot=False): + """ + 聚类过程 + :param train_data: 训练数据 + :return: None + """ + # 缓存 + print("Fit Begin") + num = len(train_data) + train_data = normalization(train_data) + self.data = train_data + # 从D中随机选择n_clusters个样本作为初始均值向量 + # idx = np.arange(num) + # mean_vec_idx = np.random.choice(idx, self.n_clusters, replace=False) + # mean_vec = train_data[mean_vec_idx] + mean_vec = self.initialize() + + # 最大迭代次数 + max_iterations = 50 + # 缓存 + self.mean_vec = mean_vec + # 迭代开始 + for i in range(max_iterations): + print(f"Iteration: {i + 1}") + # 保存上一步簇中心结果,用来计算更新步幅 + last_mean_vec = mean_vec.copy() + # 计算距离 + dis = compute_dis(mean_vec, train_data) + # 获取标签:样本x以距离其最近的簇中心的标签为自身的标签 + label = np.argmin(dis, axis=0) + + # 绘图 + if i % 2 == 0 and is_plot: + # 计算每个簇内的样本数量 + for j in range(self.n_clusters): + self.cluster[j] = np.where(label == j)[0].tolist() + print(self.mean_vec) + scatter_show(self, i + 1) + + # 计算新的簇中心 + for j in range(self.n_clusters): + mean_vec[j] = np.mean(train_data[np.where(label == j)], axis=0) + + # 计算更新幅度:对应簇中心变化距离的均值,小于阈值停止更新,结束聚类过程 + difference = mean_vec - last_mean_vec + if np.mean(np.linalg.norm(difference, axis=1)) < self.epsilon: + break + # 缓存新的簇中心 + self.mean_vec = mean_vec.copy() + + print("Fit End") + + def predict(self, test_data): + """ + 预测测试数据的标签 + :param test_data: 测试数据,n * d + :return: 每个样本的类别,(n,) + """ + # 计算每个样本到每个类中心的距离 + dis = compute_dis(self.mean_vec, test_data) + # 预测样本属于哪个簇 + return np.argmin(dis, axis=0) + + +def compute_posterior_prob(weight, mu, cov_mat, x): + """ + 计算每个样本的后验概率 + :param weight: 混合系数 + :param mu: 均值 + :param cov_mat: 协方差矩阵 + :param x: 样本数据 + :return: 后验概率 + """ + c = mu.shape[0] + d = mu.shape[1] + # 用来盛放协方差矩阵的逆矩阵 + inv_cov_mat = np.zeros_like(cov_mat) + cov_mat += epsilon * 1e5 + for i in range(c): + # 如果遇到奇异矩阵则使用伪逆替代 + try: + # cov_mat_m = np.eye(cov_mat[i].shape[0]) * epsilon + cov_mat[i] + if np.abs(np.linalg.det(cov_mat[i])) < 1e-4: + raise np.linalg.LinAlgError + else: + inv_cov_mat[i] = np.linalg.inv(cov_mat[i]) + except np.linalg.LinAlgError: + print("遭遇奇异矩阵,使用伪逆替代") + inv_cov_mat[i] = np.linalg.pinv(cov_mat[i]) + # 下面开始计算正态分布概率密度 + ## 指数部分,使用了增加维度的方式提升运算速度 + exponential = - np.diagonal( + np.matmul(np.matmul(x - mu[:, np.newaxis, :], inv_cov_mat), + (x - mu[:, np.newaxis, :]).transpose(0, 2, 1)), axis1=1, axis2=2 + ) / 2 + ## 求行列式可能溢出,进行平滑,满足下式:cov_det = sign * np.exp(logdet) + sign, logdet = np.linalg.slogdet(cov_mat) + ## 计算没有加权的概率密度 + phi = np.divide(np.exp(exponential) + epsilon, np.power(2 * np.pi, d / 2) + * np.exp(logdet / 2).reshape(-1, 1) + epsilon).T + ## 实际上,计算概率密度的过程中很容易出现上溢或下溢的现象,对其进行平滑, + ## 分别取最小或最大值作为平滑标准,因为平滑因子在分子分母上斗湖出现,所以不会对结果产生影响 + if np.abs(np.min(phi)) < 1e-10: + phi += epsilon + factor = np.power(np.min(phi), -0.5) + numer = phi * factor * weight.T + elif np.abs(np.max(phi)) > 1e8: + factor = np.power(np.max(phi), 0.5) + numer = phi * factor * weight.T + else: + numer = phi * weight.T + # 计算归一化操作需要的分母 + denom = np.sum(numer, axis=1).reshape(-1, 1) + # 计算平滑操作后每一行是不是和为1 + res = numer / (denom + epsilon) + # assert np.abs( + # np.sum( + # np.sum( + # res, + # axis=1, + # keepdims=True) - + # np.ones((res.shape[0], 1)))) < 1e-1 + # 返回后验概率和分母,用作后续计算 + return res, denom + + +def compute_log_likelihood(gamma_denom): + return np.sum(np.log(gamma_denom), axis=0) + + +class GaussianMixture: + + def __init__(self, n_clusters): + """ + 初始化 + :param n_clusters: 待聚类的簇数,也就是高斯混合成分的个数 + """ + self.n_clusters = n_clusters + # 簇,value元组:(下标集合,混合权重,均值向量,协方差矩阵) + self.cluster = { + c: None for c in range(n_clusters) + } + self.data = None + self.mean_vec = None + self.cov_mat = None + self.weight = None + + def reset_params(self): + """ + 重置每个簇对应的参数 + :return: None + """ + self.cluster = { + c: (None, None, None, None) for c in range(self.n_clusters) + } + def initialize(self): + """ + 初始化均值向量,KMeans++方法 + :return: 初始化后的均值向量 + """ + # n, d, k分别是样本数量,维度以及簇数 + n = self.data.shape[0] + d = self.data.shape[1] + k = self.n_clusters + # 随机选取第一个均值向量 + idx = np.random.randint(0, n) + # 初始化均值向量存储矩阵 + mean_vec = np.zeros((k, d)) + mean_vec[0] = self.data[idx].reshape(1, -1) + # 初始化每个样本到均值向量的距离存储矩阵 + dist = np.zeros((n, k)) + # 将每个样本到第一个向量的距离填入矩阵中 + dist[:, 0] = np.linalg.norm(self.data - mean_vec[0].reshape(1, -1), axis=1) + # 迭代选取剩余的向量 + for i in range(1, k): + # 保留每个样本到距离其最近的类中心的距离,这些距离保存在dist矩阵的前i列 + dist_min = np.min(dist[:, :i], axis=1) + # 在保留的距离中选取距离最大的一个作为新的均值向量 + new_vec_idx = np.argmax(dist_min) + # 将此步产生的向量并入 + mean_vec[i] = self.data[new_vec_idx].reshape(1, -1) + # 计算每个样本到新向量的距离,并存入距离矩阵 + dist[:, i] = np.linalg.norm(self.data - mean_vec[i], axis=1) + return mean_vec + + def fit(self, train_data, is_plot=False): + """ + 聚类 + :param train_data: 输入样本 + :param is_plot: 是否绘图 + :return: None + """ + # 预处理 + train_data = normalization(train_data) + if len(train_data) < 10: + aug_data = train_data.copy() + for i in range(4): + temp = train_data + np.random.randn(*train_data.shape) * 5e-2 + aug_data = np.concatenate([aug_data, temp], axis=0) + train_data = aug_data + + self.data = train_data + + # 获取样本的一些特征 + num = len(train_data) + n = train_data.shape[0] + k = self.n_clusters + d = train_data.shape[1] + + # 下面开始初始化 + ## 初始化混合成分被选择的概率 + weight = (np.zeros(k).reshape(-1, 1) + 1) / k + ## 初始化每个混合成分的均值向量,这里采用随机选取的方法 + # idx = np.arange(num) + # mu_idx = np.random.choice(idx, k, replace=False) + # mu = train_data[mu_idx] + mu = self.initialize() + ## 初始化协方差矩阵,假定属性间是相互独立的 + cov_mat = np.tile( + np.diag(np.var(train_data, axis=0))[np.newaxis, :, :], + (k, 1, 1) + ) + # 用来存储上一步的性能度量数值 + metric_cache = 1 + # 最大迭代次数 + max_iterations = 100 + print("Fit Begin") + for iteration in range(max_iterations): + # 迭代开始,这里使用了增加维度的方法,能够充分利用numpy的并行计算优势,一次性计算出所有混合成分的参数 + print(f"iteration: {iteration}") + # 计算样本xj属于每个混合成分的后验概率 + posterior_prob, gamma_denom = compute_posterior_prob( + weight, mu, cov_mat, train_data) + # 对后验概率求和作为归一化分母 + denom = np.sum(posterior_prob, axis=0).reshape(-1, 1) + # assert posterior_prob.shape[0] == n, posterior_prob.shape[1] == k + # 计算当前参数下的极大似然估计数值 + metric = compute_log_likelihood(gamma_denom) + # 计算新均值向量,依据算法中的公式,参见README + mu_ = np.matmul(posterior_prob.T, train_data) / denom + # assert mu_.shape[0] == k, mu_.shape[1] == d + # 计算新协方差矩阵 + temp = train_data[:, np.newaxis, :] - mu_ + temp = np.matmul(temp[:, :, :, np.newaxis], + temp[:, :, np.newaxis, :]) + # assert temp.shape == (n, k, d, d) + temp = posterior_prob[:, :, np.newaxis, np.newaxis] * temp + # assert temp.shape == (n, k, d, d) + conv_mat_ = np.sum(temp, axis=0) / denom[:, :, np.newaxis] + # assert conv_mat_.shape == (k, d, d) + # 计算新混合系数 + weight_ = denom / n + # assert weight_.shape == (k, 1) + + # 是否满足停止条件 + update_step = metric - metric_cache + if iteration % 2 == 0 and is_plot: + label = np.argmax(posterior_prob, axis=1) + self.weight = weight + self.mean_vec = mu + self.cov_mat = cov_mat + for j in range(self.n_clusters): + self.cluster[j] = np.where(label == j)[0].tolist() + scatter_show(self, iteration + 1) + # 更新步幅为负或小于阈值时停止迭代 + if iteration > 0 and ( + update_step / + np.abs(metric_cache) <= 1e-3 or iteration == max_iterations - + 1): + label = np.argmax(posterior_prob, axis=1) + self.weight = weight + self.mean_vec = mu + self.cov_mat = cov_mat + for j in range(self.n_clusters): + self.cluster[j] = np.where(label == j)[0].tolist() + if is_plot: + scatter_show(self) + break + # 更新似然函数值 + metric_cache = metric + # 更新模型参数 + mu = mu_ + cov_mat = conv_mat_ + weight = weight_ + + def predict(self, test_data): + """ + 预测测试数据的标签 + :param test_data: 测试数据,n * d + :return: 每个样本的类别,(n,) + """ + # 计算每个样本到每个类中心的距离 + test_data = normalization(test_data) + posterior_prob, _ = compute_posterior_prob( + self.weight, self.mean_vec, self.cov_mat, test_data) + # 预测样本属于哪个簇 + return np.argmax(posterior_prob, axis=1) + + +class ClusteringAlgorithm: + + def __init__(self): + """ + 初始化 + """ + self.n_clusters = None + # 簇 + self.cluster = {} + # 当簇中心变动距离的均值小于阈值时,停止迭代 + self.epsilon = 1e-1 + # 用来缓存训练数据与簇中心 + self.data = None + self.mean_vec = None + + + def reset_dict(self): + """ + 重置簇 + :return: None + """ + self.cluster = { + c: None for c in range(self.n_clusters) + } + self.mean_vec = None + + def compute_SSE(self): + """ + 计算误差平方和SSE + :return: + """ + SSE = 0 + for c in self.cluster.keys(): + member = self.cluster[c] # 获取类内成员的下标 + if len(member) == 0: # 空类则跳过 + continue + # 累计SSE + SSE += np.sum(np.square(self.data[member] - self.mean_vec[c])) + return SSE + + def initialize(self): + """ + 初始化均值向量,KMeans++方法 + :return: 初始化后的均值向量 + """ + # n, d, k分别是样本数量,维度以及簇数 + n = self.data.shape[0] + d = self.data.shape[1] + k = self.n_clusters + # 随机选取第一个均值向量 + idx = np.random.randint(0, n) + # 初始化均值向量存储矩阵 + mean_vec = np.zeros((k, d)) + mean_vec[0] = self.data[idx].reshape(1, -1) + # 初始化每个样本到均值向量的距离存储矩阵 + dist = np.zeros((n, k)) + # 将每个样本到第一个向量的距离填入矩阵中 + dist[:, 0] = np.linalg.norm(self.data - mean_vec[0].reshape(1, -1), axis=1) + # 迭代选取剩余的向量 + for i in range(1, k): + # 保留每个样本到距离其最近的类中心的距离,这些距离保存在dist矩阵的前i列 + dist_min = np.min(dist[:, :i], axis=1) + # 在保留的距离中选取距离最大的一个作为新的均值向量 + new_vec_idx = np.argmax(dist_min) + # 将此步产生的向量并入 + mean_vec[i] = self.data[new_vec_idx].reshape(1, -1) + # 计算每个样本到新向量的距离,并存入距离矩阵 + dist[:, i] = np.linalg.norm(self.data - mean_vec[i], axis=1) + return mean_vec + + def fit_step(self): + """ + 单步聚类过程 + :return: None + """ + # 获取训练数据 + self.reset_dict() + train_data = self.data + # 从D中随机选择n_clusters个样本作为初始均值向量 + ## 随机初始化 + # idx = np.arange(num) + # mean_vec_idx = np.random.choice(idx, self.n_clusters, replace=False) + # mean_vec = train_data[mean_vec_idx] + ## 带约束 + mean_vec = self.initialize() + # 最大迭代次数 + max_iterations = 50 + # 缓存 + self.mean_vec = mean_vec + # 迭代开始 + for i in range(max_iterations): + # 保存上一步簇中心结果,用来计算更新步幅 + last_mean_vec = mean_vec.copy() + # 计算距离 + dis = compute_dis(mean_vec, train_data) + # 获取标签:样本x以距离其最近的簇中心的标签为自身的标签 + label = np.argmin(dis, axis=0) + + # 绘图 + if i % 2 == 0: + # 计算每个簇内的样本数量 + for j in range(self.n_clusters): + self.cluster[j] = np.where(label == j)[0].tolist() + # print(self.mean_vec) + # scatter_show(self) + + # 计算新的簇中心 + for j in range(self.n_clusters): + mean_vec[j] = np.nanmean(train_data[np.where(label == j)], axis=0) + + # 计算更新幅度:对应簇中心变化距离的均值,小于阈值停止更新,结束聚类过程 + difference = mean_vec - last_mean_vec + if np.mean(np.linalg.norm(difference, axis=1)) < self.epsilon: + break + # 缓存新的簇中心 + self.mean_vec = mean_vec.copy() + + def fit(self, train_data): + """ + 自动挑选簇数并进行聚类 + :param train_data: 训练数据 + :return: + """ + print("Fit Begin") + num = len(train_data) + # 预处理 + train_data = normalization(train_data) + # 缓存 + self.data = train_data + # 确定c的选取范围与迭代步长 + c_min = 1 + if num > 100: + c_max = int(np.sqrt(num)) + else: + c_max = num + c_step = 1 + print(c_max) + # 记录SSE数值 + c_SSE_pairs = [] + for c in range(c_min, c_max + 1, c_step): + self.n_clusters = c + self.fit_step() + s = self.compute_SSE() + c_SSE_pairs.append([c, s]) + pair_num = len(c_SSE_pairs) + c_SSE_pairs = np.array(c_SSE_pairs) + plt.scatter(c_SSE_pairs[:,0], c_SSE_pairs[:,1]) + plt.show() + # 滑动窗口计算左右方差比 + delta_s_1 = [] + # 窗口大小的范围 + n_min = 2 + n_max = 4 + for j in range(n_max, pair_num - 1 - n_max): + b_l = 0 + b_r = 0 + denom = 0 + for n in range(n_min, n_max + 1): + # 计算左右断电 + left1 = j - n + right1 = j + n + # 计算方差 + b_l_n = np.var(c_SSE_pairs[left1: j, 1]) + b_r_n = np.var(c_SSE_pairs[j + 1: right1 + 1, 1]) + # 计算权重因子 + factor = n ** 2 + b_l += factor * b_l_n + b_r += factor * b_r_n + denom += factor + # 加权平均 + b_l_bar = b_l / denom + b_r_bar = b_r / denom + # 计算比值 + delta_s_1.append(b_l_bar / b_r_bar) + delta_s_1 = np.array(delta_s_1) + plt.scatter(range(n_min + 1, len(delta_s_1) + n_min + 1), delta_s_1) + plt.show() + # 找到比值最大的点 + pos = np.argmax(delta_s_1) + # 确定对应的簇数 + self.n_clusters = pos + n_min + 1 + self.fit_step() + scatter_show(self) + + print("Fit End") + + def predict(self, test_data): + """ + 预测测试数据的标签 + :param test_data: 测试数据,n * d + :return: 每个样本的类别,(n,) + """ + # 计算每个样本到每个类中心的距离 + dis = compute_dis(self.mean_vec, test_data) + # 预测样本属于哪个簇 + return np.argmin(dis, axis=0) + + +def generate_positive_cov(dimension): + """ + # 用于生成随机的数据集 + :param dimension: 协方差矩阵的维度 + :return: + """ + A = np.abs(np.random.rand(dimension, dimension)) + B = np.dot(A, A.T) + return B + + +def select_c(): + np.random.seed(1) + model = ClusteringAlgorithm() + + + data_lst = [] + + for i in range(30): + mean = np.random.rand(2) * 1000 + cov = generate_positive_cov(2) + np.eye(2) * 10 + data_ = np.random.multivariate_normal(mean, cov, (100,)) + data_lst.append(data_) + + data = np.concatenate(data_lst, axis=0) + N = data.shape[0] + idx = np.arange(N) + np.random.shuffle(idx) + data = data[idx] + plt.scatter(data[:, 0], data[:, 1]) + plt.title('data generation') + plt.show() + model.fit(data) + # scatter_show(model) + +def show(mode): + mean = (1, 2) + cov = np.array([[73, 0], [0, 22]]) + x = np.random.multivariate_normal(mean, cov, (800,)) + + mean = (16, -5) + cov = np.array([[21.2, 0], [0, 32.1]]) + y = np.random.multivariate_normal(mean, cov, (200,)) + + mean = (10, 22) + cov = np.array([[10, 5], [5, 10]]) + z = np.random.multivariate_normal(mean, cov, (1000,)) + + data = np.concatenate([x, y, z], axis=0) + N = data.shape[0] + idx = np.arange(N) + np.random.shuffle(idx) + data = data[idx] + + plt.scatter(data[:, 0], data[:, 1]) + plt.title('data generation') + plt.show() + + if mode == 'KMeans': + model = KMeans(3) + model.fit(data, True) + elif mode == 'Gaussian': + model = GaussianMixture(3) + model.fit(data, True) + + +if __name__ == '__main__': + import sys + if len(sys.argv) > 1 and sys.argv[1] == "--auto_c": + select_c() + elif len(sys.argv) > 1 and sys.argv[1] == "--kmeans": + show('KMeans') + elif len(sys.argv) > 1 and sys.argv[1] == "--gaussian": + show('Gaussian') diff --git a/assignment-3/handout/tester_demo.py b/assignment-3/submission/16300110008/tester_demo.py similarity index 99% rename from assignment-3/handout/tester_demo.py rename to assignment-3/submission/16300110008/tester_demo.py index 19ec0e8091691d4aaaa6b53dbb695fde9e826d89..9094497f33b783f07d8ab0401b02a178b19598f6 100644 --- a/assignment-3/handout/tester_demo.py +++ b/assignment-3/submission/16300110008/tester_demo.py @@ -58,6 +58,7 @@ def test_with_n_clusters(data_fuction, algorithm_class): model = algorithm_class(n_clusters) model.fit(train_data) res = model.predict(test_data) + print(res) assert len( res.shape) == 1 and res.shape[0] == test_data.shape[0], "shape of result is wrong" return res