diff --git a/assignment-3/submission/.keep b/assignment-3/submission/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/assignment-3/submission/17307110367/.keep b/assignment-3/submission/17307110367/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/assignment-3/submission/17307110367/README.md b/assignment-3/submission/17307110367/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1fb9905a7befcd8913acc87c835be5e4312d04ba
--- /dev/null
+++ b/assignment-3/submission/17307110367/README.md
@@ -0,0 +1,153 @@
+# PRML-2021 Assignment3
+
+Name: 张艳琳
+
+Student ID: 17307110367
+
+## Problem Overview
+
+In this assignment, two clustering models, `K-Means` and `GMM`, are implemented with `NumPy`. On top of them, an algorithm is built that uses the `elbow method` to automatically determine the number of clusters in a dataset and then performs the clustering.
+
+## Model Implementation
+
+### `K-Means`
+
+The `K-Means` model chooses its cluster centers at random when it is constructed. Each subsequent iteration then consists of two steps:
+
+- compute the distance from every sample point to every cluster center, and assign each sample to the cluster of its nearest center;
+- compute the mean of the sample points in each cluster, and take it as the new cluster center.
+
+The initial cluster centers of `K-Means` are obtained by randomly sampling points from the training data:
+
+```python
+    def init_centers(self, train_data):
+        init_row = np.random.choice(range(train_data.shape[0]), self.num_clusters, replace=False)
+        self.centers = train_data[init_row]
+```
+
+### `GMM`
+
+The Gaussian mixture model `GMM` estimates its parameters with the `Expectation-Maximization` (EM) algorithm.
+
+Let $x$ be a sample point, let $z$ denote the Gaussian component the sample belongs to, and let $\pi,\mu,\sigma$ be the mixing weight, mean, and covariance of each component. Each iteration consists of:
+
+- `Expectation` step: with the parameters $\mu,\sigma$ fixed, compute the posterior $\gamma_{nk}=p(z^{(n)}=k|x^{(n)})$;
+
+- `Maximization` step: with $\gamma_{nk}$ fixed, update the parameters $\pi,\mu,\sigma$ (the means are updated first, since the covariance update uses the new $\mu_k$):
+  $$
+  N_k=\sum_{n=1}^N\gamma_{nk}\\\\
+  \pi_k=\frac{N_k}{N}\\\\
+  \mu_k=\frac1{N_k}\sum_{n=1}^N\gamma_{nk}x^{(n)}\\\\
+  \sigma_k=\frac1{N_k}\sum_{n=1}^N\gamma_{nk}(x^{(n)}-\mu_k)(x^{(n)}-\mu_k)^T
+  $$
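+A minimal vectorized sketch of the `Maximization`-step updates above (the names `X`, `gamma`, `pi`, `mu`, `sigma` mirror the symbols in the formulas and are illustrative; the actual implementation in `source.py` uses explicit loops):
+
+```python
+import numpy as np
+
+def m_step(X, gamma):
+    # X: (N, D) samples; gamma: (N, K) responsibilities from the E-step
+    Nk = gamma.sum(axis=0)                 # (K,)   effective counts N_k
+    pi = Nk / X.shape[0]                   # (K,)   mixing weights pi_k
+    mu = (gamma.T @ X) / Nk[:, None]       # (K, D) means mu_k
+    diff = X[None, :, :] - mu[:, None, :]  # (K, N, D) deviations x - mu_k
+    sigma = np.einsum('kn,knd,kne->kde', gamma.T, diff, diff) / Nk[:, None, None]
+    return pi, mu, sigma                   # sigma: (K, D, D) covariances sigma_k
+```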
+## Basic Experiments
+
+### Visualizing the given data_1
+
+#### Dataset parameters
+
+$$
+\mu_x=\begin{bmatrix}1&2\end{bmatrix},\mu_y=\begin{bmatrix}16&-5\end{bmatrix},\mu_z=\begin{bmatrix}10&22\end{bmatrix}\\\\
+\Sigma_x=\begin{bmatrix}73&0\\\\0&22\end{bmatrix},\Sigma_y=\begin{bmatrix}21.2&0\\\\0&32.1\end{bmatrix},\Sigma_z=\begin{bmatrix}10&5\\\\5&10\end{bmatrix}
+$$
+
+$x$ contains 800 sample points, $y$ contains 200, and $z$ contains 100.
+
+The generated data is shown below:
+
+![](./img/data1_1_Kmeans.png)
+
+Clustering with `K-Means` and `GMM` yields the following clusters:
+
+#### `K-Means`
+
+![](./img/res1_1_Kmeans.png)
+
+#### `GMM`
+
+![](./img/res1_1_GMM.png)
+
+The result produced by GMM is somewhat better than the one produced by K-Means.
+
+### A self-generated two-dimensional Gaussian dataset
+
+#### Dataset parameters
+
+$$
+\mu_x=\begin{bmatrix}1&2\end{bmatrix},\mu_y=\begin{bmatrix}10&22\end{bmatrix},\mu_z=\begin{bmatrix}-10&22\end{bmatrix},\mu_w=\begin{bmatrix}10&-22\end{bmatrix},\mu_t=\begin{bmatrix}-10&-22\end{bmatrix}\\\\
+\Sigma_x=\begin{bmatrix}73&0\\\\0&22\end{bmatrix},\Sigma_y=\Sigma_z=\Sigma_w=\Sigma_t=\begin{bmatrix}10&5\\\\5&10\end{bmatrix}
+$$
+
+$x$ contains 8000 sample points; $y$, $z$, $w$, and $t$ contain 1000 each.
+
+The generated data is shown below:
+
+![](./img/data6.png)
+
+Clustering with `K-Means` and `GMM` yields the following clusters:
+
+#### `K-Means`
+
+![](./img/res_6_K.png)
+
+#### `GMM`
+
+![](./img/res6_G.png)
+
+Because GMM starts from a random initialization, this run produced a rather strange clustering. Rerunning the experiment gives a much more reasonable result:
+
+![](./img/res6_G_2.png)
+
+---
+
+## Experiment: automatically selecting the number of clusters
+
+### `Elbow Method`
+
+K-means minimizes the squared error between samples and their cluster centers as its objective; the sum of squared distances between each cluster center and the sample points in its cluster is called the distortion. The lower a cluster's distortion, the more tightly packed its members; the higher the distortion, the looser its internal structure.
+
+Distortion decreases as the number of clusters grows, but for data with some degree of separation there is a critical point: up to it the distortion improves sharply, and beyond it the curve declines only slowly. That critical point is a good candidate for the number of clusters. Since the curve looks like a bent arm, the technique is called the elbow method.
+
+In the code, the `Elbow Method` curve is plotted directly. In my experiment the candidate numbers of clusters are k = 2, ..., 9 (i.e. `upper` is 10 in the code, and `range(2, upper)` excludes 10).
+
+The code combining the `Elbow Method` with `K-Means` is as follows:
+
+```python
+    def fit(self, train_data, upper):
+        # sum of squared errors (distortion) for each candidate k in [2, upper)
+        sse = np.zeros(upper - 2)
+        for i in range(2, upper):
+            kmeans = KMeans(i)
+            kmeans.fit(train_data)
+            m = kmeans.labels
+            c = kmeans.centers
+            for j in range(len(train_data)):
+                c1 = c[int(m[j])]
+                x1 = train_data[j]
+                sse[i - 2] += np.sum(np.square(c1 - x1))
+        plt.plot(np.arange(2, upper), sse)
+        plt.savefig('./img/elbow.png')
+        plt.show()
+        n = len(sse)
+        mx = 0
+        for i in range(1, n - 1):
+            # average slope of the curve before and after candidate k = i + 2
+            del1 = (sse[0] - sse[i]) / i
+            del2 = (sse[i] - sse[n - 1]) / (n - 1 - i)
+            delta = del1 - del2
+            # pick the k whose slope change is both significant and largest
+            if delta > 0.3 * max(del1, del2) and delta > mx:
+                mx = delta
+                self.cluster_num = i + 2
+        self.model = KMeans(self.cluster_num)
+        self.model.fit(train_data)
+```
+
+The curve produced by the `Elbow Method` is shown below:
+
+![](./img/elbow_final.png)
+
+There is a clearly visible knee at K=5, so according to the `Elbow Method` K=5 should be the automatically selected number of clusters, and this is indeed what the algorithm chooses.
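+As a usage sketch (hypothetical driver code, not part of the submission; `train_data` stands for any `(N, 2)` sample array such as the five-cluster dataset above):
+
+```python
+import numpy as np
+from source import ClusteringAlgorithm
+
+# placeholder data; the experiment above uses the five-Gaussian dataset instead
+train_data = np.random.multivariate_normal((0, 0), [[10, 5], [5, 10]], 500)
+
+algo = ClusteringAlgorithm()
+algo.fit(train_data, 10)       # plots the elbow curve and selects cluster_num
+print(algo.cluster_num)        # the automatically chosen number of clusters
+labels = algo.predict(train_data)
+```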
diff --git a/assignment-3/submission/17307110367/img/.keep b/assignment-3/submission/17307110367/img/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/assignment-3/submission/17307110367/img/data1_1_Kmeans.png b/assignment-3/submission/17307110367/img/data1_1_Kmeans.png
new file mode 100644
index 0000000000000000000000000000000000000000..7d8ad67746f11258851b67e16a6db33680668be3
Binary files /dev/null and b/assignment-3/submission/17307110367/img/data1_1_Kmeans.png differ
diff --git a/assignment-3/submission/17307110367/img/data6.png b/assignment-3/submission/17307110367/img/data6.png
new file mode 100644
index 0000000000000000000000000000000000000000..505b8174dd315c12147bcdb81beecddd112db355
Binary files /dev/null and b/assignment-3/submission/17307110367/img/data6.png differ
diff --git a/assignment-3/submission/17307110367/img/elbow_final.png b/assignment-3/submission/17307110367/img/elbow_final.png
new file mode 100644
index 0000000000000000000000000000000000000000..85d5774ddf148c67f3e467c2bbb86a154575a876
Binary files /dev/null and b/assignment-3/submission/17307110367/img/elbow_final.png differ
diff --git a/assignment-3/submission/17307110367/img/res1_1_GMM.png b/assignment-3/submission/17307110367/img/res1_1_GMM.png
new file mode 100644
index 0000000000000000000000000000000000000000..2fe9166bde8705c270cf2b8df27e3f5c12a26953
Binary files /dev/null and b/assignment-3/submission/17307110367/img/res1_1_GMM.png differ
diff --git a/assignment-3/submission/17307110367/img/res1_1_Kmeans.png b/assignment-3/submission/17307110367/img/res1_1_Kmeans.png
new file mode 100644
index 0000000000000000000000000000000000000000..c3d685f0541e41fad338bc2b13eb6572cfce11b7
Binary files /dev/null and b/assignment-3/submission/17307110367/img/res1_1_Kmeans.png differ
diff --git a/assignment-3/submission/17307110367/img/res6_G.png b/assignment-3/submission/17307110367/img/res6_G.png
new file mode 100644
index 0000000000000000000000000000000000000000..aa1a477faa279600da157b786a420cb4d940e159
Binary files /dev/null and b/assignment-3/submission/17307110367/img/res6_G.png differ
diff --git a/assignment-3/submission/17307110367/img/res6_G_2.png b/assignment-3/submission/17307110367/img/res6_G_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..1b093c6fa62f334132b1cf3d64149b23483b0d86
Binary files /dev/null and b/assignment-3/submission/17307110367/img/res6_G_2.png differ
diff --git a/assignment-3/submission/17307110367/img/res_6_K.png b/assignment-3/submission/17307110367/img/res_6_K.png
new file mode 100644
index 0000000000000000000000000000000000000000..8e9c60961d7b42defc1a851ccd68c31af404d343
Binary files /dev/null and b/assignment-3/submission/17307110367/img/res_6_K.png differ
diff --git a/assignment-3/submission/17307110367/source.py b/assignment-3/submission/17307110367/source.py
new file mode 100644
index 0000000000000000000000000000000000000000..48cc09b1afa4a6621a740563b5195b0a796670af
--- /dev/null
+++ b/assignment-3/submission/17307110367/source.py
@@ -0,0 +1,194 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+class KMeans:
+
+    def __init__(self, n_clusters):
+        self.num_clusters = n_clusters
+        self.centers = None
+        self.dists = None
+        self.labels = None
+        self.max_iter = 1000
+        self.stop_var = 1e-3
+        self.variance = 1e-2
+
+    # unsupervised clustering
+    def fit(self, train_data):
+        self.init_centers(train_data)
+        for _iter in range(self.max_iter):
+            self.update_dists(train_data)
+            self.update_centers(train_data)
+            if self.variance < self.stop_var:
+                print('Converged at iteration:', _iter)
+                break
+
+    # predict the cluster of each sample
+    def predict(self, test_data):
+        res = []
+        for i, sample in enumerate(test_data):
+            dist = self.l2_distance(sample)
+            res.append(np.argmin(dist))
+        return np.array(res)
+
+    # initialize the cluster centers
+    def init_centers(self, train_data):
+        init_row = np.random.choice(range(train_data.shape[0]), self.num_clusters, replace=False)
+        self.centers = train_data[init_row]
+
+    # update distances and labels
+    def update_dists(self, train_data):
+        labels = np.empty(train_data.shape[0])
+        dists = np.empty([0, self.num_clusters])
+        for i, sample in enumerate(train_data):
+            dist = self.l2_distance(sample)
+            labels[i] = np.argmin(dist)
+            dists = np.vstack([dists, dist])
+        if self.dists is not None:
+            self.variance = np.sum(np.abs(self.dists - dists))
+        self.dists = dists
+        self.labels = labels
+
+    # update the cluster centers
+    def update_centers(self, samples):
+        centers = np.empty([0, samples.shape[1]])
+        for i in range(self.num_clusters):
+            idx = (self.labels == i)
+            center_samples = samples[idx]
+            if len(center_samples) > 0:
+                center = np.mean(center_samples, axis=0)
+            else:
+                # keep the old center if its cluster is empty
+                center = self.centers[i]
+            centers = np.vstack((centers, center[np.newaxis, :]))
+        self.centers = centers
+
+    # squared l2 distance from a sample to every center
+    def l2_distance(self, sample):
+        return np.sum(np.square(self.centers - sample), axis=1)
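+
+
+# Usage sketch (illustrative; X stands for any (N, D) NumPy array):
+#     model = KMeans(3)
+#     model.fit(X)
+#     labels = model.predict(X)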
+
+
+class GaussianMixture:
+
+    def __init__(self, n_clusters):
+        self.num_clusters = n_clusters
+        self.max_iter = 100
+        self.num = None
+        self.dim = None
+        self.X = None
+        self.Q = None
+        self.weight = None
+        self.covar = None
+        self.mu = None
+        self.labels = None
+
+    def fit(self, train_data):
+        self._initialize_params(train_data)
+        for _ in range(self.max_iter):
+            self.e_step()
+            self.m_step()
+        self.labels = np.argmax(self.Q, axis=1)
+
+    # assign each test sample to the component with the highest weighted density
+    def predict(self, test_data):
+        out = []
+        for i in range(test_data.shape[0]):
+            mxp = 0
+            res = -1
+            for k in range(self.num_clusters):
+                temp = self.weight[k] * self.multi_norm(test_data[i, :], self.mu[k, :], self.covar[k, :, :])
+                if temp > mxp:
+                    mxp = temp
+                    res = k
+            out.append(res)
+        return np.array(out)
+
+    def _initialize_params(self, X):
+        self.X = X                                        # data to be clustered
+        self.num = X.shape[0]                             # number of samples
+        self.dim = X.shape[1]                             # feature dimension
+        self.Q = np.zeros((self.num, self.num_clusters))  # responsibilities of each component for each sample
+        self.weight = [1 / self.num_clusters] * self.num_clusters  # uniform initial mixing weights
+        self.mu = np.random.uniform(0, 1, (self.num_clusters, self.dim)) * np.max(X, axis=0)  # random initial means
+        self.covar = np.array([np.identity(self.dim) for _ in range(self.num_clusters)])  # identity initial covariances
+
+    # E-step: update the responsibility matrix Q
+    def e_step(self):
+        for i in range(self.num):
+            q_i = []
+            for k in range(self.num_clusters):
+                postProb = self.multi_norm(self.X[i, :], self.mu[k, :], self.covar[k, :, :])
+                q_i.append(self.weight[k] * postProb + 1e-32)
+            self.Q[i, :] = np.array(q_i) / np.sum(q_i)
+
+    # density of a multivariate Gaussian at x
+    def multi_norm(self, x, mu, sigma):
+        det = np.linalg.det(sigma)
+        inv = np.linalg.inv(sigma)
+        x_mu = (x - mu).reshape(-1, 1)
+        const = 1 / (((2 * np.pi) ** (len(x) / 2)) * (det ** 0.5))
+        exp = -0.5 * float(x_mu.T @ inv @ x_mu)
+        return float(const * np.exp(exp))
+
+    # M-step: update the parameters
+    def m_step(self):
+        # update the mixing weights
+        self.weight = np.mean(self.Q, axis=0)
+
+        # update the mean vectors
+        temp = []
+        for k in range(self.num_clusters):
+            up = np.zeros(self.dim)
+            for j in range(self.num):
+                up += self.Q[j, k] * np.array(self.X[j, :])
+            down = np.sum(self.Q[:, k])
+            temp.append(up / down)
+        self.mu = np.array(temp)
+
+        # update the covariance matrices
+        for k in range(self.num_clusters):
+            up = np.zeros((self.dim, self.dim))
+            for j in range(self.num):
+                x_mu = (self.X[j, :] - self.mu[k, :]).reshape(1, -1)
+                up += self.Q[j, k] * (x_mu.T @ x_mu)
+            down = np.sum(self.Q[:, k])
+            self.covar[k, :, :] = up / down
+
+
+class ClusteringAlgorithm:
+
+    def __init__(self):
+        self.cluster_num = 2
+        self.model = None
+
+    def fit(self, train_data, upper):
+        # sum of squared errors (distortion) for each candidate k in [2, upper)
+        sse = np.zeros(upper - 2)
+        for i in range(2, upper):
+            kmeans = KMeans(i)
+            kmeans.fit(train_data)
+            m = kmeans.labels
+            c = kmeans.centers
+            for j in range(len(train_data)):
+                c1 = c[int(m[j])]
+                x1 = train_data[j]
+                sse[i - 2] += np.sum(np.square(c1 - x1))
+        plt.plot(np.arange(2, upper), sse)
+        plt.savefig('./img/elbow.png')
+        plt.show()
+        n = len(sse)
+        mx = 0
+        for i in range(1, n - 1):
+            # average slope of the curve before and after candidate k = i + 2
+            del1 = (sse[0] - sse[i]) / i
+            del2 = (sse[i] - sse[n - 1]) / (n - 1 - i)
+            delta = del1 - del2
+            # pick the k whose slope change is both significant and largest
+            if delta > 0.3 * max(del1, del2) and delta > mx:
+                mx = delta
+                self.cluster_num = i + 2
+        self.model = KMeans(self.cluster_num)
+        self.model.fit(train_data)
+
+    def predict(self, test_data):
+        return self.model.predict(test_data)
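+
+
+# Usage sketch (illustrative; X stands for any (N, D) NumPy array):
+#     gmm = GaussianMixture(3)
+#     gmm.fit(X)
+#     hard_labels = gmm.labels      # argmax over the responsibility matrix Q
+#
+#     auto = ClusteringAlgorithm()
+#     auto.fit(X, 10)               # tries k = 2..9 and applies the elbow rule
+#     auto_labels = auto.predict(X)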
diff --git a/assignment-3/submission/17307110367/tester_demo.py b/assignment-3/submission/17307110367/tester_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..243bf59a44e179a1c741cd1bde087f63f1b3bb7a
--- /dev/null
+++ b/assignment-3/submission/17307110367/tester_demo.py
@@ -0,0 +1,197 @@
+import numpy as np
+import sys
+import matplotlib.pyplot as plt
+from source import KMeans, GaussianMixture, ClusteringAlgorithm
+
+
+def shuffle(*datas):
+    data = np.concatenate(datas)
+    label = np.concatenate([
+        np.ones((d.shape[0],), dtype=int) * i
+        for (i, d) in enumerate(datas)
+    ])
+    N = data.shape[0]
+    idx = np.arange(N)
+    np.random.shuffle(idx)
+    data = data[idx]
+    label = label[idx]
+    return data, label
+
+
+def data_1():
+    mean = (1, 2)
+    cov = np.array([[73, 0], [0, 22]])
+    x = np.random.multivariate_normal(mean, cov, (800,))
+
+    mean = (16, -5)
+    cov = np.array([[21.2, 0], [0, 32.1]])
+    y = np.random.multivariate_normal(mean, cov, (200,))
+
+    mean = (10, 22)
+    cov = np.array([[10, 5], [5, 10]])
+    z = np.random.multivariate_normal(mean, cov, (100,))
+
+    data, _ = shuffle(x, y, z)
+    # displayer([x, y, z], "data")
+    return (data, data), 3
+
+
+def data_2():
+    train_data = np.array([
+        [23, 12, 173, 2134],
+        [99, -12, -126, -31],
+        [55, -145, -123, -342],
+    ])
+    return (train_data, train_data), 2
+
+
+def data_3():
+    train_data = np.array([
+        [23],
+        [-2999],
+        [-2955],
+    ])
+    return (train_data, train_data), 2
+
+
+# scatter-plot each sub-dataset in its own color
+def displayer(data, name):
+    datas = [[] for _ in range(len(data))]
+    for kind in range(len(data)):
+        for i in range(len(data[kind])):
+            datas[kind].append(data[kind][i])
+
+    for each in datas:
+        each = np.array(each)
+        if each.size > 0:
+            plt.scatter(each[:, 0], each[:, 1])
+    plt.savefig(f'img/{name}')
+    plt.show()
+
+
+# scatter-plot samples grouped by their predicted label
+def display(data, label, name):
+    datas = [[] for _ in range(int(max(label)) + 1)]
+    for i in range(len(data)):
+        datas[label[i]].append(data[i])
+
+    for each in datas:
+        each = np.array(each)
+        if each.size > 0:
+            plt.scatter(each[:, 0], each[:, 1])
+    plt.savefig(f'img/{name}')
+    plt.show()
+
+
+# self-made two-dimensional dataset with five Gaussian clusters
+def data_6():
+    mean = (1, 2)
+    cov = np.array([[73, 0], [0, 22]])
+    x = np.random.multivariate_normal(mean, cov, (80,))
+
+    mean = (10, 22)
+    cov = np.array([[10, 5], [5, 10]])
+    y = np.random.multivariate_normal(mean, cov, (100,))
+
+    mean = (-10, 22)
+    cov = np.array([[10, 5], [5, 10]])
+    z = np.random.multivariate_normal(mean, cov, (100,))
+
+    mean = (10, -22)
+    cov = np.array([[10, 5], [5, 10]])
+    u = np.random.multivariate_normal(mean, cov, (100,))
+
+    mean = (-10, -22)
+    cov = np.array([[10, 5], [5, 10]])
+    v = np.random.multivariate_normal(mean, cov, (100,))
+
+    data, _ = shuffle(x, y, z, u, v)
+    # displayer([x, y, z, u, v], "data")
+    return (data, data), 5
+
+
+def test_without_n_clusters(data_function, algorithm_class):
+    (train_data, test_data), n_clusters = data_function()
+    model = algorithm_class()
+    model.fit(train_data, 10)
+    res = model.predict(test_data)
+    return model.cluster_num
+
+
+def test_with_n_clusters(data_function, algorithm_class):
+    (train_data, test_data), n_clusters = data_function()
+    model = algorithm_class(n_clusters)
+    model.fit(train_data)
+    res = model.predict(test_data)
+    # display(test_data, res, "res")
+    assert len(res.shape) == 1 and res.shape[0] == test_data.shape[0], "shape of result is wrong"
+    return res
+
+
+def testcase_1_1():
+    test_with_n_clusters(data_1, KMeans)
+    return True
+
+
+def testcase_1_2():
+    res = test_with_n_clusters(data_2, KMeans)
+    return res[0] != res[1] and res[1] == res[2]
+
+
+def testcase_2_1():
+    test_with_n_clusters(data_1, GaussianMixture)
+    return True
+
+
+def testcase_2_2():
+    res = test_with_n_clusters(data_3, GaussianMixture)
+    return res[0] != res[1] and res[1] == res[2]
+
+
+def testcase_1_6():
+    test_with_n_clusters(data_6, KMeans)
+    return True
+
+
+def testcase_2_6():
+    test_with_n_clusters(data_6, GaussianMixture)
+    return True
+
+
+def testcase_3_1():
+    res = test_without_n_clusters(data_6, ClusteringAlgorithm)
+    print(res)
+    return res == 5
+
+
+def test_all(err_report=False):
+    testcases = [
+        ["KMeans-1", testcase_1_1, 4],
+        ["KMeans-2", testcase_1_2, 4],
+
+        # ["KMeans-3", testcase_1_3, 4],
+        # ["KMeans-4", testcase_1_4, 4],
+        # ["KMeans-5", testcase_1_5, 4],
+        # ["KMeans-6", testcase_1_6, 4],
+
+        ["GMM-1", testcase_2_1, 4],
+        ["GMM-2", testcase_2_2, 4],
+
+        # ["GMM-3", testcase_2_3, 4],
+        # ["GMM-4", testcase_2_4, 4],
+        # ["GMM-5", testcase_2_5, 4],
+        # ["GMM-6", testcase_2_6, 4],
+        # ["ClusteringAlgorithm", testcase_3_1, 4],
+    ]
+    sum_score = sum([case[2] for case in testcases])
+    score = 0
+    for case in testcases:
+        try:
+            res = case[2] if case[1]() else 0
+        except Exception as e:
+            if err_report:
+                print("Error [{}] occurs in {}".format(str(e), case[0]))
+            res = 0
+        score += res
+        print("+ {:14} {}/{}".format(case[0], res, case[2]))
+    print("{:16} {}/{}".format("FINAL SCORE", score, sum_score))
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1 and sys.argv[1] == "--report":
+        test_all(True)
+    else:
+        test_all()
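+
+# Run as `python tester_demo.py`; pass `--report` to also print the error
+# raised by any failing test case.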