diff --git a/assignment-3/submission/.keep b/assignment-3/submission/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/assignment-3/submission/19210680053/.keep b/assignment-3/submission/19210680053/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/assignment-3/submission/19210680053/README.md b/assignment-3/submission/19210680053/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6f59e4523add7d648e9586ee5c0c79ddc6f6f5d0
--- /dev/null
+++ b/assignment-3/submission/19210680053/README.md
@@ -0,0 +1,275 @@
+# Assignment 3
+
+## 1. KMeans Model Structure
+The first part of this experiment implements the KMeans model.
+### Initialization of the cluster centers
+Given the number of clusters (cluNum) and the data shape (N*dim), cluNum rows are first drawn at random from the dataset to serve as the initial cluster centers, giving
+$$\begin{Bmatrix}
+C_1,C_2,...,C_{cluNum}
+\end{Bmatrix}$$
+### Distances to the centers and assignment
+Next, the Euclidean distance from every training sample to each center is computed:
+$$
+euclDistance_{i,j}={\sqrt {\sum_{t=1}^{dim} (X_{it}-C_{jt})^2}}
+$$
+For each sample, the index of its __nearest center__ and the corresponding squared distance are kept in the first and second columns of clusterAssment (shape: N*2).
+### Center update
+Using the first column of clusterAssment, the points closest to each $C_k$, $1\leq k\leq cluNum$, are collected, and the mean of the points in cluster k becomes the new $C_k$.
+### Stopping criterion
+clusterChanged serves as the loop flag: once the nearest-center index k recorded in clusterAssment stops changing for every point, the loop ends.
+```python
+    if clusterAssment[i, 0] != minIndex:
+        clusterChanged = True
+    clusterAssment[i, :] = minIndex, minDist ** 2
+```
+The cluNum centers are then returned as the training result:
+$$\begin{Bmatrix}
+C_1,C_2,C_3,...C_{cluNum}
+\end{Bmatrix}$$
+### Prediction
+```python
+    def predict(self, test_data):
+        self.test_data = test_data
+        numSamples, dim = self.test_data.shape
+        m_res = []
+        for i in range(numSamples):
+            distance = [self.euclDistance(self.test_data[i, :], self.centroids[j, :]) for j in range(self.cluNum)]
+            clus_type = distance.index(min(distance))
+            m_res.append(clus_type)
+        return np.array(m_res)
+```
+The Euclidean distance from every test sample to all centers is computed, and each point is assigned to the cluster of its nearest center.
+
+## 2. GaussianMixture Model Structure
+The assignment requires a GaussianMixture implementation for both 1-D and 2-D data, so the two cases are implemented separately.
+### 2-D data
+#### Initializing each cluster's mean, covariance matrix, and weight
+Since the initial values affect the clustering result, each mean is drawn at random from the interval $[(\text{minimum value})/2,(\text{maximum value})/2]$ of the training data:
+```python
+self.means = np.random.uniform(train_data.min() / 2, train_data.max() / 2, size=(self.cluNum, sample_fea))
+```
+Each covariance matrix is initialized with ones on the diagonal and zeros elsewhere,
+and every cluster receives an equal weight:
+```python
+    self.weight = np.ones(self.cluNum) / self.cluNum
+```
+#### E step
+
+$$\gamma_{k}^{(i)} = \frac{\pi_{k}N(x^{(i)}|\mu_{k},\Sigma_{k})}{\sum_{k=1}^{K}\pi_{k}N(x^{(i)}|\mu_{k},\Sigma_{k})}$$
+$N(x^{(i)}|\mu_{k},\Sigma_{k})$ is the 2-D Gaussian density; multiplying it by each cluster's weight $\pi_{k}$ and normalizing gives the probability that the i-th sample falls into the k-th cluster.
+#### M step
+The weight $\pi_{k}$ is updated from the effective number of samples in cluster k,
+$$N_{k}=\sum_{i=1}^{n}\gamma_{k}^{(i)}$$
+$$\pi_{k}=\frac{N_{k}}{N}$$
+and each cluster's mean and covariance matrix are updated as
+$$\mu_{k}=\frac{\sum_{i=1}^{n}\gamma_{k}^{(i)}x^{(i)}}{N_{k}}$$
+$$\Sigma_{k}=\frac{\sum_{i=1}^{n}\gamma_{k}^{(i)}(x^{(i)}-\mu_{k})(x^{(i)}-\mu_{k})^{T}}{N_{k}}$$
+
+```python
+N_k = np.sum(p_mat[:, j], axis=0)
+self.weight[j] = N_k / sampleN
+self.means[j] = (1 / N_k) * np.sum(train_data * p_mat[:, j].reshape(-1, 1), axis=0)
+
+self.covariance[j] = (1 / N_k) * np.dot(
+    (p_mat[:, j].reshape(-1, 1) * (train_data - self.means[j])).T,
+    (train_data - self.means[j])) + self.reg_cov
+```
+#### Iteration
+Convergence of $\begin{Bmatrix}\Sigma_{k},\pi_{k},\mu_{k}\end{Bmatrix}$ could be used as the stopping condition; here a maximum number of training iterations is used instead.
+#### Prediction
+Each cluster's mean, covariance matrix, and weight are obtained from train_data.
+$$prob_{k,i}=\pi_{k}N(x^{(i)}|\mu_{k},\Sigma_{k})$$
+For each data point i, the center k that maximizes this quantity is found, and the point is assigned to that cluster.
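+
+A minimal sketch of this prediction rule, calling scipy's multivariate_normal directly with made-up parameters (not the submission's trained values):
+```python
+import numpy as np
+from scipy.stats import multivariate_normal
+
+# hypothetical trained parameters for 2 clusters in 2-D
+means = [np.array([0.0, 0.0]), np.array([5.0, 5.0])]
+covs = [np.eye(2), np.eye(2)]
+weights = [0.4, 0.6]
+
+x = np.array([4.2, 5.1])  # one test point
+# prob_{k,i} = pi_k * N(x | mu_k, Sigma_k); assign to the argmax over k
+probs = [w * multivariate_normal(mean=m, cov=c).pdf(x)
+         for w, m, c in zip(weights, means, covs)]
+print(int(np.argmax(probs)))  # -> 1, the cluster centered at (5, 5)
+```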
+### 1-D data
+#### Data normalization
+1-D data is first standardized (zero mean, unit variance) and flattened into a one-dimensional array:
+```python
+    def normalize(self, data):
+        flat_data = data.flatten()
+        nor_data = (flat_data - flat_data.mean()) / np.sqrt(flat_data.var())
+        return nor_data
+```
+#### Initializing each cluster's mean, variance (instead of a covariance matrix), and weight
+The 1-D means are initialized as an integer ramp symmetric about 0, scaled by the range of the training data divided by the number of clusters:
+```python
+self.mu = (np.arange(self.cluNum) - self.cluNum // 2) * (train_data.max() - train_data.min()) / self.cluNum
+```
+The variances of all cluNum clusters are initialized to 1,
+and every cluster receives an equal weight.
+#### E step and M step
+The E and M steps follow the same idea as in the 2-D case,
+but when updating each cluster's mean and variance in the M step, the old value is kept with weight 0.1 and the new estimate enters with weight 0.9.
+#### Stopping criterion
+A threshold is set so that iteration stops once $\begin{Bmatrix}\Sigma_{k},\pi_{k},\mu_{k}\end{Bmatrix}$ have all converged:
+```python
+    if (np.sum((self.mu - mu_) ** 2) + np.abs(self.sigma2 - sigma2_).sum()) < 1e-3:
+        break
+```
+#### Prediction
+
+Each cluster's mean, variance, and weight are obtained from train_data, and
+$$\arg\max_k \frac{N(x^{(i)}|\mu_{k},\sigma_{k}^{2})\,\pi_{k}}
+{\sum_{k=1}^{cluNum}N(x^{(i)}|\mu_{k},\sigma_{k}^{2})\,\pi_{k}}$$
+is computed; data point i is then assigned to the resulting cluster k.
+## 3. Basic Experiments
+### 3.1 Data visualization
+Data visualization is implemented in test_demo.py:
+```python
+def pltshow(test_data, res, n_clu):
+    mark = [[np.random.randint(0, 256) for i in range(3)] for j in range(n_clu)]
+    marklist = ["#{:02x}{:02x}{:02x}".format(mark[i][0], mark[i][1], mark[i][2]) for i in range(n_clu)]
+    colorlist = [marklist[i] for i in res]
+    pyplot.scatter(test_data[:, 0], test_data[:, 1], c=colorlist)
+    pyplot.show()
+```
+### 3.2 Random 2-D dataset generation
+The models are checked on 2-D training and test data generated from Gaussian distributions with different means, variances (covariance matrices), and sample counts.
+When the clusters are well separated, the classification is quite accurate.
++ Class 1, 800 samples, label 0:
+
+$$
+\begin{array}{l}
+\Sigma=\left[\begin{array}{cc}
+3 & 0 \\\\
+0 & 2
+\end{array}\right] \\\\
+\mu=\left[\begin{array}{ll}
+1 & 2
+\end{array}\right]
+\end{array}
+$$
+
++ Class 2, 200 samples, label 1:
+
+$$
+\begin{array}{l}
+\Sigma=\left[\begin{array}{cc}
+2 & 0 \\\\
+0 & 2
+\end{array}\right] \\\\
+\mu=\left[\begin{array}{ll}
+26 & 28
+\end{array}\right]
+\end{array}
+$$
+
++ Class 3, 1000 samples, label 2:
+
+$$
+\begin{array}{l}
+\Sigma=\left[\begin{array}{cc}
+10 & 5 \\\\
+5 & 10
+\end{array}\right] \\\\
+\mu=\left[\begin{array}{ll}
+-25 & -23
+\end{array}\right]
+\end{array}
+$$
+
+After randomly shuffling the dataset, the first 80% is used for training and the last 20% for testing.
+The generated training set:
+
+Training dataset with 3 Gaussian Distributions
+
+The test set:
+
+Testing dataset with 3 Gaussian Distributions
+
+The KMeans classification result:
+
+KMeans Classification with 3 Gaussian Distributions
+
+The GaussianMixture classification result:
+
+GMM Classification with 3 Gaussian Distributions
+
+To raise the difficulty, a fourth class is added close to an existing one:
+
+Class 4, 1000 samples, label 3:
+$$
+\begin{array}{l}
+\Sigma=\left[\begin{array}{cc}
+4 & 3 \\\\
+3 & 4
+\end{array}\right] \\\\
+\mu=\left[\begin{array}{ll}
+-20 & -21
+\end{array}\right]
+\end{array}
+$$
+
+After randomly shuffling the new dataset, the first 80% is used for training and the last 20% for testing.
+
+The generated training set:
+
+Training dataset with 4 Gaussian Distributions
+
+The test set:
+
+Testing dataset with 4 Gaussian Distributions
+
+The KMeans classification result:
+
+KMeans Classification with 4 Gaussian Distributions
+
+The GMM classification result:
+
+GMM Classification with 4 Gaussian Distributions
+
+Here GMM outperforms KMeans, because the data itself is drawn from 2-D Gaussian distributions.
+
+KMeans iterates on Euclidean distances until no cluster center moves any more; when two clusters have centers that lie close together, the converged assignment still cannot separate their points effectively.
+
+The generated data, on the other hand, follows 2-D Gaussian distributions, which matches GMM's assumption that the data is Gaussian, so GMM classifies it better.
+
+## 4. Choosing the Number of Clusters
+
+### Optimization with the Elbow Method
+
+The principle: while the chosen k is still below the true n_clusters, each increment of k reduces the cost sharply; once k exceeds the true n_clusters, each further increment changes the cost only slightly. The correct k therefore sits at this turning point, the "elbow" of the cost curve.
+
+The cost is defined as
+$$
+cost = \sum_{k=1}^{K}\sum_{i \in cluster\ k}\left\|X_{i}-C_{k}\right\|^{2}
+$$
+i.e., for every training point, the squared Euclidean distance to its assigned cluster center, summed over all points.
+
+### Selecting the optimal number of clusters
+
+First compute the relative change of the total cost:
+$$
+\Delta cost_{k} = cost_{k} / cost_{k-1}-1
+$$
+then compute the change of this rate (the convexity):
+$$
+\Delta^{2} cost_{k} = \Delta cost_{k+1} - \Delta cost_{k}
+$$
+The k that maximizes $\Delta^{2} cost_{k}$ is selected as the optimal number of clusters: the cost drops steeply from $k-1$ to $k$ and only gently from $k$ to $k+1$.
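+
+A standalone sketch of this selection rule on hypothetical cost values (the submission's version lives in ClusteringAlgorithm.fit in source.py):
+```python
+# costs[j] = total within-cluster squared distance for k = j + 2 (made-up values)
+costs = [9500.0, 2100.0, 1900.0, 1850.0]
+
+# relative change: delta_k = cost_k / cost_{k-1} - 1   (for k = 3, 4, 5)
+chg = [costs[j] / costs[j - 1] - 1 for j in range(1, len(costs))]
+
+# convexity: delta2_k = delta_{k+1} - delta_k          (for k = 3, 4)
+chg2 = [chg[j + 1] - chg[j] for j in range(len(chg) - 1)]
+
+best_k = chg2.index(max(chg2)) + 3  # index 0 corresponds to k = 3
+print(best_k)  # -> 3: steep drop from k=2 to k=3, nearly flat afterwards
+```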
+
+The datasets in test_demo_elbow, combining three and four Gaussian distributions respectively, are used to evaluate this procedure:
+
+#### Three Gaussian distributions
+
+The actual distribution of the training set:
+
+Elbow KMeans Classification with 3 Gaussian Distributions
+
+Model prediction:
+
+Elbow KMeans Classification with 3 Gaussian Distributions
+
+#### Four Gaussian distributions
+
+The actual distribution of the training set:
+
+Elbow KMeans Classification with 4 Gaussian Distributions
+
+Model prediction:
+
+Elbow KMeans Classification with 4 Gaussian Distributions
+
+A limitation of the elbow test is that it generally only works for small k, which is why the generated 2-D Gaussian datasets use only a few components.
diff --git a/assignment-3/submission/19210680053/img/.keep b/assignment-3/submission/19210680053/img/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/assignment-3/submission/19210680053/img/GMM1.png b/assignment-3/submission/19210680053/img/GMM1.png
new file mode 100644
index 0000000000000000000000000000000000000000..f716743bd900ec8229ff10ac5a11baca631403bb
Binary files /dev/null and b/assignment-3/submission/19210680053/img/GMM1.png differ
diff --git a/assignment-3/submission/19210680053/img/GMM2.png b/assignment-3/submission/19210680053/img/GMM2.png
new file mode 100644
index 0000000000000000000000000000000000000000..dcb9604acad5dd5ad40b1d23ab184dd9ebdc668d
Binary files /dev/null and b/assignment-3/submission/19210680053/img/GMM2.png differ
diff --git a/assignment-3/submission/19210680053/img/KMeans1.png b/assignment-3/submission/19210680053/img/KMeans1.png
new file mode 100644
index 0000000000000000000000000000000000000000..1d7e98ba48fe7c89f6f52737d4e6b2e61c1aa5c5
Binary files /dev/null and b/assignment-3/submission/19210680053/img/KMeans1.png differ
diff --git a/assignment-3/submission/19210680053/img/KMeans2.png b/assignment-3/submission/19210680053/img/KMeans2.png
new file mode 100644
index 0000000000000000000000000000000000000000..f83bb59388ee3babda573eba7504d1ccc0700e7e
Binary files /dev/null and b/assignment-3/submission/19210680053/img/KMeans2.png differ
diff --git a/assignment-3/submission/19210680053/img/Test1.png b/assignment-3/submission/19210680053/img/Test1.png
new file mode 100644
index 0000000000000000000000000000000000000000..3bf3f0a5619282ddbb8136a36441259fa377e2c4
Binary files /dev/null and b/assignment-3/submission/19210680053/img/Test1.png differ
diff --git a/assignment-3/submission/19210680053/img/Test2.png b/assignment-3/submission/19210680053/img/Test2.png
new file mode 100644
index 0000000000000000000000000000000000000000..6ed505894310f7d279fdbd0807223d065adfa79b
Binary files /dev/null and b/assignment-3/submission/19210680053/img/Test2.png differ
diff --git a/assignment-3/submission/19210680053/img/Train1.png b/assignment-3/submission/19210680053/img/Train1.png
new file mode 100644
index 0000000000000000000000000000000000000000..b4b5e8924d1009a2aa98e634ff369a915003e419
Binary files /dev/null and b/assignment-3/submission/19210680053/img/Train1.png differ
diff --git a/assignment-3/submission/19210680053/img/Train2.png b/assignment-3/submission/19210680053/img/Train2.png
new file mode 100644
index 0000000000000000000000000000000000000000..ffc0c227aa625dc14bbaae3b7ba8190c6e873e08
Binary files /dev/null and b/assignment-3/submission/19210680053/img/Train2.png differ
diff --git a/assignment-3/submission/19210680053/img/elbowres1.png b/assignment-3/submission/19210680053/img/elbowres1.png
new file mode 100644
index 0000000000000000000000000000000000000000..53fe163150b45e1d316bbd75a2f5c0d1f1156b7a
Binary files /dev/null and b/assignment-3/submission/19210680053/img/elbowres1.png differ
diff --git a/assignment-3/submission/19210680053/img/elbowres2.png b/assignment-3/submission/19210680053/img/elbowres2.png
new file mode 100644
index 0000000000000000000000000000000000000000..0b56417f1a1f1a203702088396e10d9da79e6268
Binary files /dev/null and b/assignment-3/submission/19210680053/img/elbowres2.png differ
diff --git a/assignment-3/submission/19210680053/img/elbowtest1.png b/assignment-3/submission/19210680053/img/elbowtest1.png
new file mode 100644
index 0000000000000000000000000000000000000000..9caaa1aa90c204b794b478cbf75ba4498d5ad6a4
Binary files /dev/null and b/assignment-3/submission/19210680053/img/elbowtest1.png differ
diff --git a/assignment-3/submission/19210680053/img/elbowtest2.png b/assignment-3/submission/19210680053/img/elbowtest2.png
new file mode 100644
index 0000000000000000000000000000000000000000..fc8c97fcd20a06a34cc5ef7a5194b213f14b1f88
Binary files /dev/null and b/assignment-3/submission/19210680053/img/elbowtest2.png differ
diff --git a/assignment-3/submission/19210680053/img/kmeans2.png b/assignment-3/submission/19210680053/img/kmeans2.png
new file mode 100644
index 0000000000000000000000000000000000000000..f83bb59388ee3babda573eba7504d1ccc0700e7e
Binary files /dev/null and b/assignment-3/submission/19210680053/img/kmeans2.png differ
diff --git a/assignment-3/submission/19210680053/source.py b/assignment-3/submission/19210680053/source.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bf9619a1d4b1d2c9576aa14e25c012f16d7f170
--- /dev/null
+++ b/assignment-3/submission/19210680053/source.py
@@ -0,0 +1,197 @@
+import numpy as np
+from scipy.stats import multivariate_normal
+
+
+class KMeans(object):
+    def __init__(self, n_clusters):
+        self.cluNum = n_clusters
+        self.distance = 0
+
+    def fit(self, train_data):
+        self.train_data = train_data
+        numSamples, dim = self.train_data.shape
+        # step 1: initialize the centers with cluNum distinct random samples
+        index = np.random.choice(numSamples, self.cluNum, replace=False)
+        self.centroids = self.train_data[index, :].astype(float)
+        # column 0: index of the nearest center, column 1: squared distance to it
+        clusterAssment = np.zeros((numSamples, 2))
+        clusterChanged = True
+
+        while clusterChanged:
+            clusterChanged = False
+            ## for each sample
+            for i in range(numSamples):
+                minDist = float('inf')
+                minIndex = 0
+                ## step 2: find the closest centroid
+                for j in range(self.cluNum):
+                    distance = self.euclDistance(self.centroids[j, :], self.train_data[i, :])
+                    if distance < minDist:
+                        minDist = distance
+                        minIndex = j
+
+                ## step 3: update the sample's cluster assignment
+                if clusterAssment[i, 0] != minIndex:
+                    clusterChanged = True
+                clusterAssment[i, :] = minIndex, minDist ** 2
+            ## step 4: update the centroids
+            for j in range(self.cluNum):
+                pointsInCluster = self.train_data[np.nonzero(clusterAssment[:, 0] == j)[0]]
+                if len(pointsInCluster) != 0:
+                    self.centroids[j, :] = np.mean(pointsInCluster, axis=0)
+
+        # total within-cluster squared distance (the elbow-method cost)
+        self.distance = float(np.sum(clusterAssment[:, 1]))
+        return self.centroids, clusterAssment, self.distance
+
+    def predict(self, test_data):
+        self.test_data = test_data
+        numSamples, dim = self.test_data.shape
+        m_res = []
+        for i in range(numSamples):
+            distance = [self.euclDistance(self.test_data[i, :], self.centroids[j, :]) for j in range(self.cluNum)]
+            clus_type = distance.index(min(distance))
+            m_res.append(clus_type)
+        return np.array(m_res)
+
+    def euclDistance(self, vector1, vector2):
+        return np.sqrt(np.sum(np.power(vector2 - vector1, 2)))
+
+
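+# A minimal usage sketch of the KMeans class above (illustrative random data,
+# not part of the assignment's required interface):
+#
+#     data = np.random.rand(100, 2)             # 100 random 2-D points
+#     km = KMeans(n_clusters=3)
+#     centroids, assignment, cost = km.fit(data)
+#     labels = km.predict(data)                 # ndarray of shape (100,)
+
+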
+class GaussianMixture:
+    # covariance: ndarray, created in fit for the 2-D case
+    def __init__(self, n_clusters, reg_cov: float = 1e-06):
+        self.cluNum = n_clusters
+        self.max_iter = 1200
+        self.reg_cov = reg_cov
+        self.alpha = np.ones(n_clusters) / n_clusters
+        self.sigma2 = np.ones(n_clusters)
+
+    def fit(self, train_data):
+        sampleN, sample_fea = train_data.shape
+        '''
+        gamma.shape  (N, K)
+        mu.shape     (1, K)
+        sigma2.shape (1, K)
+        alpha.shape  (1, K)
+        '''
+        if sample_fea == 1:
+            train_data = self.normalize(train_data)
+            self.gamma = np.ones((train_data.shape[0], self.cluNum)) / self.cluNum
+            # means: an integer ramp symmetric about 0, scaled by the data range over cluNum
+            self.mu = (np.arange(self.cluNum) - self.cluNum // 2) * (train_data.max() - train_data.min()) / self.cluNum
+            sigma2_ = self.sigma2
+            mu_ = self.mu
+            while True:
+                # E step and M step; old values keep weight 0.1, new estimates weight 0.9
+                self.gamma = (0.1 * self.gamma + 0.9 * self.phi(train_data, self.mu).T * self.alpha / (
+                        self.phi(train_data, self.mu).T * self.alpha).sum(
+                    axis=1).reshape(train_data.shape[0], 1))
+
+                self.mu = (0.1 * self.mu + 0.9 * np.matmul(train_data, self.gamma) / self.gamma.sum(axis=0))
+
+                self.sigma2 = (0.1 * self.sigma2 + 0.9 * (
+                        self.gamma * (train_data.reshape(train_data.shape[0], 1) - self.mu) ** 2).sum(
+                    axis=0) / self.gamma.sum(axis=0))
+
+                self.alpha = (0.1 * self.alpha + 0.9 * self.gamma.sum(axis=0) / train_data.shape[0])
+                # stop once the means and variances have converged
+                if (np.sum((self.mu - mu_) ** 2) + np.abs(self.sigma2 - sigma2_).sum()) < 1e-3:
+                    break
+                mu_ = self.mu
+                sigma2_ = self.sigma2
+        elif sample_fea == 2:
+            self.reg_cov = self.reg_cov * np.identity(sample_fea)
+            # draw the initial means uniformly from [min/2, max/2]; a float array, so
+            # the M-step updates below are not truncated to integers
+            self.means = np.random.uniform(train_data.min() / 2, train_data.max() / 2, size=(self.cluNum, sample_fea))
+            self.covariance = np.zeros((self.cluNum, sample_fea, sample_fea))
+            for k in range(self.cluNum):
+                np.fill_diagonal(self.covariance[k], 1)
+            self.weight = np.ones(self.cluNum) / self.cluNum
+
+            p_mat = np.zeros((sampleN, self.cluNum))
+            for i in range(self.max_iter):
+                # E step: responsibilities proportional to weight_j * N(x | mean_j, cov_j)
+                self.covariance += self.reg_cov  # regularize once per iteration
+                for j in range(self.cluNum):
+                    g = multivariate_normal(mean=self.means[j], cov=self.covariance[j])
+                    p_mat[:, j] = self.weight[j] * g.pdf(train_data)
+                total_n = p_mat.sum(axis=1)
+                total_n[total_n == 0] = self.cluNum
+                p_mat /= total_n.reshape(-1, 1)
+                # M step
+                for j in range(self.cluNum):
+                    N_k = np.sum(p_mat[:, j], axis=0)
+
+                    self.means[j] = (1 / N_k) * np.sum(train_data * p_mat[:, j].reshape(-1, 1), axis=0)
+
+                    self.covariance[j] = (1 / N_k) * np.dot(
+                        (p_mat[:, j].reshape(-1, 1) * (train_data - self.means[j])).T,
+                        (train_data - self.means[j])) + self.reg_cov
+                    self.weight[j] = N_k / sampleN
+        else:
+            print("sorry, the model is only designed for 1-D and 2-D data")
+
+    def predict(self, test_data):
+        sampleN, sample_fea = test_data.shape
+        if sample_fea == 1:
+            test_data = self.normalize(test_data)
+            gamma = (self.phi(test_data, self.mu).T * self.alpha / (self.phi(test_data, self.mu).T * self.alpha).sum(
+                axis=1).reshape(test_data.shape[0], 1))
+            return gamma.argmax(axis=1)
+        elif sample_fea == 2:
+            p_mat = np.zeros((test_data.shape[0], self.cluNum))
+            for j in range(self.cluNum):
+                g = multivariate_normal(mean=self.means[j], cov=self.covariance[j])
+                p_mat[:, j] = self.weight[j] * g.pdf(test_data)
+
+            total_n = p_mat.sum(axis=1)
+            total_n[total_n == 0] = self.cluNum
+            p_mat /= total_n.reshape(-1, 1)
+            return np.argmax(p_mat, axis=1)
+        else:
+            print("only one- and two-dimensional data is accepted")
+
+    def normalize(self, data):
+        flat_data = data.flatten()
+        nor_data = (flat_data - flat_data.mean()) / np.sqrt(flat_data.var())
+        return nor_data
+
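+    # Note: for each cluster k, phi (below) evaluates the scalar Gaussian density
+    #   N(x | mu_k, sigma2_k) = exp(-(x - mu_k)^2 / (2 * sigma2_k)) / sqrt(2 * pi * sigma2_k)
+    # row-wise over all samples; phi[k, i] should agree with
+    # scipy.stats.norm(mu[k], np.sqrt(sigma2[k])).pdf(data[i]).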
+    def phi(self, data, mu):
+        '''phi.shape (K, N): phi[k, i] = N(data[i] | mu[k], sigma2[k])'''
+        phi = (1 / np.sqrt(2 * np.pi * self.sigma2.reshape(self.cluNum, 1)) * np.exp(
+            - (data - mu.reshape(self.cluNum, 1)) ** 2 / (2 * self.sigma2.reshape(self.cluNum, 1))))
+        return phi
+
+
+class ClusteringAlgorithm:
+    def __init__(self):
+        self.elbow_dict = {}
+        self.elbow_chg = []
+        self.elbow_chg_next = []
+        self.bestCluNum = 0
+
+    def fit(self, train_data):
+        for i in range(2, 6):
+            mini_loss = 1e8
+            '''rerun KMeans several times per k and keep the lowest cost, to guarantee robustness'''
+            for j in range(50):
+                km = KMeans(i)
+                loss = km.fit(train_data)[2]
+                if loss < mini_loss:
+                    mini_loss = loss
+            self.elbow_dict[i] = mini_loss
+        elbow_dist = list(self.elbow_dict.values())
+        # relative cost change for k = 3..5, then its second difference (convexity)
+        self.elbow_chg = [(elbow_dist[i] / elbow_dist[i - 1]) - 1 for i in range(1, len(elbow_dist))]
+        self.elbow_chg_next = [(self.elbow_chg[i] - self.elbow_chg[i - 1]) for i in range(1, len(self.elbow_chg))]
+        # pick the k whose cost curve flattens right after it (maximum convexity);
+        # index 0 of elbow_chg_next corresponds to k = 3
+        self.bestCluNum = self.elbow_chg_next.index(max(self.elbow_chg_next)) + 3
+        self.km = KMeans(self.bestCluNum)
+        self.km.fit(train_data)
+        return elbow_dist, self.bestCluNum
+
+    def predict(self, test_data):
+        res = self.km.predict(test_data)
+        return res
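+
+
+# A minimal usage sketch of ClusteringAlgorithm (illustrative, assuming three
+# well-separated made-up blobs; not part of the assignment's required interface):
+#
+#     data = np.concatenate([
+#         np.random.normal(loc, 1.0, size=(200, 2))
+#         for loc in ([0, 0], [10, 10], [-10, 8])
+#     ])
+#     algo = ClusteringAlgorithm()
+#     costs, best_k = algo.fit(data)   # costs for k = 2..5 and the elbow choice
+#     labels = algo.predict(data)      # labels from a KMeans refit with best_k
+#     # for blobs this well separated, best_k is expected to be 3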
diff --git a/assignment-3/submission/19210680053/test_demo.py b/assignment-3/submission/19210680053/test_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..1dbd114e1e4cadf63d2932f104206ae40f01e9d9
--- /dev/null
+++ b/assignment-3/submission/19210680053/test_demo.py
@@ -0,0 +1,168 @@
+import numpy as np
+import sys
+from source import GaussianMixture, KMeans, ClusteringAlgorithm
+from matplotlib import pyplot
+
+
+def shuffle(*datas):
+    data = np.concatenate(datas)
+    label = np.concatenate([
+        np.ones((d.shape[0],), dtype=int) * i
+        for (i, d) in enumerate(datas)])
+    N = data.shape[0]
+    idx = np.arange(N)
+    np.random.shuffle(idx)
+    data = data[idx]
+    label = label[idx]
+    return data, label
+
+
+def data_1():
+    mean = (1, 2)
+    cov = np.array([[73, 0], [0, 22]])
+    x = np.random.multivariate_normal(mean, cov, (800,))
+
+    mean = (16, -5)
+    cov = np.array([[21.2, 0], [0, 32.1]])
+    y = np.random.multivariate_normal(mean, cov, (200,))
+
+    mean = (10, 22)
+    cov = np.array([[10, 5], [5, 10]])
+    z = np.random.multivariate_normal(mean, cov, (1000,))
+
+    data, _ = shuffle(x, y, z)
+    return (data, data), 3
+
+
+def data_test(input=3):
+    mean = (1, 2)
+    cov = np.array([[3, 0], [0, 2]])
+    x = np.random.multivariate_normal(mean, cov, (800,))
+
+    mean = (26, 28)
+    cov = np.array([[2, 0], [0, 2]])
+    y = np.random.multivariate_normal(mean, cov, (200,))
+
+    mean = (-25, -23)
+    cov = np.array([[10, 5], [5, 10]])
+    z = np.random.multivariate_normal(mean, cov, (1000,))
+
+    mean = (11, 13)
+    cov = np.array([[4, 3], [3, 4]])
+    q = np.random.multivariate_normal(mean, cov, (1000,))
+    if input == 3:
+        data, label = shuffle(x, y, z)
+    elif input == 4:
+        data, label = shuffle(x, y, z, q)
+    else:
+        raise ValueError("only 3 and 4 are accepted")
+    num = len(data)
+    train_data, test_data = data[:int(0.8 * num)], data[int(0.8 * num):num]
+    train_label, test_label = label[:int(0.8 * num)], label[int(0.8 * num):num]
+    return train_data, test_data, train_label, test_label
+
+
+def data_2():
+    train_data = np.array([
+        [23, 12, 173, 2134],
+        [99, -12, -126, -31],
+        [55, -145, -123, -342],
+    ])
+    return (train_data, train_data), 2
+
+
+def data_3():
+    train_data = np.array([
+        [23],
+        [-2999],
+        [-2955],
+    ])
+
+    return (train_data, train_data), 2
+
+
+def test_with_n_clusters(data_function, algorithm_class):
+    (train_data, test_data), n_clusters = data_function()
+    model = algorithm_class(n_clusters)
+    model.fit(train_data)
+    res = model.predict(test_data)
+    assert len(
+        res.shape) == 1 and res.shape[0] == test_data.shape[0], "shape of result is wrong"
+    return res
+
+
+def testcase_1_1():
+    test_with_n_clusters(data_1, KMeans)
+    return True
+
+
+def testcase_1_2():
+    res = test_with_n_clusters(data_2, KMeans)
+    return res[0] != res[1] and res[1] == res[2]
+
+
+def testcase_2_1():
+    test_with_n_clusters(data_1, GaussianMixture)
+    return True
+
+
+def testcase_2_2():
+    res = test_with_n_clusters(data_3, GaussianMixture)
+    return res[0] != res[1] and res[1] == res[2]
+
+
+def test_all(err_report=False):
+    testcases = [
+        ["KMeans-1", testcase_1_1, 4],
+        ["KMeans-2", testcase_1_2, 4],
+        # ["KMeans-3", testcase_1_3, 4],
+        # ["KMeans-4", testcase_1_4, 4],
+        # ["KMeans-5", testcase_1_5, 4],
+        ["GMM-1", testcase_2_1, 4],
+        ["GMM-2", testcase_2_2, 4],
+        # ["GMM-3", testcase_2_3, 4],
+        # ["GMM-4", testcase_2_4, 4],
+        # ["GMM-5", testcase_2_5, 4],
+    ]
+    sum_score = sum([case[2] for case in testcases])
+    score = 0
+    for case in testcases:
+        try:
+            res = case[2] if case[1]() else 0
+        except Exception as e:
+            if err_report:
+                print("Error [{}] occurs in {}".format(str(e), case[0]))
+            res = 0
+        score += res
+        print("+ {:14} {}/{}".format(case[0], res, case[2]))
+    print("{:16} {}/{}".format("FINAL SCORE", score, sum_score))
+
+
+def pltshow(data, res, n_clu):
+    mark = [[np.random.randint(0, 256) for i in range(3)] for j in range(n_clu)]
+    marklist = ["#{:02x}{:02x}{:02x}".format(mark[i][0], mark[i][1], mark[i][2]) for i in range(n_clu)]
+    colorlist = [marklist[i] for i in res]
+    pyplot.scatter(data[:, 0], data[:, 1], c=colorlist)
+    pyplot.show()
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1 and sys.argv[1] == "--report":
+        test_all(True)
+    else:
+        test_all()
+
+    final = data_test(3)  # input = 3 or 4
+    train, test, train_label, test_label = final[0], final[1], final[2], final[3]
+    cluster_num = 3
+    km = KMeans(cluster_num)
+    km.fit(train)
+    kmeans_res = km.predict(test)
+    gmm = GaussianMixture(cluster_num)
+    gmm.fit(train)
+    gmm_res = gmm.predict(test)
+    # pltshow(train, train_label, cluster_num)
+    # pltshow(test, test_label, cluster_num)
+    # pltshow(test, kmeans_res, cluster_num)
+    # pltshow(test, gmm_res, cluster_num)
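+
+# Usage note: `python test_demo.py` runs the scored test cases above, while
+# `python test_demo.py --report` additionally prints the exception message of
+# any failing case (see the --report branch in the __main__ block).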
diff --git a/assignment-3/submission/19210680053/test_demo_elbow.py b/assignment-3/submission/19210680053/test_demo_elbow.py
new file mode 100644
index 0000000000000000000000000000000000000000..b24efe53217cfeeaccb0979a293f49be3e0eee22
--- /dev/null
+++ b/assignment-3/submission/19210680053/test_demo_elbow.py
@@ -0,0 +1,70 @@
+import numpy as np
+from source import KMeans, ClusteringAlgorithm
+from matplotlib import pyplot
+
+
+def shuffle(*datas):
+    data = np.concatenate(datas)
+    label = np.concatenate([
+        np.ones((d.shape[0],), dtype=int) * i
+        for (i, d) in enumerate(datas)])
+    N = data.shape[0]
+    idx = np.arange(N)
+    np.random.shuffle(idx)
+    data = data[idx]
+    label = label[idx]
+    return data, label
+
+
+def data_test(input=3):
+    mean = (1, 2)
+    cov = np.array([[3, 0], [0, 2]])
+    x = np.random.multivariate_normal(mean, cov, (800,))
+
+    mean = (26, 28)
+    cov = np.array([[2, 0], [0, 2]])
+    y = np.random.multivariate_normal(mean, cov, (200,))
+
+    mean = (-25, -23)
+    cov = np.array([[10, 5], [5, 10]])
+    z = np.random.multivariate_normal(mean, cov, (1000,))
+
+    mean = (11, 13)
+    cov = np.array([[4, 3], [3, 4]])
+    q = np.random.multivariate_normal(mean, cov, (1000,))
+    if input == 3:
+        data, label = shuffle(x, y, z)
+    elif input == 4:
+        data, label = shuffle(x, y, z, q)
+    else:
+        raise ValueError("only 3 and 4 are accepted")
+    num = len(data)
+    train_data, test_data = data[:int(0.8 * num)], data[int(0.8 * num):num]
+    train_label, test_label = label[:int(0.8 * num)], label[int(0.8 * num):num]
+    return train_data, test_data, train_label, test_label
+
+
+def pltshow(data, res, n_clu):
+    mark = [[np.random.randint(0, 256) for i in range(3)] for j in range(n_clu)]
+    marklist = ["#{:02x}{:02x}{:02x}".format(mark[i][0], mark[i][1], mark[i][2]) for i in range(n_clu)]
+    # marklist = ['c', 'b', 'g', 'r', 'm', 'y', 'k', 'w']
+    colorlist = [marklist[i] for i in res]
+    pyplot.scatter(data[:, 0], data[:, 1], c=colorlist)
+    pyplot.show()
+
+
+if __name__ == "__main__":
+    input_ = 4
+    final = data_test(input_)
+    train, test, train_label, test_label = final[0], final[1], final[2], final[3]
+    clu_test = ClusteringAlgorithm()
+    elbow_costs, best_num = clu_test.fit(train)
+    clu_res = clu_test.predict(test)
+
+    km = KMeans(input_)
+    km.fit(train)
+    res = km.predict(test)
+
+    pltshow(test, res, input_)
+    pltshow(test, clu_res, best_num)