diff --git a/assignment-3/submission/18307130213/README.md b/assignment-3/submission/18307130213/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7a8599b1ef2a76041d95ecaf1429158d0c9c1c56 --- /dev/null +++ b/assignment-3/submission/18307130213/README.md @@ -0,0 +1,156 @@ +# Assignment3实验报告 + +[toc] + + +## 模型实现 + +### 数据预处理 + +由于每个维度都有可能存在极端值,且极端值大小不同,因此在每一维度上均对训练数据使用类似于 `min-max` 初始化方法进行数值标准化,将数值投影到[-1, 1]。 + +尽管可能会出现测试数据无法使用同样的比例投影到[-1, 1]的情况,但总的来讲这是一个较为合理的标准化手段。 + + + +### K-均值(Kmeans) + +Kmeans方法是一个较为粗糙但效率较高的聚类方法,能在指定聚类数目时较快地完成聚类。 + +#### 原理 + +Kmeans方法会初始选取一些样本点作为聚类中心,并通过数轮迭代以下过程直至收敛: + +1. 计算每个点距离最近的聚类中心,归到对应类中。 +2. 计算每个类的中心点,作为新的聚类中心。 + +#### 收敛条件 + +假设聚类中心不再变化,则模型收敛。 + +#### 模型结构 + +在每个函数内部,声明有一些对应域下的函数,详见代码。 + + + +### 高斯混合模型(GMM) + +#### 原理 + +高斯混合模型假设需要进行聚类的点为多个高斯分布的样本点的并集。 + +因此,我们通过拟合这若干个高斯分布就能对点进行聚类。 + +为此,我们引入一个隐藏层,表示点属于哪一个高斯分布的概率。 + +这样,我们对这些高斯分布的参数和该隐藏层进行多轮迭代优化就能得到较好的聚类建模。 + +为了进行迭代优化,我们采用EM算法,E步在固定协方差和均值后,计算高维高斯概率值,并加和归一化获得更好的概率值;M步在给出后验估计后优化均值和协方差,从而优化模型整体的效果。 + +涉及到的数学公式如下: + +![nndlP270](./img/1.png) + + + +#### 问题 + +GMM对于初值较为敏感,对不同的初值的结果会有较大的差异,因此需要进行较多次初始选取或者使用其他初值选取方法才能得到更好的结果。 + +此外,EM算法本身可能对于某些初值较难达到收敛,需要对迭代轮数做出限制。 + + + +#### 初始值选取 + +前面提到,GMM算法对初始值非常敏感,尤其是在少样本的情况下,GMM算法会在自变量小范围震荡时,引发较大的似然值改变。实际代码实现时,若采用似然改变阈值作为收敛条件,会出现无法跑出结果的情况。 + +大体来说存在两种初始值选取方法: + +* 采用抽样法,对训练数据进行抽样,并采取平均结果作为初始值 +* 采用Kmeans预先训练,采用聚类中心作为高斯分布的均值 + +经过试验,多次抽样取最优和利用Kmeans的效果相差无几,但后者运行时间较低,更适合大规模推广。 + + + +#### 模型结构 + +在每个函数内部,声明有一些对应域下的函数,详见代码。 + + + +### ClusteringAlgorithm + +#### 实现原理 + +以Kmeans聚类方法为基础,clusteringAlgorithm主要完成的任务是筛选一个合适的聚类数K,并得到K对应的Kmeans和GMM模型中较好的一个。 + +#### K值选取方法实现 + +##### 经验法 + +经验法为K值的经验公式,一般为$\sqrt{n/2}$,在本轮实验中,该数值作为我们的简易聚类方法的上界。 + +##### 简易聚类方法 + +对于某个特定的K,我们将使用KMeans对数据进行学习,并将其结果的**轮廓系数**作为其评价指标。 + +若**轮廓系数**连续t次都更差,我们认为目前的最优点即为全局最优点,t为给定的超参。 + +![Silhouette Coefficient](./img/2.png) + +**轮廓系数**的取值范围为[-1, 1],值越大效果越好,值越小效果越差。 + + + +## 实验部分 + +### 可视化及代码使用方法 + +为了便于可视化,我们使用二维高斯分布作为数据生成源,可指定分布数N进行实验。 + +以 `N=3` 为例: + 
+```bash +python source.py g 3 # 生成数据集 + +python source.py d 3 # 生成数据集的可视化结果(保存在img文件夹下) + +python source.py p KMeans 3 # 指定 k=3 使用上述命令生成的数据集进行KMeans的训练和结果可视化 + +python source.py p GaussianMixture 3 # 指定 k=3 使用上述命令生成的数据集进行GMM的训练和结果可视化 + +python source.py p # 不指定 k 使用上述命令生成的数据集进行简易聚类算法的训练和结果可视化 + +``` + +### 效果图 + +运行python source.py g 5命令后运行python source.py d 5命令,得到训练和测试数据的可视化结果。 + +训练数据: + +![train](./img/train.png) + +测试数据: + +![test](./img/test.png) + +KMeans结果: + +![KMeans](./img/KMeans.png) + +GMM结果: + +![GMM](./img/GaussianMixture.png) + +简易聚类算法结果: + +![CA](./img/ClusteringAlgorithm.png) + +聚类算法指标变动展示: + +![SC](./img/k-Silhouette%20Coefficient%20graph.png) + diff --git a/assignment-3/submission/18307130213/img/1.png b/assignment-3/submission/18307130213/img/1.png new file mode 100644 index 0000000000000000000000000000000000000000..8d3960e9b5868c3c63dc607f0bace434ac23626e Binary files /dev/null and b/assignment-3/submission/18307130213/img/1.png differ diff --git a/assignment-3/submission/18307130213/img/2.png b/assignment-3/submission/18307130213/img/2.png new file mode 100644 index 0000000000000000000000000000000000000000..fcf7dc22cdba3d2520e86885c63108953fa3d099 Binary files /dev/null and b/assignment-3/submission/18307130213/img/2.png differ diff --git a/assignment-3/submission/18307130213/img/ClusteringAlgorithm.png b/assignment-3/submission/18307130213/img/ClusteringAlgorithm.png new file mode 100644 index 0000000000000000000000000000000000000000..b617084fbe797636194113045110abfbf43a7799 Binary files /dev/null and b/assignment-3/submission/18307130213/img/ClusteringAlgorithm.png differ diff --git a/assignment-3/submission/18307130213/img/GaussianMixture.png b/assignment-3/submission/18307130213/img/GaussianMixture.png new file mode 100644 index 0000000000000000000000000000000000000000..bdedec731e133e1b2af133e1123325cc237cf63a Binary files /dev/null and b/assignment-3/submission/18307130213/img/GaussianMixture.png differ diff --git 
a/assignment-3/submission/18307130213/img/KMeans.png b/assignment-3/submission/18307130213/img/KMeans.png
new file mode 100644
index 0000000000000000000000000000000000000000..7917666136e535dc6ef9d6792145cf4dddbd12d4
Binary files /dev/null and b/assignment-3/submission/18307130213/img/KMeans.png differ
diff --git a/assignment-3/submission/18307130213/img/k-Silhouette Coefficient graph.png b/assignment-3/submission/18307130213/img/k-Silhouette Coefficient graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..73151eafbdd961edfca186316a37bf4621b73a8d
Binary files /dev/null and b/assignment-3/submission/18307130213/img/k-Silhouette Coefficient graph.png differ
diff --git a/assignment-3/submission/18307130213/img/test.png b/assignment-3/submission/18307130213/img/test.png
new file mode 100644
index 0000000000000000000000000000000000000000..02622a06d5b4b7580678a2759baf0d91099fe709
Binary files /dev/null and b/assignment-3/submission/18307130213/img/test.png differ
diff --git a/assignment-3/submission/18307130213/img/train.png b/assignment-3/submission/18307130213/img/train.png
new file mode 100644
index 0000000000000000000000000000000000000000..3ee29add0d4979d50f6e90788500e1957b9a18ae
Binary files /dev/null and b/assignment-3/submission/18307130213/img/train.png differ
diff --git a/assignment-3/submission/18307130213/source.py b/assignment-3/submission/18307130213/source.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7d1473e4a97ad97749f1a2d17e7a8228371da76
--- /dev/null
+++ b/assignment-3/submission/18307130213/source.py
@@ -0,0 +1,378 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import random
+import sys
+import math
+
+class KMeans:
+
+    def __init__(self, n_clusters):
+        self.k = n_clusters
+        self.kernels = None
+        self.norm = None  # per-dimension scale learned in fit(); reused in predict()
+
+    def fit(self, train_data):
+        def normalization(data):  # scale each dimension into [-1, 1]; keep the scale for predict
+            minabs = np.abs(data.min(axis=0))
+            maxabs = np.abs(data.max(axis=0))
+            myabs = np.where(minabs > maxabs, minabs, maxabs)
+            self.norm = myabs
+            mydata = data/myabs
+            return mydata
+
+        def trainOnce(data):
+            def genNewKernels(data, kernels):  # one Lloyd iteration: assign points, recompute centers
+                clusters = [[] for _ in range(self.k)]
+                n = data.shape[0]
+                for i in range(n):
+                    distances = np.sum((data[i] - kernels)**2, axis=1)
+                    clusters[np.argmin(distances)].append(i)
+
+                newKernels = []
+                for i in range(self.k):
+                    kern = np.average(data[clusters[i]], axis=0)
+                    newKernels.append(kern)
+                return np.array(newKernels)
+
+            np.random.shuffle(data)
+            kernels = data[0:self.k]
+            for i in range(100000):
+                newKernels = genNewKernels(data, kernels)
+                if (kernels == newKernels).all():
+                    break
+                kernels = newKernels
+            else:  # loop exhausted without the centers converging
+                print('Too many iterations.')
+                exit(0)
+            return kernels
+
+        def calcScore(data, kernels):  # sum of euclidean distances to the nearest center (lower is better)
+            score = 0.0
+            n = data.shape[0]
+            for i in range(n):
+                distances = np.sum((data[i] - kernels) ** 2, axis=1)**0.5
+                score += distances.min()
+            return score
+
+        data = train_data.copy()
+        data = normalization(data)
+        if data.shape[0] <= self.k:
+            self.kernels = data
+            return
+
+        bestKernels = trainOnce(data)
+        bestScore = calcScore(data, bestKernels)
+        for i in range(10):
+            newKernels = trainOnce(data)
+            newScore = calcScore(data, newKernels)
+            if newScore < bestScore:
+                bestKernels = newKernels  # fixed: was 'bestKernel' (typo), so every restart was discarded
+                bestScore = newScore
+
+        self.kernels = bestKernels
+
+    def predict(self, test_data):
+        def normalization(data):  # reuse the scale saved during fit
+            mydata = data/self.norm
+            return mydata
+
+        data = normalization(test_data)
+        ans = []
+        for i in range(data.shape[0]):
+            distances = np.sum((self.kernels - data[i]) ** 2, axis=1)
+            ans.append(np.argmin(distances))
+
+        return np.array(ans)
+
+class GaussianMixture:
+
+    def __init__(self, n_clusters):
+        self.k = n_clusters
+        self.mius = None  # component means
+        self.sigs = None  # component covariance matrices
+        self.ps = None    # component mixing weights
+        self.norm = None  # per-dimension scale learned in fit(); reused in predict()
+
+    def fit(self, train_data):
+        def normalization(data):  # scale each dimension into [-1, 1]; keep the scale for predict
+            minabs = np.abs(data.min(axis=0))
+            maxabs = np.abs(data.max(axis=0))
+            myabs = np.where(minabs > maxabs, minabs, maxabs)
+            self.norm = myabs
+            mydata = data/myabs
+            return mydata
+
+        def calcGaussP(x, miu, sig):  # multivariate gaussian density at x
+            dim = x.shape[0]
+            diff = x - miu
+            numerator = np.exp(-0.5 * diff @ np.linalg.inv(sig) @ diff)
+            denominator = ((2.0*np.pi)**dim * np.linalg.det(sig))**0.5
+            return numerator / (denominator + np.exp(-213))
+
+        def calcEachGauss(xs, mius, sigs, ps):  # E step: normalized responsibilities, shape (n, k)
+            n = xs.shape[0]
+            res = np.zeros((n, self.k))
+            for i in range(n):
+                total = 0.0
+                for k in range(self.k):
+                    res[i][k] = ps[k] * calcGaussP(xs[i], mius[k], sigs[k])
+                    total += res[i][k]
+                for k in range(self.k):
+                    res[i][k] /= total
+            return res
+
+        def iteration(xs):  # one EM step; writes the new parameters back to self
+            mius = self.mius
+            sigs = self.sigs
+            ps = self.ps
+            n = xs.shape[0]
+            dim = xs.shape[1]
+            gama = calcEachGauss(xs, mius, sigs, ps)
+            nks = np.sum(gama, axis=0)
+            nps = nks / n
+            nmius = np.matmul(gama.T, xs) / nks[:, np.newaxis]
+
+            nsigma = np.zeros((self.k, dim, dim))
+            for k in range(self.k):
+                tmp = np.zeros((dim, dim))
+                for i in range(n):
+                    t = xs[i] - nmius[k]
+                    tmp += gama[i][k]*np.outer(t, t)  # fixed: weight each point by its responsibility (was nks[k])
+                nsigma[k] = tmp/nks[k]
+
+            # fixed: the M-step results were computed but never stored, so the model never learned
+            self.ps = nps
+            self.mius = nmius
+            self.sigs = nsigma
+
+        data = train_data.copy()
+        data = normalization(data)
+        n = data.shape[0]
+        dim = data.shape[1]
+        indexs = np.arange(n)
+        np.random.shuffle(indexs)
+        self.mius = data[indexs[:self.k]]
+        self.sigs = np.full((self.k, dim, dim), fill_value=np.diag(np.full(dim, 0.1)))
+        self.ps = np.ones(self.k) / self.k
+        for iter in range(64):
+            iteration(data)
+
+    def predict(self, test_data):
+        def normalization(data):  # reuse the scale saved during fit
+            mydata = data/self.norm
+            return mydata
+
+        def calcGaussP(x, miu, sig):  # multivariate gaussian density at x
+            dim = x.shape[0]
+            diff = x - miu
+            numerator = np.exp(-0.5 * diff @ np.linalg.inv(sig) @ diff)
+            denominator = ((2.0*np.pi)**dim * np.linalg.det(sig))**0.5
+            return numerator / (denominator + np.exp(-213))
+
+        def calcEachGauss(xs, mius, sigs, ps):  # normalized responsibilities, shape (n, k)
+            n = xs.shape[0]
+            res = np.zeros((n, self.k))
+            for i in range(n):
+                total = 0.0
+                for k in range(self.k):
+                    res[i][k] = ps[k] * calcGaussP(xs[i], mius[k], sigs[k])
+                    total += res[i][k]
+                for k in range(self.k):
+                    res[i][k] /= total
+            return res
+
+        data = test_data.copy()
+        data = normalization(data)
+        gama = calcEachGauss(data, self.mius, self.sigs, self.ps)
+        pred = np.argmax(gama, axis=1)
+        return pred
+
+class ClusteringAlgorithm:
+
+    def __init__(self):
+        self.bestK = None
+        self.bestModel = None
+
+    def fit(self, train_data):
+        def prepDist(data):  # pairwise euclidean distance matrix
+            n = data.shape[0]
+            dist = np.zeros((n, n))
+            for i in range(n):
+                dist[i] = (np.sum((data[i] - data)**2, axis=1))**0.5
+            return dist
+
+        def calcDist(i, idx, dist):  # mean distance from point i to the cluster indexed by idx
+            if idx[0].shape[0] <= 1:
+                return 0
+            avg = np.sum(dist[i, idx]) / (idx[0].shape[0] - 1)
+            return avg
+
+        def calcSilhouetteCoefficient(n, labels, k, dist):  # mean silhouette over all points, in [-1, 1]
+            idxs = []
+            for t in range(k):
+                idxs.append(np.where(labels == t))
+
+            S = np.zeros((n, ))
+            for i in range(n):
+                ai = calcDist(i, idxs[labels[i]], dist)
+                bi = np.inf
+                for t in range(k):
+                    if t != labels[i]:
+                        bi = min(bi, calcDist(i, idxs[t], dist))
+                S[i] = (bi-ai)/max(bi, ai)
+            return np.mean(S)
+
+        scores = []
+
+        data = train_data.copy()
+        n = data.shape[0]
+        dist = prepDist(data)
+        bestK = 2
+        model = KMeans(2)
+        model.fit(data)
+        labels = model.predict(data)
+        bestModel = model
+        bestScore = calcSilhouetteCoefficient(n, labels, 2, dist)
+        scores.append(bestScore)
+        cnt = 5
+        maxk = int((0.5*n)**0.5)  # rule-of-thumb upper bound sqrt(n/2)
+        for i in range(3, maxk):
+            model = KMeans(i)
+            model.fit(data)
+            labels = model.predict(data)
+            score = calcSilhouetteCoefficient(n, labels, i, dist)
+            scores.append(score)
+            if score > bestScore:
+                bestK = i
+                bestScore = score
+                cnt = 5
+            else:
+                cnt -= 1
+                if cnt == 0:
+                    break
+
+        kmeans = KMeans(bestK)
+        kmeans.fit(data)
+        kmeansLabels = kmeans.predict(data)
+        kmeansScore = calcSilhouetteCoefficient(n, kmeansLabels, bestK, dist)
+
+        gmm = GaussianMixture(bestK)
+        gmm.fit(data)
+        gmmLabels = gmm.predict(data)
+        gmmScore = calcSilhouetteCoefficient(n, gmmLabels, bestK, dist)  # fixed: scored kmeansLabels before, so GMM could never win
+        if kmeansScore > gmmScore:
+            self.bestK = bestK
+            self.bestModel = kmeans
+        else:
+            self.bestK = bestK
+            self.bestModel = gmm
+
+        return scores
+
+    def predict(self, test_data):
+        return self.bestK, self.bestModel.predict(test_data)
+
+if __name__ == '__main__':
+    def generate(n):
+        import warnings
+        warnings.filterwarnings('ignore')  # np.save of nested tuples triggers a ragged-array warning
+        if n <= 0:
+            print('error: n <= 0')
+            return
+        r = n/max(1, math.log(n, 2))
+        sizs = []
+        xs = []
+        for i in range(n):
+            theta = i*(2*math.pi/n)
+            mean = (r*math.cos(theta), r*math.sin(theta))
+            rand_mat = np.random.rand(2, 2)
+            cov = np.matmul(rand_mat.transpose(), rand_mat)  # fixed: elementwise product is not guaranteed PSD; A^T A is
+            siz = random.randint(100, 200)
+            sizs.append(siz)
+            x = np.random.multivariate_normal(mean, cov, (siz, ))
+            xs.append(x)
+        siz = sum(sizs)
+        idx = np.arange(siz)
+        np.random.shuffle(idx)
+        data = np.concatenate(xs)
+        label = np.concatenate([np.ones((sizs[j], ), dtype=int)*j for j in range(n)])
+        data = data[idx]
+        label = label[idx]
+
+        t = 3
+        train_data, test_data = data[:(siz//t)*(t-1),], data[(siz//t)*(t-1):,]
+        train_label, test_label = label[:(siz//t)*(t-1),], label[(siz//t)*(t-1):,]
+
+        np.save("data.npy", (
+            (train_data, train_label), (test_data, test_label)
+        ))
+
+    def read():
+        (train_data, train_label), (test_data, test_label) = np.load("data.npy", allow_pickle=True)
+        return (train_data, train_label), (test_data, test_label)
+
+    def genimg(n, data, label, name):
+        datas = [[] for i in range(n)]
+        for i in range(len(data)):
+            datas[label[i]].append(data[i])
+
+        for each in datas:
+            each = np.array(each)
+            plt.scatter(each[:, 0], each[:, 1])
+        plt.savefig(f'img/{name}')
+        plt.close()
+
+    if len(sys.argv) > 1 and sys.argv[1] == 'g':
+        try:
+            n = int(sys.argv[2])
+            generate(n)
+        except:
+            print('error: wrong n')
+    elif len(sys.argv) > 1 and sys.argv[1] == 'd':
+        (train_data, train_label), (test_data, test_label) = read()
+        try:
+            n = int(sys.argv[2])
+            genimg(n, train_data, train_label, 'train')
+            genimg(n, test_data, test_label, 'test')
+        except:
+            print('somthing goes wrong!')
+    elif len(sys.argv) > 2 and sys.argv[1] == 'p' and sys.argv[2] == 'KMeans':
+        (train_data, train_label), (test_data, test_label) = read()
+        try:
+            n = int(sys.argv[3])
+            model = KMeans(n)
+            model.fit(train_data)
+            res = model.predict(test_data)
+            genimg(n, test_data, res, 'KMeans')
+        except:
+            print('somthing goes wrong!')
+    elif len(sys.argv) > 2 and sys.argv[1] == 'p' and sys.argv[2] == 'GaussianMixture':
+        (train_data, train_label), (test_data, test_label) = read()
+        try:
+            n = int(sys.argv[3])
+            model = GaussianMixture(n)
+            model.fit(train_data)
+            res = model.predict(test_data)
+            genimg(n, test_data, res, 'GaussianMixture')
+        except:
+            print('somthing goes wrong!')
+    elif len(sys.argv) > 0 and sys.argv[1] == 'p':
+        (train_data, train_label), (test_data, test_label) = read()
+        try:
+            model = ClusteringAlgorithm()
+            scores = model.fit(train_data)
+
+            nk = [x+2 for x in range(len(scores))]
+            l, = plt.plot(nk, scores, label='score')
+            plt.xlabel('n_clusters')
+            plt.ylabel('Silhouette Coefficient by KMeans')
+            plt.title('k-Silhouette Coefficient graph')
+            plt.legend()
+            plt.savefig('img/k-Silhouette Coefficient graph')
+            plt.close()
+
+            n, res = model.predict(test_data)
+            genimg(n, test_data, res, 'ClusteringAlgorithm')
+        except:
+            print('somthing goes wrong!')
+    else:
+        print('Test OK')
\ No newline at end of file
diff --git a/assignment-3/submission/18307130213/tester_demo.py b/assignment-3/submission/18307130213/tester_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..19ec0e8091691d4aaaa6b53dbb695fde9e826d89
--- /dev/null
+++ b/assignment-3/submission/18307130213/tester_demo.py
@@ -0,0 +1,117 @@
+import numpy as np
+import sys
+
+from 
source import KMeans, GaussianMixture
+
+
+def shuffle(*datas):
+    data = np.concatenate(datas)
+    label = np.concatenate([
+        np.ones((d.shape[0],), dtype=int)*i
+        for (i, d) in enumerate(datas)
+    ])
+    N = data.shape[0]
+    idx = np.arange(N)
+    np.random.shuffle(idx)
+    data = data[idx]
+    label = label[idx]
+    return data, label
+
+
+def data_1():
+    mean = (1, 2)
+    cov = np.array([[73, 0], [0, 22]])
+    x = np.random.multivariate_normal(mean, cov, (800,))
+
+    mean = (16, -5)
+    cov = np.array([[21.2, 0], [0, 32.1]])
+    y = np.random.multivariate_normal(mean, cov, (200,))
+
+    mean = (10, 22)
+    cov = np.array([[10, 5], [5, 10]])
+    z = np.random.multivariate_normal(mean, cov, (1000,))
+
+    data, _ = shuffle(x, y, z)
+    return (data, data), 3
+
+
+def data_2():
+    train_data = np.array([
+        [23, 12, 173, 2134],
+        [99, -12, -126, -31],
+        [55, -145, -123, -342],
+    ])
+    return (train_data, train_data), 2
+
+
+def data_3():
+    train_data = np.array([
+        [23],
+        [-2999],
+        [-2955],
+    ])
+    return (train_data, train_data), 2
+
+
+def test_with_n_clusters(data_function, algorithm_class):  # fixed: was 'data_fuction' (typo)
+    (train_data, test_data), n_clusters = data_function()
+    model = algorithm_class(n_clusters)
+    model.fit(train_data)
+    res = model.predict(test_data)
+    assert len(
+        res.shape) == 1 and res.shape[0] == test_data.shape[0], "shape of result is wrong"
+    return res
+
+
+def testcase_1_1():
+    test_with_n_clusters(data_1, KMeans)
+    return True
+
+
+def testcase_1_2():
+    res = test_with_n_clusters(data_2, KMeans)
+    return res[0] != res[1] and res[1] == res[2]
+
+
+def testcase_2_1():
+    test_with_n_clusters(data_1, GaussianMixture)
+    return True
+
+
+def testcase_2_2():
+    res = test_with_n_clusters(data_3, GaussianMixture)
+    return res[0] != res[1] and res[1] == res[2]
+
+
+def test_all(err_report=False):
+    testcases = [
+        ["KMeans-1", testcase_1_1, 4],
+        ["KMeans-2", testcase_1_2, 4],
+        # ["KMeans-3", testcase_1_3, 4],
+        # ["KMeans-4", testcase_1_4, 4],
+        # ["KMeans-5", testcase_1_5, 4],
+        ["GMM-1", testcase_2_1, 4],
+        ["GMM-2", testcase_2_2, 4],
+        # ["GMM-3", testcase_2_3, 4],
+        # ["GMM-4", testcase_2_4, 4],
+        # ["GMM-5", testcase_2_5, 4],
+    ]
+    sum_score = sum([case[2] for case in testcases])
+    score = 0
+    for case in testcases:
+        try:
+            res = case[2] if case[1]() else 0
+        except Exception as e:
+            if err_report:
+                print("Error [{}] occurs in {}".format(str(e), case[0]))
+            res = 0
+        score += res
+        print("+ {:14} {}/{}".format(case[0], res, case[2]))
+    print("{:16} {}/{}".format("FINAL SCORE", score, sum_score))
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1 and sys.argv[1] == "--report":
+        test_all(True)
+    else:
+        test_all()