diff --git a/assignment-3/submission/17307100038/README.md b/assignment-3/submission/17307100038/README.md new file mode 100644 index 0000000000000000000000000000000000000000..73518100ac4d0395ff0df12978f238fb959d1718 --- /dev/null +++ b/assignment-3/submission/17307100038/README.md @@ -0,0 +1,161 @@ +# Assignment 3 + +## 1.KMeans + +KMeans算法首先随机初始化k个中心;再将每个点按照与各中心的距离划分到最近的类别,然后对每个类重新计算中心。循环上述过程直至收敛。 + +本实验中的KMeans类包含以下函数: + +```markdown +model.fit(train_data) #主程序,对各中心初始化,循环训练模型 + +model.predict(test_data) # 返回一个数组,表示各个数据点的分类 + +model.initcenters(k,dataSet) #初始化得到各个中心 +``` + +以tester_demo中的data1为例,训练结果如下图: + +![kmean_1](https://gitee.com/xyhdsg10022/prml-21-spring/raw/master/assignment-3/submission/17307100038/img/kmean_1.jpg) + +更改数据集,对应data2的均值、协方差如下: +$$ +\mu_1 = + \left[ + \begin{matrix} + 4 & 4 + \end{matrix} + \right] + \mu_2 = + \left[ + \begin{matrix} + -1 & 11 + \end{matrix} + \right] + \mu_3 = + \left[ + \begin{matrix} + -8 & 10 + \end{matrix} + \right] + +$$ + +$$ +\Sigma_1 = + \left[ + \begin{matrix} + 9 & 0 \\ + 0 & 9 + \end{matrix} + \right] + \Sigma_2 = + \left[ + \begin{matrix} + 1.5 & 0 \\ + 0 & 1.5 + \end{matrix} + \right] + \Sigma_3 = + \left[ + \begin{matrix} + 1.5 & 0 \\ + 0 & 1.5 + \end{matrix} + \right] +$$ + +对应结果如图: + +![kmean_2](https://gitee.com/xyhdsg10022/prml-21-spring/raw/master/assignment-3/submission/17307100038/img/kmean_2.jpg) + +更改数据集,对应data3的均值、协方差如下: +$$ +\mu_1 = + \left[ + \begin{matrix} + 5 & 10 + \end{matrix} + \right] + \mu_2 = + \left[ + \begin{matrix} + -5 & -2 + \end{matrix} + \right] + \mu_3 = + \left[ + \begin{matrix} + 10 & -5 + \end{matrix} + \right] +$$ + +$$ +\Sigma_1 = + \left[ + \begin{matrix} + 9 & 6 \\ + 8 & 5 + \end{matrix} + \right] + \Sigma_2 = + \left[ + \begin{matrix} + 9 & 9 \\ + 12 & 5 + \end{matrix} + \right] + \Sigma_3 = + \left[ + \begin{matrix} + 9 & 9 \\ + 16 & 25 + \end{matrix} + \right] +$$ + +对应的结果: + +![kmnea_3](https://gitee.com/xyhdsg10022/prml-21-spring/raw/master/assignment-3/submission/17307100038/img/kmnea_3.png) + +## 2.GMM + 
+GMM模型使用EM算法求解。 + +E步更新后验概率gamma,M步更新均值、协方差与pi。本次实验中对应的函数如下: + +```markdown +model.fit(train_data) #主程序,对各中心初始化,循环训练模型 + +model.predict(test_data) # 返回一个数组,表示各个数据点的分类 + +multivariate_normal(train_data,mean_vector, covariance_matrix) #计算多元正态分布的概率密度 + +normal(train_data, mean, covariance) #计算一元正态分布的概率密度 +``` + +## 3.ClusteringAlgorithm + +本类对KMeans实现k值的自动选取。选取的方式如下: + +1. 对取值范围内的每个k值: + + 1.1 分别训练KMeans模型, + + 1.2 对每个KMeans模型中的点,计算其与所属中心的距离之和inertia + +2. 绘制k - inertia曲线图,以elbow规则选取最优值k,自动化选取方法为: + + 2.1 选取第一个点与最后一个点连线为基准线 + + 2.2 计算其余每个点与基准线构成三角形面积 + + 2.3 选取面积最大对应的k值,即elbow的拐点 + +以data1为数据集,模型迭代次数500次,k值范围在2-5对应的k-inertia值如下图 + +![1](https://gitee.com/xyhdsg10022/prml-21-spring/raw/master/assignment-3/submission/17307100038/img/1.jpg) + +对应的模型自动选择k值为3,与人工判别相同。 + diff --git a/assignment-3/submission/17307100038/img/1.jpg b/assignment-3/submission/17307100038/img/1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9abd4a3f1de8c1079e4d94e927f2f84d76d7037e Binary files /dev/null and b/assignment-3/submission/17307100038/img/1.jpg differ diff --git a/assignment-3/submission/17307100038/img/kmean_1.jpg b/assignment-3/submission/17307100038/img/kmean_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5beaa7620e7aee58089b15e694b47333204592d8 Binary files /dev/null and b/assignment-3/submission/17307100038/img/kmean_1.jpg differ diff --git a/assignment-3/submission/17307100038/img/kmean_2.jpg b/assignment-3/submission/17307100038/img/kmean_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..46ea8aba4c221388d66fe90e7770cf9f483db0d9 Binary files /dev/null and b/assignment-3/submission/17307100038/img/kmean_2.jpg differ diff --git a/assignment-3/submission/17307100038/img/kmnea_3.png b/assignment-3/submission/17307100038/img/kmnea_3.png new file mode 100644 index 0000000000000000000000000000000000000000..8d0e3c4f76bb708e3c23c90a3ab5c58e35382f72 Binary files /dev/null and b/assignment-3/submission/17307100038/img/kmnea_3.png 
import numpy as np


class KMeans:
    """K-means clustering (Lloyd's algorithm) with squared Euclidean distance.

    Attributes:
        n_clusters: Number of clusters to fit.
        max_iter: Upper bound on the number of assignment/update rounds.
        labels: Cluster index of every training sample from the last
            assignment step of ``fit`` (``None`` before fitting).
        centers: Array of shape ``(n_clusters, ...)`` with the cluster
            centers (``None`` before fitting).
    """

    def __init__(self, n_clusters, max_iter=500):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.labels = None
        self.centers = None

    def initcenters(self, k, dataSet):
        """Pick ``k`` distinct rows of ``dataSet`` as the initial centers.

        Args:
            k: Number of centers to draw.
            dataSet: Array whose first axis enumerates the samples.

        Returns:
            Float array of shape ``(k, ...)`` holding the chosen rows.

        Raises:
            ValueError: If ``k`` is not in ``1..len(dataSet)``.
        """
        dataSet = np.asarray(dataSet, dtype=float)
        m = dataSet.shape[0]
        if not 0 < k <= m:
            raise ValueError(
                "cannot draw {} distinct centers from {} samples".format(k, m))
        # Sampling row indices without replacement guarantees that no two
        # centers start on the same row.
        chosen = np.random.choice(m, size=k, replace=False)
        return dataSet[chosen]

    def euc(self, vec1, vec2):
        """Return the squared Euclidean distance between two vectors."""
        return np.sum(pow(vec1 - vec2, 2))

    def _sq_distances(self, data):
        """Return the ``(n_samples, n_clusters)`` squared-distance matrix."""
        centers = self.centers.reshape(self.centers.shape[0], -1)
        points = np.asarray(data, dtype=float)
        points = points.reshape(points.shape[0], centers.shape[1])
        diff = points[:, np.newaxis, :] - centers[np.newaxis, :, :]
        return np.einsum("ijk,ijk->ij", diff, diff)

    def fit(self, train_data):
        """Fit the cluster centers to ``train_data``.

        Iterates assignment and center update until the assignment stops
        changing or ``max_iter`` rounds have run.

        Args:
            train_data: Array whose first axis enumerates the samples.

        Raises:
            ValueError: If there are fewer samples than clusters.
        """
        data = np.asarray(train_data, dtype=float)
        self.centers = self.initcenters(self.n_clusters, data)
        labels = None
        for _ in range(self.max_iter):
            # argmin returns the first minimum, so ties go to the lowest index.
            new_labels = np.argmin(self._sq_distances(data), axis=1)
            if labels is not None and np.array_equal(new_labels, labels):
                # Same assignment => same centers => nothing left to update.
                break
            labels = new_labels
            for j in range(self.n_clusters):
                members = data[labels == j]
                # An empty cluster keeps its previous center instead of
                # turning into NaN.
                if len(members):
                    self.centers[j] = members.mean(axis=0)
        self.labels = labels

    def predict(self, test_data):
        """Return the index of the nearest center for every sample.

        Args:
            test_data: Array whose first axis enumerates the samples.

        Returns:
            Integer array of shape ``(n_samples,)``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.centers is None:
            raise RuntimeError("KMeans.predict called before fit")
        return np.argmin(self._sq_distances(test_data), axis=1)


class GaussianMixture:
    """Gaussian mixture model with full covariances, fitted by EM.

    Attributes:
        n_componets: Number of mixture components (historical spelling, kept
            so existing readers of the attribute keep working).
        n_components: Same value under the correct spelling.
        max_iter: Upper bound on the number of EM rounds.
        tol: Relative log-likelihood change below which EM stops.
        reg_covar: Ridge added to every covariance diagonal to keep it
            positive definite.
        pi: Mixing weights, one per component.
        tail: Small constant used to detect components without support.
        mean_vector: List of per-component mean vectors (after ``fit``).
        covariance_matrixes: List of per-component covariance matrices.
        gamma: ``(n_samples, n_components)`` responsibilities from the last
            E-step.
    """

    def __init__(self, n_components, max_iter=300, tol=1e-6, reg_covar=1e-6):
        self.n_componets = n_components
        self.n_components = n_components
        self.max_iter = max_iter
        self.tol = tol
        self.reg_covar = reg_covar
        self.pi = [1 / n_components for _ in range(n_components)]
        self.tail = 1e-10
        self.mean_vector = None
        self.covariance_matrixes = None
        self.gamma = None

    @staticmethod
    def _as_samples(data):
        """Return ``data`` as a float ``(n_samples, n_features)`` array."""
        samples = np.asarray(data, dtype=float)
        if samples.ndim != 2:
            raise ValueError(
                "expected a 2-D array of shape (n_samples, n_features)")
        return samples

    @staticmethod
    def _log_density(points, mean_vector, covariance_matrix):
        """Return the Gaussian log-density of every row of ``points``."""
        cov = np.atleast_2d(np.asarray(covariance_matrix, dtype=float))
        dim = cov.shape[0]
        sign, logdet = np.linalg.slogdet(cov)
        if sign <= 0:
            raise np.linalg.LinAlgError(
                "covariance_matrix must be positive definite")
        diff = points - np.asarray(mean_vector, dtype=float).reshape(1, dim)
        # Solving the linear system avoids forming the explicit inverse.
        solved = np.linalg.solve(cov, diff.T).T
        maha = np.einsum("ij,ij->i", diff, solved)
        return -0.5 * (dim * np.log(2 * np.pi) + logdet + maha)

    def multivariate_normal(self, train_data, mean_vector, covariance_matrix):
        """Evaluate the multivariate normal density.

        Args:
            train_data: A single point of shape ``(d,)`` or a batch of shape
                ``(n, d)``.
            mean_vector: Mean of shape ``(d,)``.
            covariance_matrix: Positive-definite ``(d, d)`` covariance; a
                scalar is accepted when ``d == 1``.

        Returns:
            A scalar for a single point, otherwise an array of shape ``(n,)``.

        Raises:
            numpy.linalg.LinAlgError: If the covariance is not positive
                definite.
        """
        x = np.asarray(train_data, dtype=float)
        cov = np.atleast_2d(np.asarray(covariance_matrix, dtype=float))
        dim = cov.shape[0]
        single = x.ndim <= 1
        points = x.reshape(1, dim) if single else x.reshape(x.shape[0], dim)
        density = np.exp(self._log_density(points, mean_vector, cov))
        return density[0] if single else density

    def normal(self, train_data, mean, covariance):
        """Evaluate the univariate normal density.

        NOTE(review): despite its name, ``covariance`` is used as the
        standard deviation (it multiplies sqrt(2*pi) and is squared in the
        exponent). That behavior is preserved; confirm with callers.
        """
        scale = 1 / ((2 * np.pi) ** 0.5 * covariance + 1e-10)
        exponent = -(train_data - mean) ** 2 / (2 * covariance ** 2 + 1e-10)
        return scale * np.exp(exponent)

    def _log_joint(self, samples):
        """Return ``log(pi_k) + log N(x_n | mu_k, Sigma_k)`` as ``(n, k)``."""
        tiny = np.finfo(float).tiny
        columns = []
        for k in range(self.n_componets):
            log_weight = np.log(max(self.pi[k], tiny))
            columns.append(log_weight + self._log_density(
                samples, self.mean_vector[k], self.covariance_matrixes[k]))
        return np.column_stack(columns)

    @staticmethod
    def _logsumexp(values):
        """Row-wise log-sum-exp of an ``(n, k)`` array."""
        peak = values.max(axis=1)
        return peak + np.log(np.exp(values - peak[:, np.newaxis]).sum(axis=1))

    def _m_step(self, samples):
        """Update weights, means and covariances from ``self.gamma``."""
        n_samples, dim = samples.shape
        support = self.gamma.sum(axis=0)
        ridge = self.reg_covar * np.eye(dim)
        for k in range(self.n_componets):
            if support[k] <= self.tail:
                # No sample supports this component: keep its parameters
                # rather than dividing by (almost) zero.
                continue
            resp = self.gamma[:, k]
            mean = np.dot(resp, samples) / support[k]
            diff = samples - mean
            # The weighted scatter is normalised exactly once by N_k.
            scatter = np.dot((resp[:, np.newaxis] * diff).T, diff)
            self.mean_vector[k] = mean
            self.covariance_matrixes[k] = scatter / support[k] + ridge
        self.pi = list(support / n_samples)

    def _initialise(self, samples):
        """Seed the parameters from a hard k-means partition."""
        n_samples, dim = samples.shape
        k = self.n_componets
        seeder = KMeans(k, max_iter=100)
        seeder.fit(samples)
        labels = seeder.predict(samples)
        ridge = self.reg_covar * np.eye(dim)
        # Fallback parameters for clusters that end up empty.
        pooled = np.atleast_2d(np.cov(samples.T, ddof=0)) + ridge
        self.mean_vector = [center.copy() for center in seeder.centers]
        self.covariance_matrixes = [pooled.copy() for _ in range(k)]
        self.gamma = np.zeros((n_samples, k))
        self.gamma[np.arange(n_samples), labels] = 1.0
        self._m_step(samples)

    def fit(self, train_data):
        """Fit the mixture to ``train_data`` with EM.

        Args:
            train_data: Array of shape ``(n_samples, n_features)``.

        Raises:
            ValueError: If the data is not 2-D or has fewer samples than
                components.
        """
        samples = self._as_samples(train_data)
        if samples.shape[0] < self.n_componets:
            raise ValueError("need at least as many samples as components")
        self._initialise(samples)

        previous = -np.inf
        for _ in range(self.max_iter):
            # E-step in log space so tiny densities do not underflow.
            log_joint = self._log_joint(samples)
            log_norm = self._logsumexp(log_joint)
            self.gamma = np.exp(log_joint - log_norm[:, np.newaxis])

            # M-step.
            self._m_step(samples)

            likelihood = log_norm.sum()
            if abs(likelihood - previous) <= self.tol * max(1.0, abs(likelihood)):
                break
            previous = likelihood

    def predict(self, test_data):
        """Return the most probable component for every sample.

        The decision uses the posterior, i.e. the densities weighted by the
        mixing coefficients.

        Args:
            test_data: Array of shape ``(n_samples, n_features)``.

        Returns:
            Integer array of shape ``(n_samples,)``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.mean_vector is None:
            raise RuntimeError("GaussianMixture.predict called before fit")
        samples = self._as_samples(test_data)
        return np.argmax(self._log_joint(samples), axis=1)


class ClusteringAlgorithm:
    """K-means with the number of clusters chosen by the elbow rule.

    Attributes:
        n_clusters: Selected number of clusters (2 until ``fit`` runs).
        model: The final fitted ``KMeans`` model (``None`` before ``fit``).
    """

    def __init__(self):
        self.n_clusters = 2
        self.model = None

    def line(self, pointA, pointB):
        """Return the Euclidean distance between two points."""
        delta = np.asarray(pointA, dtype=float) - np.asarray(pointB, dtype=float)
        return np.sqrt(np.sum(pow(delta, 2)))

    def area(self, lineA, lineB, lineC):
        """Return the area of a triangle from its side lengths (Heron)."""
        s = (lineA + lineB + lineC) / 2
        radicand = s * (s - lineA) * (s - lineB) * (s - lineC)
        # Rounding can push a degenerate triangle slightly below zero.
        return max(radicand, 0.0) ** 0.5

    def _inertia(self, model, data):
        """Sum of Euclidean distances from each point to its own center."""
        labels = np.asarray(model.predict(data)).astype(int)
        centers = model.centers.reshape(model.centers.shape[0], -1)
        points = data.reshape(data.shape[0], centers.shape[1])
        offsets = points - centers[labels]
        return float(np.sum(np.sqrt(np.sum(offsets ** 2, axis=1))))

    def _select_elbow(self, ks, inertias):
        """Return the k whose curve point is farthest from the end-to-end chord.

        The triangle spanned by a point and the two curve ends has an area
        proportional to the point's distance from the chord, so the largest
        area marks the elbow. With fewer than three candidates there is no
        interior point and the smallest k is returned.
        """
        if len(ks) < 3:
            return ks[0]
        first = np.array([ks[0], inertias[0]], dtype=float)
        last = np.array([ks[-1], inertias[-1]], dtype=float)
        chord = self.line(first, last)
        areas = []
        for k, inertia in zip(ks[1:-1], inertias[1:-1]):
            point = np.array([k, inertia], dtype=float)
            areas.append(self.area(self.line(first, point),
                                   self.line(last, point), chord))
        return ks[1 + int(np.argmax(areas))]

    def fit(self, train_data, high=5):
        """Choose k in ``2..high`` by the elbow rule and fit the final model.

        Args:
            train_data: Array whose first axis enumerates the samples.
            high: Largest k to try; capped at the number of samples.

        Raises:
            ValueError: If fewer than two clusters can be tried.
        """
        data = np.asarray(train_data, dtype=float)
        upper = min(int(high), data.shape[0])
        if upper < 2:
            raise ValueError("need high >= 2 and at least two samples")

        candidates = list(range(2, upper + 1))
        inertias = []
        for k in candidates:
            trial = KMeans(k, max_iter=100)
            trial.fit(data)
            inertias.append(self._inertia(trial, data))

        self.n_clusters = self._select_elbow(candidates, inertias)
        self.model = KMeans(self.n_clusters)
        self.model.fit(data)

    def predict(self, test_data):
        """Assign every sample to a cluster of the selected model."""
        return self.model.predict(test_data)
def testcase_2_1():
    """Smoke test: GaussianMixture fits data_1 and returns one label per row."""
    test_with_n_clusters(data_1, GaussianMixture)
    return True


def testcase_2_2():
    """Check that GaussianMixture separates the outlier in data_3."""
    labels = test_with_n_clusters(data_3, GaussianMixture)
    first, second, third = labels[0], labels[1], labels[2]
    return first != second and second == third


def test_all(err_report=False):
    """Run every enabled test case and print a per-case and total score.

    Args:
        err_report: When true, print the message of any exception raised by
            a test case. A failing or raising case scores zero either way.
    """
    # Each entry is [display name, callable, points]. Slots KMeans-3..5 and
    # GMM-3..5 are reserved by the course tester and not enabled here.
    testcases = [
        ["KMeans-1", testcase_1_1, 4],
        ["KMeans-2", testcase_1_2, 4],
        ["GMM-1", testcase_2_1, 4],
        ["GMM-2", testcase_2_2, 4],
    ]
    sum_score = sum(points for _, _, points in testcases)
    score = 0
    for name, run, points in testcases:
        try:
            res = points if run() else 0
        except Exception as e:
            # Grading boundary: one crashing case must not stop the others.
            if err_report:
                print("Error [{}] occurs in {}".format(str(e), name))
            res = 0
        score += res
        print("+ {:14} {}/{}".format(name, res, points))
    print("{:16} {}/{}".format("FINAL SCORE", score, sum_score))


if __name__ == "__main__":
    test_all(len(sys.argv) > 1 and sys.argv[1] == "--report")