diff --git a/assignment-1/submission/18307130213/source.py b/assignment-1/submission/18307130213/source.py index 7a53de852289de55e74fe9a70c7ac56fdb5372ec..92e5c6368d81682d71df858ea6e51040ffc16e5b 100644 --- a/assignment-1/submission/18307130213/source.py +++ b/assignment-1/submission/18307130213/source.py @@ -121,7 +121,7 @@ def genimg(n, data, label, name): for each in datas: each = np.array(each) plt.scatter(each[:, 0], each[:, 1]) - plt.savefig(f'img/{name}') + plt.savefig(f"./img/{name}") plt.close() # plt.show() diff --git a/assignment-3/submission/17307130285/README.md b/assignment-3/submission/17307130285/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c921c0f33a094436f1ae3e0eda01a2ff22df5c9f --- /dev/null +++ b/assignment-3/submission/17307130285/README.md @@ -0,0 +1,38 @@ +# 作业-3 聚类算法 + +### 基础的 K-Means模型可视化结果 + +使用k均值聚类算法迭代求解的聚类,其步骤是随机选取K个对象作为初始的聚类中心,然后计算每个对象与各个种子聚类中心之间的距离,把每个对象分配给距离它最近的聚类中心。 + +其具体步骤如下: + +1. 簇分配:采用随机分配。 +2. 更新聚类中心。 +3. 计算距离,更新簇分配。 + +![k-means-normal](img/k-means-normal.png) + +- loss square + +![losssquare_kmeans_outcome](img/losssquare_kmeans_outcome.png) + +### 基础GMM模型可视化结果 + +- GMM=Gaussian-Mixed-Model 即高斯混合模型,多个高斯分布模型的加权组合可以用来拟合任意类型的分布。 + +![lossgmm_05_balance_l_kmeans_outcome](img/gmm-normal.png) + +- loss + +![loss-gmm-normal](img/loss-gmm-normal.png) + +### 自动选择聚簇数量的实验可视化结果 + +- Elbow Method 配合 K-Means 算法 +- 随着聚类数k的增大,每个聚类覆盖样本空间范围变小,聚合程度会逐渐提高,SSE逐渐变小。并且,当k小于真实聚类数时,由于k的增大会大幅增加每个簇的聚合程度,故SSE的下降幅度会很大,而当k到达真实聚类数时,再增加k所得到的聚合程度回报会迅速变小,所以SSE的下降幅度会骤减,然后随着k值的继续增大而趋于平缓,也就是说SSE和k的关系图是一个elbow的形状,而这个肘部对应的k值就是数据的真实聚类数。 + + + +![kmeans-EM](img/kmeans-EM.png) + +![loss-EM](img/loss-EM.png) \ No newline at end of file diff --git a/assignment-3/submission/17307130285/img/gmm-k.png b/assignment-3/submission/17307130285/img/gmm-k.png new file mode 100644 index 0000000000000000000000000000000000000000..0974e58ab988888c86804b63baaa4a5785243fd3 Binary files /dev/null and b/assignment-3/submission/17307130285/img/gmm-k.png 
differ diff --git a/assignment-3/submission/17307130285/img/gmm-normal.png b/assignment-3/submission/17307130285/img/gmm-normal.png new file mode 100644 index 0000000000000000000000000000000000000000..db6756e63db0e8e267f92d097b726ecb290be926 Binary files /dev/null and b/assignment-3/submission/17307130285/img/gmm-normal.png differ diff --git a/assignment-3/submission/17307130285/img/k-means-normal.png b/assignment-3/submission/17307130285/img/k-means-normal.png new file mode 100644 index 0000000000000000000000000000000000000000..ceadc5a84dce24e3614cd8ae276419bd4107bd97 Binary files /dev/null and b/assignment-3/submission/17307130285/img/k-means-normal.png differ diff --git a/assignment-3/submission/17307130285/img/kmeans-EM.png b/assignment-3/submission/17307130285/img/kmeans-EM.png new file mode 100644 index 0000000000000000000000000000000000000000..11f00075fb54853f392e105916737e5c0e5015be Binary files /dev/null and b/assignment-3/submission/17307130285/img/kmeans-EM.png differ diff --git a/assignment-3/submission/17307130285/img/loss-EM.png b/assignment-3/submission/17307130285/img/loss-EM.png new file mode 100644 index 0000000000000000000000000000000000000000..16258df6be63a37b319176a5c633988cc728b969 Binary files /dev/null and b/assignment-3/submission/17307130285/img/loss-EM.png differ diff --git a/assignment-3/submission/17307130285/img/loss-gmm-normal.png b/assignment-3/submission/17307130285/img/loss-gmm-normal.png new file mode 100644 index 0000000000000000000000000000000000000000..964e23d4ac41a8b0943f535f9fe80ca40d53a825 Binary files /dev/null and b/assignment-3/submission/17307130285/img/loss-gmm-normal.png differ diff --git a/assignment-3/submission/17307130285/img/losssquare_kmeans_outcome.png b/assignment-3/submission/17307130285/img/losssquare_kmeans_outcome.png new file mode 100644 index 0000000000000000000000000000000000000000..3d3a4be2d3b87087be1025f290b121f912131f43 Binary files /dev/null and 
# encoding: utf-8
import numpy as np


class KMeans:
    """Plain K-Means: alternate nearest-centroid assignment and centroid
    updates until the label vector stops changing.

    Initial centroids are the first ``n_clusters`` training samples
    (deterministic, matching the original implementation).
    """

    def __init__(self, n_clusters):
        self.n_clusters = n_clusters  # number of clusters K
        self.Centroids = None         # fitted centroids, shape (K, dim)
        self.centers = None           # alias of Centroids (read by ClusteringAlgorithm)
        self.labels = None            # cluster index of each training sample

    def initCentroids(self, dataSet, k):
        """Return the first k samples as initial centroids.

        A float copy is made so in-place centroid updates can never write
        back into the caller's array (the original returned a view).
        """
        return np.array(dataSet[:k], dtype=float)

    def cluster_classifier(self, centroids, dataSet):
        """Return the index of the Euclidean-nearest centroid per sample."""
        distances = np.sqrt(((dataSet - centroids[:, np.newaxis]) ** 2).sum(axis=2))
        return np.argmin(distances, axis=0)

    def fit(self, train_data):
        """Fit on ``train_data`` (shape (n, dim)); return (centroids, labels)."""
        centroids = self.initCentroids(train_data, self.n_clusters)
        labels = self.cluster_classifier(centroids, train_data)
        while True:
            for i in range(self.n_clusters):
                members = train_data[labels == i]
                # axis=0 fixes the original scalar mean, which averaged all
                # coordinates into one number; empty clusters keep their old
                # centroid instead of collapsing to NaN.
                if len(members) > 0:
                    centroids[i] = members.mean(axis=0)
            new_labels = self.cluster_classifier(centroids, train_data)
            # Element-wise convergence test.  The original compared
            # `a.all() == b.all()`, which reduces each side to a single
            # boolean and so terminates regardless of the actual labels.
            if np.array_equal(new_labels, labels):
                self.Centroids = centroids
                self.centers = centroids
                self.labels = new_labels
                return centroids, new_labels
            labels = new_labels

    def predict(self, test_data):
        """Return the nearest fitted centroid index for each test sample."""
        return self.cluster_classifier(self.Centroids, test_data)


class GaussianMixture:
    """Gaussian Mixture Model trained with the EM algorithm."""

    def __init__(self, n_clusters):
        self.num_clusters = n_clusters  # number of mixture components K
        self.max_iter = 100             # fixed number of EM iterations
        self.num = None                 # number of training samples
        self.dim = None                 # feature dimension
        self.X = None                   # training data, shape (n, dim)
        self.Q = None                   # responsibilities, shape (n, K)
        self.weight = None              # mixing weights, shape (K,)
        self.covar = None               # covariance matrices, shape (K, dim, dim)
        self.mu = None                  # component means, shape (K, dim)
        self.labels = None              # argmax responsibility per training sample

    def fit(self, train_data):
        """Run ``max_iter`` EM iterations on ``train_data`` (shape (n, dim))."""
        self._initialize_params(train_data)
        # Counting loop instead of destructively decrementing max_iter,
        # so the model can be refitted.
        for _ in range(self.max_iter):
            self.e_step()
            self.m_step()
        self.labels = np.argmax(self.Q, axis=1)

    def predict(self, test_data):
        """Return the most probable component index for each test sample.

        Fixed: the original iterated ``range(self.num)`` (the *training*
        sample count) instead of iterating over the test samples.
        """
        out = []
        for i in range(test_data.shape[0]):
            best_k, best_p = -1, 0.0
            for k in range(self.num_clusters):
                p = self.weight[k] * self.multi_norm(
                    test_data[i, :], self.mu[k, :], self.covar[k, :, :])
                if p > best_p:
                    best_p, best_k = p, k
            out.append(best_k)
        return np.array(out)

    def _initialize_params(self, X):
        """Initialize EM state: uniform weights, random means, identity covariances."""
        self.X = X
        self.num = X.shape[0]
        self.dim = X.shape[1]
        self.Q = np.zeros((self.num, self.num_clusters))
        self.weight = [1 / self.num_clusters] * self.num_clusters
        # Random means inside the data's bounding box.
        self.mu = np.random.uniform(0, 1, (self.num_clusters, self.dim)) * np.max(X, axis=0)
        self.covar = np.array([np.identity(self.dim) for _ in range(self.num_clusters)])

    def e_step(self):
        """E-step: Q[i, k] ∝ weight_k * N(x_i | mu_k, covar_k), normalized per sample."""
        for i in range(self.num):
            q_i = []
            for k in range(self.num_clusters):
                prob = self.multi_norm(self.X[i, :], self.mu[k, :], self.covar[k, :, :])
                q_i.append(self.weight[k] * prob + 1e-32)  # epsilon guards against 0/0
            self.Q[i, :] = np.array(q_i) / np.sum(q_i)

    def multi_norm(self, x, mu, sigma):
        """Multivariate normal density at x (deprecated np.matrix replaced by arrays)."""
        det = np.linalg.det(sigma)
        inv = np.linalg.inv(sigma)
        x_mu = np.asarray(x - mu).reshape(-1, 1)
        const = 1.0 / (((2 * np.pi) ** (len(x) / 2)) * np.sqrt(det))
        exponent = -0.5 * float(x_mu.T @ inv @ x_mu)
        return float(const * np.exp(exponent))

    def m_step(self):
        """M-step: update weights, means and covariances from the responsibilities."""
        self.weight = np.mean(self.Q, axis=0)
        resp = np.sum(self.Q, axis=0)  # total responsibility per component, shape (K,)
        # Vectorized mean update: mu_k = sum_j Q[j, k] * x_j / resp_k
        self.mu = (self.Q.T @ self.X) / resp[:, np.newaxis]
        for k in range(self.num_clusters):
            diff = self.X - self.mu[k]
            weighted = self.Q[:, k][:, np.newaxis] * diff
            self.covar[k, :, :] = (weighted.T @ diff) / resp[k]


class ClusteringAlgorithm:
    """Choose the cluster count with the elbow method, then fit K-Means."""

    def __init__(self):
        self.cluster_num = 2  # default / fallback cluster count
        self.model = None     # final fitted KMeans model

    def fit(self, train_data, upper):
        """Try K = 2 .. upper-1, pick the elbow of the SSE curve, refit.

        Fixed: the original called ``plt.plot``/``plt.show`` although
        matplotlib was never imported (NameError), read nonexistent
        ``kmeans.labels``/``kmeans.centers`` attributes, and shadowed the
        ``sum`` builtin.
        """
        sse = np.zeros(upper - 2)  # SSE for each candidate K
        for k in range(2, upper):
            kmeans = KMeans(k)
            kmeans.fit(train_data)
            # Sum of squared distances of every sample to its own centroid.
            sse[k - 2] = np.sum(np.square(train_data - kmeans.centers[kmeans.labels]))
        # Elbow detection: at the true K the average SSE drop before the
        # point is much steeper than the average drop after it.
        n = len(sse)
        best_delta = 0
        for i in range(1, n - 1):
            del1 = (sse[0] - sse[i]) / i                 # mean drop before candidate i
            del2 = (sse[i] - sse[n - 1]) / (n - 1 - i)   # mean drop after candidate i
            delta = del1 - del2
            if delta > 0.3 * max(del1, del2) and delta > best_delta:
                best_delta = delta
                self.cluster_num = i + 2
        self.model = KMeans(self.cluster_num)
        self.model.fit(train_data)

    def predict(self, test_data):
        """Delegate prediction to the fitted K-Means model."""
        return self.model.predict(test_data)