diff --git a/assignment-1/submission/17307130285/README.md b/assignment-1/submission/17307130285/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8353191ae181196f0a23717212df38805c52c8a7 --- /dev/null +++ b/assignment-1/submission/17307130285/README.md @@ -0,0 +1,105 @@ +# assignment-1 KNN + +## 实验探究 + +生成3个二维高斯分布,拆分数据集为训练集和测试集(train/test = 4),循环k = 1,2,...,10 选出最好的k值,并进行测试。 + +- 训练数据 + +![fig_train1](img/fig_train1.jpg) + +- 测试数据 + +![fig_test1](img/fig_test1.jpg) + +result: best_k = 8, accuracy = 89.95% + +```python +class KNN: + + def __init__(self): + self.train_data = None + self.train_labels = None + self.k = None + + def split_data(self,x,y,rate): + shuf_indexes = np.random.permutation(len(x)) + test_size = int(len(x) * rate) + train_index = shuf_indexes[test_size:] + test_index = shuf_indexes[:test_size] + return x[train_index], x[test_index], y[train_index], y[test_index] + + def distance(self,v1,v2): + + dis2 = (v1-v2)**2 + total = np.sum(dis2) + return total**(0.5) + + def fit(self, train_data, train_label): + + + mu = np.mean(train_data, axis=0) + sigma = np.std(train_data, axis=0) + train_data = (train_data - mu) / sigma + + X_train, X_test, Y_train, Y_test = self.split_data(train_data,train_label,0.3) + + best_k=0 + k_candi=0; + for k in range(1,10): + + true_couter=0 + for test_counter in range(0,len(X_test)): + pos_vec_list=[] + + for train_counter in range(0,len(X_train)): + pos_vec = np.array([self.distance(X_test[test_counter],X_train[train_counter]),Y_train[train_counter]]) + pos_vec_list.append(pos_vec) + + pos_vec_list = np.array(pos_vec_list) + pos_vec_list_sorted = pos_vec_list[np.lexsort(pos_vec_list[:,::-1].T)] + result_list = pos_vec_list_sorted[:k][:,1] + + + + label = int(result_list[np.argmax(result_list)]) + + + if (label == Y_test[test_counter] ): + true_couter=true_couter+1 + + + if (true_couter >= best_k): + best_k = true_couter + k_candi = k + + self.k = k_candi + self.train_data = train_data + self.train_labels = train_label + return self.k + + def predict(self, test_data): + test_label=[] + result_list=[] + + mu = np.mean(test_data, axis=0) + sigma = np.std(test_data, axis=0) + test_data = (test_data - mu) / sigma + + for i in range (0,len(test_data)): + pos_vec_list=[] + for m in range(0,len(self.train_data)): + pos_vec = np.array([self.distance(self.train_data[m],test_data[i]),self.train_labels[m]]) + pos_vec_list.append(pos_vec) + + + + pos_vec_list = np.array(pos_vec_list) + pos_vec_list_sorted = pos_vec_list[np.lexsort(pos_vec_list[:,::-1].T)] + + result_list = pos_vec_list_sorted[:(self.k)][:,1] + test_label.append(result_list[np.argmax(result_list)]) + + return test_label +``` + diff --git a/assignment-1/submission/17307130285/source.py b/assignment-1/submission/17307130285/source.py new file mode 100644 index 0000000000000000000000000000000000000000..823f08d5333779f4b9abb13707cd51f732f06d5f --- /dev/null +++ b/assignment-1/submission/17307130285/source.py @@ -0,0 +1,146 @@ +import numpy as np +import matplotlib.pyplot as plt +class KNN: + + def __init__(self): + self.train_data = None + self.train_labels = None + self.k = None + + def split_data(self,x,y,rate): + shuf_indexes = np.random.permutation(len(x)) + test_size = int(len(x) * rate) + train_index = shuf_indexes[test_size:] + test_index = shuf_indexes[:test_size] + return x[train_index], x[test_index], y[train_index], y[test_index] + + def distance(self,v1,v2): + + dis2 = (v1-v2)**2 + total = np.sum(dis2) + return total**(0.5) + + def fit(self, train_data, train_label): + + + mu = np.mean(train_data, axis=0) + sigma = np.std(train_data, axis=0) + train_data = (train_data - mu) / sigma + + X_train, X_test, Y_train, Y_test = self.split_data(train_data,train_label,0.3) + + best_k=0 + k_candi=0; + for k in range(1,10): + + true_couter=0 + for test_counter in range(0,len(X_test)): + pos_vec_list=[] + + for train_counter in range(0,len(X_train)): + pos_vec = np.array([self.distance(X_test[test_counter],X_train[train_counter]),Y_train[train_counter]]) + pos_vec_list.append(pos_vec) + + pos_vec_list = np.array(pos_vec_list) + pos_vec_list_sorted = pos_vec_list[np.lexsort(pos_vec_list[:,::-1].T)] + result_list = pos_vec_list_sorted[:k][:,1] + + + + label = int(result_list[np.argmax(result_list)]) + + + if (label == Y_test[test_counter] ): + true_couter=true_couter+1 + + + if (true_couter >= best_k): + best_k = true_couter + k_candi = k + + self.k = k_candi + self.train_data = train_data + self.train_labels = train_label + return self.k + + def predict(self, test_data): + test_label=[] + result_list=[] + + mu = np.mean(test_data, axis=0) + sigma = np.std(test_data, axis=0) + test_data = (test_data - mu) / sigma + + for i in range (0,len(test_data)): + pos_vec_list=[] + for m in range(0,len(self.train_data)): + pos_vec = np.array([self.distance(self.train_data[m],test_data[i]),self.train_labels[m]]) + pos_vec_list.append(pos_vec) + + + + pos_vec_list = np.array(pos_vec_list) + pos_vec_list_sorted = pos_vec_list[np.lexsort(pos_vec_list[:,::-1].T)] + + result_list = pos_vec_list_sorted[:(self.k)][:,1] + test_label.append(result_list[np.argmax(result_list)]) + + return test_label + +def generate (amount_1,amount_2,amount_3): + + + mean = (2, 3) + cov = np.array([[1,0], [0, 1]]) + x = np.random.multivariate_normal(mean, cov, (amount_1,)) + + mean = (4, 6) + cov = np.array([[2, 1], [1, 2]]) + y = np.random.multivariate_normal(mean, cov, (amount_2,)) + + mean = (7, 8) + cov = np.array([[2.1,2.2],[1.1,1.5]]) + z = np.random.multivariate_normal(mean, cov, (amount_3,)) + + + data = np.concatenate([x,y,z]) + + label = np.concatenate([ + np.zeros((amount_1,),dtype=int), + np.ones((amount_2,),dtype=int), + np.ones((amount_3,),dtype=int)*2 + ]) + + return model.split_data(data,label,0.2) + + +def display(x,y): + type1_x = []; type1_y = [] + type2_x = []; type2_y = [] + type3_x = []; type3_y = [] + + plt.figure(figsize=(8,6)) + + for i in range(0,len(x)): + if(y[i]==0): + type1_x.append(x[i][0]) + type1_y.append(x[i][1]) + if(y[i]==1): + type2_x.append(x[i][0]) + type2_y.append(x[i][1]) + if(y[i]==2): + type3_x.append(x[i][0]) + type3_y.append(x[i][1]) + + fig = plt.figure(figsize = (10, 6)) + ax = fig.add_subplot(111) + + type1 = ax.scatter(type1_x, type1_y, s = 30, c = 'brown') + type2 = ax.scatter(type2_x, type2_y, s = 30, c = 'lime') + type3 = ax.scatter(type3_x, type3_y, s = 30, c = "red") + + + + ax.legend((type1, type2, type3), ("g1", "g2", "g3"), loc = 0) + + plt.show() diff --git a/assignment-1/submission/18307130213/source.py b/assignment-1/submission/18307130213/source.py index 7a53de852289de55e74fe9a70c7ac56fdb5372ec..92e5c6368d81682d71df858ea6e51040ffc16e5b 100644 --- a/assignment-1/submission/18307130213/source.py +++ b/assignment-1/submission/18307130213/source.py @@ -121,7 +121,7 @@ def genimg(n, data, label, name): for each in datas: each = np.array(each) plt.scatter(each[:, 0], each[:, 1]) - plt.savefig(f'img/{name}') + plt.savefig(f"./img/{name}") plt.close() # plt.show() diff --git a/assignment-3/submission/17307130285/img/gmm-k.png b/assignment-3/submission/17307130285/img/gmm-k.png new file mode 100644 index 0000000000000000000000000000000000000000..0974e58ab988888c86804b63baaa4a5785243fd3 Binary files /dev/null and b/assignment-3/submission/17307130285/img/gmm-k.png differ diff --git a/assignment-3/submission/17307130285/img/gmm-normal.png b/assignment-3/submission/17307130285/img/gmm-normal.png new file mode 100644 index 0000000000000000000000000000000000000000..db6756e63db0e8e267f92d097b726ecb290be926 Binary files /dev/null and b/assignment-3/submission/17307130285/img/gmm-normal.png differ diff --git a/assignment-3/submission/17307130285/img/k-means-normal.png b/assignment-3/submission/17307130285/img/k-means-normal.png new file mode 100644 index 0000000000000000000000000000000000000000..ceadc5a84dce24e3614cd8ae276419bd4107bd97 Binary files /dev/null and b/assignment-3/submission/17307130285/img/k-means-normal.png differ diff --git a/assignment-3/submission/17307130285/img/kmeans-EM.png b/assignment-3/submission/17307130285/img/kmeans-EM.png new file mode 100644 index 0000000000000000000000000000000000000000..11f00075fb54853f392e105916737e5c0e5015be Binary files /dev/null and b/assignment-3/submission/17307130285/img/kmeans-EM.png differ diff --git a/assignment-3/submission/17307130285/img/loss-EM.png b/assignment-3/submission/17307130285/img/loss-EM.png new file mode 100644 index 0000000000000000000000000000000000000000..16258df6be63a37b319176a5c633988cc728b969 Binary files /dev/null and b/assignment-3/submission/17307130285/img/loss-EM.png differ diff --git a/assignment-3/submission/17307130285/img/loss-gmm-normal.png b/assignment-3/submission/17307130285/img/loss-gmm-normal.png new file mode 100644 index 0000000000000000000000000000000000000000..964e23d4ac41a8b0943f535f9fe80ca40d53a825 Binary files /dev/null and b/assignment-3/submission/17307130285/img/loss-gmm-normal.png differ diff --git a/assignment-3/submission/17307130285/img/losssquare_kmeans_outcome.png b/assignment-3/submission/17307130285/img/losssquare_kmeans_outcome.png new file mode 100644 index 0000000000000000000000000000000000000000..3d3a4be2d3b87087be1025f290b121f912131f43 Binary files /dev/null and b/assignment-3/submission/17307130285/img/losssquare_kmeans_outcome.png differ diff --git a/assignment-3/submission/17307130285/img/qfunc.png b/assignment-3/submission/17307130285/img/qfunc.png new file mode 100644 index 0000000000000000000000000000000000000000..9d935a6730186b0a318714f741a702f0de5e05ee Binary files /dev/null and b/assignment-3/submission/17307130285/img/qfunc.png differ diff --git a/assignment-3/submission/17307130285/source.py b/assignment-3/submission/17307130285/source.py new file mode 100644 index 0000000000000000000000000000000000000000..259492abd5f23fba70ef0401b7bd25aec1aa69bc --- /dev/null +++ b/assignment-3/submission/17307130285/source.py @@ -0,0 +1,164 @@ +#encoding: utf-8 +import numpy as np + +class KMeans: + + def initCentroids(self,dataSet,k): + centers = dataSet[:k] + return centers + + def cluster_classifier(self,centroids,dataSet): + distances = np.sqrt(((dataSet - centroids[:, np.newaxis])**2).sum(axis=2)) + return np.argmin(distances, axis=0) + + def __init__(self, n_clusters): + self.n_clusters = n_clusters + + def fit(self, train_data): + + Centroids = self.initCentroids(train_data,self.n_clusters) + temp_label = self.cluster_classifier(Centroids,train_data) + + while(True): + for i in range(self.n_clusters): + Centroids[i] = np.mean(train_data[temp_label==i]) + updated_label = self.cluster_classifier(Centroids,train_data) + if (updated_label.all() == temp_label.all()): + self.Centroids=Centroids + return Centroids,updated_label + break + else: + temp_label=updated_label + + def predict(self, test_data): + return self.cluster_classifier(self.Centroids,test_data) + + + +class GaussianMixture: + + def __init__(self, n_clusters): + self.num_clusters = n_clusters + self.max_iter = 100 + self.num = None + self.dim = None + self.X = None + self.Q = None + self.weight = None + self.covar = None + self.mu = None + self.labels = None + + def fit(self, train_data): + self._initialize_params(train_data) + while self.max_iter > 0: + # init param + # e-step + self.e_step() + # m-step + self.m_step() + self.max_iter -= 1 + self.labels = np.argmax(self.Q, axis=1) + + def predict(self, test_data): + out = [] + for i in range(self.num): + mxp = 0 + res = -1 + for k in range(self.num_clusters): + temp = self.weight[k]*self.multi_norm(test_data[i, :], self.mu[k, :], self.covar[k, :, :]) + if temp > mxp: + mxp = temp + res = k + out.append(res) + out = np.array(out) + return out + + def _initialize_params(self, X): + self.X = X # 分类的数据集 + self.num = X.shape[0] # 样本数目 + self.dim = X.shape[1] # 特征维度 + self.Q = np.zeros((self.num, self.num_clusters)) # 初始化各高斯分布对观测数据的响应度矩阵 + self.weight = [1 / self.num_clusters] * self.num_clusters # 初始化各高斯分布的权重为聚类数目分之一 + self.mu = np.random.uniform(0, 1, (self.num_clusters, self.dim)) * np.max(X, axis=0) # 随机产生均值向量 + self.covar = np.array([np.identity(self.dim) for _ in range(self.num_clusters)]) # 随机产生协方差矩阵 + + #e步 + def e_step(self): + for i in range(self.num): + q_i = [] + for k in range(self.num_clusters): + postProb = self.multi_norm(self.X[i, :], self.mu[k, :], self.covar[k, :, :]) + q_i.append(self.weight[k] * postProb + 1e-32) + self.Q[i, :] = np.array(q_i) / np.sum(q_i) + + #返回多维高斯分布的结果 + def multi_norm(self, x, mu, sigma): + det = np.linalg.det(sigma) + inv = np.matrix(np.linalg.inv(sigma)) + x_mu = np.matrix(x - mu).T + const = 1 / (((2 * np.pi) ** (len(x) / 2)) * (det ** (1 / 2))) + exp = -0.5 * x_mu.T * inv * x_mu + return float(const * np.exp(exp)) + + # m步,更新参数 + def m_step(self): + # update weight 更新权值矩阵 + self.weight = np.mean(self.Q, axis=0) + + # update mu 更新均值向量 + temp = [] + for k in range(self.num_clusters): + up = np.zeros(self.dim) + for j in range(self.num): + up += self.Q[j, k] * np.array(self.X[j, :]) + down = np.sum(self.Q[:, k]) + temp.append(up / down) + self.mu = np.array(temp) + + # update covar 更新协方差矩阵 + for k in range(self.num_clusters): + up = np.zeros((self.dim, self.dim)) + for j in range(self.num): + x_mu = np.matrix(self.X[j, :] - self.mu[k, :]) + # print(x_mu.T*x_mu) + up += self.Q[j, k] * (x_mu.T * x_mu) + # print(up) + down = np.sum(self.Q[:, k]) + var = np.array(up / down) + self.covar[k, :, :] = var + +class ClusteringAlgorithm: + + def __init__(self): + self.cluster_num = 2 + self.model = None + + def fit(self, train_data, upper): + sum = np.zeros(upper-2) + for i in range(2, upper): + kmeans = KMeans(i) + kmeans.fit(train_data) + m = kmeans.labels + c = kmeans.centers + for j in range(len(train_data)): + c1 = c[int(m[j])] + x1 = train_data[j] + sum[i-2] += np.sum(np.square(c1-x1)) + c = plt.plot(np.arange(2, upper), sum) + plt.show() + n = len(sum) + mx = 0 + for i in range(1, n - 1): + del1 = (sum[0] - sum[i]) / i + del2 = (sum[i] - sum[n - 1]) / (n - 1 - i) + delta = del1 - del2 + # 找到符合要求,并且插值最大的 K + if delta > 0.3 * max(del1, del2) and delta > mx: + mx = delta + self.cluster_num = i + 2 + self.model = KMeans(self.cluster_num) + self.model.fit(train_data) + + def predict(self, test_data): + return self.model.predict(test_data) \ No newline at end of file diff --git a/assignment-3/submission/17307130285/tester_demo.py b/assignment-3/submission/17307130285/tester_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..19ec0e8091691d4aaaa6b53dbb695fde9e826d89 --- /dev/null +++ b/assignment-3/submission/17307130285/tester_demo.py @@ -0,0 +1,117 @@ +import numpy as np +import sys + +from source import KMeans, GaussianMixture + + +def shuffle(*datas): + data = np.concatenate(datas) + label = np.concatenate([ + np.ones((d.shape[0],), dtype=int)*i + for (i, d) in enumerate(datas) + ]) + N = data.shape[0] + idx = np.arange(N) + np.random.shuffle(idx) + data = data[idx] + label = label[idx] + return data, label + + +def data_1(): + mean = (1, 2) + cov = np.array([[73, 0], [0, 22]]) + x = np.random.multivariate_normal(mean, cov, (800,)) + + mean = (16, -5) + cov = np.array([[21.2, 0], [0, 32.1]]) + y = np.random.multivariate_normal(mean, cov, (200,)) + + mean = (10, 22) + cov = np.array([[10, 5], [5, 10]]) + z = np.random.multivariate_normal(mean, cov, (1000,)) + + data, _ = shuffle(x, y, z) + return (data, data), 3 + + +def data_2(): + train_data = np.array([ + [23, 12, 173, 2134], + [99, -12, -126, -31], + [55, -145, -123, -342], + ]) + return (train_data, train_data), 2 + + +def data_3(): + train_data = np.array([ + [23], + [-2999], + [-2955], + ]) + return (train_data, train_data), 2 + + +def test_with_n_clusters(data_fuction, algorithm_class): + (train_data, test_data), n_clusters = data_fuction() + model = algorithm_class(n_clusters) + model.fit(train_data) + res = model.predict(test_data) + assert len( + res.shape) == 1 and res.shape[0] == test_data.shape[0], "shape of result is wrong" + return res + + +def testcase_1_1(): + test_with_n_clusters(data_1, KMeans) + return True + + +def testcase_1_2(): + res = test_with_n_clusters(data_2, KMeans) + return res[0] != res[1] and res[1] == res[2] + + +def testcase_2_1(): + test_with_n_clusters(data_1, GaussianMixture) + return True + + +def testcase_2_2(): + res = test_with_n_clusters(data_3, GaussianMixture) + return res[0] != res[1] and res[1] == res[2] + + +def test_all(err_report=False): + testcases = [ + ["KMeans-1", testcase_1_1, 4], + ["KMeans-2", testcase_1_2, 4], + # ["KMeans-3", testcase_1_3, 4], + # ["KMeans-4", testcase_1_4, 4], + # ["KMeans-5", testcase_1_5, 4], + ["GMM-1", testcase_2_1, 4], + ["GMM-2", testcase_2_2, 4], + # ["GMM-3", testcase_2_3, 4], + # ["GMM-4", testcase_2_4, 4], + # ["GMM-5", testcase_2_5, 4], + ] + sum_score = sum([case[2] for case in testcases]) + score = 0 + for case in testcases: + try: + res = case[2] if case[1]() else 0 + except Exception as e: + if err_report: + print("Error [{}] occurs in {}".format(str(e), case[0])) + res = 0 + score += res + print("+ {:14} {}/{}".format(case[0], res, case[2])) + print("{:16} {}/{}".format("FINAL SCORE", score, sum_score)) + + +if __name__ == "__main__": + if len(sys.argv) > 1 and sys.argv[1] == "--report": + test_all(True) + else: + test_all()