# 机器学习作业 **Repository Path**: liudashuailiqi/machine-learning-homework ## Basic Information - **Project Name**: 机器学习作业 - **Description**: 作业而已 - **Primary Language**: Unknown - **License**: Not specified - **Default Branch**: master - **Homepage**: None - **GVP Project**: No ## Statistics - **Stars**: 0 - **Forks**: 0 - **Created**: 2024-12-05 - **Last Updated**: 2024-12-07 ## Categories & Tags **Categories**: Uncategorized **Tags**: None ## README # 机器学习作业 3 ## KNN算法 ### 算法原理 KNN是一种简单但非常有效的分类算法。它的工作原理是: 计算待分类样本与训练集中所有样本的距离(如欧氏距离)。 选择距离最近的k个样本。 根据这k个样本的类别进行投票,将待分类样本归为票数最多的类别。 ### 算法代码 ``` import numpy as np import pandas as pd from collections import Counter from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score # 计算欧氏距离 def euclidean_distance(x1, x2): return np.sqrt(np.sum((x1 - x2) ** 2)) # KNN算法 def knn_classify(train_data, train_labels, test_data, k): predictions = [] for test_sample in test_data: distances = [euclidean_distance(test_sample, train_sample) for train_sample in train_data] nearest_neighbors = np.argsort(distances)[:k] nearest_labels = [train_labels[i] for i in nearest_neighbors] most_common = Counter(nearest_labels).most_common(1) predictions.append(most_common[0][0]) return predictions # 加载数据集 url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data" column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'] iris_data = pd.read_csv(url, header=None, names=column_names) # 特征和标签 X = iris_data.iloc[:, :-1].values y = iris_data.iloc[:, -1].map({'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}).values # 划分训练集和测试集 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # 训练KNN模型 k = 3 predictions = knn_classify(X_train, y_train, X_test, k) # 计算准确率 accuracy = accuracy_score(y_test, predictions) print(f'KNN Accuracy: {accuracy:.2f}') ``` ### 实验结果 KNN Accuracy: 0.98 ## ID3算法 ### 算法原理 ID3算法是一种基于信息熵的决策树算法。它的工作原理是: 
"""ID3 decision tree applied to the UCI Wine dataset.

ID3 is a decision-tree algorithm driven by information entropy:

1. compute the entropy of the data set,
2. compute the conditional entropy and information gain of every feature,
3. split the current node on the feature with the largest information gain,
4. recurse into the children until a stop condition holds (all samples share
   one class, or nothing useful is left to split on).

The README describes the Wine data as 178 samples with 13 features (alcohol,
malic acid, ...) in 3 classes.  In ``wine.data`` the class label (1, 2 or 3)
is the FIRST column.

NOTE(review): the README lists ``ID3 Decision Tree Accuracy: 0.94``; data
loading and preprocessing changed here, so re-run the script to confirm the
figure.
"""
import numpy as np
import pandas as pd

WINE_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"


def entropy(labels):
    """Return the Shannon entropy (in bits) of a label sequence.

    An empty sequence has entropy 0.
    """
    labels = np.asarray(labels)
    if labels.size == 0:
        return 0.0
    _, counts = np.unique(labels, return_counts=True)
    probabilities = counts / labels.size
    return -float(np.sum(probabilities * np.log2(probabilities)))


def conditional_entropy(data, labels, feature_index):
    """Return the entropy of *labels* given the feature at *feature_index*.

    The result is the entropy of each group of samples sharing one feature
    value, weighted by the fraction of samples in that group.
    """
    labels = np.asarray(labels)
    column = np.asarray(data)[:, feature_index]
    total = len(labels)
    if total == 0:
        return 0.0
    weighted = 0.0
    for value in np.unique(column):
        sub_labels = labels[column == value]
        weighted += len(sub_labels) / total * entropy(sub_labels)
    return weighted


def information_gain(data, labels, feature_index):
    """Return how much splitting on *feature_index* reduces label entropy."""
    return entropy(labels) - conditional_entropy(data, labels, feature_index)


def discretize_features(train_data, test_data, n_bins=4):
    """Bin continuous features into quantile buckets learned from the train set.

    ID3 branches on exact feature values, so continuous measurements must be
    turned into a small number of categories first.

    Args:
        train_data: 2-D array of training features; defines the bin edges.
        test_data: 2-D array of features binned with the same edges.
        n_bins: Number of buckets per feature (at least 2).

    Returns:
        ``(train_bins, test_bins)``: integer arrays of the input shapes whose
        entries are bucket indices in ``range(n_bins)``.

    Raises:
        ValueError: If *n_bins* is smaller than 2.
    """
    if n_bins < 2:
        raise ValueError(f"n_bins must be at least 2, got {n_bins}")
    train_data = np.asarray(train_data, dtype=float)
    test_data = np.asarray(test_data, dtype=float)
    # Interior quantiles only: the outer buckets are open-ended so that test
    # values outside the training range still get a valid bucket.
    inner_quantiles = np.linspace(0.0, 1.0, n_bins + 1)[1:-1]
    edges = np.quantile(train_data, inner_quantiles, axis=0)

    def to_bins(data):
        return np.column_stack(
            [np.digitize(data[:, j], edges[:, j]) for j in range(data.shape[1])]
        )

    return to_bins(train_data), to_bins(test_data)


class DecisionTree:
    """ID3 decision tree over categorical (discrete-valued) features.

    After ``fit`` the tree is stored in ``self.tree`` as nested dictionaries of
    the form ``{feature_index: {feature_value: subtree_or_label, ...}}``.
    Every branch dictionary additionally holds the majority label of its node
    under ``DecisionTree._MAJORITY_KEY``; it is used when a sample carries a
    feature value that never occurred during training.

    Args:
        min_samples_split: Nodes with fewer samples become leaves.
        max_depth: Maximum number of split levels; ``0`` yields a single leaf.
    """

    _MAJORITY_KEY = "__majority__"
    # Gains at or below this are treated as "no information" to absorb
    # floating-point noise.
    _MIN_GAIN = 1e-12

    def __init__(self, min_samples_split=2, max_depth=100):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.tree = None

    def fit(self, data, labels):
        """Build the tree from a 2-D feature array and aligned labels.

        Raises:
            ValueError: If *data* is not 2-D, is empty, or its row count does
                not match the number of labels.
        """
        data = np.asarray(data)
        labels = np.asarray(labels)
        if data.ndim != 2:
            raise ValueError("data must be a 2-D array")
        if len(data) != len(labels):
            raise ValueError("data and labels must have the same length")
        if len(labels) == 0:
            raise ValueError("cannot fit a tree on an empty data set")
        self.tree = self._build_tree(data, labels)

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent label (smallest label wins a tie)."""
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    def _build_tree(self, data, labels, depth=0):
        """Recursively build the subtree for the given samples.

        *depth* is the level of the current node.  It is passed down the
        recursion instead of being stored on the instance, so ``max_depth``
        is never modified and limits depth per branch.
        """
        num_samples, num_features = data.shape
        majority = self._majority_label(labels)

        # Stop conditions: pure node, too few samples, depth limit reached,
        # or no feature to split on.
        if (
            len(np.unique(labels)) == 1
            or num_samples < self.min_samples_split
            or depth >= self.max_depth
            or num_features == 0
        ):
            return majority

        gains = [information_gain(data, labels, i) for i in range(num_features)]
        best_feature_index = int(np.argmax(gains))
        if gains[best_feature_index] <= self._MIN_GAIN:
            # No feature separates the classes any further.
            return majority

        column = data[:, best_feature_index]
        branches = {self._MAJORITY_KEY: majority}
        for value in np.unique(column):
            mask = column == value
            branches[value] = self._build_tree(data[mask], labels[mask], depth + 1)
        return {best_feature_index: branches}

    def predict(self, data):
        """Return a list with the predicted label of every row in *data*.

        Raises:
            RuntimeError: If the tree has not been fitted yet.
        """
        if self.tree is None:
            raise RuntimeError("fit must be called before predict")
        return [self._traverse_tree(self.tree, sample) for sample in data]

    def _traverse_tree(self, node, sample):
        """Follow *sample* from *node* down to a leaf and return its label.

        When the sample's value for the split feature was not seen during
        training, the majority label stored on that node is returned.
        """
        while isinstance(node, dict):
            feature_index, branches = next(iter(node.items()))
            feature_value = sample[feature_index]
            if feature_value not in branches:
                return branches[self._MAJORITY_KEY]
            node = branches[feature_value]
        return node


def main():
    """Download Wine, train the ID3 tree and print the test accuracy."""
    # scikit-learn is only needed for the experiment, not for the algorithm,
    # so it is imported here to keep the code above importable without it.
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split

    # wine.data: column 0 is the class (1, 2, 3), columns 1-13 are features.
    column_names = ['class'] + [f'feature_{i+1}' for i in range(13)]
    wine_data = pd.read_csv(WINE_URL, header=None, names=column_names)

    X = wine_data.iloc[:, 1:].values
    y = wine_data['class'].map({1: 0, 2: 1, 3: 2}).values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Bin edges come from the training split only, so nothing about the test
    # split leaks into the model.
    X_train, X_test = discretize_features(X_train, X_test, n_bins=4)

    tree = DecisionTree(min_samples_split=5, max_depth=10)
    tree.fit(X_train, y_train)
    predictions = tree.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print(f'ID3 Decision Tree Accuracy: {accuracy:.2f}')


if __name__ == "__main__":
    main()