## Cynhard85 / MachineLearningTutorial

liuxinyang committed on 2018-07-06 16:43 — "modify names"
{
"cells": [
{
"cell_type": "markdown",
"source": [
"# 长颈鹿和梅花鹿\n",
"## 纯 Python 实现"
]
},
{
"cell_type": "code",
"execution_count": 1,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1, 0]\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"# 整理数据\n",
"data = np.array([[2.45], [2.31], [2.38], [2.40], [2.38], [0.25], [0.3], [0.24], [0.31], [0.29]])\n",
"target = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])\n",
"\n",
"test = np.array([[2.33], [0.28]])\n",
"\n",
"def distance_Euclidean(x):\n",
" \"\"\" 计算单一样本 x 与整个样本集的欧式距离 \"\"\"\n",
" \n",
" m = data.shape[0] # 获取整体样本集的样本数量\n",
" diff = np.tile(x, [m, 1]) - data # 为了计算方便，将测试样本 x 提高维度，与整体样本集相减\n",
" diff_squared = diff ** 2 # 计算平方\n",
" distance_squared = diff_squared.sum(axis=1) # 计算平方和\n",
" distance = distance_squared ** 0.5 # 计算开方\n",
" return distance\n",
"\n",
"def knn(d, k):\n",
" \"\"\" 找出单一样本前 k 个最小距离，并预测分类 \"\"\"\n",
" \n",
" d_sorted = d.argsort() # 对索引进行排序\n",
" min_indices = np.array([0, 0]) # 索引为分类，值为分类数量\n",
" for i in range(k): # 统计前 k 个最小距离中，每个分类的数量\n",
" min_indices[target[d_sorted[i]]] += 1\n",
" min_indices_sort = min_indices.argsort() # 索引从小到大排序\n",
" return min_indices_sort[-1] # 返回最大的索引\n",
" \n",
"\n",
"# 计算每个测试样本与整个样本集的欧式距离\n",
"ds = np.array([distance_Euclidean(test[i]) for i in range(len(test))])\n",
"\n",
"# 找出前 k 个最小的距离\n",
"ds_knn = [knn(d, k=3) for d in ds]\n",
"\n",
"# 打印分类结果\n",
"print(ds_knn)"
]
},
{
"cell_type": "markdown",
"source": [
"## 使用 sklearn 实现"
]
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1 0]\n"
]
}
],
"source": [
"import numpy as np\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"\n",
"data = np.array([[2.45], [2.31], [2.38], [2.40], [2.38], [0.25], [0.3], [0.24], [0.31], [0.29]])\n",
"target = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])\n",
"\n",
"test = np.array([[2.33], [0.28]])\n",
"\n",
"neigh = KNeighborsClassifier(n_neighbors=3)\n",
"neigh.fit(data, target)\n",
"print(neigh.predict(test))"
]
},
{
"cell_type": "markdown",
"source": [
"# 归一化\n",
"\n",
"## 读取数据"
]
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"plt.rcParams['font.sans-serif'] = ['SimHei'] # 指定默认字体\n",
"plt.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题\n",
"\n",
"fly = np.array([])\n",
"play = np.array([])\n",
"eat = np.array([])\n",
"with open(\"datingTestSet.txt\") as f:\n",
" for line in f:\n",
" line = line.strip()\n",
" data = line.split('\\t')\n",
" fly = np.append(fly, float(data[0]))\n",
" play = np.append(play, float(data[1]))\n",
" eat = np.append(eat, float(data[2]))"
]
},
{
"cell_type": "markdown",
"source": [
"## 归一化"
]
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [],
"source": [
"def normalize(data):\n",
" \"\"\" 实现归一化 \n",
" \n",
" data: numpy array 表示的数据集\n",
" \"\"\"\n",
" \n",
" min_val = data.min()\n",
" max_val = data.max()\n",
" interval = max_val - min_val\n",
" return (data - min_val) / interval\n",
"\n",
"\n",
"fly_normal = normalize(fly)\n",
"play_normal = normalize(play)"
]
},
{
"cell_type": "markdown",
"source": [
"## 归一化前后对比"
]
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.figure.Figure at 0x133468e3898>"
]
},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(15, 7))\n",
"\n",
"plt.subplot(121)\n",
"plt.scatter(fly, play)\n",
"plt.xlabel(u'每年获得的飞行常客里程数', fontsize=18)\n",
"plt.ylabel(u'玩视频游戏所耗时间百分比', fontsize=18)\n",
"plt.title(u'原始数据', fontsize=18)\n",
"\n",
"plt.subplot(122)\n",
"plt.scatter(fly_normal, play_normal)\n",
"plt.xlabel(u'每年获得的飞行常客里程数', fontsize=18)\n",
"plt.ylabel(u'玩视频游戏所耗时间百分比', fontsize=18)\n",
"plt.title(u'归一化后数据', fontsize=18)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"source": [
"# k 值对结果的影响"
]
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.figure.Figure at 0x133469309b0>"
]
},
"output_type": "display_data"
}
],
"source": [
"def plot_circle(r, c, s, w, l):\n",
" pts = np.linspace(0, 2 * np.pi, 800)\n",
" x, y = np.cos(pts) * r, np.sin(pts) * r\n",
" plt.plot(x, y, color=c, linestyle=s, linewidth=w, label=l)\n",
"\n",
"x1 = [0.6, 0.3, 0.7, 1.7, -0.2]\n",
"y1 = [0.4, 0.2, 1.8, 1.2, 1.8]\n",
"x2 = [-0.4, -1.0, -1.6, -0.9, -0.8]\n",
"y2 = [-0.3, 0.8, -1.4, -0.7, -1.6]\n",
"x_test = [0.0]\n",
"y_test = [0.0]\n",
"\n",
"plt.figure(figsize=(6,6))\n",
"plot_circle(0.9, 'black', '-', 1.0, 'k=3')\n",
"plot_circle(1.5, 'black', '--', 1.0, 'k=5')\n",
"plt.scatter(x_test, y_test, marker='o', s=150, color='green')\n",
"plt.scatter(x1, y1, marker='^', s=150, color='red')\n",
"plt.scatter(x2, y2, marker='s', s=150, color='blue')\n",
"plt.annotate('?', xy=(0, 0), xytext=(-0.05, 0.11), fontsize=18)\n",
"plt.xlim(-2, 2)\n",
"plt.ylim(-2, 2)\n",
"plt.xticks([])\n",
"plt.yticks([])\n",
"plt.legend(loc='lower right', fontsize=16)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"source": [
"# 鸢尾花分类"
]
},
{
"cell_type": "markdown",
"source": [
"## 准备数据"
]
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iris Plants Database\n",
"====================\n",
"\n",
"Notes\n",
"-----\n",
"Data Set Characteristics:\n",
" :Number of Instances: 150 (50 in each of three classes)\n",
" :Number of Attributes: 4 numeric, predictive attributes and the class\n",
" :Attribute Information:\n",
" - sepal length in cm\n",
" - sepal width in cm\n",
" - petal length in cm\n",
" - petal width in cm\n",
" - class:\n",
" - Iris-Setosa\n",
" - Iris-Versicolour\n",
" - Iris-Virginica\n",
" :Summary Statistics:\n",
"\n",
" ============== ==== ==== ======= ===== ====================\n",
" Min Max Mean SD Class Correlation\n",
" ============== ==== ==== ======= ===== ====================\n",
" sepal length: 4.3 7.9 5.84 0.83 0.7826\n",
" sepal width: 2.0 4.4 3.05 0.43 -0.4194\n",
" petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)\n",
" petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)\n",
" ============== ==== ==== ======= ===== ====================\n",
"\n",
" :Missing Attribute Values: None\n",
" :Class Distribution: 33.3% for each of 3 classes.\n",
" :Creator: R.A. Fisher\n",
" :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n",
" :Date: July, 1988\n",
"\n",
"This is a copy of UCI ML iris datasets.\n",
"http://archive.ics.uci.edu/ml/datasets/Iris\n",
"\n",
"The famous Iris database, first used by Sir R.A Fisher\n",
"\n",
"This is perhaps the best known database to be found in the\n",
"pattern recognition literature. Fisher's paper is a classic in the field and\n",
"is referenced frequently to this day. (See Duda & Hart, for example.) The\n",
"data set contains 3 classes of 50 instances each, where each class refers to a\n",
"type of iris plant. One class is linearly separable from the other 2; the\n",
"latter are NOT linearly separable from each other.\n",
"\n",
"References\n",
"----------\n",
" - Fisher,R.A. \"The use of multiple measurements in taxonomic problems\"\n",
" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to\n",
" Mathematical Statistics\" (John Wiley, NY, 1950).\n",
" - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.\n",
" (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.\n",
" - Dasarathy, B.V. (1980) \"Nosing Around the Neighborhood: A New System\n",
" Structure and Classification Rule for Recognition in Partially Exposed\n",
" Environments\". IEEE Transactions on Pattern Analysis and Machine\n",
" Intelligence, Vol. PAMI-2, No. 1, 67-71.\n",
" - Gates, G.W. (1972) \"The Reduced Nearest Neighbor Rule\". IEEE Transactions\n",
" on Information Theory, May 1972, 431-433.\n",
" - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al\"s AUTOCLASS II\n",
" conceptual clustering system finds 3 classes in the data.\n",
" - Many, many more ...\n",
"\n"
]
}
],
"source": [
"from sklearn.datasets import load_iris\n",
"\n",
"iris = load_iris()\n",
"print(iris.DESCR)"
]
},
{
"cell_type": "markdown",
"source": [
"## 审查数据"
]
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.figure.Figure at 0x13345fbd518>"
]
},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"%matplotlib inline\n",
"\n",
"def plot_iris(iris_data):\n",
" plt.figure(figsize=(8, 6))\n",
" markers = ['x', '+', '^']\n",
" labels = iris.target_names\n",
" for i in range(3):\n",
" data = iris_data[iris.target == i]\n",
" # 第 2、3 列分别表示花瓣长、宽\n",
" plt.scatter(data[:,2], data[:,3], marker=markers[i], s=60, label=labels[i])\n",
" plt.xticks(fontsize=18)\n",
" plt.yticks(fontsize=18)\n",
" plt.legend(loc='best', fontsize=18)\n",
" plt.show()\n",
" \n",
"plot_iris(iris.data)"
]
},
{
"cell_type": "markdown",
"source": [
"## 整理数据"
]
},
{
"cell_type": "markdown",
"source": [
"### 归一化"
]
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.figure.Figure at 0x13346a80cc0>"
]
},
"output_type": "display_data"
}
],
"source": [
"from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"scaler = MinMaxScaler()\n",
"iris_data = scaler.fit_transform(iris.data)\n",
"\n",
"plot_iris(iris_data)"
]
},
{
"cell_type": "markdown",
"source": [
"### 准备数据集"
]
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [],
"source": [
"dataset_size = iris.data.shape[0]\n",
"\n",
"np.random.seed(42)\n",
"\n",
"# shuffle dataset\n",
"indices = np.random.permutation(dataset_size)\n",
"shuffled_data = iris.data[indices]\n",
"shuffled_target = iris.target[indices]\n",
"\n",
"# prepare datasets\n",
"ratio = 0.8\n",
"train_data_size = int(dataset_size * ratio)\n",
"train_data = shuffled_data[:train_data_size]\n",
"train_target = shuffled_target[:train_data_size]\n",
"test_data = shuffled_data[train_data_size:]\n",
"test_target = shuffled_target[train_data_size:]"
]
},
{
"cell_type": "markdown",
"source": [
"## 训练模型 - Python"
]
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1 0 1 1 0 1 2 2 0 1 2 2 0 2 0 1 2 2 1 2 1 1 2 2 0 1 1 0 1 2]\n"
]
}
],
"source": [
"import operator\n",
"\n",
"class KNNClassifier:\n",
" \n",
" def __init__(self, k):\n",
" self.k = k\n",
" \n",
" def fit(self, train_data, train_target):\n",
" self.train_data = train_data\n",
" self.train_target = train_target\n",
" \n",
" def distance(self, test_data):\n",
" \"\"\" 计算测试集 test_data 与 训练集的欧式距离 \"\"\"\n",
" train_size = self.train_data.shape[0] # 获取训练样本集的样本数量\n",
" ds = [] # 保存每个测试样本与训练样本的距离\n",
" for data in test_data:\n",
" diff = np.tile(data, [train_size, 1]) - self.train_data # 与训练样本集相减\n",
" diff_squared = diff ** 2 # 计算平方\n",
" distance_squared = diff_squared.sum(axis=1) # 计算平方和\n",
" distance = distance_squared ** 0.5 # 计算开方\n",
" ds.append(distance)\n",
" return np.array(ds)\n",
"\n",
" def predict(self, test_data):\n",
" \"\"\" 预测分类结果 \"\"\"\n",
" \n",
" distances = self.distance(test_data) # 计算距离\n",
" distances_sorted = distances.argsort() # 对索引进行排序\n",
" predicts = [] # 保存预测结果\n",
" for distance in distances_sorted: # 遍历每一个样本的距离，找出最小的 k 个\n",
" predict_dict = {} # 保存预测结果\n",
" for i in range(self.k): # 统计前 k 个最小距离中，每个分类的数量\n",
" key = self.train_target[distance[i]]\n",
" predict_dict[key] = predict_dict.get(key, 0) + 1\n",
"            sorted_predict_dict = sorted(predict_dict.items(), key=operator.itemgetter(1), reverse=True)\n",
" predict = sorted_predict_dict[0][0]\n",
" predicts.append(predict)\n",
" return np.array(predicts)\n",
" \n",
"kNN = KNNClassifier(k=3)\n",
"kNN.fit(train_data, train_target)\n",
"predicts = kNN.predict(test_data)\n",
"print(predicts)"
]
},
{
"cell_type": "markdown",
"source": [
"## 评估模型 - Python"
]
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Test accuracy: 0.9666666666666667\n"
]
}
],
"source": [
"correct = (predicts == test_target).astype(float)\n",
"accuracy = np.mean(correct)\n",
"print(\"Test accuracy: \", accuracy)"
]
},
{
"cell_type": "markdown",
"source": [
"## 训练模型 - sklearn"
]
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1 0 1 1 0 1 2 2 0 1 2 2 0 2 0 1 2 2 1 2 1 1 2 2 0 1 1 0 1 2]\n"
]
}
],
"source": [
"from sklearn.neighbors import KNeighborsClassifier\n",
"\n",
"kNN = KNeighborsClassifier(n_neighbors=3)\n",
"kNN.fit(train_data, train_target)\n",
"predicts = kNN.predict(test_data)\n",
"print(predicts)"
]
},
{
"cell_type": "markdown",
"source": [
"## 评估模型 - sklearn"
]
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Test accuracy: 0.9666666666666667\n"
]
}
],
"source": [
"accuracy = kNN.score(test_data, test_target)\n",
"print(\"Test accuracy: \", accuracy)"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
},
"toc": {
"base_numbering": 1,
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}