## Cynhard85 / MachineLearningTutorial

Cynhard 提交于 2018-07-08 11:44 . write a little
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import numpy as np\n",
"from sklearn.linear_model import LogisticRegression"
]
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"def plot_dataset(splitline=False):\n",
"    \"\"\"Plot a synthetic two-class dataset, optionally with a fitted boundary.\n",
"\n",
"    Draws 100 points uniformly from [0, 10) x [0, 10) with a fixed seed and\n",
"    labels a point 1 when its two coordinates sum to more than 10, else 0.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    splitline : bool, default False\n",
"        When True, fit a LogisticRegression model to the points and draw its\n",
"        decision boundary as a red line.\n",
"    \"\"\"\n",
"    # Reseeds NumPy's global generator so every call draws the same points.\n",
"    np.random.seed(42)\n",
"    x = 10 * np.random.rand(100, 2)\n",
"    # `np.int` was removed in NumPy 1.24; the builtin `int` is the replacement.\n",
"    y = (x[:,0] + x[:,1] > 10).astype(int)\n",
"\n",
"    plt.figure(figsize=(15,6))\n",
"    plt.plot(x[y==1][:,0], x[y==1][:,1], 'g^', markersize=14)\n",
"    plt.plot(x[y==0][:,0], x[y==0][:,1], 'bs', markersize=14)\n",
"    if splitline:\n",
"        # Only fit the model when the boundary is actually drawn.\n",
"        log_reg = LogisticRegression(C=10**10, random_state=42)\n",
"        log_reg.fit(x, y)\n",
"\n",
"        # Boundary: coef0 * x0 + coef1 * x1 + intercept = 0, solved for x1.\n",
"        left_right = np.array([0, 10])\n",
"        boundary = -(log_reg.coef_[0][0] * left_right + log_reg.intercept_[0]) / log_reg.coef_[0][1]\n",
"        plt.plot(left_right, boundary, 'r-', linewidth=2)\n",
"    plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.figure.Figure at 0x24559802cf8>"
]
},
"output_type": "display_data"
}
],
"source": [
"plot_dataset()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.figure.Figure at 0x2455a563780>"
]
},
"output_type": "display_data"
}
],
"source": [
"plot_dataset(splitline=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.figure.Figure at 0x2455f3b0a20>"
]
},
"output_type": "display_data"
}
],
"source": [
"# Sample probabilities and map each one to its log-odds, log(p / (1 - p)).\n",
"y_pt = np.array([0.07, 0.1, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 0.93])\n",
"odds = y_pt / (1 - y_pt)\n",
"x_pt = np.log(odds)\n",
"\n",
"plt.figure(figsize=(15,6))\n",
"plt.plot(x_pt, y_pt, 'bs-', markerfacecolor='r', markeredgecolor='r')\n",
"# Dashed guide lines at probabilities 1, 0.5 and 0.\n",
"for level in (1.0, 0.5, 0.0):\n",
"    plt.hlines(level, -6.0, 6.0, linestyle='dashed', alpha=0.5)\n",
"plt.yticks(np.linspace(0.0,1.0,11))\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.figure.Figure at 0x2455f39eb00>"
]
},
"output_type": "display_data"
}
],
"source": [
"x = np.linspace(-6, 6, 1000)\n",
"y = 1 / (1 + np.exp(-x))\n",
"plt.figure(figsize=(18,4))\n",
"\n",
"# Left panel: the smooth sigmoid curve. Right panel: the sampled points\n",
"# (x_pt, y_pt) computed in the previous cell.\n",
"panels = (\n",
"    (121, x, y, 'b-', {}),\n",
"    (122, x_pt, y_pt, 'bs-', {'markerfacecolor': 'r', 'markeredgecolor': 'r'}),\n",
")\n",
"for position, xs, ys, fmt, marker_kwargs in panels:\n",
"    plt.subplot(position)\n",
"    plt.plot(xs, ys, fmt, **marker_kwargs)\n",
"    # Dashed guide lines at probabilities 1, 0.5 and 0.\n",
"    for level in (1.0, 0.5, 0.0):\n",
"        plt.hlines(level, -6.0, 6.0, linestyle='dashed', alpha=0.5)\n",
"    plt.yticks(np.linspace(0.0,1.0,11))\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"source": [
"# 示例"
]
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iris Plants Database\n",
"====================\n",
"\n",
"Notes\n",
"-----\n",
"Data Set Characteristics:\n",
"    :Number of Instances: 150 (50 in each of three classes)\n",
"    :Number of Attributes: 4 numeric, predictive attributes and the class\n",
"    :Attribute Information:\n",
"        - sepal length in cm\n",
"        - sepal width in cm\n",
"        - petal length in cm\n",
"        - petal width in cm\n",
"        - class:\n",
"                - Iris-Setosa\n",
"                - Iris-Versicolour\n",
"                - Iris-Virginica\n",
"    :Summary Statistics:\n",
"\n",
"    ============== ==== ==== ======= ===== ====================\n",
"                    Min  Max   Mean    SD   Class Correlation\n",
"    ============== ==== ==== ======= ===== ====================\n",
"    sepal length:   4.3  7.9   5.84   0.83    0.7826\n",
"    sepal width:    2.0  4.4   3.05   0.43   -0.4194\n",
"    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)\n",
"    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)\n",
"    ============== ==== ==== ======= ===== ====================\n",
"\n",
"    :Missing Attribute Values: None\n",
"    :Class Distribution: 33.3% for each of 3 classes.\n",
"    :Creator: R.A. Fisher\n",
"    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n",
"    :Date: July, 1988\n",
"\n",
"This is a copy of UCI ML iris datasets.\n",
"http://archive.ics.uci.edu/ml/datasets/Iris\n",
"\n",
"The famous Iris database, first used by Sir R.A Fisher\n",
"\n",
"This is perhaps the best known database to be found in the\n",
"pattern recognition literature.  Fisher's paper is a classic in the field and\n",
"is referenced frequently to this day.  (See Duda & Hart, for example.)  The\n",
"data set contains 3 classes of 50 instances each, where each class refers to a\n",
"type of iris plant.  One class is linearly separable from the other 2; the\n",
"latter are NOT linearly separable from each other.\n",
"\n",
"References\n",
"----------\n",
"   - Fisher,R.A. \"The use of multiple measurements in taxonomic problems\"\n",
"     Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to\n",
"     Mathematical Statistics\" (John Wiley, NY, 1950).\n",
"   - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.\n",
"     (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.\n",
"   - Dasarathy, B.V. (1980) \"Nosing Around the Neighborhood: A New System\n",
"     Structure and Classification Rule for Recognition in Partially Exposed\n",
"     Environments\".  IEEE Transactions on Pattern Analysis and Machine\n",
"     Intelligence, Vol. PAMI-2, No. 1, 67-71.\n",
"   - Gates, G.W. (1972) \"The Reduced Nearest Neighbor Rule\".  IEEE Transactions\n",
"     on Information Theory, May 1972, 431-433.\n",
"   - See also: 1988 MLC Proceedings, 54-64.  Cheeseman et al\"s AUTOCLASS II\n",
"     conceptual clustering system finds 3 classes in the data.\n",
"   - Many, many more ...\n",
"\n"
]
}
],
"source": [
"from sklearn.datasets import load_iris\n",
"\n",
"# Load the dataset into the module-level `iris` that the following cells read.\n",
"iris = load_iris()\n",
"print(iris.DESCR)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.figure.Figure at 0x2455f555d30>"
]
},
"output_type": "display_data"
}
],
"source": [
"def plot_iris(iris_data):\n",
"    \"\"\"Scatter petal length against petal width for iris classes 1 and 2.\n",
"\n",
"    Rows of ``iris_data`` are selected with the module-level ``iris.target``\n",
"    and labelled with ``iris.target_names``. A new figure is created and not\n",
"    shown, so the caller can draw on it before calling ``plt.show()``.\n",
"    \"\"\"\n",
"    plt.figure(figsize=(15, 8))\n",
"    class_styles = ((1, 'bs'), (2, 'g^'))\n",
"    for class_id, marker in class_styles:\n",
"        rows = iris_data[iris.target == class_id]\n",
"        # Columns 2 and 3 hold petal length and petal width respectively.\n",
"        petal_length, petal_width = rows[:,2], rows[:,3]\n",
"        plt.plot(petal_length, petal_width, marker, label=iris.target_names[class_id])\n",
"    plt.xticks(fontsize=18)\n",
"    plt.xlabel(\"petal length\", fontsize=18)\n",
"    plt.yticks(fontsize=18)\n",
"    plt.ylabel(\"petal width\", fontsize=18)\n",
"    plt.legend(loc='best', fontsize=18)\n",
"\n",
"plot_iris(iris.data)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [],
"source": [
"def sigmoid(z):\n",
"    \"\"\"Return the logistic function 1 / (1 + exp(-z)), applied element-wise.\"\"\"\n",
"    return 1 / (1 + np.exp(-z))\n",
"\n",
"# Features: petal length and petal width (columns 2 and 3 of the data).\n",
"x = iris.data[:,(2,3)]\n",
"# Binary column-vector target: 1 where the class index is 2, else 0.\n",
"# `np.int` was removed in NumPy 1.24; the builtin `int` is the replacement.\n",
"y = (iris.target == 2).astype(int).reshape(-1, 1)\n",
"m = x.shape[0]\n",
"# Prepend a column of ones so that theta[0] acts as the intercept.\n",
"X = np.c_[np.ones([m, 1]), x]\n",
"theta = np.random.randn(3, 1)\n",
"\n",
"epoches = 10000\n",
"eta = 0.01\n",
"\n",
"# Batch update over all samples: step theta along X^T (y - sigmoid(X theta)).\n",
"for epoch in range(epoches):\n",
"    theta += eta * X.T.dot(y - sigmoid(X.dot(theta)))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.figure.Figure at 0x2455f2d71d0>"
]
},
"output_type": "display_data"
}
],
"source": [
"# Unpack the fitted parameters: intercept first, then one weight per petal feature.\n",
"bias, w_length, w_width = theta.ravel()\n",
"\n",
"# Boundary where bias + w_length * length + w_width * width = 0, solved for width.\n",
"left_right = np.array([2.9, 7])\n",
"boundary = -(w_length * left_right + bias) / w_width\n",
"\n",
"plot_iris(iris.data)\n",
"plt.plot(left_right, boundary, 'r-')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.figure.Figure at 0x2455f596cf8>"
]
},
"output_type": "display_data"
}
],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"# Features: petal length and petal width (columns 2 and 3 of the data).\n",
"x = iris.data[:,(2,3)]\n",
"# Binary target: 1 where the class index is 2, else 0.\n",
"# `np.int` was removed in NumPy 1.24; the builtin `int` is the replacement.\n",
"y = (iris.target == 2).astype(int)\n",
"\n",
"# C is the inverse regularization strength, so a huge C makes the penalty negligible.\n",
"log_reg = LogisticRegression(C=10**10)\n",
"\n",
"log_reg.fit(x, y)\n",
"\n",
"# Boundary: coef0 * length + coef1 * width + intercept = 0, solved for width.\n",
"left_right = np.array([2.9, 7])\n",
"boundary = -(log_reg.coef_[0][0] * left_right + log_reg.intercept_[0]) / log_reg.coef_[0][1]\n",
"\n",
"plot_iris(iris.data)\n",
"plt.plot(left_right, boundary, 'r-')\n",
"plt.show()"
]
}
],
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
},
"toc": {
"base_numbering": 1,
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}