## Cynhard85 / MachineLearningTutorial

liuxinyang 提交于 2018-07-06 16:43 . modify names
{
"cells": [
{
"cell_type": "markdown",
"source": [
"# 过拟合与欠拟合"
]
},
{
"cell_type": "code",
"execution_count": 129,
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 130,
"outputs": [],
"source": [
"# Synthetic dataset: m evenly spaced inputs on [XMin, XMax] whose targets follow\n",
"# 0.8*x^2 + 2*x + 4 plus Gaussian noise scaled by 2.5.\n",
"XMin, XMax = 0, 8\n",
"m, train_size = 16, 12\n",
"test_size = m - train_size\n",
"\n",
"# Seed the global NumPy RNG so the noise (and every later figure) is reproducible.\n",
"np.random.seed(42)\n",
"X = np.linspace(XMin, XMax, m).reshape(-1, 1)\n",
"y = 0.8 * X ** 2 + 2 * X + 4  + 2.5 * np.random.randn(m, 1)\n",
"\n",
"# Ordered split: the first train_size rows train the model and the remaining\n",
"# rows (the largest x values) are held out as the test set.\n",
"X_train, X_test = X[:train_size], X[train_size:]\n",
"y_train, y_test = y[:train_size], y[train_size:]"
]
},
{
"cell_type": "code",
"execution_count": 131,
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.figure.Figure at 0x23a1ab854a8>"
]
},
"output_type": "display_data"
}
],
"source": [
"# Show the train/test split: training samples as blue dots, test samples as\n",
"# green squares.\n",
"plt.figure(figsize=(7, 5))\n",
"plt.title(\"Dataset\", fontsize='x-large')\n",
"\n",
"plt.plot(X_train, y_train, 'b.', markersize=10, label=\"Training Set\")\n",
"plt.plot(X_test, y_test, 'gs', label=\"Test Set\")\n",
"\n",
"plt.legend(loc='upper left', fontsize='x-large')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 132,
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.figure.Figure at 0x23a1a5d4ac8>"
]
},
"output_type": "display_data"
}
],
"source": [
"# NOTE: `degree` is the number of polynomial terms, i.e. the design matrix has\n",
"# the columns x^0 .. x^(degree-1), so the fitted curve has order degree - 1.\n",
"degrees = (2, 3, 10)\n",
"\n",
"# Compute theta for every model with the normal equation\n",
"# theta = (X^T X)^-1 X^T y.\n",
"thetas = []\n",
"for degree in degrees:\n",
"    X_b = np.column_stack([X_train ** power for power in range(degree)])\n",
"    theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)\n",
"    thetas.append(theta)\n",
"\n",
"# Plot: one subplot per model, fitted curve drawn over the training samples.\n",
"X_plot = np.linspace(XMin, XMax, 1000)\n",
"plt.figure(figsize=(6*len(thetas), 4))\n",
"for index, degree in enumerate(degrees):\n",
"    X_b_plot = np.column_stack([X_plot ** power for power in range(degree)])\n",
"    y_plot = X_b_plot.dot(thetas[index])\n",
"    plt.subplot(1, len(thetas), index + 1)\n",
"    plt.plot(X_train, y_train, 'b.', markersize=10, label=\"Training Set\")\n",
"    plt.plot(X_plot, y_plot, 'r-', label=\"degree={}\".format(degree))\n",
"    plt.legend(loc='upper left', fontsize='x-large')\n",
"    plt.ylim(-5, 70)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 133,
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.figure.Figure at 0x23a1aa2cd30>"
]
},
"output_type": "display_data"
}
],
"source": [
"# Redraw the fitted curves from `thetas`, this time with the held-out test\n",
"# samples overlaid so the quality of the extrapolation is visible.\n",
"# NOTE: `degree` counts polynomial terms (columns x^0 .. x^(degree-1)).\n",
"X_plot = np.linspace(XMin, XMax, 1000)\n",
"plt.figure(figsize=(6*len(thetas), 4))\n",
"for index, degree in enumerate(degrees):\n",
"    X_b_plot = np.column_stack([X_plot ** power for power in range(degree)])\n",
"    y_plot = X_b_plot.dot(thetas[index])\n",
"    plt.subplot(1, len(thetas), index + 1)\n",
"    plt.plot(X_train, y_train, 'b.', markersize=10, label=\"Training Set\")\n",
"    plt.plot(X_test, y_test, 'gs', label=\"Test Set\")\n",
"    plt.plot(X_plot, y_plot, 'r-', label=\"degree={}\".format(degree))\n",
"    plt.legend(loc='upper left', fontsize=14)\n",
"    plt.ylim(-5, 75)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 134,
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.figure.Figure at 0x23a1a8971d0>"
]
},
"output_type": "display_data"
}
],
"source": [
"# Training and test MSE as the model grows more complex.\n",
"# NOTE: `degree` counts polynomial terms (columns x^0 .. x^(degree-1)), so the\n",
"# fitted polynomial has order degree - 1 and degree 1 is the constant model.\n",
"# The sweep starts at 1: a degree of 0 would build the same constant-only\n",
"# design matrix as degree 1 and only duplicate that point on the curve.\n",
"degrees_plot = range(1, 10)\n",
"train_mses = []\n",
"test_mses = []\n",
"for degree in degrees_plot:\n",
"    X_train_b = np.column_stack([X_train ** power for power in range(degree)])\n",
"    X_test_b = np.column_stack([X_test ** power for power in range(degree)])\n",
"    # Normal equation: theta = (X^T X)^-1 X^T y, fitted on the training set only.\n",
"    theta = np.linalg.inv(X_train_b.T.dot(X_train_b)).dot(X_train_b.T).dot(y_train)\n",
"    y_train_pred = X_train_b.dot(theta)\n",
"    train_mses.append(np.mean(np.square(y_train_pred - y_train)))\n",
"    y_test_pred = X_test_b.dot(theta)\n",
"    test_mses.append(np.mean(np.square(y_test_pred - y_test)))\n",
"\n",
"plt.plot(degrees_plot, train_mses, 'rs-', label=\"Training MSE\")\n",
"plt.plot(degrees_plot, test_mses, 'b.-', label=\"Test MSE\")\n",
"plt.xlabel(\"Degrees\")\n",
"plt.ylabel(\"MSE\")\n",
"# Clip the y axis so a very large error does not flatten the rest of the curve.\n",
"plt.ylim(0,100)\n",
"plt.xticks(degrees_plot)\n",
"plt.legend(loc='upper left')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 135,
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.figure.Figure at 0x23a1c2020f0>"
]
},
"output_type": "display_data"
}
],
"source": [
"# Effect of the training-set size: fit the same quadratic model on the first\n",
"# 3, 6 and 9 samples and show the remaining samples as the test set.\n",
"# NOTE: this cell rebinds train_size, X_train, y_train, X_test and y_test.\n",
"plt.figure(figsize=(18, 4))\n",
"for index, train_size in enumerate((3, 6, 9)):\n",
"    X_train, X_test = X[:train_size], X[train_size:]\n",
"    y_train, y_test = y[:train_size], y[train_size:]\n",
"    # Design matrix [1, x, x^2] and the normal-equation solution.\n",
"    X_b = np.column_stack([np.ones(train_size), X_train, X_train ** 2])\n",
"    theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)\n",
"    # Evaluate the fitted curve slightly beyond the data range.\n",
"    X_plot = np.linspace(-0.2, 8.3, 1000)\n",
"    X_b_plot = np.column_stack([np.ones(1000), X_plot, X_plot ** 2])\n",
"    y_plot = X_b_plot.dot(theta)\n",
"    plt.subplot(1, 3, index + 1)\n",
"    plt.plot(X_train, y_train, 'b.', markersize=10, label=\"Training Set\")\n",
"    plt.plot(X_test, y_test, 'gs', label=\"Test Set\")\n",
"    plt.plot(X_plot, y_plot, 'r-', label=\"degree=2\")\n",
"    plt.xlim(-1, 8.5)\n",
"    plt.ylim(-15, 160)\n",
"    plt.legend(loc='upper left', fontsize=14)\n",
"    plt.title(\"Training Samples: {}\".format(train_size), fontsize=14)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 136,
"outputs": [
{
"data": {
"text/plain": [
]
},
"output_type": "display_data"
}
],
"source": [
"# Effect of the noise level: regenerate the targets with noise scaled by\n",
"# 3, 9 and 27 and fit the same quadratic model to the first train_size samples.\n",
"# NOTE: this cell rebinds m, X, y, train_size and the train/test splits.\n",
"m = 10\n",
"X = np.linspace(0, 8, m).reshape(-1, 1)\n",
"train_size = 8\n",
"plt.figure(figsize=(16, 4))\n",
"np.random.seed(42)\n",
"for index in range(3):\n",
"    sigma = 3 ** (index+1)\n",
"    y = 3 * X ** 2 - 4 * X + 5 + sigma * np.random.randn(m, 1)\n",
"    X_train = X[0:train_size]\n",
"    y_train = y[0:train_size]\n",
"    X_test = X[train_size:]\n",
"    y_test = y[train_size:]\n",
"    # Design matrix [1, x, x^2] and the normal-equation solution.\n",
"    X_b = np.c_[np.ones([train_size, 1]), X_train, X_train**2]\n",
"    theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)\n",
"    X_plot = np.linspace(-0.2, 8.1, 1000)\n",
"    X_b_plot = np.c_[np.ones([1000, 1]), X_plot, X_plot**2]\n",
"    plt.subplot(1, 3, index+1)\n",
"    plt.plot(X_train, y_train, 'b.', markersize=10, label=\"Training Set\")\n",
"    plt.plot(X_test, y_test, 'gs', label=\"Test Set\")\n",
"    y_plot = X_b_plot.dot(theta)\n",
"    plt.plot(X_plot, y_plot, 'r-', label=\"degree=2\")\n",
"    plt.legend(loc='upper left', fontsize=14)\n",
"    # Raw string: \"\\s\" is not a valid escape sequence in a normal string literal.\n",
"    plt.title(r\"$\\sigma={}$\".format(sigma), fontsize=14)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"source": [
"# 交叉验证"
]
},
{
"cell_type": "code",
"execution_count": 137,
"outputs": [],
"source": [
"# Data for cross-validation: noisy samples of 0.8*x^2 + 2*x + 4.\n",
"# X_value / y_value hold all m samples; X / y hold only the first train_size\n",
"# rows (later split into folds) and the remaining rows form the test set.\n",
"XMin, XMax = 0, 8\n",
"m, train_size = 16, 12\n",
"test_size = m - train_size\n",
"\n",
"# Seed the global NumPy RNG so the noise is reproducible.\n",
"np.random.seed(42)\n",
"X_value = np.linspace(XMin, XMax, m).reshape(-1, 1)\n",
"y_value = 0.8 * X_value ** 2 + 2 * X_value + 4  + 2.5 * np.random.randn(m, 1)\n",
"\n",
"X, X_test = X_value[:train_size], X_value[train_size:]\n",
"y, y_test = y_value[:train_size], y_value[train_size:]"
]
},
{
"cell_type": "code",
"execution_count": 138,
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.figure.Figure at 0x23a1c3e7860>"
]
},
"output_type": "display_data"
}
],
"source": [
"# Show the split used for cross-validation. Only X / y (the first train_size\n",
"# rows) are training data; plotting X_value / y_value here would also draw the\n",
"# held-out test rows under the \"Training Set\" label.\n",
"plt.figure(figsize=(7, 5))\n",
"\n",
"plt.plot(X, y, 'b.', markersize=10, label=\"Training Set\")\n",
"plt.plot(X_test, y_test, 'gs', label=\"Test Set\")\n",
"plt.legend(loc='upper left', fontsize='x-large')\n",
"plt.title(\"Dataset\", fontsize='x-large')\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 139,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"degree: 1 total mse: 1004.8243901991978\n",
"degree: 2 total mse: 78.71016157762188\n",
"degree: 3 total mse: 14.505611368548402\n",
"degree: 4 total mse: 88.8396712174888\n",
"degree: 5 total mse: 1307.6732551728437\n",
"degree: 6 total mse: 20001.38512177253\n",
"degree: 7 total mse: 18480.563606888358\n",
"degree: 8 total mse: 10350128.013801835\n",
"degree: 9 total mse: 63201404.49438538\n",
"degree: 10 total mse: 121739968.04381178\n"
]
},
{
"data": {
"text/plain": [
"<matplotlib.figure.Figure at 0x23a1c149208>"
]
},
"output_type": "display_data"
}
],
"source": [
"# K-fold cross-validation over model complexity: the training data is cut into\n",
"# `folders` contiguous folds of `validation_size` rows; each fold is held out\n",
"# once and the validation MSEs of all folds are summed per model.\n",
"# NOTE: `degree` counts polynomial terms (columns x^0 .. x^(degree-1)), so the\n",
"# fitted polynomial has order degree - 1.\n",
"validation_size = 3\n",
"folders = train_size // validation_size\n",
"real_train_size = train_size - validation_size\n",
"total_mse_validation_coll = []\n",
"for degree in range(1, 11):\n",
"    total_mse_validation = 0.0\n",
"    for folder in range(folders):\n",
"        # Split into a training part and a validation fold.\n",
"        folder_indices = np.arange(folder*validation_size, (folder+1)*validation_size)\n",
"        X_validation = X[folder_indices]\n",
"        y_validation = y[folder_indices]\n",
"        # Remove the validation rows by index so every (x, y) pair stays aligned.\n",
"        # np.setdiff1d works on values and returns them sorted and de-duplicated,\n",
"        # which would reorder y independently of X.\n",
"        X_train = np.delete(X, folder_indices, axis=0)\n",
"        y_train = np.delete(y, folder_indices, axis=0)\n",
"        # Build the design matrices of the training part and the validation fold.\n",
"        X_train_b = np.ones([real_train_size, 1])\n",
"        X_validation_b = np.ones([validation_size, 1])\n",
"        for p in range(degree - 1):\n",
"            X_train_b = np.column_stack([X_train_b, X_train ** (p+1)])\n",
"            X_validation_b = np.column_stack([X_validation_b, X_validation ** (p+1)])\n",
"        # Train the model with the normal equation: theta = (X^T X)^-1 X^T y.\n",
"        theta = np.linalg.inv(X_train_b.T.dot(X_train_b)).dot(X_train_b.T).dot(y_train)\n",
"        y_pred_validation = X_validation_b.dot(theta)\n",
"        mse_validation = np.mean(np.square(y_pred_validation - y_validation))\n",
"        total_mse_validation += mse_validation\n",
"    print(\"degree:\", degree, \"total mse:\", total_mse_validation)\n",
"    total_mse_validation_coll.append(total_mse_validation)\n",
"\n",
"fig, ax1 = plt.subplots(figsize=(8,6))\n",
"ax1.plot(range(1, 11), total_mse_validation_coll, 'bs-')\n",
"ax1.set_xlabel(\"Degrees\")\n",
"ax1.set_ylabel(\"Total validation MSE\")\n",
"# Inset axes (figure-fraction coordinates) that zoom in on the low-degree end\n",
"# of the same curve, which the large errors of the high degrees flatten.\n",
"left,bottom,width,height = [0.23, 0.37, 0.35, 0.35]\n",
"ax2 = fig.add_axes([left, bottom, width, height])\n",
"ax2.plot(range(1, 11), total_mse_validation_coll, 'bs-')\n",
"ax2.set_xlim(0, 6)\n",
"ax2.set_xticks(range(0,6))\n",
"ax2.set_ylim(-100, 1400)\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
},
"toc": {
"base_numbering": 1,
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}