{"cells": [{"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["# Exercises for Lecture 16 (Decisions Trees)"]}, {"cell_type": "code", "execution_count": null, "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "outputs": [], "source": ["from sklearn.datasets import load_iris\n", "from sklearn.tree import DecisionTreeClassifier \n", "from sklearn.tree import export_graphviz\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from matplotlib.colors import ListedColormap"]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["## Exercise 1: Train a decision tree classifier on the dual moons data"]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["### Read in example dual moons data"]}, {"cell_type": "code", "execution_count": null, "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "outputs": [], "source": ["from sklearn.datasets import make_moons\n", "\n", "X_moons, y_moons = make_moons(n_samples=150, noise=0.2, random_state=42)"]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["### Train a decision tree classifier with default hyperparameters"]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["### Plot decision boundaries"]}, {"cell_type": "code", "execution_count": null, "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "outputs": [], "source": ["def plot_decision_boundary(clf, X, y, axes, cmap):\n", " x1, x2 = np.meshgrid(np.linspace(axes[0], axes[1], 100),\n", " np.linspace(axes[2], axes[3], 100))\n", " X_new = np.c_[x1.ravel(), x2.ravel()]\n", " y_pred = clf.predict(X_new).reshape(x1.shape)\n", " \n", " plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=cmap)\n", " plt.contour(x1, x2, y_pred, cmap=\"Greys\", alpha=0.8)\n", " colors = {\"Wistia\": [\"#78785c\", \"#c47b27\"], \"Pastel1\": [\"red\", \"blue\"]}\n", " markers = (\"o\", \"^\")\n", " for idx in (0, 1):\n", " plt.plot(X[:, 0][y == idx], X[:, 1][y == idx],\n", " color=colors[cmap][idx], marker=markers[idx], linestyle=\"none\")\n", " plt.axis(axes)\n", " plt.xlabel(r\"$x_1$\")\n", " plt.ylabel(r\"$x_2$\", rotation=0)"]}, {"cell_type": "code", "execution_count": null, "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "outputs": [], "source": ["plot_decision_boundary(tree_clf1, X_moons, y_moons,\n", " axes=[-1.5, 2.4, -1, 1.5], cmap=\"Wistia\")\n", "plt.title(\"No restrictions\");"]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["You probably found the decision tree was overfitted."]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["### Regularise model"]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["Now train a new decision tree but by regularising it by setting appropriate hyperparameters."]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["### Plot decision boundaries"]}, {"cell_type": "code", "execution_count": null, "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "outputs": [], "source": 
["plot_decision_boundary(tree_clf2, X_moons, y_moons,\n", " axes=[-1.5, 2.4, -1, 1.5], cmap=\"Wistia\")\n", "plt.title(f\"min_samples_leaf = {tree_clf2.min_samples_leaf}\")\n", "plt.ylabel(\"\")"]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["Hopefully your revised model no longer overfits and will generalise better to unseen data."]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["### Evaluate performance of the two models on unseen data"]}, {"cell_type": "code", "execution_count": null, "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "outputs": [], "source": ["X_moons_test, y_moons_test = make_moons(n_samples=1000, noise=0.2,\n", " random_state=43)"]}, {"cell_type": "code", "execution_count": null, "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "outputs": [], "source": ["tree_clf1.score(X_moons_test, y_moons_test)"]}, {"cell_type": "code", "execution_count": null, "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "outputs": [], "source": ["tree_clf2.score(X_moons_test, y_moons_test)"]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["Hopefully your regularised model performs better."]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["## Exercise 2: Train a decision tree regressor on quadratic data"]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["### Set up mock data"]}, {"cell_type": "code", "execution_count": null, "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "outputs": [], "source": ["import numpy as np\n", "from sklearn.tree import DecisionTreeRegressor\n", "\n", "np.random.seed(42)\n", "X_quad = np.random.rand(200, 1) - 0.5 # a single random input feature\n", "y_quad = X_quad ** 2 + 0.025 * np.random.randn(200, 1)"]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["### Train a decision tree regressor with default hyperparameters"]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["### Make predictions"]}, {"cell_type": "code", "execution_count": null, "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "outputs": [], "source": ["x1 = np.linspace(-0.5, 0.5, 500).reshape(-1, 1)\n", "y_pred1 = tree_reg1.predict(x1)"]}, {"cell_type": "code", "execution_count": null, "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "outputs": [], "source": ["plt.plot(X_quad, y_quad, \"b.\")\n", "plt.plot(x1, y_pred1, \"r.-\", linewidth=2, label=r\"$\\hat{y}$\")\n", "plt.axis([-0.5, 0.5, -0.05, 0.25])\n", "plt.xlabel(\"$x_1$\")\n", "plt.ylabel(\"$y$\", rotation=0)\n", "plt.legend(loc=\"upper center\")\n", "plt.title(\"No restrictions\");"]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["### Regularise model"]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["Now train a new decision tree but by regularising it by setting appropriate hyperparameters."]}, {"cell_type": "code", "execution_count": null, "metadata": {"editable": true, 
"slideshow": {"slide_type": ""}, "tags": []}, "outputs": [], "source": ["tree_reg2 = DecisionTreeRegressor(random_state=42, min_samples_leaf=10)\n", "tree_reg2.fit(X_quad, y_quad)"]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["### Make predictions"]}, {"cell_type": "code", "execution_count": null, "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "outputs": [], "source": ["y_pred2 = tree_reg2.predict(x1)"]}, {"cell_type": "code", "execution_count": null, "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "outputs": [], "source": ["plt.plot(X_quad, y_quad, \"b.\")\n", "plt.plot(x1, y_pred2, \"r.-\", linewidth=2, label=r\"$\\hat{y}$\")\n", "plt.axis([-0.5, 0.5, -0.05, 0.25])\n", "plt.xlabel(\"$x_1$\")\n", "plt.ylabel(\"$y$\", rotation=0)\n", "plt.legend(loc=\"upper center\")\n", "plt.title(\"No restrictions\");"]}, {"cell_type": "markdown", "metadata": {"editable": true, "slideshow": {"slide_type": ""}, "tags": []}, "source": ["Hopefully your revised model no longer overfits and will generalise better to unseen data."]}], "metadata": {"celltoolbar": "Tags", "kernelspec": {"display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.11"}}, "nbformat": 4, "nbformat_minor": 4} |