# Exercises for Lecture 17 (Ensemble Learning and Random Forests)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.model_selection import train_test_split

## Exercise 1: Early stopping

### Set up mock data

In [None]:
# Mock data set: a noisy quadratic function, y = 3*x^2 + Gaussian noise.
# Seed NumPy's global RNG first so the draws below are reproducible.
np.random.seed(42)
# 100 samples of a single feature, shifted to lie in [-0.5, 0.5).
X = np.random.rand(100, 1) - 0.5
# Targets: quadratic in the feature plus Gaussian noise scaled by 0.05.
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

# Split into training and validation sets (default split proportions; no
# explicit random_state is passed to train_test_split).
X_train, X_val, y_train, y_val = train_test_split(X, y)

### Train with many trees

In [None]:
from sklearn.metrics import mean_squared_error

# Deliberately fit a large ensemble so that the validation error can later
# be inspected at every intermediate number of trees.
n_estimators = 300
gbrt = GradientBoostingRegressor(
    max_depth=2,  # depth limit for each individual tree
    n_estimators=n_estimators,  # total number of boosting stages to fit
    learning_rate=0.1, # Set a low learning rate here
    random_state=42)  # fixed seed passed to the estimator

# Fit on the training split only; the validation split is held out.
gbrt.fit(X_train, y_train)

### Compute and plot the validation error at each intermediate number of trees

In [None]:
# Validation MSE after each boosting stage: staged_predict yields the
# predictions on X_val one stage at a time, and each is scored against y_val.
errors = [
    mean_squared_error(y_val, stage_pred)
    for stage_pred in gbrt.staged_predict(X_val)
]

In [None]:
# Plot the validation MSE against the number of trees in the ensemble.
plt.figure(figsize=(11, 4))
# errors[i] is the error of the ensemble with i + 1 trees, so plot against
# 1..len(errors) rather than the default 0-based index.
plt.plot(np.arange(1, len(errors) + 1), errors, "b.-")
# Take the x-limit from the data instead of hard-coding 300, so the plot
# stays correct if the number of boosting stages is changed.
plt.axis([0, len(errors), 0, 0.01])
plt.xlabel("Number of trees")
plt.ylabel("MSE")
plt.title("Validation error", fontsize=14)

### Training a better model with fewer trees

- Find the number of trees that minimises the validation error, and mark it on the validation-error plot.
- Train a new GBRT using the optimal number of trees from above.
- Plot predictions of the original and best models.


In [None]:
def plot_predictions(
    regressors,
    X,
    y,
    axes,
    label=None,
    style="r-",
    data_style="b.",
    data_label=None,
):
    """Plot data points together with the summed predictions of `regressors`.

    The predictions of every regressor are evaluated on 500 evenly spaced
    points between ``axes[0]`` and ``axes[1]`` and added together, so an
    additive ensemble can be drawn by passing all of its members.

    Parameters
    ----------
    regressors : iterable of fitted models exposing ``predict``.
    X : 2-D array; its first column is used as the x-coordinate of the data.
    y : target values plotted against ``X[:, 0]``.
    axes : sequence passed to ``plt.axis``; its first two entries are the
        x-range over which the prediction curve is evaluated.
    label : legend label for the prediction curve (optional).
    style : matplotlib format string for the prediction curve.
    data_style : matplotlib format string for the data points.
    data_label : legend label for the data points (optional).
    """
    x_low, x_high = axes[0], axes[1]
    grid = np.linspace(x_low, x_high, 500)
    grid_column = grid.reshape(-1, 1)  # predict() is given a column vector

    # Accumulate from 0, exactly as the built-in sum() would, adding each
    # model's predictions in iteration order.
    combined_pred = 0
    for model in regressors:
        combined_pred = combined_pred + model.predict(grid_column)

    plt.plot(X[:, 0], y, data_style, label=data_label)
    plt.plot(grid, combined_pred, style, linewidth=2, label=label)

    # Only draw a legend when there is at least one label to show.
    if label or data_label:
        plt.legend(loc="upper center", fontsize=16)

    plt.axis(axes)