Tune a Random Forest Regressor in a Pipeline
In this tutorial, we will predict NBA wins (W) from a number of team statistics.
Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
import numpy as np
from scipy.stats import pearsonr, shapiro
Import data
df = pd.read_csv('dt_NBA_reg.csv')
Save the name of the dependent variable (DV)
DV = 'W'
Dummy-code the categorical variables
final_data = pd.get_dummies(df, drop_first=True)
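To see what drop_first=True does, here is a small toy example (the conf column below is made up for illustration and is not part of the NBA data): each categorical column is expanded into one indicator column per level, and the first level is dropped as the reference category so the dummies are not perfectly collinear.
# Toy illustration only -- 'conf' is a hypothetical categorical column, not from the NBA data
toy = pd.DataFrame({'conf': ['East', 'West', 'East'], 'wins': [50, 42, 38]})
print(pd.get_dummies(toy, drop_first=True))  # keeps only 'conf_West'; 'East' becomes the reference level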
Save the IVs (X) and the DV (y), then split them into training and testing sets
X = final_data.drop(DV, axis = 1)
y = final_data[DV]
# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
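As an optional sanity check, you can confirm the roughly 67/33 split and that the feature matrices stay aligned with their targets:
# Optional sanity check on the split sizes (test_size=0.33 gives roughly a 67/33 split)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)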
Set up the steps for a pipeline
steps = [('scaler', StandardScaler()), ('Forest', RandomForestRegressor())]
Set up the pipeline
pipeline = Pipeline(steps)
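A note on why the scaler sits inside the pipeline: during the cross-validated search below, each split re-fits StandardScaler on that split's training folds only, so no information from the held-out fold leaks into the scaling. (Tree ensembles are insensitive to feature scaling, so the scaler is optional for a random forest, but it is harmless and matters if you later swap in a scale-sensitive estimator.) Here is a minimal, optional sketch of the fit-on-train-only behavior, using a throwaway variable name (scaler_demo):
# Optional sketch: the scaler is fit on the training data only and then reused, unchanged, on the test data
scaler_demo = StandardScaler().fit(X_train)
X_train_scaled = scaler_demo.transform(X_train)
X_test_scaled = scaler_demo.transform(X_test)
print(X_train_scaled.mean(axis=0).round(2))  # ~0 by construction for the training data
print(X_test_scaled.mean(axis=0).round(2))   # not exactly 0, because the scaler never saw the test data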
Specify the hyperparameter space
parameters = {'Forest__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
              'Forest__max_features': ['auto', 'sqrt', 'log2', None],  # note: 'auto' is deprecated/removed in newer scikit-learn versions
              'Forest__min_samples_split': [2, 5, 10],
              'Forest__min_samples_leaf': [1, 2, 4],
              'Forest__bootstrap': [True, False],
              'Forest__warm_start': [True, False]}
Instantiate the RandomizedSearchCV model
model = RandomizedSearchCV(pipeline, parameters, n_iter=10, scoring='explained_variance', cv=4)
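Note that the full grid above contains 10 × 4 × 3 × 3 × 2 × 2 = 1,440 candidate combinations; with n_iter=10, RandomizedSearchCV fits only 10 randomly sampled candidates (10 × 4 CV folds = 40 fits, plus one final refit on the full training set), which is why it is so much faster than an exhaustive GridSearchCV. A quick way to confirm the grid size:
# Size of the full hyperparameter grid vs. the 10 candidates sampled by RandomizedSearchCV
n_candidates = np.prod([len(v) for v in parameters.values()])
print('Full grid: {} candidates; randomly sampled: 10'.format(n_candidates))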
Time the tuning (start)
start_time = time.time()
Fit to the training set
model.fit(X_train, y_train)
Time the tuning (end)
elapsed_time = (time.time() - start_time)/60
print('Time to tune the model: {} min.'.format(elapsed_time))
## Time to tune the model: 1.9312451163927713 min.
Print the tuned parameters
print('Tuned Model Parameters: {}'.format(model.best_params_))
## Tuned Model Parameters: {'Forest__warm_start': True, 'Forest__n_estimators': 1400, 'Forest__min_samples_split': 5, 'Forest__min_samples_leaf': 1, 'Forest__max_features': None, 'Forest__bootstrap': True}
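Besides best_params_, the fitted search object also exposes the mean cross-validated score of the winning candidate and a results table for all sampled candidates; an optional way to inspect them:
# Mean cross-validated explained variance of the best candidate
print('Best CV score: {:.3f}'.format(model.best_score_))
# One row per sampled candidate, with its mean test-fold score and rank
cv_results = pd.DataFrame(model.cv_results_)
print(cv_results[['params', 'mean_test_score', 'rank_test_score']])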
Get predictions
predictions = model.predict(X_test)
Create a scatterplot of the actual vs. predicted values
plt.scatter(y_test, predictions)
# Add a trendline
z = np.polyfit(y_test, predictions, 1)
p = np.poly1d(z)
plt.plot(y_test, p(y_test), 'r--')
# Add labels
plt.xlabel('Y Test (True Values)')
plt.ylabel('Predicted Values')
plt.title('Predicted vs. Actual Values (r = {:.2f})'.format(pearsonr(y_test, predictions)[0]))
plt.savefig('Rand_For_Reg_Scatter')
plt.show()

plt.clf()
Print an interpretation of the Pearson r
if pearsonr(y_test, predictions)[0] == 1.00:
    print('There is a perfect positive linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] >= 0.8:
    print('There is a very strong, positive linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] >= 0.6:
    print('There is a strong, positive linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] >= 0.4:
    print('There is a moderate, positive linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] >= 0.2:
    print('There is a weak, positive linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] > 0:
    print('There is a very weak, positive linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] == 0:
    print('There is no linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] <= -0.8:
    print('There is a very strong, negative linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] <= -0.6:
    print('There is a strong, negative linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] <= -0.4:
    print('There is a moderate, negative linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] <= -0.2:
    print('There is a weak, negative linear relationship between the predicted and actual values.')
else: # -0.2 < pearsonr(y_test, predictions)[0] < 0
    print('There is a very weak, negative linear relationship between the predicted and actual values.')
## There is a very strong, positive linear relationship between the predicted and actual values.
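If you prefer, the long if/elif ladder above can be collapsed into a small helper that walks a list of cutoffs. This is just an optional, near-equivalent refactoring sketch (the interpret_r name is made up), not part of the original code:
def interpret_r(r):
    # Map |r| to a strength label, then attach the sign of the relationship
    if r == 0:
        return 'There is no linear relationship between the predicted and actual values.'
    direction = 'positive' if r > 0 else 'negative'
    for cutoff, strength in [(1.0, 'perfect'), (0.8, 'very strong'), (0.6, 'strong'),
                             (0.4, 'moderate'), (0.2, 'weak'), (0.0, 'very weak')]:
        if abs(r) >= cutoff:
            return ('There is a {}, {} linear relationship between the predicted '
                    'and actual values.'.format(strength, direction))

print(interpret_r(pearsonr(y_test, predictions)[0]))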
Print regression metrics
# make metrics into a dataframe
metrics_df = pd.DataFrame({'Metric': ['MAE',
                                      'MSE',
                                      'RMSE',
                                      'Explained Variance'],
                           'Value': [metrics.mean_absolute_error(y_test, predictions),
                                     metrics.mean_squared_error(y_test, predictions),
                                     np.sqrt(metrics.mean_squared_error(y_test, predictions)),
                                     metrics.explained_variance_score(y_test, predictions)]}).round(3)
print(metrics_df)
## Metric Value
## 0 MAE 3.969
## 1 MSE 27.024
## 2 RMSE 5.198
## 3 Explained Variance 0.840
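A quick aside on the last row: explained_variance_score (the same metric used as scoring during tuning) equals the classical R² only when the residuals average to zero, so for a reasonably unbiased model the two are usually very close. You can check this directly with metrics.r2_score; this is an optional sketch, not part of the original write-up:
# Explained variance vs. classical R-squared -- they coincide when the residuals have zero mean
print('Explained variance: {:.3f}'.format(metrics.explained_variance_score(y_test, predictions)))
print('R-squared:          {:.3f}'.format(metrics.r2_score(y_test, predictions)))
print('Mean residual:      {:.3f}'.format(np.mean(y_test - predictions)))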
Plot a histogram of the residuals (we want them to be approximately normally distributed)
sns.distplot((y_test - predictions), bins = 50)  # note: distplot is deprecated in newer seaborn; sns.histplot(y_test - predictions, bins=50, kde=True) is roughly the modern equivalent
## C:\Users\aengland\AppData\Local\CONTIN~1\ANACON~1\lib\site-packages\matplotlib\axes\_axes.py:6499: MatplotlibDeprecationWarning:
## The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead.
## alternative="'density'", removal="3.1")
plt.xlabel('Residuals ({})'.format(DV))
plt.ylabel('Density')
plt.title('Histogram of Residuals')
plt.savefig('Rand_For_Reg_Resid_Hist')
plt.show()

plt.clf()
Check residuals for normality
shapiro_df = pd.DataFrame({'Metric': ['Shapiro W',
                                      'p-value'],
                           'Value': [shapiro(y_test - predictions)[0],
                                     shapiro(y_test - predictions)[1]]}).round(3)
print(shapiro_df)
## Metric Value
## 0 Shapiro W 0.981
## 1 p-value 0.001
Print the interpretation of the test
if shapiro(y_test - predictions)[1] > 0.05:
    print('Fail to reject the null hypothesis. The residuals appear to be normally distributed.')
else:
    print('The null hypothesis is rejected. The residuals are not normally distributed.')
## The null hypothesis is rejected. The residuals are not normally distributed.
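Because the Shapiro-Wilk test will flag even mild departures from normality when the sample is reasonably large, a complementary visual check such as a Q-Q plot can help judge how serious the departure actually is. This is an optional addition (the file name below is made up), not part of the original write-up:
# Optional: Q-Q plot of the residuals against a theoretical normal distribution
from scipy import stats
stats.probplot(y_test - predictions, dist='norm', plot=plt)
plt.title('Q-Q Plot of Residuals')
plt.savefig('Rand_For_Reg_QQ')
plt.show()

plt.clf()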