Tune a Random Forest Regressor in a Pipeline
In this tutorial, we will predict NBA wins (W) from a number of team statistics.
Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
import numpy as np
from scipy.stats import pearsonr, shapiro
Import data
df = pd.read_csv('dt_NBA_reg.csv')
Save the name of the dependent variable (DV)
DV = 'W'
Dummy-code the categorical variables
final_data = pd.get_dummies(df, drop_first=True)
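To see what drop_first=True does, here is a small toy example (the conf column below is made up for illustration and is not part of the NBA data): each categorical column is expanded into one indicator column per level, and the first level is dropped as the reference category so the dummies are not perfectly collinear.
# Toy illustration only -- 'conf' is a hypothetical categorical column, not from the NBA data
toy = pd.DataFrame({'conf': ['East', 'West', 'East'], 'wins': [50, 42, 38]})
print(pd.get_dummies(toy, drop_first=True))  # keeps only 'conf_West'; 'East' becomes the reference level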
Save the IVs (X) and the DV (y), then split them into training and testing sets
X = final_data.drop(DV, axis = 1)
y = final_data[DV]
# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
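As an optional sanity check, you can confirm the roughly 67/33 split and that the feature matrices stay aligned with their targets:
# Optional sanity check on the split sizes (test_size=0.33 gives roughly a 67/33 split)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)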
Set up the steps for a pipeline
steps = [('scaler', StandardScaler()), ('Forest', RandomForestRegressor())]
Set up the pipeline
pipeline = Pipeline(steps)
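A note on why the scaler sits inside the pipeline: during the cross-validated search below, each split re-fits StandardScaler on that split's training folds only, so no information from the held-out fold leaks into the scaling. (Tree ensembles are insensitive to feature scaling, so the scaler is optional for a random forest, but it is harmless and matters if you later swap in a scale-sensitive estimator.) Here is a minimal, optional sketch of the fit-on-train-only behavior, using a throwaway variable name (scaler_demo):
# Optional sketch: the scaler is fit on the training data only and then reused, unchanged, on the test data
scaler_demo = StandardScaler().fit(X_train)
X_train_scaled = scaler_demo.transform(X_train)
X_test_scaled = scaler_demo.transform(X_test)
print(X_train_scaled.mean(axis=0).round(2))  # ~0 by construction for the training data
print(X_test_scaled.mean(axis=0).round(2))   # not exactly 0, because the scaler never saw the test data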
Specify the hyperparameter space
parameters = {'Forest__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
              'Forest__max_features': ['auto', 'sqrt', 'log2', None],  # note: 'auto' is deprecated/removed in newer scikit-learn versions
              'Forest__min_samples_split': [2, 5, 10],
              'Forest__min_samples_leaf': [1, 2, 4],
              'Forest__bootstrap': [True, False],
              'Forest__warm_start': [True, False]}
Instantiate the RandomizedSearchCV model
model = RandomizedSearchCV(pipeline, parameters, n_iter=10, scoring='explained_variance', cv=4)
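Note that the full grid above contains 10 × 4 × 3 × 3 × 2 × 2 = 1,440 candidate combinations; with n_iter=10, RandomizedSearchCV fits only 10 randomly sampled candidates (10 × 4 CV folds = 40 fits, plus one final refit on the full training set), which is why it is so much faster than an exhaustive GridSearchCV. A quick way to confirm the grid size:
# Size of the full hyperparameter grid vs. the 10 candidates sampled by RandomizedSearchCV
n_candidates = np.prod([len(v) for v in parameters.values()])
print('Full grid: {} candidates; randomly sampled: 10'.format(n_candidates))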
Time the tuning (start)
start_time = time.time()
Fit to the training set
model.fit(X_train, y_train)
Time the tuning (end)
elapsed_time = (time.time() - start_time)/60
print('Time to tune the model: {} min.'.format(elapsed_time))
## Time to tune the model: 1.9312451163927713 min.
Print the tuned parameters
print('Tuned Model Parameters: {}'.format(model.best_params_))
## Tuned Model Parameters: {'Forest__warm_start': True, 'Forest__n_estimators': 1400, 'Forest__min_samples_split': 5, 'Forest__min_samples_leaf': 1, 'Forest__max_features': None, 'Forest__bootstrap': True}
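Besides best_params_, the fitted search object also exposes the mean cross-validated score of the winning candidate and a results table for all sampled candidates; an optional way to inspect them:
# Mean cross-validated explained variance of the best candidate
print('Best CV score: {:.3f}'.format(model.best_score_))
# One row per sampled candidate, with its mean test-fold score and rank
cv_results = pd.DataFrame(model.cv_results_)
print(cv_results[['params', 'mean_test_score', 'rank_test_score']])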
Get predictions
predictions = model.predict(X_test)
Create a scatterplot of the actual vs. predicted values
plt.scatter(y_test, predictions)
# Add a trendline
z = np.polyfit(y_test, predictions, 1)
p = np.poly1d(z)
plt.plot(y_test, p(y_test), 'r--')
# Add labels
plt.xlabel('Y Test (True Values)')
plt.ylabel('Predicted Values')
plt.title('Predicted vs. Actual Values (r = {:.2f})'.format(pearsonr(y_test, predictions)[0]))
plt.savefig('Rand_For_Reg_Scatter')
plt.show()

plt.clf()
Print an interpretation of the Pearson r
if pearsonr(y_test, predictions)[0] == 1.00:
    print('There is a perfect positive linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] >= 0.8:
    print('There is a very strong, positive linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] >= 0.6:
    print('There is a strong, positive linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] >= 0.4:
    print('There is a moderate, positive linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] >= 0.2:
    print('There is a weak, positive linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] > 0:
    print('There is a very weak, positive linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] == 0:
    print('There is no linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] <= -0.8:
    print('There is a very strong, negative linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] <= -0.6:
    print('There is a strong, negative linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] <= -0.4:
    print('There is a moderate, negative linear relationship between the predicted and actual values.')
elif pearsonr(y_test, predictions)[0] <= -0.2:
    print('There is a weak, negative linear relationship between the predicted and actual values.')
else: # -0.2 < pearsonr(y_test, predictions)[0] < 0
    print('There is a very weak, negative linear relationship between the predicted and actual values.')
## There is a very strong, positive linear relationship between the predicted and actual values.
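If you prefer, the long if/elif ladder above can be collapsed into a small helper that walks a list of cutoffs. This is just an optional, near-equivalent refactoring sketch (the interpret_r name is made up), not part of the original code:
def interpret_r(r):
    # Map |r| to a strength label, then attach the sign of the relationship
    if r == 0:
        return 'There is no linear relationship between the predicted and actual values.'
    direction = 'positive' if r > 0 else 'negative'
    for cutoff, strength in [(1.0, 'perfect'), (0.8, 'very strong'), (0.6, 'strong'),
                             (0.4, 'moderate'), (0.2, 'weak'), (0.0, 'very weak')]:
        if abs(r) >= cutoff:
            return ('There is a {}, {} linear relationship between the predicted '
                    'and actual values.'.format(strength, direction))

print(interpret_r(pearsonr(y_test, predictions)[0]))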
Print regression metrics
# make metrics into a dataframe
metrics_df = pd.DataFrame({'Metric': ['MAE',
                                      'MSE',
                                      'RMSE',
                                      'Explained Variance'],
                           'Value': [metrics.mean_absolute_error(y_test, predictions),
                                     metrics.mean_squared_error(y_test, predictions),
                                     np.sqrt(metrics.mean_squared_error(y_test, predictions)),
                                     metrics.explained_variance_score(y_test, predictions)]}).round(3)
print(metrics_df)
## Metric Value
## 0 MAE 3.969
## 1 MSE 27.024
## 2 RMSE 5.198
## 3 Explained Variance 0.840
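A quick aside on the last row: explained_variance_score (the same metric used as scoring during tuning) equals the classical R² only when the residuals average to zero, so for a reasonably unbiased model the two are usually very close. You can check this directly with metrics.r2_score; this is an optional sketch, not part of the original write-up:
# Explained variance vs. classical R-squared -- they coincide when the residuals have zero mean
print('Explained variance: {:.3f}'.format(metrics.explained_variance_score(y_test, predictions)))
print('R-squared:          {:.3f}'.format(metrics.r2_score(y_test, predictions)))
print('Mean residual:      {:.3f}'.format(np.mean(y_test - predictions)))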
Plot a histogram of the residuals (we want them to be approximately normally distributed)
sns.distplot((y_test - predictions), bins = 50)  # note: distplot is deprecated in newer seaborn; sns.histplot(y_test - predictions, bins=50, kde=True) is roughly the modern equivalent
## C:\Users\aengland\AppData\Local\CONTIN~1\ANACON~1\lib\site-packages\matplotlib\axes\_axes.py:6499: MatplotlibDeprecationWarning:
## The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead.
## alternative="'density'", removal="3.1")
plt.xlabel('Residuals ({})'.format(DV))
plt.ylabel('Density')
plt.title('Histogram of Residuals')
plt.savefig('Rand_For_Reg_Resid_Hist')
plt.show()

plt.clf()
Check residuals for normality
shapiro_df = pd.DataFrame({'Metric': ['Shapiro W',
                                      'p-value'],
                           'Value': [shapiro(y_test - predictions)[0],
                                     shapiro(y_test - predictions)[1]]}).round(3)
print(shapiro_df)
## Metric Value
## 0 Shapiro W 0.981
## 1 p-value 0.001
Print the interpretation of the test
if shapiro(y_test - predictions)[1] > 0.05:
    print('Fail to reject the null hypothesis. The residuals appear to be normally distributed.')
else:
    print('The null hypothesis is rejected. The residuals are not normally distributed.')
## The null hypothesis is rejected. The residuals are not normally distributed.
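Because the Shapiro-Wilk test will flag even mild departures from normality when the sample is reasonably large, a complementary visual check such as a Q-Q plot can help judge how serious the departure actually is. This is an optional addition (the file name below is made up), not part of the original write-up:
# Optional: Q-Q plot of the residuals against a theoretical normal distribution
from scipy import stats
stats.probplot(y_test - predictions, dist='norm', plot=plt)
plt.title('Q-Q Plot of Residuals')
plt.savefig('Rand_For_Reg_QQ')
plt.show()

plt.clf()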