Tune XGBoost Classifier in Pipeline

In this tutorial we will predict whether or not an NBA team makes the playoffs based on a number of team statistics.

Import dependencies

import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (confusion_matrix, classification_report,
                             roc_auc_score, roc_curve,
                             average_precision_score, precision_recall_curve)

Load in the data set

df = pd.read_csv('dt_NBA_class.csv')
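
As an optional sanity check before modeling, it can help to confirm the dimensions, preview the first rows, and look at the class balance of the outcome column:

# Optional sanity check: dimensions, a preview, and the class balance
print(df.shape)
print(df.head())
print(df['Playoffs'].value_counts())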

Save the name of the dependent variable (DV)

DV = 'Playoffs'

Dummy-code the categorical variables

final_data = pd.get_dummies(df, drop_first=True)

Save the IVs as X and the DV as y, then split them into training and testing sets

X = final_data.drop(DV, axis = 1)
y = final_data[DV]
# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
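
As an aside, if you want both sets to preserve the playoff/non-playoff ratio, train_test_split accepts a stratify argument (an optional variation, not used for the results shown below):

# Optional variation: a stratified split keeps the class proportions
# the same in the training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)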

Set up the steps for a pipeline

# Note: tree ensembles like XGBoost do not require feature scaling;
# the scaler is kept here to illustrate a multi-step pipeline
steps = [('scaler', StandardScaler()), ('XGBoost', xgb.XGBClassifier())]

Set up the pipeline

pipeline = Pipeline(steps)

Specify the hyperparameter space

parameters = {'XGBoost__booster': ['gbtree', 'gblinear', 'dart'],
              'XGBoost__subsample': np.arange(0.5, 1, 0.05),
              'XGBoost__max_depth': np.arange(3, 20, 1),
              'XGBoost__colsample_bytree': np.arange(0.1, 1.05, 0.05),
              'XGBoost__objective': ['reg:logistic','binary:logistic']}
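
Each key uses scikit-learn's step__parameter convention: the prefix ('XGBoost') is the step name from steps, and the suffix is a parameter of that estimator. If you are unsure which names are available, you can list them from the pipeline itself:

# List every parameter name the pipeline exposes for tuning
print(sorted(pipeline.get_params().keys()))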

Instantiate the RandomizedSearchCV model

# n_iter=10 samples ten random hyperparameter combinations,
# each evaluated with 4-fold cross-validated ROC AUC
model = RandomizedSearchCV(pipeline, parameters, n_iter=10, scoring='roc_auc', cv=4)

Time the model-tuning

start_time = time.time()

Fit to the training set

model.fit(X_train, y_train)

Stop the timer and print the elapsed tuning time

elapsed_time = (time.time() - start_time)/60
print('Time to tune the model: {0:0.2f} min.'.format(elapsed_time))
## Time to tune the model: 0.06 min.

Print the tuned parameters

print('Tuned Model Parameters: {}'.format(model.best_params_))
## Tuned Model Parameters: {'XGBoost__subsample': 0.5, 'XGBoost__objective': 'binary:logistic', 'XGBoost__max_depth': 7, 'XGBoost__colsample_bytree': 0.7000000000000002, 'XGBoost__booster': 'dart'}
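
The search object also stores the cross-validated score of the winning configuration; since scoring='roc_auc' was used, best_score_ is the mean ROC AUC across the four folds:

# Mean cross-validated ROC AUC of the best hyperparameter combination
print('Best CV ROC AUC: {0:0.3f}'.format(model.best_score_))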

Get predicted classes

predictions = model.predict(X_test)

Get predicted probability

pred_prob = model.predict_proba(X_test)[:,1]
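
For a binary classifier, predict() amounts to thresholding these probabilities at 0.5, so the quick check below should print True (a sanity check, not part of the original workflow):

# The default decision rule: predict the playoff class when its
# predicted probability is at least 0.5
manual_predictions = (pred_prob >= 0.5).astype(int)
print(np.array_equal(manual_predictions, predictions))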

Evaluate performance with confusion matrix

cm = pd.DataFrame(confusion_matrix(y_test, predictions))
cm['Total'] = cm.sum(axis=1)
# DataFrame.append was removed in newer pandas, so add the column totals as a row with .loc
cm.loc['Total'] = cm.sum(axis=0)
cm.columns = ['Predicted No', 'Predicted Yes', 'Total']
cm.index = ['Actual No', 'Actual Yes', 'Total']
print(cm)
##             Predicted No  Predicted Yes  Total
## Actual No             93             28    121
## Actual Yes            11            153    164
## Total                104            181    285
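
The classification report that follows can be read straight off these counts; for the playoff class, for example:

# Deriving the headline metrics from the confusion matrix counts above
tn, fp, fn, tp = 93, 28, 11, 153
print('Precision (Yes): {0:0.2f}'.format(tp / (tp + fp)))                   # 153/181 = 0.85
print('Recall (Yes):    {0:0.2f}'.format(tp / (tp + fn)))                   # 153/164 = 0.93
print('Accuracy:        {0:0.2f}'.format((tp + tn) / (tn + fp + fn + tp)))  # 246/285 = 0.86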

Evaluate performance with classification report

print(classification_report(y_test, predictions))
##              precision    recall  f1-score   support
## 
##           0       0.89      0.77      0.83       121
##           1       0.85      0.93      0.89       164
## 
## avg / total       0.87      0.86      0.86       285

Evaluate performance with ROC curve

# Calculate the area under the ROC curve
# (roc_auc_score expects probability scores, not hard class labels)
roc_auc = roc_auc_score(y_test, pred_prob)
# Plot ROC
fpr, tpr, thresholds = roc_curve(y_test, pred_prob)
plt.figure()
plt.plot(fpr, tpr, label='Model (Area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.savefig('XGBoost_ROC')
plt.show()

plt.clf()

Evaluate performance with a PR curve (useful when there is a class imbalance)

# Get average precision from the predicted probabilities
# (average_precision_score also expects scores, not hard labels)
average_precision = average_precision_score(y_test, pred_prob)
# Plot PR
precision, recall, _ = precision_recall_curve(y_test, pred_prob)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))
plt.savefig('XGBoost_PR')
plt.show()