Tune XGBoost Classifier in Pipeline

In this tutorial we will predict whether or not an NBA team makes the playoffs based on a number of team statistics.

Import dependencies

import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (confusion_matrix, classification_report,
                             roc_auc_score, roc_curve,
                             average_precision_score, precision_recall_curve)

Load in the data set

df = pd.read_csv('dt_NBA_class.csv')
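
As an optional sanity check before modeling, it can help to confirm the dimensions, preview the first rows, and look at the class balance of the outcome column:

# Optional sanity check: dimensions, a preview, and the class balance
print(df.shape)
print(df.head())
print(df['Playoffs'].value_counts())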

Save the name of the dependent variable (DV)

DV = 'Playoffs'

Dummy-code the categorical variables

final_data = pd.get_dummies(df, drop_first=True)

Save the IVs as X and the DV as y, then split them into training and testing sets

X = final_data.drop(DV, axis = 1)
y = final_data[DV]
# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
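
As an aside, if you want both sets to preserve the playoff/non-playoff ratio, train_test_split accepts a stratify argument (an optional variation, not used for the results shown below):

# Optional variation: a stratified split keeps the class proportions
# the same in the training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)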

Set up the steps for a pipeline

# Note: tree ensembles like XGBoost do not require feature scaling;
# the scaler is kept here to illustrate a multi-step pipeline
steps = [('scaler', StandardScaler()), ('XGBoost', xgb.XGBClassifier())]

Set up the pipeline

pipeline = Pipeline(steps)

Specify the hyperparameter space

parameters = {'XGBoost__booster': ['gbtree', 'gblinear', 'dart'],
              'XGBoost__subsample': np.arange(0.5, 1, 0.05),
              'XGBoost__max_depth': np.arange(3, 20, 1),
              'XGBoost__colsample_bytree': np.arange(0.1, 1.05, 0.05),
              'XGBoost__objective': ['reg:logistic','binary:logistic']}
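
Each key uses scikit-learn's step__parameter convention: the prefix ('XGBoost') is the step name from steps, and the suffix is a parameter of that estimator. If you are unsure which names are available, you can list them from the pipeline itself:

# List every parameter name the pipeline exposes for tuning
print(sorted(pipeline.get_params().keys()))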

Instantiate the RandomizedSearchCV model

# n_iter=10 samples ten random hyperparameter combinations,
# each evaluated with 4-fold cross-validated ROC AUC
model = RandomizedSearchCV(pipeline, parameters, n_iter=10, scoring='roc_auc', cv=4)

Time the model-tuning

start_time = time.time()

Fit to the training set

model.fit(X_train, y_train)

Stop the timer and print the elapsed tuning time

elapsed_time = (time.time() - start_time)/60
print('Time to tune the model: {0:0.2f} min.'.format(elapsed_time))
## Time to tune the model: 0.06 min.

Print the tuned parameters

print('Tuned Model Parameters: {}'.format(model.best_params_))
## Tuned Model Parameters: {'XGBoost__subsample': 0.5, 'XGBoost__objective': 'binary:logistic', 'XGBoost__max_depth': 7, 'XGBoost__colsample_bytree': 0.7000000000000002, 'XGBoost__booster': 'dart'}
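
The search object also stores the cross-validated score of the winning configuration; since scoring='roc_auc' was used, best_score_ is the mean ROC AUC across the four folds:

# Mean cross-validated ROC AUC of the best hyperparameter combination
print('Best CV ROC AUC: {0:0.3f}'.format(model.best_score_))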

Get predicted classes

predictions = model.predict(X_test)

Get predicted probability

pred_prob = model.predict_proba(X_test)[:,1]
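
For a binary classifier, predict() amounts to thresholding these probabilities at 0.5, so the quick check below should print True (a sanity check, not part of the original workflow):

# The default decision rule: predict the playoff class when its
# predicted probability is at least 0.5
manual_predictions = (pred_prob >= 0.5).astype(int)
print(np.array_equal(manual_predictions, predictions))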

Evaluate performance with confusion matrix

cm = pd.DataFrame(confusion_matrix(y_test, predictions))
cm['Total'] = cm.sum(axis=1)
# DataFrame.append was removed in newer pandas, so add the column totals as a row with .loc
cm.loc['Total'] = cm.sum(axis=0)
cm.columns = ['Predicted No', 'Predicted Yes', 'Total']
cm.index = ['Actual No', 'Actual Yes', 'Total']
print(cm)
##             Predicted No  Predicted Yes  Total
## Actual No             93             28    121
## Actual Yes            11            153    164
## Total                104            181    285
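
The classification report that follows can be read straight off these counts; for the playoff class, for example:

# Deriving the headline metrics from the confusion matrix counts above
tn, fp, fn, tp = 93, 28, 11, 153
print('Precision (Yes): {0:0.2f}'.format(tp / (tp + fp)))                   # 153/181 = 0.85
print('Recall (Yes):    {0:0.2f}'.format(tp / (tp + fn)))                   # 153/164 = 0.93
print('Accuracy:        {0:0.2f}'.format((tp + tn) / (tn + fp + fn + tp)))  # 246/285 = 0.86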

Evaluate performance with classification report

print(classification_report(y_test, predictions))
##              precision    recall  f1-score   support
## 
##           0       0.89      0.77      0.83       121
##           1       0.85      0.93      0.89       164
## 
## avg / total       0.87      0.86      0.86       285

Evaluate performance with ROC curve

# Calculate the area under the ROC curve
# (roc_auc_score expects probability scores, not hard class labels)
roc_auc = roc_auc_score(y_test, pred_prob)
# Plot ROC
fpr, tpr, thresholds = roc_curve(y_test, pred_prob)
plt.figure()
plt.plot(fpr, tpr, label='Model (Area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.savefig('XGBoost_ROC')
plt.show()

plt.clf()

Evaluate performance with a PR curve (useful when there is a class imbalance)

# Get average precision from the predicted probabilities
# (average_precision_score also expects scores, not hard labels)
average_precision = average_precision_score(y_test, pred_prob)
# Plot PR
precision, recall, _ = precision_recall_curve(y_test, pred_prob)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))
plt.savefig('XGBoost_PR')
plt.show()