Tune XGBoost Classifier in Pipeline
In this tutorial, we will tune an XGBoost classifier inside a scikit-learn pipeline to predict whether or not an NBA team makes the playoffs based on a number of team statistics.
Import dependencies
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import (average_precision_score, classification_report,
                             confusion_matrix, precision_recall_curve,
                             roc_auc_score, roc_curve)
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
Load in the data set
df = pd.read_csv('dt_NBA_class.csv')
Save the name of the dependent variable (DV)
DV = 'Playoffs'
Dummy-code the categorical variables (drop_first=True drops one level per variable to avoid redundant columns)
final_data = pd.get_dummies(df, drop_first=True)
Save the independent variables (X) and the DV (y), then split them into training and test sets
X = final_data.drop(DV, axis=1)
y = final_data[DV]
# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
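Before modeling, it is worth checking how balanced the two classes are, since the PR curve at the end of this tutorial is motivated by class imbalance. A quick check (this snippet is an addition, not part of the original walkthrough):
# Proportion of playoff vs. non-playoff teams in the training labels
print(y_train.value_counts(normalize=True))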
Set up the steps for a pipeline
steps = [('scaler', StandardScaler()), ('XGBoost', xgb.XGBClassifier())]
Set up the pipeline
pipeline = Pipeline(steps)
Specify the hyperparameter space
parameters = {'XGBoost__booster': ['gbtree', 'gblinear', 'dart'],
              'XGBoost__subsample': np.arange(0.5, 1, 0.05),
              'XGBoost__max_depth': np.arange(3, 20, 1),
              'XGBoost__colsample_bytree': np.arange(0.1, 1.05, 0.05),
              'XGBoost__objective': ['reg:logistic', 'binary:logistic']}
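Randomized search will only sample a handful of points from this space, so it helps to see how large the full grid is. A quick count (the exact total depends on how np.arange handles the float steps):
# Number of combinations in the full grid that RandomizedSearchCV samples from
print(np.prod([len(v) for v in parameters.values()]))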
Instantiate the RandomizedSearchCV model, which samples n_iter random hyperparameter combinations rather than exhaustively evaluating the full grid as GridSearchCV would
model = RandomizedSearchCV(pipeline, parameters, n_iter=10, scoring='roc_auc', cv=4)
Time the model tuning (start)
start_time = time.time()
Fit to the training set
model.fit(X_train, y_train)
Time the model tuning (end)
elapsed_time = (time.time() - start_time)/60
print('Time to tune the model: {0:0.2f} min.'.format(elapsed_time))
## Time to tune the model: 0.06 min.
Print the tuned parameters
print('Tuned Model Parameters: {}'.format(model.best_params_))
## Tuned Model Parameters: {'XGBoost__subsample': 0.5, 'XGBoost__objective': 'binary:logistic', 'XGBoost__max_depth': 7, 'XGBoost__colsample_bytree': 0.7000000000000002, 'XGBoost__booster': 'dart'}
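It is also worth inspecting the cross-validated score that won the search; best_score_ reports the mean ROC AUC across the 4 folds for the best combination (your exact value will vary between runs):
# Mean cross-validated ROC AUC of the best hyperparameter combination
print('Best CV ROC AUC: {0:0.3f}'.format(model.best_score_))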
Get predicted classes
predictions = model.predict(X_test)
Get predicted probability
pred_prob = model.predict_proba(X_test)[:,1]
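Having the probabilities also lets you move the decision threshold away from the default 0.5, for example to trade precision for recall. A minimal illustration (the 0.40 cutoff is arbitrary, chosen here only for demonstration):
# Classify a team as making the playoffs when the predicted probability exceeds 0.40
custom_predictions = (pred_prob >= 0.40).astype(int)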
Evaluate performance with confusion matrix
cm = pd.DataFrame(confusion_matrix(y_test, predictions))
cm['Total'] = cm.sum(axis=1)
# DataFrame.append was removed in pandas 2.0, so use pd.concat to add the column totals
cm = pd.concat([cm, cm.sum(axis=0).to_frame().T], ignore_index=True)
cm.columns = ['Predicted No', 'Predicted Yes', 'Total']
cm = cm.set_index([['Actual No', 'Actual Yes', 'Total']])
print(cm)
## Predicted No Predicted Yes Total
## Actual No 93 28 121
## Actual Yes 11 153 164
## Total 104 181 285
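As a quick sanity check, overall accuracy can be read straight off the matrix by dividing the diagonal (correct predictions) by the total; the numbers below come from the run shown above:
# Correct predictions (93 + 153) over all 285 test observations
print('Accuracy: {0:0.2f}'.format((93 + 153) / 285))  # 0.86, matching the report below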
Evaluate performance with classification report
print(classification_report(y_test, predictions))
## precision recall f1-score support
##
## 0 0.89 0.77 0.83 121
## 1 0.85 0.93 0.89 164
##
## avg / total 0.87 0.86 0.86 285
Evaluate performance with ROC curve
# Calculate the area under the ROC curve from the predicted probabilities
# (probabilities, not hard class labels, give the true ranking-based AUC)
model_roc_auc = roc_auc_score(y_test, pred_prob)
# Plot ROC
fpr, tpr, thresholds = roc_curve(y_test, pred_prob)
plt.figure()
plt.plot(fpr, tpr, label='Model (Area = %0.2f)' % model_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.savefig('XGBoost_ROC')
plt.show()

plt.clf()
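To double-check the legend value, the same area can be recomputed from the curve arrays themselves; a small verification using sklearn.metrics.auc (an addition to the original walkthrough):
from sklearn.metrics import auc

# Trapezoidal area under the ROC curve; should match roc_auc_score above
print('AUC from curve: {0:0.2f}'.format(auc(fpr, tpr)))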
Evaluate performance with a precision-recall (PR) curve, which is especially informative when the classes are imbalanced
# Get average precision from the predicted probabilities
# (average_precision_score expects scores, not hard class labels)
average_precision = average_precision_score(y_test, pred_prob)
# Plot PR
precision, recall, _ = precision_recall_curve(y_test, pred_prob)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))
plt.savefig('XGBoost_PR')
plt.show()
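Finally, the winning pipeline can be saved so the tuned model is reusable without re-running the search. A minimal sketch using joblib (the file name is just an example, and X_new stands in for hypothetical new team statistics):
from joblib import dump, load

# Persist the best scaler + XGBoost pipeline found by the search
dump(model.best_estimator_, 'xgb_playoffs_pipeline.joblib')

# Later, reload it and predict on new data
# pipeline = load('xgb_playoffs_pipeline.joblib')
# pipeline.predict(X_new)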
