Santander Customer Satisfaction: Prediction

Lekshman Ramesh

What Is It About?

Objective

Loading Data

# Restore the objects saved in the prepared .Rda file into the workspace;
# the following code expects this to provide a data set named `train`.
load(file = "D:/Lekshman/Kaggle/train.csv/Santander-Data Wrangling/Santander-Data Viz/Santander-Prediction/train.Rda")
# Show the number of rows and columns of the loaded data.
dim(train)
## [1] 76020   157

Defining Parameters

# Baseline XGBoost settings shared by the cross-validation runs below.
param <- list(
  objective = "binary:logistic", # binary outcome, logistic objective
  eval_metric = "auc",           # models are compared on AUC
  booster = "gbtree",            # tree-based booster
  eta = 0.05,                    # learning rate
  subsample = 0.7,               # row sampling ratio
  colsample_bytree = 0.7,        # column sampling ratio per tree
  max_depth = 5                  # maximum tree depth
)

Preparing Sparse Matrix For XGBoost

library(xgboost)
library(Matrix)
# Response vector taken from the TARGET column of the training data.
label <- train$TARGET
# Features are columns 2 through 156 of `train`, converted to a dense matrix
# and wrapped, together with the labels, in an xgb.DMatrix.
xgmat <- xgb.DMatrix(data = as.matrix(train[, 2:156]), label = label)
# Alternative: build a sparse design matrix with sparse.model.matrix() instead.
# xgmat <- sparse.model.matrix(TARGET ~ . - 1, data = train)

Basic Model Test

# 3-fold cross-validation of the baseline parameters for 200 boosting rounds,
# printing the evaluation metric every 100 rounds.
# `xgmat` is an xgb.DMatrix that already carries the labels, so `label` is not
# passed again here: supplying it only produces the
# "xgboost: label will be ignored" warning. `nrounds` is spelled out in full
# instead of relying on partial matching of `nround`.
cv.res_red <- xgb.cv(
  params = param,
  data = xgmat,
  nrounds = 200,
  nfold = 3,
  print_every_n = 100
)
## Warning in xgb.get.DMatrix(data, label, missing): xgboost: label will be
## ignored.
## [1]  train-auc:0.779339+0.041311 test-auc:0.770796+0.037849 
## [101]    train-auc:0.872975+0.000825 test-auc:0.838322+0.001451 
## [200]    train-auc:0.891807+0.000446 test-auc:0.838562+0.001643
#Test AUC ~0.84

Experiment - Transforming Variables - Var 1 & 2

Variables explaining the outcome Variables explaining the outcome

Experiment - Transforming Variables - Var 3 & 4

Variables explaining the outcome Variables explaining the outcome

Transforming Select Variables To Factors

# Add a factor-typed copy of each selected numeric variable as a new
# `<name>_factor` column; the original columns are left in place.
train[["num_var4_factor"]] <- as.factor(train[["num_var4"]])
train[["num_var35_factor"]] <- as.factor(train[["num_var35"]])
train[["var36_factor"]] <- as.factor(train[["var36"]])
train[["num_meses_var39_vig_ult3_factor"]] <-
  as.factor(train[["num_meses_var39_vig_ult3"]])

Testing Experiment

# Rebuild the design matrix so the new factor columns are expanded into
# indicator columns; every column of `train` except TARGET enters the formula
# (no intercept).
# NOTE(review): unlike the baseline matrix, which used only columns 2:156,
# this formula also includes column 1 of `train` - confirm that is intended
# before comparing the two AUC figures.
xgmat <- sparse.model.matrix(TARGET ~ . - 1, data = train)
# A sparse matrix carries no labels, so `label` must be supplied here.
# `nrounds` is spelled out in full instead of relying on partial matching.
cv.res_red <- xgb.cv(
  params = param,
  data = xgmat,
  label = label,
  nrounds = 200,
  nfold = 3,
  print_every_n = 100
)
## [1]  train-auc:0.804939+0.009476 test-auc:0.792259+0.010621 
## [101]    train-auc:0.877267+0.001319 test-auc:0.837756+0.000989 
## [200]    train-auc:0.898721+0.001388 test-auc:0.839191+0.001112
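## Improves test AUC by 0.0011 on average (in the run shown: 0.839191 vs. 0.838562, i.e. about 0.0006, which is within the reported CV standard deviation of roughly 0.001-0.002)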

Parameter Tuning

# Grid of hyper-parameter combinations to evaluate, one row per combination.
# `nrounds` and `eta` are held fixed here; they can be tuned separately to
# keep the number of cross-validation runs down.
xgb_grid_1 <- expand.grid(
  nrounds = 200,
  eta = 0.05,
  max_depth = c(5, 7, 9),
  subsample = c(0.6, 0.7, 0.8),
  colsample_bytree = c(0.6, 0.7, 0.8)
)

# Final-round CV metrics for each combination, collected in grid order.
# Preallocated so the result is not grown with rbind() inside the loop.
cv_rows <- vector("list", nrow(xgb_grid_1))

for (i in seq_len(nrow(xgb_grid_1))) {
  param <- list(
    objective = "binary:logistic",
    eval_metric = "auc",
    booster = "gbtree",
    eta = xgb_grid_1$eta[i],
    subsample = xgb_grid_1$subsample[i],
    colsample_bytree = xgb_grid_1$colsample_bytree[i],
    max_depth = xgb_grid_1$max_depth[i]
  )

  # Re-seed before every run so all combinations are scored on the same
  # random folds; otherwise fold-to-fold noise is mixed into the comparison.
  set.seed(1234)

  # `xgmat` is a sparse matrix at this point and carries no labels, so the
  # labels have to be passed explicitly or xgb.cv() stops with an error.
  cv.res_red <- xgb.cv(
    params = param,
    data = xgmat,
    label = label,
    nrounds = xgb_grid_1$nrounds[i],
    nfold = 3,
    verbose = 0
  )

  # Keep only the last boosting round of the evaluation log. Using nrow()
  # instead of a literal 200 keeps this correct if `nrounds` is changed.
  cv_op <- cv.res_red$evaluation_log
  cv_op <- cv_op[nrow(cv_op), ]
  cv_rows[[i]] <- cv_op
}

# One row per combination, newest first: row k of `cv_final` belongs to row
# nrow(xgb_grid_1) - k + 1 of `xgb_grid_1` (same ordering the original
# prepend-style rbind produced).
cv_final <- do.call(rbind, rev(cv_rows))