load(".RData")
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
detectCores()
## [1] 4
registerDoParallel(cores=4)
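#sanity check (foreach): confirm the backend registered; train() will pick it up
#automatically, and trainControl(allowParallel = FALSE) opts back out
getDoParWorkers()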
library(C50)
data(churn) #loads two data frames: churnTrain and churnTest
str(churnTrain)
## 'data.frame': 3333 obs. of 20 variables:
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
## $ account_length : int 128 107 137 84 75 118 121 147 117 141 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
## $ number_vmail_messages : int 25 26 0 0 0 0 24 0 0 37 ...
## $ total_day_minutes : num 265 162 243 299 167 ...
## $ total_day_calls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ total_day_charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ total_eve_minutes : num 197.4 195.5 121.2 61.9 148.3 ...
## $ total_eve_calls : int 99 103 110 88 122 101 108 94 80 111 ...
## $ total_eve_charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ total_night_minutes : num 245 254 163 197 187 ...
## $ total_night_calls : int 91 103 104 89 121 118 118 96 90 97 ...
## $ total_night_charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ total_intl_minutes : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ total_intl_calls : int 3 3 5 7 3 6 7 6 4 5 ...
## $ total_intl_charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ number_customer_service_calls: int 1 1 0 2 3 0 3 0 1 0 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
predictors <- names(churnTrain)[names(churnTrain) != "churn"]
library(caret)
set.seed(1)
allData <- rbind(churnTrain, churnTest) #recombine the packaged train/test sets so we can make our own split
inTrainingSet <- createDataPartition(allData$churn, p = .75, list = FALSE) #returns an index of row numbers
churnTrain <- allData[inTrainingSet,]
churnTest <- allData[-inTrainingSet,]
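#quick check (illustrative): createDataPartition samples within each class, so
#the class proportions should carry over into the split
round(prop.table(table(allData$churn)), 3)
round(prop.table(table(churnTrain$churn)), 3)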
#also see createFolds (k-fold CV indices), createMultiFolds, and createResample (bootstrap samples)
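#a minimal sketch of createFolds: it returns a list of row indices, one element per fold
cvFolds <- createFolds(churnTrain$churn, k = 10, returnTrain = FALSE)
str(cvFolds[1:2]) #each element holds the held-out rows for that fold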
numerics <- c("account_length", "total_day_calls", "total_night_calls")
#preProcess estimates the means, SDs, and transformation parameters from the training data
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
procValues <- preProcess(churnTrain[, numerics],
                         method = c("center", "scale", "YeoJohnson")) #applies ALL listed methods; caret fixes the order itself (transform, then center, then scale)
str(procValues)
## List of 21
## $ dim : int [1:2] 3333 3
## $ bc : NULL
## $ yj : Named num [1:3] 0.89 1.169 0.934
## ..- attr(*, "names")= chr [1:3] "account_length" "total_day_calls" "total_night_calls"
## $ et : NULL
## $ invHyperbolicSine: NULL
## $ mean : Named num [1:3] 67.3 189.2 78.6
## ..- attr(*, "names")= chr [1:3] "account_length" "total_day_calls" "total_night_calls"
## $ std : Named num [1:3] 24.2 43.5 14.4
## ..- attr(*, "names")= chr [1:3] "account_length" "total_day_calls" "total_night_calls"
## $ ranges : NULL
## $ rotation : NULL
## $ method :List of 4
## ..$ center : chr [1:3] "account_length" "total_day_calls" "total_night_calls"
## ..$ scale : chr [1:3] "account_length" "total_day_calls" "total_night_calls"
## ..$ YeoJohnson: chr [1:3] "account_length" "total_day_calls" "total_night_calls"
## ..$ ignore : chr(0)
## $ thresh : num 0.95
## $ pcaComp : NULL
## $ numComp : NULL
## $ ica : NULL
## $ wildcards :List of 2
## ..$ PCA: chr(0)
## ..$ ICA: chr(0)
## $ k : num 5
## $ knnSummary :function (x, ...)
## $ bagImp : NULL
## $ median : NULL
## $ data : NULL
## $ rangeBounds : num [1:2] 0 1
## - attr(*, "class")= chr "preProcess"
procValues
## Created from 3333 samples and 3 variables
##
## Pre-processing:
## - centered (3)
## - ignored (0)
## - scaled (3)
## - Yeo-Johnson transformation (3)
##
## Lambda estimates for Yeo-Johnson transformation:
## 0.89, 1.17, 0.93
#use the predict method to apply the adjustments to new data
trainScaled <- predict(procValues, churnTrain[, numerics])
testScaled <- predict(procValues, churnTest[, numerics])
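#sanity check: after centering and scaling, the transformed training columns
#should have mean ~0 and SD ~1
round(colMeans(trainScaled), 3)
round(apply(trainScaled, 2, sd), 3)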
#note: preProcess can also be called from within other functions (more later); see the sketch below
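#a minimal sketch of preProcess inside train(): the preProcess argument re-estimates
#the transformations within each resample ("glm" is just a fast model for illustration)
glmTune <- train(churn ~ ., data = churnTrain,
                 method = "glm",
                 preProcess = c("center", "scale", "YeoJohnson"))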
# "There are three classes of functions: helper functions, training functions and some feature selection methods"
#great discussion in the webinar
#train() fits a sequence of models over a tuning grid, estimates performance by resampling, and picks the candidate with the best performance
#basic usage: the x/y interface
gbmTune <- train(x = churnTrain[, predictors],
                 y = churnTrain$churn,
                 method = "gbm")
#or, use the formula interface (note: the formula method expands factors into dummy variables, while the x/y interface keeps them intact)
gbmTune <- train(churn ~ ., data = churnTrain, method = "gbm")
#add an option for the resampling method
ctrl <- trainControl(method = "repeatedcv", repeats = 5) #repeated 10-fold CV; the bootstrap ("boot") and plain k-fold CV ("cv") are alternatives
gbmTune <- train(x = churnTrain[, predictors],
                 y = churnTrain$churn,
                 method = "gbm",
                 verbose = FALSE,
                 trControl = ctrl)
#add options for class probabilities and ROC-based performance measures
ctrl <- trainControl(method = "repeatedcv", repeats = 5,
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary) #twoClassSummary gives ROC AUC, sensitivity, and specificity
gbmTune <- train(churn ~ ., data = churnTrain,
                 method = "gbm",
                 metric = "ROC", #select the best model by ROC AUC (other metrics are available)
                 verbose = FALSE,
                 trControl = ctrl)
#tuning parameters/search grid: by default, train() tries 3 values per tuning parameter (or use tuneLength; see the sketch below)
ctrl <- trainControl(method = "repeatedcv", repeats = 5,
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary) #twoClassSummary gives ROC AUC, sensitivity, and specificity
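#alternative sketch: tuneLength = n asks train() for n values of each tuning
#parameter instead of an explicit grid (gbmDefaults is just an illustrative name)
gbmDefaults <- train(churn ~ ., data = churnTrain,
                     method = "gbm",
                     metric = "ROC",
                     verbose = FALSE,
                     trControl = ctrl,
                     tuneLength = 5)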
grid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
                    n.trees = seq(100, 1000, by = 50),
                    shrinkage = c(0.01, 0.1))
gbmTune <- train(churn ~ ., data = churnTrain,
                 method = "gbm",
                 metric = "ROC",
                 verbose = FALSE,
                 trControl = ctrl,
                 tuneGrid = grid)
#note: current caret requires all four gbm tuning parameters in the grid, so the call above errors until n.minobsinnode is added:
ctrl <- trainControl(method = "repeatedcv", repeats = 5,
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary)
grid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
                    n.trees = seq(100, 1000, by = 50),
                    shrinkage = c(0.01, 0.1),
                    n.minobsinnode = 10)
gbmTune <- train(churn ~ ., data = churnTrain,
                 method = "gbm",
                 metric = "ROC",
                 verbose = FALSE,
                 trControl = ctrl,
                 tuneGrid = grid)
print(gbmTune)
## Stochastic Gradient Boosting
##
## 3333 samples
## 19 predictor
## 2 classes: 'yes', 'no'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 2999, 2999, 3000, 2999, 3000, 3000, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees ROC Sens Spec
## 0.01 1 100 0.8461981 0.008256803 0.9992281
## 0.01 1 150 0.8546843 0.028103741 0.9977544
## 0.01 1 200 0.8566195 0.058750000 0.9933333
## 0.01 1 250 0.8586904 0.082372449 0.9910175
## 0.01 1 300 0.8590634 0.098503401 0.9889123
## 0.01 1 350 0.8620690 0.118818027 0.9865965
## 0.01 1 400 0.8630946 0.160654762 0.9832982
## 0.01 1 450 0.8643686 0.183044218 0.9818246
## 0.01 1 500 0.8651805 0.194217687 0.9801404
## 0.01 1 550 0.8659168 0.200858844 0.9799298
## 0.01 1 600 0.8670683 0.207882653 0.9788070
## 0.01 1 650 0.8675877 0.219489796 0.9781754
## 0.01 1 700 0.8682437 0.232338435 0.9775439
## 0.01 1 750 0.8688993 0.247678571 0.9767018
## 0.01 1 800 0.8693165 0.259294218 0.9758596
## 0.01 1 850 0.8700491 0.267993197 0.9752982
## 0.01 1 900 0.8707730 0.277125850 0.9745965
## 0.01 1 950 0.8711186 0.284982993 0.9734035
## 0.01 1 1000 0.8717472 0.292040816 0.9725614
## 0.01 3 100 0.8909088 0.023188776 1.0000000
## 0.01 3 150 0.8972161 0.180994898 0.9987368
## 0.01 3 200 0.9026482 0.353265306 0.9952982
## 0.01 3 250 0.9049503 0.466224490 0.9932632
## 0.01 3 300 0.9073634 0.597134354 0.9925614
## 0.01 3 350 0.9094271 0.621590136 0.9919298
## 0.01 3 400 0.9106166 0.632789116 0.9914386
## 0.01 3 450 0.9119150 0.642295918 0.9910175
## 0.01 3 500 0.9124010 0.655552721 0.9910877
## 0.01 3 550 0.9136894 0.666743197 0.9912982
## 0.01 3 600 0.9141278 0.672534014 0.9912281
## 0.01 3 650 0.9144034 0.680824830 0.9913684
## 0.01 3 700 0.9149327 0.687440476 0.9915789
## 0.01 3 750 0.9152928 0.694481293 0.9915789
## 0.01 3 800 0.9159243 0.701522109 0.9915789
## 0.01 3 850 0.9162725 0.707755102 0.9912982
## 0.01 3 900 0.9165091 0.716037415 0.9910175
## 0.01 3 950 0.9168537 0.719345238 0.9908070
## 0.01 3 1000 0.9174025 0.722670068 0.9907368
## 0.01 5 100 0.9017963 0.269574830 0.9967018
## 0.01 5 150 0.9079121 0.525476190 0.9948772
## 0.01 5 200 0.9107058 0.600493197 0.9935439
## 0.01 5 250 0.9111579 0.632797619 0.9928421
## 0.01 5 300 0.9137708 0.649795918 0.9926316
## 0.01 5 350 0.9143629 0.669243197 0.9924912
## 0.01 5 400 0.9157668 0.684098639 0.9922807
## 0.01 5 450 0.9158892 0.696122449 0.9924912
## 0.01 5 500 0.9161578 0.707729592 0.9927719
## 0.01 5 550 0.9165144 0.718095238 0.9925614
## 0.01 5 600 0.9166529 0.722619048 0.9923509
## 0.01 5 650 0.9167992 0.726768707 0.9922807
## 0.01 5 700 0.9173938 0.730918367 0.9923509
## 0.01 5 750 0.9173420 0.734226190 0.9922807
## 0.01 5 800 0.9179530 0.737125850 0.9922105
## 0.01 5 850 0.9178821 0.739642857 0.9921404
## 0.01 5 900 0.9178731 0.744200680 0.9923509
## 0.01 5 950 0.9178667 0.745025510 0.9922807
## 0.01 5 1000 0.9178358 0.748324830 0.9922807
## 0.01 7 100 0.9079063 0.523384354 0.9956491
## 0.01 7 150 0.9132710 0.590552721 0.9946667
## 0.01 7 200 0.9138385 0.624549320 0.9941053
## 0.01 7 250 0.9165410 0.654727891 0.9935439
## 0.01 7 300 0.9170727 0.676241497 0.9930526
## 0.01 7 350 0.9173536 0.698630952 0.9927719
## 0.01 7 400 0.9171845 0.706887755 0.9927719
## 0.01 7 450 0.9170545 0.719753401 0.9929825
## 0.01 7 500 0.9170975 0.730544218 0.9926316
## 0.01 7 550 0.9170182 0.733852041 0.9926316
## 0.01 7 600 0.9171606 0.737108844 0.9925614
## 0.01 7 650 0.9168902 0.741658163 0.9924211
## 0.01 7 700 0.9170749 0.744557823 0.9923509
## 0.01 7 750 0.9171558 0.746224490 0.9923509
## 0.01 7 800 0.9171355 0.749107143 0.9921404
## 0.01 7 850 0.9168739 0.751573129 0.9920702
## 0.01 7 900 0.9169372 0.752831633 0.9917193
## 0.01 7 950 0.9168490 0.752006803 0.9917193
## 0.01 7 1000 0.9163790 0.751581633 0.9915088
## 0.10 1 100 0.8718703 0.295756803 0.9722807
## 0.10 1 150 0.8738238 0.340858844 0.9690526
## 0.10 1 200 0.8751538 0.369897959 0.9654737
## 0.10 1 250 0.8758499 0.382321429 0.9644211
## 0.10 1 300 0.8756943 0.391819728 0.9637193
## 0.10 1 350 0.8753305 0.401360544 0.9637193
## 0.10 1 400 0.8751134 0.398869048 0.9629474
## 0.10 1 450 0.8740544 0.407568027 0.9623860
## 0.10 1 500 0.8737100 0.415059524 0.9609825
## 0.10 1 550 0.8734029 0.416232993 0.9602105
## 0.10 1 600 0.8732570 0.419600340 0.9599298
## 0.10 1 650 0.8726793 0.420807823 0.9601404
## 0.10 1 700 0.8724335 0.417925170 0.9599298
## 0.10 1 750 0.8717840 0.420433673 0.9592982
## 0.10 1 800 0.8720388 0.422865646 0.9589474
## 0.10 1 850 0.8714425 0.419974490 0.9585965
## 0.10 1 900 0.8706423 0.422500000 0.9585263
## 0.10 1 950 0.8703107 0.422517007 0.9586667
## 0.10 1 1000 0.8697184 0.427100340 0.9587368
## 0.10 3 100 0.9162221 0.719761905 0.9895439
## 0.10 3 150 0.9181797 0.734634354 0.9884912
## 0.10 3 200 0.9181876 0.742100340 0.9877895
## 0.10 3 250 0.9166147 0.743792517 0.9876491
## 0.10 3 300 0.9157307 0.744591837 0.9874386
## 0.10 3 350 0.9151694 0.747049320 0.9877895
## 0.10 3 400 0.9151442 0.742882653 0.9870877
## 0.10 3 450 0.9148876 0.747066327 0.9868070
## 0.10 3 500 0.9143090 0.744158163 0.9865263
## 0.10 3 550 0.9138603 0.743698980 0.9870877
## 0.10 3 600 0.9137892 0.746198980 0.9870175
## 0.10 3 650 0.9134310 0.744957483 0.9868070
## 0.10 3 700 0.9130599 0.743715986 0.9871579
## 0.10 3 750 0.9125879 0.739115646 0.9870175
## 0.10 3 800 0.9127169 0.739540816 0.9873684
## 0.10 3 850 0.9124624 0.737465986 0.9875088
## 0.10 3 900 0.9120293 0.739149660 0.9871579
## 0.10 3 950 0.9122754 0.738715986 0.9872281
## 0.10 3 1000 0.9117295 0.736666667 0.9875088
## 0.10 5 100 0.9175928 0.743316327 0.9910877
## 0.10 5 150 0.9159789 0.749940476 0.9905263
## 0.10 5 200 0.9153601 0.748639456 0.9897544
## 0.10 5 250 0.9149472 0.751938776 0.9899649
## 0.10 5 300 0.9149078 0.751530612 0.9891930
## 0.10 5 350 0.9148999 0.752738095 0.9890526
## 0.10 5 400 0.9143009 0.755221088 0.9894737
## 0.10 5 450 0.9139375 0.752755102 0.9889825
## 0.10 5 500 0.9138000 0.750272109 0.9888421
## 0.10 5 550 0.9139463 0.749073129 0.9891228
## 0.10 5 600 0.9131624 0.749090136 0.9886316
## 0.10 5 650 0.9134728 0.747423469 0.9885614
## 0.10 5 700 0.9131046 0.744540816 0.9882807
## 0.10 5 750 0.9129605 0.745790816 0.9887719
## 0.10 5 800 0.9129375 0.745773810 0.9886316
## 0.10 5 850 0.9129241 0.747406463 0.9885614
## 0.10 5 900 0.9127498 0.744498299 0.9890526
## 0.10 5 950 0.9130856 0.744906463 0.9887719
## 0.10 5 1000 0.9128339 0.748664966 0.9888421
## 0.10 7 100 0.9164922 0.741275510 0.9903860
## 0.10 7 150 0.9149505 0.749965986 0.9887018
## 0.10 7 200 0.9150859 0.749557823 0.9884211
## 0.10 7 250 0.9151614 0.749940476 0.9884211
## 0.10 7 300 0.9145190 0.743690476 0.9886316
## 0.10 7 350 0.9138952 0.739974490 0.9885614
## 0.10 7 400 0.9133353 0.743316327 0.9888421
## 0.10 7 450 0.9134895 0.741641156 0.9884211
## 0.10 7 500 0.9127745 0.744574830 0.9882105
## 0.10 7 550 0.9128931 0.744132653 0.9884211
## 0.10 7 600 0.9129918 0.742049320 0.9888421
## 0.10 7 650 0.9129074 0.743724490 0.9890526
## 0.10 7 700 0.9129720 0.742066327 0.9890526
## 0.10 7 750 0.9128330 0.745765306 0.9890526
## 0.10 7 800 0.9128972 0.746190476 0.9889123
## 0.10 7 850 0.9127248 0.746207483 0.9891930
## 0.10 7 900 0.9129390 0.746181973 0.9891228
## 0.10 7 950 0.9128828 0.747431973 0.9887018
## 0.10 7 1000 0.9132160 0.747431973 0.9887719
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 200,
## interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
#final model: n.trees = 200, interaction.depth = 3, shrinkage = 0.1, n.minobsinnode = 10
#an interaction.depth of 3 indicates a moderately deep tree
#shrinkage is the learning rate: 0.1 is a "fast" learner, 0.01 is a "slow" learner
#if you want to try other parameters, you don't have to rerun the resampling; just update() the fit (sketch below)
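#a minimal sketch of update(): refit the final model at new tuning values without
#redoing the resampling (the values and the name gbmTune2 are illustrative)
gbmTune2 <- update(gbmTune, param = data.frame(n.trees = 500,
                                               interaction.depth = 3,
                                               shrinkage = 0.01,
                                               n.minobsinnode = 10))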
#plot(gbmTune) #lattice version of the tuning profile
library(ggplot2)
ggplot(gbmTune) + theme(legend.position = "top")
#"boosting iterations" on the x-axis is n.trees
#in the shrinkage = 0.1 panel, performance peaks with relatively few trees
#gbmTune$finalModel holds the underlying fitted gbm object
gbmPred <- predict(gbmTune, churnTest)
str(gbmPred)
## Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
gbmProbs <- predict(gbmTune, churnTest, type = "prob")
str(gbmProbs)
## 'data.frame': 1667 obs. of 2 variables:
## $ yes: num 0.0327 0.0811 0.3858 0.0257 0.0264 ...
## $ no : num 0.967 0.919 0.614 0.974 0.974 ...
library(caret)
confusionMatrix(gbmPred, churnTest$churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 157 9
## no 67 1434
##
## Accuracy : 0.9544
## 95% CI : (0.9433, 0.9639)
## No Information Rate : 0.8656
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.78
## Mcnemar's Test P-Value : 6.22e-11
##
## Sensitivity : 0.70089
## Specificity : 0.99376
## Pos Pred Value : 0.94578
## Neg Pred Value : 0.95536
## Prevalence : 0.13437
## Detection Rate : 0.09418
## Detection Prevalence : 0.09958
## Balanced Accuracy : 0.84733
##
## 'Positive' Class : yes
##
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
rocCurve <- roc(response = churnTest$churn,
                predictor = gbmProbs[, "yes"],
                levels = rev(levels(churnTest$churn))) #levels gives controls ("no") first, then cases ("yes")
rocCurve
##
## Call:
## roc.default(response = churnTest$churn, predictor = gbmProbs[, "yes"], levels = rev(levels(churnTest$churn)))
##
## Data: gbmProbs[, "yes"] in 1443 controls (churnTest$churn no) < 224 cases (churnTest$churn yes).
## Area under the curve: 0.9257
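#optional: pROC can also attach a confidence interval to the AUC
ci.auc(rocCurve)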
plot(rocCurve,
     print.thres = c(.5, .2), #mark the nominal 50% cutoff and a 20% cutoff
     print.thres.pch = 16,
     print.thres.cex = 1.2)
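#sketch: re-classifying with the 20% cutoff shown on the curve trades specificity
#for sensitivity (the cutoff here is illustrative, not tuned)
gbmPred20 <- factor(ifelse(gbmProbs[, "yes"] >= .2, "yes", "no"),
                    levels = levels(churnTest$churn))
confusionMatrix(gbmPred20, churnTest$churn)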