load(".RData")
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
detectCores()
## [1] 4
registerDoParallel(cores=4)
library(C50)
data(churn) #loads churnTrain and churnTest
str(churnTrain)
## 'data.frame':    3333 obs. of  20 variables:
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ account_length               : int  128 107 137 84 75 118 121 147 117 141 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num  265 162 243 299 167 ...
##  $ total_day_calls              : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num  245 254 163 197 187 ...
##  $ total_night_calls            : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
predictors <- names(churnTrain)[names(churnTrain) != "churn"]
library(caret)
set.seed(1)
allData <- rbind(churnTrain, churnTest) #recombine the two C50 splits so we can make our own partition
inTrainingSet <- createDataPartition(allData$churn, p = .75, list = FALSE) #returns an index of row numbers
churnTrain <- allData[inTrainingSet,]
churnTest <- allData[-inTrainingSet,]
#also see createFolds (k-fold CV), createMultiFolds (repeated k-fold), and createResample (bootstrapping); a sketch follows
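#a minimal sketch of those helpers (object names are illustrative; not run here):
#cvFolds <- createFolds(churnTrain$churn, k = 10, list = TRUE)   #list of 10 held-out index sets
#bootIdx <- createResample(churnTrain$churn, times = 10)         #list of 10 bootstrap index sets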
numerics <- c("account_length", "total_day_calls", "total_night_calls")
#estimate means, SDs, and Yeo-Johnson lambdas from the training set
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
procValues <- preProcess(churnTrain[,numerics],
                         method = c("center", "scale", "YeoJohnson")) #note: ALL listed methods are applied; the order is fixed internally (transform first, then center, then scale)
str(procValues)
## List of 21
##  $ dim              : int [1:2] 3333 3
##  $ bc               : NULL
##  $ yj               : Named num [1:3] 0.89 1.169 0.934
##   ..- attr(*, "names")= chr [1:3] "account_length" "total_day_calls" "total_night_calls"
##  $ et               : NULL
##  $ invHyperbolicSine: NULL
##  $ mean             : Named num [1:3] 67.3 189.2 78.6
##   ..- attr(*, "names")= chr [1:3] "account_length" "total_day_calls" "total_night_calls"
##  $ std              : Named num [1:3] 24.2 43.5 14.4
##   ..- attr(*, "names")= chr [1:3] "account_length" "total_day_calls" "total_night_calls"
##  $ ranges           : NULL
##  $ rotation         : NULL
##  $ method           :List of 4
##   ..$ center    : chr [1:3] "account_length" "total_day_calls" "total_night_calls"
##   ..$ scale     : chr [1:3] "account_length" "total_day_calls" "total_night_calls"
##   ..$ YeoJohnson: chr [1:3] "account_length" "total_day_calls" "total_night_calls"
##   ..$ ignore    : chr(0) 
##  $ thresh           : num 0.95
##  $ pcaComp          : NULL
##  $ numComp          : NULL
##  $ ica              : NULL
##  $ wildcards        :List of 2
##   ..$ PCA: chr(0) 
##   ..$ ICA: chr(0) 
##  $ k                : num 5
##  $ knnSummary       :function (x, ...)  
##  $ bagImp           : NULL
##  $ median           : NULL
##  $ data             : NULL
##  $ rangeBounds      : num [1:2] 0 1
##  - attr(*, "class")= chr "preProcess"
procValues
## Created from 3333 samples and 3 variables
## 
## Pre-processing:
##   - centered (3)
##   - ignored (0)
##   - scaled (3)
##   - Yeo-Johnson transformation (3)
## 
## Lambda estimates for Yeo-Johnson transformation:
## 0.89, 1.17, 0.93
#use the predict method to apply the transformations
trainScaled <- predict(procValues, churnTrain[,numerics])
testScaled <- predict(procValues, churnTest[,numerics]) #apply the training-set estimates to the test set
#note: preProcess can also be called from within other functions, e.g. train() (more later); a sketch follows
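#a minimal sketch of the in-function version (assumes the same objects as above; not run here):
#gbmScaled <- train(churn ~ ., data = churnTrain, method = "gbm",
#                   preProcess = c("center", "scale", "YeoJohnson"),
#                   verbose = FALSE)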

"There are three classes of functions: helper functions, training functions, and some feature selection methods."
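#e.g. helpers (createDataPartition, preProcess), training functions (train, trainControl),
#and feature selection wrappers (rfe, sbf)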

#great discussion in the webinar
#train() fits a sequence of candidate models, estimates their performance via resampling, and picks the model with the best performance
#basic method
gbmTune <- train(x=churnTrain[,predictors],
                 y=churnTrain$churn, 
                 method = "gbm")

#or, use formula
gbmTune <- train(churn ~ ., data = churnTrain, method="gbm")

#add an option for the resampling method
ctrl <- trainControl(method = "repeatedcv", repeats = 5) #repeated 10-fold CV; alternatives include the bootstrap ("boot") and plain k-fold CV ("cv")
gbmTune <- train(x=churnTrain[,predictors],
                 y=churnTrain$churn, 
                 method = "gbm",
                 verbose = FALSE, 
                 trControl = ctrl)

#switch the performance summary from accuracy to ROC-based metrics
ctrl <- trainControl(method = "repeatedcv", repeats = 5,
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary) #twoClassSummary gives ROC AUC, sensitivity, and specificity

gbmTune <- train(churn ~ ., data=churnTrain, 
                 method = "gbm",
                 metric = "ROC", #optimizes for ROC (there are others)
                 verbose = FALSE, 
                 trControl = ctrl)

#tuning parameters/search grid; by default, train() tries 3 values of each tuning parameter
ctrl <- trainControl(method = "repeatedcv", repeats = 5,
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary) #twoClassSummary gives ROC AUC, sensitivity, and specificity

grid <- expand.grid(interaction.depth = seq(1,7, by = 2),
                    n.trees = seq(100, 1000, by = 50),
                    shrinkage = c(0.01, 0.1))

gbmTune <- train(churn ~ ., data=churnTrain, 
                 method = "gbm",
                 metric = "ROC", 
                 verbose = FALSE, 
                 trControl = ctrl,
                 tuneGrid = grid)
ctrl <- trainControl(method = "repeatedcv", repeats = 5,
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary) #twoClassSummary gives ROC AUC, sensitivity, and specificity

grid <- expand.grid(interaction.depth = seq(1,7, by = 2),
                    n.trees = seq(100, 1000, by = 50),
                    shrinkage = c(0.01, 0.1),
                    n.minobsinnode = 10) #gbm's fourth tuning parameter; current caret requires it in the grid

gbmTune <- train(churn ~ ., data=churnTrain, 
                 method = "gbm",
                 metric = "ROC", 
                 verbose = FALSE, 
                 trControl = ctrl,
                 tuneGrid = grid)
print(gbmTune)
## Stochastic Gradient Boosting 
## 
## 3333 samples
##   19 predictor
##    2 classes: 'yes', 'no' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 2999, 2999, 3000, 2999, 3000, 3000, ... 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.trees  ROC        Sens         Spec     
##   0.01       1                   100     0.8461981  0.008256803  0.9992281
##   0.01       1                   150     0.8546843  0.028103741  0.9977544
##   0.01       1                   200     0.8566195  0.058750000  0.9933333
##   0.01       1                   250     0.8586904  0.082372449  0.9910175
##   0.01       1                   300     0.8590634  0.098503401  0.9889123
##   0.01       1                   350     0.8620690  0.118818027  0.9865965
##   0.01       1                   400     0.8630946  0.160654762  0.9832982
##   0.01       1                   450     0.8643686  0.183044218  0.9818246
##   0.01       1                   500     0.8651805  0.194217687  0.9801404
##   0.01       1                   550     0.8659168  0.200858844  0.9799298
##   0.01       1                   600     0.8670683  0.207882653  0.9788070
##   0.01       1                   650     0.8675877  0.219489796  0.9781754
##   0.01       1                   700     0.8682437  0.232338435  0.9775439
##   0.01       1                   750     0.8688993  0.247678571  0.9767018
##   0.01       1                   800     0.8693165  0.259294218  0.9758596
##   0.01       1                   850     0.8700491  0.267993197  0.9752982
##   0.01       1                   900     0.8707730  0.277125850  0.9745965
##   0.01       1                   950     0.8711186  0.284982993  0.9734035
##   0.01       1                  1000     0.8717472  0.292040816  0.9725614
##   0.01       3                   100     0.8909088  0.023188776  1.0000000
##   0.01       3                   150     0.8972161  0.180994898  0.9987368
##   0.01       3                   200     0.9026482  0.353265306  0.9952982
##   0.01       3                   250     0.9049503  0.466224490  0.9932632
##   0.01       3                   300     0.9073634  0.597134354  0.9925614
##   0.01       3                   350     0.9094271  0.621590136  0.9919298
##   0.01       3                   400     0.9106166  0.632789116  0.9914386
##   0.01       3                   450     0.9119150  0.642295918  0.9910175
##   0.01       3                   500     0.9124010  0.655552721  0.9910877
##   0.01       3                   550     0.9136894  0.666743197  0.9912982
##   0.01       3                   600     0.9141278  0.672534014  0.9912281
##   0.01       3                   650     0.9144034  0.680824830  0.9913684
##   0.01       3                   700     0.9149327  0.687440476  0.9915789
##   0.01       3                   750     0.9152928  0.694481293  0.9915789
##   0.01       3                   800     0.9159243  0.701522109  0.9915789
##   0.01       3                   850     0.9162725  0.707755102  0.9912982
##   0.01       3                   900     0.9165091  0.716037415  0.9910175
##   0.01       3                   950     0.9168537  0.719345238  0.9908070
##   0.01       3                  1000     0.9174025  0.722670068  0.9907368
##   0.01       5                   100     0.9017963  0.269574830  0.9967018
##   0.01       5                   150     0.9079121  0.525476190  0.9948772
##   0.01       5                   200     0.9107058  0.600493197  0.9935439
##   0.01       5                   250     0.9111579  0.632797619  0.9928421
##   0.01       5                   300     0.9137708  0.649795918  0.9926316
##   0.01       5                   350     0.9143629  0.669243197  0.9924912
##   0.01       5                   400     0.9157668  0.684098639  0.9922807
##   0.01       5                   450     0.9158892  0.696122449  0.9924912
##   0.01       5                   500     0.9161578  0.707729592  0.9927719
##   0.01       5                   550     0.9165144  0.718095238  0.9925614
##   0.01       5                   600     0.9166529  0.722619048  0.9923509
##   0.01       5                   650     0.9167992  0.726768707  0.9922807
##   0.01       5                   700     0.9173938  0.730918367  0.9923509
##   0.01       5                   750     0.9173420  0.734226190  0.9922807
##   0.01       5                   800     0.9179530  0.737125850  0.9922105
##   0.01       5                   850     0.9178821  0.739642857  0.9921404
##   0.01       5                   900     0.9178731  0.744200680  0.9923509
##   0.01       5                   950     0.9178667  0.745025510  0.9922807
##   0.01       5                  1000     0.9178358  0.748324830  0.9922807
##   0.01       7                   100     0.9079063  0.523384354  0.9956491
##   0.01       7                   150     0.9132710  0.590552721  0.9946667
##   0.01       7                   200     0.9138385  0.624549320  0.9941053
##   0.01       7                   250     0.9165410  0.654727891  0.9935439
##   0.01       7                   300     0.9170727  0.676241497  0.9930526
##   0.01       7                   350     0.9173536  0.698630952  0.9927719
##   0.01       7                   400     0.9171845  0.706887755  0.9927719
##   0.01       7                   450     0.9170545  0.719753401  0.9929825
##   0.01       7                   500     0.9170975  0.730544218  0.9926316
##   0.01       7                   550     0.9170182  0.733852041  0.9926316
##   0.01       7                   600     0.9171606  0.737108844  0.9925614
##   0.01       7                   650     0.9168902  0.741658163  0.9924211
##   0.01       7                   700     0.9170749  0.744557823  0.9923509
##   0.01       7                   750     0.9171558  0.746224490  0.9923509
##   0.01       7                   800     0.9171355  0.749107143  0.9921404
##   0.01       7                   850     0.9168739  0.751573129  0.9920702
##   0.01       7                   900     0.9169372  0.752831633  0.9917193
##   0.01       7                   950     0.9168490  0.752006803  0.9917193
##   0.01       7                  1000     0.9163790  0.751581633  0.9915088
##   0.10       1                   100     0.8718703  0.295756803  0.9722807
##   0.10       1                   150     0.8738238  0.340858844  0.9690526
##   0.10       1                   200     0.8751538  0.369897959  0.9654737
##   0.10       1                   250     0.8758499  0.382321429  0.9644211
##   0.10       1                   300     0.8756943  0.391819728  0.9637193
##   0.10       1                   350     0.8753305  0.401360544  0.9637193
##   0.10       1                   400     0.8751134  0.398869048  0.9629474
##   0.10       1                   450     0.8740544  0.407568027  0.9623860
##   0.10       1                   500     0.8737100  0.415059524  0.9609825
##   0.10       1                   550     0.8734029  0.416232993  0.9602105
##   0.10       1                   600     0.8732570  0.419600340  0.9599298
##   0.10       1                   650     0.8726793  0.420807823  0.9601404
##   0.10       1                   700     0.8724335  0.417925170  0.9599298
##   0.10       1                   750     0.8717840  0.420433673  0.9592982
##   0.10       1                   800     0.8720388  0.422865646  0.9589474
##   0.10       1                   850     0.8714425  0.419974490  0.9585965
##   0.10       1                   900     0.8706423  0.422500000  0.9585263
##   0.10       1                   950     0.8703107  0.422517007  0.9586667
##   0.10       1                  1000     0.8697184  0.427100340  0.9587368
##   0.10       3                   100     0.9162221  0.719761905  0.9895439
##   0.10       3                   150     0.9181797  0.734634354  0.9884912
##   0.10       3                   200     0.9181876  0.742100340  0.9877895
##   0.10       3                   250     0.9166147  0.743792517  0.9876491
##   0.10       3                   300     0.9157307  0.744591837  0.9874386
##   0.10       3                   350     0.9151694  0.747049320  0.9877895
##   0.10       3                   400     0.9151442  0.742882653  0.9870877
##   0.10       3                   450     0.9148876  0.747066327  0.9868070
##   0.10       3                   500     0.9143090  0.744158163  0.9865263
##   0.10       3                   550     0.9138603  0.743698980  0.9870877
##   0.10       3                   600     0.9137892  0.746198980  0.9870175
##   0.10       3                   650     0.9134310  0.744957483  0.9868070
##   0.10       3                   700     0.9130599  0.743715986  0.9871579
##   0.10       3                   750     0.9125879  0.739115646  0.9870175
##   0.10       3                   800     0.9127169  0.739540816  0.9873684
##   0.10       3                   850     0.9124624  0.737465986  0.9875088
##   0.10       3                   900     0.9120293  0.739149660  0.9871579
##   0.10       3                   950     0.9122754  0.738715986  0.9872281
##   0.10       3                  1000     0.9117295  0.736666667  0.9875088
##   0.10       5                   100     0.9175928  0.743316327  0.9910877
##   0.10       5                   150     0.9159789  0.749940476  0.9905263
##   0.10       5                   200     0.9153601  0.748639456  0.9897544
##   0.10       5                   250     0.9149472  0.751938776  0.9899649
##   0.10       5                   300     0.9149078  0.751530612  0.9891930
##   0.10       5                   350     0.9148999  0.752738095  0.9890526
##   0.10       5                   400     0.9143009  0.755221088  0.9894737
##   0.10       5                   450     0.9139375  0.752755102  0.9889825
##   0.10       5                   500     0.9138000  0.750272109  0.9888421
##   0.10       5                   550     0.9139463  0.749073129  0.9891228
##   0.10       5                   600     0.9131624  0.749090136  0.9886316
##   0.10       5                   650     0.9134728  0.747423469  0.9885614
##   0.10       5                   700     0.9131046  0.744540816  0.9882807
##   0.10       5                   750     0.9129605  0.745790816  0.9887719
##   0.10       5                   800     0.9129375  0.745773810  0.9886316
##   0.10       5                   850     0.9129241  0.747406463  0.9885614
##   0.10       5                   900     0.9127498  0.744498299  0.9890526
##   0.10       5                   950     0.9130856  0.744906463  0.9887719
##   0.10       5                  1000     0.9128339  0.748664966  0.9888421
##   0.10       7                   100     0.9164922  0.741275510  0.9903860
##   0.10       7                   150     0.9149505  0.749965986  0.9887018
##   0.10       7                   200     0.9150859  0.749557823  0.9884211
##   0.10       7                   250     0.9151614  0.749940476  0.9884211
##   0.10       7                   300     0.9145190  0.743690476  0.9886316
##   0.10       7                   350     0.9138952  0.739974490  0.9885614
##   0.10       7                   400     0.9133353  0.743316327  0.9888421
##   0.10       7                   450     0.9134895  0.741641156  0.9884211
##   0.10       7                   500     0.9127745  0.744574830  0.9882105
##   0.10       7                   550     0.9128931  0.744132653  0.9884211
##   0.10       7                   600     0.9129918  0.742049320  0.9888421
##   0.10       7                   650     0.9129074  0.743724490  0.9890526
##   0.10       7                   700     0.9129720  0.742066327  0.9890526
##   0.10       7                   750     0.9128330  0.745765306  0.9890526
##   0.10       7                   800     0.9128972  0.746190476  0.9889123
##   0.10       7                   850     0.9127248  0.746207483  0.9891930
##   0.10       7                   900     0.9129390  0.746181973  0.9891228
##   0.10       7                   950     0.9128828  0.747431973  0.9887018
##   0.10       7                  1000     0.9132160  0.747431973  0.9887719
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 200,
##  interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
#the selected model used n.trees = 200, interaction.depth = 3, and shrinkage = 0.1
#an interaction.depth of 3 indicates a moderately deep tree
#shrinkage is the learning rate: 0.1 is a "fast" learner, 0.01 is a "slow" learner
#if you want to adjust the tuning parameters, you don't have to rerun the resampling - just update the fit; a sketch follows
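#a minimal sketch using caret's update() method (parameter values here are illustrative):
#gbmTune2 <- update(gbmTune, param = list(n.trees = 100, interaction.depth = 3,
#                                         shrinkage = 0.1, n.minobsinnode = 10))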
#plot(gbmTune)
library(ggplot2)
ggplot(gbmTune) + theme(legend.position = "top")

#"boosting iterations" on the x-axis is n.trees
#look at the shrinkage = 0.1 panel (top right): the ROC metric peaks with relatively few trees
#gbmTune$finalModel #the underlying gbm fit object
gbmPred <- predict(gbmTune, churnTest)
str(gbmPred)
##  Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
gbmProbs <- predict(gbmTune, churnTest, type = "prob")
str(gbmProbs)
## 'data.frame':    1667 obs. of  2 variables:
##  $ yes: num  0.0327 0.0811 0.3858 0.0257 0.0264 ...
##  $ no : num  0.967 0.919 0.614 0.974 0.974 ...
library(caret)
confusionMatrix(gbmPred, churnTest$churn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  yes   no
##        yes  157    9
##        no    67 1434
##                                           
##                Accuracy : 0.9544          
##                  95% CI : (0.9433, 0.9639)
##     No Information Rate : 0.8656          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.78            
##  Mcnemar's Test P-Value : 6.22e-11        
##                                           
##             Sensitivity : 0.70089         
##             Specificity : 0.99376         
##          Pos Pred Value : 0.94578         
##          Neg Pred Value : 0.95536         
##              Prevalence : 0.13437         
##          Detection Rate : 0.09418         
##    Detection Prevalence : 0.09958         
##       Balanced Accuracy : 0.84733         
##                                           
##        'Positive' Class : yes             
## 
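#for precision, recall, and F1 alongside the statistics above (a sketch; output not shown):
#confusionMatrix(gbmPred, churnTest$churn, mode = "everything")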
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
rocCurve <- roc(response = churnTest$churn, 
                predictor = gbmProbs[, "yes"],
                levels = rev(levels(churnTest$churn))) #pROC expects levels as c(control, case); churn's levels are c("yes", "no")

rocCurve
## 
## Call:
## roc.default(response = churnTest$churn, predictor = gbmProbs[,     "yes"], levels = rev(levels(churnTest$churn)))
## 
## Data: gbmProbs[, "yes"] in 1443 controls (churnTest$churn no) < 224 cases (churnTest$churn yes).
## Area under the curve: 0.9257
plot(rocCurve,
     print.thres = c(.5, .2), #mark the nominal 50% cutoff and an alternative 20% cutoff
     print.thres.pch = 16,
     print.thres.cex = 1.2)
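#possible follow-ups with pROC (a sketch; output not shown):
#auc(rocCurve)            #just the area under the curve
#ci.auc(rocCurve)         #DeLong confidence interval for the AUC
#coords(rocCurve, "best") #threshold maximizing sensitivity + specificity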