#MAX KUHN PRACTICE: bank marketing data with caret, rpart, and gbm
#load the bank data
load("~/Documents/NYDSA/Max Kuhn/Dataset/bank-full-axws.RData")
raw <- read.csv("/Users/SKSN/Documents/NYDSA/Max Kuhn/Dataset/RAW")
raw$y <- factor(as.character(raw$y), levels = c("yes", "no")) #put the event level ("yes") first
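#putting "yes" first matters later: caret's confusionMatrix and twoClassSummary
#treat the first factor level as the event of interest; a quick sanity check (sketch):
stopifnot(levels(raw$y)[1] == "yes")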
#Required libraries
library(caret); library(rpart); library(rattle); library(pROC); library(gbm)
set.seed(100); par(mfrow = c(1, 1))
#create data partition using the caret package: 90% as a training pool, 10% held out
inTrain <- createDataPartition(y = raw$y, p = .9, list = FALSE)
trainPool <- raw[inTrain, ]
#address the class imbalance by training the model on a balanced random sample
YES <- subset(trainPool, y == "yes")
NO <- subset(trainPool, y == "no")
#downsample NO to match the number of YES rows, knowing "yes" is the minority class
subsets <- sample(1:nrow(NO), nrow(YES))
NO.subset <- NO[subsets, ]
training <- rbind(YES, NO.subset)
dim(training); table(training$y)
## [1] 9522 17
##
## yes no
## 4761 4761
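#caret also ships a helper that does the same balancing in one call; a sketch,
#assuming the trainPool split above (yname names the outcome column):
balanced <- downSample(x = trainPool[, names(trainPool) != "y"],
                       y = trainPool$y, yname = "y")
table(balanced$y) #equal yes/no counts, as above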
testing <- raw[-inTrain, ]; table(testing$y) #held-out testing data
##
## yes no
## 528 3992
#DUMMY VARIABLES
dummyInfo <- dummyVars(y ~ ., data = training); dummyInfo
## Dummy Variable Object
##
## Formula: y ~ .
## 17 variables, 10 factors
## Variables and levels will be separated by '.'
## A less than full rank encoding is used
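#the less-than-full-rank encoding keeps one column per factor level, which is what
#creates the redundant dummy pairs filtered out below; dummyVars also takes a
#fullRank argument if reference-level dropping is wanted (a sketch):
dummyFullRank <- dummyVars(y ~ ., data = training, fullRank = TRUE)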
training2 <- predict(dummyInfo, newdata = training); head(training2[, 1:5])
## age job.admin job.blue_collar job.entrepreneur job.housemaid
## 84 59 1 0 0 0
## 87 56 1 0 0 0
## 88 41 0 0 0 0
## 130 55 0 0 0 0
## 169 54 1 0 0 0
## 271 42 0 0 0 0
testing2 <- predict(dummyInfo, newdata = testing) # testing
#Centering and Scaling
(preProcValues <- preProcess(training2, method = c("center", "scale")))
##
## Call:
## preProcess.default(x = training2, method = c("center", "scale"))
##
## Created from 9522 samples and 51 variables
## Pre-processing: centered, scaled
scaledTrain <- predict(preProcValues, newdata = training2)
scaledTest <- predict(preProcValues, newdata = testing2)
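#centering and scaling can instead be folded into train() itself, so the statistics
#are re-estimated within each resample; a sketch with default resampling
#(rpartScaled is a hypothetical name, not used below):
rpartScaled <- train(x = training2, y = training$y, method = "rpart",
                     preProcess = c("center", "scale"))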
#Filtering problematic variables: near-constant columns, highly unbalanced distributions, and redundant correlated predictors
#1
nzv <- nearZeroVar(training2)
colnames(training2)[nzv]
## [1] "job.entrepreneur" "job.housemaid" "job.self_employed"
## [4] "job.student" "job.unemployed" "job.unknown"
## [7] "education.unknown" "default.no" "default.yes"
## [10] "month.dec" "month.jan" "month.mar"
## [13] "month.oct" "month.sep" "pdays"
## [16] "poutcome.other"
training3 <- training2[, -nzv]
testing3 <- testing2[, -nzv]
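#nearZeroVar can also return the diagnostics behind each flag instead of just the
#indices (a sketch):
nzvMetrics <- nearZeroVar(training2, saveMetrics = TRUE)
head(nzvMetrics[nzvMetrics$nzv, ]) #freqRatio and percentUnique for flagged columns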
#2
corMat <- cor(training3)
remove <- findCorrelation(x = corMat, cutoff = .75)
colnames(training3)[remove]
## [1] "contact.unknown" "housing.yes" "marital.single" "loan.yes"
training4 <- training3[, -remove]
testing4 <- testing3[, -remove]
#convert the filtered matrices to data frames for later use:
training4 <- as.data.frame(training4)
testing4 <- as.data.frame(testing4)
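#sanity check: after filtering, no remaining pair of predictors should exceed
#the .75 cutoff (a sketch):
corAfter <- cor(training4)
max(abs(corAfter[upper.tri(corAfter)])) #expected to be below .75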
#1# rpart-tree modeling
rpartFull <- rpart(y ~ ., data = training) #modelA
fancyRpartPlot(rpartFull) # plot a tree using rattle pkg

#prediction
rpartPred <- predict(rpartFull, newdata = testing, type = "class")
confusionMatrix(data = rpartPred, reference = testing$y)$overall["Accuracy"]
## Accuracy
## 0.8088496
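#accuracy alone hides the class-level behavior on this imbalanced test set;
#printing the full object also reports sensitivity, specificity, and Kappa:
confusionMatrix(data = rpartPred, reference = testing$y)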
#2# caret-rpart
#a
cvCtrl <- trainControl(method = "repeatedcv", repeats = 5)
rpartTune <- train(y ~ ., data = training, method = "rpart", tuneLength = 10, trControl = cvCtrl)
#b: tune on ROC instead of accuracy; twoClassSummary needs class probabilities
cvCtrl <- trainControl(method = "repeatedcv", repeats = 5, summaryFunction = twoClassSummary, classProbs = TRUE)
rpartTune2 <- train(y ~ ., data = training, method = "rpart", tuneLength = 10, metric = "ROC", trControl = cvCtrl)
rpartTune2 # ROC table
## CART
##
## 9522 samples
## 16 predictor
## 2 classes: 'yes', 'no'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
##
## Summary of sample sizes: 8570, 8570, 8570, 8569, 8570, 8570, ...
##
## Resampling results across tuning parameters:
##
##   cp           ROC        Sens       Spec       ROC SD      Sens SD     Spec SD
##   0.003276623  0.8655057  0.8252919  0.8214236  0.01527892  0.02358546  0.02196489
##   0.003360639  0.8642560  0.8227709  0.8223054  0.01487138  0.02302246  0.02060344
##   0.004410838  0.8596225  0.8147057  0.8199105  0.01241098  0.02044797  0.02123459
##   0.006091157  0.8560788  0.8000015  0.8268850  0.01127559  0.02621909  0.02192232
##   0.008821676  0.8534113  0.7966408  0.8215937  0.01144223  0.02105091  0.02286362
##   0.010922075  0.8495197  0.7960106  0.8147035  0.01287016  0.01761898  0.02444995
##   0.013127494  0.8432031  0.8032777  0.7964716  0.01470326  0.02879820  0.04294586
##   0.039067423  0.8096951  0.8241542  0.7273282  0.01636162  0.02442830  0.02292278
##   0.046733879  0.7733716  0.8178538  0.6877999  0.04322886  0.01692238  0.05922931
##   0.453266121  0.6029930  0.8696449  0.3363411  0.10864723  0.20253596  0.33530362
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.003276623.
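#the chosen tuning value can also be pulled out programmatically:
rpartTune2$bestTune #one-row data frame holding the selected cp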
rpartTune2$finalModel #results
## n= 9522
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 9522 4761 yes (0.50000000 0.50000000)
## 2) duration>=207.5 5692 1767 yes (0.68956430 0.31043570)
## 4) duration>=518.5 2308 307 yes (0.86698440 0.13301560) *
## 5) duration< 518.5 3384 1460 yes (0.56855792 0.43144208)
## 10) contactunknown< 0.5 2811 951 yes (0.66168623 0.33831377)
## 20) poutcomesuccess>=0.5 519 21 yes (0.95953757 0.04046243) *
## 21) poutcomesuccess< 0.5 2292 930 yes (0.59424084 0.40575916)
## 42) housingyes< 0.5 1465 454 yes (0.69010239 0.30989761)
## 84) loanyes< 0.5 1351 376 yes (0.72168764 0.27831236) *
## 85) loanyes>=0.5 114 36 no (0.31578947 0.68421053) *
## 43) housingyes>=0.5 827 351 no (0.42442563 0.57557437)
## 86) pdays>=373.5 27 0 yes (1.00000000 0.00000000) *
## 87) pdays< 373.5 800 324 no (0.40500000 0.59500000)
## 174) duration>=404.5 197 83 yes (0.57868020 0.42131980) *
## 175) duration< 404.5 603 210 no (0.34825871 0.65174129)
## 350) monthmar>=0.5 23 1 yes (0.95652174 0.04347826) *
## 351) monthmar< 0.5 580 188 no (0.32413793 0.67586207)
## 702) monthjun>=0.5 20 2 yes (0.90000000 0.10000000) *
## 703) monthjun< 0.5 560 170 no (0.30357143 0.69642857) *
## 11) contactunknown>=0.5 573 64 no (0.11169284 0.88830716) *
## 3) duration< 207.5 3830 836 no (0.21827676 0.78172324)
## 6) poutcomesuccess>=0.5 246 30 yes (0.87804878 0.12195122) *
## 7) poutcomesuccess< 0.5 3584 620 no (0.17299107 0.82700893)
## 14) monthmar>=0.5 98 23 yes (0.76530612 0.23469388) *
## 15) monthmar< 0.5 3486 545 no (0.15633964 0.84366036) *
ggplot(rpartTune2) + scale_x_log10() # plot

#caret-rpart-prediction
#1 class
rpartPred2 <- predict(rpartTune2, newdata = testing)
confusionMatrix(rpartPred2, testing$y)$overall["Accuracy"]
## Accuracy
## 0.810177
#2: probability
rpartProbs2 <- predict(rpartTune2, newdata = testing, type = "prob")
head(rpartProbs2, n = 4)
## yes no
## 9 0.1563396 0.8436604
## 23 0.1563396 0.8436604
## 24 0.1116928 0.8883072
## 72 0.1563396 0.8436604
#ROC analysis
testProbsDF <- data.frame(y = testing$y, rpart = rpartProbs2[, "yes"])
head(testProbsDF, n = 4)
## y rpart
## 1 no 0.1563396
## 2 no 0.1563396
## 3 no 0.1116928
## 4 no 0.1563396
# The roc function assumes the *second* level is the one of interest,
# so we use the 'levels' argument to change the order.
rpartROC2 <- roc(response = testProbsDF$y, predictor = testProbsDF$rpart, levels = rev(levels(raw$y)))
# Get the area under the ROC curve
auc(rpartROC2)
## Area under the curve: 0.8405
plot(rpartROC2, legacy.axes = FALSE, type = "S")

##
## Call:
## roc.default(response = testProbsDF$y, predictor = testProbsDF$rpart, levels = rev(levels(raw$y)))
##
## Data: testProbsDF$rpart in 3992 controls (testProbsDF$y no) < 528 cases (testProbsDF$y yes).
## Area under the curve: 0.8405
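#pROC can also suggest a probability cutoff on this curve; x = "best" uses
#Youden's J by default (a sketch):
coords(rpartROC2, x = "best") #threshold with its specificity and sensitivity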
#lift charts
rpartLift2 <- lift(y ~ rpart, data = testProbsDF)
plot(rpartLift2, values = 75) #reference lines: % of samples needed to find 75% of the "yes" events

#count terminal nodes: rpart's print method marks each leaf with a trailing "*"
num_leaves <- function(x) {
  output <- capture.output(print(x$finalModel))
  length(grep("\\*$", output))
}
num_leaves(rpartTune)
## [1] 13
#3# Gradient Boosting Machine (GBM) modeling
forGBM <- training; forGBM$y <- ifelse(forGBM$y == "yes", 1, 0) #gbm's bernoulli loss needs a 0/1 outcome
system.time({
  gbmFit <- gbm(formula = y ~ ., distribution = "bernoulli", #for classification
                data = forGBM, n.trees = 500, interaction.depth = 3,
                shrinkage = 0.1, verbose = FALSE)
})[[3]]
## [1] 5.155
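#the number of trees actually needed can be estimated from the fit itself;
#gbm.perf's out-of-bag estimate tends to be conservative (a sketch):
bestIter <- gbm.perf(gbmFit, method = "OOB", plot.it = FALSE)
bestIter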
#Prediction
gbmPred <- predict(gbmFit, newdata = testing, n.trees = 500, type = "response") #returns probabilities
#gbm needs to be told how many trees to use at prediction time
gbmPred <- factor(ifelse(gbmPred > .5, "yes", "no"), levels = c("yes", "no")) #probabilities back to yes/no
head(gbmPred)
## [1] no no no no no no
## Levels: yes no
confusionMatrix(gbmPred, testing$y)$overall["Accuracy"]
## Accuracy
## 0.8449115
#4# caret-gbm with expand.grid
gbmGrid <- expand.grid(interaction.depth = c(3, 5, 7),
                       n.trees = seq(600, 1000, by = 100),
                       shrinkage = c(0.01, 0.1)) #3 x 5 x 2 = 30 candidate models
gbmTune <- train(y ~ ., data = training, method = "gbm", metric = "ROC",
                 tuneGrid = gbmGrid, trControl = cvCtrl, verbose = FALSE)
gbmTune
## Stochastic Gradient Boosting
##
## 9522 samples
## 16 predictor
## 2 classes: 'yes', 'no'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
##
## Summary of sample sizes: 8570, 8570, 8570, 8570, 8569, 8570, ...
##
## Resampling results across tuning parameters:
##
##   shrinkage  interaction.depth  n.trees  ROC        Sens       Spec       ROC SD       Sens SD     Spec SD
##   0.01       3                   600     0.9124923  0.8571326  0.8240290  0.009224010  0.01626489  0.01790540
##   0.01       3                   700     0.9151618  0.8592754  0.8278095  0.009070030  0.01726175  0.01795409
##   0.01       3                   800     0.9173150  0.8619640  0.8310867  0.009009807  0.01685408  0.01811146
##   0.01       3                   900     0.9191221  0.8644848  0.8333552  0.008954494  0.01623230  0.01815365
##   0.01       3                  1000     0.9205432  0.8669633  0.8357496  0.008955336  0.01554914  0.01813144
##   0.01       5                   600     0.9208429  0.8673831  0.8328508  0.008763987  0.01699763  0.01799369
##   0.01       5                   700     0.9227505  0.8714163  0.8348253  0.008619736  0.01637054  0.01772908
##   0.01       5                   800     0.9241932  0.8728862  0.8377238  0.008513611  0.01556243  0.01717297
##   0.01       5                   900     0.9252711  0.8736417  0.8398243  0.008435253  0.01520649  0.01694587
##   0.01       5                  1000     0.9260905  0.8740619  0.8408746  0.008410602  0.01556006  0.01711245
##   0.01       7                   600     0.9246338  0.8730965  0.8365477  0.008516824  0.01599220  0.01805090
##   0.01       7                   700     0.9260166  0.8741884  0.8387318  0.008387081  0.01463199  0.01782039
##   0.01       7                   800     0.9270306  0.8759524  0.8403705  0.008354274  0.01475382  0.01779051
##   0.01       7                   900     0.9277900  0.8770446  0.8415888  0.008338943  0.01500681  0.01841702
##   0.01       7                  1000     0.9283474  0.8779688  0.8425127  0.008378113  0.01450005  0.01786519
##   0.10       3                   600     0.9288482  0.8794388  0.8464620  0.008665467  0.01371709  0.01835123
##   0.10       3                   700     0.9289032  0.8788506  0.8468398  0.008620076  0.01347614  0.01869131
##   0.10       3                   800     0.9287098  0.8790187  0.8468819  0.008673132  0.01329762  0.01808075
##   0.10       3                   900     0.9286284  0.8800684  0.8472171  0.008712339  0.01329641  0.01804649
##   0.10       3                  1000     0.9283657  0.8789335  0.8470499  0.008686267  0.01338741  0.01830660
##   0.10       5                   600     0.9285042  0.8802361  0.8463355  0.009119058  0.01444433  0.01868660
##   0.10       5                   700     0.9281847  0.8801107  0.8472592  0.009349265  0.01506489  0.01814584
##   0.10       5                   800     0.9280964  0.8785986  0.8471337  0.009320601  0.01425950  0.01961081
##   0.10       5                   900     0.9276187  0.8787667  0.8483092  0.009369041  0.01323630  0.02014674
##   0.10       5                  1000     0.9271807  0.8786817  0.8473010  0.009340413  0.01231319  0.01901648
##   0.10       7                   600     0.9275151  0.8814546  0.8449910  0.009145292  0.01322598  0.01905469
##   0.10       7                   700     0.9270399  0.8807824  0.8467127  0.009231870  0.01414327  0.01949647
##   0.10       7                   800     0.9264532  0.8806982  0.8463352  0.009159798  0.01355074  0.01903808
##   0.10       7                   900     0.9260176  0.8795641  0.8452008  0.009249301  0.01295540  0.01871902
##   0.10       7                  1000     0.9254566  0.8794803  0.8453685  0.009385694  0.01304444  0.01917653
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 700,
## interaction.depth = 3 and shrinkage = 0.1.
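#caret can also rank the predictors driving the tuned boosted model (a sketch):
varImp(gbmTune) #gbm relative influence, rescaled to 0-100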
ggplot(gbmTune)

#caret-gbm-prediction
gbmPred <- predict(gbmTune, newdata = testing) #class
confusionMatrix(gbmPred, testing$y)$overall["Accuracy"]
## Accuracy
## 0.8449115
gbmProbs <- predict(gbmTune, newdata = testing, type = "prob") # probabilities
testProbsDF$gbm <- gbmProbs[, "yes"]
head(testProbsDF)
## y rpart gbm
## 1 no 0.1563396 0.007187026
## 2 no 0.1563396 0.013061271
## 3 no 0.1116928 0.041071705
## 4 no 0.1563396 0.011378348
## 5 no 0.1563396 0.025125842
## 6 no 0.1563396 0.029774636
gbmLift <- lift(y ~ rpart + gbm, data = testProbsDF) # lift chart
gbmLift
##
## Call:
## lift.formula(x = y ~ rpart + gbm, data = testProbsDF)
##
## Models: rpart, gbm
## Event: yes (11.7%)
plot(gbmLift, auto.key = list(lines = TRUE, points = FALSE))

#Boosted tree ROC curve
gbmROC <- roc(response = testProbsDF$y, predictor = testProbsDF$gbm, levels = rev(levels(raw$y)))
auc(gbmROC)
## Area under the curve: 0.9233
plot(rpartROC2, col = "#9E0142", legacy.axes = FALSE, type = "S")
##
## Call:
## roc.default(response = testProbsDF$y, predictor = testProbsDF$rpart, levels = rev(levels(raw$y)))
##
## Data: testProbsDF$rpart in 3992 controls (testProbsDF$y no) < 528 cases (testProbsDF$y yes).
## Area under the curve: 0.8405
plot(gbmROC, col = "#3288BD", legacy.axes = FALSE, add = TRUE)
##
## Call:
## roc.default(response = testProbsDF$y, predictor = testProbsDF$gbm, levels = rev(levels(raw$y)))
##
## Data: testProbsDF$gbm in 3992 controls (testProbsDF$y no) < 528 cases (testProbsDF$y yes).
## Area under the curve: 0.9233
legend(.6, .5, legend = c("rpart", "gbm"), lty = c(1, 1), col = c("#9E0142", "#3288BD"))

#compare the two tuned models on their cross-validation resamples
resamps <- resamples(list(CART = rpartTune2, GBM = gbmTune))
summary(resamps)
##
## Call:
## summary.resamples(object = resamps)
##
## Models: CART, GBM
## Number of resamples: 50
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.8399 0.8539 0.8626 0.8655 0.8767 0.9055 0
## GBM 0.9085 0.9238 0.9291 0.9289 0.9354 0.9456 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.7752 0.8099 0.8248 0.8253 0.8385 0.8761 0
## GBM 0.8386 0.8697 0.8792 0.8789 0.8866 0.9076 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.7752 0.8051 0.8237 0.8214 0.8356 0.8718 0
## GBM 0.8025 0.8367 0.8508 0.8468 0.8592 0.8739 0
xyplot(resamps)
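#because resamples() pairs the 50 resampling estimates, the model differences
#can also be tested formally (a sketch):
difValues <- diff(resamps)
summary(difValues) #paired t-tests on the ROC, Sens, and Spec differences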
