# MAX KUHN PRACTICE: CART and GBM with caret on the bank marketing data
# Load the bank marketing data
load("~/Documents/NYDSA/Max Kuhn/Dataset/bank-full-axws.RData")
raw <- read.csv("/Users/SKSN/Documents/NYDSA/Max Kuhn/Dataset/RAW")
raw$y <- factor(as.character(raw$y), levels = c("yes", "no")) # put the event level ("yes") first
# Required libraries
library(caret); library(rpart); library(rattle); library(pROC); library(gbm)
set.seed(100); par(mfrow = c(1, 1))
# Create a data partition with caret: 90% as a pool for building a balanced
# training set, 10% held out for testing
inTrain <- createDataPartition(y = raw$y, p = .9, list = FALSE)
trainPool <- raw[inTrain, ]
# To deal with the class imbalance, train the model on a balanced random sample:
YES <- subset(trainPool, trainPool$y == "yes")
NO  <- subset(trainPool, trainPool$y == "no")
# downsample NO to the number of YES rows ("yes" is the minority class)
subsets <- sample(1:nrow(NO), nrow(YES))
NO.subset <- NO[subsets, ]
training <- rbind(YES, NO.subset)
dim(training); table(training$y)
## [1] 9522   17
## 
##  yes   no 
## 4761 4761
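# caret's downSample() helper builds the same balanced set in one step; a
# minimal equivalent sketch (the 'balanced' name is just for illustration):
balanced <- downSample(x = trainPool[, names(trainPool) != "y"],
                       y = trainPool$y, yname = "y")
table(balanced$y) # also 4761 yes / 4761 no, just different random rows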
testing <- raw[-inTrain,]; table(testing$y)  # held-out testing data (10%)
## 
##  yes   no 
##  528 3992
# DUMMY VARIABLES: expand the factors into binary indicator columns
dummyInfo <- dummyVars(y ~ ., data = training); dummyInfo
## Dummy Variable Object
## 
## Formula: y ~ .
## 17 variables, 10 factors
## Variables and levels will be separated by '.'
## A less than full rank encoding is used
training2 <- predict(dummyInfo, newdata = training); head(training2[, 1:5])
##     age job.admin job.blue_collar job.entrepreneur job.housemaid
## 84   59         1               0                0             0
## 87   56         1               0                0             0
## 88   41         0               0                0             0
## 130  55         0               0                0             0
## 169  54         1               0                0             0
## 271  42         0               0                0             0
testing2 <- predict(dummyInfo, newdata = testing) # testing
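# The encoding above is less than full rank, so one dummy column per factor is
# redundant (e.g. contact.unknown, which the correlation filter flags below).
# A sketch of the full-rank alternative ('dummyFullRank' is an illustrative name):
dummyFullRank <- dummyVars(y ~ ., data = training, fullRank = TRUE)
# predict(dummyFullRank, newdata = training) would drop one level per factor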
#Centering and Scaling
(preProcValues <- preProcess(training2, method = c("center", "scale")))
## 
## Call:
## preProcess.default(x = training2, method = c("center", "scale"))
## 
## Created from 9522 samples and 51 variables
## Pre-processing: centered, scaled
scaledTrain <- predict(preProcValues, newdata = training2); scaledTest  <- predict(preProcValues, newdata = testing2)
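# Note that the tree models below are fit on the raw factors; centering and
# scaling matters mostly for distance-based learners. train() can also
# re-estimate the preprocessing inside each resample via its preProcess
# argument; a sketch, not run here ("knn" is just an example method):
# knnTune <- train(y ~ ., data = training, method = "knn",
#                  preProcess = c("center", "scale"),
#                  trControl = trainControl(method = "repeatedcv", repeats = 5))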
# Filter problematic variables: near-zero-variance predictors (a single
# dominant value or a severely unbalanced distribution), then highly
# correlated predictors
# 1: near-zero variance
nzv <- nearZeroVar(training2)
colnames(training2)[nzv]
##  [1] "job.entrepreneur"  "job.housemaid"     "job.self_employed"
##  [4] "job.student"       "job.unemployed"    "job.unknown"      
##  [7] "education.unknown" "default.no"        "default.yes"      
## [10] "month.dec"         "month.jan"         "month.mar"        
## [13] "month.oct"         "month.sep"         "pdays"            
## [16] "poutcome.other"
training3 <- training2[, -nzv]
testing3 <- testing2[, -nzv]
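# To see why these columns were flagged, ask nearZeroVar() for its
# diagnostics instead of column indices ('nzvStats' is an illustrative name):
nzvStats <- nearZeroVar(training2, saveMetrics = TRUE)
head(nzvStats[nzvStats$nzv, ]) # freqRatio / percentUnique for flagged columns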
# 2: pairwise correlations above the 0.75 cutoff
corMat <- cor(training3)
remove <- findCorrelation(x = corMat, cutoff = .75)
colnames(training3)[remove]
## [1] "contact.unknown" "housing.yes"     "marital.single"  "loan.yes"
training4 <- training3[, -remove]
testing4  <- testing3[, -remove]
# make them data frames for later; note the models below are fit on the
# original factor data in 'training', so these filtered numeric copies are
# for learners that need them
training4 <- as.data.frame(training4)
testing4  <- as.data.frame(testing4)

#1# rpart-tree modeling
rpartFull <- rpart(y ~ ., data = training) #modelA
fancyRpartPlot(rpartFull) # plot a tree using rattle pkg

#prediction
rpartPred <- predict(rpartFull, newdata = testing, type = "class")
confusionMatrix(data = rpartPred, reference = testing$y)[[3]][1] # element 3 is 'overall'; its first entry is Accuracy
##  Accuracy 
## 0.8088496
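# With only 11.7% "yes" in the test set, accuracy alone can flatter a model;
# the same confusionMatrix object also carries class-level statistics:
confusionMatrix(data = rpartPred, reference = testing$y)$byClass[c("Sensitivity", "Specificity")]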
#2# caret-rpart
# (a) tuned on the default metric (Accuracy)
cvCtrl <- trainControl(method = "repeatedcv", repeats = 5)
rpartTune <- train(y ~ ., data = training, method = "rpart", tuneLength = 10, trControl = cvCtrl)
# (b) tuned on ROC, which requires class probabilities and twoClassSummary
cvCtrl <- trainControl(method = "repeatedcv", repeats = 5, summaryFunction = twoClassSummary, classProbs = TRUE)
rpartTune2 <- train(y ~ ., data = training, method = "rpart", tuneLength = 10, metric = "ROC", trControl = cvCtrl)
rpartTune2 # ROC table
## CART 
## 
## 9522 samples
##   16 predictor
##    2 classes: 'yes', 'no' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## 
## Summary of sample sizes: 8570, 8570, 8570, 8569, 8570, 8570, ... 
## 
## Resampling results across tuning parameters:
## 
##   cp           ROC        Sens       Spec       ROC SD      Sens SD     Spec SD   
##   0.003276623  0.8655057  0.8252919  0.8214236  0.01527892  0.02358546  0.02196489
##   0.003360639  0.8642560  0.8227709  0.8223054  0.01487138  0.02302246  0.02060344
##   0.004410838  0.8596225  0.8147057  0.8199105  0.01241098  0.02044797  0.02123459
##   0.006091157  0.8560788  0.8000015  0.8268850  0.01127559  0.02621909  0.02192232
##   0.008821676  0.8534113  0.7966408  0.8215937  0.01144223  0.02105091  0.02286362
##   0.010922075  0.8495197  0.7960106  0.8147035  0.01287016  0.01761898  0.02444995
##   0.013127494  0.8432031  0.8032777  0.7964716  0.01470326  0.02879820  0.04294586
##   0.039067423  0.8096951  0.8241542  0.7273282  0.01636162  0.02442830  0.02292278
##   0.046733879  0.7733716  0.8178538  0.6877999  0.04322886  0.01692238  0.05922931
##   0.453266121  0.6029930  0.8696449  0.3363411  0.10864723  0.20253596  0.33530362
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.003276623.
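# twoClassSummary() reports only ROC/Sens/Spec during tuning. If Accuracy and
# Kappa are wanted as well, the built-in summary functions can be combined
# (a common caret idiom; 'fiveStats' is an illustrative name):
fiveStats <- function(...) c(twoClassSummary(...), defaultSummary(...))
# usage: trainControl(..., summaryFunction = fiveStats, classProbs = TRUE)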
rpartTune2$finalModel #results
## n= 9522 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 9522 4761 yes (0.50000000 0.50000000)  
##     2) duration>=207.5 5692 1767 yes (0.68956430 0.31043570)  
##       4) duration>=518.5 2308  307 yes (0.86698440 0.13301560) *
##       5) duration< 518.5 3384 1460 yes (0.56855792 0.43144208)  
##        10) contactunknown< 0.5 2811  951 yes (0.66168623 0.33831377)  
##          20) poutcomesuccess>=0.5 519   21 yes (0.95953757 0.04046243) *
##          21) poutcomesuccess< 0.5 2292  930 yes (0.59424084 0.40575916)  
##            42) housingyes< 0.5 1465  454 yes (0.69010239 0.30989761)  
##              84) loanyes< 0.5 1351  376 yes (0.72168764 0.27831236) *
##              85) loanyes>=0.5 114   36 no (0.31578947 0.68421053) *
##            43) housingyes>=0.5 827  351 no (0.42442563 0.57557437)  
##              86) pdays>=373.5 27    0 yes (1.00000000 0.00000000) *
##              87) pdays< 373.5 800  324 no (0.40500000 0.59500000)  
##               174) duration>=404.5 197   83 yes (0.57868020 0.42131980) *
##               175) duration< 404.5 603  210 no (0.34825871 0.65174129)  
##                 350) monthmar>=0.5 23    1 yes (0.95652174 0.04347826) *
##                 351) monthmar< 0.5 580  188 no (0.32413793 0.67586207)  
##                   702) monthjun>=0.5 20    2 yes (0.90000000 0.10000000) *
##                   703) monthjun< 0.5 560  170 no (0.30357143 0.69642857) *
##        11) contactunknown>=0.5 573   64 no (0.11169284 0.88830716) *
##     3) duration< 207.5 3830  836 no (0.21827676 0.78172324)  
##       6) poutcomesuccess>=0.5 246   30 yes (0.87804878 0.12195122) *
##       7) poutcomesuccess< 0.5 3584  620 no (0.17299107 0.82700893)  
##        14) monthmar>=0.5 98   23 yes (0.76530612 0.23469388) *
##        15) monthmar< 0.5 3486  545 no (0.15633964 0.84366036) *
ggplot(rpartTune2) + scale_x_log10() # plot resampled ROC vs. cp on a log scale

#caret-rpart-prediction
# 1: class predictions
rpartPred2 <- predict(rpartTune2, newdata = testing)
confusionMatrix(rpartPred2, testing$y)[[3]][1]
## Accuracy 
## 0.810177
# 2: class probabilities
rpartProbs2 <- predict(rpartTune2, newdata = testing, type = "prob")
head(rpartProbs2, n = 4)
##          yes        no
## 9  0.1563396 0.8436604
## 23 0.1563396 0.8436604
## 24 0.1116928 0.8883072
## 72 0.1563396 0.8436604
#ROC analysis
testProbsDF <- data.frame(y = testing$y, rpart = rpartProbs2[, "yes"])
head(testProbsDF, n = 4)
##    y     rpart
## 1 no 0.1563396
## 2 no 0.1563396
## 3 no 0.1116928
## 4 no 0.1563396
# The roc function assumes the *second* factor level is the one of interest,
# so we use the 'levels' argument to change the order.
rpartROC2 <- roc(response = testProbsDF$y, predictor = testProbsDF$rpart, levels = rev(levels(raw$y)))
# Get the area under the ROC curve
auc(rpartROC2)
## Area under the curve: 0.8405
plot(rpartROC2, legacy.axes = FALSE, type = "S")

## 
## Call:
## roc.default(response = testProbsDF$y, predictor = testProbsDF$rpart,     levels = rev(levels(raw$y)))
## 
## Data: testProbsDF$rpart in 3992 controls (testProbsDF$y no) < 528 cases (testProbsDF$y yes).
## Area under the curve: 0.8405
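# pROC can also attach a confidence interval to this estimate (DeLong by default):
ci.auc(rpartROC2)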
# lift charts
rpartLift2 <- lift(y ~ rpart, data = testProbsDF)
plot(rpartLift2, values = 75) # reference lines at 75% of events found

# count the terminal nodes of a tuned rpart fit: leaves print with a trailing '*'
num_leaves <- function(x) {
  output <- capture.output(print(x$finalModel))
  length(grep("\\*$", output))
}
num_leaves(rpartTune)
## [1] 13
#3# Gradient Boosting Machine (GBM) modeling
forGBM <- training; forGBM$y <- ifelse(forGBM$y == "yes", 1, 0) # gbm's bernoulli loss expects a 0/1 outcome
system.time({
    gbmFit <- gbm(formula = y ~ ., distribution = "bernoulli", # for classification
                  data = forGBM, n.trees = 500, interaction.depth = 3,
                  shrinkage = 0.1, verbose = FALSE)
})[[3]] # elapsed seconds
## [1] 5.155
# Prediction: gbm must be told how many trees to use; type = "response"
# returns the event probability
gbmPred <- predict(gbmFit, newdata = testing, n.trees = 500, type = "response")
gbmPred <- factor(ifelse(gbmPred > .5, "yes", "no"), levels = c("yes", "no")) # probabilities -> class labels
head(gbmPred)
## [1] no no no no no no
## Levels: yes no
confusionMatrix(gbmPred, testing$y)[[3]][1]
##  Accuracy 
## 0.8449115
#4# caret-gbm with expand.grid
gbmGrid <- expand.grid(interaction.depth = c(3, 5, 7),
                       n.trees = seq(600, 1000, by = 100),
                       shrinkage = c(0.01, 0.1)) # 3 x 5 x 2 = 30 candidate models
# (note: newer caret versions also require an n.minobsinnode column in gbm grids)
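# The tuning run below fits 30 candidates across 50 resamples (1,500+ GBMs).
# caret hands resamples to any registered foreach backend, so a parallel
# cluster speeds this up considerably; a sketch assuming the doParallel
# package is available (not run here):
# library(doParallel)
# cl <- makeCluster(4)   # match to the available cores
# registerDoParallel(cl)
# # ... run train() as below ...
# stopCluster(cl)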
gbmTune <- train(y ~ ., data = training, method = "gbm", metric = "ROC",
                tuneGrid = gbmGrid, trControl = cvCtrl,verbose = FALSE)
gbmTune
## Stochastic Gradient Boosting 
## 
## 9522 samples
##   16 predictor
##    2 classes: 'yes', 'no' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## 
## Summary of sample sizes: 8570, 8570, 8570, 8570, 8569, 8570, ... 
## 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.trees  ROC        Sens       Spec       ROC SD       Sens SD     Spec SD   
##   0.01       3                   600     0.9124923  0.8571326  0.8240290  0.009224010  0.01626489  0.01790540
##   0.01       3                   700     0.9151618  0.8592754  0.8278095  0.009070030  0.01726175  0.01795409
##   0.01       3                   800     0.9173150  0.8619640  0.8310867  0.009009807  0.01685408  0.01811146
##   0.01       3                   900     0.9191221  0.8644848  0.8333552  0.008954494  0.01623230  0.01815365
##   0.01       3                  1000     0.9205432  0.8669633  0.8357496  0.008955336  0.01554914  0.01813144
##   0.01       5                   600     0.9208429  0.8673831  0.8328508  0.008763987  0.01699763  0.01799369
##   0.01       5                   700     0.9227505  0.8714163  0.8348253  0.008619736  0.01637054  0.01772908
##   0.01       5                   800     0.9241932  0.8728862  0.8377238  0.008513611  0.01556243  0.01717297
##   0.01       5                   900     0.9252711  0.8736417  0.8398243  0.008435253  0.01520649  0.01694587
##   0.01       5                  1000     0.9260905  0.8740619  0.8408746  0.008410602  0.01556006  0.01711245
##   0.01       7                   600     0.9246338  0.8730965  0.8365477  0.008516824  0.01599220  0.01805090
##   0.01       7                   700     0.9260166  0.8741884  0.8387318  0.008387081  0.01463199  0.01782039
##   0.01       7                   800     0.9270306  0.8759524  0.8403705  0.008354274  0.01475382  0.01779051
##   0.01       7                   900     0.9277900  0.8770446  0.8415888  0.008338943  0.01500681  0.01841702
##   0.01       7                  1000     0.9283474  0.8779688  0.8425127  0.008378113  0.01450005  0.01786519
##   0.10       3                   600     0.9288482  0.8794388  0.8464620  0.008665467  0.01371709  0.01835123
##   0.10       3                   700     0.9289032  0.8788506  0.8468398  0.008620076  0.01347614  0.01869131
##   0.10       3                   800     0.9287098  0.8790187  0.8468819  0.008673132  0.01329762  0.01808075
##   0.10       3                   900     0.9286284  0.8800684  0.8472171  0.008712339  0.01329641  0.01804649
##   0.10       3                  1000     0.9283657  0.8789335  0.8470499  0.008686267  0.01338741  0.01830660
##   0.10       5                   600     0.9285042  0.8802361  0.8463355  0.009119058  0.01444433  0.01868660
##   0.10       5                   700     0.9281847  0.8801107  0.8472592  0.009349265  0.01506489  0.01814584
##   0.10       5                   800     0.9280964  0.8785986  0.8471337  0.009320601  0.01425950  0.01961081
##   0.10       5                   900     0.9276187  0.8787667  0.8483092  0.009369041  0.01323630  0.02014674
##   0.10       5                  1000     0.9271807  0.8786817  0.8473010  0.009340413  0.01231319  0.01901648
##   0.10       7                   600     0.9275151  0.8814546  0.8449910  0.009145292  0.01322598  0.01905469
##   0.10       7                   700     0.9270399  0.8807824  0.8467127  0.009231870  0.01414327  0.01949647
##   0.10       7                   800     0.9264532  0.8806982  0.8463352  0.009159798  0.01355074  0.01903808
##   0.10       7                   900     0.9260176  0.8795641  0.8452008  0.009249301  0.01295540  0.01871902
##   0.10       7                  1000     0.9254566  0.8794803  0.8453685  0.009385694  0.01304444  0.01917653
## 
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 700,
##  interaction.depth = 3 and shrinkage = 0.1.
ggplot(gbmTune)
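# varImp() ranks the predictors for the tuned fit (for gbm, scaled relative
# influence); a quick sanity check on whether duration and poutcome dominate
# here as they did in the rpart splits ('gbmImp' is an illustrative name):
gbmImp <- varImp(gbmTune)
plot(gbmImp, top = 10)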

#caret-gbm-prediction
gbmPred <- predict(gbmTune, newdata = testing) #class
confusionMatrix(gbmPred, testing$y)[[3]][1]
##  Accuracy 
## 0.8449115
gbmProbs <- predict(gbmTune, newdata = testing, type = "prob") # probabilities
testProbsDF$gbm <- gbmProbs[, "yes"]
head(testProbsDF)
##    y     rpart         gbm
## 1 no 0.1563396 0.007187026
## 2 no 0.1563396 0.013061271
## 3 no 0.1116928 0.041071705
## 4 no 0.1563396 0.011378348
## 5 no 0.1563396 0.025125842
## 6 no 0.1563396 0.029774636
gbmLift <- lift(y ~ rpart + gbm, data = testProbsDF) # lift chart
gbmLift
## 
## Call:
## lift.formula(x = y ~ rpart + gbm, data = testProbsDF)
## 
## Models: rpart, gbm 
## Event: yes (11.7%)
plot(gbmLift, auto.key = list(lines = TRUE, points = FALSE))
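# A companion diagnostic to the lift chart: caret's calibration() bins the
# predicted probabilities and compares them to the observed event rate
# (well-calibrated models track the diagonal). Because training was balanced
# but the test set is 11.7% "yes", expect the probabilities to run high:
calCurves <- calibration(y ~ rpart + gbm, data = testProbsDF)
plot(calCurves, auto.key = list(lines = TRUE, points = FALSE))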

# Boosted tree ROC curve, compared with the rpart curve below
gbmROC <- roc(response = testProbsDF$y, predictor = testProbsDF$gbm,  levels = rev(levels(raw$y)))
auc(gbmROC)
## Area under the curve: 0.9233
plot(rpartROC2, col = "#9E0142", legacy.axes = FALSE, type = "S")
## 
## Call:
## roc.default(response = testProbsDF$y, predictor = testProbsDF$rpart,     levels = rev(levels(raw$y)))
## 
## Data: testProbsDF$rpart in 3992 controls (testProbsDF$y no) < 528 cases (testProbsDF$y yes).
## Area under the curve: 0.8405
plot(gbmROC,   col = "#3288BD", legacy.axes = FALSE, add = TRUE)
## 
## Call:
## roc.default(response = testProbsDF$y, predictor = testProbsDF$gbm,     levels = rev(levels(raw$y)))
## 
## Data: testProbsDF$gbm in 3992 controls (testProbsDF$y no) < 528 cases (testProbsDF$y yes).
## Area under the curve: 0.9233
legend(.6, .5, legend = c("rpart", "gbm"), lty = c(1, 1), col = c("#9E0142", "#3288BD"))
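# The visual gap between the curves can be tested formally; pROC's roc.test()
# runs DeLong's test for paired ROC curves (paired here, since both models
# score the same test set):
roc.test(rpartROC2, gbmROC)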

resamps <- resamples(list(CART = rpartTune2, GBM = gbmTune)) # pool the 50 resampled metric values from both tuned models
summary(resamps)
## 
## Call:
## summary.resamples(object = resamps)
## 
## Models: CART, GBM 
## Number of resamples: 50 
## 
## ROC 
##        Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## CART 0.8399  0.8539 0.8626 0.8655  0.8767 0.9055    0
## GBM  0.9085  0.9238 0.9291 0.9289  0.9354 0.9456    0
## 
## Sens 
##        Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## CART 0.7752  0.8099 0.8248 0.8253  0.8385 0.8761    0
## GBM  0.8386  0.8697 0.8792 0.8789  0.8866 0.9076    0
## 
## Spec 
##        Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## CART 0.7752  0.8051 0.8237 0.8214  0.8356 0.8718    0
## GBM  0.8025  0.8367 0.8508 0.8468  0.8592 0.8739    0
xyplot(resamps) # paired scatter of resampled ROC: CART vs. GBM
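# Paired comparisons of the resampled metrics; most meaningful when both
# models were tuned on identical resampling indices (e.g. via the 'seeds'
# argument of trainControl, or re-setting the seed before each train() call).
# 'difValues' is an illustrative name:
difValues <- diff(resamps)
summary(difValues)
bwplot(resamps, metric = "ROC") # side-by-side distributions of resampled ROC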