library(RMariaDB)
con <- dbConnect(MariaDB(),
                 user = 'guest',
                 password = 'password',
                 host = 'mydbinstance4.c1uducbod6js.us-west-1.rds.amazonaws.com',
                 dbname='mldata')

d <- dbReadTable(conn = con, name = 'sfdata') #pull the sfdata table into a data frame
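#once sfdata is in memory the database handle can be released - a housekeeping
#step assumed here, not part of the original run:
dbDisconnect(con)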
suppressMessages(library(doParallel))
registerDoParallel(cores = 3)
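#quick check that the backend registered (an added line; getDoParWorkers()
#comes from foreach, which doParallel attaches):
getDoParWorkers()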

Wrangle Data

d$x34 <- as.factor(d$x34) #x34: car type - should be a factor
d$x35[d$x35 == "fri"] <- "friday"; d$x35[d$x35 == "thur"] <- "thursday" #x35: day of the week
d$x35[d$x35 == "thurday"] <- "thursday"; d$x35[d$x35 == "wed"] <- "wednesday" #clean the redundant entries
d$x35 <- as.factor(d$x35)
d$x41 <- gsub("[$]", "", d$x41); d$x41 <- as.numeric(d$x41) #x41: the $ sign makes the column a character - strip it and convert to numeric
d$x45 <- gsub("[%]", "", d$x45); d$x45 <- as.numeric(d$x45) #x45: the % sign makes the column a character - strip it and convert to numeric
d$x68[d$x68 == "Dev"] <- "Dec"; d$x68[d$x68 == "sept."] <- "Sep"; d$x68 <- as.factor(d$x68) #x68: month factor with some typos
d$x93 <- as.factor(d$x93) #x93: continent - should be a factor
d$y <- as.factor(d$y); levels(d$y) <- c("no", "yes") #convert y to a factor
sum(d$y == "no") / nrow(d) #sample distribution: mostly "no"s
library(naniar)
#sum(is.na(d$y)) #no missing observations for y
d <- d[complete.cases(d), ] #drop incomplete rows; check with gg_miss_var(d) or sum(is.na(d))
suppressMessages(library(caret))
nearZeroVar(d) #integer(0) means no near-zero-variance predictors
#nzv <- nearZeroVar(d, saveMetrics = TRUE); head(nzv) #a low percentUnique reflects near-zero variance

Pre-Processing

suppressMessages(library(caret))
suppressMessages(library(tidyverse))
set.seed(1234)
trainIndex <- createDataPartition(d$y, p = .8, 
                                  list = FALSE, 
                                  times = 1)
dTrain <- d[trainIndex, ]; dTest <- d[-trainIndex, ]
x <- 30 #training-set sample size
dTrain <- sample_n(dTrain, x, replace = FALSE)
dTest <- sample_n(dTest, x * 20/80, replace = FALSE) #scale the test set to keep the 80/20 train/test split
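#with only 30 training rows, it is worth confirming both classes survived the
#down-sampling (an added check, not in the original run):
prop.table(table(dTrain$y))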

Model

library(DMwR)
library(caret)
library(tictoc)
tic()

ctrl <- trainControl(method = "repeatedcv", repeats = 5,
                     classProbs = TRUE,
                     preProcOptions = list(thresh = 0.85), #PCA variance threshold (or list(pcaComp = 7)); applies when train() is given preProcess = "pca"
                     summaryFunction = twoClassSummary, #twoClassSummary reports ROC AUC, sensitivity, and specificity
                     sampling = "smote") #hybrid over/under-sampling via the DMwR package
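#to see what the sampling step does on its own, DMwR's SMOTE() can be called
#directly; caret applies it inside each resampling fold instead, which keeps
#synthetic cases out of the held-out folds (illustrative sketch, default rates):
#smoted <- DMwR::SMOTE(y ~ ., dTrain, perc.over = 200, perc.under = 200)
#table(smoted$y)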

#original grid

grid <- expand.grid(interaction.depth = seq(3,9, by = 2),
                    n.trees = seq(100, 1500, by = 100),
                    shrinkage = c(0.01,0.1),
                    n.minobsinnode = c(5,10,15))

#final grid
#grid <- expand.grid(interaction.depth = 9,
#                    n.trees = 1500,
#                    shrinkage = 0.1,
#                    n.minobsinnode = 5)

gbmTune <- train(y ~ ., data=dTrain, 
                 method = "gbm",
                 metric = "ROC", 
                 verbose = FALSE, 
                 trControl = ctrl,
                 tuneGrid = grid)
toc()
## 243.06 sec elapsed

Analysis

Results with the original grid:

30 obs: 0.33 minutes to train; 30% specificity.
300 obs: 19 minutes to train; 87% sensitivity / 80% specificity; the best tune used 1400 or 1500 trees, an interaction depth of 7, shrinkage of 0.1, and a minimum of 5 observations per node.

Results with the final grid:

3,000 obs: 46 minutes to train; 96% sensitivity / 70% specificity.
313,364 obs: 5 hours to train; 98% sensitivity / 90% specificity.

library(caret)
library(ggplot2)
suppressMessages(library(pROC))
#print(gbmTune)
ggplot(gbmTune) + theme(legend.position = "top")
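#the winning hyperparameters can be read straight off the fit (an added
#convenience; the same information appears in print(gbmTune)):
gbmTune$bestTune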

gbmPred <- predict(gbmTune, dTest); gbmProbs <- predict(gbmTune, dTest, type = "prob")
confusionMatrix(gbmPred, dTest$y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction no yes
##        no   5   2
##        yes  0   0
##                                           
##                Accuracy : 0.7143          
##                  95% CI : (0.2904, 0.9633)
##     No Information Rate : 0.7143          
##     P-Value [Acc > NIR] : 0.6792          
##                                           
##                   Kappa : 0               
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.7143          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.7143          
##          Detection Rate : 0.7143          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : no              
## 
rocCurve <- roc(response = dTest$y, 
                predictor = gbmProbs[, "yes"],
                levels = rev(levels(dTest$y)))
plot(rocCurve,
     print.thres = c(.23), #mark the 0.23 probability cutoff on the curve
     print.thres.pch = 16,
     print.thres.cex = 1.2)
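#pROC can also summarize the plotted curve as a single number (an added line,
#computed from the same rocCurve object):
auc(rocCurve)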