library(RMariaDB)
con <- dbConnect(MariaDB(),
user = 'guest',
password = 'password',
host = 'mydbinstance4.c1uducbod6js.us-west-1.rds.amazonaws.com',
dbname='mldata')
d <- dbReadTable(conn = con, name = 'sfdata', value = d, overwrite = TRUE)
suppressMessages(library(doParallel))
registerDoParallel(cores = 3)
d$x34 <- as.factor(d$x34) #x34, different car types- this should be a factor
d$x35[d$x35 == "fri"] <- "friday"; d$x35[d$x35 == "thur"] <- "thursday"; d$x35[d$x35 == "thurday"] <- "thursday"; d$x35[d$x35 == "wed"] <- "wednesday"; d$x35 <- as.factor(d$x35) #x35 day of the week with some redundant entries- this needs to be cleaned
d$x41 <- gsub("[$]", "", d$x41); d$x41 <- as.numeric(d$x41) #x41, $ sign makes variable a character- remove $ and convert to numeric
d$x45 <- gsub("[%]", "", d$x45); d$x45 <- as.numeric(d$x45) # x45% sign makes variable a character- remove % and convert to numeric
d$x68[d$x68 == "Dev"] <- "Dec"; d$x68[d$x68 == "sept."] <- "Sep"; d$x68 <- as.factor(d$x68) #month factor with some typos, x68
d$x93 <- as.factor(d$x93) #continent... should be factor, x93
d$y <- as.factor(d$y); levels(d$y) <- c("no", "yes"); sum(d$y == "no")/nrow(d) #mostly "nos"; convert y to factor; sample distribution
library(naniar)
#sum(is.na(d$y)) #no missing observations for y
d <- d[complete.cases(d), ]; # gg_miss_var(d); sum(is.na(d))
suppressMessages(library(caret))
nearZeroVar(d) #"integer(0) means no near zero variance"
#nzv <- nearZeroVar(d, saveMetrics=TRUE) #head(nzv) #a low percentUnique reflects near zero variance
suppressMessages(library(caret))
suppressMessages(library(tidyverse))
## Warning: package 'purrr' was built under R version 3.5.3
set.seed(1234)
trainIndex <- createDataPartition(d$y, p = .8,
list = FALSE,
times = 1)
dTrain <- d[ trainIndex,]; dTest <- d[-trainIndex,]
x = 30 #training data sample size
dTrain <- sample_n(dTrain, x, replace = FALSE)
dTest <- sample_n(dTest, x*20/80, replace = FALSE) #always test with 80/20 split
library(DMwR)
library(caret)
library(tictoc)
tic()
ctrl <- trainControl(method = "repeatedcv", repeats = 5,
classProbs = TRUE,
preProcOptions = list(thresh = 0.85), #or list(pcaComp = 7);
summaryFunction = twoClassSummary, #twoClassSummary gives AOC/specificity/sensitivity
sampling = "smote") #hybrid sampling method using DMwR package
#original grid
grid <- expand.grid(interaction.depth = seq(3,9, by = 2),
n.trees = seq(100, 1500, by = 100),
shrinkage = c(0.01,0.1),
n.minobsinnode = c(5,10,15))
#final grid
'
grid <- expand.grid(interaction.depth = 9,
n.trees = 1500,
shrinkage = 0.1,
n.minobsinnode = 5)
'
## [1] "\ngrid <- expand.grid(interaction.depth = 9,\n n.trees = 1500,\n shrinkage = 0.1,\n n.minobsinnode = 5)\n"
gbmTune <- train(y ~ ., data=dTrain,
method = "gbm",
metric = "ROC",
verbose = FALSE,
trControl = ctrl,
tuneGrid = grid)
toc()
## 243.06 sec elapsed
Results w/ original grid:
30 obs: time to run model: .33 minute; 30% specificity 300 obs: time to run model: 19 minutes; 87% sensitivity / 80% specificity; 1400 or 1500 trees, depth of 7 shrinkage of .1 and min obs of 5
Results with final grid: 3000 obs: time to run model: 46 minutes; 96% / 70% specificity 313364 observations: 5 hrs; 98%; 90%
library(caret)
library(ggplot2)
suppressMessages(library(pROC))
#print(gbmTune)
ggplot(gbmTune) + theme(legend.position = "top")
gbmPred <- predict(gbmTune, dTest); gbmProbs <- predict(gbmTune, dTest, type = "prob"); confusionMatrix(gbmPred, dTest$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 5 2
## yes 0 0
##
## Accuracy : 0.7143
## 95% CI : (0.2904, 0.9633)
## No Information Rate : 0.7143
## P-Value [Acc > NIR] : 0.6792
##
## Kappa : 0
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.7143
## Neg Pred Value : NaN
## Prevalence : 0.7143
## Detection Rate : 0.7143
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : no
##
rocCurve <- roc(response = dTest$y,
predictor = gbmProbs[, "yes"],
levels = rev(levels(dTest$y)))
plot(rocCurve,
print.thres = c(.23), #print nominal 50% cutoff and another.
print.thres.pch = 16,
print.thres.cex = 1.2)