# Illustrative Example: Optimizing probability thresholds for class imbalances
library(caret); library(pROC); library(reshape2); library(ggplot2)
# CLEANED DATA
setwd("~/Documents/ACS/Acxion2")
trainingSet <- dget("trainingSet.csv")
testingSet <- dget("testingSet.csv")
dim(trainingSet)
## [1] 2310 20
trainingSet[1:2,c(1,20)] # sample
## ibe2779_0 Class
## 4 6 NO
## 5 10 NO
table(trainingSet$Class)
##
## NO YES
## 2100 210
dim(testingSet)
## [1] 990 20
table(testingSet$Class)
##
## NO YES
## 900 90
# There is a 10:1 class imbalance in these data.
# Let's fit a standard random forest model to these data with a single fixed value of mtry.
# We'll also use repeated 10-fold cross-validation to get a sense of performance:
set.seed(949)
mod0 <- train(Class ~ ., data = trainingSet,
              method = "rf",
              metric = "ROC",
              tuneGrid = data.frame(mtry = 3),
              ntree = 1000,
              trControl = trainControl(method = "repeatedcv",
                                       repeats = 5,
                                       classProbs = TRUE,
                                       summaryFunction = twoClassSummary))
getTrainPerf(mod0)
## TrainROC TrainSens TrainSpec method
## 1 0.9860023 0.9975238 0.6047619 rf
## Get the ROC curve using the testing set
roc0 <- roc(testingSet$Class,
            predict(mod0, testingSet, type = "prob")[, 1],
            levels = rev(levels(testingSet$Class)))
roc0
##
## Call:
## roc.default(response = testingSet$Class, predictor = predict(mod0, testingSet, type = "prob")[, 1], levels = rev(levels(testingSet$Class)))
##
## Data: predict(mod0, testingSet, type = "prob")[, 1] in 90 controls (testingSet$Class YES) < 900 cases (testingSet$Class NO).
## Area under the curve: 0.9907
## Now plot
plot(roc0, print.thres = c(.5), type = "S",
     print.thres.pattern = "%.3f (Spec = %.2f, Sens = %.2f)",
     print.thres.cex = .8,
     legacy.axes = TRUE)

## (Figure: test-set ROC curve with the default 0.50 cutoff annotated)
# The area under the ROC curve is very high, indicating that the model has very good
# predictive power for these data.
# The plot shows the default probability cutoff value of 50%.
# The sensitivity and specificity values associated with this point indicate that
# performance is **not** that good when an actual call needs to be made on a sample.
# One of the most common ways to deal with this is to determine an alternate probability
# cutoff using the ROC curve. But to do this well, another set of data (not the test set)
# is needed to set the cutoff, and the test set is then used to validate it. Since we
# don't have a lot of data, this is difficult: we would be spending some of our data
# just to estimate a single cutoff value.
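# As an illustration only, pROC's coords() can pull a data-driven cutoff straight
# from an ROC curve. The call below reuses roc0 just to show the mechanics; in
# practice the curve should be built on a separate evaluation set, not the test set.
coords(roc0, x = "best", best.method = "closest.topleft",
       ret = c("threshold", "specificity", "sensitivity"))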
# Alternatively, the model can be tuned, using resampling, to determine any model tuning
# parameters as well as an appropriate cutoff for the probabilities.
## Get the model code for the original random forest method:
# Suppose the model has one tuning parameter and we want to look at four candidate values for tuning.
# Suppose we also want to tune the probability cut off over 20 different thresholds.
# Now we have to look at 20×4=80 different models (and that is for each resample).
# One other feature that has been opened up is the ability to use sequential parameters:
# these are tuning parameters that don't require a completely new model fit to produce predictions.
# In this case, we can fit one random forest model, get its predicted class probabilities, and
# evaluate the candidate probability cutoffs using those same hold-out samples. Here is what the
# model code looks like:
thresh_code <- getModelInfo("rf", regex = FALSE)[[1]]
# rf model modified with following parameters
# thresh_code$type
# thresh_code$parameters
# thresh_code$grid
# thresh_code$loop
# thresh_code$fit
# thresh_code$predict
# thresh_code$prob
thresh_code$type <- c("Classification") #...1
## Add the threshold as another tuning parameter
thresh_code$parameters <- data.frame(parameter = c("mtry", "threshold"), #...2
                                     class = c("numeric", "numeric"),
                                     label = c("#Randomly Selected Predictors",
                                               "Probability Cutoff"))
## The default tuning grid code:
thresh_code$grid <- function(x, y, len = NULL, search = "grid") { #...3
  p <- ncol(x)
  if(search == "grid") {
    grid <- expand.grid(mtry = floor(sqrt(p)),
                        threshold = seq(.01, .99, length = len))
  } else {
    grid <- expand.grid(mtry = sample(1:p, size = len),
                        threshold = runif(len, min = 0, max = 1))
  }
  grid
}
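## As a quick, illustrative sanity check (this assumes Class is the 20th column,
## as in the data summary above), the grid function can be called directly:
thresh_code$grid(x = trainingSet[, -20], y = trainingSet$Class, len = 5)
## For len = 5 and the default grid search this pairs mtry = floor(sqrt(19)) = 4
## with five cutoffs spaced evenly between 0.01 and 0.99.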
## Here we fit a single random forest model (with a fixed mtry)
## and loop over the threshold values to get predictions from the same
## randomForest model.
thresh_code$loop = function(grid) { #...4
  library(plyr)
  loop <- ddply(grid, c("mtry"),
                function(x) c(threshold = max(x$threshold)))
  submodels <- vector(mode = "list", length = nrow(loop))
  for(i in seq(along = loop$threshold)) {
    index <- which(grid$mtry == loop$mtry[i])
    cuts <- grid[index, "threshold"]
    submodels[[i]] <- data.frame(threshold = cuts[cuts != loop$threshold[i]])
  }
  list(loop = loop, submodels = submodels)
}
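## To see how the looping scheme works (a toy example, not part of the model), pass a
## small grid through the function by hand: for each mtry, only the largest cutoff is
## fit as a full model and the remaining cutoffs become sub-models whose predictions
## are derived from that same fit.
toy_grid <- expand.grid(mtry = c(2, 4), threshold = c(.3, .5, .7))
thresh_code$loop(toy_grid)
## `loop` should hold mtry = 2 and 4 each paired with threshold = 0.7, while
## `submodels` lists the 0.3 and 0.5 cutoffs for each mtry.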
## Fit the model independent of the threshold parameter
thresh_code$fit = function(x, y, wts, param, lev, last, classProbs, ...) { #...5
  if(length(levels(y)) != 2)
    stop("This works only for 2-class problems")
  randomForest(x, y, mtry = param$mtry, ...)
}
## Now get a probability prediction and use different thresholds to
## get the predicted class
thresh_code$predict = function(modelFit, newdata, submodels = NULL) { #...6
  class1Prob <- predict(modelFit,
                        newdata,
                        type = "prob")[, modelFit$obsLevels[1]]
  ## Raising the threshold for class #1 means that more evidence is needed
  ## to call a sample class #1, so it should decrease sensitivity and
  ## increase specificity
  out <- ifelse(class1Prob >= modelFit$tuneValue$threshold,
                modelFit$obsLevels[1],
                modelFit$obsLevels[2])
  if(!is.null(submodels)) {
    tmp2 <- out
    out <- vector(mode = "list", length = length(submodels$threshold) + 1)
    out[[1]] <- tmp2
    for(i in seq(along = submodels$threshold)) {
      out[[i + 1]] <- ifelse(class1Prob >= submodels$threshold[[i]],
                             modelFit$obsLevels[1],
                             modelFit$obsLevels[2])
    }
  }
  out
}
## The probabilities are always the same, but we have to create
## multiple versions of the probs to evaluate the data across
## thresholds
thresh_code$prob = function(modelFit, newdata, submodels = NULL) { #...7
  out <- as.data.frame(predict(modelFit, newdata, type = "prob"))
  if(!is.null(submodels)) {
    probs <- out
    out <- vector(mode = "list", length = length(submodels$threshold) + 1)
    out <- lapply(out, function(x) probs)
  }
  out
}
### for summaryFunction in trControl
fourStats <- function(data, lev = levels(data$obs), model = NULL) {
  ## This code computes the area under the ROC curve and the
  ## sensitivity and specificity values using the current candidate
  ## value of the probability threshold.
  out <- c(twoClassSummary(data, lev = levels(data$obs), model = NULL))
  ## The best possible model has sensitivity of 1 and specificity of 1.
  ## How far are we from that value?
  coords <- matrix(c(1, 1, out["Spec"], out["Sens"]),
                   ncol = 2,
                   byrow = TRUE)
  colnames(coords) <- c("Spec", "Sens")
  rownames(coords) <- c("Best", "Current")
  c(out, Dist = dist(coords)[1])
}
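## The Dist metric is just the Euclidean distance from the ideal point (Sens = 1,
## Spec = 1). A quick worked example with made-up values (Sens = 0.95, Spec = 0.80):
sqrt((1 - 0.95)^2 + (1 - 0.80)^2)  # ~0.206; smaller is better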
################## Modeling with customized RF model ###################
#>>>>>>>>>>>>>>>
set.seed(949)
mod1 <- train(Class ~ ., data = trainingSet,
              method = thresh_code, # modified model -- in lieu of "rf"
              ## Minimize the distance to the perfect model
              metric = "Dist",
              maximize = FALSE,
              tuneLength = 10, # 20,
              ntree = 200, # 1000,
              trControl = trainControl(method = "cv", # "repeatedcv",
                                       # repeats = 5,
                                       classProbs = TRUE,
                                       summaryFunction = fourStats))
#<<<<<<<<<<<<<<<<
mod1
## Random Forest
##
## 2310 samples
## 19 predictor
## 2 classes: 'NO', 'YES'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
##
## Summary of sample sizes: 2079, 2079, 2079, 2079, 2079, 2079, ...
##
## Resampling results across tuning parameters:
##
##   threshold       ROC      Sens        Spec       Dist    ROC SD     Sens SD    Spec SD    Dist SD
##   0.0100000 0.9883673 1.0000000 0.004761905 0.99523810 0.0129724 0.000000000 0.01505847 0.01505847
##   0.1188889 0.9883673 1.0000000 0.247619048 0.75238095 0.0129724 0.000000000 0.09733149 0.09733149
##   0.2277778 0.9883673 1.0000000 0.404761905 0.59523810 0.0129724 0.000000000 0.10588622 0.10588622
##   0.3366667 0.9883673 0.9980952 0.523809524 0.47620107 0.0129724 0.002459037 0.08399211 0.08398538
##   0.4455556 0.9883673 0.9966667 0.652380952 0.34765627 0.0129724 0.003214041 0.11235135 0.11232431
##   0.5544444 0.9883673 0.9942857 0.771428571 0.22873865 0.0129724 0.004918074 0.12046772 0.12036599
##   0.6633333 0.9883673 0.9866667 0.866666667 0.13690502 0.0129724 0.010480999 0.11620956 0.11286980
##   0.7722222 0.9883673 0.9690476 0.928571429 0.08679229 0.0129724 0.006447650 0.09323286 0.08424655
##   0.8811111 0.9883673 0.9166667 0.976190476 0.09334572 0.0129724 0.024203096 0.04627740 0.03730600
##   0.9900000 0.9883673 0.5823810 0.995238095 0.41787433 0.0129724 0.034415539 0.01505847 0.03463233
##
## Tuning parameter 'mtry' was held constant at a value of 6
## Dist was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 6 and threshold
## = 0.7722222.
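## The selected settings can also be pulled out programmatically via caret's
## standard bestTune slot:
mod1$bestTune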
metrics <- mod1$results[, c(2, 4:6)]
metrics <- melt(metrics, id.vars = "threshold",
                variable.name = "Resampled",
                value.name = "Data")
ggplot(metrics, aes(x = threshold, y = Data, color = Resampled)) +
  geom_line() +
  ylab("") + xlab("Probability Cutoff") +
  theme(legend.position = "top")
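## Finally, a rough sketch (not part of the original analysis) of applying the tuned
## cutoff to the test set. The custom predict module bakes threshold = 0.772 into
## mod1, so hard class calls already use it; test_pred is just an illustrative name.
test_pred <- predict(mod1, testingSet)
confusionMatrix(factor(test_pred, levels = levels(testingSet$Class)),
                testingSet$Class)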
