# Illustrative Example: Optimizing probability thresholds for class imbalances
library(caret); library(pROC); library(reshape2); library(ggplot2)
# CLEANED DATA
setwd("~/Documents/ACS/Acxion2")
trainingSet <- dget("trainingSet.csv")
testingSet <- dget("testingSet.csv")
dim(trainingSet)
## [1] 2310   20
trainingSet[1:2,c(1,20)] # sample
##   ibe2779_0 Class
## 4         6    NO
## 5        10    NO
table(trainingSet$Class)
## 
##   NO  YES 
## 2100  210
dim(testingSet)
## [1] 990  20
table(testingSet$Class)
## 
##  NO YES 
## 900  90
# There is a 10:1 class imbalance in these data.
# Let's fit a standard random forest model to these data with a single fixed value of mtry.
# We'll also use repeated 10-fold cross-validation to get a sense of performance:
set.seed(949)
mod0 <- train(Class ~ ., data = trainingSet,
              method = "rf",
              metric = "ROC",
              tuneGrid = data.frame(mtry = 3),
              ntree = 1000,
              trControl = trainControl(method = "repeatedcv",
                                       repeats = 5,
                                       classProbs = TRUE,
                                       summaryFunction = twoClassSummary))
getTrainPerf(mod0)
##    TrainROC TrainSens TrainSpec method
## 1 0.9860023 0.9975238 0.6047619     rf
## Get the ROC curve using the testing set
roc0 <- roc(testingSet$Class,
            predict(mod0, testingSet, type = "prob")[,1],
            levels = rev(levels(testingSet$Class)))
roc0
## 
## Call:
## roc.default(response = testingSet$Class, predictor = predict(mod0,     testingSet, type = "prob")[, 1], levels = rev(levels(testingSet$Class)))
## 
## Data: predict(mod0, testingSet, type = "prob")[, 1] in 90 controls (testingSet$Class YES) < 900 cases (testingSet$Class NO).
## Area under the curve: 0.9907
## Now plot
plot(roc0, print.thres = c(.5), type = "S", 
     print.thres.pattern = "%.3f (Spec = %.2f, Sens = %.2f)", 
     print.thres.cex = .8,
     legacy.axes = TRUE)

# The area under the ROC curve is very high, indicating that the model has very good
# predictive power for these data.
# The plot shows the default probability cutoff value of 50%.
# The sensitivity and specificity values associated with this point indicate
# that performance is not that good when an actual call needs to be made on a sample.

# One of the most common ways to deal with this is to determine an alternate probability
# cutoff using the ROC curve. But to do this well, another set of data (not the test set)
# is needed to set the cutoff, and the test set is then used to validate it. Since we don't
# have a lot of data, this is difficult: we would be spending some of our data just to get
# a single cutoff value.
# Alternatively, the model can be tuned, using resampling, to determine any model tuning
# parameters as well as an appropriate cutoff for the probabilities.
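# As a point of reference, pROC's coords() can suggest a cutoff directly from a ROC
# curve. The call below is only a sketch of the mechanics: it uses roc0, which was
# built on the test set, whereas in practice the cutoff should come from a separate
# evaluation set (or from resampling, as done below).
coords(roc0, x = "best", best.method = "closest.topleft",
       ret = c("threshold", "specificity", "sensitivity"))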

## Get the model code for the original random forest method:
# Suppose the model has one tuning parameter and we want to look at four candidate values for tuning.
# Suppose we also want to tune the probability cutoff over 20 different thresholds.
# Then we would have to look at 20 × 4 = 80 different models (and that is for each resample).
# One other feature that has been opened up is the ability to use sequential parameters:
#     these are tuning parameters that don't require a completely new model fit to produce predictions.
# In this case, we can fit one random forest model, get its predicted class probabilities, and
# evaluate the candidate probability cutoffs using those same hold-out samples. Here is what the
# model code looks like:

thresh_code <- getModelInfo("rf", regex = FALSE)[[1]]
# The rf model is modified by changing the following components:
#         thresh_code$type
#         thresh_code$parameters
#         thresh_code$grid
#         thresh_code$loop
#         thresh_code$fit
#         thresh_code$predict
#         thresh_code$prob

                thresh_code$type <- c("Classification") #...1
                ## Add the threshold as another tuning parameter
                thresh_code$parameters <- data.frame(parameter = c("mtry", "threshold"),  #...2
                                                     class = c("numeric", "numeric"),
                                                     label = c("#Randomly Selected Predictors",
                                                               "Probability Cutoff"))
                ## The default tuning grid code:
                thresh_code$grid <- function(x, y, len = NULL, search = "grid") {  #...3
                    p <- ncol(x)
                    if(search == "grid") {
                        grid <- expand.grid(mtry = floor(sqrt(p)),
                                            threshold = seq(.01, .99, length = len))
                    } else {
                        grid <- expand.grid(mtry = sample(1:p, size = len),
                                            threshold = runif(len, min = 0, max = 1))
                    }
                    grid
                }
                
                ## Here we fit a single random forest model (with a fixed mtry)
                ## and loop over the threshold values to get predictions from the same
                ## randomForest model.
                thresh_code$loop = function(grid) {    #...4
                    library(plyr)
                    loop <- ddply(grid, c("mtry"),
                                  function(x) c(threshold = max(x$threshold)))
                    submodels <- vector(mode = "list", length = nrow(loop))
                    for(i in seq(along = loop$threshold)) {
                        index <- which(grid$mtry == loop$mtry[i])
                        cuts <- grid[index, "threshold"]
                        submodels[[i]] <- data.frame(threshold = cuts[cuts != loop$threshold[i]])
                    }
                    list(loop = loop, submodels = submodels)
                }
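                ## A small sketch of how grid and loop interact (hypothetical values, not
                ## part of the tuning done below): loop() keeps a single row per mtry, so
                ## only one forest is fit per mtry value, and the remaining thresholds are
                ## filed under submodels and evaluated from that same fit.
                example_grid <- expand.grid(mtry = 3, threshold = c(.2, .5, .8))
                thresh_code$loop(example_grid)
                ## The returned loop element has mtry = 3 with threshold = 0.8; the
                ## submodels element holds the remaining thresholds 0.2 and 0.5.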
                
                ## Fit the model independent of the threshold parameter
                thresh_code$fit = function(x, y, wts, param, lev, last, classProbs, ...) {  #...5
                    if(length(levels(y)) != 2)
                        stop("This works only for 2-class problems")
                    randomForest(x, y, mtry = param$mtry, ...)
                }
                
                ## Now get a probability prediction and use different thresholds to
                ## get the predicted class
                thresh_code$predict = function(modelFit, newdata, submodels = NULL) {    #...6
                    class1Prob <- predict(modelFit,
                                          newdata,
                                          type = "prob")[, modelFit$obsLevels[1]]
                    ## Raising the threshold for class #1 means that more evidence is
                    ## needed to call a sample class 1, so it should
                    ## decrease sensitivity and increase specificity
                    out <- ifelse(class1Prob >= modelFit$tuneValue$threshold,
                                  modelFit$obsLevels[1],
                                  modelFit$obsLevels[2])
                    if(!is.null(submodels)) {
                        tmp2 <- out
                        out <- vector(mode = "list", length = length(submodels$threshold))
                        out[[1]] <- tmp2
                        for(i in seq(along = submodels$threshold)) {
                            out[[i+1]] <- ifelse(class1Prob >= submodels$threshold[[i]],
                                                 modelFit$obsLevels[1],
                                                 modelFit$obsLevels[2])
                        }
                    }
                    out
                }                  
                ## The class probabilities are always the same; we just have to create
                ## multiple copies of them so that each candidate threshold has a set of
                ## probabilities to evaluate
                thresh_code$prob = function(modelFit, newdata, submodels = NULL) {   #...7
                    out <- as.data.frame(predict(modelFit, newdata, type = "prob"))
                    if(!is.null(submodels)) {
                        probs <- out
                        out <- vector(mode = "list", length = length(submodels$threshold)+1)
                        out <- lapply(out, function(x) probs)
                    }
                    out
                }
                
                ### for summaryFunction in trControl 
                fourStats <- function (data, lev = levels(data$obs), model = NULL) {
                    ## This summary function computes the area under the ROC curve along
                    ## with the sensitivity and specificity values obtained at the current
                    ## candidate value of the probability threshold.
                    out <- c(twoClassSummary(data, lev = levels(data$obs), model = NULL))
                    
                    ## The best possible model has sensitivity of 1 and specificity of 1. 
                    ## How far are we from that value?
                    coords <- matrix(c(1, 1, out["Spec"], out["Sens"]),
                                     ncol = 2,
                                     byrow = TRUE)
                    colnames(coords) <- c("Spec", "Sens")
                    rownames(coords) <- c("Best", "Current")
                    c(out, Dist = dist(coords)[1])
                }
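                ## In other words, Dist is the Euclidean distance from the ideal corner of
                ## ROC space (Sens = 1, Spec = 1). A minimal sketch of the equivalent
                ## calculation, using hypothetical sensitivity/specificity values:
                sens <- 0.97; spec <- 0.93
                sqrt((1 - sens)^2 + (1 - spec)^2)  # same value dist() returns above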
################## Modeling with customized RF model ###################
#>>>>>>>>>>>>>>>
set.seed(949)
mod1 <- train(Class ~ ., data = trainingSet,
              method = thresh_code,  # modified model -- in lieu of "rf"
              ## Minimize the distance to the perfect model
              metric = "Dist",
              maximize = FALSE,
              tuneLength = 10,   #  20,
              ntree = 200,          # 1000,
              trControl = trainControl(method = "cv",  #      "repeatedcv",
                                       #       repeats = 5,
                                       classProbs = TRUE,
                                       summaryFunction = fourStats))
#<<<<<<<<<<<<<<<<
mod1
## Random Forest 
## 
## 2310 samples
##   19 predictor
##    2 classes: 'NO', 'YES' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## 
## Summary of sample sizes: 2079, 2079, 2079, 2079, 2079, 2079, ... 
## 
## Resampling results across tuning parameters:
## 
##   threshold  ROC        Sens       Spec         Dist        ROC SD   
##   0.0100000  0.9883673  1.0000000  0.004761905  0.99523810  0.0129724
##   0.1188889  0.9883673  1.0000000  0.247619048  0.75238095  0.0129724
##   0.2277778  0.9883673  1.0000000  0.404761905  0.59523810  0.0129724
##   0.3366667  0.9883673  0.9980952  0.523809524  0.47620107  0.0129724
##   0.4455556  0.9883673  0.9966667  0.652380952  0.34765627  0.0129724
##   0.5544444  0.9883673  0.9942857  0.771428571  0.22873865  0.0129724
##   0.6633333  0.9883673  0.9866667  0.866666667  0.13690502  0.0129724
##   0.7722222  0.9883673  0.9690476  0.928571429  0.08679229  0.0129724
##   0.8811111  0.9883673  0.9166667  0.976190476  0.09334572  0.0129724
##   0.9900000  0.9883673  0.5823810  0.995238095  0.41787433  0.0129724
##   Sens SD      Spec SD     Dist SD   
##   0.000000000  0.01505847  0.01505847
##   0.000000000  0.09733149  0.09733149
##   0.000000000  0.10588622  0.10588622
##   0.002459037  0.08399211  0.08398538
##   0.003214041  0.11235135  0.11232431
##   0.004918074  0.12046772  0.12036599
##   0.010480999  0.11620956  0.11286980
##   0.006447650  0.09323286  0.08424655
##   0.024203096  0.04627740  0.03730600
##   0.034415539  0.01505847  0.03463233
## 
## Tuning parameter 'mtry' was held constant at a value of 6
## Dist was used to select the optimal model using  the smallest value.
## The final values used for the model were mtry = 6 and threshold
##  = 0.7722222.
# Plot the resampled sensitivity, specificity, and distance profiles across the cutoffs
metrics <- mod1$results[, c(2, 4:6)]
metrics <- melt(metrics, id.vars = "threshold",
                variable.name = "Resampled",
                value.name = "Data")

ggplot(metrics, aes(x = threshold, y = Data, color = Resampled)) +
    geom_line() +
    ylab("") + xlab("Probability Cutoff") +
    theme(legend.position = "top")
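
# A possible follow-up, sketched here rather than run: predict() on the custom model
# applies the selected threshold (0.772 above) automatically, so the tuned cutoff can be
# checked against the held-out test set with a confusion matrix.
test_pred <- predict(mod1, testingSet)
confusionMatrix(test_pred, testingSet$Class)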