library(ROSE)
data(hacide)   # simulated, highly imbalanced dataset shipped with ROSE
str(hacide.train)
## 'data.frame': 1000 obs. of 3 variables:
## $ cls: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ x1 : num 0.2008 0.0166 0.2287 0.1264 0.6008 ...
## $ x2 : num 0.678 1.5766 -0.5595 -0.0938 -0.2984 ...
table(hacide.train$cls)
##
## 0 1
## 980 20
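# As a quick illustration, the same skew as proportions: 98% majority vs 2% minority.
prop.table(table(hacide.train$cls))
##
##    0    1
## 0.98 0.02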
library(rpart)
treeimb <- rpart(cls ~ ., data = hacide.train)
# for a classification tree, predict() returns a matrix of class
# probabilities; column 2 is P(cls = "1")
pred.treeimb <- predict(treeimb, newdata = hacide.test)
accuracy.meas(hacide.test$cls, pred.treeimb[,2])
##
## Call:
## accuracy.meas(response = hacide.test$cls, predicted = pred.treeimb[,
## 2])
##
## Examples are labelled as positive when predicted is greater than 0.5
##
## precision: 1.000
## recall: 0.200
## F: 0.167
roc.curve(hacide.test$cls, pred.treeimb[,2], plotit = FALSE)
## Area under the curve (AUC): 0.600
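# A minimal illustration: thresholding the predicted probability of class "1"
# at 0.5 (the same cutoff accuracy.meas reports using) and cross-tabulating
# against the truth shows how few positives the imbalanced tree recovers.
pred.class <- ifelse(pred.treeimb[, 2] > 0.5, "1", "0")
table(predicted = pred.class, actual = hacide.test$cls)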
# over-sampling: replicate minority cases until N = 1960 (980 + 980)
data_balanced_over <- ovun.sample(cls ~ ., data = hacide.train, method = "over", N = 1960)$data
table(data_balanced_over$cls)
##
## 0 1
## 980 980
# under-sampling: keep only N = 40 cases (20 + 20), discarding majority-class information
data_balanced_under <- ovun.sample(cls ~ ., data = hacide.train, method = "under", N = 40, seed = 1)$data
table(data_balanced_under$cls)
##
## 0 1
## 20 20
# both: minority over-sampled and majority under-sampled to N = 1000
data_balanced_both <- ovun.sample(cls ~ ., data = hacide.train, method = "both", p = 0.5, N = 1000, seed = 1)$data
table(data_balanced_both$cls)
##
## 0 1
## 520 480
# ROSE generates a synthetic, roughly balanced sample
data.rose <- ROSE(cls ~ ., data = hacide.train, seed = 1)$data
table(data.rose$cls)
##
## 0 1
## 520 480
# build decision tree models
tree.rose <- rpart(cls ~ ., data = data.rose)
tree.over <- rpart(cls ~ ., data = data_balanced_over)
tree.under <- rpart(cls ~ ., data = data_balanced_under)
tree.both <- rpart(cls ~ ., data = data_balanced_both)
# make predictions on unseen data
pred.tree.rose <- predict(tree.rose, newdata = hacide.test)
pred.tree.over <- predict(tree.over, newdata = hacide.test)
pred.tree.under <- predict(tree.under, newdata = hacide.test)
pred.tree.both <- predict(tree.both, newdata = hacide.test)
# AUC of each model on the test set (column 2 holds the predicted
# probability of class "1")
roc.curve(hacide.test$cls, pred.tree.rose[,2], plotit = FALSE)
## Area under the curve (AUC): 0.993
roc.curve(hacide.test$cls, pred.tree.over[,2], plotit = FALSE)
## Area under the curve (AUC): 0.798
roc.curve(hacide.test$cls, pred.tree.under[,2], plotit = FALSE)
## Area under the curve (AUC): 0.924
roc.curve(hacide.test$cls, pred.tree.both[,2], plotit = FALSE)
## Area under the curve (AUC): 0.798
ROSE.holdout <- ROSE.eval(cls ~ ., data = hacide.train, learner = rpart,
                          method.assess = "holdout",
                          extr.pred = function(obj) obj[, 2], seed = 1)
ROSE.holdout
##
## Call:
## ROSE.eval(formula = cls ~ ., data = hacide.train, learner = rpart,
## extr.pred = function(obj) obj[, 2], method.assess = "holdout",
## seed = 1)
##
## Holdout estimate of auc: 0.980
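# Besides "holdout", ROSE.eval also accepts bootstrap assessment. A minimal
# sketch, assuming B = 50 resamples is enough for illustration:
ROSE.boot <- ROSE.eval(cls ~ ., data = hacide.train, learner = rpart,
                       method.assess = "BOOT", B = 50,
                       extr.pred = function(obj) obj[, 2], seed = 1)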
library(imbalance)
data(newthyroid1)   # thyroid dataset shipped with imbalance: 180 negatives, 35 positives
head(newthyroid1)
## T3resin Thyroxin Triiodothyronine Thyroidstimulating TSH_value Class
## 1 105 7.3 1.5 1.5 -0.1 negative
## 2 67 23.3 7.4 1.8 -0.6 positive
## 3 111 8.4 1.5 0.8 1.2 negative
## 4 89 14.3 4.1 0.5 0.2 positive
## 5 105 9.5 1.8 1.6 3.6 negative
## 6 110 20.3 3.7 0.6 0.2 positive
summary(newthyroid1)
## T3resin Thyroxin Triiodothyronine Thyroidstimulating
## Min. : 65.0 Min. : 0.500 Min. : 0.20 Min. : 0.10
## 1st Qu.:103.0 1st Qu.: 7.100 1st Qu.: 1.35 1st Qu.: 1.00
## Median :110.0 Median : 9.200 Median : 1.70 Median : 1.30
## Mean :109.6 Mean : 9.805 Mean : 2.05 Mean : 2.88
## 3rd Qu.:117.5 3rd Qu.:11.300 3rd Qu.: 2.20 3rd Qu.: 1.70
## Max. :144.0 Max. :25.300 Max. :10.00 Max. :56.40
## TSH_value Class
## Min. :-0.700 negative:180
## 1st Qu.: 0.550 positive: 35
## Median : 2.000
## Mean : 4.199
## 3rd Qu.: 4.100
## Max. :56.300
numPositive <- length(which(newthyroid1$Class == "positive"))
numNegative <- length(which(newthyroid1$Class == "negative"))
nInstances <- numNegative - numPositive
# generate nInstances (180 - 35 = 145) synthetic positives with PDFOS so the
# bound dataset below comes out balanced
newSamples <- pdfos(dataset = newthyroid1, numInstances = nInstances,
                    classAttr = "Class")
# Bind a balanced dataset
newDataset <- rbind(newthyroid1, newSamples)
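# A minimal check, assuming the objects above: the bound dataset is now
# balanced, and plotComparison() from imbalance contrasts the real and
# synthetic points on a few attributes.
table(newDataset$Class)
plotComparison(newthyroid1, newDataset, attrs = names(newthyroid1)[1:3],
               classAttr = "Class")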
### SOURCE: https://dpmartin42.github.io/
library(dplyr) # for data manipulation
library(caret) # for model-building
library(DMwR) # for smote implementation
library(purrr) # for functional programming (map)
library(pROC) # for AUC calculations
library(PRROC) # for Precision-Recall curve calculations
set.seed(2969)
imbal_train <- twoClassSim(5000,
intercept = -25,
linearVars = 20,
noiseVars = 10)
imbal_test <- twoClassSim(5000,
intercept = -25,
linearVars = 20,
noiseVars = 10)
prop.table(table(imbal_train$Class))
##
## Class1 Class2
## 0.9796 0.0204
# Set up control function for training
ctrl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 5,
summaryFunction = twoClassSummary,
classProbs = TRUE)
# Build a standard classifier using a gradient boosted machine
set.seed(5627)
orig_fit <- train(Class ~ .,
data = imbal_train,
method = "gbm",
verbose = FALSE,
metric = "ROC",
trControl = ctrl)
# Build custom AUC function to extract AUC
# from the caret model object
test_roc <- function(model, data) {
roc(data$Class,
predict(model, data, type = "prob")[, "Class2"])
}
orig_fit %>%
test_roc(data = imbal_test) %>%
auc()
## Area under the curve: 0.9456
# Create model weights (they sum to one)
model_weights <- ifelse(imbal_train$Class == "Class1",
(1/table(imbal_train$Class)[1]) * 0.5,
(1/table(imbal_train$Class)[2]) * 0.5)
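# A quick sanity check: each class contributes half of the total weight,
# so the weights sum to one as the comment above claims.
sum(model_weights)
## [1] 1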
# Use the same seed to ensure same cross-validation splits
ctrl$seeds <- orig_fit$control$seeds
# Build weighted model
weighted_fit <- train(Class ~ .,
data = imbal_train,
method = "gbm",
verbose = FALSE,
weights = model_weights,
metric = "ROC",
trControl = ctrl)
# Build down-sampled model
ctrl$sampling <- "down"
down_fit <- train(Class ~ .,
data = imbal_train,
method = "gbm",
verbose = FALSE,
metric = "ROC",
trControl = ctrl)
# Build up-sampled model
ctrl$sampling <- "up"
up_fit <- train(Class ~ .,
data = imbal_train,
method = "gbm",
verbose = FALSE,
metric = "ROC",
trControl = ctrl)
# Build smote model
ctrl$sampling <- "smote"
smote_fit <- train(Class ~ .,
data = imbal_train,
method = "gbm",
verbose = FALSE,
metric = "ROC",
trControl = ctrl)
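# Note: with ctrl$sampling set, caret applies the rebalancing inside each
# cross-validation fold rather than once up front, so the held-out fold
# never leaks into the rebalancing step.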
# Examine results for test set
model_list <- list(original = orig_fit,
weighted = weighted_fit,
down = down_fit,
up = up_fit,
SMOTE = smote_fit)
model_list_roc <- model_list %>%
map(test_roc, data = imbal_test)
model_list_roc %>%
map(auc)
## $original
## Area under the curve: 0.9456
##
## $weighted
## Area under the curve: 0.9793
##
## $down
## Area under the curve: 0.9675
##
## $up
## Area under the curve: 0.9784
##
## $SMOTE
## Area under the curve: 0.9768
results_list_roc <- list(NA)
num_mod <- 1
for(the_roc in model_list_roc){
  results_list_roc[[num_mod]] <-
    tibble(tpr = the_roc$sensitivities,
           fpr = 1 - the_roc$specificities,
           model = names(model_list)[num_mod])
  num_mod <- num_mod + 1
}
results_df_roc <- bind_rows(results_list_roc)
# Plot ROC curve for all 5 models
custom_col <- c("#000000", "#009E73", "#0072B2", "#D55E00", "#CC79A7")
ggplot(aes(x = fpr, y = tpr, group = model), data = results_df_roc) +
geom_line(aes(color = model), size = 1) +
scale_color_manual(values = custom_col) +
geom_abline(intercept = 0, slope = 1, color = "gray", size = 1) +
theme_bw(base_size = 18)
calc_auprc <- function(model, data){
  index_class2 <- data$Class == "Class2"
  index_class1 <- data$Class == "Class1"
  predictions <- predict(model, data, type = "prob")
  # pr.curve() takes positive-class scores first (scores.class0),
  # then negative-class scores (scores.class1)
  pr.curve(predictions$Class2[index_class2],
           predictions$Class2[index_class1],
           curve = TRUE)
}
# Get results for all 5 models
model_list_pr <- model_list %>%
map(calc_auprc, data = imbal_test)
model_list_pr %>%
map(function(the_mod) the_mod$auc.integral)
## $original
## [1] 0.5187951
##
## $weighted
## [1] 0.640687
##
## $down
## [1] 0.53057
##
## $up
## [1] 0.6246232
##
## $SMOTE
## [1] 0.6341753
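# For reference, the no-skill AUPRC baseline equals the prevalence of the
# positive class in the test set (the gray horizontal line in the plot below):
sum(imbal_test$Class == "Class2") / nrow(imbal_test)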
# Plot the AUPRC curve for all 5 models
results_list_pr <- list(NA)
num_mod <- 1
for(the_pr in model_list_pr){
  results_list_pr[[num_mod]] <-
    tibble(recall = the_pr$curve[, 1],
           precision = the_pr$curve[, 2],
           model = names(model_list_pr)[num_mod])
  num_mod <- num_mod + 1
}
results_df_pr <- bind_rows(results_list_pr)
custom_col <- c("#000000", "#009E73", "#0072B2", "#D55E00", "#CC79A7")
ggplot(aes(x = recall, y = precision, group = model),
data = results_df_pr) +
geom_line(aes(color = model), size = 1) +
scale_color_manual(values = custom_col) +
geom_abline(intercept =
sum(imbal_test$Class == "Class2")/nrow(imbal_test),
slope = 0, color = "gray", size = 1) +
theme_bw()
auprcSummary <- function(data, lev = NULL, model = NULL){
index_class2 <- data$obs == "Class2"
index_class1 <- data$obs == "Class1"
the_curve <- pr.curve(data$Class2[index_class2],
data$Class2[index_class1],
curve = FALSE)
out <- the_curve$auc.integral
names(out) <- "AUPRC"
out
}
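# A minimal sketch, using toy data, of what caret passes to a summaryFunction:
# a data frame of observed classes plus per-class probability columns, one
# row per held-out observation.
toy <- data.frame(obs = factor(c("Class1", "Class1", "Class2", "Class2")),
                  Class2 = c(0.1, 0.4, 0.35, 0.8))
auprcSummary(toy)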
# Re-initialize control function to remove smote and
# include our new summary function
ctrl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 5,
summaryFunction = auprcSummary,
classProbs = TRUE,
seeds = orig_fit$control$seeds)
orig_pr <- train(Class ~ .,
data = imbal_train,
method = "gbm",
verbose = FALSE,
metric = "AUPRC",
trControl = ctrl)
# Get results for auprc on the test set
orig_fit_test <- orig_fit %>%
calc_auprc(data = imbal_test) %>%
(function(the_mod) the_mod$auc.integral)
orig_pr_test <- orig_pr %>%
calc_auprc(data = imbal_test) %>%
(function(the_mod) the_mod$auc.integral)
# The test errors are the same
identical(orig_fit_test,
orig_pr_test)
## [1] TRUE
# Because both chose the same
# hyperparameter combination
identical(orig_fit$bestTune,
orig_pr$bestTune)
## [1] TRUE