Unbalanced Data

data ‘okc’

smote

library(recipes)
library(modeldata)
library(themis)
library(ggplot2)

data(okc)
head(okc)
## # A tibble: 6 x 6
##     age diet              height location            date       Class
##   <int> <chr>              <int> <chr>               <date>     <fct>
## 1    22 strictly anything     75 south san francisco 2012-06-28 other
## 2    35 mostly other          70 oakland             2012-06-29 other
## 3    38 anything              68 san francisco       2012-06-27 other
## 4    23 vegetarian            71 berkeley            2012-06-28 other
## 5    29 <NA>                  66 san francisco       2012-06-27 other
## 6    29 mostly anything       67 san francisco       2012-06-29 stem
#library(DT)
#datatable(okc)

summary(okc)
##       age             diet               height        location        
##  Min.   : 18.00   Length:59855       Min.   : 1.00   Length:59855      
##  1st Qu.: 26.00   Class :character   1st Qu.:66.00   Class :character  
##  Median : 30.00   Mode  :character   Median :68.00   Mode  :character  
##  Mean   : 32.34                      Mean   :68.29                     
##  3rd Qu.: 37.00                      3rd Qu.:71.00                     
##  Max.   :110.00                      Max.   :95.00                     
##                                      NA's   :2                         
##       date              Class      
##  Min.   :2011-06-27   stem : 9539  
##  1st Qu.:2012-05-29   other:50316  
##  Median :2012-06-27                
##  Mean   :2012-05-21                
##  3rd Qu.:2012-06-30                
##  Max.   :2012-07-01                
## 
sort(table(okc$Class, useNA = "always"))
## 
##  <NA>  stem other 
##     0  9539 50316
ds_rec <- recipe(Class ~ age + height, data = okc) %>%
  step_meanimpute(all_predictors()) %>%
  step_smote(Class) %>%
  prep()

adasyn

sort(table(juice(ds_rec)$Class, useNA = "always"))
## 
##  <NA>  stem other 
##     0 50316 50316
baked_okc <- bake(ds_rec, new_data = okc)
table(baked_okc$Class, useNA = "always")
## 
##  stem other  <NA> 
##  9539 50316     0
ggplot(circle_example, aes(x, y, color = class)) +
  geom_point() +theme_bw()+
  labs(title = "Without ADASYN")

recipe(class ~ ., data = circle_example) %>%
  step_adasyn(class) %>%
  prep() %>%
  juice() %>%
  ggplot(aes(x, y, color = class)) +theme_bw()+
  geom_point()+theme_bw()+
  labs(title = "With ADASYN")

downsample

sort(table(okc$diet, useNA = "always"))
## 
##               halal              kosher      strictly halal     strictly kosher 
##                  11                  11                  18                  18 
##        mostly halal       mostly kosher               vegan      strictly vegan 
##                  48                  86                 136                 227 
##               other        mostly vegan      strictly other          vegetarian 
##                 331                 335                 450                 665 
## strictly vegetarian        mostly other   mostly vegetarian   strictly anything 
##                 874                1004                3438                5107 
##            anything     mostly anything                <NA> 
##                6174               16562               24360
ds_rec <- recipe( ~ ., data = okc) %>%
  step_downsample(diet) %>%
  prep(training = okc, retain = TRUE)
sort(table(juice(ds_rec)$diet, useNA = "always"))
## 
##            anything               halal              kosher     mostly anything 
##                  11                  11                  11                  11 
##        mostly halal       mostly kosher        mostly other        mostly vegan 
##                  11                  11                  11                  11 
##   mostly vegetarian               other   strictly anything      strictly halal 
##                  11                  11                  11                  11 
##     strictly kosher      strictly other      strictly vegan strictly vegetarian 
##                  11                  11                  11                  11 
##               vegan          vegetarian                <NA> 
##                  11                  11                  11

nearmiss

# since `skip` defaults to TRUE, baking the step has no effect
baked_okc <- bake(ds_rec, new_data = okc)
table(baked_okc$diet, useNA = "always")
## 
##            anything               halal              kosher     mostly anything 
##                6174                  11                  11               16562 
##        mostly halal       mostly kosher        mostly other        mostly vegan 
##                  48                  86                1004                 335 
##   mostly vegetarian               other   strictly anything      strictly halal 
##                3438                 331                5107                  18 
##     strictly kosher      strictly other      strictly vegan strictly vegetarian 
##                  18                 450                 227                 874 
##               vegan          vegetarian                <NA> 
##                 136                 665               24360
sort(table(okc$Class, useNA = "always"))
## 
##  <NA>  stem other 
##     0  9539 50316
ds_rec <- recipe(Class ~ age + height, data = okc) %>%
  step_meanimpute(all_predictors()) %>%
  step_nearmiss(Class) %>%
  prep()


sort(table(juice(ds_rec)$Class, useNA = "always"))
## 
##  <NA> other  stem 
##     0  9539  9539
# since `skip` defaults to TRUE, baking the step has no effect
baked_okc <- bake(ds_rec, new_data = okc)
table(baked_okc$Class, useNA = "always")
## 
## other  stem  <NA> 
## 50316  9539     0
ggplot(circle_example, aes(x, y, color = class)) +
  geom_point() +theme_bw()+
  labs(title = "Without NEARMISS") +
  xlim(c(1, 15)) +
  ylim(c(1, 15))

recipe(class ~ ., data = circle_example) %>%
  step_nearmiss(class) %>%
  prep() %>%
  juice() %>%
  ggplot(aes(x, y, color = class)) +
  geom_point() +theme_bw()+
  labs(title = "With NEARMISS") +
  xlim(c(1, 15)) +theme_bw()+
  ylim(c(1, 15))

ROSE

sort(table(okc$Class, useNA = "always"))
## 
##  <NA>  stem other 
##     0  9539 50316
ds_rec <- recipe(Class ~ age + height, data = okc) %>%
  step_rose(Class) %>%
  prep()
sort(table(juice(ds_rec)$Class, useNA = "always"))
## 
##  <NA> other  stem 
##     0 50165 50467
# since `skip` defaults to TRUE, baking the step has no effect

baked_okc <- bake(ds_rec, new_data = okc)
table(baked_okc$Class, useNA = "always")
## 
## other  stem  <NA> 
## 50316  9539     0
ds_rec2 <- recipe(Class ~ age + height, data = okc) %>%
  step_rose(Class, minority_prop = 0.3) %>%
  prep()
table(juice(ds_rec2)$Class, useNA = "always")
## 
## other  stem  <NA> 
## 70075 30557     0
ggplot(circle_example, aes(x, y, color = class)) +
  geom_point() +theme_bw()+
  labs(title = "Without ROSE")

recipe(class ~ ., data = circle_example) %>%
  step_rose(class) %>%
  prep() %>%
  juice() %>%
  ggplot(aes(x, y, color = class)) +
  geom_point() +theme_bw()+
  labs(title = "With ROSE")

tomek

sort(table(okc$Class, useNA = "always"))
## 
##  <NA>  stem other 
##     0  9539 50316
ds_rec <- recipe(Class ~ age + height, data = okc) %>%
  step_meanimpute(all_predictors()) %>%
  step_tomek(Class) %>%
  prep()
sort(table(juice(ds_rec)$Class, useNA = "always"))
## 
##  <NA>  stem other 
##     0  9539 49710
# since `skip` defaults to TRUE, baking the step has no effect
baked_okc <- bake(ds_rec, new_data = okc)
table(baked_okc$Class, useNA = "always")
## 
## other  stem  <NA> 
## 50316  9539     0
library(ggplot2)
ggplot(circle_example, aes(x, y, color = class)) +
  geom_point() +theme_bw()+
  labs(title = "Without Tomek") +
  xlim(c(1, 15)) +
  ylim(c(1, 15))

recipe(class ~ ., data = circle_example) %>%
  step_tomek(class) %>%
  prep() %>%
  juice() %>%
  ggplot(aes(x, y, color = class)) +
  geom_point() +theme_bw()+
  labs(title = "With Tomek") +
  xlim(c(1, 15)) +
  ylim(c(1, 15))

upsample

orig <- table(okc$diet, useNA = "always")
sort(orig, decreasing = TRUE)
## 
##                <NA>     mostly anything            anything   strictly anything 
##               24360               16562                6174                5107 
##   mostly vegetarian        mostly other strictly vegetarian          vegetarian 
##                3438                1004                 874                 665 
##      strictly other        mostly vegan               other      strictly vegan 
##                 450                 335                 331                 227 
##               vegan       mostly kosher        mostly halal      strictly halal 
##                 136                  86                  48                  18 
##     strictly kosher               halal              kosher 
##                  18                  11                  11
up_rec <- recipe( ~ ., data = okc) %>%
  # Bring the minority levels up to about 200 each
  # 200/16562 is approx 0.0121
  step_upsample(diet, over_ratio = 0.0121) %>%
  prep(training = okc, retain = TRUE)
training <- table(juice(up_rec)$diet, useNA = "always")
# Since `skip` defaults to TRUE, baking the step has no effect
baked_okc <- bake(up_rec, new_data = okc)
baked <- table(baked_okc$diet, useNA = "always")
# Note that if the original data contained more rows than the
# target n (= ratio * majority_n), the data are left alone:
data.frame(
  level = names(orig),
  orig_freq = as.vector(orig),
  train_freq = as.vector(training),
  baked_freq = as.vector(baked)
)
##                  level orig_freq train_freq baked_freq
## 1             anything      6174       6174       6174
## 2                halal        11        200         11
## 3               kosher        11        200         11
## 4      mostly anything     16562      16562      16562
## 5         mostly halal        48        200         48
## 6        mostly kosher        86        200         86
## 7         mostly other      1004       1004       1004
## 8         mostly vegan       335        335        335
## 9    mostly vegetarian      3438       3438       3438
## 10               other       331        331        331
## 11   strictly anything      5107       5107       5107
## 12      strictly halal        18        200         18
## 13     strictly kosher        18        200         18
## 14      strictly other       450        450        450
## 15      strictly vegan       227        227        227
## 16 strictly vegetarian       874        874        874
## 17               vegan       136        200        136
## 18          vegetarian       665        665        665
## 19                <NA>     24360      24360      24360
ggplot(circle_example, aes(x, y, color = class)) +
  geom_point() +theme_bw()+
  labs(title = "Without upsample")

recipe(class ~ ., data = circle_example) %>%
  step_nearmiss(class) %>%
  prep() %>%
  juice() %>%
  ggplot(aes(x, y, color = class)) +
  geom_jitter() +theme_bw()+
  labs(title = "With upsample (with jittering)")

data ‘credit_data’

bsmote

data(credit_data)
head(credit_data)
##   Status Seniority  Home Time Age Marital Records       Job Expenses Income
## 1   good         9  rent   60  30 married      no freelance       73    129
## 2   good        17  rent   60  58   widow      no     fixed       48    131
## 3    bad        10 owner   36  46 married     yes freelance       90    200
## 4   good         0  rent   60  24  single      no     fixed       63    182
## 5   good         0  rent   36  26  single      no     fixed       46    107
## 6   good         1 owner   60  36 married      no     fixed       75    214
##   Assets Debt Amount Price
## 1      0    0    800   846
## 2      0    0   1000  1658
## 3   3000    0   2000  2985
## 4   2500    0    900  1325
## 5      0    0    310   910
## 6   3500    0    650  1645
dim(credit_data)
## [1] 4454   14
#datatable(credit_data)


sort(table(credit_data$Status, useNA = "always"))
## 
## <NA>  bad good 
##    0 1254 3200
ds_rec <- recipe(Status ~ Age + Income + Assets, data = credit_data) %>%
    step_meanimpute(all_predictors()) %>%
    step_bsmote(Status) %>%
    prep()
sort(table(juice(ds_rec)$Status, useNA = "always"))
## 
## <NA>  bad good 
##    0 3200 3200
# since `skip` defaults to TRUE, baking the step has no effect
baked_okc <- bake(ds_rec, new_data = credit_data)
table(baked_okc$Status, useNA = "always")
## 
##  bad good <NA> 
## 1254 3200    0
ds_rec2 <- recipe(Status ~ Age + Income + Assets, data = credit_data) %>%
    step_meanimpute(all_predictors()) %>%
    step_bsmote(Status, over_ratio = 0.2) %>%
    prep()
table(juice(ds_rec2)$Status, useNA = "always")
## 
##  bad good <NA> 
## 1254 3200    0
ggplot(circle_example, aes(x, y, color = class)) +
    geom_point() +
    labs(title = "Without SMOTE")

recipe(class ~ ., data = circle_example) %>%
    step_bsmote(class, all_neighbors = FALSE) %>%
    prep() %>%
    juice() %>%
    ggplot(aes(x, y, color = class)) +
    geom_point()+theme_bw()+
    labs(title = "With borderline-SMOTE, all_neighbors = FALSE")

recipe(class ~ ., data = circle_example) %>%
    step_bsmote(class, all_neighbors = TRUE) %>%
    prep() %>%
    juice() %>%
    ggplot(aes(x, y, color = class)) +
    geom_point() +theme_bw()+
    labs(title = "With borderline-SMOTE, all_neighbors = TRUE")

smote

sort(table(credit_data$Status, useNA = "always"))
## 
## <NA>  bad good 
##    0 1254 3200
ds_rec <- recipe(Status ~ Age + Income + Assets, data = credit_data) %>%
  step_meanimpute(all_predictors()) %>%
  step_smote(Status) %>%
  prep()
sort(table(juice(ds_rec)$Status, useNA = "always"))
## 
## <NA>  bad good 
##    0 3200 3200
# since `skip` defaults to TRUE, baking the step has no effect
baked_okc <- bake(ds_rec, new_data = credit_data)
table(baked_okc$Status, useNA = "always")
## 
##  bad good <NA> 
## 1254 3200    0
ds_rec2 <- recipe(Status ~ Age + Income + Assets, data = credit_data) %>%
  step_meanimpute(all_predictors()) %>%
  step_smote(Status, over_ratio = 0.2) %>%
  prep()
table(juice(ds_rec2)$Status, useNA = "always")
## 
##  bad good <NA> 
## 1254 3200    0
ggplot(circle_example, aes(x, y, color = class)) +
  geom_point() +theme_bw()+
  labs(title = "Without SMOTE")

recipe(class ~ ., data = circle_example) %>%
  step_smote(class) %>%
  prep() %>%
  juice() %>%
  ggplot(aes(x, y, color = class)) +
  geom_point() +theme_bw()+
  labs(title = "With SMOTE")