library(recipes)
library(modeldata)
library(themis)
library(ggplot2)
data(okc)
head(okc)## # A tibble: 6 x 6
## age diet height location date Class
## <int> <chr> <int> <chr> <date> <fct>
## 1 22 strictly anything 75 south san francisco 2012-06-28 other
## 2 35 mostly other 70 oakland 2012-06-29 other
## 3 38 anything 68 san francisco 2012-06-27 other
## 4 23 vegetarian 71 berkeley 2012-06-28 other
## 5 29 <NA> 66 san francisco 2012-06-27 other
## 6 29 mostly anything 67 san francisco 2012-06-29 stem
#library(DT)
#datatable(okc)
summary(okc)## age diet height location
## Min. : 18.00 Length:59855 Min. : 1.00 Length:59855
## 1st Qu.: 26.00 Class :character 1st Qu.:66.00 Class :character
## Median : 30.00 Mode :character Median :68.00 Mode :character
## Mean : 32.34 Mean :68.29
## 3rd Qu.: 37.00 3rd Qu.:71.00
## Max. :110.00 Max. :95.00
## NA's :2
## date Class
## Min. :2011-06-27 stem : 9539
## 1st Qu.:2012-05-29 other:50316
## Median :2012-06-27
## Mean :2012-05-21
## 3rd Qu.:2012-06-30
## Max. :2012-07-01
##
sort(table(okc$Class, useNA = "always"))##
## <NA> stem other
## 0 9539 50316
ds_rec <- recipe(Class ~ age + height, data = okc) %>%
step_meanimpute(all_predictors()) %>%
step_smote(Class) %>%
prep()sort(table(juice(ds_rec)$Class, useNA = "always"))##
## <NA> stem other
## 0 50316 50316
baked_okc <- bake(ds_rec, new_data = okc)
table(baked_okc$Class, useNA = "always")##
## stem other <NA>
## 9539 50316 0
ggplot(circle_example, aes(x, y, color = class)) +
geom_point() +theme_bw()+
labs(title = "Without ADASYN")recipe(class ~ ., data = circle_example) %>%
step_adasyn(class) %>%
prep() %>%
juice() %>%
ggplot(aes(x, y, color = class)) +theme_bw()+
geom_point()+theme_bw()+
labs(title = "With ADASYN")sort(table(okc$diet, useNA = "always"))##
## halal kosher strictly halal strictly kosher
## 11 11 18 18
## mostly halal mostly kosher vegan strictly vegan
## 48 86 136 227
## other mostly vegan strictly other vegetarian
## 331 335 450 665
## strictly vegetarian mostly other mostly vegetarian strictly anything
## 874 1004 3438 5107
## anything mostly anything <NA>
## 6174 16562 24360
ds_rec <- recipe( ~ ., data = okc) %>%
step_downsample(diet) %>%
prep(training = okc, retain = TRUE)
sort(table(juice(ds_rec)$diet, useNA = "always"))##
## anything halal kosher mostly anything
## 11 11 11 11
## mostly halal mostly kosher mostly other mostly vegan
## 11 11 11 11
## mostly vegetarian other strictly anything strictly halal
## 11 11 11 11
## strictly kosher strictly other strictly vegan strictly vegetarian
## 11 11 11 11
## vegan vegetarian <NA>
## 11 11 11
# since `skip` defaults to TRUE, baking the step has no effect
baked_okc <- bake(ds_rec, new_data = okc)
table(baked_okc$diet, useNA = "always")##
## anything halal kosher mostly anything
## 6174 11 11 16562
## mostly halal mostly kosher mostly other mostly vegan
## 48 86 1004 335
## mostly vegetarian other strictly anything strictly halal
## 3438 331 5107 18
## strictly kosher strictly other strictly vegan strictly vegetarian
## 18 450 227 874
## vegan vegetarian <NA>
## 136 665 24360
sort(table(okc$Class, useNA = "always"))##
## <NA> stem other
## 0 9539 50316
ds_rec <- recipe(Class ~ age + height, data = okc) %>%
step_meanimpute(all_predictors()) %>%
step_nearmiss(Class) %>%
prep()
sort(table(juice(ds_rec)$Class, useNA = "always"))##
## <NA> other stem
## 0 9539 9539
# since `skip` defaults to TRUE, baking the step has no effect
baked_okc <- bake(ds_rec, new_data = okc)
table(baked_okc$Class, useNA = "always")##
## other stem <NA>
## 50316 9539 0
ggplot(circle_example, aes(x, y, color = class)) +
geom_point() +theme_bw()+
labs(title = "Without NEARMISS") +
xlim(c(1, 15)) +
ylim(c(1, 15))recipe(class ~ ., data = circle_example) %>%
step_nearmiss(class) %>%
prep() %>%
juice() %>%
ggplot(aes(x, y, color = class)) +
geom_point() +theme_bw()+
labs(title = "With NEARMISS") +
xlim(c(1, 15)) +theme_bw()+
ylim(c(1, 15))sort(table(okc$Class, useNA = "always"))##
## <NA> stem other
## 0 9539 50316
ds_rec <- recipe(Class ~ age + height, data = okc) %>%
step_rose(Class) %>%
prep()
sort(table(juice(ds_rec)$Class, useNA = "always"))##
## <NA> other stem
## 0 50165 50467
# since `skip` defaults to TRUE, baking the step has no effect
baked_okc <- bake(ds_rec, new_data = okc)
table(baked_okc$Class, useNA = "always")##
## other stem <NA>
## 50316 9539 0
ds_rec2 <- recipe(Class ~ age + height, data = okc) %>%
step_rose(Class, minority_prop = 0.3) %>%
prep()
table(juice(ds_rec2)$Class, useNA = "always")##
## other stem <NA>
## 70075 30557 0
ggplot(circle_example, aes(x, y, color = class)) +
geom_point() +theme_bw()+
labs(title = "Without ROSE")recipe(class ~ ., data = circle_example) %>%
step_rose(class) %>%
prep() %>%
juice() %>%
ggplot(aes(x, y, color = class)) +
geom_point() +theme_bw()+
labs(title = "With ROSE")sort(table(okc$Class, useNA = "always"))##
## <NA> stem other
## 0 9539 50316
ds_rec <- recipe(Class ~ age + height, data = okc) %>%
step_meanimpute(all_predictors()) %>%
step_tomek(Class) %>%
prep()
sort(table(juice(ds_rec)$Class, useNA = "always"))##
## <NA> stem other
## 0 9539 49710
# since `skip` defaults to TRUE, baking the step has no effect
baked_okc <- bake(ds_rec, new_data = okc)
table(baked_okc$Class, useNA = "always")##
## other stem <NA>
## 50316 9539 0
library(ggplot2)
ggplot(circle_example, aes(x, y, color = class)) +
geom_point() +theme_bw()+
labs(title = "Without Tomek") +
xlim(c(1, 15)) +
ylim(c(1, 15))recipe(class ~ ., data = circle_example) %>%
step_tomek(class) %>%
prep() %>%
juice() %>%
ggplot(aes(x, y, color = class)) +
geom_point() +theme_bw()+
labs(title = "With Tomek") +
xlim(c(1, 15)) +
ylim(c(1, 15))orig <- table(okc$diet, useNA = "always")
sort(orig, decreasing = TRUE)##
## <NA> mostly anything anything strictly anything
## 24360 16562 6174 5107
## mostly vegetarian mostly other strictly vegetarian vegetarian
## 3438 1004 874 665
## strictly other mostly vegan other strictly vegan
## 450 335 331 227
## vegan mostly kosher mostly halal strictly halal
## 136 86 48 18
## strictly kosher halal kosher
## 18 11 11
up_rec <- recipe( ~ ., data = okc) %>%
# Bring the minority levels up to about 200 each
# 200/16562 is approx 0.0121
step_upsample(diet, over_ratio = 0.0121) %>%
prep(training = okc, retain = TRUE)
training <- table(juice(up_rec)$diet, useNA = "always")
# Since `skip` defaults to TRUE, baking the step has no effect
baked_okc <- bake(up_rec, new_data = okc)
baked <- table(baked_okc$diet, useNA = "always")
# Note that if the original data contained more rows than the
# target n (= ratio * majority_n), the data are left alone:
data.frame(
level = names(orig),
orig_freq = as.vector(orig),
train_freq = as.vector(training),
baked_freq = as.vector(baked)
)## level orig_freq train_freq baked_freq
## 1 anything 6174 6174 6174
## 2 halal 11 200 11
## 3 kosher 11 200 11
## 4 mostly anything 16562 16562 16562
## 5 mostly halal 48 200 48
## 6 mostly kosher 86 200 86
## 7 mostly other 1004 1004 1004
## 8 mostly vegan 335 335 335
## 9 mostly vegetarian 3438 3438 3438
## 10 other 331 331 331
## 11 strictly anything 5107 5107 5107
## 12 strictly halal 18 200 18
## 13 strictly kosher 18 200 18
## 14 strictly other 450 450 450
## 15 strictly vegan 227 227 227
## 16 strictly vegetarian 874 874 874
## 17 vegan 136 200 136
## 18 vegetarian 665 665 665
## 19 <NA> 24360 24360 24360
ggplot(circle_example, aes(x, y, color = class)) +
geom_point() +theme_bw()+
labs(title = "Without upsample")recipe(class ~ ., data = circle_example) %>%
step_nearmiss(class) %>%
prep() %>%
juice() %>%
ggplot(aes(x, y, color = class)) +
geom_jitter() +theme_bw()+
labs(title = "With upsample (with jittering)")data(credit_data)
head(credit_data)## Status Seniority Home Time Age Marital Records Job Expenses Income
## 1 good 9 rent 60 30 married no freelance 73 129
## 2 good 17 rent 60 58 widow no fixed 48 131
## 3 bad 10 owner 36 46 married yes freelance 90 200
## 4 good 0 rent 60 24 single no fixed 63 182
## 5 good 0 rent 36 26 single no fixed 46 107
## 6 good 1 owner 60 36 married no fixed 75 214
## Assets Debt Amount Price
## 1 0 0 800 846
## 2 0 0 1000 1658
## 3 3000 0 2000 2985
## 4 2500 0 900 1325
## 5 0 0 310 910
## 6 3500 0 650 1645
dim(credit_data)## [1] 4454 14
#datatable(credit_data)
sort(table(credit_data$Status, useNA = "always"))##
## <NA> bad good
## 0 1254 3200
ds_rec <- recipe(Status ~ Age + Income + Assets, data = credit_data) %>%
step_meanimpute(all_predictors()) %>%
step_bsmote(Status) %>%
prep()
sort(table(juice(ds_rec)$Status, useNA = "always"))##
## <NA> bad good
## 0 3200 3200
# since `skip` defaults to TRUE, baking the step has no effect
baked_okc <- bake(ds_rec, new_data = credit_data)
table(baked_okc$Status, useNA = "always")##
## bad good <NA>
## 1254 3200 0
ds_rec2 <- recipe(Status ~ Age + Income + Assets, data = credit_data) %>%
step_meanimpute(all_predictors()) %>%
step_bsmote(Status, over_ratio = 0.2) %>%
prep()
table(juice(ds_rec2)$Status, useNA = "always")##
## bad good <NA>
## 1254 3200 0
ggplot(circle_example, aes(x, y, color = class)) +
geom_point() +
labs(title = "Without SMOTE")recipe(class ~ ., data = circle_example) %>%
step_bsmote(class, all_neighbors = FALSE) %>%
prep() %>%
juice() %>%
ggplot(aes(x, y, color = class)) +
geom_point()+theme_bw()+
labs(title = "With borderline-SMOTE, all_neighbors = FALSE")recipe(class ~ ., data = circle_example) %>%
step_bsmote(class, all_neighbors = TRUE) %>%
prep() %>%
juice() %>%
ggplot(aes(x, y, color = class)) +
geom_point() +theme_bw()+
labs(title = "With borderline-SMOTE, all_neighbors = TRUE")sort(table(credit_data$Status, useNA = "always"))##
## <NA> bad good
## 0 1254 3200
ds_rec <- recipe(Status ~ Age + Income + Assets, data = credit_data) %>%
step_meanimpute(all_predictors()) %>%
step_smote(Status) %>%
prep()
sort(table(juice(ds_rec)$Status, useNA = "always"))##
## <NA> bad good
## 0 3200 3200
# since `skip` defaults to TRUE, baking the step has no effect
baked_okc <- bake(ds_rec, new_data = credit_data)
table(baked_okc$Status, useNA = "always")##
## bad good <NA>
## 1254 3200 0
ds_rec2 <- recipe(Status ~ Age + Income + Assets, data = credit_data) %>%
step_meanimpute(all_predictors()) %>%
step_smote(Status, over_ratio = 0.2) %>%
prep()
table(juice(ds_rec2)$Status, useNA = "always")##
## bad good <NA>
## 1254 3200 0
ggplot(circle_example, aes(x, y, color = class)) +
geom_point() +theme_bw()+
labs(title = "Without SMOTE")recipe(class ~ ., data = circle_example) %>%
step_smote(class) %>%
prep() %>%
juice() %>%
ggplot(aes(x, y, color = class)) +
geom_point() +theme_bw()+
labs(title = "With SMOTE")