library(ROSE)
data(hacide)   # simulated, highly imbalanced dataset shipped with ROSE
str(hacide.train)
## 'data.frame': 1000 obs. of 3 variables:
## $ cls: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ x1 : num 0.2008 0.0166 0.2287 0.1264 0.6008 ...
## $ x2 : num 0.678 1.5766 -0.5595 -0.0938 -0.2984 ...
table(hacide.train$cls)
##
## 0 1
## 980 20
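# As a quick illustration, the same skew as proportions: 98% majority vs 2% minority.
prop.table(table(hacide.train$cls))
##
##    0    1
## 0.98 0.02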
library(rpart)
treeimb <- rpart(cls ~ ., data = hacide.train)
# for a classification tree, predict() returns a matrix of class
# probabilities; column 2 is P(cls = "1")
pred.treeimb <- predict(treeimb, newdata = hacide.test)
accuracy.meas(hacide.test$cls, pred.treeimb[,2])
##
## Call:
## accuracy.meas(response = hacide.test$cls, predicted = pred.treeimb[,
## 2])
##
## Examples are labelled as positive when predicted is greater than 0.5
##
## precision: 1.000
## recall: 0.200
## F: 0.167
roc.curve(hacide.test$cls, pred.treeimb[,2], plotit = FALSE)
## Area under the curve (AUC): 0.600
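# A minimal illustration: thresholding the predicted probability of class "1"
# at 0.5 (the same cutoff accuracy.meas reports using) and cross-tabulating
# against the truth shows how few positives the imbalanced tree recovers.
pred.class <- ifelse(pred.treeimb[, 2] > 0.5, "1", "0")
table(predicted = pred.class, actual = hacide.test$cls)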
# over-sampling: replicate minority cases until N = 1960 (980 + 980)
data_balanced_over <- ovun.sample(cls ~ ., data = hacide.train, method = "over", N = 1960)$data
table(data_balanced_over$cls)
##
## 0 1
## 980 980
# under-sampling: keep only N = 40 cases (20 + 20), discarding majority-class information
data_balanced_under <- ovun.sample(cls ~ ., data = hacide.train, method = "under", N = 40, seed = 1)$data
table(data_balanced_under$cls)
##
## 0 1
## 20 20
# both: minority over-sampled and majority under-sampled to N = 1000
data_balanced_both <- ovun.sample(cls ~ ., data = hacide.train, method = "both", p = 0.5, N = 1000, seed = 1)$data
table(data_balanced_both$cls)
##
## 0 1
## 520 480
# ROSE generates a synthetic, roughly balanced sample
data.rose <- ROSE(cls ~ ., data = hacide.train, seed = 1)$data
table(data.rose$cls)
##
## 0 1
## 520 480
# build decision tree models
tree.rose <- rpart(cls ~ ., data = data.rose)
tree.over <- rpart(cls ~ ., data = data_balanced_over)
tree.under <- rpart(cls ~ ., data = data_balanced_under)
tree.both <- rpart(cls ~ ., data = data_balanced_both)
# make predictions on unseen data
pred.tree.rose <- predict(tree.rose, newdata = hacide.test)
pred.tree.over <- predict(tree.over, newdata = hacide.test)
pred.tree.under <- predict(tree.under, newdata = hacide.test)
pred.tree.both <- predict(tree.both, newdata = hacide.test)
# AUC of each model on the test set (column 2 holds the predicted
# probability of class "1")
roc.curve(hacide.test$cls, pred.tree.rose[,2], plotit = FALSE)
## Area under the curve (AUC): 0.993
roc.curve(hacide.test$cls, pred.tree.over[,2], plotit = FALSE)
## Area under the curve (AUC): 0.798
roc.curve(hacide.test$cls, pred.tree.under[,2], plotit = FALSE)
## Area under the curve (AUC): 0.924
roc.curve(hacide.test$cls, pred.tree.both[,2], plotit = FALSE)
## Area under the curve (AUC): 0.798
ROSE.holdout <- ROSE.eval(cls ~ ., data = hacide.train, learner = rpart,
                          method.assess = "holdout",
                          extr.pred = function(obj) obj[, 2], seed = 1)
ROSE.holdout
##
## Call:
## ROSE.eval(formula = cls ~ ., data = hacide.train, learner = rpart,
## extr.pred = function(obj) obj[, 2], method.assess = "holdout",
## seed = 1)
##
## Holdout estimate of auc: 0.980
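# Besides "holdout", ROSE.eval also accepts bootstrap assessment. A minimal
# sketch, assuming B = 50 resamples is enough for illustration:
ROSE.boot <- ROSE.eval(cls ~ ., data = hacide.train, learner = rpart,
                       method.assess = "BOOT", B = 50,
                       extr.pred = function(obj) obj[, 2], seed = 1)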
library(imbalance)
data(newthyroid1)   # thyroid dataset shipped with imbalance: 180 negatives, 35 positives
head(newthyroid1)
## T3resin Thyroxin Triiodothyronine Thyroidstimulating TSH_value Class
## 1 105 7.3 1.5 1.5 -0.1 negative
## 2 67 23.3 7.4 1.8 -0.6 positive
## 3 111 8.4 1.5 0.8 1.2 negative
## 4 89 14.3 4.1 0.5 0.2 positive
## 5 105 9.5 1.8 1.6 3.6 negative
## 6 110 20.3 3.7 0.6 0.2 positive
summary(newthyroid1)
## T3resin Thyroxin Triiodothyronine Thyroidstimulating
## Min. : 65.0 Min. : 0.500 Min. : 0.20 Min. : 0.10
## 1st Qu.:103.0 1st Qu.: 7.100 1st Qu.: 1.35 1st Qu.: 1.00
## Median :110.0 Median : 9.200 Median : 1.70 Median : 1.30
## Mean :109.6 Mean : 9.805 Mean : 2.05 Mean : 2.88
## 3rd Qu.:117.5 3rd Qu.:11.300 3rd Qu.: 2.20 3rd Qu.: 1.70
## Max. :144.0 Max. :25.300 Max. :10.00 Max. :56.40
## TSH_value Class
## Min. :-0.700 negative:180
## 1st Qu.: 0.550 positive: 35
## Median : 2.000
## Mean : 4.199
## 3rd Qu.: 4.100
## Max. :56.300
numPositive <- length(which(newthyroid1$Class == "positive"))
numNegative <- length(which(newthyroid1$Class == "negative"))
nInstances <- numNegative - numPositive
# generate nInstances (180 - 35 = 145) synthetic positives with PDFOS so the
# bound dataset below comes out balanced
newSamples <- pdfos(dataset = newthyroid1, numInstances = nInstances,
                    classAttr = "Class")
# Bind a balanced dataset
newDataset <- rbind(newthyroid1, newSamples)
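# A minimal check, assuming the objects above: the bound dataset is now
# balanced, and plotComparison() from imbalance contrasts the real and
# synthetic points on a few attributes.
table(newDataset$Class)
plotComparison(newthyroid1, newDataset, attrs = names(newthyroid1)[1:3],
               classAttr = "Class")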
### SOURCE: https://dpmartin42.github.io/
library(dplyr) # for data manipulation
library(caret) # for model-building
library(DMwR) # for smote implementation
library(purrr) # for functional programming (map)
library(pROC) # for AUC calculations
library(PRROC) # for Precision-Recall curve calculations
set.seed(2969)
imbal_train <- twoClassSim(5000,
intercept = -25,
linearVars = 20,
noiseVars = 10)
imbal_test <- twoClassSim(5000,
intercept = -25,
linearVars = 20,
noiseVars = 10)
prop.table(table(imbal_train$Class))
##
## Class1 Class2
## 0.9796 0.0204
# Set up control function for training
ctrl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 5,
summaryFunction = twoClassSummary,
classProbs = TRUE)
# Build a standard classifier using a gradient boosted machine
set.seed(5627)
orig_fit <- train(Class ~ .,
data = imbal_train,
method = "gbm",
verbose = FALSE,
metric = "ROC",
trControl = ctrl)
# Build custom AUC function to extract AUC
# from the caret model object
test_roc <- function(model, data) {
roc(data$Class,
predict(model, data, type = "prob")[, "Class2"])
}
orig_fit %>%
test_roc(data = imbal_test) %>%
auc()
## Area under the curve: 0.9456
# Create model weights (they sum to one)
model_weights <- ifelse(imbal_train$Class == "Class1",
(1/table(imbal_train$Class)[1]) * 0.5,
(1/table(imbal_train$Class)[2]) * 0.5)
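# A quick sanity check: each class contributes half of the total weight,
# so the weights sum to one as the comment above claims.
sum(model_weights)
## [1] 1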
# Use the same seed to ensure same cross-validation splits
ctrl$seeds <- orig_fit$control$seeds
# Build weighted model
weighted_fit <- train(Class ~ .,
data = imbal_train,
method = "gbm",
verbose = FALSE,
weights = model_weights,
metric = "ROC",
trControl = ctrl)
# Build down-sampled model
ctrl$sampling <- "down"
down_fit <- train(Class ~ .,
data = imbal_train,
method = "gbm",
verbose = FALSE,
metric = "ROC",
trControl = ctrl)
# Build up-sampled model
ctrl$sampling <- "up"
up_fit <- train(Class ~ .,
data = imbal_train,
method = "gbm",
verbose = FALSE,
metric = "ROC",
trControl = ctrl)
# Build smote model
ctrl$sampling <- "smote"
smote_fit <- train(Class ~ .,
data = imbal_train,
method = "gbm",
verbose = FALSE,
metric = "ROC",
trControl = ctrl)
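# Note: with ctrl$sampling set, caret applies the rebalancing inside each
# cross-validation fold rather than once up front, so the held-out fold
# never leaks into the rebalancing step.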
# Examine results for test set
model_list <- list(original = orig_fit,
weighted = weighted_fit,
down = down_fit,
up = up_fit,
SMOTE = smote_fit)
model_list_roc <- model_list %>%
map(test_roc, data = imbal_test)
model_list_roc %>%
map(auc)
## $original
## Area under the curve: 0.9456
##
## $weighted
## Area under the curve: 0.9793
##
## $down
## Area under the curve: 0.9675
##
## $up
## Area under the curve: 0.9784
##
## $SMOTE
## Area under the curve: 0.9768
results_list_roc <- list(NA)
num_mod <- 1
for(the_roc in model_list_roc){
  results_list_roc[[num_mod]] <-
    tibble(tpr = the_roc$sensitivities,
           fpr = 1 - the_roc$specificities,
           model = names(model_list)[num_mod])
  num_mod <- num_mod + 1
}
results_df_roc <- bind_rows(results_list_roc)
# Plot ROC curve for all 5 models
custom_col <- c("#000000", "#009E73", "#0072B2", "#D55E00", "#CC79A7")
ggplot(aes(x = fpr, y = tpr, group = model), data = results_df_roc) +
geom_line(aes(color = model), size = 1) +
scale_color_manual(values = custom_col) +
geom_abline(intercept = 0, slope = 1, color = "gray", size = 1) +
theme_bw(base_size = 18)
calc_auprc <- function(model, data){
  index_class2 <- data$Class == "Class2"
  index_class1 <- data$Class == "Class1"
  predictions <- predict(model, data, type = "prob")
  # pr.curve() takes positive-class scores first (scores.class0),
  # then negative-class scores (scores.class1)
  pr.curve(predictions$Class2[index_class2],
           predictions$Class2[index_class1],
           curve = TRUE)
}
# Get results for all 5 models
model_list_pr <- model_list %>%
map(calc_auprc, data = imbal_test)
model_list_pr %>%
map(function(the_mod) the_mod$auc.integral)
## $original
## [1] 0.5187951
##
## $weighted
## [1] 0.640687
##
## $down
## [1] 0.53057
##
## $up
## [1] 0.6246232
##
## $SMOTE
## [1] 0.6341753
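# For reference, the no-skill AUPRC baseline equals the prevalence of the
# positive class in the test set (the gray horizontal line in the plot below):
sum(imbal_test$Class == "Class2") / nrow(imbal_test)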
# Plot the AUPRC curve for all 5 models
results_list_pr <- list(NA)
num_mod <- 1
for(the_pr in model_list_pr){
  results_list_pr[[num_mod]] <-
    tibble(recall = the_pr$curve[, 1],
           precision = the_pr$curve[, 2],
           model = names(model_list_pr)[num_mod])
  num_mod <- num_mod + 1
}
results_df_pr <- bind_rows(results_list_pr)
custom_col <- c("#000000", "#009E73", "#0072B2", "#D55E00", "#CC79A7")
ggplot(aes(x = recall, y = precision, group = model),
data = results_df_pr) +
geom_line(aes(color = model), size = 1) +
scale_color_manual(values = custom_col) +
geom_abline(intercept =
sum(imbal_test$Class == "Class2")/nrow(imbal_test),
slope = 0, color = "gray", size = 1) +
theme_bw()
auprcSummary <- function(data, lev = NULL, model = NULL){
index_class2 <- data$obs == "Class2"
index_class1 <- data$obs == "Class1"
the_curve <- pr.curve(data$Class2[index_class2],
data$Class2[index_class1],
curve = FALSE)
out <- the_curve$auc.integral
names(out) <- "AUPRC"
out
}
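# A minimal sketch, using toy data, of what caret passes to a summaryFunction:
# a data frame of observed classes plus per-class probability columns, one
# row per held-out observation.
toy <- data.frame(obs = factor(c("Class1", "Class1", "Class2", "Class2")),
                  Class2 = c(0.1, 0.4, 0.35, 0.8))
auprcSummary(toy)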
# Re-initialize control function to remove smote and
# include our new summary function
ctrl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 5,
summaryFunction = auprcSummary,
classProbs = TRUE,
seeds = orig_fit$control$seeds)
orig_pr <- train(Class ~ .,
data = imbal_train,
method = "gbm",
verbose = FALSE,
metric = "AUPRC",
trControl = ctrl)
# Get results for auprc on the test set
orig_fit_test <- orig_fit %>%
calc_auprc(data = imbal_test) %>%
(function(the_mod) the_mod$auc.integral)
orig_pr_test <- orig_pr %>%
calc_auprc(data = imbal_test) %>%
(function(the_mod) the_mod$auc.integral)
# The test errors are the same
identical(orig_fit_test,
orig_pr_test)
## [1] TRUE
# Because both chose the same
# hyperparameter combination
identical(orig_fit$bestTune,
orig_pr$bestTune)
## [1] TRUE