library(mlbench)
library(magrittr)
library(dplyr)
library(tidyr)
library(ggplot2)
data(BreastCancer)
str(BreastCancer)
## 'data.frame': 699 obs. of 11 variables:
## $ Id : chr "1000025" "1002945" "1015425" "1016277" ...
## $ Cl.thickness : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ...
## $ Cell.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ...
## $ Cell.shape : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ...
## $ Marg.adhesion : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ...
## $ Epith.c.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.nuclei : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ...
## $ Bl.cromatin : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ...
## $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ...
## $ Mitoses : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
head(BreastCancer)
#Replace Id with a simple row index to guarantee a unique key
BreastCancer$Id <- c(1:nrow(BreastCancer))
#Transform the data: drop rows with missing values, keep Id and Class as
#character, and convert the remaining factor columns to numeric
BreastCancer <- BreastCancer %>%
  na.omit() %>%
  mutate(Id = as.character(Id),
         Class = as.character(Class)) %>%
  mutate_if(is.factor, as.numeric)
#Scaling function for faster convergence: min-max scaling maps each value onto [0, 1]
f_scale <- function(x) {(x - min(x)) / (max(x) - min(x))}
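For example, on a small vector the minimum maps to 0 and the maximum to 1:
f_scale(c(1, 5, 10))
## [1] 0.0000000 0.4444444 1.0000000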
#Apply min-max scaling to every numeric column
BreastCancer %<>% mutate_if(is.numeric, f_scale)
#Split into train and test sets, stratified by Class
test <- BreastCancer %>%
  group_by(Class) %>%
  sample_frac(size = 0.2) %>%
  ungroup()
train <- dplyr::setdiff(BreastCancer, test)
#Remove the Id column before modelling
train <- select(train,-Id)
test <- select(test,-Id)
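As a quick sanity check (optional), the two sets should partition the data and keep the class proportions roughly equal:
nrow(train) + nrow(test) == nrow(BreastCancer)
prop.table(table(train$Class))
prop.table(table(test$Class))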
#Register parallel processing with 7 cores
library(doParallel)
registerDoParallel(7)
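Hard-coding 7 workers assumes the machine has at least that many cores; a more portable sketch registers all but one of the detected cores:
registerDoParallel(max(1, parallel::detectCores() - 1))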
library(caret)
#Set up trainControl: 5-fold cross-validation, repeated 5 times
train.control <- trainControl(method = 'repeatedcv',
                              number = 5,
                              repeats = 5,
                              classProbs = TRUE,
                              allowParallel = TRUE,
                              summaryFunction = multiClassSummary)
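multiClassSummary returns the wide set of metrics compared below. For a two-class problem like this one, caret's twoClassSummary (ROC, sensitivity, specificity) is a common alternative; a sketch of that variant, under the hypothetical name train.control.roc:
train.control.roc <- trainControl(method = 'repeatedcv',
                                  number = 5,
                                  repeats = 5,
                                  classProbs = TRUE,
                                  allowParallel = TRUE,
                                  summaryFunction = twoClassSummary)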
#Train the model and evaluate it on the test set across thresholds from 0.2 to 0.8
running_thres <- function(train, test){
  #Train a random forest with the repeated-CV control defined above
  my_rf <- train(Class ~ .,
                 data = train,
                 method = 'rf',
                 trControl = train.control)
  #Score every threshold and stack the one-row summaries
  sapply(seq(0.2, 0.8, by = 0.05), eval_thres, data = test, train = my_rf) %>%
    t() %>%
    data.frame()
}
#Evaluate a single threshold on the test data set
eval_thres <- function(thres, data, train){
  #Predict class probabilities on the test set and apply the threshold
  pred <- predict(train, data, type = 'prob') %>% pull(malignant)
  pred <- case_when(pred >= thres ~ 'malignant',
                    pred < thres ~ 'benign')
  #Summarise results; confusionMatrix expects predictions first, reference second,
  #and fixing the levels guards against a threshold that yields only one class
  lv <- c('benign', 'malignant')
  cm <- confusionMatrix(data = factor(pred, levels = lv),
                        reference = factor(data$Class, levels = lv))
  #Flatten the 2x2 table into one row; columns are Prediction-Reference pairs
  cross_tbl <- cm$table %>%
    as.vector() %>%
    matrix(ncol = 4) %>%
    data.frame()
  names(cross_tbl) <- c('Be-Be', 'Ma-Be', 'Be-Ma', 'Ma-Ma')
  #'ten' holds the metric names, 'kq' their values as a one-row data frame
  ten <- c(Threshold = thres, cm$byClass, cm$overall) %>% names()
  kq <- c(Threshold = thres,
          cm$byClass,
          cm$overall) %>%
    matrix(ncol = 19) %>%
    data.frame()
  names(kq) <- ten
  kq <- bind_cols(kq, cross_tbl)
  kq
}
#running_thres(fold_train, fold_test)
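A single threshold can also be inspected on its own; a sketch, assuming a model fitted the same way as inside running_thres:
#my_rf <- train(Class ~ ., data = train, method = 'rf', trControl = train.control)
#eval_thres(0.5, data = test, train = my_rf)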
#Cross-validate over k folds, sampled without replacement so folds do not overlap
CV_kfold <- function(k){
  #'remain' holds the data left after each fold_test is withdrawn, ensuring
  #that the folds cannot overlap one another
  remain <- BreastCancer
  df_kfold <- data.frame()
  for (i in k:1){
    #Take 1/i of the remaining data, which equals 1/k of the full data
    fold_test <- remain %>%
      group_by(Class) %>%
      sample_frac(size = 1/i) %>%
      ungroup()
    fold_train <- setdiff(BreastCancer, fold_test)
    remain <- setdiff(remain, fold_test)
    fold_train <- select(fold_train, -Id)
    fold_test <- select(fold_test, -Id)
    df_kfold <- running_thres(fold_train, fold_test) %>%
      mutate(FoldTime = i) %>%
      bind_rows(df_kfold)
  }
  #Coerce every column back to numeric (the sapply/t step leaves list columns)
  lapply(df_kfold, as.numeric) %>% as.data.frame()
}
set.seed(1)
my_result <- CV_kfold(10)
my_result
#Visualise each metric against the threshold
theme_set(theme_minimal())
library(hrbrthemes)
my_result %>%
  group_by(Threshold) %>%
  summarise_at(vars(Accuracy:Kappa, Sensitivity:Neg.Pred.Value), mean) %>%
  gather(Metric, Accuracy, -Threshold) %>%
  ggplot(aes(x = Threshold, y = Accuracy, color = Metric)) +
  geom_line() +
  geom_point(size = 3) +
  scale_y_percent() +
  scale_x_continuous(breaks = seq(0.05, 0.95, by = 0.05)) +
  scale_color_manual(values = palette()[1:6]) +
  labs(y = "Accuracy Rate",
       title = "Variation of Random Forest Classifier's Metrics by Threshold",
       subtitle = "Data Used: Breast Cancer Wisconsin Data from\nhttps://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)")
We can see that at threshold 0.25 the Negative Predictive Value is highest, while at threshold 0.4 the Accuracy and Kappa are highest. We should confirm this finding on the whole dataset.
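A quick numeric summary of the same result table (a sketch) backs up the visual reading:
avg <- my_result %>%
  group_by(Threshold) %>%
  summarise(Accuracy = mean(Accuracy),
            Neg.Pred.Value = mean(Neg.Pred.Value))
avg$Threshold[which.max(avg$Accuracy)]       #expected: 0.4
avg$Threshold[which.max(avg$Neg.Pred.Value)] #expected: 0.25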
#Same cross-validation loop, but each fold's model is evaluated on the whole dataset
CV_kfold_test_fulldata <- function(k){
  #'remain' holds the data left after each fold_test is withdrawn, ensuring
  #that the folds cannot overlap one another
  remain <- BreastCancer
  df_kfold <- data.frame()
  for (i in k:1){
    fold_test <- remain %>%
      group_by(Class) %>%
      sample_frac(size = 1/i) %>%
      ungroup()
    fold_train <- setdiff(BreastCancer, fold_test)
    remain <- setdiff(remain, fold_test)
    fold_train <- select(fold_train, -Id)
    fold_test <- select(fold_test, -Id)
    #Note: the model is tested on the full dataset here, not on fold_test
    df_kfold <- running_thres(fold_train, select(BreastCancer, -Id)) %>%
      mutate(FoldTime = i) %>%
      bind_rows(df_kfold)
  }
  lapply(df_kfold, as.numeric) %>% as.data.frame()
}
set.seed(2)
my_result_test_fulldata <- CV_kfold_test_fulldata(10)
my_result_test_fulldata
my_result_test_fulldata %>%
  group_by(Threshold) %>%
  summarise_at(vars(Accuracy:Kappa, Sensitivity:Neg.Pred.Value), mean) %>%
  gather(Metric, Accuracy, -Threshold) %>%
  ggplot(aes(x = Threshold, y = Accuracy, color = Metric)) +
  geom_line() +
  geom_point(size = 3) +
  scale_y_percent() +
  scale_x_continuous(breaks = seq(0.05, 0.95, by = 0.05)) +
  scale_color_manual(values = palette()[1:6]) +
  labs(y = "Accuracy Rate",
       title = "Variation of Random Forest Classifier's Metrics by Threshold",
       subtitle = "Data Used: Breast Cancer Wisconsin Data from\nhttps://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)")
The results when testing on the whole dataset match those from the held-out test folds: at threshold 0.25 the Negative Predictive Value is highest, and at threshold 0.4 the Accuracy and Kappa are highest. This confirms that the best threshold when we focus on predicting the negative event is 0.25, while the best threshold for the highest overall Accuracy (across both negative and positive events) is 0.4.
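To put the tuned cutoff to work, a final sketch (new_data is a hypothetical set of incoming cases, preprocessed and scaled exactly like the training data):
final_rf <- train(Class ~ .,
                  data = select(BreastCancer, -Id),
                  method = 'rf',
                  trControl = train.control)
#new_data is hypothetical and must be transformed with the same f_scale pipeline
prob_malignant <- predict(final_rf, new_data, type = 'prob')$malignant
pred_class <- ifelse(prob_malignant >= 0.4, 'malignant', 'benign')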