1 Data preparation

library(mlbench)
library(magrittr)
library(dplyr)
library(tidyr)
library(ggplot2)

data(BreastCancer)

str(BreastCancer)
## 'data.frame':    699 obs. of  11 variables:
##  $ Id             : chr  "1000025" "1002945" "1015425" "1016277" ...
##  $ Cl.thickness   : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ...
##  $ Cell.size      : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ...
##  $ Cell.shape     : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ...
##  $ Marg.adhesion  : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ...
##  $ Epith.c.size   : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ...
##  $ Bare.nuclei    : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ...
##  $ Bl.cromatin    : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ...
##  $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ...
##  $ Mitoses        : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ...
##  $ Class          : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
head(BreastCancer)
#Create a new unique key (the raw Id column contains duplicate sample ids)
BreastCancer$Id <- c(1:nrow(BreastCancer))
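
A quick optional check that the replacement key is now duplicate-free:

#Verify the new key is unique
any(duplicated(BreastCancer$Id))  #expect FALSE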

#Transform the data: drop rows with missing values and convert factors to numeric
BreastCancer <- BreastCancer %>% 
  na.omit() %>% 
  mutate(Id = as.character(Id),
    Class = as.character(Class)) %>%
  mutate_if(is.factor, as.numeric)


#Min-max scale function, intended to help the Random Forest converge faster
f_scale <- function(x){(x-min(x))/(max(x)-min(x))}

#Apply min-max scaling to every numeric column
BreastCancer %<>% mutate_if(is.numeric,f_scale)
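
To see what f_scale does, a small worked example, plus an optional check that every numeric column now lies in [0, 1]:

#f_scale maps the column minimum to 0 and the maximum to 1
f_scale(c(1, 5, 10))  #0.0000 0.4444 1.0000

#All numeric columns should now span [0, 1]
BreastCancer %>% summarise_if(is.numeric, list(min = min, max = max))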


#Split into train (80%) and test (20%), stratified by Class
test <- BreastCancer %>% 
  group_by(Class) %>% 
  sample_frac(size=0.2) %>% 
  ungroup()

train <- dplyr::setdiff(BreastCancer,test)

#Drop the Id key so it is not used as a predictor
train <- select(train,-Id)
test <- select(test,-Id)
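
Since setdiff removes the test rows from BreastCancer, an optional check that the split is a clean partition with the class balance preserved:

#Row counts should add up and class proportions should match
nrow(train) + nrow(test)  #should equal nrow(BreastCancer)
prop.table(table(train$Class))
prop.table(table(test$Class))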

2 Training the model and cross-validating on the test dataset

#Register a parallel backend with 7 cores
library(doParallel)
registerDoParallel(7)
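
The worker count is hard-coded above; a more portable alternative (optional) is to leave one core free:

#Portable alternative: use all but one available core
#registerDoParallel(parallel::detectCores() - 1)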

library(caret)
#Set up trainControl: 5-fold cross-validation, repeated 5 times
train.control <- trainControl(method = 'repeatedcv',
                              number = 5,
                              repeats = 5,
                              classProbs = TRUE,
                              allowParallel = TRUE,
                              summaryFunction = multiClassSummary)


#Train the model and evaluate it on test across thresholds from 0.2 to 0.8
running_thres <- function(train, test){
  #Train model  
  my_rf <- train(Class ~ .,
                 data = train,
                 method = 'rf',
                 trControl = train.control)
  
  sapply(seq(0.2,0.8,by = 0.05), eval_thres, data = test, train = my_rf) %>% t() %>% data.frame()
}

#Evaluate a single threshold on the test dataset
eval_thres <- function(thres, data, train){
  #Predict class probabilities, then cut at the threshold
  pred <- predict(train, data, type = 'prob') %>% pull(malignant)
  pred <- case_when(pred >= thres ~ 'malignant',
                    pred < thres ~ 'benign')
  
  #Summarise results (confusionMatrix takes predictions first, reference second;
  #fixing the levels guards against folds where one class is never predicted)
  cm <- confusionMatrix(factor(pred, levels = c('benign', 'malignant')),
                        factor(data$Class, levels = c('benign', 'malignant')))
  
  cross_tbl <- cm$table %>% 
    as.vector() %>% 
    matrix(ncol = 4) %>% 
    data.frame() 
  
  #Cell names read Prediction-Reference, e.g. 'Ma-Be' = predicted malignant, actually benign
  names(cross_tbl) <- c('Be-Be','Ma-Be','Be-Ma','Ma-Ma')
  
  #Collect the metric names (ten), then build a one-row data frame of metrics (kq)
  ten <- c(Threshold = thres, cm$byClass, cm$overall) %>% names()
  
  kq <- c(Threshold = thres, 
          cm$byClass,
          cm$overall) %>% 
    matrix(nrow = 1) %>% 
    data.frame()
    
  names(kq) <- ten
  
  kq <- bind_cols(kq, cross_tbl)
  kq
}

#Example single run (optional): running_thres(train, test)

#K-fold cross-validation with non-overlapping folds (sampling without replacement)
CV_kfold  <- function(k){
  #remain keeps the rows not yet drawn as fold_test, so the folds never overlap
  remain <- BreastCancer
  df_kfold <- data.frame()
  for (i in k:1){
    #Draw 1/i of the remaining rows: this makes the k folds equal-sized and
    #ensures they partition the whole dataset
    fold_test <- remain %>% 
                 group_by(Class) %>% 
                 sample_frac(size = 1/i) %>% 
                 ungroup()
    
    fold_train <- setdiff(BreastCancer,fold_test)
    remain <- setdiff(remain, fold_test)
    
    fold_train <- select(fold_train, -Id)
    fold_test <- select(fold_test, -Id)
    
    df_kfold <- running_thres(fold_train, fold_test) %>% mutate(FoldTime = i) %>% 
      bind_rows(df_kfold)
  }
  #sapply returned list columns; flatten everything back to numeric
  lapply(df_kfold, as.numeric) %>% as.data.frame()
}
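
A small sketch (hypothetical helper, not part of the pipeline) to confirm that drawing 1/i of the remaining rows yields equal-sized, non-overlapping folds that cover every row:

#Hypothetical check of the fold scheme: report fold sizes and leftover rows
check_folds <- function(k){
  remain <- BreastCancer
  sizes <- integer(0)
  for (i in k:1){
    fold <- remain %>% group_by(Class) %>% sample_frac(size = 1/i) %>% ungroup()
    remain <- setdiff(remain, fold)
    sizes <- c(sizes, nrow(fold))
  }
  list(fold_sizes = sizes, rows_left_over = nrow(remain))
}
check_folds(10)  #fold sizes ~ nrow(BreastCancer)/10 each, 0 rows left over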

set.seed(1)
my_result <- CV_kfold(10)
my_result
#Visualize how each metric varies with the threshold
theme_set(theme_minimal()) 
library(hrbrthemes)
my_result %>% group_by(Threshold) %>% 
  summarise_at(vars(Accuracy:Kappa, Sensitivity:Neg.Pred.Value), mean) %>% 
  gather(Metric, Value, -Threshold) %>% 
  ggplot(aes(x = Threshold, y = Value, color = Metric)) +
  geom_line() +
  geom_point(size = 3) +
  scale_y_percent() + 
  scale_x_continuous(breaks = seq(0.05, 0.95, by = 0.05)) + 
  scale_color_manual(values = palette()[1:6]) +
  labs(y = "Metric Value", 
       title = "Variation of Random Forest Classifier's Metrics by Threshold", 
       subtitle = "Data Used: Breast Cancer Wisconsin Data from\nhttps://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)")

At a threshold of 0.25 the Negative Predictive Value peaks, while Accuracy and Kappa peak around 0.4. We should confirm this finding on the whole dataset.
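
Rather than reading the peaks off the chart, they can be pulled out of my_result directly (a small optional sketch; column names follow data.frame's make.names conventions):

#Average each metric across folds, then locate the maximizing threshold
my_result %>% 
  group_by(Threshold) %>% 
  summarise_at(vars(Accuracy, Kappa, Neg.Pred.Value), mean) %>% 
  summarise(best_npv = Threshold[which.max(Neg.Pred.Value)],
            best_accuracy = Threshold[which.max(Accuracy)],
            best_kappa = Threshold[which.max(Kappa)])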

3 Cross-validation on the whole dataset

CV_kfold_test_fulldata  <- function(k){
  #remain keeps the rows not yet drawn as fold_test, so the folds never overlap
  remain <- BreastCancer
  df_kfold <- data.frame()
  for (i in k:1){
    #Draw 1/i of the remaining rows: this makes the k folds equal-sized and
    #ensures they partition the whole dataset
    fold_test <- remain %>% 
                 group_by(Class) %>% 
                 sample_frac(size = 1/i) %>% 
                 ungroup()
    
    fold_train <- setdiff(BreastCancer,fold_test)
    remain <- setdiff(remain, fold_test)
    
    fold_train <- select(fold_train, -Id)
    fold_test <- select(fold_test, -Id)
    
    #Evaluate each fold's model on the entire dataset rather than on the held-out fold
    df_kfold <- running_thres(fold_train, select(BreastCancer, -Id)) %>% mutate(FoldTime = i) %>% 
      bind_rows(df_kfold)
  }
  lapply(df_kfold, as.numeric) %>% as.data.frame()
}

set.seed(2)
my_result_test_fulldata <- CV_kfold_test_fulldata(10)
my_result_test_fulldata
my_result_test_fulldata %>% group_by(Threshold) %>% 
  summarise_at(vars(Accuracy:Kappa, Sensitivity:Neg.Pred.Value), mean) %>% 
  gather(Metric, Value, -Threshold) %>% 
  ggplot(aes(x = Threshold, y = Value, color = Metric)) +
  geom_line() +
  geom_point(size = 3) +
  scale_y_percent() + 
  scale_x_continuous(breaks = seq(0.05, 0.95, by = 0.05)) + 
  scale_color_manual(values = palette()[1:6]) +
  labs(y = "Metric Value", 
       title = "Variation of Random Forest Classifier's Metrics by Threshold", 
       subtitle = "Data Used: Breast Cancer Wisconsin Data from\nhttps://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)")

Testing on the whole dataset gives the same picture as testing on the held-out folds alone: Negative Predictive Value peaks at a threshold of 0.25, while Accuracy and Kappa peak at 0.4. This confirms that 0.25 is the best threshold when we focus on predicting the negative event, and 0.4 is the best threshold when we want the highest overall Accuracy (across both negative and positive events).
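
As a closing illustration (a hypothetical wrapper, not part of the original pipeline), the chosen cut-off can be baked into a small prediction helper:

#Hypothetical helper: classify with a tuned probability threshold
predict_with_threshold <- function(model, newdata, thres = 0.4){
  p <- predict(model, newdata, type = 'prob') %>% pull(malignant)
  factor(if_else(p >= thres, 'malignant', 'benign'),
         levels = c('benign', 'malignant'))
}
#Usage (assuming a caret rf model `fit`): predict_with_threshold(fit, test, thres = 0.25)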