1 First training model

1.1 Data preparation

library(mlbench)
library(magrittr)
library(dplyr)
library(tidyr)
library(ggplot2)

data(BreastCancer)

str(BreastCancer)
## 'data.frame':    699 obs. of  11 variables:
##  $ Id             : chr  "1000025" "1002945" "1015425" "1016277" ...
##  $ Cl.thickness   : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ...
##  $ Cell.size      : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ...
##  $ Cell.shape     : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ...
##  $ Marg.adhesion  : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ...
##  $ Epith.c.size   : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ...
##  $ Bare.nuclei    : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ...
##  $ Bl.cromatin    : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ...
##  $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ...
##  $ Mitoses        : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ...
##  $ Class          : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
head(BreastCancer)
#Check whether Id is a unique key
BreastCancer %>% group_by(Id) %>% 
  summarise(n = n()) %>% 
  filter(n >= 2)
#Id is not unique, so create a new key
BreastCancer$Id <- c(1:nrow(BreastCancer))

#Transform the factor columns to numeric (keep Id and Class as character)
BreastCancer <- BreastCancer %>% 
  mutate(Id = as.character(Id),
    Class = as.character(Class)) %>%
  mutate_if(is.factor, as.numeric)

#Check for NA values
BreastCancer %>% 
  summarise_all(~ sum(is.na(.)))
#Bare.nuclei has 16 NA values

#Remove NA
BreastCancer <- BreastCancer %>% na.omit()

#Min-max scaling function to put all features on a common 0-1 scale
f_scale <- function(x){(x-min(x))/(max(x)-min(x))}

#Apply min-max scaling to all numeric columns
BreastCancer %<>% mutate_if(is.numeric,f_scale)
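
As a quick optional check (not part of the original workflow), we can confirm that the scaled predictors now lie between 0 and 1:

#Optional: verify that all scaled predictors lie in [0, 1]
BreastCancer %>% 
  select(-Id, -Class) %>% 
  summary()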


#Split the data into train and test sets
library(caret)
set.seed(1)

id <- createDataPartition(y = BreastCancer$Class,
                          p = 500/nrow(BreastCancer),
                          list = FALSE)

train <- BreastCancer[id,]
test <- BreastCancer[-id,]

#Check class balance in train and test
train %>% group_by(Class) %>% 
  summarise(n=n(), perc = n/nrow(.)) 
test %>% group_by(Class) %>% 
  summarise(n=n(), perc = n/nrow(.)) 
#We can see that splitting with createDataPartition does not give exactly equal class proportions between train and test

#Split again with stratified sampling
#Recreate the index column
BreastCancer$Id <- c(1:nrow(BreastCancer))
train <- BreastCancer %>% 
  group_by(Class) %>% 
  sample_frac(size=500/nrow(.)) %>% 
  ungroup()

test <- dplyr::setdiff(BreastCancer,train)

#Check class balance in train and test again
train %>% group_by(Class) %>% 
  summarise(n=n(), perc = n/nrow(.)) 
test %>% group_by(Class) %>% 
  summarise(n=n(), perc = n/nrow(.))
#Remove index
train <- select(train,-Id)
test <- select(test,-Id)

1.2 Training model

#Set up trainControl with 5-fold cross-validation repeated 5 times
train.control <- trainControl(method = 'repeatedcv',
                              number = 5,
                              repeats = 5,
                              classProbs = TRUE,
                              allowParallel = TRUE,
                              summaryFunction = multiClassSummary)


#Register parallel
library(doParallel)

#Check the number of available cores with detectCores(); use at most detectCores() - 1
registerDoParallel(7)

#Training model
set.seed(2)
my_rf <- train(Class ~ .,
               data = train,
               method = 'rf',
               trControl = train.control)
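
As an optional sanity check (not part of the original workflow), we can inspect the cross-validated results and variable importance of the fitted model; print(), bestTune and varImp() are standard caret accessors.

#Optional: inspect the fitted model
print(my_rf)          #cross-validated metrics for each mtry value tried
my_rf$bestTune        #mtry value selected by caret
varImp(my_rf)         #variable importance from the final Random Forest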

1.3 Tuning threshold

#Create a function that predicts at a given threshold on 100 random resamples of the test set; each resample has n observations
eval_thres <- function(thres, n){
  all_df <- data.frame()
  for(i in 1:100){
    test_df <- test[sample(nrow(test), n), ]
    pred <- predict(my_rf, test_df, type = 'prob') %>% pull(malignant)
    pred <- case_when(pred >= thres ~ 'malignant',
                      pred < thres ~ 'benign')
    
    #Use explicit factor levels so confusionMatrix works even when only one class is predicted
    cm <- confusionMatrix(factor(test_df$Class, levels = c('benign', 'malignant')),
                          factor(pred, levels = c('benign', 'malignant')))
    
    cross_tbl <- cm$table %>% 
      as.vector() %>% 
      matrix(ncol = 4) %>% 
      data.frame() 
    
    names(cross_tbl) <- c('Be-Be','Ma-Be','Be-Ma','Ma-Ma')
    
    #Metric names: threshold plus the per-class and overall statistics
    ten <- c(Threshold = thres, cm$byClass, cm$overall) %>% names()
    
    #One-row data frame holding the metrics for this resample
    kq <- c(Threshold = thres, 
            cm$byClass,
            cm$overall) %>% 
      matrix(ncol = 19) %>% 
      data.frame()
      
    names(kq) <- ten
    
    kq <- bind_cols(kq, cross_tbl)
    
    all_df <- bind_rows(all_df,kq)
  }
  all_df
}

eval_thres_100 <- function(thres){eval_thres(thres,100)} 

compare_thres_df <- lapply(seq(0.05, 0.95, by = 0.05),eval_thres_100)
compare_thres_df <- do.call('rbind',compare_thres_df)

#Visualize how the metrics vary with the threshold
theme_set(theme_minimal()) 
library(hrbrthemes)

compare_thres_df %>% group_by(Threshold) %>% 
  summarise_at(vars(Accuracy:Kappa, Sensitivity:`Neg Pred Value`), mean) %>% 
  gather(Metric, Accuracy,-Threshold) %>% 
  ggplot(aes(x = Threshold,y = Accuracy,color = Metric)) +
  geom_line() +
  geom_point(size = 3) +
  scale_y_percent() + 
  scale_x_continuous(breaks = seq(0.05, 0.95, by = 0.05)) + 
  labs(y = "Accuracy Rate", 
       title = "Variation of Random Forest Classifier's Metrics by Threshold", 
       subtitle = "Data Used: Breast Cancer Wisconsin Data from\nhttps://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)")

The plot shows that a threshold of 0.45 gives the highest Accuracy, Kappa, and Negative Predictive Value. However, this result may be specific to our particular train/test split. Would the best threshold change if we drew a new train and test set? To answer that question, we build the model on a new training set and evaluate it on the corresponding test set.

2 Second training model

2.1 Data preparation

#Split train and test again
set.seed(3)
train <- BreastCancer %>% 
  group_by(Class) %>% 
  sample_frac(size=500/nrow(.)) %>% 
  ungroup()

test <- dplyr::setdiff(BreastCancer,train)

#Check class balance in train and test again
train %>% group_by(Class) %>% 
  summarise(n=n(), perc = n/nrow(.)) 
test %>% group_by(Class) %>% 
  summarise(n=n(), perc = n/nrow(.))
#Remove index
train <- select(train,-Id)
test <- select(test,-Id)

2.2 Training model

#Set up trainControl with 5-fold cross-validation repeated 5 times
set.seed(4)
train.control <- trainControl(method = 'repeatedcv',
                              number = 5,
                              repeats = 5,
                              classProbs = TRUE,
                              allowParallel = TRUE,
                              summaryFunction = multiClassSummary)


#Register parallel
library(doParallel)

#Check the number of available cores with detectCores(); use at most detectCores() - 1
registerDoParallel(7)

#Training model
set.seed(5)
my_rf <- train(Class ~ .,
               data = train,
               method = 'rf',
               trControl = train.control)

2.3 Tuning threshold

#Reuse the eval_thres_100 function created in the previous step
compare_thres_df <- lapply(seq(0.05, 0.95, by = 0.05),eval_thres_100)
compare_thres_df <- do.call('rbind',compare_thres_df)

#Visualize how the metrics vary with the threshold
theme_set(theme_minimal()) 
library(hrbrthemes)

compare_thres_df %>% group_by(Threshold) %>% 
  summarise_at(vars(Accuracy:Kappa, Sensitivity:`Neg Pred Value`), mean) %>% 
  gather(Metric, Accuracy,-Threshold) %>% 
  ggplot(aes(x = Threshold,y = Accuracy,color = Metric)) +
  geom_line() +
  geom_point(size = 3) +
  scale_y_percent() + 
  scale_x_continuous(breaks = seq(0.05, 0.95, by = 0.05)) + 
  labs(y = "Accuracy Rate", 
       title = "Variation of Random Forest Classifier's Metrics by Threshold", 
       subtitle = "Data Used: Breast Cancer Wisconsin Data from\nhttps://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)")

With the second model the best threshold shifts from 0.45 to 0.30, where Accuracy, Kappa, and Negative Predictive Value peak. This confirms that the best threshold is relative: it depends on how the data happen to be split. To estimate the best threshold more reliably, we should repeat the train/test split many times and compare the average of each accuracy metric at every threshold, picking the one with the highest mean Accuracy. That, however, is computationally expensive, since retraining a Random Forest thousands of times takes a long time.
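
As an illustration only, the sketch below shows one way to implement that idea: repeat the stratified split a few times, retrain the model, and average Accuracy over a grid of thresholds. It reuses the BreastCancer data, train.control and the libraries loaded above; the function name avg_acc_by_threshold and the choice of five splits are our own, not part of the original analysis, and even a handful of repeats takes a while with repeated cross-validation.

#Sketch (illustrative only): average Accuracy per threshold across several random splits
avg_acc_by_threshold <- function(n_splits = 5, thresholds = seq(0.05, 0.95, by = 0.05)){
  out <- data.frame()
  for(s in 1:n_splits){
    set.seed(100 + s)
    #Stratified split, same recipe as above
    tr <- BreastCancer %>% group_by(Class) %>% sample_frac(size = 500/nrow(.)) %>% ungroup()
    te <- dplyr::setdiff(BreastCancer, tr)
    tr <- select(tr, -Id)
    te <- select(te, -Id)
    #Retrain the Random Forest on this split
    rf_s <- train(Class ~ ., data = tr, method = 'rf', trControl = train.control)
    #Probability of 'malignant' on the whole test set for this split
    p <- predict(rf_s, te, type = 'prob') %>% pull(malignant)
    for(thres in thresholds){
      pred <- factor(ifelse(p >= thres, 'malignant', 'benign'),
                     levels = c('benign', 'malignant'))
      out <- bind_rows(out,
                       data.frame(Split = s, Threshold = thres,
                                  Accuracy = mean(pred == te$Class)))
    }
  }
  #Average Accuracy per threshold across all splits
  out %>% group_by(Threshold) %>% summarise(mean_Accuracy = mean(Accuracy))
}

#avg_acc_by_threshold(n_splits = 5)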