Approach in Machine Learning and caret Package

Data Science in Banking Course

Nguyen Chi Dung

#==========================================================
#       Approach in Machine Learning and caret package
#==========================================================

#--------------------------------------------------------------------------------------------------
#   Approach in Machine Learning
#   Data Source: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
#--------------------------------------------------------------------------------------------------

# Đọc dữ liệu: 
rm(list = ls())
bc_data <- read.table("D:/Teaching/data_science_banking/breast_cancer/breast-cancer-wisconsin.data.txt", 
                      sep = ",")

# Đổi tên cho cột biến: 

colnames(bc_data) <- c("sample_code_number", 
                       "clump_thickness", 
                       "uniformity_of_cell_size", 
                       "uniformity_of_cell_shape", 
                       "marginal_adhesion", 
                       "single_epithelial_cell_size", 
                       "bare_nuclei", 
                       "bland_chromatin", 
                       "normal_nucleoli", 
                       "mitosis", 
                       "classes")

library(tidyverse)
library(magrittr)

dan_nhan <- function(x) {
  case_when(x == 2 ~ "B", 
            x == 4 ~ "M")
}

replace_na <- function(x) {
  ELSE <- TRUE
  case_when(x == "?" ~ "1", 
            ELSE ~ x)
}


scale01 <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}

# Sử dụng các hàm trên và thực hiện một số bước tiền xử lí số liệu đơn giản: 

bc_data %<>% 
  select(-sample_code_number) %>% 
  mutate(classes = dan_nhan(classes), 
         classes = as.factor(classes), 
         bare_nuclei = as.character(bare_nuclei), 
         bare_nuclei = replace_na(bare_nuclei) %>% as.numeric()) %>% 
  mutate_if(is.numeric, scale01)


# Chuẩn bị dữ liệu: 
set.seed(1)
train <- bc_data %>% 
  group_by(classes) %>% 
  sample_frac(0.7) %>% 
  ungroup()

test <- setdiff(bc_data, train)

# Chạy mô hình KNN với k = 1: 

library(class)
set.seed(1)
pred <- knn(train = train %>% select(-classes), 
            test = test %>% select(-classes),
            cl = train$classes, 
            k = 1)


length(pred) == nrow(test)

## [1] TRUE

mean(pred == test$classes)

## [1] 0.9032258

# Chạy mô hình KNN với k = 3: 

set.seed(1)
pred <- knn(train = train %>% select(-classes), 
            test = test %>% select(-classes),
            cl = train$classes, 
            k = 3)

mean(pred == test$classes)

## [1] 0.9193548

knn_accuracy <- function(train, test, chon_k) {
  set.seed(1)
  pred_knn <- knn(train = train %>% select(-classes), 
                  test = test %>% select(-classes),
                  cl = train$classes, 
                  k = chon_k)
  
  return(mean(pred_knn == test$classes))
  
}


knn_accuracy(train, test, chon_k = 1)

## [1] 0.9032258

knn_accuracy(train, test, chon_k = 3)

## [1] 0.9193548

#-------------------------------------------------------------------------
#        Chọn k tối ưu
#  https://www.isical.ac.in/~akghosh/CSDA-2006.pdf
#  https://saravananthirumuruganathan.wordpress.com/2010/05/17/a-detailed-introduction-to-k-nearest-neighbor-knn-algorithm/
#  https://arxiv.org/ftp/arxiv/papers/1409/1409.0919.pdf
#-------------------------------------------------------------------------

# Tính Accuracy cho một dải các giá trị của k: 

n <- ncol(train) - 1
range_k <- 1:n
acc_k <- c()

for (i in 1:n) {
  acc <- knn_accuracy(train, test, chon_k = range_k[i])
  acc_k <- c(acc_k, acc)
}

acc_k

## [1] 0.9032258 0.9193548 0.9193548 0.9274194 0.9193548 0.9112903 0.9112903
## [8] 0.9193548 0.9193548

# Tính toán Accuracy trung bình của 30 lần chọn mẫu: 
n_samples <- 30
ti_le <- 0.7
k <- 1

u <- c()
for (j in 1:n_samples) {
  set.seed(j)
  train <- bc_data %>% 
    group_by(classes) %>% 
    sample_frac(ti_le) %>% 
    ungroup()
  
  test <- dplyr::setdiff(bc_data, train)
  
  chinh_xac <- knn_accuracy(train, test, chon_k = k)
  u <- c(u, chinh_xac)
  
}
  
u %>% mean()

## [1] 0.9263821

u %>% head()

## [1] 0.9032258 0.8923077 0.9047619 0.9242424 0.8790323 0.9593496

# Nên viết thành hàm: 

acc_mean_samples <- function(n_samples, ti_le, k) {
  u <- c()
  for (j in 1:n_samples) {
    set.seed(j)
    train <- bc_data %>% 
      group_by(classes) %>% 
      sample_frac(ti_le) %>% 
      ungroup()
    
    test <- dplyr::setdiff(bc_data, train)
    
    chinh_xac <- knn_accuracy(train, test, chon_k = k)
    u <- c(u, chinh_xac)
  }
  u %>% 
    mean() %>% 
    return()
  
}

# Sử dụng hàm: 

acc_mean_samples(n_samples = 30, ti_le = 0.7, k = 1)

## [1] 0.9263821

# Khảo sát trung bình của Accuracy theo k: 
acc_trung_binh <- c()

for (i in 1:9) {
  tb <- acc_mean_samples(30, 0.7, i)
  acc_trung_binh <- c(acc_trung_binh, tb)
}

max(acc_trung_binh)

## [1] 0.9428568

# Tạo ra data frame kết quả: 

my_df <- data.frame(Acc_mean = acc_trung_binh, K = 1:9)

# Hình ảnh hóa: 
theme_set(theme_minimal())

my_df %>% ggplot(aes(K, Acc_mean)) + 
  geom_line() + 
  geom_point() + 
  geom_point(data = my_df %>% filter(Acc_mean == max(Acc_mean)), color = "red", size = 3) + 
  scale_x_continuous(breaks = seq(1, 9, by = 1)) + 
  labs(x = "Number of K", 
       y = "Accuracy Mean", 
       title = "The Optimal K for KNN Algorithm", 
       subtitle = "Note: Based on 30 Samples")

#---------------------------------------------------------------------------------------------------
#    caret package
#    References: 
#    - http://www.springer.com/us/book/9781461468486
#    - https://topepo.github.io/caret/model-training-and-tuning.html
#    - http://web.ipac.caltech.edu/staff/fmasci/home/astro_refs/BuildingPredictiveModelsR_caret.pdf
#---------------------------------------------------------------------------------------------------

library(caret)
models <- names(getModelInfo())

# Số lượng các Model: 
models %>% length()

## [1] 238

models %>% head()

## [1] "ada"         "AdaBag"      "AdaBoost.M1" "adaboost"    "amdai"      
## [6] "ANFIS"

# Các tham số có thể tinh chỉnh của mô hình: 
modelLookup(model = "knn")

##   model parameter      label forReg forClass probModel
## 1   knn         k #Neighbors   TRUE     TRUE      TRUE

#---------------------------------------
#   Perform KNN Model using caret
#---------------------------------------

# Chuẩn bị dữ liệu: 

set.seed(29)
id <- createDataPartition(y = bc_data$classes, p = 0.7, list = FALSE)
train_df <- bc_data[id, ]
test_df <- bc_data[-id, ]

# Thiết lập Cross - Validation và các thống kê đánh giá mô hình: 

set.seed(1)
control <- trainControl(method = "repeatedcv", 
                        number = 5, 
                        repeats = 10, 
                        classProbs = TRUE, 
                        summaryFunction = multiClassSummary)

# Các giá trị ứng viên của k: 
my_grid <- expand.grid(k = 1:9)

# Huấn luyện KNN Model: 

set.seed(29)
knn_draff <- train(classes ~ ., 
                   data = train, 
                   method = "knn", 
                   metric = "Accuracy",
                   tuneGrid = my_grid, 
                   trControl = control)

knn_draff

## k-Nearest Neighbors 
## 
## 490 samples
##   9 predictor
##   2 classes: 'B', 'M' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 393, 392, 391, 392, 392, 392, ... 
## Resampling results across tuning parameters:
## 
##   k  logLoss    AUC        prAUC       Accuracy   Kappa      F1       
##   1  1.7793114  0.9396267  0.04792925  0.9461352  0.8796834  0.9592820
##   2  0.9541786  0.9694507  0.09798717  0.9445024  0.8760641  0.9580312
##   3  0.6472846  0.9790459  0.13150673  0.9498275  0.8890513  0.9616528
##   4  0.4234178  0.9845158  0.15646335  0.9500191  0.8894091  0.9618222
##   5  0.3844927  0.9853606  0.17833597  0.9545048  0.8995668  0.9651557
##   6  0.3812298  0.9851716  0.20380570  0.9532784  0.8967079  0.9642820
##   7  0.3702794  0.9852621  0.22216012  0.9510375  0.8919503  0.9624895
##   8  0.3400787  0.9852884  0.23908770  0.9524765  0.8950002  0.9636443
##   9  0.3019675  0.9858946  0.25605294  0.9522703  0.8944211  0.9635374
##   Sensitivity  Specificity  Pos_Pred_Value  Neg_Pred_Value  Precision
##   0.9679423    0.9047415    0.9513019       0.9386102       0.9513019
##   0.9642019    0.9071658    0.9526561       0.9317082       0.9526561
##   0.9604663    0.9296613    0.9633696       0.9268333       0.9633696
##   0.9607596    0.9296613    0.9634259       0.9272542       0.9634259
##   0.9620144    0.9402674    0.9688781       0.9302919       0.9688781
##   0.9620096    0.9367736    0.9670676       0.9296986       0.9670676
##   0.9588942    0.9361141    0.9666385       0.9245120       0.9666385
##   0.9616971    0.9349911    0.9660214       0.9290496       0.9660214
##   0.9623221    0.9332086    0.9651404       0.9298148       0.9651404
##   Recall     Detection_Rate  Balanced_Accuracy
##   0.9679423  0.6341049       0.9363419        
##   0.9642019  0.6316516       0.9356838        
##   0.9604663  0.6292089       0.9450638        
##   0.9607596  0.6294005       0.9452105        
##   0.9620144  0.6302231       0.9511409        
##   0.9620096  0.6302210       0.9493916        
##   0.9588942  0.6281801       0.9475042        
##   0.9616971  0.6300169       0.9483441        
##   0.9623221  0.6304251       0.9477653        
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.

# Plot nhanh: 
plot(knn_draff, metric = "Accuracy")

plot(knn_draff, metric = "AUC")

# Khai thác caret object: 

report1 <- knn_draff$resample %>% select_if(is.numeric)
report1 %>% dim()

## [1] 50 14

report1 %>% summary()

##     logLoss             AUC             prAUC           Accuracy     
##  Min.   :0.03198   Min.   :0.9414   Min.   :0.1218   Min.   :0.8980  
##  1st Qu.:0.08465   1st Qu.:0.9797   1st Qu.:0.1483   1st Qu.:0.9388  
##  Median :0.41007   Median :0.9867   Median :0.1728   Median :0.9590  
##  Mean   :0.38449   Mean   :0.9854   Mean   :0.1783   Mean   :0.9545  
##  3rd Qu.:0.44112   3rd Qu.:0.9930   3rd Qu.:0.2012   3rd Qu.:0.9694  
##  Max.   :1.51165   Max.   :1.0000   Max.   :0.2879   Max.   :1.0000  
##      Kappa              F1          Sensitivity      Specificity    
##  Min.   :0.7717   Min.   :0.9231   Min.   :0.8750   Min.   :0.8235  
##  1st Qu.:0.8632   1st Qu.:0.9538   1st Qu.:0.9531   1st Qu.:0.9098  
##  Median :0.9070   Median :0.9683   Median :0.9688   Median :0.9554  
##  Mean   :0.8996   Mean   :0.9652   Mean   :0.9620   Mean   :0.9403  
##  3rd Qu.:0.9330   3rd Qu.:0.9764   3rd Qu.:0.9806   3rd Qu.:0.9706  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  Pos_Pred_Value   Neg_Pred_Value     Precision          Recall      
##  Min.   :0.9091   Min.   :0.8049   Min.   :0.9091   Min.   :0.8750  
##  1st Qu.:0.9531   1st Qu.:0.9118   1st Qu.:0.9531   1st Qu.:0.9531  
##  Median :0.9761   Median :0.9384   Median :0.9761   Median :0.9688  
##  Mean   :0.9689   Mean   :0.9303   Mean   :0.9689   Mean   :0.9620  
##  3rd Qu.:0.9844   3rd Qu.:0.9611   3rd Qu.:0.9844   3rd Qu.:0.9806  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  Detection_Rate   Balanced_Accuracy
##  Min.   :0.5714   Min.   :0.8805   
##  1st Qu.:0.6224   1st Qu.:0.9312   
##  Median :0.6327   Median :0.9511   
##  Mean   :0.6302   Mean   :0.9511   
##  3rd Qu.:0.6429   3rd Qu.:0.9698   
##  Max.   :0.6598   Max.   :1.0000

report2 <- knn_draff$results

# Vẽ theo kiểu của chúng ta: 
report2 %>% 
  ggplot(aes(k, Accuracy)) + 
  geom_line() + 
  geom_point()

report2 %>% 
  select(k, AUC, Accuracy, Kappa, Sensitivity, Specificity, Recall) %>% 
  gather(Metric, Value, -k) %>% 
  ggplot(aes(k, Value)) + 
  geom_line() + 
  geom_point() + 
  facet_wrap(~ Metric, scales = "free") + 
  scale_x_continuous(breaks = seq(1, 9, by = 1))

# Sử dụng mô hinh cho dự báo: 
pred <- predict(knn_draff, test %>% select(-classes))
pred %>% head()

## [1] B M M B B M
## Levels: B M

confusionMatrix(pred, test$classes, positive = "M")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  B  M
##          B 54  0
##          M  1 71
##                                           
##                Accuracy : 0.9921          
##                  95% CI : (0.9566, 0.9998)
##     No Information Rate : 0.5635          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9838          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9818          
##          Pos Pred Value : 0.9861          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.5635          
##          Detection Rate : 0.5635          
##    Detection Prevalence : 0.5714          
##       Balanced Accuracy : 0.9909          
##                                           
##        'Positive' Class : M               
##