# A Big Project for Data Science in Banking Course (Part 1)
#
# Nguyen Chi Dung

#=======================================================
#   A Big Project for Data Science in Banking Course
#=======================================================


#---------------------------------------------------------------------------------------
#  Giải thích 1: Một mô hình có mức độ chính xác toàn cục cao khi phân loại
#  hồ sơ xin vay tín dụng thường không phải là mô hình mà Ngân Hàng lựa chọn
#  Data Source: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
#---------------------------------------------------------------------------------------


# Load the packages used throughout the script and read the raw data.
#
# The original script began with `rm(list = ls())`. That call was removed:
# it silently destroys whatever the caller has in the workspace, and it does
# not give a clean session anyway (attached packages and options survive).
# Restart R when a truly fresh run is needed.
library(tidyverse)
library(magrittr)

# Local copy of the credit-card default data set named in the header above;
# adjust the path to wherever the CSV lives on your machine.
taiwan_default <- read.csv("D:/Teaching/new_data/taiwan_credit1.csv")

# Quick look at the dimensions and column types.
taiwan_default %>% dim()

## [1] 30000    25

taiwan_default %>% str()

## 'data.frame':    30000 obs. of  25 variables:
##  $ id : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ X1 : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ X2 : int  2 2 2 2 1 1 1 2 2 1 ...
##  $ X3 : int  2 2 2 2 2 1 1 2 3 3 ...
##  $ X4 : int  1 2 2 1 1 2 2 2 1 2 ...
##  $ X5 : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ X6 : int  2 -1 0 0 -1 0 0 0 0 -2 ...
##  $ X7 : int  2 2 0 0 0 0 0 -1 0 -2 ...
##  $ X8 : int  -1 0 0 0 -1 0 0 -1 2 -2 ...
##  $ X9 : int  -1 0 0 0 0 0 0 0 0 -2 ...
##  $ X10: int  -2 0 0 0 0 0 0 0 0 -1 ...
##  $ X11: int  -2 2 0 0 0 0 0 -1 0 -1 ...
##  $ X12: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ X13: int  3102 1725 14027 48233 5670 57069 412023 380 14096 0 ...
##  $ X14: int  689 2682 13559 49291 35835 57608 445007 601 12108 0 ...
##  $ X15: int  0 3272 14331 28314 20940 19394 542653 221 12211 0 ...
##  $ X16: int  0 3455 14948 28959 19146 19619 483003 -159 11793 13007 ...
##  $ X17: int  0 3261 15549 29547 19131 20024 473944 567 3719 13912 ...
##  $ X18: int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ X19: int  689 1000 1500 2019 36681 1815 40000 601 0 0 ...
##  $ X20: int  0 1000 1000 1200 10000 657 38000 0 432 0 ...
##  $ X21: int  0 1000 1000 1100 9000 1000 20239 581 1000 13007 ...
##  $ X22: int  0 0 1000 1069 689 1000 13750 1687 1000 1122 ...
##  $ X23: int  0 2000 5000 1000 679 800 13770 1542 1000 0 ...
##  $ Y  : int  1 1 0 0 0 0 0 0 0 0 ...

# Relabel the numeric target code as text:
# 1 becomes "Default", 0 becomes "NonDefault"; anything else becomes NA.
recode_default <- function(x) {
  is_default <- x == 1
  is_non_default <- x == 0
  case_when(
    is_default ~ "Default",
    is_non_default ~ "NonDefault"
  )
}

# Keep only a handful of variables assumed to be important, then relabel the
# target and store it as a factor (levels: "Default", "NonDefault").
dung <- taiwan_default %>%
  select(X1, X6, X18:X23, -id, Y) %>%
  mutate(Y = as.factor(recode_default(Y)))

head(dung)

##       X1 X6  X18   X19   X20  X21  X22  X23          Y
## 1  20000  2    0   689     0    0    0    0    Default
## 2 120000 -1    0  1000  1000 1000    0 2000    Default
## 3  90000  0 1518  1500  1000 1000 1000 5000 NonDefault
## 4  50000  0 2000  2019  1200 1100 1069 1000 NonDefault
## 5  50000 -1 2000 36681 10000 9000  689  679 NonDefault
## 6  50000  0 2500  1815   657 1000 1000  800 NonDefault

# Work with a random subset of 2000 rows (seeded so the draw is reproducible):
set.seed(1709)
dung_small <- sample_n(dung, 2000)

# Split the subset into two halves of roughly 1000 rows each:
library(caret)
set.seed(123)
indxTrain <- createDataPartition(dung_small$Y, p = 0.5, list = FALSE)
training <- dung_small[indxTrain, ]
testing <- dung_small[-indxTrain, ]

# Logistic regression: a binomial GLM fitted through caret.
logistic <- train(
  Y ~ .,
  data = training,
  method = "glm",
  family = "binomial"
)

# Predicted classes for the held-out half (target column dropped first):
pred1 <- predict(logistic, newdata = select(testing, -Y))
head(pred1)

## [1] NonDefault NonDefault NonDefault NonDefault NonDefault NonDefault
## Levels: Default NonDefault

# Confusion matrix and derived statistics with "Default" as the positive class:
confusionMatrix(data = pred1, reference = testing$Y, positive = "Default")

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Default NonDefault
##   Default         66         36
##   NonDefault     184        714
##                                          
##                Accuracy : 0.78           
##                  95% CI : (0.753, 0.8053)
##     No Information Rate : 0.75           
##     P-Value [Acc > NIR] : 0.0147         
##                                          
##                   Kappa : 0.2691         
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.2640         
##             Specificity : 0.9520         
##          Pos Pred Value : 0.6471         
##          Neg Pred Value : 0.7951         
##              Prevalence : 0.2500         
##          Detection Rate : 0.0660         
##    Detection Prevalence : 0.1020         
##       Balanced Accuracy : 0.6080         
##                                          
##        'Positive' Class : Default        
##

# Probit regression: the same binomial GLM, but with a probit link.
probit <- train(
  Y ~ .,
  data = training,
  method = "glm",
  family = binomial(link = "probit")
)

# Evaluate the probit model on the held-out half:
pred2 <- predict(probit, newdata = select(testing, -Y))
confusionMatrix(data = pred2, reference = testing$Y, positive = "Default")

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Default NonDefault
##   Default         52         32
##   NonDefault     198        718
##                                           
##                Accuracy : 0.77            
##                  95% CI : (0.7426, 0.7958)
##     No Information Rate : 0.75            
##     P-Value [Acc > NIR] : 0.07626         
##                                           
##                   Kappa : 0.2123          
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.2080          
##             Specificity : 0.9573          
##          Pos Pred Value : 0.6190          
##          Neg Pred Value : 0.7838          
##              Prevalence : 0.2500          
##          Detection Rate : 0.0520          
##    Detection Prevalence : 0.0840          
##       Balanced Accuracy : 0.5827          
##                                           
##        'Positive' Class : Default         
##

#---------------------------------------------------------------------------------------
#  Giải thích 2: Nếu chọn một tiêu chí nào đó, chẳng hạn, mức chính xác toàn cục để 
#  lựa chọn mô hình phân loại thì tiêu chí đó phải được đánh giá trên nhiều lần chọn 
#  mẫu khác nhau chứ không phải chỉ dựa trên một mẫu hay một tình huống cụ thể nào đó. 
#---------------------------------------------------------------------------------------

# Repeat the 50/50 split with a different seed to show that the reported
# metrics depend on which rows happen to land in each half.
set.seed(29)
indxTrain <- createDataPartition(dung_small$Y, p = 0.5, list = FALSE)
training <- dung_small[indxTrain, ]
testing <- dung_small[-indxTrain, ]

# Logistic regression refitted on the new training half:
logistic <- train(
  Y ~ .,
  data = training,
  method = "glm",
  family = "binomial"
)

# Evaluate on the new test half:
pred1 <- predict(logistic, newdata = select(testing, -Y))
confusionMatrix(data = pred1, reference = testing$Y, positive = "Default")

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Default NonDefault
##   Default         75         41
##   NonDefault     175        709
##                                           
##                Accuracy : 0.784           
##                  95% CI : (0.7572, 0.8091)
##     No Information Rate : 0.75            
##     P-Value [Acc > NIR] : 0.006582        
##                                           
##                   Kappa : 0.2987          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.3000          
##             Specificity : 0.9453          
##          Pos Pred Value : 0.6466          
##          Neg Pred Value : 0.8020          
##              Prevalence : 0.2500          
##          Detection Rate : 0.0750          
##    Detection Prevalence : 0.1160          
##       Balanced Accuracy : 0.6227          
##                                           
##        'Positive' Class : Default         
##

# Probit regression refitted on the new training half:
probit <- train(
  Y ~ .,
  data = training,
  method = "glm",
  family = binomial(link = "probit")
)

# Evaluate on the new test half:
pred2 <- predict(probit, newdata = select(testing, -Y))
confusionMatrix(data = pred2, reference = testing$Y, positive = "Default")

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Default NonDefault
##   Default         68         40
##   NonDefault     182        710
##                                           
##                Accuracy : 0.778           
##                  95% CI : (0.7509, 0.8034)
##     No Information Rate : 0.75            
##     P-Value [Acc > NIR] : 0.02128         
##                                           
##                   Kappa : 0.2697          
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.2720          
##             Specificity : 0.9467          
##          Pos Pred Value : 0.6296          
##          Neg Pred Value : 0.7960          
##              Prevalence : 0.2500          
##          Detection Rate : 0.0680          
##    Detection Prevalence : 0.1080          
##       Balanced Accuracy : 0.6093          
##                                           
##        'Positive' Class : Default         
##

# Score both models on 100 different random samples of 1000 rows.
#
# The result vectors are preallocated and filled by index instead of being
# grown with c() on every iteration.
#
# NOTE(review): the samples are drawn from the full `dung` data, which also
# contains the rows the models were trained on, so these accuracies are not a
# pure out-of-sample estimate.

acc_logistic <- numeric(100)
acc_probit <- numeric(100)

for (i in seq_len(100)) {
  # Seeding with the loop index makes every sample reproducible.
  set.seed(i)
  testing <- dung %>% sample_n(1000)

  pred1 <- predict(logistic, newdata = testing %>% select(-Y))
  acc1 <- mean(pred1 == testing$Y)
  acc_logistic[i] <- acc1

  pred2 <- predict(probit, newdata = testing %>% select(-Y))
  acc2 <- mean(pred2 == testing$Y)
  acc_probit[i] <- acc2
}

acc_logistic %>% mean()

## [1] 0.81566

acc_probit %>% mean()

## [1] 0.81355

# Long format: one row per (model, sample) pair, logistic rows first.
all_df <- data.frame(Accuracy = c(acc_logistic, acc_probit),
                     Model = c(rep("Logistic", 100), rep("Probit", 100)))


theme_set(theme_minimal())
all_df %>%
  ggplot(aes(Model, Accuracy)) +
  geom_boxplot()

# Compare two fitted models by accuracy over repeated random samples.
#
# Args:
#   so_lan:          number of random samples to draw; 0 yields an empty result.
#   kich_thuoc_mau:  number of rows in each sample.
#   model1, model2:  fitted models whose predict() output can be compared
#                    element-wise with the `Y` column.
#   data:            data frame to sample from; must contain a `Y` column.
#                    Defaults to the global `dung`, which the original
#                    version always used.
#
# Returns:
#   A data frame with 2 * so_lan rows and columns `Accuracy` and `Model`
#   ("Model1" rows first, then "Model2").
#
# Iteration i calls set.seed(i), so both models are scored on the same
# reproducible sample. As a side effect the global RNG state is reset.
evaluating_fun <- function(so_lan, kich_thuoc_mau, model1, model2,
                           data = dung) {
  # Preallocate instead of growing the vectors inside the loop.
  acc_m1 <- numeric(so_lan)
  acc_m2 <- numeric(so_lan)

  # seq_len() makes so_lan = 0 a no-op; 1:0 would iterate over c(1, 0).
  for (i in seq_len(so_lan)) {
    set.seed(i)
    testing <- data %>% sample_n(kich_thuoc_mau)
    features <- testing %>% select(-Y)

    acc_m1[i] <- mean(predict(model1, newdata = features) == testing$Y)
    acc_m2[i] <- mean(predict(model2, newdata = features) == testing$Y)
  }

  data.frame(Accuracy = c(acc_m1, acc_m2),
             Model = c(rep("Model1", so_lan), rep("Model2", so_lan)))
}

# Run the comparison: 100 samples of 1000 rows, logistic vs probit.
so_sanh <- evaluating_fun(so_lan = 100, kich_thuoc_mau = 1000, logistic, probit)

# First impression from a picture:

so_sanh %>%
  ggplot(aes(Model, Accuracy)) +
  geom_boxplot()

# First impression from summary statistics.
# Written as an explicit summarise() because summarise_each()/funs() are
# deprecated in dplyr; the output column names are unchanged.
so_sanh %>%
  group_by(Model) %>%
  summarise(Accuracy_mean = mean(Accuracy),
            Accuracy_median = median(Accuracy),
            Accuracy_min = min(Accuracy),
            Accuracy_max = max(Accuracy),
            Accuracy_sd = sd(Accuracy))

## # A tibble: 2 x 6
##   Model  Accuracy_mean Accuracy_median Accuracy_min Accuracy_max
##   <fct>          <dbl>           <dbl>        <dbl>        <dbl>
## 1 Model1         0.816           0.817        0.775        0.851
## 2 Model2         0.814           0.815        0.776        0.848
## # ... with 1 more variable: Accuracy_sd <dbl>

#------------------------------------------------------------------------------------
#  Giải thích 3: Lợi nhuận nên là tiêu chí đầu tiên được lựa chọn mô hình phân loại
#  và phải dựa vào nhiều mẫu số liệu (nhiều tình huống khác nhau)
#  Tham khảo: http://rpubs.com/chidungkt/320301
#------------------------------------------------------------------------------------

# Collect confusion-matrix counts for one model over repeated random samples.
#
# Args:
#   model:            fitted classifier whose predict() returns a factor with
#                     the same levels as `Y` ("Default", "NonDefault").
#   so_lan_chon_mau:  number of random samples to draw; 0 yields zero rows.
#   kich_thuoc:       number of rows in each sample.
#   data:             data frame to sample from; must contain a factor `Y`.
#                     Defaults to the global `dung`, which the original
#                     version always used.
#
# Returns:
#   A data frame with one row per sample and four count columns, named
#   <predicted><actual> with B = "Default" and G = "NonDefault":
#     BB  predicted Default,    actually Default
#     GB  predicted NonDefault, actually Default
#     BG  predicted Default,    actually NonDefault
#     GG  predicted NonDefault, actually NonDefault
#
# Iteration j calls set.seed(j), which resets the global RNG state.

df_evaluate_profit <- function(model, so_lan_chon_mau, kich_thuoc,
                               data = dung) {

  # Preallocate the whole result; the original rbind()-ed one row per
  # iteration and re-assigned the column names every time.
  counts <- matrix(NA_integer_, nrow = so_lan_chon_mau, ncol = 4,
                   dimnames = list(NULL, c("BB", "GB", "BG", "GG")))

  # seq_len() makes so_lan_chon_mau = 0 a no-op; 1:0 would run twice.
  for (j in seq_len(so_lan_chon_mau)) {
    set.seed(j)
    testing <- data %>% sample_n(kich_thuoc)
    du_bao <- predict(model, testing %>% select(-Y))
    u <- confusionMatrix(du_bao, testing$Y, positive = "Default")
    # as.vector() flattens the 2 x 2 table column by column, i.e. reference
    # class by reference class, which is the order of the column names above.
    counts[j, ] <- as.vector(u$table)
  }

  as.data.frame(counts)
}


# Simulate 1000 samples of 3000 applications for the logistic model:
m <- df_evaluate_profit(logistic, so_lan_chon_mau = 1000, kich_thuoc = 3000)

# Profit per sample: earn 0.2 per GG case, lose 1 per BG case.
# NOTE(review): with the column order produced by df_evaluate_profit(), BG
# counts applicants predicted "Default" who did NOT default, whereas GB counts
# applicants predicted "NonDefault" who DID default. If the loss term is meant
# to be approved loans that went bad it should probably be GB -- confirm
# before relying on these profit figures.
m %<>% mutate(Profit = 0.2*GG - BG)

# Summary statistics of the simulated profit:
m$Profit %>% summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   332.2   361.6   368.4   368.5   375.9   398.2

# Visualise the profit distribution:

m %>%
  ggplot(aes(Profit)) +
  geom_density(fill = "blue", color = "blue", alpha = 0.3) +
  geom_histogram(aes(y = ..density..), fill = "red", color = "red", alpha = 0.3)

# Same simulation for the probit model:
n <- df_evaluate_profit(probit, so_lan_chon_mau = 1000, kich_thuoc = 3000) %>%
  mutate(Profit = 0.2*GG - BG)


# Stack both result sets and tag each row with its model:
df_compare <- bind_rows(m %>% mutate(Model = "Logistic"),
                        n %>% mutate(Model = "Probit"))

df_compare %<>% mutate(Accuracy = (BB + GG) / (BB + GG + BG + GB))


df_compare %>%
  ggplot(aes(Profit)) +
  geom_density(fill = "blue", color = "blue", alpha = 0.3) +
  geom_histogram(aes(y = ..density..), fill = "red", color = "red", alpha = 0.3) +
  facet_wrap(~ Model)

# Explicit summarise() calls replace the deprecated summarise_each()/funs();
# the output column names are unchanged.
df_compare %>%
  group_by(Model) %>%
  summarise(Profit_mean = mean(Profit),
            Profit_median = median(Profit),
            Profit_min = min(Profit),
            Profit_max = max(Profit),
            Profit_sd = sd(Profit))

## # A tibble: 2 x 6
##   Model    Profit_mean Profit_median Profit_min Profit_max Profit_sd
##   <chr>          <dbl>         <dbl>      <dbl>      <dbl>     <dbl>
## 1 Logistic        369.          368.       332.       398.      11.1
## 2 Probit          376.          376.       341.       408.      10.7

df_compare %>%
  group_by(Model) %>%
  summarise(Accuracy_mean = mean(Accuracy),
            Accuracy_median = median(Accuracy),
            Accuracy_min = min(Accuracy),
            Accuracy_max = max(Accuracy),
            Accuracy_sd = sd(Accuracy))

## # A tibble: 2 x 6
##   Model    Accuracy_mean Accuracy_median Accuracy_min Accuracy_max
##   <chr>            <dbl>           <dbl>        <dbl>        <dbl>
## 1 Logistic         0.815           0.815        0.792        0.835
## 2 Probit           0.813           0.813        0.790        0.833
## # ... with 1 more variable: Accuracy_sd <dbl>

# Report profit statistics for one model and draw the profit distribution.
#
# Args:
#   ketqua: data frame with (at least) the count columns GG and BG, one row
#           per simulated sample.
#   rate:   amount earned per GG case; each BG case costs 1.
#
# Returns:
#   A ggplot object: density plus histogram of Profit = rate * GG - BG, with a
#   vertical line at the rounded mean. The title carries the rounded
#   mean / min / max / sd; the subtitle carries the number of samples and
#   the rate.

profit_report <- function(ketqua, rate) {
  ketqua <- ketqua %>% mutate(Profit = rate*GG - BG)
  profit <- ketqua$Profit

  mean_pro <- round(mean(profit), digits = 0)
  min_pro <- round(min(profit), digits = 0)
  max_pro <- round(max(profit), digits = 0)
  sd_pro <- round(sd(profit), digits = 0)
  n_samples <- nrow(ketqua)

  # The spacing inside these literals is deliberate: it reproduces the labels
  # of the original nested paste()/paste0() calls exactly.
  plot_title <- paste0("Profit Report:  Mean =  ", mean_pro,
                       ", Min =  ", min_pro,
                       ", Max =  ", max_pro,
                       ", SD = ", sd_pro)
  plot_subtitle <- paste0("Notes:  Number of Samples =  ", n_samples,
                          ",  Interest Rate =  ", rate)

  ggplot(ketqua, aes(Profit)) +
    geom_vline(xintercept = mean_pro, color = "blue", size = 1.1) +
    geom_density(fill = "blue", color = "blue", alpha = 0.2) +
    geom_histogram(aes(y = ..density..), fill = "red", color = "red", alpha = 0.2) +
    labs(title = plot_title,
         subtitle = plot_subtitle)
}

# Example reports for the logistic results at two different rates:
profit_report(m, rate = 0.2)

profit_report(m, rate = 0.1)

# Assignment: Viết hàm hiển thị các thống kê về lợi nhuận + hình ảnh
# cho hai mô hình bất kì được lựa chọn. 


# Score loan applications from their raw attributes.
#
# Args:
#   X1, X6, X18..X23: predictor values for the application(s); scalars or
#                     vectors that data.frame() can combine.
#   model_selected:   fitted model passed to predict().
#
# Returns:
#   The model's prediction(s) converted to a character vector.

duyet_vay <- function(X1, X6, X18, X19, X20, X21, X22, X23, model_selected) {
  applicant <- data.frame(
    X1 = X1, X6 = X6,
    X18 = X18, X19 = X19, X20 = X20,
    X21 = X21, X22 = X22, X23 = X23
  )
  as.character(predict(model_selected, applicant))
}


# Example: score the same application with each model.
duyet_vay(
  X1 = 200000, X6 = 0,
  X18 = 4000, X19 = 3500, X20 = 0,
  X21 = 500, X22 = 1000, X23 = 100,
  model_selected = logistic
)

## [1] "NonDefault"

duyet_vay(
  X1 = 200000, X6 = 0,
  X18 = 4000, X19 = 3500, X20 = 0,
  X21 = 500, X22 = 1000, X23 = 100,
  model_selected = probit
)

## [1] "NonDefault"

#------------------------------------------------------------------------------------
#  Giải thích 4: Mức độ chính xác của mô hình phân loại còn phụ thuộc vào ngưỡng 
#  (Threshold) để phân loại nhóm (hay nhãn) của biến đích và do đó có thể ảnh hưởng
#  đến các tiêu chí khác như lợi nhuận. 
#------------------------------------------------------------------------------------

# Convert the text label back to a numeric code:
# "Default" becomes 1, "NonDefault" becomes 0; anything else becomes NA.
reconvert <- function(x) {
  is_default <- x == "Default"
  is_non_default <- x == "NonDefault"
  case_when(
    is_default ~ 1,
    is_non_default ~ 0
  )
}

# Numeric-target version of the data (Y coded 1 / 0):

dung10 <- mutate(dung, Y = reconvert(Y))

# Same 2000-row subsample as before (same seed):
set.seed(1709)
dung_small <- sample_n(dung10, 2000)

# Split the subsample into two halves of roughly 1000 rows each:

set.seed(123)
indxTrain <- createDataPartition(dung_small$Y, p = 0.5, list = FALSE)
training <- dung_small[indxTrain, ]
testing <- dung_small[-indxTrain, ]

# Logistic regression on the numeric 0/1 target. With a numeric outcome,
# predict() returns fitted values rather than class labels (see the recorded
# output below), which allows a custom classification threshold.
logistic <- train(
  Y ~ .,
  data = training,
  method = "glm",
  family = "binomial"
)

# Fitted values for the test half:
pred1 <- predict(logistic, newdata = select(testing, -Y))

head(pred1)

##      8267        27     17126     18156      5700       273 
## 0.1200793 0.3761079 0.2411857 0.3834187 0.2220664 0.3915485

# Classify scores against a chosen threshold:
# scores at or above `nguong` become "Default", scores below it become
# "NonDefault" (NA scores stay NA).
xep_loai <- function(x, nguong) {
  at_or_above <- x >= nguong
  below <- x < nguong
  case_when(
    at_or_above ~ "Default",
    below ~ "NonDefault"
  )
}

# Evaluate the model with a threshold of 0.5.
# xep_loai() and recode_default() return character vectors, but
# confusionMatrix() expects factors with identical levels. Both arguments
# are therefore converted with explicit levels, which also keeps the table
# 2 x 2 when a threshold leaves one class unpredicted.

confusionMatrix(
  data = factor(pred1 %>% xep_loai(nguong = 0.5),
                levels = c("Default", "NonDefault")),
  reference = factor(testing$Y %>% recode_default(),
                     levels = c("Default", "NonDefault")),
  positive = "Default"
)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Default NonDefault
##   Default         74         35
##   NonDefault     176        715
##                                           
##                Accuracy : 0.789           
##                  95% CI : (0.7624, 0.8139)
##     No Information Rate : 0.75            
##     P-Value [Acc > NIR] : 0.002135        
##                                           
##                   Kappa : 0.3071          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.2960          
##             Specificity : 0.9533          
##          Pos Pred Value : 0.6789          
##          Neg Pred Value : 0.8025          
##              Prevalence : 0.2500          
##          Detection Rate : 0.0740          
##    Detection Prevalence : 0.1090          
##       Balanced Accuracy : 0.6247          
##                                           
##        'Positive' Class : Default         
##

# Evaluate the model with a threshold of 0.6.
# xep_loai() and recode_default() return character vectors, but
# confusionMatrix() expects factors with identical levels. Both arguments
# are therefore converted with explicit levels, which also keeps the table
# 2 x 2 when a threshold leaves one class unpredicted.

confusionMatrix(
  data = factor(pred1 %>% xep_loai(nguong = 0.6),
                levels = c("Default", "NonDefault")),
  reference = factor(testing$Y %>% recode_default(),
                     levels = c("Default", "NonDefault")),
  positive = "Default"
)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Default NonDefault
##   Default         12          7
##   NonDefault     238        743
##                                           
##                Accuracy : 0.755           
##                  95% CI : (0.7271, 0.7814)
##     No Information Rate : 0.75            
##     P-Value [Acc > NIR] : 0.3733          
##                                           
##                   Kappa : 0.0559          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0480          
##             Specificity : 0.9907          
##          Pos Pred Value : 0.6316          
##          Neg Pred Value : 0.7574          
##              Prevalence : 0.2500          
##          Detection Rate : 0.0120          
##    Detection Prevalence : 0.0190          
##       Balanced Accuracy : 0.5193          
##                                           
##        'Positive' Class : Default         
##

# Collect confusion-matrix counts for a score-producing model at a given
# classification threshold, over repeated random samples.
#
# Args:
#   nguong:      threshold; scores >= nguong are classified "Default".
#   so_lan_lap:  number of random samples to draw; 0 yields zero rows.
#   n_sample:    number of rows in each sample.
#   model:       fitted model whose predict() returns numeric scores.
#   data:        data frame to sample from; must contain `Y` coded 1 / 0.
#                Defaults to the global `dung10`, which the original version
#                always used.
#
# Returns:
#   A data frame with one row per sample and four count columns, named
#   <predicted><actual> with B = "Default" and G = "NonDefault":
#   BB, GB, BG, GG.
#
# Iteration i calls set.seed(i), which resets the global RNG state.
my_fun <- function(nguong, so_lan_lap, n_sample, model, data = dung10) {
  class_levels <- c("Default", "NonDefault")

  # Preallocate the whole result instead of rbind()-ing one row at a time.
  counts <- matrix(NA_integer_, nrow = so_lan_lap, ncol = 4,
                   dimnames = list(NULL, c("BB", "GB", "BG", "GG")))

  # seq_len() makes so_lan_lap = 0 a no-op; 1:0 would run twice.
  for (i in seq_len(so_lan_lap)) {
    set.seed(i)
    testing <- data %>%
      sample_n(n_sample)

    dubao <- predict(model, testing %>% select(-Y))
    # confusionMatrix() expects factors with identical levels. Explicit levels
    # also keep the table 2 x 2 when a threshold leaves one class unpredicted.
    dubao <- factor(xep_loai(dubao, nguong), levels = class_levels)
    thuc_te <- factor(recode_default(testing$Y), levels = class_levels)

    u <- confusionMatrix(dubao, thuc_te, positive = "Default")
    # Column-major flattening matches the BB, GB, BG, GG column order.
    counts[i, ] <- as.vector(u$table)
  }

  as.data.frame(counts)
}


# Threshold 0.5: 1000 simulated samples of 3000 rows each.
kq5 <- my_fun(0.5, 1000, 3000, logistic)
# Threshold 0.6: 1000 simulated samples of 3000 rows each.
kq6 <- my_fun(0.6, 1000, 3000, logistic)

# Stack both result sets and tag each row with its threshold:
df_nguong <- bind_rows(kq5 %>% mutate(Thr = "T0.5"),
                       kq6 %>% mutate(Thr = "T0.6"))

# NOTE(review): the loss term uses BG (predicted "Default", actually
# "NonDefault"); if it is meant to be approved loans that defaulted it should
# probably be GB -- confirm before drawing conclusions about thresholds.
df_nguong %<>%
  mutate(Profit = 0.2*GG - BG,
         Accuracy = (BB + GG) / (BB + GG + BG + GB))

# Explicit summarise() calls replace the deprecated summarise_each()/funs();
# the output column names are unchanged.
df_nguong %>%
  group_by(Thr) %>%
  summarise(Profit_mean = mean(Profit),
            Profit_median = median(Profit),
            Profit_min = min(Profit),
            Profit_max = max(Profit),
            Profit_sd = sd(Profit),
            Profit_n = n())

## # A tibble: 2 x 7
##   Thr   Profit_mean Profit_median Profit_min Profit_max Profit_sd Profit_n
##   <chr>       <dbl>         <dbl>      <dbl>      <dbl>     <dbl>    <int>
## 1 T0.5         353.          353.       309.       385.     11.5      1000
## 2 T0.6         448.          448.       428.       465.      5.89     1000

df_nguong %>%
  group_by(Thr) %>%
  summarise(Accuracy_mean = mean(Accuracy),
            Accuracy_median = median(Accuracy),
            Accuracy_min = min(Accuracy),
            Accuracy_max = max(Accuracy),
            Accuracy_sd = sd(Accuracy),
            Accuracy_n = n())

## # A tibble: 2 x 7
##   Thr   Accuracy_mean Accuracy_median Accuracy_min Accuracy_max
##   <chr>         <dbl>           <dbl>        <dbl>        <dbl>
## 1 T0.5          0.816           0.816        0.796        0.837
## 2 T0.6          0.785           0.785        0.758        0.803
## # ... with 2 more variables: Accuracy_sd <dbl>, Accuracy_n <int>

#---------------------------------------------------------------------------------
#  Giải thích 5: 
#  Feature Engineering - bỏ / thêm / chuyển hóa biến đầu vào có thể cải thiện
#  mức độ chính xác của mô hình phân loại đối với nhiều mô hình phân loại. 

#  References: 
#  - Mastering Feature Engineering Principles and Techniques for Data Scientists
#  - ds_breast_cancer_cancer.r
#----------------------------------------------------------------------------------

#------  Case 1: drop near-zero-variance and highly correlated features  ------#
data("GermanCredit")

df_ori <- GermanCredit
Class <- GermanCredit$Class

feature_df <- GermanCredit %>% select(-Class)

# Columns flagged by nearZeroVar() are removed.
# Columns are dropped by name with base subsetting: passing a bare character
# vector to select() is ambiguous (it could also be read as a column name)
# and is deprecated in tidyselect. setdiff() also copes with an empty list of
# names to drop.
zero_pos <- nearZeroVar(feature_df)
zero_va <- names(feature_df)[zero_pos]
feature_df <- feature_df[setdiff(names(feature_df), zero_va)]

# Remove features picked by findCorrelation() at a 0.75 cutoff:
tuong_quan <- cor(feature_df)
highCorr <- findCorrelation(tuong_quan, cutoff = .75)

var_name <- names(feature_df)[highCorr]

# Filtered feature set with the target column attached again:
final_df <- feature_df[setdiff(names(feature_df), var_name)]
final_df$Class <- Class


# Resampling scheme shared by both models: 10-fold CV repeated 20 times.
ctrl <- trainControl(method = "repeatedcv",
                     repeats = 20,
                     number = 10)

# The seed is set immediately before EACH train() call so that both models
# are evaluated on the same folds, making their resampled accuracies directly
# comparable. The original seeded only once, so logistic2 was scored on
# different folds than logistic1.
# NOTE: the recorded logistic2 output further down predates this re-seeding
# and may differ slightly from a fresh run.

# Model 1: filtered feature set.
set.seed(1)
logistic1 <- train(Class ~ .,
                   data = final_df,
                   method = "glm",
                   trControl = ctrl)

# Model 2: all original features.
set.seed(1)
logistic2 <- train(Class ~ .,
                   data = GermanCredit,
                   method = "glm",
                   trControl = ctrl)

# Quick look at the resampled performance:
logistic1$resample %>% summary()

##     Accuracy         Kappa          Resample        
##  Min.   :0.670   Min.   :0.1579   Length:200        
##  1st Qu.:0.730   1st Qu.:0.3112   Class :character  
##  Median :0.750   Median :0.3750   Mode  :character  
##  Mean   :0.755   Mean   :0.3800                     
##  3rd Qu.:0.780   3rd Qu.:0.4471                     
##  Max.   :0.850   Max.   :0.6250

logistic2$resample %>% summary()

##     Accuracy         Kappa           Resample        
##  Min.   :0.620   Min.   :0.05941   Length:200        
##  1st Qu.:0.720   1st Qu.:0.30003   Class :character  
##  Median :0.750   Median :0.37500   Mode  :character  
##  Mean   :0.751   Mean   :0.37171                     
##  3rd Qu.:0.780   3rd Qu.:0.45312                     
##  Max.   :0.850   Max.   :0.61165

# Visual comparison of the resampled accuracies:

comp_df <- bind_rows(logistic1$resample %>% mutate(Model = "Logistic1"),
                     logistic2$resample %>% mutate(Model = "Logistic2"))

comp_df %>%
  ggplot(aes(Model, Accuracy)) + geom_boxplot()

# Comparison by summary statistics.
# Written as an explicit summarise() because summarise_each()/funs() are
# deprecated in dplyr; the output column names are unchanged.
comp_df %>%
  group_by(Model) %>%
  summarise(Accuracy_mean = mean(Accuracy),
            Accuracy_median = median(Accuracy),
            Accuracy_min = min(Accuracy),
            Accuracy_max = max(Accuracy),
            Accuracy_n = n())

## # A tibble: 2 x 6
##   Model Accuracy_mean Accuracy_median Accuracy_min Accuracy_max Accuracy_n
##   <chr>         <dbl>           <dbl>        <dbl>        <dbl>      <int>
## 1 Logi~         0.755           0.750        0.670        0.850        200
## 2 Logi~         0.751           0.750        0.620        0.850        200

#------  Case 2: Chuẩn hóa dữ liệu (ds_breast_cancer_cancer)  ---------#


#---------------------------------------------------------------
#       Main Project: hmeq.csv  (to be continued in part 2)
#---------------------------------------------------------------