Approach in Machine Learning and caret Package
Data Science in Banking Course
Nguyen Chi Dung
#==========================================================
# Approach in Machine Learning and caret package
#==========================================================
#--------------------------------------------------------------------------------------------------
# Approach in Machine Learning
# Data Source: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
#--------------------------------------------------------------------------------------------------
# Đọc dữ liệu:
rm(list = ls())
bc_data <- read.table("D:/Teaching/data_science_banking/breast_cancer/breast-cancer-wisconsin.data.txt",
sep = ",")
# Đổi tên cho cột biến:
colnames(bc_data) <- c("sample_code_number",
"clump_thickness",
"uniformity_of_cell_size",
"uniformity_of_cell_shape",
"marginal_adhesion",
"single_epithelial_cell_size",
"bare_nuclei",
"bland_chromatin",
"normal_nucleoli",
"mitosis",
"classes")
library(tidyverse)
library(magrittr)
dan_nhan <- function(x) {
case_when(x == 2 ~ "B",
x == 4 ~ "M")
}
replace_na <- function(x) {
ELSE <- TRUE
case_when(x == "?" ~ "1",
ELSE ~ x)
}
scale01 <- function(x) {
(x - min(x)) / (max(x) - min(x))
}
# Sử dụng các hàm trên và thực hiện một số bước tiền xử lí số liệu đơn giản:
bc_data %<>%
select(-sample_code_number) %>%
mutate(classes = dan_nhan(classes),
classes = as.factor(classes),
bare_nuclei = as.character(bare_nuclei),
bare_nuclei = replace_na(bare_nuclei) %>% as.numeric()) %>%
mutate_if(is.numeric, scale01)
# Chuẩn bị dữ liệu:
set.seed(1)
train <- bc_data %>%
group_by(classes) %>%
sample_frac(0.7) %>%
ungroup()
test <- setdiff(bc_data, train)
# Chạy mô hình KNN với k = 1:
library(class)
set.seed(1)
pred <- knn(train = train %>% select(-classes),
test = test %>% select(-classes),
cl = train$classes,
k = 1)
length(pred) == nrow(test)
## [1] TRUE
mean(pred == test$classes)
## [1] 0.9032258
# Chạy mô hình KNN với k = 3:
set.seed(1)
pred <- knn(train = train %>% select(-classes),
test = test %>% select(-classes),
cl = train$classes,
k = 3)
mean(pred == test$classes)
## [1] 0.9193548
knn_accuracy <- function(train, test, chon_k) {
set.seed(1)
pred_knn <- knn(train = train %>% select(-classes),
test = test %>% select(-classes),
cl = train$classes,
k = chon_k)
return(mean(pred_knn == test$classes))
}
knn_accuracy(train, test, chon_k = 1)
## [1] 0.9032258
knn_accuracy(train, test, chon_k = 3)
## [1] 0.9193548
#-------------------------------------------------------------------------
# Chọn k tối ưu
# https://www.isical.ac.in/~akghosh/CSDA-2006.pdf
# https://saravananthirumuruganathan.wordpress.com/2010/05/17/a-detailed-introduction-to-k-nearest-neighbor-knn-algorithm/
# https://arxiv.org/ftp/arxiv/papers/1409/1409.0919.pdf
#-------------------------------------------------------------------------
# Tính Accuracy cho một dải các giá trị của k:
n <- ncol(train) - 1
range_k <- 1:n
acc_k <- c()
for (i in 1:n) {
acc <- knn_accuracy(train, test, chon_k = range_k[i])
acc_k <- c(acc_k, acc)
}
acc_k
## [1] 0.9032258 0.9193548 0.9193548 0.9274194 0.9193548 0.9112903 0.9112903
## [8] 0.9193548 0.9193548
# Tính toán Accuracy trung bình của 30 lần chọn mẫu:
n_samples <- 30
ti_le <- 0.7
k <- 1
u <- c()
for (j in 1:n_samples) {
set.seed(j)
train <- bc_data %>%
group_by(classes) %>%
sample_frac(ti_le) %>%
ungroup()
test <- dplyr::setdiff(bc_data, train)
chinh_xac <- knn_accuracy(train, test, chon_k = k)
u <- c(u, chinh_xac)
}
u %>% mean()
## [1] 0.9263821
u %>% head()
## [1] 0.9032258 0.8923077 0.9047619 0.9242424 0.8790323 0.9593496
# Nên viết thành hàm:
acc_mean_samples <- function(n_samples, ti_le, k) {
u <- c()
for (j in 1:n_samples) {
set.seed(j)
train <- bc_data %>%
group_by(classes) %>%
sample_frac(ti_le) %>%
ungroup()
test <- dplyr::setdiff(bc_data, train)
chinh_xac <- knn_accuracy(train, test, chon_k = k)
u <- c(u, chinh_xac)
}
u %>%
mean() %>%
return()
}
# Sử dụng hàm:
acc_mean_samples(n_samples = 30, ti_le = 0.7, k = 1)
## [1] 0.9263821
# Khảo sát trung bình của Accuracy theo k:
acc_trung_binh <- c()
for (i in 1:9) {
tb <- acc_mean_samples(30, 0.7, i)
acc_trung_binh <- c(acc_trung_binh, tb)
}
max(acc_trung_binh)
## [1] 0.9428568
# Tạo ra data frame kết quả:
my_df <- data.frame(Acc_mean = acc_trung_binh, K = 1:9)
# Hình ảnh hóa:
theme_set(theme_minimal())
my_df %>% ggplot(aes(K, Acc_mean)) +
geom_line() +
geom_point() +
geom_point(data = my_df %>% filter(Acc_mean == max(Acc_mean)), color = "red", size = 3) +
scale_x_continuous(breaks = seq(1, 9, by = 1)) +
labs(x = "Number of K",
y = "Accuracy Mean",
title = "The Optimal K for KNN Algorithm",
subtitle = "Note: Based on 30 Samples")

#---------------------------------------------------------------------------------------------------
# caret package
# References:
# - http://www.springer.com/us/book/9781461468486
# - https://topepo.github.io/caret/model-training-and-tuning.html
# - http://web.ipac.caltech.edu/staff/fmasci/home/astro_refs/BuildingPredictiveModelsR_caret.pdf
#---------------------------------------------------------------------------------------------------
library(caret)
models <- names(getModelInfo())
# Số lượng các Model:
models %>% length()
## [1] 238
models %>% head()
## [1] "ada" "AdaBag" "AdaBoost.M1" "adaboost" "amdai"
## [6] "ANFIS"
# Các tham số có thể tinh chỉnh của mô hình:
modelLookup(model = "knn")
## model parameter label forReg forClass probModel
## 1 knn k #Neighbors TRUE TRUE TRUE
#---------------------------------------
# Perform KNN Model using caret
#---------------------------------------
# Chuẩn bị dữ liệu:
set.seed(29)
id <- createDataPartition(y = bc_data$classes, p = 0.7, list = FALSE)
train_df <- bc_data[id, ]
test_df <- bc_data[-id, ]
# Thiết lập Cross - Validation và các thống kê đánh giá mô hình:
set.seed(1)
control <- trainControl(method = "repeatedcv",
number = 5,
repeats = 10,
classProbs = TRUE,
summaryFunction = multiClassSummary)
# Các giá trị ứng viên của k:
my_grid <- expand.grid(k = 1:9)
# Huấn luyện KNN Model:
set.seed(29)
knn_draff <- train(classes ~ .,
data = train,
method = "knn",
metric = "Accuracy",
tuneGrid = my_grid,
trControl = control)
knn_draff
## k-Nearest Neighbors
##
## 490 samples
## 9 predictor
## 2 classes: 'B', 'M'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 393, 392, 391, 392, 392, 392, ...
## Resampling results across tuning parameters:
##
## k logLoss AUC prAUC Accuracy Kappa F1
## 1 1.7793114 0.9396267 0.04792925 0.9461352 0.8796834 0.9592820
## 2 0.9541786 0.9694507 0.09798717 0.9445024 0.8760641 0.9580312
## 3 0.6472846 0.9790459 0.13150673 0.9498275 0.8890513 0.9616528
## 4 0.4234178 0.9845158 0.15646335 0.9500191 0.8894091 0.9618222
## 5 0.3844927 0.9853606 0.17833597 0.9545048 0.8995668 0.9651557
## 6 0.3812298 0.9851716 0.20380570 0.9532784 0.8967079 0.9642820
## 7 0.3702794 0.9852621 0.22216012 0.9510375 0.8919503 0.9624895
## 8 0.3400787 0.9852884 0.23908770 0.9524765 0.8950002 0.9636443
## 9 0.3019675 0.9858946 0.25605294 0.9522703 0.8944211 0.9635374
## Sensitivity Specificity Pos_Pred_Value Neg_Pred_Value Precision
## 0.9679423 0.9047415 0.9513019 0.9386102 0.9513019
## 0.9642019 0.9071658 0.9526561 0.9317082 0.9526561
## 0.9604663 0.9296613 0.9633696 0.9268333 0.9633696
## 0.9607596 0.9296613 0.9634259 0.9272542 0.9634259
## 0.9620144 0.9402674 0.9688781 0.9302919 0.9688781
## 0.9620096 0.9367736 0.9670676 0.9296986 0.9670676
## 0.9588942 0.9361141 0.9666385 0.9245120 0.9666385
## 0.9616971 0.9349911 0.9660214 0.9290496 0.9660214
## 0.9623221 0.9332086 0.9651404 0.9298148 0.9651404
## Recall Detection_Rate Balanced_Accuracy
## 0.9679423 0.6341049 0.9363419
## 0.9642019 0.6316516 0.9356838
## 0.9604663 0.6292089 0.9450638
## 0.9607596 0.6294005 0.9452105
## 0.9620144 0.6302231 0.9511409
## 0.9620096 0.6302210 0.9493916
## 0.9588942 0.6281801 0.9475042
## 0.9616971 0.6300169 0.9483441
## 0.9623221 0.6304251 0.9477653
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
# Plot nhanh:
plot(knn_draff, metric = "Accuracy")

plot(knn_draff, metric = "AUC")

# Khai thác caret object:
report1 <- knn_draff$resample %>% select_if(is.numeric)
report1 %>% dim()
## [1] 50 14
report1 %>% summary()
## logLoss AUC prAUC Accuracy
## Min. :0.03198 Min. :0.9414 Min. :0.1218 Min. :0.8980
## 1st Qu.:0.08465 1st Qu.:0.9797 1st Qu.:0.1483 1st Qu.:0.9388
## Median :0.41007 Median :0.9867 Median :0.1728 Median :0.9590
## Mean :0.38449 Mean :0.9854 Mean :0.1783 Mean :0.9545
## 3rd Qu.:0.44112 3rd Qu.:0.9930 3rd Qu.:0.2012 3rd Qu.:0.9694
## Max. :1.51165 Max. :1.0000 Max. :0.2879 Max. :1.0000
## Kappa F1 Sensitivity Specificity
## Min. :0.7717 Min. :0.9231 Min. :0.8750 Min. :0.8235
## 1st Qu.:0.8632 1st Qu.:0.9538 1st Qu.:0.9531 1st Qu.:0.9098
## Median :0.9070 Median :0.9683 Median :0.9688 Median :0.9554
## Mean :0.8996 Mean :0.9652 Mean :0.9620 Mean :0.9403
## 3rd Qu.:0.9330 3rd Qu.:0.9764 3rd Qu.:0.9806 3rd Qu.:0.9706
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Pos_Pred_Value Neg_Pred_Value Precision Recall
## Min. :0.9091 Min. :0.8049 Min. :0.9091 Min. :0.8750
## 1st Qu.:0.9531 1st Qu.:0.9118 1st Qu.:0.9531 1st Qu.:0.9531
## Median :0.9761 Median :0.9384 Median :0.9761 Median :0.9688
## Mean :0.9689 Mean :0.9303 Mean :0.9689 Mean :0.9620
## 3rd Qu.:0.9844 3rd Qu.:0.9611 3rd Qu.:0.9844 3rd Qu.:0.9806
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Detection_Rate Balanced_Accuracy
## Min. :0.5714 Min. :0.8805
## 1st Qu.:0.6224 1st Qu.:0.9312
## Median :0.6327 Median :0.9511
## Mean :0.6302 Mean :0.9511
## 3rd Qu.:0.6429 3rd Qu.:0.9698
## Max. :0.6598 Max. :1.0000
report2 <- knn_draff$results
# Vẽ theo kiểu của chúng ta:
report2 %>%
ggplot(aes(k, Accuracy)) +
geom_line() +
geom_point()

report2 %>%
select(k, AUC, Accuracy, Kappa, Sensitivity, Specificity, Recall) %>%
gather(Metric, Value, -k) %>%
ggplot(aes(k, Value)) +
geom_line() +
geom_point() +
facet_wrap(~ Metric, scales = "free") +
scale_x_continuous(breaks = seq(1, 9, by = 1))

# Sử dụng mô hinh cho dự báo:
pred <- predict(knn_draff, test %>% select(-classes))
pred %>% head()
## [1] B M M B B M
## Levels: B M
confusionMatrix(pred, test$classes, positive = "M")
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 54 0
## M 1 71
##
## Accuracy : 0.9921
## 95% CI : (0.9566, 0.9998)
## No Information Rate : 0.5635
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9838
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 1.0000
## Specificity : 0.9818
## Pos Pred Value : 0.9861
## Neg Pred Value : 1.0000
## Prevalence : 0.5635
## Detection Rate : 0.5635
## Detection Prevalence : 0.5714
## Balanced Accuracy : 0.9909
##
## 'Positive' Class : M
##