# Missing-Data Imputation and Using the map() Function to Compare Alternatives (diabetes.csv)
# R for Pleasure
# Nguyen Chi Dung
#=========================================================
# Load some packages and perform data pre-processing
#=========================================================
# Load data:
library(tidyverse)
library(magrittr)
# Read the Pima diabetes CSV into `pima`.
# NOTE(review): the path below is absolute and machine-specific; point it at
# your own copy of diabetes.csv before running the script elsewhere.
pima <- read_csv("/home/chidung/Desktop/python_learning/pima/diabetes.csv")
# Treat zeros as missing: return `x` with every element equal to 0 replaced
# by NA. Elements that are already NA, and all non-zero values, are kept as is.
convert_to_na <- function(x) {
  is_zero <- x == 0
  replace(x, is_zero, NA)
}
# Build the data set with explicit missing values:
#   * zeros become NA in every column except Pregnancies and Outcome;
#   * Pregnancies is re-attached unchanged from `pima`;
#   * Outcome is recoded as a factor ("Yes" when the original value is 1,
#     "No" otherwise).
# Both re-attached columns end up at the right-hand side of the data frame.
pima_missing <- pima %>%
  select(-Pregnancies, -Outcome) %>%
  mutate_all(convert_to_na) %>%
  mutate(
    Pregnancies = pima$Pregnancies,
    Outcome = as.factor(if_else(pima$Outcome == 1, "Yes", "No", missing = "No"))
  )
# Percentage of missing values in each column, rounded to two decimals:
library(purrr)
pima_missing %>%
  map_dbl(function(column) {
    n_missing <- sum(is.na(column))
    100 * n_missing / length(column)
  }) %>%
  round(digits = 2)
## Glucose BloodPressure SkinThickness
## 0.65 4.56 29.56
## Insulin BMI DiabetesPedigreeFunction
## 48.70 1.43 0.00
## Age Pregnancies Outcome
## 0.00 0.00 0.00
# Impute the missing values with the mice package.
#   * m = 5 is the mice() default and is written out so the number of imputed
#     data sets is visible at the call site.
#   * printFlag is spelled in full; the previous `print = FALSE` only worked
#     through partial argument matching.
#   * seed fixes the random number generator (mice passes it to set.seed()),
#     so the imputations are reproducible from run to run.
library(mice)
pima_imputed <- mice(data = pima_missing, m = 5, printFlag = FALSE, seed = 123)
# Stack all imputed data sets into one long data frame and label each row with
# the imputation it came from ("Inter1", "Inter2", ...).
# complete() is namespace-qualified because tidyr exports a function with the
# same name; without the prefix the result depends on which package was
# attached last.
all_df_imputed <- pima_imputed %>%
  mice::complete(action = "long") %>%
  select(-.id) %>%            # row identifier within each imputation is not needed
  rename(IMP = .imp) %>%      # imputation number
  mutate(IMP = paste0("Inter", IMP))
# Fit one KNN classifier per imputed data set (the data are split on IMP) and
# print the resulting list of caret fits:
library(caret)
all_df_imputed %>%
  split(.$IMP) %>%
  map(function(imputed_df) {
    # Resampling scheme: 5-fold cross-validation repeated 10 times, with
    # metrics computed by multiClassSummary on class probabilities.
    resampling <- trainControl(
      method = "repeatedcv",
      number = 5,
      repeats = 10,
      classProbs = TRUE,
      summaryFunction = multiClassSummary,
      allowParallel = TRUE
    )
    # IMP only identifies the imputation; it must not be used as a predictor.
    model_data <- select(imputed_df, -IMP)
    caret::train(
      Outcome ~ .,
      data = model_data,
      method = "knn",
      metric = "Accuracy",
      trControl = resampling
    )
  })
## $Inter1
## k-Nearest Neighbors
##
## 768 samples
## 8 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 614, 614, 615, 614, 615, 614, ...
## Resampling results across tuning parameters:
##
## k logLoss AUC prAUC Accuracy Kappa F1
## 5 1.7183885 0.7587655 0.5011685 0.7110670 0.3557825 0.7810776
## 7 1.1631262 0.7669039 0.5614104 0.7031067 0.3358275 0.7758512
## 9 0.9636735 0.7733832 0.6049174 0.7141762 0.3594629 0.7846972
## Sensitivity Specificity Pos_Pred_Value Neg_Pred_Value Precision
## 0.7938 0.5567435 0.7697755 0.5947897 0.7697755
## 0.7912 0.5387561 0.7621811 0.5835338 0.7621811
## 0.8012 0.5518449 0.7697547 0.5999685 0.7697547
## Recall Detection_Rate Balanced_Accuracy
## 0.7938 0.5167965 0.6752718
## 0.7912 0.5150955 0.6649781
## 0.8012 0.5216043 0.6765224
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
##
## $Inter2
## k-Nearest Neighbors
##
## 768 samples
## 8 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 614, 614, 615, 615, 614, 614, ...
## Resampling results across tuning parameters:
##
## k logLoss AUC prAUC Accuracy Kappa F1
## 5 1.6885520 0.7516231 0.5018885 0.6942755 0.3110327 0.7710541
## 7 1.2207235 0.7631487 0.5658469 0.7023512 0.3243804 0.7788882
## 9 0.9936626 0.7715920 0.6158858 0.7118462 0.3413209 0.7875447
## Sensitivity Specificity Pos_Pred_Value Neg_Pred_Value Precision
## 0.7918 0.5123690 0.7524774 0.5705503 0.7524774
## 0.8060 0.5089797 0.7544080 0.5857499 0.7544080
## 0.8212 0.5078477 0.7574027 0.6053809 0.7574027
## Recall Detection_Rate Balanced_Accuracy
## 0.7918 0.5154961 0.6520845
## 0.8060 0.5247475 0.6574899
## 0.8212 0.5346346 0.6645238
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
##
## $Inter3
## k-Nearest Neighbors
##
## 768 samples
## 8 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 614, 614, 614, 615, 615, 614, ...
## Resampling results across tuning parameters:
##
## k logLoss AUC prAUC Accuracy Kappa F1
## 5 1.838101 0.7550819 0.5062901 0.7199228 0.3636439 0.7921464
## 7 1.221218 0.7697977 0.5752214 0.7242136 0.3722233 0.7957260
## 9 1.030836 0.7782783 0.6217214 0.7333265 0.3949195 0.8017558
## Sensitivity Specificity Pos_Pred_Value Neg_Pred_Value Precision
## 0.8208 0.5316771 0.7663060 0.6159808 0.7663060
## 0.8262 0.5339133 0.7684105 0.6249309 0.7684105
## 0.8288 0.5551922 0.7775590 0.6364070 0.7775590
## Recall Detection_Rate Balanced_Accuracy
## 0.8208 0.5343842 0.6762386
## 0.8262 0.5378941 0.6800567
## 0.8288 0.5395841 0.6919961
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
##
## $Inter4
## k-Nearest Neighbors
##
## 768 samples
## 8 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 615, 614, 615, 614, 614, 615, ...
## Resampling results across tuning parameters:
##
## k logLoss AUC prAUC Accuracy Kappa F1
## 5 1.689816 0.7583132 0.5169414 0.7205713 0.3659158 0.7923841
## 7 1.299537 0.7651733 0.5815072 0.7194075 0.3624877 0.7917762
## 9 1.047743 0.7710775 0.6146097 0.7204635 0.3633619 0.7930665
## Sensitivity Specificity Pos_Pred_Value Neg_Pred_Value Precision
## 0.8196 0.5357792 0.7676677 0.6153435 0.7676677
## 0.8208 0.5301328 0.7654461 0.6160214 0.7654461
## 0.8244 0.5264710 0.7648786 0.6201190 0.7648786
## Recall Detection_Rate Balanced_Accuracy
## 0.8196 0.5336007 0.6776896
## 0.8208 0.5343961 0.6754664
## 0.8244 0.5367456 0.6754355
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
##
## $Inter5
## k-Nearest Neighbors
##
## 768 samples
## 8 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times)
## Summary of sample sizes: 614, 615, 615, 614, 614, 615, ...
## Resampling results across tuning parameters:
##
## k logLoss AUC prAUC Accuracy Kappa F1
## 5 1.814065 0.7393340 0.5067846 0.7045599 0.3256416 0.7817371
## 7 1.246406 0.7511268 0.5711721 0.7090994 0.3338116 0.7859094
## 9 0.986312 0.7647697 0.6189651 0.7170597 0.3525630 0.7915692
## Sensitivity Specificity Pos_Pred_Value Neg_Pred_Value Precision
## 0.8132 0.5018379 0.7537623 0.5909299 0.7537623
## 0.8206 0.5009923 0.7549119 0.6004018 0.7549119
## 0.8256 0.5144794 0.7611879 0.6133326 0.7611879
## Recall Detection_Rate Balanced_Accuracy
## 0.8132 0.5294364 0.6575189
## 0.8206 0.5342484 0.6607962
## 0.8256 0.5375146 0.6700397
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
#==================================================================
# Compare results derived from non-scaled and scaled data sets
#==================================================================
# Mean imputation: in every numeric column, fill the NAs with the mean of that
# column's observed (non-missing) values:
pima_imputed_mean <- pima_missing %>%
  mutate_if(is.numeric, function(column) {
    column_mean <- mean(column, na.rm = TRUE)
    replace_na(column, column_mean)
  })
# Min-max scaling: map every numeric column onto [0, 1] via
# (value - min) / (max - min):
pima_scaled <- pima_imputed_mean %>%
  mutate_if(is.numeric, function(column) {
    bounds <- range(column)   # c(min, max)
    (column - bounds[1]) / (bounds[2] - bounds[1])
  })
# Fit the same KNN model on the mean-imputed data and on its min-max scaled
# version; the two caret fits are kept in a named list for later comparison:
list_for_comparing <- list(
  Imputed_Data = pima_imputed_mean,
  Scaled_Data = pima_scaled
) %>%
  map(function(training_df) {
    # Resampling scheme: 5-fold cross-validation repeated 10 times, with
    # metrics computed by multiClassSummary on class probabilities.
    resampling <- trainControl(
      method = "repeatedcv",
      number = 5,
      repeats = 10,
      classProbs = TRUE,
      summaryFunction = multiClassSummary,
      allowParallel = TRUE
    )
    caret::train(
      Outcome ~ .,
      data = training_df,
      method = "knn",
      metric = "Accuracy",
      trControl = resampling
    )
  })
# Box plots of the per-resample performance metrics of both fits, one panel
# per metric, to show the effect of scaling on the KNN model:
theme_set(theme_minimal())
list(
  `Non-scaled` = list_for_comparing$Imputed_Data$resample,
  Scaled = list_for_comparing$Scaled_Data$resample
) %>%
  bind_rows(.id = "Data") %>%   # list names become the Data label
  select(AUC, Accuracy, Specificity, Sensitivity, Kappa, Recall, Data) %>%
  gather(key = a, value = b, -Data) %>%   # a = metric name, b = metric value
  ggplot(aes(x = Data, y = b, color = Data, fill = Data)) +
  geom_boxplot(alpha = 0.3, show.legend = FALSE) +
  facet_wrap(~ a, scales = "free") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = NULL,
    y = NULL,
    title = "Model Performance: Effect of Using Scaled Data for Training KNN Model",
    subtitle = "Data Used: https://www.kaggle.com/uciml/pima-indians-diabetes-database"
  )
