Missing-Data Imputation and Using the map() Function to Compare Alternatives (diabetes.csv)

R for Pleasure

Nguyen Chi Dung

#=========================================================
#   Load some packages and perform data pre-processing
#=========================================================

# Load data: 
library(tidyverse)
library(magrittr)
pima <- read_csv("/home/chidung/Desktop/python_learning/pima/diabetes.csv")

# Function that converts zeros to missing values (NA): 
convert_to_na <- function(x) {
  x[x == 0] <- NA
  return(x)
}
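
# A quick sanity check on a toy vector (illustrative values only, not taken
# from the dataset): zeros become NA, other values are left unchanged.
convert_to_na(c(0, 120, 0, 85))   # NA 120 NA 85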

# Apply the function above and convert Outcome to a factor: 

pima_missing <- pima %>% 
  select(-Pregnancies, -Outcome) %>% 
  mutate_all(convert_to_na) %>% 
  mutate(Pregnancies = pima$Pregnancies, Outcome = pima$Outcome) %>% 
  mutate(Outcome = as.factor(case_when(Outcome == 1 ~ "Yes", TRUE ~ "No")))

# Missing rate: 
library(purrr)

pima_missing %>% 
  map_dbl(function(x) {100*sum(is.na(x)) / length(x)}) %>% 
  round(2)
##                  Glucose            BloodPressure            SkinThickness 
##                     0.65                     4.56                    29.56 
##                  Insulin                      BMI DiabetesPedigreeFunction 
##                    48.70                     1.43                     0.00 
##                      Age              Pregnancies                  Outcome 
##                     0.00                     0.00                     0.00
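# Optionally, mice's md.pattern() gives a quick overview of which combinations
# of variables are missing together (a sketch; its table/plot is not shown here):
library(mice)
md.pattern(pima_missing)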
# Use the mice package to impute missing values. Note that by default
# the mice() function produces five (m = 5) imputed data sets: 

library(mice)
pima_imputed <- mice(data = pima_missing, print = FALSE)
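
# The method slot of the returned mids object shows which univariate imputation
# method mice selected for each column (usually "pmm", predictive mean matching,
# for numeric variables). A quick look (sketch):
pima_imputed$method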

# Convert the imputed data to a single long-format data frame: 
all_df_imputed <- pima_imputed %>% 
  complete("long") %>% 
  select(-.id) %>% 
  rename(IMP = .imp) %>% 
  mutate(IMP = paste0("Inter", IMP))
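
# The long data frame should contain each of the five completed data sets once,
# i.e. 5 x 768 = 3840 rows. A quick check (sketch):
all_df_imputed %>% count(IMP)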


# Train a KNN model on each of the five imputed data sets and show the results: 
library(caret)

all_df_imputed %>% 
  split(.$IMP) %>% 
  map(function(df) {
    caret::train(Outcome ~ .,
                 data = df %>% select(-IMP),
                 method = "knn",
                 metric = "Accuracy", 
                 trControl = trainControl(method = "repeatedcv", 
                                          number = 5, 
                                          repeats = 10, 
                                          classProbs = TRUE, 
                                          summaryFunction = multiClassSummary, 
                                          allowParallel = TRUE))})
## $Inter1
## k-Nearest Neighbors 
## 
## 768 samples
##   8 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 614, 614, 615, 614, 615, 614, ... 
## Resampling results across tuning parameters:
## 
##   k  logLoss    AUC        prAUC      Accuracy   Kappa      F1       
##   5  1.7183885  0.7587655  0.5011685  0.7110670  0.3557825  0.7810776
##   7  1.1631262  0.7669039  0.5614104  0.7031067  0.3358275  0.7758512
##   9  0.9636735  0.7733832  0.6049174  0.7141762  0.3594629  0.7846972
##   Sensitivity  Specificity  Pos_Pred_Value  Neg_Pred_Value  Precision
##   0.7938       0.5567435    0.7697755       0.5947897       0.7697755
##   0.7912       0.5387561    0.7621811       0.5835338       0.7621811
##   0.8012       0.5518449    0.7697547       0.5999685       0.7697547
##   Recall  Detection_Rate  Balanced_Accuracy
##   0.7938  0.5167965       0.6752718        
##   0.7912  0.5150955       0.6649781        
##   0.8012  0.5216043       0.6765224        
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
## 
## $Inter2
## k-Nearest Neighbors 
## 
## 768 samples
##   8 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 614, 614, 615, 615, 614, 614, ... 
## Resampling results across tuning parameters:
## 
##   k  logLoss    AUC        prAUC      Accuracy   Kappa      F1       
##   5  1.6885520  0.7516231  0.5018885  0.6942755  0.3110327  0.7710541
##   7  1.2207235  0.7631487  0.5658469  0.7023512  0.3243804  0.7788882
##   9  0.9936626  0.7715920  0.6158858  0.7118462  0.3413209  0.7875447
##   Sensitivity  Specificity  Pos_Pred_Value  Neg_Pred_Value  Precision
##   0.7918       0.5123690    0.7524774       0.5705503       0.7524774
##   0.8060       0.5089797    0.7544080       0.5857499       0.7544080
##   0.8212       0.5078477    0.7574027       0.6053809       0.7574027
##   Recall  Detection_Rate  Balanced_Accuracy
##   0.7918  0.5154961       0.6520845        
##   0.8060  0.5247475       0.6574899        
##   0.8212  0.5346346       0.6645238        
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
## 
## $Inter3
## k-Nearest Neighbors 
## 
## 768 samples
##   8 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 614, 614, 614, 615, 615, 614, ... 
## Resampling results across tuning parameters:
## 
##   k  logLoss   AUC        prAUC      Accuracy   Kappa      F1       
##   5  1.838101  0.7550819  0.5062901  0.7199228  0.3636439  0.7921464
##   7  1.221218  0.7697977  0.5752214  0.7242136  0.3722233  0.7957260
##   9  1.030836  0.7782783  0.6217214  0.7333265  0.3949195  0.8017558
##   Sensitivity  Specificity  Pos_Pred_Value  Neg_Pred_Value  Precision
##   0.8208       0.5316771    0.7663060       0.6159808       0.7663060
##   0.8262       0.5339133    0.7684105       0.6249309       0.7684105
##   0.8288       0.5551922    0.7775590       0.6364070       0.7775590
##   Recall  Detection_Rate  Balanced_Accuracy
##   0.8208  0.5343842       0.6762386        
##   0.8262  0.5378941       0.6800567        
##   0.8288  0.5395841       0.6919961        
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
## 
## $Inter4
## k-Nearest Neighbors 
## 
## 768 samples
##   8 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 615, 614, 615, 614, 614, 615, ... 
## Resampling results across tuning parameters:
## 
##   k  logLoss   AUC        prAUC      Accuracy   Kappa      F1       
##   5  1.689816  0.7583132  0.5169414  0.7205713  0.3659158  0.7923841
##   7  1.299537  0.7651733  0.5815072  0.7194075  0.3624877  0.7917762
##   9  1.047743  0.7710775  0.6146097  0.7204635  0.3633619  0.7930665
##   Sensitivity  Specificity  Pos_Pred_Value  Neg_Pred_Value  Precision
##   0.8196       0.5357792    0.7676677       0.6153435       0.7676677
##   0.8208       0.5301328    0.7654461       0.6160214       0.7654461
##   0.8244       0.5264710    0.7648786       0.6201190       0.7648786
##   Recall  Detection_Rate  Balanced_Accuracy
##   0.8196  0.5336007       0.6776896        
##   0.8208  0.5343961       0.6754664        
##   0.8244  0.5367456       0.6754355        
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
## 
## $Inter5
## k-Nearest Neighbors 
## 
## 768 samples
##   8 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 10 times) 
## Summary of sample sizes: 614, 615, 615, 614, 614, 615, ... 
## Resampling results across tuning parameters:
## 
##   k  logLoss   AUC        prAUC      Accuracy   Kappa      F1       
##   5  1.814065  0.7393340  0.5067846  0.7045599  0.3256416  0.7817371
##   7  1.246406  0.7511268  0.5711721  0.7090994  0.3338116  0.7859094
##   9  0.986312  0.7647697  0.6189651  0.7170597  0.3525630  0.7915692
##   Sensitivity  Specificity  Pos_Pred_Value  Neg_Pred_Value  Precision
##   0.8132       0.5018379    0.7537623       0.5909299       0.7537623
##   0.8206       0.5009923    0.7549119       0.6004018       0.7549119
##   0.8256       0.5144794    0.7611879       0.6133326       0.7611879
##   Recall  Detection_Rate  Balanced_Accuracy
##   0.8132  0.5294364       0.6575189        
##   0.8206  0.5342484       0.6607962        
##   0.8256  0.5375146       0.6700397        
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
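# Reading five printed summaries side by side is tedious. If the list of models
# above is stored in an object (here called knn_models, a hypothetical name),
# the best resampled accuracy for each imputed data set can be pulled out with
# map_dbl(). A sketch:
# knn_models <- all_df_imputed %>% split(.$IMP) %>% map(...)   # as above
knn_models %>% 
  map_dbl(function(m) max(m$results$Accuracy)) %>% 
  round(4)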
#==================================================================
#   Compare results derived from the non-scaled and scaled data sets
#==================================================================

# Replace NA values with the column mean: 
pima_imputed_mean <- pima_missing %>% 
  mutate_if(is.numeric, function(x) {replace_na(x, mean(x, na.rm = TRUE))})
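
# Quick check that the mean imputation left no missing values (sketch):
anyNA(pima_imputed_mean)   # should be FALSE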

# Scale our data (min-max normalization to [0, 1]): 
pima_scaled <- pima_imputed_mean %>% 
  mutate_if(is.numeric, function(x) {(x - min(x)) / (max(x) - min(x))})
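
# After min-max scaling every numeric column should lie in [0, 1]. A quick
# check of the column maxima (sketch):
pima_scaled %>% 
  select_if(is.numeric) %>% 
  map_dbl(max)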

# Compare results from the non-scaled and scaled data sets: 

list(Imputed_Data = pima_imputed_mean, Scaled_Data = pima_scaled) %>% 
  map(function(df) {
    caret::train(Outcome ~ .,
                 data = df,
                 method = "knn",
                 metric = "Accuracy", 
                 trControl = trainControl(method = "repeatedcv", 
                                          number = 5, 
                                          repeats = 10, 
                                          classProbs = TRUE, 
                                          summaryFunction = multiClassSummary, 
                                          allowParallel = TRUE))}) -> list_for_comparing
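
# Before plotting, the two alternatives can also be compared numerically by the
# best resampled accuracy of each trained model (sketch):
list_for_comparing %>% 
  map_dbl(function(m) max(m$results$Accuracy))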


# Compare the effect of using scaled data on several model-performance criteria: 

theme_set(theme_minimal())
bind_rows(list_for_comparing$Imputed_Data$resample %>% mutate(Data = "Non-scaled"), 
          list_for_comparing$Scaled_Data$resample %>% mutate(Data = "Scaled")) %>% 
  select(AUC, Accuracy, Specificity, Sensitivity, Kappa, Recall, Data) %>% 
  gather(a, b, -Data) %>% 
  ggplot(aes(Data, b, color = Data, fill = Data)) + 
  geom_boxplot(alpha = 0.3, show.legend = FALSE) + 
  facet_wrap(~ a, scales = "free") + 
  scale_y_continuous(labels = scales::percent) + 
  labs(x = NULL, y = NULL, 
       title = "Model Performance: Effect of Using Scaled Data for Training KNN Model", 
       subtitle = "Data Used: https://www.kaggle.com/uciml/pima-indians-diabetes-database")