Feature Selection for Logistic Classifier based on AIC Criterion

R for Pleasure

Nguyen Chi Dung

#===========================================================================================================
#   Load some packages and perform data prep-rocessing
#   Data downloaded from: https://www.kaggle.com/uciml/pima-indians-diabetes-database/version/1#diabetes.csv
#===========================================================================================================

# Clear workspace: 
rm(list = ls())

# Load data: 
library(tidyverse)
library(magrittr)
pima <- read_csv("/home/chidung/Desktop/python_learning/pima/diabetes.csv")

# Function converts zero to missing values: 
convert_to_na <- function(x) {
  x[x == 0] <- NA
  return(x)
}

# Apply above function and convert Outcome to factor: 

pima_missing <- pima %>% 
  select(-Pregnancies, -Outcome) %>% 
  mutate_all(convert_to_na) %>% 
  mutate(Pregnancies = pima$Pregnancies, Outcome = pima$Outcome) %>% 
  mutate(Outcome = as.factor(case_when(Outcome == 1 ~ "Yes", TRUE ~ "No")))

# Missing rate: 
library(purrr)

pima_missing %>% 
  map(function(x) {100*sum(is.na(x)) / length(x)})
## $Glucose
## [1] 0.6510417
## 
## $BloodPressure
## [1] 4.557292
## 
## $SkinThickness
## [1] 29.55729
## 
## $Insulin
## [1] 48.69792
## 
## $BMI
## [1] 1.432292
## 
## $DiabetesPedigreeFunction
## [1] 0
## 
## $Age
## [1] 0
## 
## $Pregnancies
## [1] 0
## 
## $Outcome
## [1] 0
# Replace NA values by mean: 
pima_missing %<>% mutate_if(is.numeric, function(x) {replace_na(x, mean(x, na.rm = TRUE))})

# Scale our data: 
pima_scaled <- pima_missing %>% 
  mutate_if(is.numeric, function(x) {(x - min(x)) / (max(x) - min(x))})

#=========================================================================
#   Feature Selection for Logistic Classifier based on AIC Criterion
#=========================================================================

#-------------------------------------------------------------------
#  In case of you want to use stepAIC() function from MASS package
#------------------------------------------------------------------

# Logistic with all predictors: 

full_variables_logistic <- glm(Outcome ~ ., family = binomial, data = pima_scaled)

# Feature selection based on AIC Criterion from stepAIC() function: 
best_model <- MASS::stepAIC(full_variables_logistic, trace = TRUE) 
## Start:  AIC=731.3
## Outcome ~ Glucose + BloodPressure + SkinThickness + Insulin + 
##     BMI + DiabetesPedigreeFunction + Age + Pregnancies
## 
##                            Df Deviance    AIC
## - SkinThickness             1   713.37 729.37
## - Insulin                   1   713.74 729.74
## - BloodPressure             1   714.36 730.36
## - Age                       1   715.20 731.20
## <none>                          713.30 731.30
## - DiabetesPedigreeFunction  1   722.05 738.05
## - Pregnancies               1   728.74 744.74
## - BMI                       1   742.64 758.64
## - Glucose                   1   830.73 846.73
## 
## Step:  AIC=729.37
## Outcome ~ Glucose + BloodPressure + Insulin + BMI + DiabetesPedigreeFunction + 
##     Age + Pregnancies
## 
##                            Df Deviance    AIC
## - Insulin                   1   713.81 727.81
## - BloodPressure             1   714.43 728.43
## - Age                       1   715.32 729.32
## <none>                          713.37 729.37
## - DiabetesPedigreeFunction  1   722.15 736.15
## - Pregnancies               1   728.86 742.86
## - BMI                       1   754.72 768.72
## - Glucose                   1   831.05 845.05
## 
## Step:  AIC=727.81
## Outcome ~ Glucose + BloodPressure + BMI + DiabetesPedigreeFunction + 
##     Age + Pregnancies
## 
##                            Df Deviance    AIC
## - BloodPressure             1   714.78 726.78
## - Age                       1   715.73 727.73
## <none>                          713.81 727.81
## - DiabetesPedigreeFunction  1   722.48 734.48
## - Pregnancies               1   729.34 741.34
## - BMI                       1   754.74 766.74
## - Glucose                   1   845.77 857.77
## 
## Step:  AIC=726.78
## Outcome ~ Glucose + BMI + DiabetesPedigreeFunction + Age + Pregnancies
## 
##                            Df Deviance    AIC
## - Age                       1   716.18 726.18
## <none>                          714.78 726.78
## - DiabetesPedigreeFunction  1   723.73 733.73
## - Pregnancies               1   729.97 739.97
## - BMI                       1   755.70 765.70
## - Glucose                   1   845.79 855.79
## 
## Step:  AIC=726.18
## Outcome ~ Glucose + BMI + DiabetesPedigreeFunction + Pregnancies
## 
##                            Df Deviance    AIC
## <none>                          716.18 726.18
## - DiabetesPedigreeFunction  1   725.35 733.35
## - Pregnancies               1   744.39 752.39
## - BMI                       1   756.14 764.14
## - Glucose                   1   860.82 868.82
# Show Results for best model: 
best_model %>% summary()
## 
## Call:
## glm(formula = Outcome ~ Glucose + BMI + DiabetesPedigreeFunction + 
##     Pregnancies, family = binomial, data = pima_scaled)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8232  -0.7242  -0.3992   0.7254   2.4353  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -5.8827     0.4249 -13.845  < 2e-16 ***
## Glucose                    5.7233     0.5410  10.579  < 2e-16 ***
## BMI                        4.3355     0.7202   6.020 1.75e-09 ***
## DiabetesPedigreeFunction   2.0664     0.6903   2.994  0.00276 ** 
## Pregnancies                2.4368     0.4682   5.204 1.95e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 716.18  on 763  degrees of freedom
## AIC: 726.18
## 
## Number of Fisher Scoring iterations: 5
#--------------------------------------------------------------
#  In case of you want to perform feature selection manually
#--------------------------------------------------------------

# Function list all combinations of predictors: 

all_combinations <- function(your_predictors) {

  n <- length(your_predictors)
  map(1:n, function(x) {combn(variables, x)}) %>%  
    map(as.data.frame) -> k
  
  all_vec <- c()
  
  for (i in 1:n) {
    df <- k[[i]]
    n_col <- ncol(df)
    
    for (j in 1:n_col) {
      my_vec <- df[, j] %>% as.character() %>% list()
      all_vec <- c(all_vec, my_vec)
    }
  }
  
  return(all_vec)
  
}

# Number of all combinations: 
variables <- pima_scaled %>% select(-Outcome) %>% names()  
all_predictors <- variables %>% all_combinations()
all_predictors %>% length() 
## [1] 255
# AICs for all models: 

all_combinations(variables) %>% 
  map(function(x) {as.formula(paste("Outcome ~", paste(x, collapse = " + ")))}) %>% 
  map(function(formular) {glm(formular, family = binomial, data = pima_scaled)}) %>% 
  map_dbl("aic") -> aic_values 

# Min AIC: 
min(aic_values)
## [1] 726.1796
# Combination of predictors that results min AIC: 
predictors_selected <- all_predictors[[which.min(aic_values)]] 
predictors_selected
## [1] "Glucose"                  "BMI"                     
## [3] "DiabetesPedigreeFunction" "Pregnancies"
# All Results are identical as we can see: 
pima_scaled %>% 
  glm(as.formula(paste("Outcome ~", paste(predictors_selected, collapse = " + "))), family = binomial, data = .) %>% 
  summary()
## 
## Call:
## glm(formula = as.formula(paste("Outcome ~", paste(predictors_selected, 
##     collapse = " + "))), family = binomial, data = .)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8232  -0.7242  -0.3992   0.7254   2.4353  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -5.8827     0.4249 -13.845  < 2e-16 ***
## Glucose                    5.7233     0.5410  10.579  < 2e-16 ***
## BMI                        4.3355     0.7202   6.020 1.75e-09 ***
## DiabetesPedigreeFunction   2.0664     0.6903   2.994  0.00276 ** 
## Pregnancies                2.4368     0.4682   5.204 1.95e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 716.18  on 763  degrees of freedom
## AIC: 726.18
## 
## Number of Fisher Scoring iterations: 5