Feature Selection for Logistic Classifier based on AIC Criterion
R for Pleasure
Nguyen Chi Dung
#===========================================================================================================
# Load some packages and perform data prep-rocessing
# Data downloaded from: https://www.kaggle.com/uciml/pima-indians-diabetes-database/version/1#diabetes.csv
#===========================================================================================================
# Clear workspace:
rm(list = ls())
# Load data:
library(tidyverse)
library(magrittr)
pima <- read_csv("/home/chidung/Desktop/python_learning/pima/diabetes.csv")
# Function converts zero to missing values:
convert_to_na <- function(x) {
x[x == 0] <- NA
return(x)
}
# Apply above function and convert Outcome to factor:
pima_missing <- pima %>%
select(-Pregnancies, -Outcome) %>%
mutate_all(convert_to_na) %>%
mutate(Pregnancies = pima$Pregnancies, Outcome = pima$Outcome) %>%
mutate(Outcome = as.factor(case_when(Outcome == 1 ~ "Yes", TRUE ~ "No")))
# Missing rate:
library(purrr)
pima_missing %>%
map(function(x) {100*sum(is.na(x)) / length(x)})
## $Glucose
## [1] 0.6510417
##
## $BloodPressure
## [1] 4.557292
##
## $SkinThickness
## [1] 29.55729
##
## $Insulin
## [1] 48.69792
##
## $BMI
## [1] 1.432292
##
## $DiabetesPedigreeFunction
## [1] 0
##
## $Age
## [1] 0
##
## $Pregnancies
## [1] 0
##
## $Outcome
## [1] 0
# Replace NA values by mean:
pima_missing %<>% mutate_if(is.numeric, function(x) {replace_na(x, mean(x, na.rm = TRUE))})
# Scale our data:
pima_scaled <- pima_missing %>%
mutate_if(is.numeric, function(x) {(x - min(x)) / (max(x) - min(x))})
#=========================================================================
# Feature Selection for Logistic Classifier based on AIC Criterion
#=========================================================================
#-------------------------------------------------------------------
# In case of you want to use stepAIC() function from MASS package
#------------------------------------------------------------------
# Logistic with all predictors:
full_variables_logistic <- glm(Outcome ~ ., family = binomial, data = pima_scaled)
# Feature selection based on AIC Criterion from stepAIC() function:
best_model <- MASS::stepAIC(full_variables_logistic, trace = TRUE)
## Start: AIC=731.3
## Outcome ~ Glucose + BloodPressure + SkinThickness + Insulin +
## BMI + DiabetesPedigreeFunction + Age + Pregnancies
##
## Df Deviance AIC
## - SkinThickness 1 713.37 729.37
## - Insulin 1 713.74 729.74
## - BloodPressure 1 714.36 730.36
## - Age 1 715.20 731.20
## <none> 713.30 731.30
## - DiabetesPedigreeFunction 1 722.05 738.05
## - Pregnancies 1 728.74 744.74
## - BMI 1 742.64 758.64
## - Glucose 1 830.73 846.73
##
## Step: AIC=729.37
## Outcome ~ Glucose + BloodPressure + Insulin + BMI + DiabetesPedigreeFunction +
## Age + Pregnancies
##
## Df Deviance AIC
## - Insulin 1 713.81 727.81
## - BloodPressure 1 714.43 728.43
## - Age 1 715.32 729.32
## <none> 713.37 729.37
## - DiabetesPedigreeFunction 1 722.15 736.15
## - Pregnancies 1 728.86 742.86
## - BMI 1 754.72 768.72
## - Glucose 1 831.05 845.05
##
## Step: AIC=727.81
## Outcome ~ Glucose + BloodPressure + BMI + DiabetesPedigreeFunction +
## Age + Pregnancies
##
## Df Deviance AIC
## - BloodPressure 1 714.78 726.78
## - Age 1 715.73 727.73
## <none> 713.81 727.81
## - DiabetesPedigreeFunction 1 722.48 734.48
## - Pregnancies 1 729.34 741.34
## - BMI 1 754.74 766.74
## - Glucose 1 845.77 857.77
##
## Step: AIC=726.78
## Outcome ~ Glucose + BMI + DiabetesPedigreeFunction + Age + Pregnancies
##
## Df Deviance AIC
## - Age 1 716.18 726.18
## <none> 714.78 726.78
## - DiabetesPedigreeFunction 1 723.73 733.73
## - Pregnancies 1 729.97 739.97
## - BMI 1 755.70 765.70
## - Glucose 1 845.79 855.79
##
## Step: AIC=726.18
## Outcome ~ Glucose + BMI + DiabetesPedigreeFunction + Pregnancies
##
## Df Deviance AIC
## <none> 716.18 726.18
## - DiabetesPedigreeFunction 1 725.35 733.35
## - Pregnancies 1 744.39 752.39
## - BMI 1 756.14 764.14
## - Glucose 1 860.82 868.82
# Show Results for best model:
best_model %>% summary()
##
## Call:
## glm(formula = Outcome ~ Glucose + BMI + DiabetesPedigreeFunction +
## Pregnancies, family = binomial, data = pima_scaled)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8232 -0.7242 -0.3992 0.7254 2.4353
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.8827 0.4249 -13.845 < 2e-16 ***
## Glucose 5.7233 0.5410 10.579 < 2e-16 ***
## BMI 4.3355 0.7202 6.020 1.75e-09 ***
## DiabetesPedigreeFunction 2.0664 0.6903 2.994 0.00276 **
## Pregnancies 2.4368 0.4682 5.204 1.95e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 716.18 on 763 degrees of freedom
## AIC: 726.18
##
## Number of Fisher Scoring iterations: 5
#--------------------------------------------------------------
# In case of you want to perform feature selection manually
#--------------------------------------------------------------
# Function list all combinations of predictors:
all_combinations <- function(your_predictors) {
n <- length(your_predictors)
map(1:n, function(x) {combn(variables, x)}) %>%
map(as.data.frame) -> k
all_vec <- c()
for (i in 1:n) {
df <- k[[i]]
n_col <- ncol(df)
for (j in 1:n_col) {
my_vec <- df[, j] %>% as.character() %>% list()
all_vec <- c(all_vec, my_vec)
}
}
return(all_vec)
}
# Number of all combinations:
variables <- pima_scaled %>% select(-Outcome) %>% names()
all_predictors <- variables %>% all_combinations()
all_predictors %>% length()
## [1] 255
# AICs for all models:
all_combinations(variables) %>%
map(function(x) {as.formula(paste("Outcome ~", paste(x, collapse = " + ")))}) %>%
map(function(formular) {glm(formular, family = binomial, data = pima_scaled)}) %>%
map_dbl("aic") -> aic_values
# Min AIC:
min(aic_values)
## [1] 726.1796
# Combination of predictors that results min AIC:
predictors_selected <- all_predictors[[which.min(aic_values)]]
predictors_selected
## [1] "Glucose" "BMI"
## [3] "DiabetesPedigreeFunction" "Pregnancies"
# All Results are identical as we can see:
pima_scaled %>%
glm(as.formula(paste("Outcome ~", paste(predictors_selected, collapse = " + "))), family = binomial, data = .) %>%
summary()
##
## Call:
## glm(formula = as.formula(paste("Outcome ~", paste(predictors_selected,
## collapse = " + "))), family = binomial, data = .)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8232 -0.7242 -0.3992 0.7254 2.4353
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.8827 0.4249 -13.845 < 2e-16 ***
## Glucose 5.7233 0.5410 10.579 < 2e-16 ***
## BMI 4.3355 0.7202 6.020 1.75e-09 ***
## DiabetesPedigreeFunction 2.0664 0.6903 2.994 0.00276 **
## Pregnancies 2.4368 0.4682 5.204 1.95e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 716.18 on 763 degrees of freedom
## AIC: 726.18
##
## Number of Fisher Scoring iterations: 5