Here are the steps for assessing variable importance in a logistic regression analysis.
setwd("C:/MyRData/Logistic Regression Model Variable Importance")
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.4.2 v purrr 1.0.1
## v tibble 3.2.1 v dplyr 1.1.2
## v tidyr 1.3.0 v stringr 1.5.0
## v readr 2.1.3 v forcats 0.5.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
data <- read.csv("Medicaid1986.csv", fileEncoding = "UTF-8-BOM")
data %>% glimpse()
## Rows: 996
## Columns: 15
## $ rownames <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1~
## $ visits <int> 0, 1, 0, 0, 11, 3, 0, 6, 1, 0, 1, 0, 15, 5, 3, 2, 0, 1, 0, 0~
## $ exposure <int> 100, 90, 106, 114, 115, 102, 92, 92, 117, 101, 90, 111, 91, ~
## $ children <int> 1, 3, 4, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, ~
## $ age <int> 24, 19, 17, 29, 26, 22, 24, 21, 21, 24, 22, 20, 34, 21, 34, ~
## $ income <dbl> 14.500, 6.000, 8.377, 6.000, 8.500, 6.000, 4.000, 6.000, 6.0~
## $ health1 <dbl> 0.495, 0.520, -1.227, -1.524, 0.173, -0.905, -1.202, 0.656, ~
## $ health2 <dbl> -0.854, -0.969, 0.317, 0.457, -0.599, 0.062, 0.202, -0.981, ~
## $ access <dbl> 0.50, 0.17, 0.42, 0.33, 0.67, 0.25, 0.50, 0.67, 0.25, 0.67, ~
## $ married <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no", "yes"~
## $ gender <chr> "female", "female", "female", "female", "female", "female", ~
## $ ethnicity <chr> "cauc", "cauc", "cauc", "cauc", "cauc", "other", "cauc", "ca~
## $ school <int> 13, 11, 12, 12, 16, 12, 11, 11, 12, 15, 12, 12, 13, 9, 18, 1~
## $ program <chr> "afdc", "afdc", "afdc", "afdc", "afdc", "afdc", "afdc", "afd~
## $ enroll <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
data <- data %>%
  mutate_if(is.character, as.factor)
data %>% glimpse()
## Rows: 996
## Columns: 15
## $ rownames <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1~
## $ visits <int> 0, 1, 0, 0, 11, 3, 0, 6, 1, 0, 1, 0, 15, 5, 3, 2, 0, 1, 0, 0~
## $ exposure <int> 100, 90, 106, 114, 115, 102, 92, 92, 117, 101, 90, 111, 91, ~
## $ children <int> 1, 3, 4, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, ~
## $ age <int> 24, 19, 17, 29, 26, 22, 24, 21, 21, 24, 22, 20, 34, 21, 34, ~
## $ income <dbl> 14.500, 6.000, 8.377, 6.000, 8.500, 6.000, 4.000, 6.000, 6.0~
## $ health1 <dbl> 0.495, 0.520, -1.227, -1.524, 0.173, -0.905, -1.202, 0.656, ~
## $ health2 <dbl> -0.854, -0.969, 0.317, 0.457, -0.599, 0.062, 0.202, -0.981, ~
## $ access <dbl> 0.50, 0.17, 0.42, 0.33, 0.67, 0.25, 0.50, 0.67, 0.25, 0.67, ~
## $ married <fct> no, no, no, no, no, no, no, yes, no, yes, no, no, no, no, no~
## $ gender <fct> female, female, female, female, female, female, female, fema~
## $ ethnicity <fct> cauc, cauc, cauc, cauc, cauc, other, cauc, cauc, cauc, cauc,~
## $ school <int> 13, 11, 12, 12, 16, 12, 11, 11, 12, 15, 12, 12, 13, 9, 18, 1~
## $ program <fct> afdc, afdc, afdc, afdc, afdc, afdc, afdc, afdc, afdc, afdc, ~
## $ enroll <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
data <- na.omit(data) #removing NA values
data.df <- data %>%
  select(-rownames, -health1, -health2, -exposure, -access)
data.df %>% glimpse()
## Rows: 996
## Columns: 10
## $ visits <int> 0, 1, 0, 0, 11, 3, 0, 6, 1, 0, 1, 0, 15, 5, 3, 2, 0, 1, 0, 0~
## $ children <int> 1, 3, 4, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, ~
## $ age <int> 24, 19, 17, 29, 26, 22, 24, 21, 21, 24, 22, 20, 34, 21, 34, ~
## $ income <dbl> 14.500, 6.000, 8.377, 6.000, 8.500, 6.000, 4.000, 6.000, 6.0~
## $ married <fct> no, no, no, no, no, no, no, yes, no, yes, no, no, no, no, no~
## $ gender <fct> female, female, female, female, female, female, female, fema~
## $ ethnicity <fct> cauc, cauc, cauc, cauc, cauc, other, cauc, cauc, cauc, cauc,~
## $ school <int> 13, 11, 12, 12, 16, 12, 11, 11, 12, 15, 12, 12, 13, 9, 18, 1~
## $ program <fct> afdc, afdc, afdc, afdc, afdc, afdc, afdc, afdc, afdc, afdc, ~
## $ enroll <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
data.df$enroll <- as.factor(data.df$enroll)
# View dataset structure
str(data.df)
## 'data.frame': 996 obs. of 10 variables:
## $ visits : int 0 1 0 0 11 3 0 6 1 0 ...
## $ children : int 1 3 4 2 1 1 2 1 1 1 ...
## $ age : int 24 19 17 29 26 22 24 21 21 24 ...
## $ income : num 14.5 6 8.38 6 8.5 ...
## $ married : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 2 1 2 ...
## $ gender : Factor w/ 2 levels "female","male": 1 1 1 1 1 1 1 1 1 1 ...
## $ ethnicity: Factor w/ 2 levels "cauc","other": 1 1 1 1 1 2 1 1 1 1 ...
## $ school : int 13 11 12 12 16 12 11 11 12 15 ...
## $ program : Factor w/ 2 levels "afdc","ssi": 1 1 1 1 1 1 1 1 1 1 ...
## $ enroll : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
summary(data.df)
## visits children age income married
## Min. : 0.000 Min. :0.000 Min. : 16.00 Min. : 0.500 no :780
## 1st Qu.: 0.000 1st Qu.:0.000 1st Qu.: 29.00 1st Qu.: 6.000 yes:216
## Median : 1.000 Median :1.000 Median : 66.00 Median : 7.990
## Mean : 1.931 Mean :1.314 Mean : 55.21 Mean : 8.191
## 3rd Qu.: 3.000 3rd Qu.:2.000 3rd Qu.: 78.00 3rd Qu.: 8.500
## Max. :50.000 Max. :9.000 Max. :105.00 Max. :17.500
## gender ethnicity school program enroll
## female:843 cauc :691 Min. : 0.000 afdc:485 0:506
## male :153 other:305 1st Qu.: 6.000 ssi :511 1:490
## Median :10.000
## Mean : 9.029
## 3rd Qu.:12.000
## Max. :18.000
#make example reproducible
set.seed(1)
#Use 80% of dataset as training set and remaining 20% as testing set
sample <- sample(c(TRUE, FALSE), nrow(data.df), replace=TRUE, prob=c(0.8,0.2))
train <- data.df[sample, ]
test <- data.df[!sample, ]
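As an optional alternative (a sketch, not one of the original steps), caret::createDataPartition() draws a split that is stratified on the outcome, so the share of enrollees is similar in the training and testing sets. The object names idx, train_strat, and test_strat are illustrative.
#optional alternative: stratified 80/20 split (requires the {caret} package)
idx <- caret::createDataPartition(data.df$enroll, p = 0.8, list = FALSE)
train_strat <- data.df[idx, ]
test_strat <- data.df[-idx, ]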
#fit logistic regression model
model <- glm(enroll~., family="binomial", data=train)
#disable scientific notation for model summary
options(scipen=999)
#view model summary
summary(model)
##
## Call:
## glm(formula = enroll ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5133 -1.1654 -0.7826 1.1643 1.5164
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.880784 0.453035 -1.944 0.0519 .
## visits -0.028141 0.022821 -1.233 0.2175
## children -0.064019 0.068056 -0.941 0.3469
## age 0.010034 0.008899 1.128 0.2595
## income 0.042327 0.021534 1.966 0.0493 *
## marriedyes -0.147728 0.191791 -0.770 0.4411
## gendermale 0.013473 0.223582 0.060 0.9520
## ethnicityother 0.040286 0.166634 0.242 0.8090
## school 0.042901 0.019314 2.221 0.0263 *
## programssi -0.519103 0.454656 -1.142 0.2536
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1106.2 on 797 degrees of freedom
## Residual deviance: 1093.4 on 788 degrees of freedom
## AIC: 1113.4
##
## Number of Fisher Scoring iterations: 4
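Before moving on, it can help to read these coefficients on the odds-ratio scale. The short sketch below (an optional addition, not one of the original steps) exponentiates the estimates and their 95% confidence intervals.
#optional: express the coefficients as odds ratios with 95% confidence intervals
exp(cbind(OR = coef(model), confint(model)))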
pscl::pR2(model)["McFadden"]
## fitting null model for pseudo-r2
## McFadden
## 0.0116419
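A McFadden pseudo-R-squared of about 0.01 signals a very weak fit (values above roughly 0.2 are often described as a good fit). The statistic is 1 minus the ratio of the model's log-likelihood to that of an intercept-only model; a minimal sketch of that check, assuming the train data above:
#optional check: McFadden's pseudo R-squared computed by hand
null_model <- glm(enroll ~ 1, family = "binomial", data = train)
1 - as.numeric(logLik(model)) / as.numeric(logLik(null_model))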
Next, we check for multicollinearity by calculating the variance inflation factor (VIF) for each predictor. In this model, two variables (age and program) have a VIF above 5, which suggests multicollinearity.
#calculate VIF values for each predictor variable in our model
car::vif(model)
## visits children age income married gender ethnicity school
## 1.024421 2.008243 9.548457 1.189181 1.242147 1.243427 1.157217 1.379913
## program
## 10.143437
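One optional follow-up (a sketch, not one of the original steps) is to drop the predictor with the largest VIF, here program, refit the model, and confirm that the remaining VIF values fall below 5. The object name model_reduced is illustrative.
#optional follow-up: refit without the highest-VIF predictor and re-check VIF
model_reduced <- glm(enroll ~ . - program, family = "binomial", data = train)
car::vif(model_reduced)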
#use the fitted model to predict the probability of enrollment for the test set
predicted <- predict(model, test, type="response")
Note. We can find the probability cutoff that maximizes the accuracy of our model by using the optimalCutoff() function from the {InformationValue} package. Any individual with a predicted probability of enrolling of 0.5839173 or higher will be classified as enrolling.
library(InformationValue)
#find optimal cutoff probability to use to maximize accuracy
optimal <- optimalCutoff(test$enroll, predicted)[1]
optimal
## [1] 0.5839173
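With this cutoff in hand, the predicted probabilities can be turned into predicted classes; the short sketch below shows that step explicitly (the object name predicted_class is illustrative). Note that confusionMatrix(), sensitivity(), and specificity() from {InformationValue} use a default threshold of 0.5 unless a threshold argument such as threshold = optimal is supplied.
#classify each individual in the test set using the optimal cutoff
predicted_class <- ifelse(predicted >= optimal, 1, 0)
table(Actual = test$enroll, Predicted = predicted_class)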
#create the confusion matrix
confusionMatrix(test$enroll, predicted)
## 0 1
## 0 56 49
## 1 49 44
#calculate sensitivity
sensitivity(test$enroll, predicted)
## [1] 0.4731183
#calculate specificity
specificity(test$enroll, predicted)
## [1] 0.5333333
#calculate total misclassification error rate
misClassError(test$enroll, predicted, threshold=optimal)
## [1] 0.4545
We can see from the ROC curve below that the AUC is 0.5262, which is barely better than chance (0.5). This indicates that our model does a poor job of predicting whether or not an individual will enroll.
#plot the ROC curve
plotROC(test$enroll, predicted)
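If a numeric AUC is wanted alongside the plot, one option (a sketch using the {pROC} package, which is not loaded above) is:
#optional: compute the AUC numerically with {pROC}
library(pROC)
auc(roc(test$enroll, predicted))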
Although this is not a good model, we will still compute the importance of each predictor variable using the varImp() function from the {caret} package.
Higher 'Overall' values indicate greater importance.
The most important variables in predicting Medicaid enrollment are level of education (school) and income.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:InformationValue':
##
## confusionMatrix, precision, sensitivity, specificity
## The following object is masked from 'package:purrr':
##
## lift
modelImp <- varImp(model)
modelImp
## Overall
## visits 1.23311015
## children 0.94068391
## age 1.12750114
## income 1.96555749
## marriedyes 0.77025851
## gendermale 0.06025786
## ethnicityother 0.24176266
## school 2.22127609
## programssi 1.14174779
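For a glm, the 'Overall' scores from varImp() are simply the absolute values of the coefficients' z statistics from the model summary (compare income: 1.966, school: 2.221). A quick sketch to confirm that and to list the predictors from most to least important:
#confirm the importance scores and sort them from highest to lowest
abs(summary(model)$coefficients[-1, "z value"])
modelImp[order(-modelImp$Overall), , drop = FALSE]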
library(report)
report(model)
## We fitted a logistic model (estimated using ML) to predict enroll with visits,
## children, age, income, married, gender, ethnicity, school and program (formula:
## enroll ~ visits + children + age + income + married + gender + ethnicity +
## school + program). The model's explanatory power is very weak (Tjur's R2 =
## 0.02). The model's intercept, corresponding to visits = 0, children = 0, age =
## 0, income = 0, married = no, gender = female, ethnicity = cauc, school = 0 and
## program = afdc, is at -0.88 (95% CI [-1.77, 3.99e-03], p = 0.052). Within this
## model:
##
## - The effect of visits is statistically non-significant and negative (beta =
## -0.03, 95% CI [-0.08, 0.01], p = 0.218; Std. beta = -0.10, 95% CI [-0.26,
## 0.05])
## - The effect of children is statistically non-significant and negative (beta =
## -0.06, 95% CI [-0.20, 0.07], p = 0.347; Std. beta = -0.10, 95% CI [-0.29,
## 0.10])
## - The effect of age is statistically non-significant and positive (beta = 0.01,
## 95% CI [-7.38e-03, 0.03], p = 0.260; Std. beta = 0.25, 95% CI [-0.18, 0.68])
## - The effect of income is statistically significant and positive (beta = 0.04,
## 95% CI [3.65e-04, 0.08], p = 0.049; Std. beta = 0.15, 95% CI [1.33e-03, 0.31])
## - The effect of married [yes] is statistically non-significant and negative
## (beta = -0.15, 95% CI [-0.52, 0.23], p = 0.441; Std. beta = -0.15, 95% CI
## [-0.52, 0.23])
## - The effect of gender [male] is statistically non-significant and positive
## (beta = 0.01, 95% CI [-0.43, 0.45], p = 0.952; Std. beta = 0.01, 95% CI [-0.43,
## 0.45])
## - The effect of ethnicity [other] is statistically non-significant and positive
## (beta = 0.04, 95% CI [-0.29, 0.37], p = 0.809; Std. beta = 0.04, 95% CI [-0.29,
## 0.37])
## - The effect of school is statistically significant and positive (beta = 0.04,
## 95% CI [5.21e-03, 0.08], p = 0.026; Std. beta = 0.19, 95% CI [0.02, 0.35])
## - The effect of program [ssi] is statistically non-significant and negative
## (beta = -0.52, 95% CI [-1.41, 0.37], p = 0.254; Std. beta = -0.52, 95% CI
## [-1.41, 0.37])
##
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
Congratulations on completing all 22 steps for finding variable importance in a logistic regression model with the {caret} package!
If you are new to the R programming language, don’t give up. Your R skills will get better with time.
Note. The ‘Medicaid1986.csv’ file can be downloaded from: https://vincentarelbundock.github.io/Rdatasets/articles/data.html