Logistic Regression Example

Problem Statement

This analysis uses a random sample of UCM’s Fall 2011 admitted first-time freshmen. The goal of this analysis is to identify the factors that explain Enrollment in the Fall 2011 term.

Load the Data

library(tidyverse)

# Read the raw admitted-student sample from the workshop folder.
admits <- read_csv("C:/Users/pattiz/Desktop/MidAir Workshop/admits_data.csv")

# Build the analysis data frame: give the two columns whose names contain a
# space syntactic names, then store INSTITUTION_RANK as a factor so it is
# treated as categorical rather than numeric.
admits_DF <- mutate(
  rename(admits, HS_GPA = `HS GPA`, ACT_COMP = `ACT COMP`),
  INSTITUTION_RANK = as.factor(INSTITUTION_RANK)
)

Explore the data

# Per-column summaries; the NA's rows show which variables have missing values.
admits_DF %>% summary()

##    COUNTY_IND       FIRST_GEN         ACT_COMP         LEGACY      
##  Min.   :0.0000   Min.   :0.0000   Min.   :13.00   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:19.00   1st Qu.:0.0000  
##  Median :0.0000   Median :1.0000   Median :22.00   Median :0.0000  
##  Mean   :0.0704   Mean   :0.5188   Mean   :21.98   Mean   :0.1164  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:24.00   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :34.00   Max.   :1.0000  
##                                    NA's   :41                      
##      HS_GPA      SCHOLARSHIP_IND     PELL_IND      INSTITUTION_RANK
##  Min.   :1.300   Min.   :0.0000   Min.   :0.0000   1   :1988       
##  1st Qu.:2.930   1st Qu.:0.0000   1st Qu.:0.0000   2   :  48       
##  Median :3.360   Median :1.0000   Median :0.0000   3   :  12       
##  Mean   :3.305   Mean   :0.5124   Mean   :0.3356   4   : 402       
##  3rd Qu.:3.730   3rd Qu.:1.0000   3rd Qu.:1.0000   NA's:  50       
##  Max.   :4.930   Max.   :1.0000   Max.   :1.0000                   
##  NA's   :23                                                        
##     ENROLLED    
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.462  
##  3rd Qu.:1.000  
##  Max.   :1.000  
##

Estimate a Logistic Regression Model

# Model the binary ENROLLED indicator with a logistic regression on every
# candidate predictor. INSTITUTION_RANK is a factor, so it enters as one
# dummy term per non-reference level.
admit_model <- glm(
  formula = ENROLLED ~ COUNTY_IND + FIRST_GEN + LEGACY + HS_GPA +
    ACT_COMP + SCHOLARSHIP_IND + PELL_IND + INSTITUTION_RANK,
  family = "binomial",
  data = admits_DF
)

# List the components stored on the fitted model object.
attributes(admit_model)

## $names
##  [1] "coefficients"      "residuals"         "fitted.values"    
##  [4] "effects"           "R"                 "rank"             
##  [7] "qr"                "family"            "linear.predictors"
## [10] "deviance"          "aic"               "null.deviance"    
## [13] "iter"              "weights"           "prior.weights"    
## [16] "df.residual"       "df.null"           "y"                
## [19] "converged"         "boundary"          "model"            
## [22] "na.action"         "call"              "formula"          
## [25] "terms"             "data"              "offset"           
## [28] "control"           "method"            "contrasts"        
## [31] "xlevels"          
## 
## $class
## [1] "glm" "lm"

# Coefficient table, deviances and AIC for the fitted model.
admit_model %>% summary()

## 
## Call:
## glm(formula = ENROLLED ~ COUNTY_IND + FIRST_GEN + LEGACY + HS_GPA + 
##     ACT_COMP + SCHOLARSHIP_IND + PELL_IND + INSTITUTION_RANK, 
##     family = "binomial", data = admits_DF)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1029  -1.1450  -0.1823   1.0455   2.8382  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -0.02635    0.45072  -0.058   0.9534    
## COUNTY_IND         0.96993    0.21135   4.589 4.45e-06 ***
## FIRST_GEN          0.10161    0.10323   0.984   0.3250    
## LEGACY             0.42962    0.15985   2.688   0.0072 ** 
## HS_GPA            -0.17678    0.11114  -1.591   0.1117    
## ACT_COMP           0.02935    0.01660   1.768   0.0771 .  
## SCHOLARSHIP_IND   -0.22993    0.12894  -1.783   0.0746 .  
## PELL_IND           0.48084    0.10207   4.711 2.47e-06 ***
## INSTITUTION_RANK2 -1.95674    0.41688  -4.694 2.68e-06 ***
## INSTITUTION_RANK3 -1.78805    0.78183  -2.287   0.0222 *  
## INSTITUTION_RANK4 -3.94452    0.34152 -11.550  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3317.9  on 2403  degrees of freedom
## Residual deviance: 2724.7  on 2393  degrees of freedom
##   (96 observations deleted due to missingness)
## AIC: 2746.7
## 
## Number of Fisher Scoring iterations: 6

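To evaluate the overall performance of the fitted model compared to the null (intercept-only) model, look at the null deviance and residual deviance. Under the null hypothesis that every slope coefficient is zero, the difference between the two deviances (3317.9 − 2724.7 ≈ 593) approximately follows a \(\chi^2\) distribution whose degrees of freedom equal the difference in residual degrees of freedom (2403 − 2393 = 10), i.e. a \(\chi^2_{10}\) distribution.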

# Likelihood-ratio test of the fitted model against the intercept-only model.
# The test statistic and its degrees of freedom are read from the fitted
# object instead of being retyped from the printed summary, so they stay in
# step with the model.
# The upper-tail probability is far below machine precision, so 1 - pchisq()
# prints exactly 0; pchisq(..., lower.tail = FALSE) would show the tiny
# non-zero value instead.
with(
  admit_model,
  1 - pchisq(null.deviance - deviance, df = df.null - df.residual)
)

## [1] 0

The p-value is effectively zero, so the null hypothesis that none of the explanatory variables is related to Enrollment is rejected.

Coefficients

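Any odds ratio larger than 1 shows a positive association between Enrollment and the explanatory variable; an odds ratio smaller than 1 shows a negative association. When the confidence interval contains 1, the association is not statistically distinguishable from no effect.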

# Put the coefficient estimates next to their confidence limits (the 2.5 % and
# 97.5 % columns), then exponentiate the whole table to move from the
# log-odds scale to odds ratios.
cbind(ODDS_RATIO = coef(admit_model), confint(admit_model)) %>%
  exp()

##                   ODDS_RATIO       2.5 %     97.5 %
## (Intercept)       0.97399199 0.401996523 2.35463244
## COUNTY_IND        2.63776627 1.762376850 4.04482355
## FIRST_GEN         1.10695625 0.904061796 1.35515180
## LEGACY            1.53667282 1.125082747 2.10650378
## HS_GPA            0.83796682 0.673565750 1.04155020
## ACT_COMP          1.02978818 0.996875869 1.06394546
## SCHOLARSHIP_IND   0.79459210 0.616818496 1.02275296
## PELL_IND          1.61743283 1.324877028 1.97695121
## INSTITUTION_RANK2 0.14131871 0.057218524 0.30054636
## INSTITUTION_RANK3 0.16728597 0.025473022 0.64573613
## INSTITUTION_RANK4 0.01936053 0.009183549 0.03558603