library(tidyverse)
admission <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
head(admission)
## admit gre gpa rank
## 1 0 380 3.61 3
## 2 1 660 3.67 3
## 3 1 800 4.00 1
## 4 1 640 3.19 4
## 5 0 520 2.93 4
## 6 1 760 3.00 2
glm1 <- glm(formula = admit ~ gre + gpa + rank,
family = binomial(link = "logit"),
data = admission)
summary(glm1)
##
## Call:
## glm(formula = admit ~ gre + gpa + rank, family = binomial(link = "logit"),
## data = admission)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5802 -0.8848 -0.6382 1.1575 2.1732
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.449548 1.132846 -3.045 0.00233 **
## gre 0.002294 0.001092 2.101 0.03564 *
## gpa 0.777014 0.327484 2.373 0.01766 *
## rank -0.560031 0.127137 -4.405 0.0000106 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 499.98 on 399 degrees of freedom
## Residual deviance: 459.44 on 396 degrees of freedom
## AIC: 467.44
##
## Number of Fisher Scoring iterations: 4
If linearity is reasonable, the plot should look approximately linear. It does not look so linear.
## Create quartiles of GRE scores
admission$greQ <- cut(admission$gre,
breaks = quantile(admission$gre),
labels = c("Q1","Q2","Q3","Q4"),
right = FALSE, include.lowest = TRUE)
summary(admission$greQ)
## Q1 Q2 Q3 Q4
## 99 75 103 123
## Obtain the mid-point values for each category
midpoint_values <- rowMeans(cbind(head(quantile(admission$gre),-1),
tail(quantile(admission$gre),-1)))
## Refit the model with categorical variable
glm2 <- glm(formula = admit ~ greQ + gpa + rank,
family = binomial(link = "logit"),
data = admission)
summary(glm2)
##
## Call:
## glm(formula = admit ~ greQ + gpa + rank, family = binomial(link = "logit"),
## data = admission)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5296 -0.8759 -0.6321 1.1424 2.1914
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.6796 1.1241 -2.384 0.0171 *
## greQQ2 0.6491 0.3709 1.750 0.0801 .
## greQQ3 0.3678 0.3559 1.033 0.3014
## greQQ4 0.7987 0.3450 2.315 0.0206 *
## gpa 0.8099 0.3278 2.471 0.0135 *
## rank -0.5607 0.1275 -4.397 0.000011 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 499.98 on 399 degrees of freedom
## Residual deviance: 457.49 on 394 degrees of freedom
## AIC: 469.49
##
## Number of Fisher Scoring iterations: 4
## Extract log odds ratios for Q2,Q3,Q4 (add 0 for Q1 reference)
log_odds_ratios <- c(0,coef(glm2)[Filter(f = function(elt) {grepl("gre", elt)}, x = names(coef(glm2)))])
data_plot <- data.frame(midpoint_values = midpoint_values,
log_odds_ratios = log_odds_ratios)
## Plot
ggplot(data = data_plot, mapping = aes(x = midpoint_values, y = log_odds_ratios)) +
geom_line() +
geom_point() +
theme_bw() + theme(legend.key = element_blank())
We could use GAM to see if more flexible relationship with the continuous variable and the log odds of the outcome probability is reasonable. In this case, the algorithm decided a linear term is enough.
library(mgcv)
gam1 <- gam(formula = admit ~ s(gre) + gpa + rank,
family = binomial(link = "logit"),
data = admission)
summary(gam1)
##
## Family: binomial
## Link function: logit
##
## Formula:
## admit ~ s(gre) + gpa + rank
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.1014 1.1508 -1.826 0.0679 .
## gpa 0.7770 0.3275 2.373 0.0177 *
## rank -0.5600 0.1271 -4.405 0.0000106 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(gre) 1 1 4.414 0.0356 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0924 Deviance explained = 8.11%
## UBRE = 0.1686 Scale est. = 1 n = 400
plot(gam1)