knitr::opts_chunk$set(echo = TRUE)
#pacman::p_load(readxl, tidyverse, DescTools, gtools, car, nnet, ResourceSelection, brant, VGAM)
pacman::p_load(tidyverse, VGAM)
## Data
grad <- readxl::read_excel("GradSchool.xlsx")
xtabs() will quickly create a set of conditional tables
we can summarize the categorical variables!
xtabs(
formula = ~ par_ed + apply + uni,
data = grad
) |>
addmargins()
## , , uni = 0
##
## apply
## par_ed somewhat likely unlikely very likely Sum
## 0 12 25 7 44
## 1 4 6 3 13
## Sum 16 31 10 57
##
## , , uni = 1
##
## apply
## par_ed somewhat likely unlikely very likely Sum
## 0 98 175 20 293
## 1 26 14 10 50
## Sum 124 189 30 343
##
## , , uni = Sum
##
## apply
## par_ed somewhat likely unlikely very likely Sum
## 0 110 200 27 337
## 1 30 20 13 63
## Sum 140 220 40 400
It’s best to convert the response, apply to an ordered
factor using
factor(apply, ordered = T, levels = c(...))
grad <-
grad |>
mutate(
apply = factor(apply,
levels = c('unlikely', 'somewhat likely', 'very likely'),
ordered = T,
labels = c('not', 'somewhat', 'very')), #changing the labels
# Changing par_ed and public to factors as well
par_ed1 = factor(par_ed, labels = c("no parent","1+ parent")),
uni1 = factor(uni, labels = c("private","public"))
)
head(grad, 10)
## # A tibble: 10 × 6
## apply par_ed uni gpa par_ed1 uni1
## <ord> <dbl> <dbl> <dbl> <fct> <fct>
## 1 very 0 1 3.26 no parent public
## 2 somewhat 1 1 3.21 1+ parent public
## 3 not 1 0 3.94 1+ parent private
## 4 somewhat 0 1 2.81 no parent public
## 5 somewhat 0 1 2.53 no parent public
## 6 not 0 0 2.59 no parent private
## 7 somewhat 0 1 2.56 no parent public
## 8 somewhat 0 1 2.73 no parent public
## 9 not 0 1 3 no parent public
## 10 somewhat 1 1 3.5 1+ parent public
Important to note: The estimate for the predictors in R estimates the change in probability for an increase in the ordinal variable: \(\log(P(Y \ge j)/P(Y < j)) = .....\)
That is, if beta is positive, then increasing the predictor increases the chance of being in the higher values of the response, NOT lower, which is what the notes have parameterized.
The MASS library has function polr() -
proportional odds logistic regression
grad_clm <-
MASS::polr(
formula = apply ~ par_ed1 + uni1 + gpa,
data = grad,
Hess = T # Will calculate the standard errors
)
broom::tidy(grad_clm)
## # A tibble: 5 × 5
## term estimate std.error statistic coef.type
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 par_ed11+ parent 1.05 0.266 3.94 coefficient
## 2 uni1public 0.0587 0.298 0.197 coefficient
## 3 gpa 0.616 0.261 2.36 coefficient
## 4 not|somewhat 2.26 0.882 2.56 scale
## 5 somewhat|very 4.36 0.904 4.82 scale
broom::glance(grad_clm)
## # A tibble: 1 × 7
## edf logLik AIC BIC deviance df.residual nobs
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5 -359. 727. 747. 717. 395 400
With SE and test stats - Add summary around grad_clm
By default, polr() won’d calculate the p-value since you
have to specify that you want the standard errors using
Hess = T, so we’ll need to calculate the p-values for a
Wald test ourselves!
coef_table <-
broom::tidy(grad_clm) |>
# p-value and confidence interval
mutate(
p_val = pnorm(abs(statistic), lower.tail = F)*2,
lower = estimate - qnorm(0.975)*std.error,
upper = estimate + qnorm(0.975)*std.error
)
coef_table
## # A tibble: 5 × 8
## term estimate std.error statistic coef.type p_val lower upper
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 par_ed11+ parent 1.05 0.266 3.94 coefficient 8.09e-5 0.527 1.57
## 2 uni1public 0.0587 0.298 0.197 coefficient 8.44e-1 -0.525 0.642
## 3 gpa 0.616 0.261 2.36 coefficient 1.82e-2 0.105 1.13
## 4 not|somewhat 2.26 0.882 2.56 scale 1.03e-2 0.533 3.99
## 5 somewhat|very 4.36 0.904 4.82 scale 1.45e-6 2.58 6.13
Always a good idea to check the assumption of parallel slopes between levels of the response!
Since the function is called polr(), it can only fit a
model with parallel slopes (otherwise it wouldn’t be proportional
odds!)
Instead, we can use the more flexible vglm() function
from VGAM. We can specify a cumulative model using
family = cumulative and non-parallel lines using
family = cumulative(parallel = F)
Calculating the loglikelihood without parallel slopes:
ll_b <-
vglm(
apply ~ par_ed1 + uni1 + gpa,
data = grad,
family = cumulative(parallel = F)
) |>
logLik()
# Calculating the loglikelihood of the proportional odds logit model
ll_p <- logLik(grad_clm)
# Twice the difference in loglikelihoods and p-value using chi-squared
G_LRT <- -2*(ll_p - ll_b)
p_val <- pchisq(G_LRT, df = 3, lower.tail = F)
c('test stat' = G_LRT, 'p-val' = p_val)
## test stat p-val
## 4.0137586 0.2599823
Our null hypothesis is that the lines are parallel, and since we don’t reject it, we can assume that our assumption is true!
We can also do it a little more formally using the Brant test with
the brant() function from the brant
package
…Brant
brant::brant(grad_clm)
## ----------------------------------------------------
## Test for X2 df probability
## ----------------------------------------------------
## Omnibus 4.34 3 0.23
## par_ed11+ parent 0.13 1 0.72
## uni1public 3.44 1 0.06
## gpa 0.18 1 0.67
## ----------------------------------------------------
##
## H0: Parallel Regression Assumption holds