library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(foreign)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
math165 <- read_csv("F165GradesAsPredictorsForS181wFirstMath.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## MathTERM = col_double(),
## gpa = col_double(),
## grade165 = col_character(),
## math165grade = col_double(),
## RAVEN_GRADE_HISTORY_CRSE_NUMB = col_character(),
## grade181 = col_character(),
## math181grade = col_double(),
## `1stMathTerm` = col_double(),
## SUBJ_CODE = col_character(),
## MinOf1stCourse = col_character(),
## firstmathgrade = col_character(),
## `1stGrade` = col_character()
## )
# remove stray spaces from the character columns; gsub() returns a new value,
# so the result must be assigned back or the data frame is left unchanged
math165 <- math165 %>%
  mutate(across(where(is.character), ~ gsub(" ", "", .x)))
summary(math165)
## MathTERM gpa grade165 math165grade
## Min. :201520 Min. :0.400 Length:1423 Min. :2.000
## 1st Qu.:201620 1st Qu.:2.800 Class :character 1st Qu.:2.000
## Median :201720 Median :3.200 Mode :character Median :3.000
## Mean :201720 Mean :3.208 Mean :3.082
## 3rd Qu.:201820 3rd Qu.:3.600 3rd Qu.:4.000
## Max. :201920 Max. :4.000 Max. :4.000
## RAVEN_GRADE_HISTORY_CRSE_NUMB grade181 math181grade
## Length:1423 Length:1423 Min. :-1.000
## Class :character Class :character 1st Qu.: 1.000
## Mode :character Mode :character Median : 2.000
## Mean : 1.996
## 3rd Qu.: 3.000
## Max. : 4.000
## 1stMathTerm SUBJ_CODE MinOf1stCourse firstmathgrade
## Min. :201020 Length:1423 Length:1423 Length:1423
## 1st Qu.:201520 Class :character Class :character Class :character
## Median :201620 Mode :character Mode :character Mode :character
## Mean :201644
## 3rd Qu.:201820
## Max. :201920
## 1stGrade
## Length:1423
## Class :character
## Mode :character
##
##
##
mean(math165$math165grade)
## [1] 3.081518
mean(math165$math181grade)
## [1] 1.996486
math165$MinOf1stCourse <- as.numeric(math165$MinOf1stCourse)
## Warning: NAs introduced by coercion
math165new <- math165 %>%
filter(math181grade >= 0) %>%
mutate(level = ifelse(MinOf1stCourse %in% 0:105, "develop", "college"))
math165new
## # A tibble: 1,248 x 13
## MathTERM gpa grade165 math165grade RAVEN_GRADE_HISTO~ grade181 math181grade
## <dbl> <dbl> <chr> <dbl> <chr> <chr> <dbl>
## 1 201620 3 A 4 181 B 3
## 2 201820 4 A 4 181 A 4
## 3 201520 3 A 4 181 A 4
## 4 201820 2.4 A 4 181 A 4
## 5 201620 4 A 4 181 B 3
## 6 201620 3.6 A 4 181 A 4
## 7 201720 3 A 4 181 B 3
## 8 201520 3.4 A 4 181 D 1
## 9 201620 3.4 A 4 181 C 2
## 10 201920 3.8 A 4 181 B 3
## # ... with 1,238 more rows, and 6 more variables: 1stMathTerm <dbl>,
## # SUBJ_CODE <chr>, MinOf1stCourse <dbl>, firstmathgrade <chr>,
## # 1stGrade <chr>, level <chr>
par(mfrow = c(1,2))
boxplot(math165new$math165grade)
boxplot(math165new$math181grade)
# These plots make sense because you have to get a C or higher to get into MATH 181
plot(jitter(math165new$math181grade) ~ jitter(math165new$math165grade))
fit1 <- lm(math181grade ~ math165grade, math165new)
summary(fit1)
##
## Call:
## lm(formula = math181grade ~ math165grade, data = math165new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1719 -0.4199 0.5801 0.8281 2.5801
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.33205 0.13182 -2.519 0.0119 *
## math165grade 0.87599 0.04066 21.545 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.172 on 1246 degrees of freedom
## Multiple R-squared: 0.2714, Adjusted R-squared: 0.2708
## F-statistic: 464.2 on 1 and 1246 DF, p-value: < 2.2e-16
fit2 <- lm(math181grade ~ gpa + math165grade + as.factor(level), math165new)
summary(fit2)
##
## Call:
## lm(formula = math181grade ~ gpa + math165grade + as.factor(level),
## data = math165new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5407 -0.7159 0.2597 0.7268 2.9805
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.51494 0.18853 -8.036 2.14e-15 ***
## gpa 0.82480 0.07489 11.013 < 2e-16 ***
## math165grade 0.43912 0.05383 8.158 8.27e-16 ***
## as.factor(level)develop -0.26751 0.06465 -4.138 3.75e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.113 on 1244 degrees of freedom
## Multiple R-squared: 0.344, Adjusted R-squared: 0.3424
## F-statistic: 217.4 on 3 and 1244 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(fit2)
The residual plots against the fitted values clearly indicate that linear regression is not a good model here: the outcome takes only a handful of discrete values, so the residuals fall into parallel bands rather than scattering randomly.
ftable(xtabs(~ gpa + math165grade, data = math165new))
## math165grade 2 3 4
## gpa
## 0.4 1 0 0
## 0.8 1 0 0
## 1 1 0 0
## 1.2 1 0 0
## 1.4 3 0 0
## 1.6 2 2 0
## 1.8 3 3 0
## 2 23 3 2
## 2.2 23 7 1
## 2.4 40 11 3
## 2.6 51 27 4
## 2.8 62 46 10
## 3 54 72 26
## 3.2 47 68 34
## 3.4 24 69 46
## 3.6 4 60 85
## 3.8 1 26 96
## 4 0 0 206
We can also examine the distribution of gpa at every level of the Math165 grade. The code below creates a single row of panels, one per Math165 grade plus a margin panel for all grades combined, each showing boxplots of gpa by Math181 grade. To better see the data, we also add the raw data points on top of the box plots, with a small amount of noise (often called “jitter”) and 3% opacity (alpha = .03) so they do not overwhelm the boxplots.
p1 <- ggplot(math165new, aes(x = grade181, y = gpa)) +
geom_boxplot() +
geom_jitter(alpha = .03) +
facet_grid(. ~ grade165, margins = TRUE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
labs(title="Boxplots of Math181 Grades to GPA, separated by Math165 grades")
p1
library(MASS)
math165new$grade181 <- as.factor(math165new$grade181)
logitfit1 <- polr(grade181 ~ gpa + math165grade + as.factor(level), data = math165new, Hess = TRUE)
summary(logitfit1)
## Call:
## polr(formula = grade181 ~ gpa + math165grade + as.factor(level),
## data = math165new, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## gpa -1.3477 0.13330 -10.110
## math165grade -0.7222 0.09177 -7.870
## as.factor(level)develop 0.4373 0.10756 4.066
##
## Intercepts:
## Value Std. Error t value
## A|B -7.7382 0.3823 -20.2390
## B|C -6.2067 0.3586 -17.3064
## C|D -4.9707 0.3437 -14.4603
## D|F -4.0886 0.3387 -12.0731
##
## Residual Deviance: 3354.567
## AIC: 3368.567
Because polr() parameterizes the model as logit(P(Y <= j)) = zeta_j - eta, the reported coefficients flip sign when the fitted cumulative logit equations are written out:

logit(P-hat(Y <= A)) = -7.738 + 1.348(gpa) + 0.722(math165grade) - 0.437(developmental)
logit(P-hat(Y <= B)) = -6.207 + 1.348(gpa) + 0.722(math165grade) - 0.437(developmental)
logit(P-hat(Y <= C)) = -4.971 + 1.348(gpa) + 0.722(math165grade) - 0.437(developmental)
logit(P-hat(Y <= D)) = -4.089 + 1.348(gpa) + 0.722(math165grade) - 0.437(developmental)
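As a sanity check, we can reconstruct these cumulative logits by hand from the fitted object, since the cutpoints are stored in logitfit1$zeta and the slopes in coef(logitfit1). A minimal sketch for one hypothetical student (gpa = 3.0, an A in Math165, college-level start); the last two lines should agree:

# hypothetical student: gpa 3.0, math165grade 4, college-level start (develop = 0)
x <- c(gpa = 3.0, math165grade = 4, develop = 0)
eta <- sum(coef(logitfit1) * x)                # linear predictor
cumlogit <- logitfit1$zeta - eta               # logit(P(Y <= j)) at each cutpoint
diff(c(0, plogis(cumlogit), 1))                # per-grade probabilities A, B, C, D, F
predict(logitfit1, data.frame(gpa = 3.0, math165grade = 4, level = "college"),
        type = "probs")                        # should match the line above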
This shows the usual regression output coefficient table, including the value of each coefficient, its standard error, and the t value, which is simply the ratio of the coefficient to its standard error. No significance tests are reported by default.
Next we see the estimates for the four intercepts, which are sometimes called cutpoints. The intercepts indicate where the latent variable is cut to make the five grade groups that we observe in our data. Note that this latent variable is continuous. In general, the cutpoints are not used in the interpretation of the results. They are closely related to the thresholds reported by other statistical packages.
Finally, we see the residual deviance (-2 times the log-likelihood of the model) as well as the AIC. Both the deviance and AIC are useful for model comparison.
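For example, a minimal sketch of such a comparison, fitting a reduced model without math165grade (the name logitfit0 is ours, not part of the original analysis):

# reduced model dropping math165grade, to compare against logitfit1
logitfit0 <- polr(grade181 ~ gpa + as.factor(level), data = math165new, Hess = TRUE)
AIC(logitfit0, logitfit1)      # lower AIC indicates the better-fitting model
anova(logitfit0, logitfit1)    # likelihood ratio test for the math165grade term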
One way to calculate a p-value in this case is to compare the t value against the standard normal distribution, as in a z test. This is only exact with infinite degrees of freedom, but it is a reasonable approximation in large samples and becomes increasingly biased as the sample size decreases.
ctable <- coef(summary(logitfit1))
(ci <- confint(logitfit1))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## gpa -1.6109926 -1.0882855
## math165grade -0.9025632 -0.5427238
## as.factor(level)develop 0.2266321 0.6483706
## calculate and store p values
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
## combined table
(ctable <- cbind(ctable, "p value" = p))
## Value Std. Error t value p value
## gpa -1.3477063 0.13329796 -10.110480 4.964039e-24
## math165grade -0.7221654 0.09176726 -7.869532 3.559710e-15
## as.factor(level)develop 0.4373333 0.10756413 4.065791 4.786973e-05
## A|B -7.7382155 0.38234138 -20.239022 4.438937e-91
## B|C -6.2066881 0.35863608 -17.306368 4.211566e-67
## C|D -4.9706547 0.34374528 -14.460285 2.159015e-47
## D|F -4.0885901 0.33865158 -12.073147 1.464263e-33
The coefficients from the model can be somewhat difficult to interpret because they are scaled as log odds. Another way to interpret logistic regression models is to convert the coefficients into odds ratios (ORs). To get the ORs and their confidence intervals, we simply exponentiate the estimates and confidence limits.
exp(coef(logitfit1))
## gpa math165grade as.factor(level)develop
## 0.2598356 0.4856994 1.5485721
exp(cbind(OR = coef(logitfit1), ci))
## OR 2.5 % 97.5 %
## gpa 0.2598356 0.1996893 0.3367934
## math165grade 0.4856994 0.4055289 0.5811631
## as.factor(level)develop 1.5485721 1.2543683 1.9124223
One of the assumptions underlying ordinal logistic (and ordinal probit) regression is that the relationship between each pair of outcome groups is the same. In other words, ordinal logistic regression assumes that the coefficients that describe the relationship between, say, the lowest versus all higher categories of the response variable are the same as those that describe the relationship between the next lowest category and all higher categories, etc. This is called the proportional odds assumption or the parallel regression assumption. Because the relationship between all pairs of groups is the same, there is only one set of coefficients.
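Before the graphical check below, one approximate formal check is to compare the proportional-odds fit against a multinomial logit that lets every slope vary freely; a likelihood ratio test between the two gives a rough test of the assumption (rough because the multinomial model ignores the ordering). A sketch, assuming the nnet package is installed and using multifit as our own name for the unconstrained model:

library(nnet)
# unconstrained multinomial logit: separate slopes for every outcome contrast
multifit <- multinom(grade181 ~ gpa + math165grade + as.factor(level),
                     data = math165new, trace = FALSE)
G2 <- deviance(logitfit1) - deviance(multifit)     # likelihood ratio statistic
df <- length(coef(multifit)) - length(c(coef(logitfit1), logitfit1$zeta))
pchisq(G2, df, lower.tail = FALSE)                 # approximate p-value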
In order to assess the appropriateness of our model, we need to evaluate whether the proportional odds assumption is tenable. The following code creates a graph to check the proportional odds assumption.
We graph predicted logits from individual logistic regressions with a single predictor, where the outcome groups are defined by math181grade >= 1, math181grade >= 2, or math181grade >= 3. If the difference between the predicted logits for varying levels of a predictor, say level, is the same whether the outcome is defined by math181grade >= 1, math181grade >= 2, or math181grade >= 3, then we can be confident that the proportional odds assumption holds. In other words, if the difference between the logits for level = "develop" versus level = "college" is the same when the outcome is math181grade >= 1 as when the outcome is math181grade >= 2 or math181grade >= 3, then the proportional odds assumption likely holds.
The first command creates the function that estimates the values that will be graphed. The first line of this command tells R that sf is a function, and that this function takes one argument, which we label y. The sf function will calculate the log odds of being greater than or equal to each value of the target variable. For our purposes, we would like the log odds of Math181grade being greater than or equal to 1, and then greater than or equal to 2, and then greater than or equal to 3.
Inside the sf function we find the qlogis function, which transforms a probability to a logit, and it will return the logit transformations of these probabilites. Inside the qlogis function we see that we want the log odds of the mean of y >= 1. When we supply a y argument, such as Math181grade, to function sf, y >= 1 will evaluate to a 0/1 (FALSE/TRUE) vector, and taking the mean of that vector will give you the proportion of or probability that Math181grade >= 1.
The second command calls the function sf on several subsets of the data defined by the predictors. In this statement we see the summary function with a formula supplied as the first argument. When R sees a call to summary with a formula argument (here the Hmisc method loaded above), it calculates descriptive statistics for the variable on the left side of the formula, grouped by the variables on the right side, and returns the results in a table. By default, summary calculates the mean of the left-side variable. So, if we had used the code summary(as.numeric(math181grade) ~ gpa + level + grade165) without the fun argument, we would get means of math181grade by gpa (broken up into four groups), then by level, and finally by grade165. However, we can override the calculation of the mean by supplying our own function, namely sf, to the fun= argument. Finally, we store the result in the object s, which is a table.
sf <- function(y) {
c('Y>=1' = qlogis(mean(y >= 1)),
'Y>=2' = qlogis(mean(y >= 2)),
'Y>=3' = qlogis(mean(y >= 3)))
}
(s <- with(math165new, summary(as.numeric(math181grade) ~ gpa + level + grade165, fun=sf)))
## as.numeric(math181grade) N= 1248
##
## +--------+---------+----+---------+-----------+-----------+
## | | | N| Y>=1| Y>=2| Y>=3|
## +--------+---------+----+---------+-----------+-----------+
## | gpa|[0.4,3.0)| 330|0.6390800|-0.08489944|-1.34883680|
## | |[3.0,3.4)| 301|1.8471096| 0.88395535|-0.07312226|
## | |[3.4,4.0)| 411|2.9216243| 1.93207867| 0.66047640|
## | | 4.0| 206|4.2145937| 3.50655790| 2.28666964|
## +--------+---------+----+---------+-----------+-----------+
## | level| college| 658|2.0966537| 1.48011312| 0.51569181|
## | | develop| 590|1.5071878| 0.69823625|-0.24529031|
## +--------+---------+----+---------+-----------+-----------+
## |grade165| A| 513|3.2047769| 2.44340692| 1.41827741|
## | | B| 394|1.6402096| 0.94849387|-0.12197843|
## | | C| 341|0.9514546| 0.04106149|-1.30052754|
## +--------+---------+----+---------+-----------+-----------+
## | Overall| |1248|1.7870931| 1.07313320| 0.15092687|
## +--------+---------+----+---------+-----------+-----------+
The table above displays the (linear) predicted values we would get if we regressed our dependent variable on our predictor variables one at a time, without the parallel slopes assumption. We can evaluate the parallel slopes assumption by running a series of binary logistic regressions with varying cutpoints on the dependent variable and checking the equality of coefficients across cutpoints; relaxing the assumption in this way lets us check its tenability. To accomplish this, transform the original ordinal dependent variable into a new binary dependent variable that equals 0 if the original ordinal variable (here math181grade) is less than some value a, and 1 if it is greater than or equal to a (note that this is also what the ordinal regression model coefficients represent). This is done for k-1 levels of the ordinal variable and is executed by the as.numeric(math181grade) >= a coding below. The first model estimates the effect of level on earning a calculus 1 grade of 0 versus 1, 2, 3, or 4. The second estimates the effect of level on earning 0 or 1 versus 2, 3, or 4. The third estimates the effect of level on earning 0, 1, or 2 versus 3 or 4.
Looking at the intercept for the first model (2.0967), we see that it matches the predicted value in the cell for level equal to “college” in the column for Y>=1. The value below it, for level equal to “develop”, is equal to the intercept plus the coefficient for level (i.e., 2.0967 - 0.5895 = 1.5072).
glm(I(as.numeric(math181grade) >= 1) ~ level, family="binomial", data = math165new)
##
## Call: glm(formula = I(as.numeric(math181grade) >= 1) ~ level, family = "binomial",
## data = math165new)
##
## Coefficients:
## (Intercept) leveldevelop
## 2.0967 -0.5895
##
## Degrees of Freedom: 1247 Total (i.e. Null); 1246 Residual
## Null Deviance: 1026
## Residual Deviance: 1013 AIC: 1017
Looking at the intercept for the second model (1.4801), we see that it matches the predicted value in the cell for level equal to “college” in the column for Y>=2. The value below it, for level equal to “develop”, is equal to the intercept plus the coefficient for level (i.e., 1.4801 - 0.7819 = 0.6982).
glm(I(as.numeric(math181grade) >= 2) ~ level, family="binomial", data = math165new)
##
## Call: glm(formula = I(as.numeric(math181grade) >= 2) ~ level, family = "binomial",
## data = math165new)
##
## Coefficients:
## (Intercept) leveldevelop
## 1.4801 -0.7819
##
## Degrees of Freedom: 1247 Total (i.e. Null); 1246 Residual
## Null Deviance: 1417
## Residual Deviance: 1381 AIC: 1385
Looking at the intercept for the third model (0.5157), we see that it matches the predicted value in the cell for level equal to “college” in the column for Y>=3. The value below it, for level equal to “develop”, is equal to the intercept plus the coefficient for level (i.e., 0.5157 - 0.7610 = -0.2453).
glm(I(as.numeric(math181grade) >= 3) ~ level, family="binomial", data = math165new)
##
## Call: glm(formula = I(as.numeric(math181grade) >= 3) ~ level, family = "binomial",
## data = math165new)
##
## Coefficients:
## (Intercept) leveldevelop
## 0.5157 -0.7610
##
## Degrees of Freedom: 1247 Total (i.e. Null); 1246 Residual
## Null Deviance: 1723
## Residual Deviance: 1679 AIC: 1683
s[, 4] <- s[, 4] - s[, 3]
s[, 3] <- s[, 3] - s[, 3]
s # print
## as.numeric(math181grade) N= 1248
##
## +--------+---------+----+---------+----+----------+
## | | | N| Y>=1|Y>=2| Y>=3|
## +--------+---------+----+---------+----+----------+
## | gpa|[0.4,3.0)| 330|0.6390800| 0|-1.2639374|
## | |[3.0,3.4)| 301|1.8471096| 0|-0.9570776|
## | |[3.4,4.0)| 411|2.9216243| 0|-1.2716023|
## | | 4.0| 206|4.2145937| 0|-1.2198883|
## +--------+---------+----+---------+----+----------+
## | level| college| 658|2.0966537| 0|-0.9644213|
## | | develop| 590|1.5071878| 0|-0.9435266|
## +--------+---------+----+---------+----+----------+
## |grade165| A| 513|3.2047769| 0|-1.0251295|
## | | B| 394|1.6402096| 0|-1.0704723|
## | | C| 341|0.9514546| 0|-1.3415890|
## +--------+---------+----+---------+----+----------+
## | Overall| |1248|1.7870931| 0|-0.9222063|
## +--------+---------+----+---------+----+----------+
plot(s, which=1:3, pch=1:3, xlab='logit', main=' ', xlim=range(s[,3:4]))
If the proportional odds assumption holds, then for each predictor variable the distance between the symbols for each set of categories of the dependent variable should remain similar. To provide a common reference point, the code above sets the Y>=2 logits to zero and re-expresses the Y>=3 logits as distances from Y>=2. Looking at the rows for the variable level, we see that the distance between the two sets of coefficients is similar (-0.96 versus -0.94). In contrast, the distances for gpa are more varied, ranging from about -0.96 to -1.27 across the four gpa groups, suggesting that the proportional odds assumption may not hold.
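A packaged alternative to this graphical check is the Brant test, which tests proportional odds for the model as a whole and for each predictor separately. A minimal sketch, assuming the brant package is installed:

# install.packages("brant")   # if not already installed
library(brant)
brant(logitfit1)   # omnibus and per-predictor tests of proportional odds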
Once we are done assessing whether the assumptions of our model hold, we can obtain predicted probabilities, which are usually easier to understand than either the coefficients or the odds ratios. We create a simulated dataset of the same length as the original data, in which we vary level (college and developmental), math165grade (2, 3, and 4, i.e., a C, B, and A), and gpa over its observed range, and then we calculate the probability of being in each category of math181grade.
newdat <- data.frame(
level = rep(c("college","develop"), 624),
math165grade = rep(2:4, each = 416),
gpa = rep(seq(from = 0.4, to = 4, length.out = 312), 4))
newdat <- cbind(newdat, predict(logitfit1, newdat, type = "probs"))
##show first few rows
head(newdat)
## level math165grade gpa A B C D
## 1 college 2 0.4000000 0.003157532 0.011281516 0.03356656 0.06058908
## 2 develop 2 0.4115756 0.002073307 0.007444687 0.02249818 0.04197694
## 3 college 2 0.4231511 0.003257277 0.011632571 0.03456195 0.06220028
## 4 develop 2 0.4347267 0.002138875 0.007677809 0.02318069 0.04316219
## 5 college 2 0.4463023 0.003360162 0.011994340 0.03558472 0.06384524
## 6 develop 2 0.4578778 0.002206512 0.007918140 0.02388293 0.04437649
## F
## 1 0.8914053
## 2 0.9260069
## 3 0.8883479
## 4 0.9238404
## 5 0.8852155
## 6 0.9216159
Finally, reshape the data to long format with the reshape2 package and plot all of the predicted probabilities for the different conditions.
lnewdat <- melt(newdat, id.vars = c("level", "math165grade", "gpa"),
variable.name = "Math181Grade", value.name="Probability")
## view first few rows
head(lnewdat)
## level math165grade gpa Math181Grade Probability
## 1 college 2 0.4000000 A 0.003157532
## 2 develop 2 0.4115756 A 0.002073307
## 3 college 2 0.4231511 A 0.003257277
## 4 develop 2 0.4347267 A 0.002138875
## 5 college 2 0.4463023 A 0.003360162
## 6 develop 2 0.4578778 A 0.002206512
We plot the predicted probabilities, connected with a line, colored by level of the outcome, math181grade, and faceted by level and math165grade. We also use the label_both labeller to add clearer labels showing what each column and row of the plot represents.
ggplot(lnewdat, aes(x = gpa, y = Probability, color = Math181Grade)) +
geom_line() + facet_grid(level ~ math165grade, labeller="label_both")
These final plots show the predicted probability of getting an A, B, C, D, or F in Math 181 as a function of a student’s gpa, Math 165 grade, and starting level of math at MC (developmental or college). Follow each curve across a faceted panel to see how the predicted probability of that grade changes with gpa.