Fit the Rasch model to starter data via the generalized linear mixed model framework.
The data are the responses of a sample of 150 individuals to 9 multiple-choice items from a General Certificate of Education O-level mathematics paper containing 60 items in all. The responses are coded 1 for correct answer and 0 for incorrect. The items appear to test ability in two-dimensional Euclidean geometry. The data have one column with 1,350 records ordered by item within individuals. There are no missing data.
Source: Goldstein, H., & Wood, R. (1989). Five decades of item response modelling. British Journal of Mathematical and Statistical Psychology, 42, 139-167.
Column 1: Subject ID
Column 2: Item ID
Column 3: Answer, 1 = correct, 0 = incorrect
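One way to write the Rasch model in the GLMM framework fitted below (no intercept, item fixed effects, person random intercepts) is
$$\operatorname{logit}\Pr(Y_{ij}=1)=\theta_i+\beta_j,\qquad \theta_i\sim N(0,\sigma^2),$$
where $\theta_i$ is the ability of person $i$ and $\beta_j$ is the easiness of item $j$ (the negative of its difficulty).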
pacman::p_load(MASS, gee, tidyverse, geepack, ltm, lme4)
dta <- read.table("data/starter.txt", header=T)
head(dta)
sid item resp
1 1 1 1
2 1 2 1
3 1 3 1
4 1 4 1
5 1 5 1
6 1 6 1
# reshape from long (one row per subject-item) to wide (one column per item)
dtaW <- dta %>%
  gather(key, value, -item, -sid) %>%
  unite(col, key, item) %>%
  spread(col, value) %>%
  select(-sid)
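gather()/spread() still work but have been superseded in current tidyr; a sketch of an equivalent reshape with pivot_wider(), producing the same dtaW:
# equivalent reshape with the newer tidyr verb (alternative sketch)
dtaW <- dta %>%
  pivot_wider(names_from = item, values_from = resp, names_prefix = "resp_") %>%
  select(-sid)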
plot(descript(dtaW),
     ylim=c(0,1),
     type='b',
     main="Item response curves")
# proportions and logits of correct responses, ordered from easiest to hardest item
descript(dtaW)$perc[c(1,7,2,3,4,5,6,8,9), ]
0 1 logit
resp_1 0.08000 0.92000 2.44235
resp_7 0.18000 0.82000 1.51635
resp_2 0.20000 0.80000 1.38629
resp_3 0.25333 0.74667 1.08091
resp_4 0.26667 0.73333 1.01160
resp_5 0.30667 0.69333 0.81575
resp_6 0.32000 0.68000 0.75377
resp_8 0.46000 0.54000 0.16034
resp_9 0.66667 0.33333 -0.69315
dta$item <- factor(dta$item)
sjPlot::tab_model(m0 <- glmer(resp ~ -1 + item + (1 | sid), data=dta, family=binomial),
                  show.obs=F, show.ngroups=F, transform=NULL,
                  show.se=T, show.r2=F, show.icc=F)
Fixed-effect and random-effect estimates for resp (log-odds scale):

| Predictors | Log-Odds | std. Error | CI | p |
|---|---|---|---|---|
| item [1] | 2.90 | 0.34 | 2.24 – 3.56 | <0.001 |
| item [2] | 1.70 | 0.24 | 1.22 – 2.17 | <0.001 |
| item [3] | 1.35 | 0.23 | 0.90 – 1.80 | <0.001 |
| item [4] | 1.24 | 0.22 | 0.80 – 1.69 | <0.001 |
| item [5] | 1.02 | 0.22 | 0.60 – 1.45 | <0.001 |
| item [6] | 0.94 | 0.22 | 0.52 – 1.36 | <0.001 |
| item [7] | 1.86 | 0.25 | 1.37 – 2.36 | <0.001 |
| item [8] | 0.21 | 0.20 | -0.19 – 0.61 | 0.312 |
| item [9] | -0.86 | 0.21 | -1.28 – -0.44 | <0.001 |
| Random Effects | | | | |
| σ2 | 3.29 | | | |
| τ00 sid | 1.17 | | | |
# intraclass correlation on the latent logit scale: tau00 / (tau00 + pi^2/3)
unlist(summary(m0)$varcor)/(unlist(summary(m0)$varcor)+pi^2/3)
sid
0.26195
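This is the latent-scale intraclass correlation, with $\pi^2/3$ as the level-1 logistic variance:
$$\mathrm{ICC}=\frac{\tau_{00}}{\tau_{00}+\pi^2/3}=\frac{1.17}{1.17+3.29}\approx 0.26,$$
matching the value printed above.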
# Rasch fit via ltm on its built-in LSAT data; the constraint fixes the discrimination at 1
coef(rm0 <- ltm::rasch(LSAT, constraint=cbind(ncol(LSAT)+1, 1)))[,1]
Item 1 Item 2 Item 3 Item 4 Item 5
-2.87197 -1.06303 -0.25761 -1.38806 -2.21878
plot(rm0)
# Rasch fit with the eRm package (conditional maximum likelihood), again on the LSAT data
library(eRm)
rm0 <- RM(LSAT)
eRm::plotICC(rm0, item.subset=1:5, ask=F, empICC=list("raw"), empCI=list(lty="solid"))
plotPImap(rm0)
The person-item map looks reasonable: the item difficulty locations are fairly evenly spaced, suggesting that difficulty increases gradually from the easiest to the hardest item.
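The ltm and eRm calls above are illustrated on the 5-item LSAT data bundled with ltm; a minimal sketch of the same fits applied to the 9-item starter data in dtaW (object names here are illustrative):
# same Rasch fits applied to the wide starter data (sketch)
rm_starter <- ltm::rasch(dtaW, constraint=cbind(ncol(dtaW)+1, 1))
coef(rm_starter)[, 1]             # item difficulties with discrimination fixed at 1
rm_starter_cml <- eRm::RM(dtaW)   # conditional ML estimates
plotPImap(rm_starter_cml)         # person-item map for the starter items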
Replicate the results of the analysis of clustered ordinal data reported in the teach-again example.
Teachers from a sample of 16 schools in California and Michigan were asked:
“If you could go back to college and start all over again, would you again choose teaching as a profession?”
The teachers’ perception of task variety was measured by the extent to which teachers followed the same routine each day.
Source: Raudenbush, S.W., Rowan, B., & Cheong, Y. (1993). Teaching as a non-routine task: Implications for the organizational design of schools. Educational Administrative Quarterly, 29(4), 479-500.
Column 1: Response categories: Yes, Not sure (coded "Unsure" in the data), No
Column 2: Task routine
Column 3: School ID
Mixed-effects cumulative logit model (proportional odds)
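In the parameterization used by ordinal::clmm, with ordered categories No < Unsure < Yes, the model for teacher $i$ in school $j$ is
$$\operatorname{logit}\Pr(Y_{ij}\le k)=\theta_k-\beta\,\mathrm{Task}_{ij}-u_j,\qquad u_j\sim N(0,\sigma_u^2),$$
with a single slope $\beta$ across both thresholds (the proportional-odds assumption), so a positive $\beta$ shifts responses toward the "Yes" end of the scale.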
# load packages
pacman::p_load(tidyverse, HH, ordinal)
#
dta <- read.table("data/teach_again.txt", h=T)
#
head(dta)
Answer Task School
1 Yes -0.26428 S01
2 Yes 0.57090 S01
3 Yes 0.13297 S01
4 Unsure -0.26428 S01
5 No -1.09946 S01
6 Yes 0.53022 S01
str(dta)
'data.frame': 650 obs. of 3 variables:
$ Answer: chr "Yes" "Yes" "Yes" "Unsure" ...
$ Task : num -0.264 0.571 0.133 -0.264 -1.099 ...
$ School: chr "S01" "S01" "S01" "S01" ...
with(dta, ftable(School, Answer))
Answer No Unsure Yes
School
S01 5 4 15
S02 9 9 20
S03 14 10 17
S04 0 1 8
S05 1 5 23
S06 9 12 18
S07 12 4 26
S08 7 5 18
S09 2 8 18
S10 21 27 69
S11 14 12 32
S12 15 19 18
S13 13 6 18
S14 0 1 7
S15 17 11 25
S16 13 10 22
# Probability summary
prop.table(with(dta, ftable(School, Answer)),1)
Answer No Unsure Yes
School
S01 0.208333 0.166667 0.625000
S02 0.236842 0.236842 0.526316
S03 0.341463 0.243902 0.414634
S04 0.000000 0.111111 0.888889
S05 0.034483 0.172414 0.793103
S06 0.230769 0.307692 0.461538
S07 0.285714 0.095238 0.619048
S08 0.233333 0.166667 0.600000
S09 0.071429 0.285714 0.642857
S10 0.179487 0.230769 0.589744
S11 0.241379 0.206897 0.551724
S12 0.288462 0.365385 0.346154
S13 0.351351 0.162162 0.486486
S14 0.000000 0.125000 0.875000
S15 0.320755 0.207547 0.471698
S16 0.288889 0.222222 0.488889
# mean
aggregate(Task ~ School, mean, data = dta)
School Task
1 S01 0.020897
2 S02 -0.117797
3 S03 -0.172874
4 S04 0.080857
5 S05 -0.149081
6 S06 0.237408
7 S07 -0.133343
8 S08 0.186577
9 S09 0.429029
10 S10 0.010578
11 S11 -0.157703
12 S12 -0.080748
13 S13 0.138858
14 S14 0.899354
15 S15 -0.121687
16 S16 -0.013271
# save numerical summary as data frame
m <- as.numeric(with(dta, table(School, Answer)))
# arrange
m <- as.data.frame(matrix(m, 16, 3))
# rename
names(m) <- c("No", "Unsure", "Yes")
# label rows with school IDs (School was read as character, so use the sorted unique values)
rownames(m) <- sort(unique(dta$School))
# calculate sum
m$tot <- apply(m, 1, sum)
# order schools by the proportion of non-"Yes" responses
m <- m[order((m[,1]+m[,2])/m[,4]), ]
# plot
likert(m[, -4], as.percent = T, main="", ylab="")
dtap <- prop.table(with(dta, ftable(School, Answer)), 1)
#
dtapl <- as.data.frame(dtap) %>%
  mutate(Answer = factor(Answer))
# attach each school's mean Task score (repeated across the three answer categories)
dtapl$Task <- rep(aggregate(Task ~ School, mean, data = dta)[,2], 3)
# school-level cumulative proportions, keeping the school IDs
dtacp <- data.frame(School = sort(unique(dta$School)),
                    as.data.frame(t(apply(dtap, 1, cumsum))))
#
dtacpl <- gather(dtacp, Answer, Prop, 2:4) %>%
  mutate(Answer = factor(Answer))
#
ot <- theme_set(theme_bw())
#
ggplot(dtapl, aes(Answer, Freq, group = School)) +
  geom_point(alpha = .5) +
  geom_line(alpha = .5) +
  ylim(c(0, 1)) +
  labs(x = "Answer", y = "Categorical response proportions")
#
ggplot(dtacpl, aes(Answer, Prop, group = School)) +
  geom_point(alpha = .5) +
  geom_line(alpha = .5) +
  labs(x = "Answer", y = "Cumulative proportions")
ggplot(dtapl, aes(Task, Freq, color = Answer)) +
  geom_point() +
  stat_smooth(method = "lm", se=F) +
  scale_y_continuous(limits=c(0, 1)) +
  labs(x = "Task", y = "Categorical response proportions") +
  theme(legend.position = c(.9, .5))
## Cumulative mixed proportional odds model (ordinal package)
dta$Answer <- factor(dta$Answer)
summary(m0 <- clmm(Answer ~ Task + (1 | School), data=dta))
Cumulative Link Mixed Model fitted with the Laplace approximation
formula: Answer ~ Task + (1 | School)
data: dta
link threshold nobs logLik AIC niter max.grad cond.H
logit flexible 650 -642.14 1292.27 177(393) 1.91e-04 6.4e+01
Random effects:
Groups Name Variance Std.Dev.
School (Intercept) 0.0909 0.301
Number of groups: School 16
Coefficients:
Estimate Std. Error z value Pr(>|z|)
Task 0.3649 0.0879 4.15 3.3e-05
Threshold coefficients:
Estimate Std. Error z value
No|Unsure -1.266 0.130 -9.73
Unsure|Yes -0.217 0.118 -1.84
Judging from the threshold estimates and the response tables above, relatively few teachers answered "No" when asked whether they would again choose teaching as a profession; the positive Task coefficient (0.36, p < .001) indicates that higher Task scores are associated with greater odds of responding "Yes".
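To express the Task effect as an odds ratio, exponentiate the estimate; a quick sketch using the estimate and standard error printed in the summary above:
# odds ratio for a one-unit increase in Task, with an approximate 95% Wald interval
exp(0.3649)                           # roughly 1.44
exp(0.3649 + c(-1, 1)*1.96*0.0879)    # roughly 1.21 to 1.71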
dta_m0 <- data.frame(na.omit(dta), phat = fitted(m0))
ggplot(dta_m0, aes(Task, phat, color = Answer)) +
  geom_point(alpha = .2, pch = 20) +
  geom_point(data = dtapl, aes(Task, Freq, color = Answer)) +
  stat_smooth(method = "lm", se=F, alpha = .2) +
  stat_smooth(data = dtapl, aes(Task, Freq, color = Answer),
              method = "lm", se=F, linetype = "dotted") +
  scale_y_continuous(limits=c(0, 1)) +
  labs(x = "Task", y = "Categorical response proportions") +
  theme(legend.position = c(.1, .8))
# keep empty School x Answer cells (.drop = FALSE) because some schools have 0 responses in a category
dta_pro <- dta_m0 %>%
  group_by(School, Answer, .drop = FALSE) %>%
  summarize(phat = mean(phat))
# append predicted probabilities to the observed proportion table
dta_sum <- inner_join(dtapl, dta_pro)
# plot observed category proportions and fitted probabilities against Task
ggplot(dta_sum, aes(Task, Freq, color = Answer)) +
  geom_point(alpha = .3) +
  stat_smooth(method = "lm", se = F) +
  geom_point(aes(Task, phat, color = Answer), pch = 1) +
  stat_smooth(aes(Task, phat, color = Answer),
              method = "lm", se = F, lty = 2, lwd = .8) +
  scale_y_continuous(limits=c(0, 1)) +
  labs(x = "Task", y = "Mean observed and fitted categorical responses (school)") +
  theme(legend.position = c(.9, .5))
The model predictions track the data reasonably well; although there is still some gap between the observed and fitted values for the "No" category, the observed and fitted lines follow a similar pattern.
Reproduce the results of the analysis in the Rutgers Alcohol Problem Index example of longitudinal count data using generalized linear mixed models.
The dataset is taken from an intervention study on problematic drinking in college students. Alcohol-related problems, as measured by the Rutgers Alcohol Problem Index, were recorded across 2 years (5 time points) for a sample of 881 students.
Source: Atkins, D.C., Baldwin, S.A., Zheng, C., Gallop, R.J., & Neighbors, C. (2013). A tutorial on count regression and zero-altered count models for longitudinal substance use data. Psychology of Addictive Behaviors, 27(1), 166-177.
Column 1: Subject ID
Column 2: Observation ID within subject
Column 3: Gender ID
Column 4: Time in months
Column 5: Rutgers Alcohol Problem Index
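As a starting point for this part, a minimal sketch of a Poisson GLMM with lme4; the file name and the column names id, occasion, gender, time, and rapi are assumptions, not taken from the original data file:
# sketch only: assumes data/rapi.txt with columns id, occasion, gender, time, rapi
dta3 <- read.table("data/rapi.txt", header=T)
# Poisson GLMM for counts over time, with a random intercept and slope for time per student
m1 <- glmer(rapi ~ time*gender + (time | id), data=dta3, family=poisson)
summary(m1)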