Kreft and de Leeuw (1998) obtained a sub-sample of students in eighth grade from the National Education Longitudinal Study of 1988 (NELS–88) collected by the National Center for Educational Statistics of the U.S. Department of Education.
The students are nested in schools.
Here, we consider the following subset of the variables:
schid: school identifier
math: continuous measure of achievement in mathematics (standardized to have a mean of 50 and a standard deviation of 10)
homework: number of hours of homework done per week
library(tidyverse)
library(haven)
fL <- "https://stats.idre.ucla.edu/stat/examples/imm/imm10.dta"
dta <- read_dta(fL)
glimpse(dta)
Rows: 260
Columns: 19
$ schid <dbl> 7472, 7472, 7472, 7472, 7472, 7472, 7472, 7472, 7472, 7472,…
$ stuid <dbl> 3, 8, 13, 17, 27, 28, 30, 36, 37, 42, 52, 53, 61, 64, 72, 8…
$ ses <dbl> -0.13, -0.39, -0.80, -0.72, -0.74, -0.58, -0.83, -0.51, -0.…
$ meanses <dbl> -0.48261, -0.48261, -0.48261, -0.48261, -0.48261, -0.48261,…
$ homework <dbl> 1, 0, 0, 1, 2, 1, 5, 1, 1, 2, 1, 1, 1, 2, 1, 4, 1, 2, 1, 1,…
$ white <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ parented <dbl> 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 1, 2, 3, 3, 1, 3, 3,…
$ public <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ ratio <dbl> 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,…
$ percmin <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ math <dbl> 48, 48, 53, 42, 43, 57, 33, 64, 36, 56, 48, 48, 44, 35, 50,…
$ sex <dbl> 2, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1,…
$ race <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,…
$ sctype <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ cstr <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
$ scsize <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ urban <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
$ region <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
$ schnum <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
# make schid a factor type
dta$schid <- factor(dta$schid)
coef(m0 <- nlme::lmList(math ~ homework | schid, data=dta))
(Intercept) homework
7472 50.684 -3.5538
7829 49.012 -2.9201
7930 38.750 7.9091
24725 34.394 5.5927
25456 53.939 -4.7184
25642 49.259 -2.4861
62821 59.210 1.0946
68448 36.055 6.4963
68493 38.520 5.8600
72292 37.714 6.3351
ggplot(data=dta, aes(homework, math, color=schid)) +
geom_point(alpha=0.5) +
stat_smooth(aes(group=1), method="lm", formula=y~x) +
stat_smooth(method="lm", formula=y ~ x, se=F) +
labs(x="Homework (hours per week)",
y="Math score") +
guides(color=guide_legend(ncol=2))+
theme_minimal() +
theme(legend.position=c(.9,.3))
library(lme4)
summary(m1 <- lmer(math ~ homework + (homework | schid), data=dta))
Linear mixed model fit by REML ['lmerMod']
Formula: math ~ homework + (homework | schid)
Data: dta
REML criterion at convergence: 1764
Scaled residuals:
Min 1Q Median 3Q Max
-2.5110 -0.5357 0.0175 0.6121 2.5708
Random effects:
Groups Name Variance Std.Dev. Corr
schid (Intercept) 69.3 8.33
homework 22.5 4.74 -0.81
Residual 43.1 6.56
Number of obs: 260, groups: schid, 10
Fixed effects:
Estimate Std. Error t value
(Intercept) 44.77 2.74 16.32
homework 2.04 1.55 1.31
Correlation of Fixed Effects:
(Intr)
homework -0.804
library(lattice)
qqmath(ranef(m1))
$schid
plot(m1, resid(., scaled=TRUE) ~ fitted(.) | schid,
xlab="Fitted values", ylab= "Standardized residuals",
abline=0, lty=3)
qqmath(m1, grid=TRUE)
# data management and graphics package
library(tidyverse)
dta <- read.csv("./data/langMath.csv", h=T)
dta_a <- dta %>%
group_by(School) %>%
summarize(ave_lang = mean(Lang, na.rm=TRUE),
ave_arith = mean(Arith, na.rm=TRUE))
ggplot(data=dta, aes(x=Arith, y=Lang)) +
geom_point(color="skyblue") +
stat_smooth(method="lm", formula=y ~ x, se=F, col="skyblue") +
geom_point(data=dta_a, aes(ave_arith, ave_lang), color="steelblue") +
stat_smooth(data=dta_a, aes(ave_arith, ave_lang),
method="lm", formula= y ~ x, se=F, color="steelblue") +
labs(x="Arithmetic score",
y="Language score") +
theme_bw()
In this section, we load the exam scores data set, activate the help page for it, and examine the first 6 lines of the data frame object.
# load the package to working directory
library(mlmRev)
# load the data from the package
data(Exam, package="mlmRev")
# invoke help document
?Exam
# view first 6 lines
head(Exam)
school normexam schgend schavg vr intake standLRT sex type student
1 1 0.26132 mixed 0.16618 mid 50% bottom 25% 0.61906 F Mxd 143
2 1 0.13407 mixed 0.16618 mid 50% mid 50% 0.20580 F Mxd 145
3 1 -1.72388 mixed 0.16618 mid 50% top 25% -1.36458 M Mxd 142
4 1 0.96759 mixed 0.16618 mid 50% mid 50% 0.20580 F Mxd 141
5 1 0.54434 mixed 0.16618 mid 50% mid 50% 0.37111 F Mxd 138
6 1 1.73490 mixed 0.16618 mid 50% bottom 25% 2.18944 M Mxd 155
We first compute the correlation coefficient between exam scores and LRT scores using all students in the data set. We then compute the mean exam scores and mean LRT scores by school. The correlation coefficient between mean school exam scores and mean school LRT scores is then calculated.
with(Exam, cor(normexam, standLRT))
[1] 0.59165
# compute the means by school
mLRT <- with(Exam, tapply(standLRT, school, mean))
mexam <- with(Exam, tapply(normexam, school, mean))
cor(mexam, mLRT)
[1] 0.69241
We draw a scatter diagram of the exam scores against the LRT scores and add the regression line. Next we superimpose on the scatter plot the mean school exam scores and mean school LRT scores (in color cyan) and add the regression line based on the mean school scores.
with(Exam, plot(normexam ~ standLRT,
bty = 'n',
cex = 0.5,
xlab = 'Standardized LR test score',
ylab = 'Normalized exam score'))
grid()
with(Exam, abline(lm(normexam ~ standLRT)))
points(mLRT, mexam, pch=16, col=5)
abline(lm(mexam ~ mLRT), col=5)
# load the data from the package
data(Gcsemv, package="mlmRev")
# invoke help document
?Gcsemv
# view first 6 lines
head(Gcsemv)
school student gender written course
1 20920 16 M 23 NA
2 20920 25 F NA 71.2
3 20920 27 F 39 76.8
4 20920 31 F 36 87.9
5 20920 42 M 16 44.4
6 20920 62 F 36 NA
with(Gcsemv, cor(written, course, use="pairwise"))
[1] 0.47417
# compute the means by school
course_schavg <- with(Gcsemv, tapply(course, school, mean, na.rm=T))
written_schavg <- with(Gcsemv, tapply(written, school, mean, na.rm=T))
cor(course_schavg, written_schavg)
[1] 0.39568
library(tidyverse)
dta <- Gcsemv %>%
group_by(school) %>%
mutate(r_sch = cor(course, written, use="pairwise"))
dtar <- dta[!duplicated(dta$school),"r_sch"]
ggplot(data=dtar, aes(r_sch)) +
geom_histogram(fill="skyblue") +
geom_vline(xintercept=c(0.39568, 0.47414),
col=c("peru","red"),
lty=c(1,3)) +
labs(x="Estimated correlation coefficients",
y="Counts") +
theme_bw()
A: The peru vertical line is the average correlation and the red line is the individual correlation.
Ten subjects were played tones at each of 5 loudnesses, in random order. Subjects were asked to draw a line on paper whose length matched the loudness of the tone. Each subject repeated each loudness 3 times, for a total of 30 trials per subject. Here is the mean of the 3 log-lengths for each loudness, the sd of the three log-lengths, and the number of replications, which is always 3.
A data frame with 50 observations on the following 5 variables.
a factor with unique values for each subject
either 50, 60, 70, 80 or 90 db. Decibels are a logrithmic scale
a numeric vector giving the mean of the log-lengths of three lines drawn.
a numeric vector, giving the sd of the three log lengths
a numeric vector, equal to the constant value 3
#install.packages("alr4")
library(alr4)
data(Stevens, package="alr4")
library(tidyverse)
library(lme4)
glimpse(Stevens)
Rows: 50
Columns: 5
$ subject <fct> A, A, A, A, A, B, B, B, B, B, C, C, C, C, C, D, D, D, D, D,…
$ loudness <dbl> 50, 60, 70, 80, 90, 50, 60, 70, 80, 90, 50, 60, 70, 80, 90,…
$ y <dbl> 0.379, 1.727, 3.016, 3.924, 4.413, 0.260, 0.833, 2.009, 2.7…
$ sd <dbl> 0.507, 0.904, 0.553, 0.363, 0.092, 0.077, 0.287, 0.396, 0.1…
$ n <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
coef(m0 <- nlme::lmList(y ~ loudness | subject, data=Stevens))
(Intercept) loudness
A -4.4937 0.10265
B -4.5853 0.09355
C -2.2936 0.06950
D -2.9194 0.07506
E -5.3851 0.10391
F -2.4212 0.07740
G -2.8109 0.06497
H -3.4207 0.08223
I -1.9487 0.06561
J -1.7591 0.05525
ggplot(data=Stevens, aes(x=loudness, y=y)) +
geom_point(alpha=0.5) +
stat_smooth(method="lm", formula=y ~ x, se=F) +
facet_grid(. ~ subject) +
labs(x="Loudness (in db)",
y="Mean Line length (in log)") +
theme_minimal()
summary(m1 <- lmer(y ~ loudness + (1 | subject), data=Stevens))
Linear mixed model fit by REML ['lmerMod']
Formula: y ~ loudness + (1 | subject)
Data: Stevens
REML criterion at convergence: 57.1
Scaled residuals:
Min 1Q Median 3Q Max
-2.8081 -0.5060 0.0184 0.5530 1.7721
Random effects:
Groups Name Variance Std.Dev.
subject (Intercept) 0.1428 0.378
Residual 0.0986 0.314
Number of obs: 50, groups: subject, 10
Fixed effects:
Estimate Std. Error t value
(Intercept) -3.20377 0.25405 -12.6
loudness 0.07901 0.00314 25.2
Correlation of Fixed Effects:
(Intr)
loudness -0.865
library(lattice)
qqmath(ranef(m1))
$subject
plot(m1, resid(., scaled=TRUE) ~ fitted(.) | subject,
xlab="Fitted values", ylab= "Standardized residuals",
abline=0, lty=3)
qqmath(m1, grid=TRUE)