1 In-class exercises 3:

1.1 Introduction

Kreft and de Leeuw (1998) obtained a sub-sample of students in eighth grade from the National Education Longitudinal Study of 1988 (NELS–88) collected by the National Center for Educational Statistics of the U.S. Department of Education.

The students are nested in schools.

Here, we consider the following subset of the variables:

  • schid: school identifier

  • math: continuous measure of achievement in mathematics (standardized to have a mean of 50 and a standard deviation of 10)

  • homework: number of hours of homework done per week

1.2 Data management

library(tidyverse)
library(haven)
fL <- "https://stats.idre.ucla.edu/stat/examples/imm/imm10.dta"
dta <- read_dta(fL)
glimpse(dta)
Rows: 260
Columns: 19
$ schid    <dbl> 7472, 7472, 7472, 7472, 7472, 7472, 7472, 7472, 7472, 7472,…
$ stuid    <dbl> 3, 8, 13, 17, 27, 28, 30, 36, 37, 42, 52, 53, 61, 64, 72, 8…
$ ses      <dbl> -0.13, -0.39, -0.80, -0.72, -0.74, -0.58, -0.83, -0.51, -0.…
$ meanses  <dbl> -0.48261, -0.48261, -0.48261, -0.48261, -0.48261, -0.48261,…
$ homework <dbl> 1, 0, 0, 1, 2, 1, 5, 1, 1, 2, 1, 1, 1, 2, 1, 4, 1, 2, 1, 1,…
$ white    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ parented <dbl> 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 1, 2, 3, 3, 1, 3, 3,…
$ public   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ ratio    <dbl> 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,…
$ percmin  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ math     <dbl> 48, 48, 53, 42, 43, 57, 33, 64, 36, 56, 48, 48, 44, 35, 50,…
$ sex      <dbl> 2, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1,…
$ race     <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,…
$ sctype   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ cstr     <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
$ scsize   <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ urban    <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
$ region   <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
$ schnum   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
# make schid a factor type 
dta$schid <- factor(dta$schid)

1.3 OLS regression lines over 10 schools

coef(m0 <- nlme::lmList(math ~ homework | schid, data=dta))
      (Intercept) homework
7472       50.684  -3.5538
7829       49.012  -2.9201
7930       38.750   7.9091
24725      34.394   5.5927
25456      53.939  -4.7184
25642      49.259  -2.4861
62821      59.210   1.0946
68448      36.055   6.4963
68493      38.520   5.8600
72292      37.714   6.3351

1.4 Visualization

ggplot(data=dta, aes(homework, math, color=schid)) +
  geom_point(alpha=0.5) +
  stat_smooth(aes(group=1), method="lm", formula=y~x) +
  stat_smooth(method="lm", formula=y ~ x, se=F) +
  labs(x="Homework (hours per week)",
       y="Math score") +
  guides(color=guide_legend(ncol=2))+
  theme_minimal() +
  theme(legend.position=c(.9,.3))

1.5 A random intercepts and random slopes model

library(lme4)
summary(m1 <- lmer(math ~ homework + (homework | schid), data=dta))
Linear mixed model fit by REML ['lmerMod']
Formula: math ~ homework + (homework | schid)
   Data: dta

REML criterion at convergence: 1764

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.5110 -0.5357  0.0175  0.6121  2.5708 

Random effects:
 Groups   Name        Variance Std.Dev. Corr 
 schid    (Intercept) 69.3     8.33          
          homework    22.5     4.74     -0.81
 Residual             43.1     6.56          
Number of obs: 260, groups:  schid, 10

Fixed effects:
            Estimate Std. Error t value
(Intercept)    44.77       2.74   16.32
homework        2.04       1.55    1.31

Correlation of Fixed Effects:
         (Intr)
homework -0.804

1.6 Random effects

library(lattice)
qqmath(ranef(m1))
$schid

1.7 Residual plot

plot(m1, resid(., scaled=TRUE) ~ fitted(.) | schid, 
     xlab="Fitted values", ylab= "Standardized residuals",
     abline=0, lty=3)

1.8 Normality QQ-plot

qqmath(m1, grid=TRUE)

1.9 The end

2 In-class Exercises 4: Individual correlation vs grouped correlation

# data management and graphics package
library(tidyverse)

2.1 input data

dta <- read.csv("./data/langMath.csv", h=T)

2.2 compute averages by school

dta_a <- dta %>%
        group_by(School) %>%
        summarize(ave_lang = mean(Lang, na.rm=TRUE),
                  ave_arith = mean(Arith, na.rm=TRUE))

2.3 superimpose two plots

ggplot(data=dta, aes(x=Arith, y=Lang)) +
 geom_point(color="skyblue") +
 stat_smooth(method="lm", formula=y ~ x, se=F, col="skyblue") +
 geom_point(data=dta_a, aes(ave_arith, ave_lang), color="steelblue") +
 stat_smooth(data=dta_a, aes(ave_arith, ave_lang),
             method="lm", formula= y ~ x, se=F, color="steelblue") +
 labs(x="Arithmetic score", 
      y="Language score") +
 theme_bw()

2.4 End

3 In-class Exercises 5: The normexam and standLRT by school

3.1 Data management

In this section, we load the exam scores data set, activate the help page for it, and examine the first 6 lines of the data frame object.

# load the package to working directory
library(mlmRev)
# load the data from the package
data(Exam, package="mlmRev")
# invoke help document
?Exam
# view first 6 lines
head(Exam)
  school normexam schgend  schavg      vr     intake standLRT sex type student
1      1  0.26132   mixed 0.16618 mid 50% bottom 25%  0.61906   F  Mxd     143
2      1  0.13407   mixed 0.16618 mid 50%    mid 50%  0.20580   F  Mxd     145
3      1 -1.72388   mixed 0.16618 mid 50%    top 25% -1.36458   M  Mxd     142
4      1  0.96759   mixed 0.16618 mid 50%    mid 50%  0.20580   F  Mxd     141
5      1  0.54434   mixed 0.16618 mid 50%    mid 50%  0.37111   F  Mxd     138
6      1  1.73490   mixed 0.16618 mid 50% bottom 25%  2.18944   M  Mxd     155

3.2 Summary statistics

We first compute the correlation coefficient between exam scores and LRT scores using all students in the data set. We then compute the mean exam scores and mean LRT scores by school. The correlation coefficient between mean school exam scores and mean school LRT scores is then calculated.

with(Exam, cor(normexam, standLRT))
[1] 0.59165
# compute the means by school
mLRT <- with(Exam, tapply(standLRT, school, mean))
mexam <- with(Exam, tapply(normexam, school, mean))
cor(mexam, mLRT)
[1] 0.69241

3.3 Visualization

We draw a scatter diagram of the exam scores against the LRT scores and add the regression line. Next we superimpose on the scatter plot the mean school exam scores and mean school LRT scores (in color cyan) and add the regression line based on the mean school scores.

with(Exam, plot(normexam ~ standLRT, 
                bty = 'n', 
                cex = 0.5,
                xlab = 'Standardized LR test score', 
                ylab = 'Normalized exam score'))
grid()
with(Exam, abline(lm(normexam ~ standLRT)))
points(mLRT, mexam, pch=16, col=5)
abline(lm(mexam ~ mLRT), col=5)

3.4 The end

4 In-class Exercises 6:

4.1 Data management

# load the data from the package
data(Gcsemv, package="mlmRev")
# invoke help document
?Gcsemv
# view first 6 lines
head(Gcsemv)
  school student gender written course
1  20920      16      M      23     NA
2  20920      25      F      NA   71.2
3  20920      27      F      39   76.8
4  20920      31      F      36   87.9
5  20920      42      M      16   44.4
6  20920      62      F      36     NA

4.2 Summary statistics

with(Gcsemv, cor(written, course, use="pairwise"))
[1] 0.47417
# compute the means by school
course_schavg <- with(Gcsemv, tapply(course, school, mean, na.rm=T))
written_schavg <- with(Gcsemv, tapply(written, school, mean, na.rm=T))
cor(course_schavg, written_schavg)
[1] 0.39568

4.3 Visualization

library(tidyverse)
dta <- Gcsemv %>% 
  group_by(school) %>%
  mutate(r_sch = cor(course, written, use="pairwise")) 
dtar <- dta[!duplicated(dta$school),"r_sch"]
ggplot(data=dtar, aes(r_sch)) +
  geom_histogram(fill="skyblue") +
  geom_vline(xintercept=c(0.39568, 0.47414), 
             col=c("peru","red"), 
             lty=c(1,3)) +
  labs(x="Estimated correlation coefficients",
       y="Counts") +
  theme_bw()

A: The peru vertical line is the average correlation and the red line is the individual correlation.

4.4 The end

5 In-class Exercises 7

5.1 Introduction

Ten subjects were played tones at each of 5 loudnesses, in random order. Subjects were asked to draw a line on paper whose length matched the loudness of the tone. Each subject repeated each loudness 3 times, for a total of 30 trials per subject. Here is the mean of the 3 log-lengths for each loudness, the sd of the three log-lengths, and the number of replications, which is always 3.

A data frame with 50 observations on the following 5 variables.

  • subject
  • a factor with unique values for each subject

  • loudness
  • either 50, 60, 70, 80 or 90 db. Decibels are a logrithmic scale

  • y
  • a numeric vector giving the mean of the log-lengths of three lines drawn.

  • sd
  • a numeric vector, giving the sd of the three log lengths

  • n
  • a numeric vector, equal to the constant value 3

5.2 Data management

#install.packages("alr4")
library(alr4)
data(Stevens, package="alr4")
library(tidyverse)
library(lme4)
glimpse(Stevens)
Rows: 50
Columns: 5
$ subject  <fct> A, A, A, A, A, B, B, B, B, B, C, C, C, C, C, D, D, D, D, D,…
$ loudness <dbl> 50, 60, 70, 80, 90, 50, 60, 70, 80, 90, 50, 60, 70, 80, 90,…
$ y        <dbl> 0.379, 1.727, 3.016, 3.924, 4.413, 0.260, 0.833, 2.009, 2.7…
$ sd       <dbl> 0.507, 0.904, 0.553, 0.363, 0.092, 0.077, 0.287, 0.396, 0.1…
$ n        <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…

5.3 OLS regression lines over 10 subjects

coef(m0 <- nlme::lmList(y ~ loudness | subject, data=Stevens))
  (Intercept) loudness
A     -4.4937  0.10265
B     -4.5853  0.09355
C     -2.2936  0.06950
D     -2.9194  0.07506
E     -5.3851  0.10391
F     -2.4212  0.07740
G     -2.8109  0.06497
H     -3.4207  0.08223
I     -1.9487  0.06561
J     -1.7591  0.05525

5.4 Visualization

ggplot(data=Stevens, aes(x=loudness, y=y)) +
  geom_point(alpha=0.5) +
  stat_smooth(method="lm", formula=y ~ x, se=F) +
  facet_grid(. ~ subject) +
  labs(x="Loudness (in db)",
       y="Mean Line length (in log)") +
  theme_minimal() 

5.5 A random intercepts model

summary(m1 <- lmer(y ~ loudness + (1 | subject), data=Stevens))
Linear mixed model fit by REML ['lmerMod']
Formula: y ~ loudness + (1 | subject)
   Data: Stevens

REML criterion at convergence: 57.1

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.8081 -0.5060  0.0184  0.5530  1.7721 

Random effects:
 Groups   Name        Variance Std.Dev.
 subject  (Intercept) 0.1428   0.378   
 Residual             0.0986   0.314   
Number of obs: 50, groups:  subject, 10

Fixed effects:
            Estimate Std. Error t value
(Intercept) -3.20377    0.25405   -12.6
loudness     0.07901    0.00314    25.2

Correlation of Fixed Effects:
         (Intr)
loudness -0.865

5.6 Random effects

library(lattice)
qqmath(ranef(m1))
$subject

5.7 Residual plot

plot(m1, resid(., scaled=TRUE) ~ fitted(.) | subject, 
     xlab="Fitted values", ylab= "Standardized residuals",
     abline=0, lty=3)

5.8 Normality QQ-plot

qqmath(m1, grid=TRUE)

5.9 The end