1 Exercise 3

Reproduce the results in the homework example using the rmarkdown file and publish it with a web link on the course moodle page.

1.1 Introduction

Kreft and de Leeuw (1998) obtained a sub-sample of students in eighth grade from the National Education Longitudinal Study of 1988 (NELS–88) collected by the National Center for Educational Statistics of the U.S. Department of Education.

The students are nested in schools.

Here, we consider the following subset of the variables:

  • schid: school identifier

  • math: continuous measure of achievement in mathematics (standardized to have a mean of 50 and a standard deviation of 10)

  • homework: number of hours of homework done per week

1.2 Data management

# Attach the packages used in this exercise: tidyverse (pipes, dplyr,
# ggplot2) and haven (reader for Stata .dta files).
library(tidyverse)
library(haven)

# Remote location of the Stata data file.
fL <- "https://stats.idre.ucla.edu/stat/examples/imm/imm10.dta"

# Download and parse the file, then print a compact column-by-column overview.
dta <- haven::read_dta(file = fL)
dplyr::glimpse(dta)
Observations: 260
Variables: 19
$ schid    <dbl> 7472, 7472, 7472, 7472, 7472, 7472, 7472, 7472, 7472,...
$ stuid    <dbl> 3, 8, 13, 17, 27, 28, 30, 36, 37, 42, 52, 53, 61, 64,...
$ ses      <dbl> -0.13, -0.39, -0.80, -0.72, -0.74, -0.58, -0.83, -0.5...
$ meanses  <dbl> -0.48261, -0.48261, -0.48261, -0.48261, -0.48261, -0....
$ homework <dbl> 1, 0, 0, 1, 2, 1, 5, 1, 1, 2, 1, 1, 1, 2, 1, 4, 1, 2,...
$ white    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
$ parented <dbl> 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 1, 2, 3, 3, 1,...
$ public   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
$ ratio    <dbl> 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 1...
$ percmin  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ math     <dbl> 48, 48, 53, 42, 43, 57, 33, 64, 36, 56, 48, 48, 44, 3...
$ sex      <dbl> 2, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2,...
$ race     <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,...
$ sctype   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
$ cstr     <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,...
$ scsize   <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
$ urban    <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,...
$ region   <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,...
$ schnum   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
# Store the school identifier as a factor (categorical) instead of a number,
# so it can serve as a grouping variable.
dta <- dta %>%
  mutate(schid = factor(schid))

1.2.1 OLS regression lines over 10 schools

# Fit a separate least-squares regression of math on homework within each
# school, keep the list of fits as m0, then print the per-school coefficients.
m0 <- nlme::lmList(math ~ homework | schid, data = dta)
coef(m0)
      (Intercept) homework
7472       50.684  -3.5538
7829       49.012  -2.9201
7930       38.750   7.9091
24725      34.394   5.5927
25456      53.939  -4.7184
25642      49.259  -2.4861
62821      59.210   1.0946
68448      36.055   6.4963
68493      38.520   5.8600
72292      37.714   6.3351

1.3 Visualization

# Scatter plot of math score against weekly homework hours, coloured by school.
# Two sets of least-squares lines are overlaid:
#   * the layer with aes(group = 1) treats all students as one group and
#     draws a single pooled line;
#   * the second layer inherits color = schid and draws one line per school,
#     without confidence bands (se = FALSE).
# FALSE is spelled out because the alias F is an ordinary, reassignable
# variable.
ggplot(data = dta, aes(homework, math, color = schid)) +
  geom_point(alpha = 0.5) +
  stat_smooth(aes(group = 1), method = "lm", formula = y ~ x) +
  stat_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(x = "Homework (hours per week)",
       y = "Math score") +
  guides(color = guide_legend(ncol = 2)) +
  theme_minimal() +
  # Legend drawn inside the panel at relative coordinates (0.9, 0.3).
  # NOTE(review): ggplot2 >= 3.5.0 deprecates a numeric legend.position in
  # favour of legend.position = "inside" plus legend.position.inside; kept
  # as-is for compatibility with older ggplot2 -- confirm the target version.
  theme(legend.position = c(.9, .3))

1.4 A random intercepts and random slopes model

library(lme4)

# Linear mixed model for math score: a fixed effect of homework plus, for
# each school, a random intercept and a random homework slope.
m1 <- lmer(math ~ homework + (homework | schid), data = dta)

# Print the REML fit: variance components and fixed-effect estimates.
summary(m1)
Linear mixed model fit by REML ['lmerMod']
Formula: math ~ homework + (homework | schid)
   Data: dta

REML criterion at convergence: 1764

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.5111 -0.5357  0.0175  0.6121  2.5708 

Random effects:
 Groups   Name        Variance Std.Dev. Corr 
 schid    (Intercept) 69.3     8.33          
          homework    22.5     4.74     -0.81
 Residual             43.1     6.56          
Number of obs: 260, groups:  schid, 10

Fixed effects:
            Estimate Std. Error t value
(Intercept)    44.77       2.74   16.32
homework        2.04       1.55    1.31

Correlation of Fixed Effects:
         (Intr)
homework -0.804

1.5 Random effects

library(lattice)

# Pull the estimated school-level random effects out of m1 and draw their
# normal quantile plot.
re_school <- ranef(m1)
qqmath(re_school)
$schid

1.6 Residual plot

# Residual diagnostics for m1, one panel per school: scaled residuals
# (vertical axis) against fitted values (horizontal axis).
# The `.` in the formula stands for the fitted model and is resolved by the
# plot method itself, so the call is kept exactly as written.
# abline=0 requests a reference line at zero; lty=3 is passed on to the
# plotting code as the line type (3 = dotted in base R's numbering).
plot(m1, resid(., scaled=TRUE) ~ fitted(.) | schid, 
     xlab="Fitted values", ylab= "Standardized residuals",
     abline=0, lty=3)

1.7 Normality QQ-plot

# Normal quantile-quantile plot for the fitted model m1, with a background
# grid, to check the normality assumption.
qqmath(m1, grid=TRUE)

1.8 The end

2 Exercise 4

Download the data file from the language and math example first to a data folder. In Rstudio, open the R script and compile a report in html directly or use the command

knitr::spin("foo.R", knit=FALSE)

to render it as an RMD file format first.

2.1 Individual correlation vs grouped correlation

2.1.1 data management and graphics package

library(tidyverse)

2.1.2 input data

# Read the language/arithmetic scores from the local data folder; the first
# line of the file holds the column names.
# `header = TRUE` is written out in full: the original `h=T` relied on
# partial argument matching and on `T`, which is an ordinary variable that
# can be reassigned.
dta <- read.csv("~/data/langMath.csv", header = TRUE)

2.1.3 compute averages by school

# One row per School holding the mean language and mean arithmetic score;
# missing scores are ignored when averaging.
dta_a <- summarise(
  group_by(dta, School),
  ave_lang = mean(Lang, na.rm = TRUE),
  ave_arith = mean(Arith, na.rm = TRUE)
)

2.1.4 superimpose two plots

# Two scatter plots on one set of axes, each with its least-squares line:
#   * individual students (sky blue), from dta;
#   * school averages (steel blue), from dta_a.
# Comparing the two slopes contrasts the individual-level association with
# the school-level one.
# se = FALSE replaces se=F (F is a reassignable alias), and `color` is used
# throughout instead of mixing it with the `col` abbreviation.
ggplot(data = dta, aes(x = Arith, y = Lang)) +
  geom_point(color = "skyblue") +
  stat_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "skyblue") +
  geom_point(data = dta_a, aes(ave_arith, ave_lang), color = "steelblue") +
  stat_smooth(data = dta_a, aes(ave_arith, ave_lang),
              method = "lm", formula = y ~ x, se = FALSE,
              color = "steelblue") +
  labs(x = "Arithmetic score",
       y = "Language score") +
  theme_bw()

3 Exercise 5

Draw a scatter plot of the variable normexam against the variable standLRT from the data set Exam{mlmRev}. Superimpose on it another scatter plot using the means of the same two variables by school. You can start by editing this rmarkdown template in RStudio before submitting the output to moodle.

Please see the result here.

4 Exercise 6

Edit this rmarkdown template in RStudio to reproduce the following histogram for correlation coefficients between written and course variables by school from the data set Gcsemv{mlmRev}. The two vertical lines indicate averaged correlations over schools and correlation computed over individuals ignoring school label. Which is which?

histogram

histogram

4.1 Data management

# One-time setup (note the plural function name): install.packages("mlmRev")
library(mlmRev)
# load the Gcsemv data set shipped with the mlmRev package
data(Gcsemv, package="mlmRev")
# open the help page describing the data set
?Gcsemv
# show the first 6 rows
head(Gcsemv)
  school student gender written course
1  20920      16      M      23     NA
2  20920      25      F      NA   71.2
3  20920      27      F      39   76.8
4  20920      31      F      36   87.9
5  20920      42      M      16   44.4
6  20920      62      F      36     NA

4.2 Summary statistics

# Correlation between written and course scores over all students, ignoring
# the school label; a student missing either score is left out of the pair.
cor(Gcsemv$written, Gcsemv$course, use = "pairwise.complete.obs")
[1] 0.47417
# Mean course and written score for each school, ignoring missing scores.
# na.rm = TRUE is spelled out because the alias T is an ordinary variable
# that can be reassigned.
course_schavg <- with(Gcsemv, tapply(course, school, mean, na.rm = TRUE))
written_schavg <- with(Gcsemv, tapply(written, school, mean, na.rm = TRUE))
# Correlation between the two vectors of school means.
cor(course_schavg, written_schavg)
[1] 0.39568

4.3 Visualization

library(tidyverse)

# Correlation between course and written scores within each school. mutate()
# repeats the school's value on every student row, so the grouping is
# dropped afterwards and one row per school is kept for the histogram.
dta <- Gcsemv %>%
  group_by(school) %>%
  mutate(r_sch = cor(course, written, use = "pairwise.complete.obs")) %>%
  ungroup()
dtar <- distinct(dta, school, r_sch)

# Reference values for the two vertical lines, computed from the data rather
# than typed in by hand (the hand-typed 0.47414 did not match the computed
# individual-level correlation of 0.47417).
r_individual <- with(
  Gcsemv,
  cor(written, course, use = "pairwise.complete.obs")
)
r_school_means <- cor(
  with(Gcsemv, tapply(course, school, mean, na.rm = TRUE)),
  with(Gcsemv, tapply(written, school, mean, na.rm = TRUE))
)

ggplot(data = dtar, aes(r_sch)) +
  geom_histogram(fill = "skyblue") +
  # Dotted peru line: correlation between school means.
  # Solid black line: correlation over individual students (the template's
  # red was changed to black to match the demonstration figure).
  geom_vline(xintercept = c(r_school_means, r_individual),
             color = c("peru", "black"),
             linetype = c(3, 1)) + # 1 = solid, 3 = dotted
  labs(x = "Estimated correlation coefficients",
       y = "Counts") +
  theme_bw()

圖中二條直線,實線為以樣本個體的course和written的correlation,虛線則為在school的分群下course和written的平均數之間的correlation。

4.4 The end

5 Exercise 7

Reproduce the results of the loudness example by completing the R markdown file

5.1 Introduction

Ten subjects were played tones at each of 5 loudnesses, in random order. Subjects were asked to draw a line on paper whose length matched the loudness of the tone. Each subject repeated each loudness 3 times, for a total of 30 trials per subject. Here is the mean of the 3 log-lengths for each loudness, the sd of the three log-lengths, and the number of replications, which is always 3.

A data frame with 50 observations on the following 5 variables.

  • subject

  • a factor with unique values for each subject

  • loudness

  • either 50, 60, 70, 80 or 90 db. Decibels are a logarithmic scale

  • y

  • a numeric vector giving the mean of the log-lengths of three lines drawn.

  • sd

  • a numeric vector, giving the sd of the three log lengths

  • n

  • a numeric vector, equal to the constant value 3

5.2 Data management

# One-time setup: install.packages("alr4")
# The packages are attached in this order on purpose; the order of library()
# calls decides which package's function wins when names clash.
library(alr4)
# load the Stevens data set shipped with the alr4 package
data(Stevens, package="alr4")
library(tidyverse)
library(lme4)
# compact column-by-column overview of the data
glimpse(Stevens)
Observations: 50
Variables: 5
$ subject  <fct> A, A, A, A, A, B, B, B, B, B, C, C, C, C, C, D, D, D,...
$ loudness <dbl> 50, 60, 70, 80, 90, 50, 60, 70, 80, 90, 50, 60, 70, 8...
$ y        <dbl> 0.379, 1.727, 3.016, 3.924, 4.413, 0.260, 0.833, 2.00...
$ sd       <dbl> 0.507, 0.904, 0.553, 0.363, 0.092, 0.077, 0.287, 0.39...
$ n        <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...

5.3 OLS regression lines over 10 subjects

# Fit a separate least-squares regression of y on loudness for each subject,
# keep the list of fits as m0, then print the per-subject coefficients.
m0 <- nlme::lmList(y ~ loudness | subject, data = Stevens)
coef(m0)
  (Intercept) loudness
A     -4.4937  0.10265
B     -4.5853  0.09355
C     -2.2936  0.06950
D     -2.9194  0.07506
E     -5.3851  0.10391
F     -2.4212  0.07740
G     -2.8109  0.06497
H     -3.4207  0.08223
I     -1.9487  0.06561
J     -1.7591  0.05525

5.4 Visualization

# Mean log line length against loudness, one panel per subject laid out in a
# single row, each with its own least-squares line and no confidence band.
# se = FALSE replaces se=F because the alias F is an ordinary, reassignable
# variable.
ggplot(data = Stevens, aes(x = loudness, y = y)) +
  geom_point(alpha = 0.5) +
  stat_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  facet_grid(. ~ subject) +
  labs(x = "Loudness (in db)",
       y = "Mean Line length (in log)") +
  theme_minimal()

5.5 A random intercepts model

library(lme4)

# Linear mixed model for the mean log line length: a fixed effect of
# loudness plus a random intercept for each subject.
m1 <- lmer(y ~ loudness + (1 | subject), data = Stevens)

# Print the REML fit: variance components and fixed-effect estimates.
summary(m1)
Linear mixed model fit by REML ['lmerMod']
Formula: y ~ loudness + (1 | subject)
   Data: Stevens

REML criterion at convergence: 57.1

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.8081 -0.5060  0.0184  0.5530  1.7721 

Random effects:
 Groups   Name        Variance Std.Dev.
 subject  (Intercept) 0.1428   0.378   
 Residual             0.0986   0.314   
Number of obs: 50, groups:  subject, 10

Fixed effects:
            Estimate Std. Error t value
(Intercept) -3.20377    0.25405   -12.6
loudness     0.07901    0.00314    25.2

Correlation of Fixed Effects:
         (Intr)
loudness -0.865

5.6 Random effects

library(lattice)

# Pull the estimated subject-level random effects out of m1 and draw their
# normal quantile plot.
re_subject <- ranef(m1)
qqmath(re_subject)
$subject

5.7 Residual plot

# Residual diagnostics for m1, one panel per subject: scaled residuals
# (vertical axis) against fitted values (horizontal axis).
# The `.` in the formula stands for the fitted model and is resolved by the
# plot method itself, so the call is kept exactly as written.
# abline=0 requests a reference line at zero; lty=3 is passed on to the
# plotting code as the line type (3 = dotted in base R's numbering).
plot(m1, resid(., scaled=TRUE) ~ fitted(.) | subject, 
     xlab="Fitted values", ylab= "Standardized residuals",
     abline=0, lty=3)

5.8 Normality QQ-plot

# Normal quantile-quantile plot for the fitted model m1, with a background
# grid, to check the normality assumption.
qqmath(m1, grid=TRUE)

5.9 The end