1 Introduction

We use data on 1721 students nested within 60 urban public primary schools. The outcome of interest is mathematics achievement. The data were collected at the end of first grade and annually thereafter up to sixth grade, but not all students have six observations.

# read spss format and covert it to data frame 
dta <- foreign::read.spss("C:/Users/lilia/Desktop/Multi-level analysis/eg_hlm.sav") %>% as.data.frame

# check data structure
str(dta)

'data.frame':   7230 obs. of  12 variables:
 $ size    : num  380 380 380 380 380 380 380 380 380 380 ...
 $ lowinc  : num  40.3 40.3 40.3 40.3 40.3 40.3 40.3 40.3 40.3 40.3 ...
 $ mobility: num  12.5 12.5 12.5 12.5 12.5 12.5 12.5 12.5 12.5 12.5 ...
 $ female  : num  0 0 0 0 0 0 0 0 0 0 ...
 $ black   : num  0 0 0 0 0 0 0 0 0 0 ...
 $ hispanic: num  1 1 1 0 0 0 0 0 1 1 ...
 $ year    : num  0.5 1.5 2.5 -1.5 -0.5 0.5 1.5 2.5 -1.5 -0.5 ...
 $ grade   : num  2 3 4 0 1 2 3 4 0 1 ...
 $ math    : num  1.146 1.134 2.3 -1.303 0.439 ...
 $ retained: num  0 0 0 0 0 0 0 0 0 0 ...
 $ school  : Factor w/ 60 levels "        2020",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ cid     : Factor w/ 1721 levels "   101480302",..: 244 244 244 248 248 248 248 248 253 253 ...

with(dta, table(school)) |> quantile()

    0%    25%    50%    75%   100% 
 18.00  66.75 103.00 162.00 387.00

with(dta, table(cid)) |> range()

[1] 2 6

We have 60 schools with a median of 103 students for a total of 1721 children, who have between 2 and 6 observations each for a total of 7230 observations.

2 Variance components

We have specified two random part, one at the school level (level 3), and one at the child-level (level 2). They are both intercept-only, adding an error term at each level.

dta <- dta %>% group_by(school) %>% mutate(nc=n()) %>% ungroup()

ggplot(subset(dta, nc > 200), aes(reorder(cid, math, mean), math))+
  stat_summary(fun.data="mean_cl_boot", size = rel(.2))+
  facet_wrap(.~ school)+
  coord_flip()+
  labs(x="Subject ID",
       y="Mean math score")+
  theme_minimal()+
  theme(axis.text.y=element_blank(),
        axis.ticks.y=element_blank())

ggplot(dta, aes(reorder(school, math, mean), math))+
  stat_summary(fun.data="mean_cl_boot", size = rel(.2))+
  coord_flip()+
  labs(x="School ID",
       y="Mean math score")+
  theme_minimal()

m0 <- lme4::lmer(math ~ 1 + (1 | cid) + (1 | school), data=dta)
summary(m0, corr=FALSE)

Linear mixed model fit by REML ['lmerMod']
Formula: math ~ 1 + (1 | cid) + (1 | school)
   Data: dta

REML criterion at convergence: 25309.2

Scaled residuals: 
   Min     1Q Median     3Q    Max 
-2.674 -0.658  0.026  0.674  3.334 

Random effects:
 Groups   Name        Variance Std.Dev.
 cid      (Intercept) 0.570    0.755   
 school   (Intercept) 0.324    0.569   
 Residual             1.524    1.234   
Number of obs: 7230, groups:  cid, 1721; school, 60

Fixed effects:
            Estimate Std. Error t value
(Intercept)  -0.5100     0.0787   -6.48

The estimated mean math score is -0.51 and the variances are 0.32, 0.57 and 1.52 at the school, child and observation levels.

Most of the variance comes from the measurements (within the same child), whereas the variance in the child level is greater than that at the school level. These results indicate fairly significant variation in math achievement among children and in mean math among schools.

# by hand
v <- c(sigma(m0)^2, unlist(VarCorr(m0)))
v[3]/(v[1] + v[2] + v[3])

  school 
0.134074

(v[2] + v[3])/(v[1] + v[2] + v[3])

     cid 
0.369935

# use package
performance::icc(m0, by_group=TRUE)

# ICC by Group

Group  |   ICC
--------------
cid    | 0.236
school | 0.134

At the school level, the intra-class correlation is 0.13; 0.37, at the child level. These values indicate the correlation in math achievement between two children in the same school, and between two measurements on the same child (in the same school). It means that 13% of the variation in math achievement can be attributed to attending the same schools and 37% to the children themselves (which includes being in the same school).

3 Effect of time (measurement level)

The two variables, year and grade, are the same but coded differently. Year zero is near the start of the third grade, and can be referred to as �勇hird grade�� for simplicity.

with(dta, table(grade, year))

     year
grade -2.5 -1.5 -0.5  0.5  1.5  2.5
    0  131 1346  104    1    0    0
    1    0    0 1412  194    5    1
    2    0    0    4 1470  174    5
    3    0    0    0    7 1200  149
    4    0    0    0    0    8 1010
    5    0    0    0    0    0    9

ggplot(subset(dta, nc < 67), aes(year, math, group=cid))+
  facet_wrap(. ~ school)+
  stat_smooth(method='lm', formula=y~x, se=F, size=rel(.5), alpha=.5)+
  geom_point(pch=1, size=rel(.5)) +
  labs(x="Year (centered)",
       y="Math score")+
  theme_minimal()

ggplot(dta, aes(year, math, group=school))+
  stat_smooth(method='lm', formula=y~x, se=F, col=8,
              size=rel(.5), alpha=.5)+
  geom_point(pch=1, size=rel(.5)) +
  labs(x="Year (centered)",
       y="Math score",
       subtitle='School level')+
  theme_minimal()

3.1 Problem

Plot the relationship between math score and year of age for students in those schools with number of students in the upper quartile.

We fit a model with random intercepts and slopes, with no predictors by child or school. The variable year is at the observation level.

ggplot(dta, aes(year, math, group=school))+
  geom_point(alpha=.5)+
  stat_smooth(method='lm', formula=y~x, se=FALSE)+
  stat_smooth(aes(x=year, y=math, group=1), method = "lm", size=rel(.5), color="red")+
  labs(x="Year of age(centered)",
       y="Math score")+
  theme_minimal()

m1 <- lme4::lmer(math ~ year + (1 + year | cid) + (1 + year | school), data=dta)
summary(m1, corr=FALSE)

Linear mixed model fit by REML ['lmerMod']
Formula: math ~ year + (1 + year | cid) + (1 + year | school)
   Data: dta

REML criterion at convergence: 16336.7

Scaled residuals: 
   Min     1Q Median     3Q    Max 
-3.205 -0.564 -0.037  0.531  5.260 

Random effects:
 Groups   Name        Variance Std.Dev. Corr
 cid      (Intercept) 0.6405   0.800        
          year        0.0113   0.106    0.55
 school   (Intercept) 0.1686   0.411        
          year        0.0113   0.106    0.40
 Residual             0.3014   0.549        
Number of obs: 7230, groups:  cid, 1721; school, 60

Fixed effects:
            Estimate Std. Error t value
(Intercept)  -0.7792     0.0583   -13.4
year          0.7631     0.0154    49.6

The fixed-effect estimates yield an intercept of -0.78 and a slope of 0.76. These values can be interpreted as the average achievement in third grade, and the average rate of change per year. Average math achievement clearly increases significantly over time.

The intercepts and slopes have correlations of 0.40 at the school level and 0.55 at the child level. The latter can be interpreted as the correlation between a child�䏭 achievement around third grade and her rate of growth; the former, the correlation between a school�䏭 math achievement around third grade and the rate of change per year.

Most of the variance in the intercept comes from the children, whereas the variance in the slope is about the same at each level. These results indicate fairly significant variation among schools in mean achievement around third grade, as well as in the rates of change over time.

4 Effects of ethnicity (child) and income (school)

One way to look for school or individual characteristics that may be associated with levels and trends in mathematics achievement is to plot or regress the estimates of intercepts and slopes on potential predictors at each level.

# recode ethnicity from indicator variables into one factor variable with
# four levels with white as the reference 
dta <- mutate(dta, 
              eth = factor(1 + black + 2 * hispanic,
                           labels=c("White", "Black", "Hispanic")))
with(dta, table(eth))

eth
   White    Black Hispanic 
    1229     4977     1024

ggplot(subset(dta, nc > 167), aes(year, math, group=cid))+
  facet_wrap(eth ~ .)+
  stat_smooth(method='lm', formula=y~x, se=F, size=rel(.5), alpha=.5, col=8)+
  geom_point(pch=1, size=rel(.5)) +
  labs(x="Year (centered)",
       y="Math score")+
  theme_minimal()

4.1 Problem

Explore the effect of income (school level) on average math achievement with plots

if(!require('merTools'))
  install.packages('merTools')
  library('merTools')


m2 <- lme4::lmer(math ~ (year | school/cid), data=dta)
summary(m1, corr=FALSE)

Linear mixed model fit by REML ['lmerMod']
Formula: math ~ year + (1 + year | cid) + (1 + year | school)
   Data: dta

REML criterion at convergence: 16336.7

Scaled residuals: 
   Min     1Q Median     3Q    Max 
-3.205 -0.564 -0.037  0.531  5.260 

Random effects:
 Groups   Name        Variance Std.Dev. Corr
 cid      (Intercept) 0.6405   0.800        
          year        0.0113   0.106    0.55
 school   (Intercept) 0.1686   0.411        
          year        0.0113   0.106    0.40
 Residual             0.3014   0.549        
Number of obs: 7230, groups:  cid, 1721; school, 60

Fixed effects:
            Estimate Std. Error t value
(Intercept)  -0.7792     0.0583   -13.4
year          0.7631     0.0154    49.6

library(lme4); library(merTools)
m2_re <- REsim(m2)
plotREsim(m2_re)

5 References

Raudenbush, S., Byrk, A., et al. (2004). HLM 6: Hierarchical Linear and Nonlinear Modeling. Chapter 4. Scientific Software International.

Rodr穩guez, G. Multilevel Models. 3-Level Model

0924Homework-1

611315016

2022-09-30