Prepare the data set and Review the research question

These data are a sub-sample from the 1982 High School and Beyond (HS&B) Survey, and include information on 7185 students nested within 160 schools: 90 public and 70 catholic.

# Import the dataset into R
library(haven)
hsb <- read_sav("hsbdataset.sav")
# Take a glance at the first 10 lines of the data
head(hsb,10)
## # A tibble: 10 x 9
##    school student     ses meanses    cses mathach   sector  himinty  female
##     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <dbl+lb> <dbl+lb> <dbl+l>
##  1   1224       1 -1.53    -0.428 -1.10     5.88  0 [publ… 0 [less… 1 [fem…
##  2   1224       2 -0.588   -0.428 -0.160   19.7   0 [publ… 0 [less… 1 [fem…
##  3   1224       3 -0.528   -0.428 -0.1000  20.3   0 [publ… 0 [less… 0 [mal…
##  4   1224       4 -0.668   -0.428 -0.240    8.78  0 [publ… 0 [less… 0 [mal…
##  5   1224       5 -0.158   -0.428  0.270   17.9   0 [publ… 0 [less… 0 [mal…
##  6   1224       6  0.0220  -0.428  0.450    4.58  0 [publ… 0 [less… 0 [mal…
##  7   1224       7 -0.618   -0.428 -0.190   -2.83  0 [publ… 0 [less… 1 [fem…
##  8   1224       8 -0.998   -0.428 -0.570    0.523 0 [publ… 0 [less… 0 [mal…
##  9   1224       9 -0.888   -0.428 -0.460    1.53  0 [publ… 0 [less… 1 [fem…
## 10   1224      10 -0.458   -0.428 -0.0300  21.5   0 [publ… 0 [less… 0 [mal…
# Check the structure of the data set
str(hsb)
## Classes 'tbl_df', 'tbl' and 'data.frame':    7185 obs. of  9 variables:
##  $ school : num  1224 1224 1224 1224 1224 ...
##   ..- attr(*, "format.spss")= chr "F10.0"
##  $ student: num  1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "format.spss")= chr "F9.2"
##  $ ses    : num  -1.528 -0.588 -0.528 -0.668 -0.158 ...
##   ..- attr(*, "format.spss")= chr "F9.2"
##  $ meanses: num  -0.428 -0.428 -0.428 -0.428 -0.428 ...
##   ..- attr(*, "format.spss")= chr "F9.2"
##  $ cses   : num  -1.1 -0.16 -0.1 -0.24 0.27 ...
##   ..- attr(*, "format.spss")= chr "F9.2"
##  $ mathach: num  5.88 19.71 20.35 8.78 17.9 ...
##   ..- attr(*, "format.spss")= chr "F9.2"
##  $ sector : 'haven_labelled' num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.spss")= chr "F9.0"
##   ..- attr(*, "labels")= Named num  0 1
##   .. ..- attr(*, "names")= chr  "public" "catholic"
##  $ himinty: 'haven_labelled' num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.spss")= chr "F9.0"
##   ..- attr(*, "labels")= Named num  0 1
##   .. ..- attr(*, "names")= chr  "less than 40% minority enrollment" "more than 40% minority enrollment"
##  $ female : 'haven_labelled' num  1 1 0 0 0 0 1 0 1 0 ...
##   ..- attr(*, "format.spss")= chr "F9.0"
##   ..- attr(*, "labels")= Named num  0 1
##   .. ..- attr(*, "names")= chr  "male" "female"
# Check for the general summaries for each variable
summary(hsb)
##      school        student           ses               meanses         
##  Min.   :1224   Min.   : 1.00   Min.   :-3.758000   Min.   :-1.188000  
##  1st Qu.:3020   1st Qu.:12.00   1st Qu.:-0.538000   1st Qu.:-0.317000  
##  Median :5192   Median :23.00   Median : 0.002000   Median : 0.038000  
##  Mean   :5278   Mean   :24.51   Mean   : 0.000143   Mean   : 0.006138  
##  3rd Qu.:7342   3rd Qu.:36.00   3rd Qu.: 0.602000   3rd Qu.: 0.333000  
##  Max.   :9586   Max.   :67.00   Max.   : 2.692000   Max.   : 0.831000  
##       cses              mathach           sector          himinty    
##  Min.   :-3.657000   Min.   :-2.832   Min.   :0.0000   Min.   :0.00  
##  1st Qu.:-0.454000   1st Qu.: 7.275   1st Qu.:0.0000   1st Qu.:0.00  
##  Median : 0.010000   Median :13.131   Median :0.0000   Median :0.00  
##  Mean   :-0.005995   Mean   :12.748   Mean   :0.4931   Mean   :0.28  
##  3rd Qu.: 0.463000   3rd Qu.:18.317   3rd Qu.:1.0000   3rd Qu.:1.00  
##  Max.   : 2.850000   Max.   :24.993   Max.   :1.0000   Max.   :1.00  
##      female      
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :1.0000  
##  Mean   :0.5282  
##  3rd Qu.:1.0000  
##  Max.   :1.0000
# Students nested within schools
# Three student-level variabels (Level-1): mathach, ses, gender.
# Three school-level variables (Level-2): sector, meanses, himinty.
# Load the statistical package that we needed for the multilevel regression
library(nlme)

A quick Review of the Research Questions:

Section 1. Centering for continuous X

# Bascially, we need two centered variable. One is the centered grand mean, the other is the centered group mean.
# Find the general ses mean value frist.
ses_mean <- mean(hsb$ses)
# Build a new variable called Grand_CSES
hsb$Grand_CSES <- hsb$ses-ses_mean
# Now, we need to get the group centered mean for each school
# We need the group_by function in "dplyr" package.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:nlme':
## 
##     collapse
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Calculate the Group Mean within each school.
newhsb <- hsb %>%
    group_by(school) %>%
    mutate(Group_mean = mean(as.numeric(as.character(ses))))
# Add a new variable as Group_CSES
newhsb$Group_CSES <- newhsb$ses - newhsb$Group_mean

Section 2: Q1-How much do U.S. high schools vary in their mean math achievement?

# This is our unconditional model
newhsb$school <-as.factor(newhsb$school)
UnconditionalModel<- lme(mathach~1,random=~1|school,data=newhsb,method="ML")
summary(UnconditionalModel)
## Linear mixed-effects model fit by maximum likelihood
##  Data: newhsb 
##        AIC      BIC    logLik
##   47121.81 47142.45 -23557.91
## 
## Random effects:
##  Formula: ~1 | school
##         (Intercept) Residual
## StdDev:    2.924631 6.256868
## 
## Fixed effects: mathach ~ 1 
##                Value Std.Error   DF  t-value p-value
## (Intercept) 12.63707 0.2436341 7025 51.86905       0
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -3.06261752 -0.75364531  0.02675562  0.76070102  2.74183820 
## 
## Number of Observations: 7185
## Number of Groups: 160
VarCorr(UnconditionalModel) # A parameter estimate of τ_00 is 8.55
## school = pdLogChol(1) 
##             Variance  StdDev  
## (Intercept)  8.553464 2.924631
## Residual    39.148400 6.256868
logLik(UnconditionalModel)*-2 # -2LL is equal to 47115.81, df=3
## 'log Lik.' 47115.81 (df=3)
# Load our ICC calculation function
ICC <- function(out) {
  varests <- as.numeric(VarCorr(out)[1:nrow(VarCorr(out))])
  return(paste("ICC=",varests[1]/ (varests[1]+varests[-1])))
}
Grand_mean <- mean(newhsb$mathach) # The grand mean is 12.64.
# Calculate the ICC for unconditional model
ICC(UnconditionalModel) # ICC is 17.93%
## [1] "ICC= 0.179310896530165"

Answering Q1:

Section 3: Q2-Do schools with high MEAN SES also have high math achievement?

# To address this question, we need to build a fixed intercept model.
Q2_model <- lme(mathach~meanses,random=~1|school,data=newhsb,method="ML")
summary(Q2_model)
## Linear mixed-effects model fit by maximum likelihood
##  Data: newhsb 
##        AIC      BIC    logLik
##   46967.11 46994.63 -23479.55
## 
## Random effects:
##  Formula: ~1 | school
##         (Intercept) Residual
## StdDev:    1.610337 6.257581
## 
## Fixed effects: mathach ~ meanses 
##                 Value Std.Error   DF  t-value p-value
## (Intercept) 12.649741 0.1483385 7025 85.27619       0
## meanses      5.862922 0.3591754  158 16.32328       0
##  Correlation: 
##         (Intr)
## meanses -0.004
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -3.13467044 -0.75260775  0.02335172  0.76835668  2.78398911 
## 
## Number of Observations: 7185
## Number of Groups: 160
VarCorr(Q2_model) # A parameter estimate of τ_00 is 2.59
## school = pdLogChol(1) 
##             Variance  StdDev  
## (Intercept)  2.593186 1.610337
## Residual    39.157321 6.257581
logLik(Q2_model)*-2 # -2LL is equal to 46959.11, df=4
## 'log Lik.' 46959.11 (df=4)
# Calculate the ICC for Q2 model
ICC(Q2_model) # ICC is 6.21%
## [1] "ICC= 0.0621114852569335"
# Compare the Q2_model with unconditional model
anova(UnconditionalModel,Q2_model)
##                    Model df      AIC      BIC    logLik   Test  L.Ratio
## UnconditionalModel     1  3 47121.81 47142.45 -23557.90                
## Q2_model               2  4 46967.11 46994.63 -23479.55 1 vs 2 156.7017
##                    p-value
## UnconditionalModel        
## Q2_model            <.0001

Answering Q2:

Section 4: Q3-Is the strength of association between student CSES and math achievement similar across schools? Or is CSES a better predictor of student math achievement in some schools than others?

# To address this question, we need to build a random intercept and slope model.
Q3_model <- lme(mathach~cses,random=~cses|school,data=newhsb,method="ML")
summary(Q3_model)
## Linear mixed-effects model fit by maximum likelihood
##  Data: newhsb 
##        AIC      BIC    logLik
##   46722.98 46764.26 -23355.49
## 
## Random effects:
##  Formula: ~cses | school
##  Structure: General positive-definite, Log-Cholesky parametrization
##             StdDev    Corr  
## (Intercept) 2.9361736 (Intr)
## cses        0.8235253 0.021 
## Residual    6.0580570       
## 
## Fixed effects: mathach ~ cses 
##                 Value Std.Error   DF  t-value p-value
## (Intercept) 12.649431 0.2437717 7024 51.89049       0
## cses         2.193148 0.1278639 7024 17.15221       0
##  Correlation: 
##      (Intr)
## cses 0.013 
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -3.09688177 -0.73200318  0.01795852  0.75445130  2.89904649 
## 
## Number of Observations: 7185
## Number of Groups: 160
# Estimate of γ_00 = 12.65, which is the average math achievement across the population of schools.
# Estimate of γ_10 = 2.19, which is the average SES-math slope across those schools.
VarCorr(Q3_model) 
## school = pdLogChol(cses) 
##             Variance  StdDev    Corr  
## (Intercept)  8.621115 2.9361736 (Intr)
## cses         0.678194 0.8235253 0.021 
## Residual    36.700055 6.0580570
# Estimate of τ_00 is 8.62. This is the population variance among the Level-2 unit means.
# Estimate of τ_11 is 0.68. This is the population variance among the slope.
# Estimate of σ^2 is 36.70. This is the variance among the Level-1.
# In R, we can simply use the anova function to our model. It provides us the F-value and P value of the intercept which is the same with the wald Z test. 
anova(Q3_model)
##             numDF denDF   F-value p-value
## (Intercept)     1  7024 2670.8307  <.0001
## cses            1  7024  294.1983  <.0001
# From the result, we could tell the esitmated variance among the mean is τ_00 is 8.62, F=2670.83, p<.0001. 

logLik(Q3_model)*-2 # -2LL is equal to 46710.98, df=4
## 'log Lik.' 46710.98 (df=6)
# Calculate the ICC for Q3 model
ICC(Q3_model) # ICC is 19.02%
## [1] "ICC= 0.927070495237872" "ICC= 0.190222692838689"
# Compare the Q3_model with unconditional model and Q2 Model
anova(Q3_model,Q2_model,UnconditionalModel)
##                    Model df      AIC      BIC    logLik   Test  L.Ratio
## Q3_model               1  6 46722.98 46764.26 -23355.49                
## Q2_model               2  4 46967.11 46994.63 -23479.55 1 vs 2 248.1283
## UnconditionalModel     3  3 47121.81 47142.45 -23557.90 2 vs 3 156.7017
##                    p-value
## Q3_model                  
## Q2_model            <.0001
## UnconditionalModel  <.0001

Answering Q3:

Section 5: Q4-How do public and Catholic schools compare in terms of mean math achievement and in terms of the strength of the SES-math achievement relationship, after we control for MEAN SES?

# To address this question, we need to build a random intercept and slope model with two extra interaction terms. 
# The difference between Q4 model and Q3 model is, we use group mean centered SES(CSES) variable instead of "SES" for easier interpretation.
Q4_model <- lme(mathach~meanses + sector + Group_CSES + meanses:Group_CSES + sector:Group_CSES, random=~cses|school,data=newhsb,method="ML")
summary(Q4_model)
## Linear mixed-effects model fit by maximum likelihood
##  Data: newhsb 
##        AIC      BIC    logLik
##   46516.43 46585.23 -23248.21
## 
## Random effects:
##  Formula: ~cses | school
##  Structure: General positive-definite, Log-Cholesky parametrization
##             StdDev    Corr  
## (Intercept) 1.5227660 (Intr)
## cses        0.2552747 0.485 
## Residual    6.0598017       
## 
## Fixed effects: mathach ~ meanses + sector + Group_CSES + meanses:Group_CSES +      sector:Group_CSES 
##                        Value Std.Error   DF  t-value p-value
## (Intercept)        12.096013 0.1969220 7022 61.42541   0e+00
## meanses             5.331710 0.3656888  157 14.57991   0e+00
## sector              1.226729 0.3033774  157  4.04357   1e-04
## Group_CSES          2.939386 0.1535554 7022 19.14218   0e+00
## meanses:Group_CSES  1.042425 0.2961537 7022  3.51988   4e-04
## sector:Group_CSES  -1.643907 0.2374524 7022 -6.92310   0e+00
##  Correlation: 
##                    (Intr) meanss sector G_CSES m:G_CS
## meanses             0.245                            
## sector             -0.697 -0.356                     
## Group_CSES          0.075  0.019 -0.053              
## meanses:Group_CSES  0.018  0.074 -0.026  0.283       
## sector:Group_CSES  -0.052 -0.027  0.077 -0.694 -0.351
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -3.16092105 -0.72444280  0.01678312  0.75488891  2.95805927 
## 
## Number of Observations: 7185
## Number of Groups: 160
# Estimate of γ_00 = 12.11,represents the average true intercept for the public school(SECTOR = 0) has MEANSES = 0; It is significantly different from zero..
# Estimate of γ_01 = 5.33, represents the group performance difference in the intercepts for public school (or MEAN SES effects on school means after controlling for sector effect). It is significantly different from zero; We estimate γ_01 to study whether high-SES schools differ from low-SES schools in mean math achievement after controlling for SECTOR.
# Estimate of γ_02 = 1.23, represents the group performance difference in the intercepts when MEAN SES = 0. It is significantly different from zero; We estimate γ_02 to learn whether Catholic schools differ from public schools in terms of the mean achievement once MEAN SES is controlled. 
# γ_00 + γ_02 = 12.11+1.21 represents the average true intercept for the private school(SECTOR=1) has MEAN SES = 0; It is significantly different from zero.
VarCorr(Q4_model) 
## school = pdLogChol(cses) 
##             Variance    StdDev    Corr  
## (Intercept)  2.31881618 1.5227660 (Intr)
## cses         0.06516518 0.2552747 0.485 
## Residual    36.72119652 6.0598017
# Estimate of τ_00 is 2.31. This is the population variance among the Level-2 unit means. The significant variation in the intercepts remains unexplained even after controlling for SECTOR and MEAN SES.
# Estimate of τ_11 is 0.07. This is the population variance among the slope. No significant variation in the slopes remains unexplained even after controlling for SECTOR and MEAN SES.
# Estimate of σ^2 is 36.72. This is the variance among the Level-1.
# In R, we can simply use the anova function to our model. It provides us the F-value and P value of the intercept which is the same with the wald Z test. 
anova(Q4_model)
##                    numDF denDF  F-value p-value
## (Intercept)            1  7022 7825.298  <.0001
## meanses                1   157  295.507  <.0001
## sector                 1   157   20.810  <.0001
## Group_CSES             1  7022  395.381  <.0001
## meanses:Group_CSES     1  7022    1.353  0.2448
## sector:Group_CSES      1  7022   47.929  <.0001
# From the result, we could tell the esitmated variance among the mean is τ_00 is 8.62, F=2670.83, p<.0001. 
logLik(Q4_model)*-2 # -2LL is equal to 46496.43, df=10
## 'log Lik.' 46496.43 (df=10)
# Calculate the ICC for Q4 model
ICC(Q4_model) # ICC is 5.94%, this tells us that about 6% of the variability in math achievement difference occurs between schools. It is okay to use a multilevel model.
## [1] "ICC= 0.972665398692547"  "ICC= 0.0593958869280798"
# Compare the Q4_model with Q3_model, unconditional model and Q2 Model
anova(Q4_model,Q3_model,Q2_model,UnconditionalModel)
##                    Model df      AIC      BIC    logLik   Test  L.Ratio
## Q4_model               1 10 46516.43 46585.23 -23248.22                
## Q3_model               2  6 46722.98 46764.26 -23355.49 1 vs 2 214.5510
## Q2_model               3  4 46967.11 46994.63 -23479.55 2 vs 3 248.1283
## UnconditionalModel     4  3 47121.81 47142.45 -23557.90 3 vs 4 156.7017
##                    p-value
## Q4_model                  
## Q3_model            <.0001
## Q2_model            <.0001
## UnconditionalModel  <.0001

Answering Q4: