Load packages

# Add additional packages you need
library(tidyverse)
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'pillar'
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'tibble'
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'hms'
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lme4)
## Warning: package 'lme4' was built under R version 4.1.2
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

Import Data

# Read in the data

week3data <- read.table("mlbook2_bb.dat", header=TRUE)

week3data <- as_tibble(week3data) %>%
  mutate(schoolid = as.factor(week3data$schoolnr))
describe(as.numeric(table(week3data$schoolid)))
##    vars   n  mean   sd median trimmed mad min max range skew kurtosis   se
## X1    1 211 17.83 7.14     18   17.69 8.9   4  34    30  0.1    -0.85 0.49

Descriptive Stats

Provide a short description of the sample (e.g., units and sample sizes at each level, mean number of students per school, etc), as well as the appropriate descriptive statistics for the three variables schoolnr, langPOST, and aritPOST. Indicate at which level each of the three variables is defined.

# these will tell you the structure

str(week3data) 
## tibble [3,762 × 13] (S3: tbl_df/tbl/data.frame)
##  $ schoolnr   : num [1:3762] 1 1 1 1 1 1 1 1 1 1 ...
##  $ pupilNR_new: num [1:3762] 3 4 5 6 7 8 9 10 11 12 ...
##  $ langPOST   : num [1:3762] 46 45 33 46 20 30 30 57 36 36 ...
##  $ aritPOST   : num [1:3762] 24 19 24 26 9 13 13 30 23 22 ...
##  $ ses        : num [1:3762] -4.73 -17.73 -12.73 -4.73 -17.73 ...
##  $ IQ_verb    : num [1:3762] 3.13 2.63 -2.37 -0.87 -3.87 -2.37 -2.37 1.13 -2.37 -0.87 ...
##  $ IQ_perf    : num [1:3762] 1.25 -1.08 -0.08 -1.08 -4.41 -2.08 -0.75 3.25 -2.41 3.92 ...
##  $ Minority   : num [1:3762] 0 1 0 0 0 1 1 0 1 1 ...
##  $ denomina   : num [1:3762] 1 1 1 1 1 1 1 1 1 1 ...
##  $ sch_ses    : num [1:3762] -14 -14 -14 -14 -14 ...
##  $ sch_iqv    : num [1:3762] -1.4 -1.4 -1.4 -1.4 -1.4 ...
##  $ sch_min    : num [1:3762] 0.63 0.63 0.63 0.63 0.63 0.63 0.63 0.63 0.63 0.63 ...
##  $ schoolid   : Factor w/ 211 levels "1","2","3","9",..: 1 1 1 1 1 1 1 1 1 1 ...
glimpse(week3data) 
## Rows: 3,762
## Columns: 13
## $ schoolnr    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ pupilNR_new <dbl> 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1…
## $ langPOST    <dbl> 46, 45, 33, 46, 20, 30, 30, 57, 36, 36, 29, 40, 41, 47, 33…
## $ aritPOST    <dbl> 24, 19, 24, 26, 9, 13, 13, 30, 23, 22, 19, 23, 18, 22, 15,…
## $ ses         <dbl> -4.73, -17.73, -12.73, -4.73, -17.73, -17.73, -4.73, -17.7…
## $ IQ_verb     <dbl> 3.13, 2.63, -2.37, -0.87, -3.87, -2.37, -2.37, 1.13, -2.37…
## $ IQ_perf     <dbl> 1.25, -1.08, -0.08, -1.08, -4.41, -2.08, -0.75, 3.25, -2.4…
## $ Minority    <dbl> 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1…
## $ denomina    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ sch_ses     <dbl> -14.035, -14.035, -14.035, -14.035, -14.035, -14.035, -14.…
## $ sch_iqv     <dbl> -1.4039, -1.4039, -1.4039, -1.4039, -1.4039, -1.4039, -1.4…
## $ sch_min     <dbl> 0.630, 0.630, 0.630, 0.630, 0.630, 0.630, 0.630, 0.630, 0.…
## $ schoolid    <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
# descriptive stats summaries

## school id

describe(select(week3data, -schoolnr))
##             vars    n    mean      sd  median trimmed     mad    min     max
## pupilNR_new    1 3762 2176.25 1198.91 2211.50 2184.72 1540.42   3.00 4214.00
## langPOST       2 3762   41.42    8.89   42.00   42.00    8.90   8.00   58.00
## aritPOST       3 3762   19.82    6.61   21.00   20.19    7.41   2.00   30.00
## ses            4 3762    0.05   10.90   -1.73   -0.49   11.86 -17.73   22.27
## IQ_verb        5 3762    0.04    2.04    0.13    0.07    1.48  -7.87    6.63
## IQ_perf        6 3762    0.04    2.19   -0.08    0.03    2.46  -6.41    6.59
## Minority       7 3762    0.05    0.21    0.00    0.00    0.00   0.00    1.00
## denomina       8 3762    2.25    1.11    2.00    2.11    1.48   1.00    5.00
## sch_ses        9 3762    0.02    6.16    0.13    0.04    6.65 -17.73   15.54
## sch_iqv       10 3762    0.02    0.82    0.09    0.04    0.64  -4.81    2.48
## sch_min       11 3762    0.05    0.12    0.00    0.02    0.00   0.00    0.92
## schoolid*     12 3762  105.53   58.22  108.00  105.48   72.65   1.00  211.00
##               range  skew kurtosis    se
## pupilNR_new 4211.00 -0.05    -1.19 19.55
## langPOST      50.00 -0.55    -0.17  0.14
## aritPOST      28.00 -0.41    -0.79  0.11
## ses           40.00  0.40    -0.82  0.18
## IQ_verb       14.50 -0.19     0.67  0.03
## IQ_perf       13.00  0.04    -0.26  0.04
## Minority       1.00  4.28    16.29  0.00
## denomina       4.00  0.83     0.22  0.02
## sch_ses       33.27 -0.06    -0.39  0.10
## sch_iqv        7.29 -0.55     2.29  0.01
## sch_min        0.92  4.42    23.61  0.00
## schoolid*    210.00 -0.02    -1.11  0.95
n_groups = length(unique(week3data$schoolid))

## langPOST

describe(week3data$langPOST)
##    vars    n  mean   sd median trimmed mad min max range  skew kurtosis   se
## X1    1 3762 41.42 8.89     42      42 8.9   8  58    50 -0.55    -0.17 0.14
hist(week3data$langPOST)

## aritPOST

describe(week3data$aritPOST)
##    vars    n  mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 3762 19.82 6.61     21   20.19 7.41   2  30    28 -0.41    -0.79 0.11
hist(week3data$aritPOST)

Random Intercept Model

Fit a random intercept model (i.e., langPOST as the outcome variable and no predictor in the model)

# random effects = what's allowed to vary

week3data_rand_mod <- lmer(langPOST ~ 1 + (1 | schoolid), data=week3data) 
summary(week3data_rand_mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: langPOST ~ 1 + (1 | schoolid)
##    Data: week3data
## 
## REML criterion at convergence: 26619.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.1873 -0.6449  0.0896  0.7231  2.5297 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  schoolid (Intercept) 18.24    4.271   
##  Residual             62.78    7.923   
## Number of obs: 3762, groups:  schoolid, 211
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  41.0011     0.3257   125.9

Intraclass Correlation

Calculate the intraclass correlation

(variance_components <-as.data.frame(VarCorr(week3data_rand_mod)))
##        grp        var1 var2     vcov    sdcor
## 1 schoolid (Intercept) <NA> 18.24153 4.271011
## 2 Residual        <NA> <NA> 62.77833 7.923278
between_var <- variance_components$vcov[1]
within_var <- variance_components$vcov[2]
(icc_cor <- between_var/(between_var + within_var))
## [1] 0.2251489
lmer_icc <- function(obj) {
  v <- as.data.frame(VarCorr(obj))
icc <- v$vcov[1]/v$vcov[1]+v$vcov[2]
icc}

lmer_icc(week3data_rand_mod)
## [1] 63.77833

Plot a Few Schools

Adapt the code from lecture to plot the distribution of langPOST across 5–10 schools. Write a sentence to describe what the graph shows. (Hint: you should use the schoolnr variable for plotting, but be sure that R thinks of it as a nominal variable [i.e. factor] and not as a number).

random_ids <- sample(unique(week3data$schoolid), size = 10) 
(p_subset <- week3data %>%
  filter(schoolid %in% random_ids) %>% # select only 10 schools
ggplot(aes(x = schoolid, y = langPOST)) + geom_jitter(height = 0, width = 0.1, alpha = 0.3)) + # Add school means
stat_summary(
fun = "mean", geom = "point", col = "red",
shape = 17, # use triangles
size = 4
) # make them larger

Answer: The graph above shows us the distributions of scores on langPOST at each school. The red triangles indicate the mean langPOST score for each school.

Extract Information from the Model

Based on the model, which school has the highest langPOST score? Which has the lowest?

intercepts <- 
data.frame(ranef(week3data_rand_mod))
headTail(arrange(intercepts, condval))
##       grpvar        term  grp condval condsd
## 1   schoolid (Intercept)   47  -12.24   2.34
## 2   schoolid (Intercept)    2  -11.59   2.45
## 3   schoolid (Intercept)  103  -11.29    2.9
## 4   schoolid (Intercept)  256   -8.71   2.16
## ...     <NA>        <NA> <NA>     ...    ...
## 208 schoolid (Intercept)  228    6.56   1.57
## 209 schoolid (Intercept)  147    7.03   1.57
## 210 schoolid (Intercept)  117    8.28   1.44
## 211 schoolid (Intercept)  153    8.72   1.57

Answer: The school with the highest langPOST score is group 153, and the school with the lowest is group 47.

MLM Necessary?

Is a multilevel model needed with langPOST as the outcome? To do this, consider the intraclass correlation coefficient along with a plain ANOVA that checks for a significant effect of schoolnr variable (but make sure that R thinks of it as a factor). (We’ll talk about other criteria over the course of the semester.)

week3_aov <- aov(langPOST ~ schoolid, week3data) 
summary(week3_aov)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## schoolid     210  74699   355.7   5.682 <2e-16 ***
## Residuals   3551 222318    62.6                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Answer: Yes, because our data is multi-leveled which gives us more information regarding each level. The issue with anova in this case, is that we can only understand if there is a significant difference among all of the schools, but we don’t have a deeper understanding of how the levels relate to one another.