# Add additional packages you need
library(tidyverse)
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'pillar'
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'tibble'
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'hms'
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lme4)
## Warning: package 'lme4' was built under R version 4.1.2
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Read in the data
week3data <- read.table("mlbook2_bb.dat", header=TRUE)
week3data <- as_tibble(week3data) %>%
mutate(schoolid = as.factor(week3data$schoolnr))
describe(as.numeric(table(week3data$schoolid)))
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 211 17.83 7.14 18 17.69 8.9 4 34 30 0.1 -0.85 0.49
Provide a short description of the sample (e.g., units and sample sizes at each level, mean number of students per school, etc), as well as the appropriate descriptive statistics for the three variables schoolnr, langPOST, and aritPOST. Indicate at which level each of the three variables is defined.
# these will tell you the structure
str(week3data)
## tibble [3,762 × 13] (S3: tbl_df/tbl/data.frame)
## $ schoolnr : num [1:3762] 1 1 1 1 1 1 1 1 1 1 ...
## $ pupilNR_new: num [1:3762] 3 4 5 6 7 8 9 10 11 12 ...
## $ langPOST : num [1:3762] 46 45 33 46 20 30 30 57 36 36 ...
## $ aritPOST : num [1:3762] 24 19 24 26 9 13 13 30 23 22 ...
## $ ses : num [1:3762] -4.73 -17.73 -12.73 -4.73 -17.73 ...
## $ IQ_verb : num [1:3762] 3.13 2.63 -2.37 -0.87 -3.87 -2.37 -2.37 1.13 -2.37 -0.87 ...
## $ IQ_perf : num [1:3762] 1.25 -1.08 -0.08 -1.08 -4.41 -2.08 -0.75 3.25 -2.41 3.92 ...
## $ Minority : num [1:3762] 0 1 0 0 0 1 1 0 1 1 ...
## $ denomina : num [1:3762] 1 1 1 1 1 1 1 1 1 1 ...
## $ sch_ses : num [1:3762] -14 -14 -14 -14 -14 ...
## $ sch_iqv : num [1:3762] -1.4 -1.4 -1.4 -1.4 -1.4 ...
## $ sch_min : num [1:3762] 0.63 0.63 0.63 0.63 0.63 0.63 0.63 0.63 0.63 0.63 ...
## $ schoolid : Factor w/ 211 levels "1","2","3","9",..: 1 1 1 1 1 1 1 1 1 1 ...
glimpse(week3data)
## Rows: 3,762
## Columns: 13
## $ schoolnr <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ pupilNR_new <dbl> 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1…
## $ langPOST <dbl> 46, 45, 33, 46, 20, 30, 30, 57, 36, 36, 29, 40, 41, 47, 33…
## $ aritPOST <dbl> 24, 19, 24, 26, 9, 13, 13, 30, 23, 22, 19, 23, 18, 22, 15,…
## $ ses <dbl> -4.73, -17.73, -12.73, -4.73, -17.73, -17.73, -4.73, -17.7…
## $ IQ_verb <dbl> 3.13, 2.63, -2.37, -0.87, -3.87, -2.37, -2.37, 1.13, -2.37…
## $ IQ_perf <dbl> 1.25, -1.08, -0.08, -1.08, -4.41, -2.08, -0.75, 3.25, -2.4…
## $ Minority <dbl> 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1…
## $ denomina <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ sch_ses <dbl> -14.035, -14.035, -14.035, -14.035, -14.035, -14.035, -14.…
## $ sch_iqv <dbl> -1.4039, -1.4039, -1.4039, -1.4039, -1.4039, -1.4039, -1.4…
## $ sch_min <dbl> 0.630, 0.630, 0.630, 0.630, 0.630, 0.630, 0.630, 0.630, 0.…
## $ schoolid <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
# descriptive stats summaries
## school id
describe(select(week3data, -schoolnr))
## vars n mean sd median trimmed mad min max
## pupilNR_new 1 3762 2176.25 1198.91 2211.50 2184.72 1540.42 3.00 4214.00
## langPOST 2 3762 41.42 8.89 42.00 42.00 8.90 8.00 58.00
## aritPOST 3 3762 19.82 6.61 21.00 20.19 7.41 2.00 30.00
## ses 4 3762 0.05 10.90 -1.73 -0.49 11.86 -17.73 22.27
## IQ_verb 5 3762 0.04 2.04 0.13 0.07 1.48 -7.87 6.63
## IQ_perf 6 3762 0.04 2.19 -0.08 0.03 2.46 -6.41 6.59
## Minority 7 3762 0.05 0.21 0.00 0.00 0.00 0.00 1.00
## denomina 8 3762 2.25 1.11 2.00 2.11 1.48 1.00 5.00
## sch_ses 9 3762 0.02 6.16 0.13 0.04 6.65 -17.73 15.54
## sch_iqv 10 3762 0.02 0.82 0.09 0.04 0.64 -4.81 2.48
## sch_min 11 3762 0.05 0.12 0.00 0.02 0.00 0.00 0.92
## schoolid* 12 3762 105.53 58.22 108.00 105.48 72.65 1.00 211.00
## range skew kurtosis se
## pupilNR_new 4211.00 -0.05 -1.19 19.55
## langPOST 50.00 -0.55 -0.17 0.14
## aritPOST 28.00 -0.41 -0.79 0.11
## ses 40.00 0.40 -0.82 0.18
## IQ_verb 14.50 -0.19 0.67 0.03
## IQ_perf 13.00 0.04 -0.26 0.04
## Minority 1.00 4.28 16.29 0.00
## denomina 4.00 0.83 0.22 0.02
## sch_ses 33.27 -0.06 -0.39 0.10
## sch_iqv 7.29 -0.55 2.29 0.01
## sch_min 0.92 4.42 23.61 0.00
## schoolid* 210.00 -0.02 -1.11 0.95
n_groups = length(unique(week3data$schoolid))
## langPOST
describe(week3data$langPOST)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 3762 41.42 8.89 42 42 8.9 8 58 50 -0.55 -0.17 0.14
hist(week3data$langPOST)
## aritPOST
describe(week3data$aritPOST)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 3762 19.82 6.61 21 20.19 7.41 2 30 28 -0.41 -0.79 0.11
hist(week3data$aritPOST)
Fit a random intercept model (i.e., langPOST as the outcome variable and no predictor in the model)
# random effects = what's allowed to vary
week3data_rand_mod <- lmer(langPOST ~ 1 + (1 | schoolid), data=week3data)
summary(week3data_rand_mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: langPOST ~ 1 + (1 | schoolid)
## Data: week3data
##
## REML criterion at convergence: 26619.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.1873 -0.6449 0.0896 0.7231 2.5297
##
## Random effects:
## Groups Name Variance Std.Dev.
## schoolid (Intercept) 18.24 4.271
## Residual 62.78 7.923
## Number of obs: 3762, groups: schoolid, 211
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 41.0011 0.3257 125.9
Calculate the intraclass correlation
(variance_components <-as.data.frame(VarCorr(week3data_rand_mod)))
## grp var1 var2 vcov sdcor
## 1 schoolid (Intercept) <NA> 18.24153 4.271011
## 2 Residual <NA> <NA> 62.77833 7.923278
between_var <- variance_components$vcov[1]
within_var <- variance_components$vcov[2]
(icc_cor <- between_var/(between_var + within_var))
## [1] 0.2251489
lmer_icc <- function(obj) {
v <- as.data.frame(VarCorr(obj))
icc <- v$vcov[1]/v$vcov[1]+v$vcov[2]
icc}
lmer_icc(week3data_rand_mod)
## [1] 63.77833
Adapt the code from lecture to plot the distribution of langPOST across 5–10 schools. Write a sentence to describe what the graph shows. (Hint: you should use the schoolnr variable for plotting, but be sure that R thinks of it as a nominal variable [i.e. factor] and not as a number).
random_ids <- sample(unique(week3data$schoolid), size = 10)
(p_subset <- week3data %>%
filter(schoolid %in% random_ids) %>% # select only 10 schools
ggplot(aes(x = schoolid, y = langPOST)) + geom_jitter(height = 0, width = 0.1, alpha = 0.3)) + # Add school means
stat_summary(
fun = "mean", geom = "point", col = "red",
shape = 17, # use triangles
size = 4
) # make them larger
Answer: The graph above shows us the distributions of scores on langPOST at each school. The red triangles indicate the mean langPOST score for each school.
Based on the model, which school has the highest langPOST score? Which has the lowest?
intercepts <-
data.frame(ranef(week3data_rand_mod))
headTail(arrange(intercepts, condval))
## grpvar term grp condval condsd
## 1 schoolid (Intercept) 47 -12.24 2.34
## 2 schoolid (Intercept) 2 -11.59 2.45
## 3 schoolid (Intercept) 103 -11.29 2.9
## 4 schoolid (Intercept) 256 -8.71 2.16
## ... <NA> <NA> <NA> ... ...
## 208 schoolid (Intercept) 228 6.56 1.57
## 209 schoolid (Intercept) 147 7.03 1.57
## 210 schoolid (Intercept) 117 8.28 1.44
## 211 schoolid (Intercept) 153 8.72 1.57
Answer: The school with the highest langPOST score is group 153, and the school with the lowest is group 47.
Is a multilevel model needed with langPOST as the outcome? To do this, consider the intraclass correlation coefficient along with a plain ANOVA that checks for a significant effect of schoolnr variable (but make sure that R thinks of it as a factor). (We’ll talk about other criteria over the course of the semester.)
week3_aov <- aov(langPOST ~ schoolid, week3data)
summary(week3_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## schoolid 210 74699 355.7 5.682 <2e-16 ***
## Residuals 3551 222318 62.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Answer: Yes, because our data is multi-leveled which gives us more information regarding each level. The issue with anova in this case, is that we can only understand if there is a significant difference among all of the schools, but we don’t have a deeper understanding of how the levels relate to one another.