##This week, we will use a wide version of the Project STAR data (STAR_wide.dta). This is a sub-sample of more than 1,300 students who participated in the Tennessee Class Size Experiment and were followed from 3rd to 8th grade. This sample does not include any grade repeaters, so each student should be observed 6 times.
suppressPackageStartupMessages(library(tidyverse))
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##Starting with the wide version of the STAR data (with one row per person), reshape the dataset into long format, where one row represents each observation (multiple rows per person). Use pivot_longer function in the dplyr function in R, creating a new variable called grade or year (your choice). Glimpse in R below once you have reshaped the data.
star_wide <- haven::read_dta("star_wide.dta")
glimpse(star_wide)
## Rows: 1,471
## Columns: 11
## $ stdntid <dbl> 10023, 10042, 10056, 10082, 10097, 10103, 10111, 10113, 10…
## $ race <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, …
## $ gender <dbl+lbl> 1, 2, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, …
## $ ac_mot <dbl> 50, 50, 52, 49, 51, 50, 51, 57, 40, 48, 53, 28, 53, 50, 56…
## $ ever_lunch <dbl+lbl> 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, …
## $ science3 <dbl> 632, 618, 618, 649, 577, 655, 662, 627, 573, 627, 581, 589…
## $ science4 <dbl> 756, 710, 719, 722, 688, 719, 767, 709, 668, 694, 624, 698…
## $ science5 <dbl> 729, 738, 760, 729, 681, 738, 749, 754, 586, 731, 642, 693…
## $ science6 <dbl> 762, 740, 776, 753, 720, 710, 782, 755, 687, 750, 671, 711…
## $ science7 <dbl> 778, 734, 782, 809, 698, 766, 763, 729, 634, 744, 719, 669…
## $ science8 <dbl> 783, 771, 772, 806, 745, 774, 804, 793, 765, 769, 674, 773…
##To remember we have 11 columns in wide and 1,471 rows which is about the number of students. Now need to make the dataset long which will seperate the observations out and have 6 entries of year per student. Let the clustering begin.
pivot_longer to reshape from wide to long….star.long <- star_wide %>%
pivot_longer(.,
cols = starts_with("science"),
names_to = "year",
values_to = "science",
names_prefix = "science_grade",
)
glimpse(star.long)
## Rows: 8,826
## Columns: 7
## $ stdntid <dbl> 10023, 10023, 10023, 10023, 10023, 10023, 10042, 10042, 10…
## $ race <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gender <dbl+lbl> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ ac_mot <dbl> 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 52, 52, 52…
## $ ever_lunch <dbl+lbl> 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, …
## $ year <chr> "science3", "science4", "science5", "science6", "science7"…
## $ science <dbl> 632, 756, 729, 762, 778, 783, 618, 710, 738, 740, 734, 771…
#need to fix grade to a year not character
star.long.clean <- star.long %>%
mutate(.,
year = as.numeric(year)) %>%
na.omit()
## Warning: Problem with `mutate()` input `year`.
## ℹ NAs introduced by coercion
## ℹ Input `year` is `as.numeric(year)`.
## Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
glimpse(star.long.clean)
## Rows: 0
## Columns: 7
## $ stdntid <dbl>
## $ race <dbl+lbl>
## $ gender <dbl+lbl>
## $ ac_mot <dbl>
## $ ever_lunch <dbl+lbl>
## $ year <dbl>
## $ science <dbl>
##going back to check my code to make it long version. realized I didn’t need to make a prefix variable since I am not deleting a connection like our example.
STAR.long <- star_wide %>%
pivot_longer(.,
cols = starts_with("science"),
names_to = "year",
values_to = "science",
names_prefix = "science"
)
glimpse(STAR.long)
## Rows: 8,826
## Columns: 7
## $ stdntid <dbl> 10023, 10023, 10023, 10023, 10023, 10023, 10042, 10042, 10…
## $ race <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gender <dbl+lbl> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ ac_mot <dbl> 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 52, 52, 52…
## $ ever_lunch <dbl+lbl> 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, …
## $ year <chr> "3", "4", "5", "6", "7", "8", "3", "4", "5", "6", "7", "8"…
## $ science <dbl> 632, 756, 729, 762, 778, 783, 618, 710, 738, 740, 734, 771…
#need to fix grade to a year not character
star.long.clean <- STAR.long %>%
mutate(.,
year = as.numeric(year)) %>%
na.omit()
glimpse(star.long.clean)
## Rows: 8,826
## Columns: 7
## $ stdntid <dbl> 10023, 10023, 10023, 10023, 10023, 10023, 10042, 10042, 10…
## $ race <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gender <dbl+lbl> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ ac_mot <dbl> 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 52, 52, 52…
## $ ever_lunch <dbl+lbl> 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, …
## $ year <dbl> 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 3, 4…
## $ science <dbl> 632, 756, 729, 762, 778, 783, 618, 710, 738, 740, 734, 771…
##Create a new variable, grade_0, which goes from 0-5 instead of 3-8 like our current grade variable & creating squared and cubic terms for time…
star.long.clean <- star.long.clean %>%
mutate(.,
year0 = year - 3,
year_sq = year0^2,
year_cube = year0^3)
glimpse(star.long.clean)
## Rows: 8,826
## Columns: 10
## $ stdntid <dbl> 10023, 10023, 10023, 10023, 10023, 10023, 10042, 10042, 10…
## $ race <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gender <dbl+lbl> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ ac_mot <dbl> 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 52, 52, 52…
## $ ever_lunch <dbl+lbl> 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, …
## $ year <dbl> 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 3, 4…
## $ science <dbl> 632, 756, 729, 762, 778, 783, 618, 710, 738, 740, 734, 771…
## $ year0 <dbl> 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1…
## $ year_sq <dbl> 0, 1, 4, 9, 16, 25, 0, 1, 4, 9, 16, 25, 0, 1, 4, 9, 16, 25…
## $ year_cube <dbl> 0, 1, 8, 27, 64, 125, 0, 1, 8, 27, 64, 125, 0, 1, 8, 27, 6…
##Run and interpret three models, all with science scores (science) as the DV, and all with scores clustered within students (stdntid): 1) a null growth model, 2) a growth model with linear and quadratic growth, and 3) a growth model with linear, quadratic, and cubic growth. Use AIC and BIC to choose which model has the best fit. Does this dataset show evidence of nonlinear growth in science scores?
model.null.growth <- lmer(science ~ year0 + (1|stdntid), data = star.long.clean)
summary(model.null.growth)
## Linear mixed model fit by REML ['lmerMod']
## Formula: science ~ year0 + (1 | stdntid)
## Data: star.long.clean
##
## REML criterion at convergence: 90950.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.0250 -0.6361 0.0105 0.6366 4.0222
##
## Random effects:
## Groups Name Variance Std.Dev.
## stdntid (Intercept) 1073 32.76
## Residual 1300 36.05
## Number of obs: 8826, groups: stdntid, 1471
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 669.3920 1.0919 613
## year0 24.2729 0.2247 108
##
## Correlation of Fixed Effects:
## (Intr)
## year0 -0.514
#ICC for GROWTH null moel
null.groth.icc <- 1073/(1073 + 1300)
null.groth.icc
## [1] 0.4521702
##A growth model with linear and quadratic growth
model.square <- lmer(science ~ year0 + year_sq + (1|stdntid), REML = FALSE, data = star.long.clean)
summary(model.square)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: science ~ year0 + year_sq + (1 | stdntid)
## Data: star.long.clean
##
## AIC BIC logLik deviance df.resid
## 88570.1 88605.5 -44280.0 88560.1 8821
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.6568 -0.5429 -0.0099 0.5539 4.5442
##
## Random effects:
## Groups Name Variance Std.Dev.
## stdntid (Intercept) 1132.3 33.65
## Residual 938.9 30.64
## Number of obs: 8826, groups: stdntid, 1471
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 646.2231 1.1376 568.08
## year0 59.0262 0.6811 86.66
## year_sq -6.9507 0.1308 -53.16
##
## Correlation of Fixed Effects:
## (Intr) year0
## year0 -0.485
## year_sq 0.383 -0.960
model.both <- lmer(science ~ year0 + year_sq + year_cube + (year0|stdntid), REML = FALSE, data = star.long.clean)
## boundary (singular) fit: see ?isSingular
summary(model.both)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: science ~ year0 + year_sq + year_cube + (year0 | stdntid)
## Data: star.long.clean
##
## AIC BIC logLik deviance df.resid
## 87223.8 87280.5 -43603.9 87207.8 8818
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.2775 -0.4967 0.0019 0.5228 4.4707
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## stdntid (Intercept) 1100.2324 33.1698
## year0 0.1208 0.3475 1.00
## Residual 781.1644 27.9493
## Number of obs: 8826, groups: stdntid, 1471
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 635.77161 1.12157 566.86
## year0 106.75456 1.38715 76.96
## year_sq -33.07934 0.68935 -47.99
## year_cube 3.48382 0.09053 38.48
##
## Correlation of Fixed Effects:
## (Intr) year0 yer_sq
## year0 -0.413
## year_sq 0.300 -0.955
## year_cube -0.242 0.894 -0.985
## convergence code: 0
## boundary (singular) fit: see ?isSingular
##now using ANOVA to see if we shoul keep cubic term time (year)
anova(model.square, model.both)
## Data: star.long.clean
## Models:
## model.square: science ~ year0 + year_sq + (1 | stdntid)
## model.both: science ~ year0 + year_sq + year_cube + (year0 | stdntid)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## model.square 5 88570 88605 -44280 88560
## model.both 8 87224 87280 -43604 87208 1352.3 3 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##Using your preferred model from #3 above, test a heterogeneous level-1 variance structure using the lme function in the nlme package for R users. R users - do not include a random slope for grade_0 at the student level (this was causing convergence issues). Does the use of this structure improve the model fit?
library(nlme)
##
## Attaching package: 'nlme'
## The following object is masked from 'package:lme4':
##
## lmList
## The following object is masked from 'package:dplyr':
##
## collapse
##now homogenous model w/out random slope
model.null.growth.homog <- lme(science ~ year0,
random = ~ 1|stdntid,
data = star.long.clean,
method = "ML")
summary(model.null.growth.homog)
## Linear mixed-effects model fit by maximum likelihood
## Data: star.long.clean
## AIC BIC logLik
## 90959.37 90987.71 -45475.68
##
## Random effects:
## Formula: ~1 | stdntid
## (Intercept) Residual
## StdDev: 32.74456 36.05019
##
## Fixed effects: science ~ year0
## Value Std.Error DF t-value p-value
## (Intercept) 669.3920 1.0917635 7354 613.1291 0
## year0 24.2729 0.2247147 7354 108.0163 0
## Correlation:
## (Intr)
## year0 -0.515
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -4.02535175 -0.63612595 0.01049814 0.63666093 4.02256978
##
## Number of Observations: 8826
## Number of Groups: 1471
##now heterogenous model w/out random slope
model.null.growth.het <- lme(science ~ year0,
random = ~ 1|stdntid,
data = star.long.clean,
method = "ML",
weights = varIdent(form = ~ 1 | year0)
)
summary(model.null.growth.het)
## Linear mixed-effects model fit by maximum likelihood
## Data: star.long.clean
## AIC BIC logLik
## 89887.23 89951 -44934.62
##
## Random effects:
## Formula: ~1 | stdntid
## (Intercept) Residual
## StdDev: 34.32642 67.40924
##
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | year0
## Parameter estimates:
## 0 1 2 3 4 5
## 1.0000000 0.4987552 0.4549923 0.4852018 0.3421510 0.3497103
## Fixed effects: science ~ year0
## Value Std.Error DF t-value p-value
## (Intercept) 695.2179 1.179164 7354 589.5853 0
## year0 16.9724 0.212356 7354 79.9241 0
## Correlation:
## (Intr)
## year0 -0.593
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -4.91503937 -0.71613675 -0.05974243 0.51662541 4.30180514
##
## Number of Observations: 8826
## Number of Groups: 1471