## This week, we will use a wide version of the Project STAR data (STAR_wide.dta). This is a sub-sample of more than 1,300 students who participated in the Tennessee Class Size Experiment and were followed from 3rd to 8th grade. This sample does not include any grade repeaters, so each student should be observed 6 times.

Load in packages

suppressPackageStartupMessages(library(tidyverse))
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Starting with the wide version of the STAR data (one row per person), reshape the dataset into long format, where one row represents each observation (multiple rows per person). Use the pivot_longer() function from tidyr, creating a new variable called grade or year (your choice). Use glimpse() to inspect the data once you have reshaped it.

star_wide <- haven::read_dta("star_wide.dta")
glimpse(star_wide)
## Rows: 1,471
## Columns: 11
## $ stdntid    <dbl> 10023, 10042, 10056, 10082, 10097, 10103, 10111, 10113, 10…
## $ race       <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, …
## $ gender     <dbl+lbl> 1, 2, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, …
## $ ac_mot     <dbl> 50, 50, 52, 49, 51, 50, 51, 57, 40, 48, 53, 28, 53, 50, 56…
## $ ever_lunch <dbl+lbl> 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, …
## $ science3   <dbl> 632, 618, 618, 649, 577, 655, 662, 627, 573, 627, 581, 589…
## $ science4   <dbl> 756, 710, 719, 722, 688, 719, 767, 709, 668, 694, 624, 698…
## $ science5   <dbl> 729, 738, 760, 729, 681, 738, 749, 754, 586, 731, 642, 693…
## $ science6   <dbl> 762, 740, 776, 753, 720, 710, 782, 755, 687, 750, 671, 711…
## $ science7   <dbl> 778, 734, 782, 809, 698, 766, 763, 729, 634, 744, 719, 669…
## $ science8   <dbl> 783, 771, 772, 806, 745, 774, 804, 793, 765, 769, 674, 773…

## To recap, the wide data has 11 columns and 1,471 rows, one per student. Now we need to make the dataset long, which will separate the observations out into 6 rows (one per grade) for each student. Let the clustering begin.

Using pivot_longer() to reshape from wide to long…

star.long <- star_wide %>%
  pivot_longer(.,
              cols = starts_with("science"),
              names_to = "year",
              values_to = "science",
              names_prefix = "science_grade",
              )

glimpse(star.long)
## Rows: 8,826
## Columns: 7
## $ stdntid    <dbl> 10023, 10023, 10023, 10023, 10023, 10023, 10042, 10042, 10…
## $ race       <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gender     <dbl+lbl> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ ac_mot     <dbl> 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 52, 52, 52…
## $ ever_lunch <dbl+lbl> 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, …
## $ year       <chr> "science3", "science4", "science5", "science6", "science7"…
## $ science    <dbl> 632, 756, 729, 762, 778, 783, 618, 710, 738, 740, 734, 771…

# Need to convert year from character to numeric

star.long.clean <- star.long %>%
  mutate(.,
         year = as.numeric(year)) %>%
  na.omit()
## Warning: Problem with `mutate()` input `year`.
## ℹ NAs introduced by coercion
## ℹ Input `year` is `as.numeric(year)`.
## Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
glimpse(star.long.clean)
## Rows: 0
## Columns: 7
## $ stdntid    <dbl> 
## $ race       <dbl+lbl> 
## $ gender     <dbl+lbl> 
## $ ac_mot     <dbl> 
## $ ever_lunch <dbl+lbl> 
## $ year       <dbl> 
## $ science    <dbl>
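
The rows dropped to zero because names_prefix = "science_grade" did not match the column names, so year kept values like "science3"; as.numeric() then coerced every value to NA and na.omit() removed all rows. As an aside, a one-step alternative (a sketch using pivot_longer's names_transform argument; star.long.alt is just an illustrative name, not used below) would convert the grade while reshaping:

star.long.alt <- star_wide %>%
  pivot_longer(cols = starts_with("science"),
               names_to = "year",
               names_prefix = "science",
               names_transform = list(year = as.integer),
               values_to = "science")

Below I keep the two-step approach and simply correct the prefix.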

## Going back to check the reshaping code: the names_prefix argument should be "science" (the actual prefix on the column names), not "science_grade", so that year keeps only the grade number.

STAR.long <- star_wide %>%
  pivot_longer(.,
              cols = starts_with("science"),
              names_to = "year",
              values_to = "science",
              names_prefix = "science"
              )
glimpse(STAR.long)
## Rows: 8,826
## Columns: 7
## $ stdntid    <dbl> 10023, 10023, 10023, 10023, 10023, 10023, 10042, 10042, 10…
## $ race       <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gender     <dbl+lbl> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ ac_mot     <dbl> 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 52, 52, 52…
## $ ever_lunch <dbl+lbl> 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, …
## $ year       <chr> "3", "4", "5", "6", "7", "8", "3", "4", "5", "6", "7", "8"…
## $ science    <dbl> 632, 756, 729, 762, 778, 783, 618, 710, 738, 740, 734, 771…
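
Quick sanity check that the reshape behaved as expected: 1,471 students × 6 grades = 8,826 rows, so every stdntid should appear exactly six times. A minimal sketch to confirm (the count() calls are my own addition, not part of the assignment):

STAR.long %>%
  count(stdntid) %>%
  count(n)   # should return a single row: n = 6 for all 1,471 students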

# Need to convert year from character to numeric

star.long.clean <- STAR.long %>%
  mutate(.,
         year = as.numeric(year)) %>%
  na.omit()

glimpse(star.long.clean)
## Rows: 8,826
## Columns: 7
## $ stdntid    <dbl> 10023, 10023, 10023, 10023, 10023, 10023, 10042, 10042, 10…
## $ race       <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gender     <dbl+lbl> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ ac_mot     <dbl> 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 52, 52, 52…
## $ ever_lunch <dbl+lbl> 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, …
## $ year       <dbl> 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 3, 4…
## $ science    <dbl> 632, 756, 729, 762, 778, 783, 618, 710, 738, 740, 734, 771…

## Create a new variable, year0 (the assignment calls it grade_0), which goes from 0-5 instead of 3-8 like the current year variable, and create squared and cubic terms for time…

star.long.clean <- star.long.clean %>%
  mutate(.,
         year0 = year - 3,
         year_sq = year0^2,
         year_cube = year0^3)

glimpse(star.long.clean)
## Rows: 8,826
## Columns: 10
## $ stdntid    <dbl> 10023, 10023, 10023, 10023, 10023, 10023, 10042, 10042, 10…
## $ race       <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gender     <dbl+lbl> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ ac_mot     <dbl> 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 52, 52, 52…
## $ ever_lunch <dbl+lbl> 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, …
## $ year       <dbl> 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 3, 4…
## $ science    <dbl> 632, 756, 729, 762, 778, 783, 618, 710, 738, 740, 734, 771…
## $ year0      <dbl> 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1…
## $ year_sq    <dbl> 0, 1, 4, 9, 16, 25, 0, 1, 4, 9, 16, 25, 0, 1, 4, 9, 16, 25…
## $ year_cube  <dbl> 0, 1, 8, 27, 64, 125, 0, 1, 8, 27, 64, 125, 0, 1, 8, 27, 6…
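
One caveat: raw polynomial terms like year0, year_sq, and year_cube are strongly correlated with one another (visible later in the "Correlation of Fixed Effects" tables). An alternative I am not using below (a sketch; model.poly is my own name, and orthogonal polynomials change the coefficient scaling, so the estimates would no longer line up with the year0, year_sq, and year_cube variables created above) is to let R build orthogonal polynomial terms directly in the formula:

model.poly <- lmer(science ~ poly(year0, 3) + (1 | stdntid), REML = FALSE, data = star.long.clean)

The models below keep the raw terms so the coefficients map onto the variables just created.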

## Run and interpret three models, all with science scores (science) as the DV, and all with scores clustered within students (stdntid): 1) a null growth model, 2) a growth model with linear and quadratic growth, and 3) a growth model with linear, quadratic, and cubic growth. Use AIC and BIC to choose which model has the best fit. Does this dataset show evidence of nonlinear growth in science scores?

Null growth model for science scores (with year0 included)

model.null.growth <- lmer(science ~ year0 + (1|stdntid), data = star.long.clean)
summary(model.null.growth)
## Linear mixed model fit by REML ['lmerMod']
## Formula: science ~ year0 + (1 | stdntid)
##    Data: star.long.clean
## 
## REML criterion at convergence: 90950.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.0250 -0.6361  0.0105  0.6366  4.0222 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  stdntid  (Intercept) 1073     32.76   
##  Residual             1300     36.05   
## Number of obs: 8826, groups:  stdntid, 1471
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 669.3920     1.0919     613
## year0        24.2729     0.2247     108
## 
## Correlation of Fixed Effects:
##       (Intr)
## year0 -0.514

# ICC for the null growth model

null.growth.icc <- 1073/(1073 + 1300)
null.growth.icc
## [1] 0.4521702
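
An ICC of about 0.45 means roughly 45% of the variance in science scores lies between students, so clustering on stdntid clearly matters. Rather than typing the variances by hand, the ICC can also be pulled from the fitted model (a sketch using lme4's VarCorr(); vc and icc are just illustrative names):

vc <- as.data.frame(VarCorr(model.null.growth))
icc <- vc$vcov[vc$grp == "stdntid"] / sum(vc$vcov)
icc   # should match the hand calculation above (~0.45)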

## A growth model with linear and quadratic growth

model.square <- lmer(science ~ year0 + year_sq + (1|stdntid), REML = FALSE, data = star.long.clean)
summary(model.square)
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: science ~ year0 + year_sq + (1 | stdntid)
##    Data: star.long.clean
## 
##      AIC      BIC   logLik deviance df.resid 
##  88570.1  88605.5 -44280.0  88560.1     8821 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.6568 -0.5429 -0.0099  0.5539  4.5442 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  stdntid  (Intercept) 1132.3   33.65   
##  Residual              938.9   30.64   
## Number of obs: 8826, groups:  stdntid, 1471
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 646.2231     1.1376  568.08
## year0        59.0262     0.6811   86.66
## year_sq      -6.9507     0.1308  -53.16
## 
## Correlation of Fixed Effects:
##         (Intr) year0 
## year0   -0.485       
## year_sq  0.383 -0.960

## A growth model with linear, quadratic, and cubic growth (this fit also adds a random slope for year0, which is what triggers the singular-fit message below)

model.both <- lmer(science ~ year0 + year_sq + year_cube + (year0|stdntid), REML = FALSE, data = star.long.clean)
## boundary (singular) fit: see ?isSingular
summary(model.both)
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: science ~ year0 + year_sq + year_cube + (year0 | stdntid)
##    Data: star.long.clean
## 
##      AIC      BIC   logLik deviance df.resid 
##  87223.8  87280.5 -43603.9  87207.8     8818 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.2775 -0.4967  0.0019  0.5228  4.4707 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. Corr
##  stdntid  (Intercept) 1100.2324 33.1698      
##           year0          0.1208  0.3475  1.00
##  Residual              781.1644 27.9493      
## Number of obs: 8826, groups:  stdntid, 1471
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept) 635.77161    1.12157  566.86
## year0       106.75456    1.38715   76.96
## year_sq     -33.07934    0.68935  -47.99
## year_cube     3.48382    0.09053   38.48
## 
## Correlation of Fixed Effects:
##           (Intr) year0  yer_sq
## year0     -0.413              
## year_sq    0.300 -0.955       
## year_cube -0.242  0.894 -0.985
## convergence code: 0
## boundary (singular) fit: see ?isSingular
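
The singular-fit message comes from the random part of this model: the estimated correlation between the random intercept and the random year0 slope is 1.00 and the slope variance is nearly zero (0.12 vs. a residual variance of 781), so the random slope contributes almost nothing. A quick check, plus a hedged alternative fit that keeps only the random intercept (model.cubic.ri is my own name, not used elsewhere):

isSingular(model.both)   # should return TRUE, confirming the boundary fit

model.cubic.ri <- lmer(science ~ year0 + year_sq + year_cube + (1 | stdntid),
                       REML = FALSE, data = star.long.clean)

Dropping the random slope would also make the cubic model directly comparable to the quadratic model above, which has only a random intercept.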

## Now using anova() to see whether we should keep the cubic term for time (year)

anova(model.square, model.both)
## Data: star.long.clean
## Models:
## model.square: science ~ year0 + year_sq + (1 | stdntid)
## model.both: science ~ year0 + year_sq + year_cube + (year0 | stdntid)
##              npar   AIC   BIC logLik deviance  Chisq Df Pr(>Chisq)    
## model.square    5 88570 88605 -44280    88560                         
## model.both      8 87224 87280 -43604    87208 1352.3  3  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
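
The cubic model has the lowest AIC (87,224 vs. 88,570) and BIC (87,280 vs. 88,605), and the likelihood-ratio test strongly favors it, so the data do show evidence of nonlinear growth in science scores (the same null model fit by ML with lme further below has AIC ≈ 90,959, far higher). Two caveats: model.both also adds the random slope for year0, so the 3-df test bundles the cubic term with the random-slope parameters, and the null growth model above was fit with REML, so its criterion is not directly comparable. A sketch to put all three models on the same ML footing (model.null.ml is my own name):

model.null.ml <- lmer(science ~ year0 + (1 | stdntid), REML = FALSE, data = star.long.clean)
AIC(model.null.ml, model.square, model.both)
BIC(model.null.ml, model.square, model.both)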

## Using your preferred model from #3 above, test a heterogeneous level-1 variance structure using the lme() function in the nlme package. R users: do not include a random slope for grade_0 (year0 here) at the student level (this was causing convergence issues). Does the use of this structure improve the model fit?

library(nlme)
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:lme4':
## 
##     lmList
## The following object is masked from 'package:dplyr':
## 
##     collapse

## Now the homogeneous-variance model without a random slope

model.null.growth.homog <- lme(science ~ year0, 
                         random = ~ 1|stdntid, 
                         data = star.long.clean,
                         method = "ML")
summary(model.null.growth.homog)
## Linear mixed-effects model fit by maximum likelihood
##  Data: star.long.clean 
##        AIC      BIC    logLik
##   90959.37 90987.71 -45475.68
## 
## Random effects:
##  Formula: ~1 | stdntid
##         (Intercept) Residual
## StdDev:    32.74456 36.05019
## 
## Fixed effects: science ~ year0 
##                Value Std.Error   DF  t-value p-value
## (Intercept) 669.3920 1.0917635 7354 613.1291       0
## year0        24.2729 0.2247147 7354 108.0163       0
##  Correlation: 
##       (Intr)
## year0 -0.515
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -4.02535175 -0.63612595  0.01049814  0.63666093  4.02256978 
## 
## Number of Observations: 8826
## Number of Groups: 1471

## Now the heterogeneous-variance model without a random slope

model.null.growth.het <- lme(science ~ year0, 
                             random =  ~ 1|stdntid, 
                             data = star.long.clean, 
                             method = "ML", 
                             weights = varIdent(form = ~ 1 | year0)
)
summary(model.null.growth.het)
## Linear mixed-effects model fit by maximum likelihood
##  Data: star.long.clean 
##        AIC   BIC    logLik
##   89887.23 89951 -44934.62
## 
## Random effects:
##  Formula: ~1 | stdntid
##         (Intercept) Residual
## StdDev:    34.32642 67.40924
## 
## Variance function:
##  Structure: Different standard deviations per stratum
##  Formula: ~1 | year0 
##  Parameter estimates:
##         0         1         2         3         4         5 
## 1.0000000 0.4987552 0.4549923 0.4852018 0.3421510 0.3497103 
## Fixed effects: science ~ year0 
##                Value Std.Error   DF  t-value p-value
## (Intercept) 695.2179  1.179164 7354 589.5853       0
## year0        16.9724  0.212356 7354  79.9241       0
##  Correlation: 
##       (Intr)
## year0 -0.593
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -4.91503937 -0.71613675 -0.05974243  0.51662541  4.30180514 
## 
## Number of Observations: 8826
## Number of Groups: 1471
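
The heterogeneous level-1 variance structure drops the AIC from about 90,959 to 89,887 (and the BIC from about 90,988 to 89,951), so it does improve the model fit; the stratum-specific standard deviations also show that residual variability is largest in grade 3 and shrinks in later grades. A direct likelihood-ratio comparison (a sketch using nlme's anova() method on the two ML fits above):

anova(model.null.growth.homog, model.null.growth.het)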