#load in packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.1     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lmerTest)
## Loading required package: lme4
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
library(lme4)

#load in dataset

star_long <-haven::read_dta("STAR_long.dta")
glimpse(star_long)
## Rows: 8,826
## Columns: 7
## $ stdntid    <dbl> 10023, 10023, 10023, 10023, 10023, 10023, 10042, 10042, 10…
## $ grade      <dbl> 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 3, 4…
## $ race       <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gender     <dbl+lbl> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ ac_mot     <dbl> 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 52, 52, 52…
## $ ever_lunch <dbl+lbl> 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, …
## $ science    <dbl> 632, 756, 729, 762, 778, 783, 618, 710, 738, 740, 734, 771…

#tried to Hmisc and it isn’t working? So I will check to make sure dataset is named correctly & it is.

glimpse(star_long)
## Rows: 8,826
## Columns: 7
## $ stdntid    <dbl> 10023, 10023, 10023, 10023, 10023, 10023, 10042, 10042, 10…
## $ grade      <dbl> 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 3, 4…
## $ race       <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gender     <dbl+lbl> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ ac_mot     <dbl> 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 52, 52, 52…
## $ ever_lunch <dbl+lbl> 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, …
## $ science    <dbl> 632, 756, 729, 762, 778, 783, 618, 710, 738, 740, 734, 771…

###Cleaning: What are my variables? #science scores (science) as the DV, and grade (grade), gender (gender), race/ethnicity (race), and academic motivation (ac_mot) as predictors. #Create a new variable, grade_0, which goes from 0-5 instead of 3-8 like our current grade variable does. Why is using a time variable that starts at 0 helpful in interpreting our results?

starlong.clean <-star_long %>%
  mutate(.,
         race.fac = as_factor(race),
         gender.fac = as_factor(gender),
         freelunch.fac = as_factor(ever_lunch),
         grade0 = grade-3)

#tried arrange…didn’t seem to do the same as what happened with Dr. Broda

starlong.clean %>%
  arrange(., stdntid, grade0)
## # A tibble: 8,826 x 11
##    stdntid grade    race  gender ac_mot ever_lunch science race.fac gender.fac
##      <dbl> <dbl> <dbl+l> <dbl+l>  <dbl>  <dbl+lbl>   <dbl> <fct>    <fct>     
##  1   10023     3 1 [WHI… 1 [MAL…     50 2 [NON-FR…     632 WHITE    MALE      
##  2   10023     4 1 [WHI… 1 [MAL…     50 2 [NON-FR…     756 WHITE    MALE      
##  3   10023     5 1 [WHI… 1 [MAL…     50 2 [NON-FR…     729 WHITE    MALE      
##  4   10023     6 1 [WHI… 1 [MAL…     50 2 [NON-FR…     762 WHITE    MALE      
##  5   10023     7 1 [WHI… 1 [MAL…     50 2 [NON-FR…     778 WHITE    MALE      
##  6   10023     8 1 [WHI… 1 [MAL…     50 2 [NON-FR…     783 WHITE    MALE      
##  7   10042     3 1 [WHI… 2 [FEM…     50 1 [FREE L…     618 WHITE    FEMALE    
##  8   10042     4 1 [WHI… 2 [FEM…     50 1 [FREE L…     710 WHITE    FEMALE    
##  9   10042     5 1 [WHI… 2 [FEM…     50 1 [FREE L…     738 WHITE    FEMALE    
## 10   10042     6 1 [WHI… 2 [FEM…     50 1 [FREE L…     740 WHITE    FEMALE    
## # … with 8,816 more rows, and 2 more variables: freelunch.fac <fct>,
## #   grade0 <dbl>

#null model: science scores (science) as the DV, with scores clustered within students (stdntid). Note to self, rename the variable so it is easier to work with.

model.null <- lmer(science ~ (1|stdntid), data = starlong.clean)
summary(model.null)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: science ~ (1 | stdntid)
##    Data: starlong.clean
## 
## REML criterion at convergence: 97939.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1898 -0.3885  0.2381  0.6370  2.7353 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  stdntid  (Intercept)  729.4   27.01   
##  Residual             3361.7   57.98   
## Number of obs: 8826, groups:  stdntid, 1471
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)  730.0741     0.9363 1469.9997   779.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

##Calculate ICC

null.icc <- 729.4/(729.4 + 3361.7)
null.icc
## [1] 0.1782895

#Now time to run a growth null moddel for science scores with grade component (time value)

model.null.growth <- lmer(science ~ grade0 + (1|stdntid), data = starlong.clean)
summary(model.null.growth)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: science ~ grade0 + (1 | stdntid)
##    Data: starlong.clean
## 
## REML criterion at convergence: 90950.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.0250 -0.6361  0.0105  0.6366  4.0222 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  stdntid  (Intercept) 1073     32.76   
##  Residual             1300     36.05   
## Number of obs: 8826, groups:  stdntid, 1471
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)  669.3920     1.0919 2650.0494     613   <2e-16 ***
## grade0        24.2729     0.2247 7354.0000     108   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##        (Intr)
## grade0 -0.514

##ICC for null growth model

null.growth.icc <-1073 / (1073 + 1300) 
null.growth.icc
## [1] 0.4521702

#Now, run a random intercept growth model with science scores (science) as the DV, and grade (grade_0), gender (gender), race/ethnicity (race), and academic motivation (ac_mot) as predictors. Interpret the results and evaluate model fit using AIC/BIC and your choice of effect size.

model.1 <- lmer(science ~ grade0 + gender + race + ac_mot + (1|stdntid), data = starlong.clean)
summary(model.1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: science ~ grade0 + gender + race + ac_mot + (1 | stdntid)
##    Data: starlong.clean
## 
## REML criterion at convergence: 90850.9
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.0751 -0.6323  0.0104  0.6353  3.9929 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  stdntid  (Intercept)  995     31.54   
##  Residual             1300     36.05   
## Number of obs: 8826, groups:  stdntid, 1471
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)  717.9111    11.2852 1474.2937  63.615  < 2e-16 ***
## grade0        24.2729     0.2247 7354.0000 108.021  < 2e-16 ***
## gender        -6.4345     1.8672 1467.0000  -3.446 0.000585 ***
## race         -18.4500     2.0271 1467.0000  -9.102  < 2e-16 ***
## ac_mot        -0.3384     0.2294 1466.9983  -1.475 0.140412    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##        (Intr) grade0 gender race  
## grade0 -0.050                     
## gender -0.048  0.000              
## race   -0.194  0.000  0.056       
## ac_mot -0.937  0.000 -0.218 -0.034

#model fit

lmerTest::rand(model.1)
## ANOVA-like table for random-effects: Single term deletions
## 
## Model:
## science ~ grade0 + gender + race + ac_mot + (1 | stdntid)
##               npar logLik   AIC    LRT Df Pr(>Chisq)    
## <none>           7 -45425 90865                         
## (1 | stdntid)    6 -46666 93343 2480.5  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Run an interaction effect between ac_mot and grade_0 to test whether students with different levels of academic motivation have different growth in science scores. Interpret the results.

model.2 <- lmer(science ~ grade0 + gender + race + ac_mot + grade0:ac_mot + (1|stdntid), data = starlong.clean)
summary(model.2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: science ~ grade0 + gender + race + ac_mot + grade0:ac_mot + (1 |  
##     stdntid)
##    Data: starlong.clean
## 
## REML criterion at convergence: 90854.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.0727 -0.6324  0.0102  0.6354  3.9923 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  stdntid  (Intercept)  995     31.54   
##  Residual             1300     36.06   
## Number of obs: 8826, groups:  stdntid, 1471
## 
## Fixed effects:
##                 Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)    7.187e+02  1.318e+01  2.674e+03  54.512  < 2e-16 ***
## grade0         2.397e+01  2.735e+00  7.353e+03   8.763  < 2e-16 ***
## gender        -6.434e+00  1.867e+00  1.467e+03  -3.446 0.000585 ***
## race          -1.845e+01  2.027e+00  1.467e+03  -9.102  < 2e-16 ***
## ac_mot        -3.537e-01  2.680e-01  2.662e+03  -1.320 0.187029    
## grade0:ac_mot  6.121e-03  5.542e-02  7.353e+03   0.110 0.912047    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) grade0 gender race   ac_mot
## grade0      -0.519                            
## gender      -0.041  0.000                     
## race        -0.166  0.000  0.056              
## ac_mot      -0.954  0.515 -0.187 -0.029       
## grade0:c_mt  0.517 -0.997  0.000  0.000 -0.517

##Try adding a random slope for time (grade_0) at the student level.

model.3 <- lmer(science ~ grade0 + gender + race + ac_mot + (grade0|stdntid), data = starlong.clean)
## boundary (singular) fit: see ?isSingular
summary(model.3)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: science ~ grade0 + gender + race + ac_mot + (grade0 | stdntid)
##    Data: starlong.clean
## 
## REML criterion at convergence: 90849.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.0444 -0.6345  0.0095  0.6351  4.0093 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. Corr
##  stdntid  (Intercept) 9.529e+02 30.8685      
##           grade0      7.319e-02  0.2705  1.00
##  Residual             1.300e+03 36.0488      
## Number of obs: 8826, groups:  stdntid, 1471
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)  717.8706    11.2795 1474.0410  63.644   <2e-16 ***
## grade0        24.2729     0.2248 7291.9444 107.980   <2e-16 ***
## gender        -6.3401     1.8665 1467.3490  -3.397   0.0007 ***
## race         -18.3997     2.0262 1467.3490  -9.081   <2e-16 ***
## ac_mot        -0.3417     0.2293 1467.3497  -1.490   0.1364    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##        (Intr) grade0 gender race  
## grade0 -0.048                     
## gender -0.048  0.000              
## race   -0.194  0.000  0.056       
## ac_mot -0.937  0.000 -0.218 -0.034
## convergence code: 0
## boundary (singular) fit: see ?isSingular

##Use the rand function (R) to evaluate whether the slope should be treated as random.

rand(model.3)
## ANOVA-like table for random-effects: Single term deletions
## 
## Model:
## science ~ grade0 + gender + race + ac_mot + (grade0 | stdntid)
##                              npar logLik   AIC    LRT Df Pr(>Chisq)
## <none>                          9 -45425 90868                     
## grade0 in (grade0 | stdntid)    7 -45425 90865 1.3075  2     0.5201

##this looks weird, I will try again

lmerTest::rand(model.3)
## ANOVA-like table for random-effects: Single term deletions
## 
## Model:
## science ~ grade0 + gender + race + ac_mot + (grade0 | stdntid)
##                              npar logLik   AIC    LRT Df Pr(>Chisq)
## <none>                          9 -45425 90868                     
## grade0 in (grade0 | stdntid)    7 -45425 90865 1.3075  2     0.5201