getwd()
## [1] "D:/1. ACTUARY STUDY/0.0 Monash Study/4. ETC3580- Advanced statistical modelling- with R/3. Assignment 3-Deadine 17.10"
load("A3.Rdata")

1. All the variables in the two tibbles are stored as numerical information. Re-code the categorical variables to be factors with appropriate labels. Comment on the grouping levels in the data: do we have nested levels or crossed levels? (3 marks)

Comment:

In school tibble, the variable Type is recoded with 1= Public and 2 = Private, while the Size variable is divided into 3 groups: small (<1000 students), large (>2000) and medium (all remainders).

On grouping factors, we have nested levels as in both factors Type and Size form a hierarchical structure. In either Type level “Public” or “Private”, we all have 3 sizes of schools, though the label for Size are same for both Types, there are 6 independent levels (Small Public, Medium Public, Large Public, Small Private, Medium Private, Large Private)

school$Type[school$Type==1]="Public"
school$Type[school$Type==2]="Private"

school$size<- cut(school$Size,
                breaks=c(-Inf, 1000, 2000, Inf),
                labels=c("small","medium","large"))
school$Type<-as.factor(school$Type)
school$School<-as.factor(school$School)
student$School<-as.factor(student$School)
summary(school)
##      School        Type         Size            size   
##  1      : 1   Private:21   Min.   : 100.0   small :27  
##  2      : 1   Public :29   1st Qu.: 530.2   medium:16  
##  3      : 1                Median : 929.0   large : 7  
##  4      : 1                Mean   :1122.4              
##  5      : 1                3rd Qu.:1665.5              
##  6      : 1                Max.   :2713.0              
##  (Other):44

2. Add two predictors to the student tibble. Create the predictor mStatus, that is equal to the mean socio-economic status of students in each school, and create cStatus, that is Status centered to zero within schools. Use left_join to join the two tibbles together. (3 marks)

library(dplyr) 
# note: the function to generate mStatus sometimes does not create mean by group, please update your package then re-run. 

# mStatus
mS<-student %>%
  group_by(School) %>% summarise(mStatus= mean(Status))
## `summarise()` ungrouping output (override with `.groups` argument)
student<-left_join(student, mS)
## Joining, by = "School"
# cStatus
cS<-student %>% 
  mutate(cStatus = student$Status-student$mStatus)

student<-left_join(student, cS)
## Joining, by = c("School", "Status", "Math", "mStatus")
student %>% 
  group_by(School) %>% summarise(centermean=mean(cStatus))   # check centering mean 
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 50 x 2
##    School centermean
##    <fct>       <dbl>
##  1 1       -7.94e-18
##  2 2        3.12e-17
##  3 3        3.01e-17
##  4 4        9.60e-17
##  5 5        2.72e-17
##  6 6        3.55e-17
##  7 7       -3.03e-17
##  8 8       -6.56e-18
##  9 9       -5.46e-18
## 10 10       9.39e-18
## # ... with 40 more rows
# left joint

data<-left_join(school,student)
## Joining, by = "School"
summary(data)
##      School          Type           Size          size          Status       
##  12     :  67   Private: 979   Min.   : 100   small :1233   Min.   :-2.8380  
##  30     :  63   Public :1195   1st Qu.: 509   medium: 676   1st Qu.:-0.7780  
##  25     :  61                  Median : 780   large : 265   Median :-0.2480  
##  32     :  58                  Mean   :1074                 Mean   :-0.2319  
##  42     :  58                  3rd Qu.:1524                 3rd Qu.: 0.3220  
##  44     :  58                  Max.   :2713                 Max.   : 2.6920  
##  (Other):1809                                                                
##       Math           mStatus           cStatus         
##  Min.   :-2.832   Min.   :-1.1939   Min.   :-3.366909  
##  1st Qu.: 5.144   1st Qu.:-0.5227   1st Qu.:-0.452699  
##  Median :10.461   Median :-0.2919   Median :-0.006206  
##  Mean   :10.484   Mean   :-0.2319   Mean   : 0.000000  
##  3rd Qu.:15.571   3rd Qu.: 0.0732   3rd Qu.: 0.447079  
##  Max.   :23.447   Max.   : 0.6266   Max.   : 2.856078  
## 

3. Produce plots for Math against cStatus, faceted by the ten smallest private schools. Next, produce the same plot for the ten smallest Public schools. Comment on the differences. (6 marks)

Comment:

In terms of 10 smallest public schools, the math ability and cStatus does correlate in a certain degree via plot visualization (most of the schools shows that there is little positive correlation between cStatus and Math ability)

Those the positive correlation in private schools is not as clearly strong as public school

# Define 10 smallest puclic school (<=1313)
school %>% filter(Type=="Public")%>%arrange(Size)
## # A tibble: 29 x 4
##    School Type    Size size  
##    <fct>  <fct>  <dbl> <fct> 
##  1 14     Public   100 small 
##  2 39     Public   365 small 
##  3 35     Public   415 small 
##  4 25     Public   488 small 
##  5 36     Public   760 small 
##  6 40     Public   760 small 
##  7 1      Public   842 small 
##  8 17     Public   959 small 
##  9 46     Public  1125 medium
## 10 44     Public  1313 medium
## # ... with 19 more rows
# Select Math and cStatus for 10 smallest public schools
math_pub<-data %>% 
            arrange(desc(Size))%>%
            filter(Type=="Public" & Size<=1313)%>%
            select(Type,Math,cStatus,Size,School)

# Define 10 smallest private school (<=531)
school %>% filter(Type=="Private")%>%arrange(Size)
## # A tibble: 21 x 4
##    School Type     Size size 
##    <fct>  <fct>   <dbl> <fct>
##  1 9      Private   185 small
##  2 33     Private   270 small
##  3 49     Private   388 small
##  4 15     Private   400 small
##  5 5      Private   455 small
##  6 41     Private   457 small
##  7 22     Private   485 small
##  8 50     Private   509 small
##  9 11     Private   530 small
## 10 12     Private   531 small
## # ... with 11 more rows
# Select Math and cStatus for 10 smallest private schools
math_pri<-data %>% 
            arrange(desc(Size))%>%
            filter(Type=="Private" & Size<=531)%>%
            select(Type,Math,cStatus,Size,School)
mb<-math_pub %>%
    ggplot(aes(y= Math, x= cStatus))+
      geom_point()+
      geom_smooth(method=lm)+
      ggtitle('Math score against cStatus by 10 smallest public schools') +
      xlab('Ascending cStatus') +
      ylab('Ascending Math Ability')+facet_wrap(~School)+
      stat_regline_equation(label.x = -2, label.y = 30)
  

mp<-math_pri %>%
    ggplot(aes(y= Math, x= cStatus))+
      geom_point()+
      geom_smooth(method=lm, se=FALSE, fullrange=TRUE)+
      ggtitle('Math score against cStatus by 10 smallest private schools') +
      xlab('Ascending cStatus') +
      ylab('Ascending Math Ability')+facet_wrap(~School)+
      stat_regline_equation(label.x = -2, label.y = 30)+geom_rug()
mp
## `geom_smooth()` using formula 'y ~ x'

mb
## `geom_smooth()` using formula 'y ~ x'

math_10smallest<-bind_rows(math_pri,math_pub)



math_10smallest %>%
    ggplot(aes(x= cStatus, y= Math, color=Type ))+
      geom_point()+
      geom_smooth(method=lm, se=FALSE, fullrange=TRUE)+
      ggtitle('Math score against cStatus by 10 smallest private schools') +
      ylab('Ascending Math') +
      xlab('Ascending Cstatus')+
      stat_regline_equation(label.x = -4, label.y = 35)+
      facet_wrap(~School,nrow=5)+theme(panel.spacing = unit(0.5, "lines"))
## `geom_smooth()` using formula 'y ~ x'

4 Produce boxplots that compare the intercepts and slopes of the types. Include all schools in this analysis. Interpret the results. (8 marks)

Visually, the intercept for public is slightly smaller than private schools, that indicates that the math ability in public school is slightly lower than public schools.

Visually The slope for public is lightly higher to private schools in general, indicating that growth in every unit of cStatus will provoke a faster growth in Math ability for public rather than private school students.

5. Fit a mixed effects model with the math score as the response, and for each school a random intercept and slope for cStatus. The fixed main effects in the model are mStatus, Type, and cStatus, interaction between Type and cStatus. (3 marks).

model1<-lmer(data=data, 
             Math~ mStatus+Type+cStatus+Type:cStatus+ (cStatus|School))

6. Briefly interpret the estimated fixed effects. Use a visreg plot for the interaction. (7 marks)

model1 is built for the intercept and slopes can vary among school with:

The intercept 11.57 is the average intercept among 50 schools. Specific intercept of each school will be computed based on the fixed intercept. For example the variation of each school intercept relative to 11.57 is theta, then the intercept of each school will be 11.57+theta.

The slope for mStatus and Type are fixed across schools, while the average slope of cStatus of 50 schools are 1.26 (the slope of specific school will higher or lower around 1.26)

For 4.51 unit increase in for Math by additional unit of mStatus.

Those in public schools tends to be outperformed by private school students with 0.31 times less than.

For example, for school 1 the intercept is 8.01 and cStatus slope is 0.92, while school 3 has 10.9 and 1.1 for intercept and cStatus slope respectively.

As plotting with interaction, for either private or public, cStatus is positive correlated with math score. The slope of public school is slightly steeper indicating that the cStatus in public school students grow slightly faster.

summary(model1) # coef of fixed effect only
## Linear mixed model fit by REML ['lmerMod']
## Formula: Math ~ mStatus + Type + cStatus + Type:cStatus + (cStatus | School)
##    Data: data
## 
## REML criterion at convergence: 13954
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.71032 -0.74023 -0.01129  0.71450  2.81159 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr
##  School   (Intercept)  2.6630  1.6319       
##           cStatus      0.6111  0.7817   0.12
##  Residual             34.5769  5.8802       
## Number of obs: 2174, groups:  School, 50
## 
## Fixed effects:
##                    Estimate Std. Error t value
## (Intercept)         11.5687     0.4146  27.902
## mStatus              4.5119     0.6308   7.152
## TypePublic          -0.3064     0.5431  -0.564
## cStatus              1.2584     0.3385   3.718
## TypePublic:cStatus   0.3705     0.4508   0.822
## 
## Correlation of Fixed Effects:
##             (Intr) mStats TypPbl cStats
## mStatus      0.221                     
## TypePublic  -0.689  0.166              
## cStatus      0.051  0.000 -0.039       
## TypPblc:cSt -0.039 -0.001  0.051 -0.751
# plot interaction effect
visreg(model1, "cStatus", by="Type", gg=TRUE) + theme_bw()

7. Briefly interpret the random effects. (3 marks)

The variation between individual students within the school are extremely large (34.5769),

while the variation intercept between 50 schools not really large (2.663).

The slope variation for cStatus is small (0.6111).

The corr = 0.12 indicates positive correlation between intercept an slope (when an School’s intercept increases by one unit of standard deviation, that school’s slope would increase by 0.12 standard deviation unit).

For example,

School1 the y= 8.0 + 4.51x1 - 0.31x2 + 0.92x3 + 0.37x2x3 School3 the y=10.9 + 4.51x1 - 0.31x2 + 1.12x3 + 0.37x2x3

head(coef(model1)$School)
##   (Intercept)  mStatus TypePublic   cStatus TypePublic:cStatus
## 1    8.011615 4.511919  -0.306393 0.9230512          0.3704958
## 2    9.010448 4.511919  -0.306393 1.2316946          0.3704958
## 3   10.998980 4.511919  -0.306393 1.1077697          0.3704958
## 4   10.173915 4.511919  -0.306393 0.8663212          0.3704958
## 5    8.848771 4.511919  -0.306393 1.0422813          0.3704958
## 6   10.975848 4.511919  -0.306393 1.3958886          0.3704958

8. Produce plots to assess the distributional assumptions in the mixed effects model. What lmer(data=data,is your conclusion? (4 marks)

For the normal plot of data, it shows that the residual follow the normal distribution but with the unusual fat tails.

Based on the fitted residual plot, the model somehow not violated the constant variance assumption (though there are some outliers with large residual, most of data points concentrate around residual plot)

For the normal plot of random effect, it shows that the residual of random effect not normally distributed

# Linearity Assumption
Plot.Model.F.Linearity<-plot(resid(model1),data$Math)

# fitted vs residual plot (Homogeneity of Variance assumption)
augment(model1) %>%
  ggplot(aes(x=.fitted,y=.resid)) + 
    geom_point(alpha=0.1) +
    geom_hline(yintercept=0) +
    xlab("Fitted") + ylab("Residuals")

# assumption that the residuals of the model are normally distributed 
augment(model1) %>%
  ggplot(aes(sample=.resid)) + geom_qq()+geom_qq_line()

# Check normality of random school effects
ranef(model1) %>% as.tibble() %>%
  ggplot(aes(sample=condval)) +
  facet_wrap(~ grpvar) +
  geom_qq() 
## Warning: `as.tibble()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

redres::plot_ranef(model1)
## Loading required namespace: testthat

9. Which fixed effects are significant? Is the correlation between the random effects significant? (3 marks)

thieu phan test correlation

mStatus, the interaction between cStatus and Type are significant with p-value<0.05 (we reject the Null hypothesis)

library(pbkrtest)

model1<-lmer(data=data, Math~ mStatus+Type+ cStatus+Type:cStatus+(cStatus|School),REML = F)
nullmodel<-lmer(data=data, Math~      Type+ cStatus+Type:cStatus+(cStatus|School), REML = F)
KRmodcomp(model1, nullmodel)
## F-test with Kenward-Roger approximation; time: 9.19 sec
## large : Math ~ mStatus + Type + cStatus + (cStatus | School) + Type:cStatus
## small : Math ~ Type + cStatus + Type:cStatus + (cStatus | School)
##         stat    ndf    ddf F.scaling   p.value    
## Ftest 49.094  1.000 46.155         1 8.754e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# reject Null =>  significant
model1<-lmer(data=data,   Math~ mStatus+Type+cStatus+ Type:cStatus+(cStatus|School),REML = F)
nullmodel<-lmer(data=data,Math~ mStatus              +Type:cStatus+(cStatus|School),REML = F)
KRmodcomp(model1, nullmodel)
## F-test with Kenward-Roger approximation; time: 8.36 sec
## large : Math ~ mStatus + Type + cStatus + (cStatus | School) + Type:cStatus
## small : Math ~ mStatus + Type:cStatus + (cStatus | School)
##          stat     ndf     ddf F.scaling p.value
## Ftest  0.3177  1.0000 46.2820         1  0.5757
# fail to reject Null => Type and cStatus no significant
model1<-lmer(data=data, Math~      mStatus+ Type:cStatus+ (cStatus|School),REML = F)
nullmodel <- lmer(data=data, Math~ mStatus+               (cStatus|School), REML = F)
KRmodcomp(model1, nullmodel)
## F-test with Kenward-Roger approximation; time: 7.72 sec
## large : Math ~ mStatus + (cStatus | School) + Type:cStatus
## small : Math ~ mStatus + (cStatus | School)
##         stat    ndf    ddf F.scaling   p.value    
## Ftest 21.740  2.000 43.642   0.99992 2.811e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# reject Null => interaction is significant
library(lmerTest)
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
lmerTest::ranova(model1)
## ANOVA-like table for random-effects: Single term deletions
## 
## Model:
## Math ~ mStatus + (cStatus | School) + Type:cStatus
##                               npar  logLik   AIC    LRT Df Pr(>Chisq)
## <none>                           8 -6977.0 13970                     
## cStatus in (cStatus | School)    6 -6977.8 13968 1.5929  2     0.4509
car::Anova(model1)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: Math
##               Chisq Df Pr(>Chisq)    
## mStatus      57.808  1  2.890e-14 ***
## Type:cStatus 45.695  2  1.195e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

10. Fit a second mixed effects model, with only mStatus and cStatus as fixed effects, and the same random effect specification as in question 5. Use a bootstrap test to check if this model is better. (8 marks)

New model: Math~ mStatus+cStatus+(cStatus|School)

The null: effect of 2 drop out predictors = 0 The alternative: effect of 2 drop out predictors >< 0

The p-value =0.602 then we fail to reject the Null and conclude that Type and the interaction are not significant (their effects to model is 0). Then they are not worth to be added in model. Therefore, we choose the a simpler model that is Null model (with only mStatus and cStatus as fixed effects).

We would like to double check with AIC. AIC of model null model AIC(nullmodel) is smaller than AIC of model1 AIC(model1)

# Compute bootstrap p-value = (number of bootstrap value > observed value)/ # of bootstrap time

sum(actual < lrstat)/1000 
## [1] 0.602
# Double check the result with AIC
AIC(model1)
## [1] 13971.63
AIC(nullmodel)
## [1] 13968.71