getwd()
## [1] "D:/1. ACTUARY STUDY/0.0 Monash Study/4. ETC3580- Advanced statistical modelling- with R/3. Assignment 3-Deadine 17.10"
load("A3.Rdata")
Comment:
In the school tibble, the variable Type is recoded so that 1 = Public and 2 = Private, while the Size variable is split into three groups: small (at most 1000 students), large (more than 2000 students), and medium (everything in between).
For the grouping factors, the levels can be treated as nested: Type and Size together form a hierarchical structure. Within each Type level ("Public" or "Private") there are three school sizes, and although the Size labels are the same for both Types, this gives six distinct groups (Small Public, Medium Public, Large Public, Small Private, Medium Private, Large Private).
# Recode Type, derive the size grouping, and convert the ID variables to factors
school$Type[school$Type == 1] <- "Public"
school$Type[school$Type == 2] <- "Private"
school$size <- cut(school$Size,
                   breaks = c(-Inf, 1000, 2000, Inf),
                   labels = c("small", "medium", "large"))
school$Type <- as.factor(school$Type)
school$School <- as.factor(school$School)
student$School <- as.factor(student$School)
summary(school)
## School Type Size size
## 1 : 1 Private:21 Min. : 100.0 small :27
## 2 : 1 Public :29 1st Qu.: 530.2 medium:16
## 3 : 1 Median : 929.0 large : 7
## 4 : 1 Mean :1122.4
## 5 : 1 3rd Qu.:1665.5
## 6 : 1 Max. :2713.0
## (Other):44
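To make the six Type-by-size groups described above explicit, a quick cross-tabulation can be run (a sketch added here for illustration; its output is not part of the original report):
# Count schools in each Type-by-size cell; the six cells correspond to the
# six groups listed in the comment above
table(school$Type, school$size)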
library(dplyr)
# Note: on some dplyr versions, summarise() below may not return the mean by group as expected; if so, update the package and re-run.
# mStatus
mS <- student %>%
  group_by(School) %>%
  summarise(mStatus = mean(Status))
## `summarise()` ungrouping output (override with `.groups` argument)
student<-left_join(student, mS)
## Joining, by = "School"
# cStatus
cS <- student %>%
  mutate(cStatus = Status - mStatus)
student <- left_join(student, cS)
## Joining, by = c("School", "Status", "Math", "mStatus")
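As an aside, the same mStatus and cStatus columns could also be built in a single grouped pipeline; this is only an alternative sketch (the object name student_alt is used purely for illustration), not a replacement for the code above:
# Group-wise mean and group-mean centering in one pipe
student_alt <- student %>%
  group_by(School) %>%
  mutate(mStatus = mean(Status),
         cStatus = Status - mStatus) %>%
  ungroup()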
student %>%
group_by(School) %>% summarise(centermean=mean(cStatus)) # check centering mean
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 50 x 2
## School centermean
## <fct> <dbl>
## 1 1 -7.94e-18
## 2 2 3.12e-17
## 3 3 3.01e-17
## 4 4 9.60e-17
## 5 5 2.72e-17
## 6 6 3.55e-17
## 7 7 -3.03e-17
## 8 8 -6.56e-18
## 9 9 -5.46e-18
## 10 10 9.39e-18
## # ... with 40 more rows
# left join school and student data
data<-left_join(school,student)
## Joining, by = "School"
summary(data)
## School Type Size size Status
## 12 : 67 Private: 979 Min. : 100 small :1233 Min. :-2.8380
## 30 : 63 Public :1195 1st Qu.: 509 medium: 676 1st Qu.:-0.7780
## 25 : 61 Median : 780 large : 265 Median :-0.2480
## 32 : 58 Mean :1074 Mean :-0.2319
## 42 : 58 3rd Qu.:1524 3rd Qu.: 0.3220
## 44 : 58 Max. :2713 Max. : 2.6920
## (Other):1809
## Math mStatus cStatus
## Min. :-2.832 Min. :-1.1939 Min. :-3.366909
## 1st Qu.: 5.144 1st Qu.:-0.5227 1st Qu.:-0.452699
## Median :10.461 Median :-0.2919 Median :-0.006206
## Mean :10.484 Mean :-0.2319 Mean : 0.000000
## 3rd Qu.:15.571 3rd Qu.: 0.0732 3rd Qu.: 0.447079
## Max. :23.447 Max. : 0.6266 Max. : 2.856078
##
Comment:
For the 10 smallest public schools, the plots suggest that Math ability and cStatus are correlated to some degree: most of these schools show a slight positive correlation between cStatus and Math ability.
In the private schools, however, the positive correlation is not as clear or as strong as in the public schools.
# Identify the 10 smallest public schools (Size <= 1313)
school %>% filter(Type=="Public")%>%arrange(Size)
## # A tibble: 29 x 4
## School Type Size size
## <fct> <fct> <dbl> <fct>
## 1 14 Public 100 small
## 2 39 Public 365 small
## 3 35 Public 415 small
## 4 25 Public 488 small
## 5 36 Public 760 small
## 6 40 Public 760 small
## 7 1 Public 842 small
## 8 17 Public 959 small
## 9 46 Public 1125 medium
## 10 44 Public 1313 medium
## # ... with 19 more rows
# Select Math and cStatus for 10 smallest public schools
math_pub <- data %>%
  arrange(desc(Size)) %>%
  filter(Type == "Public" & Size <= 1313) %>%
  select(Type, Math, cStatus, Size, School)
# Identify the 10 smallest private schools (Size <= 531)
school %>% filter(Type=="Private")%>%arrange(Size)
## # A tibble: 21 x 4
## School Type Size size
## <fct> <fct> <dbl> <fct>
## 1 9 Private 185 small
## 2 33 Private 270 small
## 3 49 Private 388 small
## 4 15 Private 400 small
## 5 5 Private 455 small
## 6 41 Private 457 small
## 7 22 Private 485 small
## 8 50 Private 509 small
## 9 11 Private 530 small
## 10 12 Private 531 small
## # ... with 11 more rows
# Select Math and cStatus for 10 smallest private schools
math_pri <- data %>%
  arrange(desc(Size)) %>%
  filter(Type == "Private" & Size <= 531) %>%
  select(Type, Math, cStatus, Size, School)
library(ggplot2)
library(ggpubr)   # provides stat_regline_equation()
mb <- math_pub %>%
  ggplot(aes(y = Math, x = cStatus)) +
  geom_point() +
  geom_smooth(method = lm) +
  ggtitle('Math score against cStatus for the 10 smallest public schools') +
  xlab('cStatus') +
  ylab('Math ability') +
  facet_wrap(~School) +
  stat_regline_equation(label.x = -2, label.y = 30)
mp <- math_pri %>%
  ggplot(aes(y = Math, x = cStatus)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE, fullrange = TRUE) +
  ggtitle('Math score against cStatus for the 10 smallest private schools') +
  xlab('cStatus') +
  ylab('Math ability') +
  facet_wrap(~School) +
  stat_regline_equation(label.x = -2, label.y = 30) +
  geom_rug()
mp
## `geom_smooth()` using formula 'y ~ x'
mb
## `geom_smooth()` using formula 'y ~ x'
math_10smallest <- bind_rows(math_pri, math_pub)
math_10smallest %>%
  ggplot(aes(x = cStatus, y = Math, color = Type)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE, fullrange = TRUE) +
  ggtitle('Math score against cStatus for the 10 smallest public and private schools') +
  ylab('Math') +
  xlab('cStatus') +
  stat_regline_equation(label.x = -4, label.y = 35) +
  facet_wrap(~School, nrow = 5) +
  theme(panel.spacing = unit(0.5, "lines"))
## `geom_smooth()` using formula 'y ~ x'
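To back up the visual impression with numbers, the within-school correlation between Math and cStatus for these 20 schools could also be computed (a sketch added for illustration; its output is not shown in the original report):
# Per-school Pearson correlation between Math and cStatus
math_10smallest %>%
  group_by(Type, School) %>%
  summarise(r = cor(Math, cStatus), .groups = "drop") %>%
  arrange(Type, desc(r))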
Visually, the intercepts for the public schools are slightly smaller than those for the private schools, indicating that baseline Math ability in public schools is slightly lower than in private schools.
Visually, the slopes for the public schools are slightly steeper than those for the private schools in general, indicating that each additional unit of cStatus is associated with a larger increase in Math ability for public-school students than for private-school students.
library(lme4)
model1 <- lmer(Math ~ mStatus + Type + cStatus + Type:cStatus + (cStatus | School),
               data = data)
model1 allows the intercept and the cStatus slope to vary among schools:
The intercept of 11.57 is the average intercept across the 50 schools. Each school's own intercept is an offset from this fixed intercept: if a school's deviation from 11.57 is theta, that school's intercept is 11.57 + theta.
The slopes for mStatus and Type are fixed across schools, while the average cStatus slope across the 50 schools is 1.26 (each school's slope lies somewhat above or below 1.26).
Math increases by 4.51 units for each additional unit of mStatus.
Public-school students tend to be outperformed by private-school students, scoring about 0.31 units lower on average.
For example, school 1 has an intercept of 8.01 and a cStatus slope of 0.92, while school 3 has an intercept of 10.9 and a cStatus slope of 1.1.
Plotting the interaction shows that, for both private and public schools, cStatus is positively correlated with Math score. The slope for public schools is slightly steeper, indicating that Math grows slightly faster with cStatus for public-school students.
summary(model1) # fixed-effect estimates and random-effect variance components
## Linear mixed model fit by REML ['lmerMod']
## Formula: Math ~ mStatus + Type + cStatus + Type:cStatus + (cStatus | School)
## Data: data
##
## REML criterion at convergence: 13954
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.71032 -0.74023 -0.01129 0.71450 2.81159
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## School (Intercept) 2.6630 1.6319
## cStatus 0.6111 0.7817 0.12
## Residual 34.5769 5.8802
## Number of obs: 2174, groups: School, 50
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 11.5687 0.4146 27.902
## mStatus 4.5119 0.6308 7.152
## TypePublic -0.3064 0.5431 -0.564
## cStatus 1.2584 0.3385 3.718
## TypePublic:cStatus 0.3705 0.4508 0.822
##
## Correlation of Fixed Effects:
## (Intr) mStats TypPbl cStats
## mStatus 0.221
## TypePublic -0.689 0.166
## cStatus 0.051 0.000 -0.039
## TypPblc:cSt -0.039 -0.001 0.051 -0.751
# Plot the interaction effect
library(visreg)
visreg(model1, "cStatus", by = "Type", gg = TRUE) + theme_bw()
The variation between individual students within a school is very large (residual variance 34.58), while the intercept variation between the 50 schools is comparatively small (2.66).
The slope variation for cStatus is also small (0.61).
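One way to put the intercept variance in context is the intraclass correlation for the random intercept, ignoring the random slope; the sketch below is a rough illustration based on the variance components reported above:
# Share of (intercept + residual) variance that lies between schools
vc <- as.data.frame(VarCorr(model1))
sigma2_school   <- vc$vcov[vc$grp == "School" & vc$var1 == "(Intercept)" & is.na(vc$var2)]
sigma2_residual <- vc$vcov[vc$grp == "Residual"]
sigma2_school / (sigma2_school + sigma2_residual)   # roughly 2.66 / (2.66 + 34.58), about 0.07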
The correlation of 0.12 indicates a positive relationship between intercept and slope: when a school's intercept is one standard deviation higher, its cStatus slope tends to be about 0.12 standard deviations higher.
For example (with x1 = mStatus, x2 = TypePublic, x3 = cStatus):
School 1: y = 8.0 + 4.51 x1 - 0.31 x2 + 0.92 x3 + 0.37 x2 x3
School 3: y = 10.9 + 4.51 x1 - 0.31 x2 + 1.12 x3 + 0.37 x2 x3
head(coef(model1)$School)
## (Intercept) mStatus TypePublic cStatus TypePublic:cStatus
## 1 8.011615 4.511919 -0.306393 0.9230512 0.3704958
## 2 9.010448 4.511919 -0.306393 1.2316946 0.3704958
## 3 10.998980 4.511919 -0.306393 1.1077697 0.3704958
## 4 10.173915 4.511919 -0.306393 0.8663212 0.3704958
## 5 8.848771 4.511919 -0.306393 1.0422813 0.3704958
## 6 10.975848 4.511919 -0.306393 1.3958886 0.3704958
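These school-specific coefficients are simply the fixed effects shifted by each school's random effects; a quick check (a sketch added for illustration, output not shown):
# coef(model1)$School equals the fixed effect plus the corresponding random
# effect, verified here for the intercept and the cStatus slope
head(fixef(model1)["(Intercept)"] + ranef(model1)$School[, "(Intercept)"])
head(fixef(model1)["cStatus"] + ranef(model1)$School[, "cStatus"])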
The normal QQ plot of the residuals shows that they roughly follow a normal distribution, but with unusually fat tails.
Based on the fitted-versus-residual plot, the model does not appear to violate the constant-variance assumption (although there are some outliers with large residuals, most points are concentrated in a band around zero).
The normal QQ plot of the random effects suggests that the random effects are not normally distributed.
# Linearity assumption: residuals against the response
plot(resid(model1), data$Math)
# Fitted vs residual plot (homogeneity-of-variance assumption)
library(broom.mixed)   # provides augment() for merMod objects
augment(model1) %>%
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point(alpha = 0.1) +
  geom_hline(yintercept = 0) +
  xlab("Fitted") + ylab("Residuals")
# QQ plot: check that the model residuals are normally distributed
augment(model1) %>%
  ggplot(aes(sample = .resid)) + geom_qq() + geom_qq_line()
# Check normality of the random school effects
ranef(model1) %>% as_tibble() %>%
  ggplot(aes(sample = condval)) +
  facet_wrap(~ grpvar) +
  geom_qq()
redres::plot_ranef(model1)
## Loading required namespace: testthat
(Note: the correlation test part is still missing here.)
mStatus and the interaction between cStatus and Type are significant, with p-values < 0.05 (we reject the null hypothesis for these terms).
library(pbkrtest)
model1<-lmer(data=data, Math~ mStatus+Type+ cStatus+Type:cStatus+(cStatus|School),REML = F)
nullmodel<-lmer(data=data, Math~ Type+ cStatus+Type:cStatus+(cStatus|School), REML = F)
KRmodcomp(model1, nullmodel)
## F-test with Kenward-Roger approximation; time: 9.19 sec
## large : Math ~ mStatus + Type + cStatus + (cStatus | School) + Type:cStatus
## small : Math ~ Type + cStatus + Type:cStatus + (cStatus | School)
## stat ndf ddf F.scaling p.value
## Ftest 49.094 1.000 46.155 1 8.754e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# reject the null => mStatus is significant
model1<-lmer(data=data, Math~ mStatus+Type+cStatus+ Type:cStatus+(cStatus|School),REML = F)
nullmodel<-lmer(data=data,Math~ mStatus +Type:cStatus+(cStatus|School),REML = F)
KRmodcomp(model1, nullmodel)
## F-test with Kenward-Roger approximation; time: 8.36 sec
## large : Math ~ mStatus + Type + cStatus + (cStatus | School) + Type:cStatus
## small : Math ~ mStatus + Type:cStatus + (cStatus | School)
## stat ndf ddf F.scaling p.value
## Ftest 0.3177 1.0000 46.2820 1 0.5757
# fail to reject the null => the Type and cStatus main effects are not significant
model1<-lmer(data=data, Math~ mStatus+ Type:cStatus+ (cStatus|School),REML = F)
nullmodel <- lmer(data=data, Math~ mStatus+ (cStatus|School), REML = F)
KRmodcomp(model1, nullmodel)
## F-test with Kenward-Roger approximation; time: 7.72 sec
## large : Math ~ mStatus + (cStatus | School) + Type:cStatus
## small : Math ~ mStatus + (cStatus | School)
## stat ndf ddf F.scaling p.value
## Ftest 21.740 2.000 43.642 0.99992 2.811e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# reject Null => interaction is significant
library(lmerTest)
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
lmerTest::ranova(model1)
## ANOVA-like table for random-effects: Single term deletions
##
## Model:
## Math ~ mStatus + (cStatus | School) + Type:cStatus
## npar logLik AIC LRT Df Pr(>Chisq)
## <none> 8 -6977.0 13970
## cStatus in (cStatus | School) 6 -6977.8 13968 1.5929 2 0.4509
car::Anova(model1)
## Analysis of Deviance Table (Type II Wald chisquare tests)
##
## Response: Math
## Chisq Df Pr(>Chisq)
## mStatus 57.808 1 2.890e-14 ***
## Type:cStatus 45.695 2 1.195e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
New model: Math ~ mStatus + cStatus + (cStatus | School).
The null hypothesis is that the effects of the two dropped predictors (Type and the Type:cStatus interaction) are zero; the alternative is that at least one of them is non-zero.
The bootstrap p-value is 0.602, so we fail to reject the null and conclude that Type and the interaction are not significant (their contribution to the model is effectively zero) and not worth including. We therefore choose the simpler null model, with only mStatus and cStatus as fixed effects.
We double-check with AIC: the AIC of the null model, AIC(nullmodel), is smaller than that of model1, AIC(model1), which supports the simpler model.
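The code that generated lrstat and actual is not shown in this section. Below is a minimal sketch of the usual parametric bootstrap, assuming the comparison is between the full model (with Type and the Type:cStatus interaction) and the reduced model Math ~ mStatus + cStatus + (cStatus | School), both fitted with REML = FALSE; the object names fullmod and redmod and the seed are placeholders, so re-running this sketch will not reproduce the 0.602 below exactly.
# Parametric bootstrap of the likelihood-ratio statistic (1000 replicates)
fullmod <- lmer(Math ~ mStatus + Type + cStatus + Type:cStatus + (cStatus | School),
                data = data, REML = FALSE)
redmod  <- lmer(Math ~ mStatus + cStatus + (cStatus | School),
                data = data, REML = FALSE)
actual <- as.numeric(2 * (logLik(fullmod) - logLik(redmod)))  # observed LRT statistic
set.seed(1)                                                   # arbitrary seed
lrstat <- numeric(1000)
for (i in seq_len(1000)) {
  y <- unlist(simulate(redmod))   # simulate the response under the null (reduced) model
  lrstat[i] <- as.numeric(2 * (logLik(refit(fullmod, y)) - logLik(refit(redmod, y))))
}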
# Bootstrap p-value = (number of bootstrap LRT values exceeding the observed value) / number of bootstrap replicates
sum(actual < lrstat)/1000
## [1] 0.602
# Double check the result with AIC
AIC(model1)
## [1] 13971.63
AIC(nullmodel)
## [1] 13968.71