p = c("dplyr", "magrittr", "lme4", "ggplot2","segmented", "gridExtra")
lapply(p, library, character.only = TRUE)
dat = read.csv("data/classroom.csv", header = TRUE)
str(dat)
## 'data.frame': 1190 obs. of 12 variables:
## $ sex : int 1 0 1 0 0 1 0 0 1 0 ...
## $ minority: int 1 1 1 1 1 1 1 1 1 1 ...
## $ mathkind: int 448 460 511 449 425 450 452 443 422 480 ...
## $ mathgain: int 32 109 56 83 53 65 51 66 88 -7 ...
## $ ses : num 0.46 -0.27 -0.03 -0.38 -0.03 0.76 -0.03 0.2 0.64 0.13 ...
## $ yearstea: num 1 1 1 2 2 2 2 2 2 2 ...
## $ mathknow: num NA NA NA -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 ...
## $ housepov: num 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 ...
## $ mathprep: num 2 2 2 3.25 3.25 3.25 3.25 3.25 3.25 3.25 ...
## $ classid : int 160 160 160 217 217 217 217 217 217 217 ...
## $ schoolid: int 1 1 1 1 1 1 1 1 1 1 ...
## $ childid : int 1 2 3 4 5 6 7 8 9 10 ...
index = c(1,2,10,11,12)
dat[,index] <- lapply(dat[,index], as.factor)
#Model 1: Different intercept among different schools and classes
summary(m0 <- lmer(mathgain~(1|schoolid)+(1|classid)+
mathkind+sex+minority+ses, data = dat))
## Linear mixed model fit by REML ['lmerMod']
## Formula: mathgain ~ (1 | schoolid) + (1 | classid) + mathkind + sex +
## minority + ses
## Data: dat
##
## REML criterion at convergence: 11385.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.8257 -0.6110 -0.0337 0.5538 4.2678
##
## Random effects:
## Groups Name Variance Std.Dev.
## classid (Intercept) 83.28 9.126
## schoolid (Intercept) 75.20 8.672
## Residual 734.57 27.103
## Number of obs: 1190, groups: classid, 312; schoolid, 107
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 282.79034 10.85323 26.056
## mathkind -0.46980 0.02227 -21.100
## sex1 -1.25119 1.65773 -0.755
## minority1 -8.26213 2.34011 -3.531
## ses 5.34638 1.24109 4.308
##
## Correlation of Fixed Effects:
## (Intr) mthknd sex1 mnrty1
## mathkind -0.978
## sex1 -0.044 -0.031
## minority1 -0.307 0.163 -0.018
## ses 0.140 -0.168 0.019 0.163
#Model 2: Using classese and school to predict
summary(m1 <- lmer(mathgain~(1|schoolid)+(1|classid), data = dat))
## Linear mixed model fit by REML ['lmerMod']
## Formula: mathgain ~ (1 | schoolid) + (1 | classid)
## Data: dat
##
## REML criterion at convergence: 11768.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.6441 -0.5984 -0.0336 0.5334 5.6335
##
## Random effects:
## Groups Name Variance Std.Dev.
## classid (Intercept) 99.23 9.961
## schoolid (Intercept) 77.49 8.803
## Residual 1028.23 32.066
## Number of obs: 1190, groups: classid, 312; schoolid, 107
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 57.427 1.443 39.79
dat = read.csv("data/language_arith.csv", header = TRUE)
str(dat)
## 'data.frame': 2287 obs. of 4 variables:
## $ School : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Pupil : int 17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
## $ Language : int 46 45 33 46 20 30 30 57 36 36 ...
## $ Arithmetic: int 24 19 24 26 9 13 13 30 23 22 ...
dat$School %<>% as.factor
cor(dat$Language, dat$Arithmetic)
## [1] 0.694228
#Different intercepts on school level
summary(m0 <- lmer(Language~(1|School), data = dat))
## Linear mixed model fit by REML ['lmerMod']
## Formula: Language ~ (1 | School)
## Data: dat
##
## REML criterion at convergence: 16253.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.11618 -0.65703 0.07597 0.74128 2.50639
##
## Random effects:
## Groups Name Variance Std.Dev.
## School (Intercept) 19.63 4.431
## Residual 64.56 8.035
## Number of obs: 2287, groups: School, 131
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 40.3624 0.4282 94.26
summary(m1 <- lmer(Arithmetic~(1|School), data = dat))
## Linear mixed model fit by REML ['lmerMod']
## Formula: Arithmetic ~ (1 | School)
## Data: dat
##
## REML criterion at convergence: 14695.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.80522 -0.70245 0.08321 0.75577 2.68341
##
## Random effects:
## Groups Name Variance Std.Dev.
## School (Intercept) 12.62 3.552
## Residual 32.28 5.682
## Number of obs: 2287, groups: School, 131
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 18.9473 0.3365 56.31
m = dat %>% group_by(School) %>% summarize(av_lan = mean(Language),
av_ari = mean(Arithmetic))
#Plot
ggplot(dat, aes(x = Arithmetic, y = Language))+
geom_point(color = "skyblue", alpha = .5)+
stat_smooth(method = "lm", color = "skyblue", size = .8)+
geom_point(data = m, aes(x = av_ari, y = av_lan),color = "blue", alpha = .5)+
stat_smooth(data = m, aes(x = av_ari, y = av_lan),method = "lm", color = "blue", size = .8)+
labs(x = "Arithmetic Score", y = "Language Score")+
theme_bw()
#Q3
data("jsp", package = "faraway")
dat = jsp
str(dat)
## 'data.frame': 3236 obs. of 9 variables:
## $ school : Factor w/ 49 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ class : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
## $ gender : Factor w/ 2 levels "boy","girl": 2 2 2 1 1 1 1 1 1 1 ...
## $ social : Factor w/ 9 levels "1","2","3","4",..: 9 9 9 2 2 2 2 2 9 9 ...
## $ raven : num 23 23 23 15 15 22 22 22 14 14 ...
## $ id : Factor w/ 1192 levels "1","2","3","4",..: 1 1 1 2 2 3 3 3 4 4 ...
## $ english: num 72 80 39 7 17 88 89 83 12 25 ...
## $ math : num 23 24 23 14 11 36 32 39 24 26 ...
## $ year : num 0 1 2 0 1 0 1 2 0 1 ...
#School level
dat_s = dat %>% group_by(school) %>% summarize(av_eng = mean(english),
sd_eng = sd(english),
av_raven = mean(raven),
sd_raven = sd(raven)) %>% as.data.frame
#Take a look
ggplot(dat, aes(x = raven, y = english))+
geom_point(alpha = .5, color = "skyblue")+
stat_smooth(method = "lm", color = "skyblue")+
geom_point(data = dat_s, aes(x = av_raven, y = av_eng),alpha = .5, color = "blue")+
stat_smooth(data = dat_s, aes(x = av_raven, y = av_eng),method = "lm", color = "blue")+
labs(x = "Ravens test score", y = "English test score")+
theme_bw()
#Each class and each school has its intercept
summary(m1 <- lmer(english~raven+(1|class:school)+(1|school), data = dat))
## Linear mixed model fit by REML ['lmerMod']
## Formula: english ~ raven + (1 | class:school) + (1 | school)
## Data: dat
##
## REML criterion at convergence: 29107.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.90145 -0.76284 0.05924 0.78127 2.82146
##
## Random effects:
## Groups Name Variance Std.Dev.
## class:school (Intercept) 40.58 6.370
## school (Intercept) 15.59 3.949
## Residual 451.47 21.248
## Number of obs: 3236, groups: class:school, 94; school, 49
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 9.24949 2.01350 4.594
## raven 1.69687 0.06988 24.282
##
## Correlation of Fixed Effects:
## (Intr)
## raven -0.873
plot(m1, xlab = "Fitted values", ylab = "Standardized redisuals")
dat = read.table("data/course_eval.txt", header = TRUE)
str(dat)
## 'data.frame': 463 obs. of 7 variables:
## $ Score : num 4.3 4.5 3.7 4.3 4.4 4.2 4 3.4 4.5 3.9 ...
## $ Beauty : num 0.202 -0.826 -0.66 -0.766 1.421 ...
## $ Gender : int 1 0 0 1 1 0 1 1 1 0 ...
## $ Age : int 36 59 51 40 31 62 33 51 33 47 ...
## $ Minority: int 1 0 0 0 0 0 0 0 0 0 ...
## $ Tenure : int 0 1 1 1 0 1 0 1 0 0 ...
## $ Course : int 3 0 4 2 0 0 4 0 0 4 ...
index = c(3,5,6,7)
dat[,index] = lapply(dat[,index], as.factor)
#Plot
ggplot(dat, aes(x = Beauty, y = Score))+
geom_point(alpha = .5)+
scale_x_continuous(name = "Beauty Score", breaks = c(-1.5,-1,-0.5,0,0.5,1,1.5,2))+
scale_y_continuous(name = "Average teaching evaluation", breaks = c(2,2.5,3,3.5,4,4.5,5))+
stat_smooth(method = "lm")+
theme_bw()
#Each class
ggplot(dat, aes(x = Beauty, y = Score))+
geom_point(alpha = .5, size = .8)+
labs(x = "Beauty Score", y = "Average teaching evaluation")+
stat_smooth(method = "lm", size = .7)+
scale_x_continuous(limits = c(-2,2))+
scale_y_continuous(limits = c(2,5))+
facet_wrap(~Course, nrow = 6, as.table = FALSE)+
theme_bw()+
theme(panel.grid = element_blank())
#Course has its unique intercept
summary(m0 <- lmer(Score~Beauty+Age+Gender+Minority+Tenure+(1|Course), data = dat))
## Linear mixed model fit by REML ['lmerMod']
## Formula: Score ~ Beauty + Age + Gender + Minority + Tenure + (1 | Course)
## Data: dat
##
## REML criterion at convergence: 749.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.6110 -0.6666 0.0876 0.7275 2.0388
##
## Random effects:
## Groups Name Variance Std.Dev.
## Course (Intercept) 0.07618 0.2760
## Residual 0.26282 0.5127
## Number of obs: 463, groups: Course, 31
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 4.352253 0.159658 27.260
## Beauty 0.131483 0.033863 3.883
## Age -0.003687 0.002972 -1.240
## Gender1 -0.232125 0.052992 -4.380
## Minority1 -0.127408 0.075157 -1.695
## Tenure1 -0.091118 0.054955 -1.658
##
## Correlation of Fixed Effects:
## (Intr) Beauty Age Gendr1 Mnrty1
## Beauty -0.229
## Age -0.869 0.265
## Gender1 -0.309 -0.034 0.175
## Minority1 -0.174 0.035 0.090 -0.072
## Tenure1 0.059 0.047 -0.316 0.136 0.044
dat = read.table("data/iq_language.txt", header = TRUE)
str(dat)
## 'data.frame': 2287 obs. of 9 variables:
## $ School : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Pupil : int 17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
## $ IQ : num 15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
## $ Language : int 46 45 33 46 20 30 30 57 36 36 ...
## $ Group_size : int 29 29 29 29 29 29 29 29 29 29 ...
## $ IQ_c : num 3.166 2.666 -2.334 -0.834 -3.834 ...
## $ School_mean: num -1.51 -1.51 -1.51 -1.51 -1.51 ...
## $ Group_mean : num 5.9 5.9 5.9 5.9 5.9 5.9 5.9 5.9 5.9 5.9 ...
## $ SES_c : num -4.81 -17.81 -12.81 -4.81 -17.81 ...
index = c(1,2)
dat[,index] = lapply(dat[,index], as.factor)
#School level
m = dat %>% group_by(School) %>% summarize(av_IQ = mean(IQ),
av_lan = mean(Language))
#Plot
ggplot(dat, aes(x = IQ, y = Language))+
geom_point(alpha = .5, color = "skyblue")+
stat_smooth(method = "lm", color = "skyblue")+
geom_point(data = m, aes(x = av_IQ, y = av_lan), alpha = .5, color = "blue")+
stat_smooth(data = m, aes(x = av_IQ, y = av_lan), method = "lm", color = "blue")+
theme_bw()
#Different intercept on school level
summary(m0 <- lmer(Language~IQ+(1|School), data = dat)) #Fixed effect differs from teacher's example?
## Linear mixed model fit by REML ['lmerMod']
## Formula: Language ~ IQ + (1 | School)
## Data: dat
##
## REML criterion at convergence: 15255.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.0939 -0.6375 0.0579 0.7061 3.1448
##
## Random effects:
## Groups Name Variance Std.Dev.
## School (Intercept) 9.602 3.099
## Residual 42.245 6.500
## Number of obs: 2287, groups: School, 131
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 11.16993 0.87952 12.7
## IQ 2.48759 0.07008 35.5
##
## Correlation of Fixed Effects:
## (Intr)
## IQ -0.937
dat = read.table("data/leadership_math.txt", header = TRUE)
str(dat)
## 'data.frame': 100 obs. of 6 variables:
## $ Student: int 1 2 3 4 5 6 7 8 9 10 ...
## $ School : int 100 100 100 100 100 100 100 100 100 100 ...
## $ Lead : num 0.74 0.74 0.74 0.74 0.74 0.74 0.74 0.74 0.74 0.74 ...
## $ Math : int 623 635 700 663 656 695 802 648 594 679 ...
## $ Gender : Factor w/ 2 levels "F","M": 2 1 1 2 1 1 2 2 2 2 ...
## $ SES : Factor w/ 2 levels "L","O": 2 2 2 2 2 2 2 2 1 2 ...
index = c(1,2)
dat[,index] =lapply(dat[,index],as.factor)
sum(sapply(dat, is.na))
## [1] 0
#School level
m = dat %>% group_by(School) %>% summarize(av_math = mean(Math),
av_Lead = mean(Lead))
#Plot
ggplot(dat, aes(x = Math, y = Lead))+
geom_point(alpha = .5, color = "skyblue")+
stat_smooth(method = "lm", color = "skyblue")+
geom_point(data = m, aes(x = av_math, y = av_Lead), alpha = .5, color = "blue")+
stat_smooth(data = m, aes(x = av_math, y = av_Lead), method = "lm", color = "blue")+
theme_bw()
summary(m2 <- lm(Lead~School, data = dat))
## Warning in summary.lm(m2 <- lm(Lead ~ School, data = dat)): essentially
## perfect fit: summary may be unreliable
##
## Call:
## lm(formula = Lead ~ School, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.826e-16 0.000e+00 0.000e+00 0.000e+00 3.933e-15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.400e-01 1.452e-16 5.097e+15 <2e-16 ***
## School103 -4.000e-02 2.053e-16 -1.948e+14 <2e-16 ***
## School104 -1.100e-01 2.053e-16 -5.358e+14 <2e-16 ***
## School105 -1.000e-01 2.053e-16 -4.871e+14 <2e-16 ***
## School106 4.000e-02 2.053e-16 1.948e+14 <2e-16 ***
## School107 -6.000e-02 2.053e-16 -2.922e+14 <2e-16 ***
## School108 -1.000e-02 2.053e-16 -4.871e+13 <2e-16 ***
## School109 4.000e-02 2.053e-16 1.948e+14 <2e-16 ***
## School110 -8.000e-02 2.053e-16 -3.896e+14 <2e-16 ***
## School111 -7.000e-02 2.053e-16 -3.409e+14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.591e-16 on 90 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 1.407e+29 on 9 and 90 DF, p-value: < 2.2e-16
dat = read.csv("data/attitudes2Sci.csv", header = TRUE)
str(dat)
## 'data.frame': 1385 obs. of 7 variables:
## $ State : Factor w/ 2 levels "ACT","NSW": 1 1 1 1 1 1 1 1 1 1 ...
## $ PrivPub: Factor w/ 2 levels "private","public": 2 2 2 2 2 2 2 2 2 2 ...
## $ school : int 1 1 1 1 1 1 1 1 1 1 ...
## $ class : int 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels "f","m": 1 1 1 1 1 1 1 1 2 2 ...
## $ like : int 8 6 5 2 5 6 3 7 6 3 ...
## $ Class : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
index = c(3,4,7)
dat[,index] = lapply(dat[,index], as.factor)
#Take a look
ggplot(dat, aes(x = like))+
geom_histogram(binwidth = 0.5, fill = "skyblue")+
scale_x_continuous(breaks = c(0,2,4,6,8,10,12), limits = c(0,12))+
labs(x = "Attitude score", y = "Count")+
theme_bw()
#Variance complonent analysis
summary(m0 <- lmer(like~(1|State)+(1|PrivPub)+(1|school)+(1|class:school)+(1|sex)+
(1|sex:school)+(1|sex:class), data = dat))
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## like ~ (1 | State) + (1 | PrivPub) + (1 | school) + (1 | class:school) +
## (1 | sex) + (1 | sex:school) + (1 | sex:class)
## Data: dat
##
## REML criterion at convergence: 5543.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.6655 -0.6672 0.1398 0.6947 2.6023
##
## Random effects:
## Groups Name Variance Std.Dev.
## sex:school (Intercept) 0.02069 0.1438
## class:school (Intercept) 0.31061 0.5573
## school (Intercept) 0.00000 0.0000
## sex:class (Intercept) 0.09209 0.3035
## sex (Intercept) 0.09543 0.3089
## PrivPub (Intercept) 0.07635 0.2763
## State (Intercept) 0.00000 0.0000
## Residual 3.01672 1.7369
## Number of obs: 1383, groups:
## sex:school, 80; class:school, 66; school, 41; sex:class, 8; sex, 2; PrivPub, 2; State, 2
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 5.0433 0.3327 15.16
#Because State did not contribute much, I excluded it
summary(m1 <- lmer(like~PrivPub+class+sex+(1|school), data = dat))
## Linear mixed model fit by REML ['lmerMod']
## Formula: like ~ PrivPub + class + sex + (1 | school)
## Data: dat
##
## REML criterion at convergence: 5571.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.6022 -0.6552 0.1307 0.6958 2.1107
##
## Random effects:
## Groups Name Variance Std.Dev.
## school (Intercept) 0.2313 0.4809
## Residual 3.1660 1.7793
## Number of obs: 1383, groups: school, 41
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 4.5918 0.1835 25.018
## PrivPubpublic 0.4290 0.2065 2.077
## class2 0.4999 0.1295 3.860
## class3 0.5778 0.2274 2.541
## class4 0.5177 0.2546 2.033
## sexm 0.1914 0.1004 1.907
##
## Correlation of Fixed Effects:
## (Intr) PrvPbp class2 class3 class4
## PrivPubpblc -0.803
## class2 -0.163 -0.012
## class3 -0.101 0.022 0.261
## class4 -0.110 0.046 0.239 0.266
## sexm -0.234 -0.034 0.005 -0.053 -0.031
#Because the data are really strange
#I want to know if it is normal-distributed
shapiro.test(dat$like)
##
## Shapiro-Wilk normality test
##
## data: dat$like
## W = 0.94274, p-value < 2.2e-16
qqnorm(dat$like)
qqline(dat$like)
The data was not of normal distribution.
Thus, I don’t think it is suitable to use linear regression model to analyze the data.