p = c("dplyr", "magrittr", "lme4", "ggplot2","segmented", "gridExtra")
lapply(p, library, character.only = TRUE)

Q1

dat = read.csv("data/classroom.csv", header = TRUE)
str(dat)
## 'data.frame':    1190 obs. of  12 variables:
##  $ sex     : int  1 0 1 0 0 1 0 0 1 0 ...
##  $ minority: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ mathkind: int  448 460 511 449 425 450 452 443 422 480 ...
##  $ mathgain: int  32 109 56 83 53 65 51 66 88 -7 ...
##  $ ses     : num  0.46 -0.27 -0.03 -0.38 -0.03 0.76 -0.03 0.2 0.64 0.13 ...
##  $ yearstea: num  1 1 1 2 2 2 2 2 2 2 ...
##  $ mathknow: num  NA NA NA -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 ...
##  $ housepov: num  0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 ...
##  $ mathprep: num  2 2 2 3.25 3.25 3.25 3.25 3.25 3.25 3.25 ...
##  $ classid : int  160 160 160 217 217 217 217 217 217 217 ...
##  $ schoolid: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ childid : int  1 2 3 4 5 6 7 8 9 10 ...
index = c(1,2,10,11,12)
dat[,index] <- lapply(dat[,index], as.factor)

#Model 1: Different intercept among different schools and classes
summary(m0 <- lmer(mathgain~(1|schoolid)+(1|classid)+
                     mathkind+sex+minority+ses, data = dat))
## Linear mixed model fit by REML ['lmerMod']
## Formula: mathgain ~ (1 | schoolid) + (1 | classid) + mathkind + sex +  
##     minority + ses
##    Data: dat
## 
## REML criterion at convergence: 11385.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.8257 -0.6110 -0.0337  0.5538  4.2678 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  classid  (Intercept)  83.28    9.126  
##  schoolid (Intercept)  75.20    8.672  
##  Residual             734.57   27.103  
## Number of obs: 1190, groups:  classid, 312; schoolid, 107
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept) 282.79034   10.85323  26.056
## mathkind     -0.46980    0.02227 -21.100
## sex1         -1.25119    1.65773  -0.755
## minority1    -8.26213    2.34011  -3.531
## ses           5.34638    1.24109   4.308
## 
## Correlation of Fixed Effects:
##           (Intr) mthknd sex1   mnrty1
## mathkind  -0.978                     
## sex1      -0.044 -0.031              
## minority1 -0.307  0.163 -0.018       
## ses        0.140 -0.168  0.019  0.163
#Model 2: Using classese and school to predict
summary(m1 <- lmer(mathgain~(1|schoolid)+(1|classid), data = dat))
## Linear mixed model fit by REML ['lmerMod']
## Formula: mathgain ~ (1 | schoolid) + (1 | classid)
##    Data: dat
## 
## REML criterion at convergence: 11768.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.6441 -0.5984 -0.0336  0.5334  5.6335 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  classid  (Intercept)   99.23   9.961  
##  schoolid (Intercept)   77.49   8.803  
##  Residual             1028.23  32.066  
## Number of obs: 1190, groups:  classid, 312; schoolid, 107
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   57.427      1.443   39.79



Q2

dat = read.csv("data/language_arith.csv", header = TRUE)
str(dat)
## 'data.frame':    2287 obs. of  4 variables:
##  $ School    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Pupil     : int  17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
##  $ Language  : int  46 45 33 46 20 30 30 57 36 36 ...
##  $ Arithmetic: int  24 19 24 26 9 13 13 30 23 22 ...
dat$School %<>% as.factor

cor(dat$Language, dat$Arithmetic)
## [1] 0.694228
#Different intercepts on school level
summary(m0 <- lmer(Language~(1|School), data = dat))
## Linear mixed model fit by REML ['lmerMod']
## Formula: Language ~ (1 | School)
##    Data: dat
## 
## REML criterion at convergence: 16253.1
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -3.11618 -0.65703  0.07597  0.74128  2.50639 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  School   (Intercept) 19.63    4.431   
##  Residual             64.56    8.035   
## Number of obs: 2287, groups:  School, 131
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  40.3624     0.4282   94.26
summary(m1 <- lmer(Arithmetic~(1|School), data = dat))
## Linear mixed model fit by REML ['lmerMod']
## Formula: Arithmetic ~ (1 | School)
##    Data: dat
## 
## REML criterion at convergence: 14695.8
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.80522 -0.70245  0.08321  0.75577  2.68341 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  School   (Intercept) 12.62    3.552   
##  Residual             32.28    5.682   
## Number of obs: 2287, groups:  School, 131
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  18.9473     0.3365   56.31
m = dat %>% group_by(School) %>% summarize(av_lan = mean(Language),
                                           av_ari = mean(Arithmetic))

#Plot
ggplot(dat, aes(x = Arithmetic, y = Language))+
  geom_point(color = "skyblue", alpha = .5)+
  stat_smooth(method = "lm", color = "skyblue", size = .8)+
  geom_point(data = m, aes(x = av_ari, y = av_lan),color = "blue", alpha = .5)+
  stat_smooth(data = m, aes(x = av_ari, y = av_lan),method = "lm", color = "blue", size = .8)+
  labs(x = "Arithmetic Score", y = "Language Score")+
  theme_bw()



Q3

#Q3
data("jsp", package = "faraway")
dat = jsp
str(dat)
## 'data.frame':    3236 obs. of  9 variables:
##  $ school : Factor w/ 49 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ class  : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  $ gender : Factor w/ 2 levels "boy","girl": 2 2 2 1 1 1 1 1 1 1 ...
##  $ social : Factor w/ 9 levels "1","2","3","4",..: 9 9 9 2 2 2 2 2 9 9 ...
##  $ raven  : num  23 23 23 15 15 22 22 22 14 14 ...
##  $ id     : Factor w/ 1192 levels "1","2","3","4",..: 1 1 1 2 2 3 3 3 4 4 ...
##  $ english: num  72 80 39 7 17 88 89 83 12 25 ...
##  $ math   : num  23 24 23 14 11 36 32 39 24 26 ...
##  $ year   : num  0 1 2 0 1 0 1 2 0 1 ...
#School level
dat_s = dat %>% group_by(school) %>% summarize(av_eng = mean(english),
                                               sd_eng = sd(english),
                                               av_raven = mean(raven),
                                               sd_raven = sd(raven)) %>% as.data.frame

#Take a look
ggplot(dat, aes(x = raven, y = english))+
  geom_point(alpha = .5, color = "skyblue")+
  stat_smooth(method = "lm", color = "skyblue")+
  geom_point(data = dat_s, aes(x = av_raven, y = av_eng),alpha = .5, color = "blue")+
  stat_smooth(data = dat_s, aes(x = av_raven, y = av_eng),method = "lm", color = "blue")+
  labs(x = "Ravens test score", y = "English test score")+
  theme_bw()

#Each class and each school has its intercept
summary(m1 <- lmer(english~raven+(1|class:school)+(1|school), data = dat))
## Linear mixed model fit by REML ['lmerMod']
## Formula: english ~ raven + (1 | class:school) + (1 | school)
##    Data: dat
## 
## REML criterion at convergence: 29107.3
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.90145 -0.76284  0.05924  0.78127  2.82146 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  class:school (Intercept)  40.58    6.370  
##  school       (Intercept)  15.59    3.949  
##  Residual                 451.47   21.248  
## Number of obs: 3236, groups:  class:school, 94; school, 49
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  9.24949    2.01350   4.594
## raven        1.69687    0.06988  24.282
## 
## Correlation of Fixed Effects:
##       (Intr)
## raven -0.873
plot(m1, xlab = "Fitted values", ylab = "Standardized redisuals")



Q4

dat = read.table("data/course_eval.txt", header = TRUE)
str(dat)
## 'data.frame':    463 obs. of  7 variables:
##  $ Score   : num  4.3 4.5 3.7 4.3 4.4 4.2 4 3.4 4.5 3.9 ...
##  $ Beauty  : num  0.202 -0.826 -0.66 -0.766 1.421 ...
##  $ Gender  : int  1 0 0 1 1 0 1 1 1 0 ...
##  $ Age     : int  36 59 51 40 31 62 33 51 33 47 ...
##  $ Minority: int  1 0 0 0 0 0 0 0 0 0 ...
##  $ Tenure  : int  0 1 1 1 0 1 0 1 0 0 ...
##  $ Course  : int  3 0 4 2 0 0 4 0 0 4 ...
index = c(3,5,6,7)
dat[,index] = lapply(dat[,index], as.factor)

#Plot
ggplot(dat, aes(x = Beauty, y = Score))+
  geom_point(alpha = .5)+
  scale_x_continuous(name = "Beauty Score", breaks = c(-1.5,-1,-0.5,0,0.5,1,1.5,2))+
  scale_y_continuous(name = "Average teaching evaluation", breaks = c(2,2.5,3,3.5,4,4.5,5))+
  stat_smooth(method = "lm")+
  theme_bw()

#Each class
ggplot(dat, aes(x = Beauty, y = Score))+
  geom_point(alpha = .5, size = .8)+
  labs(x = "Beauty Score", y = "Average teaching evaluation")+
  stat_smooth(method = "lm", size = .7)+
  scale_x_continuous(limits = c(-2,2))+
  scale_y_continuous(limits = c(2,5))+
  facet_wrap(~Course, nrow = 6, as.table = FALSE)+
  theme_bw()+
  theme(panel.grid = element_blank())

#Course has its unique intercept
summary(m0 <- lmer(Score~Beauty+Age+Gender+Minority+Tenure+(1|Course), data = dat))
## Linear mixed model fit by REML ['lmerMod']
## Formula: Score ~ Beauty + Age + Gender + Minority + Tenure + (1 | Course)
##    Data: dat
## 
## REML criterion at convergence: 749.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.6110 -0.6666  0.0876  0.7275  2.0388 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Course   (Intercept) 0.07618  0.2760  
##  Residual             0.26282  0.5127  
## Number of obs: 463, groups:  Course, 31
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)  4.352253   0.159658  27.260
## Beauty       0.131483   0.033863   3.883
## Age         -0.003687   0.002972  -1.240
## Gender1     -0.232125   0.052992  -4.380
## Minority1   -0.127408   0.075157  -1.695
## Tenure1     -0.091118   0.054955  -1.658
## 
## Correlation of Fixed Effects:
##           (Intr) Beauty Age    Gendr1 Mnrty1
## Beauty    -0.229                            
## Age       -0.869  0.265                     
## Gender1   -0.309 -0.034  0.175              
## Minority1 -0.174  0.035  0.090 -0.072       
## Tenure1    0.059  0.047 -0.316  0.136  0.044



Q5

dat = read.table("data/iq_language.txt", header = TRUE)
str(dat)
## 'data.frame':    2287 obs. of  9 variables:
##  $ School     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Pupil      : int  17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
##  $ IQ         : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
##  $ Language   : int  46 45 33 46 20 30 30 57 36 36 ...
##  $ Group_size : int  29 29 29 29 29 29 29 29 29 29 ...
##  $ IQ_c       : num  3.166 2.666 -2.334 -0.834 -3.834 ...
##  $ School_mean: num  -1.51 -1.51 -1.51 -1.51 -1.51 ...
##  $ Group_mean : num  5.9 5.9 5.9 5.9 5.9 5.9 5.9 5.9 5.9 5.9 ...
##  $ SES_c      : num  -4.81 -17.81 -12.81 -4.81 -17.81 ...
index = c(1,2)
dat[,index] = lapply(dat[,index], as.factor)

#School level
m = dat %>% group_by(School) %>% summarize(av_IQ = mean(IQ),
                                           av_lan = mean(Language))

#Plot
ggplot(dat, aes(x = IQ, y = Language))+
  geom_point(alpha = .5, color = "skyblue")+
  stat_smooth(method = "lm", color = "skyblue")+
  geom_point(data = m, aes(x = av_IQ, y = av_lan), alpha = .5, color = "blue")+
  stat_smooth(data = m, aes(x = av_IQ, y = av_lan), method = "lm", color = "blue")+
  theme_bw()

#Different intercept on school level
summary(m0 <- lmer(Language~IQ+(1|School), data = dat)) #Fixed effect differs from teacher's example?
## Linear mixed model fit by REML ['lmerMod']
## Formula: Language ~ IQ + (1 | School)
##    Data: dat
## 
## REML criterion at convergence: 15255.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.0939 -0.6375  0.0579  0.7061  3.1448 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  School   (Intercept)  9.602   3.099   
##  Residual             42.245   6.500   
## Number of obs: 2287, groups:  School, 131
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 11.16993    0.87952    12.7
## IQ           2.48759    0.07008    35.5
## 
## Correlation of Fixed Effects:
##    (Intr)
## IQ -0.937



Q6

dat = read.table("data/leadership_math.txt", header = TRUE)
str(dat)
## 'data.frame':    100 obs. of  6 variables:
##  $ Student: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ School : int  100 100 100 100 100 100 100 100 100 100 ...
##  $ Lead   : num  0.74 0.74 0.74 0.74 0.74 0.74 0.74 0.74 0.74 0.74 ...
##  $ Math   : int  623 635 700 663 656 695 802 648 594 679 ...
##  $ Gender : Factor w/ 2 levels "F","M": 2 1 1 2 1 1 2 2 2 2 ...
##  $ SES    : Factor w/ 2 levels "L","O": 2 2 2 2 2 2 2 2 1 2 ...
index = c(1,2)
dat[,index] =lapply(dat[,index],as.factor)
sum(sapply(dat, is.na))
## [1] 0
#School level
m = dat %>% group_by(School) %>% summarize(av_math = mean(Math),
                                          av_Lead = mean(Lead))

#Plot
ggplot(dat, aes(x = Math, y = Lead))+
  geom_point(alpha = .5, color = "skyblue")+
  stat_smooth(method = "lm", color = "skyblue")+
  geom_point(data = m, aes(x = av_math, y = av_Lead), alpha = .5, color = "blue")+
  stat_smooth(data = m, aes(x = av_math, y = av_Lead), method = "lm", color = "blue")+
  theme_bw()

summary(m2 <- lm(Lead~School, data = dat))
## Warning in summary.lm(m2 <- lm(Lead ~ School, data = dat)): essentially
## perfect fit: summary may be unreliable
## 
## Call:
## lm(formula = Lead ~ School, data = dat)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -9.826e-16  0.000e+00  0.000e+00  0.000e+00  3.933e-15 
## 
## Coefficients:
##               Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)  7.400e-01  1.452e-16  5.097e+15   <2e-16 ***
## School103   -4.000e-02  2.053e-16 -1.948e+14   <2e-16 ***
## School104   -1.100e-01  2.053e-16 -5.358e+14   <2e-16 ***
## School105   -1.000e-01  2.053e-16 -4.871e+14   <2e-16 ***
## School106    4.000e-02  2.053e-16  1.948e+14   <2e-16 ***
## School107   -6.000e-02  2.053e-16 -2.922e+14   <2e-16 ***
## School108   -1.000e-02  2.053e-16 -4.871e+13   <2e-16 ***
## School109    4.000e-02  2.053e-16  1.948e+14   <2e-16 ***
## School110   -8.000e-02  2.053e-16 -3.896e+14   <2e-16 ***
## School111   -7.000e-02  2.053e-16 -3.409e+14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.591e-16 on 90 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 1.407e+29 on 9 and 90 DF,  p-value: < 2.2e-16



Q7

dat = read.csv("data/attitudes2Sci.csv", header = TRUE)
str(dat)
## 'data.frame':    1385 obs. of  7 variables:
##  $ State  : Factor w/ 2 levels "ACT","NSW": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PrivPub: Factor w/ 2 levels "private","public": 2 2 2 2 2 2 2 2 2 2 ...
##  $ school : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ class  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ sex    : Factor w/ 2 levels "f","m": 1 1 1 1 1 1 1 1 2 2 ...
##  $ like   : int  8 6 5 2 5 6 3 7 6 3 ...
##  $ Class  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
index = c(3,4,7)
dat[,index] = lapply(dat[,index], as.factor)

#Take a look
ggplot(dat, aes(x = like))+
  geom_histogram(binwidth = 0.5, fill = "skyblue")+
  scale_x_continuous(breaks = c(0,2,4,6,8,10,12), limits = c(0,12))+
  labs(x = "Attitude score", y = "Count")+
  theme_bw()

#Variance complonent analysis
summary(m0 <- lmer(like~(1|State)+(1|PrivPub)+(1|school)+(1|class:school)+(1|sex)+
                     (1|sex:school)+(1|sex:class), data = dat))
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## like ~ (1 | State) + (1 | PrivPub) + (1 | school) + (1 | class:school) +  
##     (1 | sex) + (1 | sex:school) + (1 | sex:class)
##    Data: dat
## 
## REML criterion at convergence: 5543.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.6655 -0.6672  0.1398  0.6947  2.6023 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  sex:school   (Intercept) 0.02069  0.1438  
##  class:school (Intercept) 0.31061  0.5573  
##  school       (Intercept) 0.00000  0.0000  
##  sex:class    (Intercept) 0.09209  0.3035  
##  sex          (Intercept) 0.09543  0.3089  
##  PrivPub      (Intercept) 0.07635  0.2763  
##  State        (Intercept) 0.00000  0.0000  
##  Residual                 3.01672  1.7369  
## Number of obs: 1383, groups:  
## sex:school, 80; class:school, 66; school, 41; sex:class, 8; sex, 2; PrivPub, 2; State, 2
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   5.0433     0.3327   15.16
#Because State did not contribute much, I excluded it
summary(m1 <- lmer(like~PrivPub+class+sex+(1|school), data = dat))
## Linear mixed model fit by REML ['lmerMod']
## Formula: like ~ PrivPub + class + sex + (1 | school)
##    Data: dat
## 
## REML criterion at convergence: 5571.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.6022 -0.6552  0.1307  0.6958  2.1107 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  school   (Intercept) 0.2313   0.4809  
##  Residual             3.1660   1.7793  
## Number of obs: 1383, groups:  school, 41
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)     4.5918     0.1835  25.018
## PrivPubpublic   0.4290     0.2065   2.077
## class2          0.4999     0.1295   3.860
## class3          0.5778     0.2274   2.541
## class4          0.5177     0.2546   2.033
## sexm            0.1914     0.1004   1.907
## 
## Correlation of Fixed Effects:
##             (Intr) PrvPbp class2 class3 class4
## PrivPubpblc -0.803                            
## class2      -0.163 -0.012                     
## class3      -0.101  0.022  0.261              
## class4      -0.110  0.046  0.239  0.266       
## sexm        -0.234 -0.034  0.005 -0.053 -0.031
#Because the data are really strange
#I want to know if it is normal-distributed
shapiro.test(dat$like)
## 
##  Shapiro-Wilk normality test
## 
## data:  dat$like
## W = 0.94274, p-value < 2.2e-16
qqnorm(dat$like)
qqline(dat$like)


The data was not of normal distribution.
Thus, I don’t think it is suitable to use linear regression model to analyze the data.