1.knit

#資料整理

if (!require("pacman")) install.packages("pacman", repos="cloud-r.project.org")

## 載入需要的套件：pacman

## Warning: 套件 'pacman' 是用 R 版本 4.1.3 來建造的

pacman::p_load(mlmRev, gridExtra, tidyverse, coda, reshape,lme4)

#載入mlmRev套件
library(mlmRev)

#載進資料 
data(Chem97)

#檢視套件提供的資訊
?Chem97

## starting httpd help server ...

##  done

#看資料結構
str(Chem97)

## 'data.frame':    31022 obs. of  8 variables:
##  $ lea      : Factor w/ 131 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ school   : Factor w/ 2410 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ student  : Factor w/ 31022 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ score    : num  4 10 10 10 8 10 6 8 4 10 ...
##  $ gender   : Factor w/ 2 levels "M","F": 2 2 2 2 2 2 2 2 2 2 ...
##  $ age      : num  3 -3 -4 -2 -1 4 1 4 3 0 ...
##  $ gcsescore: num  6.62 7.62 7.25 7.5 6.44 ...
##  $ gcsecnt  : num  0.339 1.339 0.964 1.214 0.158 ...

#程式報表5.1
head(Chem97)

##   lea school student score gender age gcsescore   gcsecnt
## 1   1      1       1     4      F   3     6.625 0.3393157
## 2   1      1       2    10      F  -3     7.625 1.3393157
## 3   1      1       3    10      F  -4     7.250 0.9643157
## 4   1      1       4    10      F  -2     7.500 1.2143157
## 5   1      1       5     8      F  -1     6.444 0.1583157
## 6   1      1       6    10      F   4     7.750 1.4643157

#去掉第六與第八個變項，只將需要的資料留下，並重新命名
#原始文章分析是以 GCSE 預測 Chem，我們預計倒過來做
dta <- Chem97[, -c(6, 8)]
names(dta) <- c("區域", "學校", "學生", "化學", "性別", "總成績")


#程式報表5.1
head(dta)

##   區域 學校 學生 化學 性別 總成績
## 1    1    1    1    4    F  6.625
## 2    1    1    2   10    F  7.625
## 3    1    1    3   10    F  7.250
## 4    1    1    4   10    F  7.500
## 5    1    1    5    8    F  6.444
## 6    1    1    6   10    F  7.750

#描述統計

#程式報表5.2
sapply(dta[, c('化學', '總成績')], summary)

##              化學   總成績
## Min.     0.000000 0.000000
## 1st Qu.  4.000000 5.750000
## Median   6.000000 6.375000
## Mean     5.812585 6.285684
## 3rd Qu.  8.000000 6.900000
## Max.    10.000000 8.000000

sapply(dta[, c('化學', '總成績')], sd)

##      化學    總成績 
## 3.3191673 0.8734512

#計算各校化學與總成績平均分數，記錄成資料檔，留待後用
#aggregate函式應該是資料處理中常用到的函式，簡單說有點類似sql語言中的group by，可以按照要求把資料打組聚合，然後對聚合以後的資料進行加和、求平均等各種操作。
#cbind： 根據列進行合並，即疊加所有列，m列的矩陣與n列的矩陣cbind()最後變成m+n列，合並前提：cbind(a, c)中矩陣a、c的行數必需相符
#資料型態，所對應的轉換函式：number>as.numeric()
dta_m <- aggregate(cbind(化學, 總成績, as.numeric(性別) - 1) ~ 學校, 
                   data = dta, mean)
names(dta_m) <- c('學校', '化學平均', '總成績平均', '女性比率')

#程式報表5.3
head(dta_m)

##   學校 化學平均 總成績平均  女性比率
## 1    1 8.307692   7.224462 1.0000000
## 2    2 8.714286   7.023893 0.0000000
## 3    3 4.666667   5.310000 0.6666667
## 4    4 8.285714   6.232143 0.1428571
## 5    5 5.500000   6.174313 0.4375000
## 6    6 4.714286   6.705000 0.7857143

#繪圖

#載進 lattice，準備畫圖，lattice包是一個非常強大地高級繪圖程序包
library(lattice)

#以總成績與化學原始分數，繪製直方圖
p1 <- histogram(~ 總成績, data = dta, type = 'density', 
                xlab = '總成績', ylab = '機率密度')
p2 <- histogram(~ 化學, data = dta, type = 'density', 
                xlab = '化學分數', ylab = '機率密度')

#以學校平均分數，繪製直方圖
p3 <- histogram(~ 總成績平均, data = dta_m, type = 'density', 
          xlab = '各校總成績平均', ylab = '機率密度')
p4 <- histogram(~ 化學平均, data = dta_m, type = 'density', 
          xlab = '各校化學平均分數', ylab = '機率密度')

#計算學校標準差，存檔留後用
dta_sd <- aggregate(cbind(化學, 總成績) ~ 學校, data = dta, sd)
names(dta_sd)[2:3] <- c('化學標準差', '總成績標準差')

#以學校標準差，繪製直方圖。
p5 <- histogram(~ 總成績標準差, data = dta_sd, type = 'density',
          xlab = '各校總成績標準差', ylab = '機率密度')
          
p6 <- histogram(~ 化學標準差, data = dta_sd, type = 'density',
          xlab = '各校化學分數標準差', ylab = '機率密度')

#載入gridExtra，把六張放一起
#圖 5.1
library(gridExtra)
grid.arrange(p6, p5, p4, p3, p2, p1, as.table = T)

#看看化學與總成績相關，以學生層次分數計算是 .662
cor(dta[, c('化學', '總成績')])

##             化學    總成績
## 化學   1.0000000 0.6622476
## 總成績 0.6622476 1.0000000

#以學校層次分數計算生態相關（ecological correlation），為 .698
##程式報表5.4
cor(dta_m[, -1])

##              化學平均 總成績平均   女性比率
## 化學平均   1.00000000  0.6980651 0.06625497
## 總成績平均 0.69806507  1.0000000 0.24520916
## 女性比率   0.06625497  0.2452092 1.00000000

#畫看兩者間關聯
#圖5.2
#參數asp確定y軸與x軸的比值,y/x=1
plot(dta[, 4], dta[, 6], type = 'n', xlab ='化學分數', ylab = '總成績', 
     asp = 1)
grid()

#學生
#參數pch即「point character」，也就是點的類型，col即「color」，設置顏色。
#cex指符號的大小
#lty指線型，可以是整數(1實線；2虛線；3點線；4點虛線；5長虛線；6雙虛線)
#abline()，可以在當前繪圖新增一條或多條直線
points(dta[, 4], dta[, 6], pch = '.', cex = 2)
abline(lm(dta[,6] ~ dta[, 4]))


#學校
points(dta_m[, 2], dta_m[, 3], cex = 0.5, col = 'grey') 
abline(lm(dta_m[,3] ~ dta_m[,2]), col = 'grey')
#對角線
abline(0, 1, lty = 3, col = 'blue')

#看看各校與各區域以化學預測總分時的截距與斜率
#請注意原始文章的預測變項與解釋變項跟此所作是相反的。
#載入 ggplot2 套件，準備畫圖。
library(ggplot2)

# 記錄下原始配色
old <- theme_set(theme_bw())

#各校以化學預測總分時的截距與斜率
#圖5.3
ggplot(data = dta, aes(x = 化學, y = 總成績, group = 學校))+
 stat_smooth(method = 'lm', formula = y ~ x, se = F, color = 'lightgray') +
 geom_point(size = 1) +
 stat_smooth(aes(group = 1), method= 'lm', se = F, color = 'black') +
 labs(x = '化學分數', y = '總成績', title = '學校')

## `geom_smooth()` using formula 'y ~ x'

#各區域以化學預測總分時的截距與斜率
#圖5.4
ggplot(data = dta, aes(x = 化學, y = 總成績, group = 區域))+
 stat_smooth(method = 'lm', formula = y ~ x, se = F, color = 'lightgray') +
 geom_point(size = 1) +
 stat_smooth(aes(group = 1), method = 'lm', se = F, color = 'black') +
 labs(x = '化學分數', y = '總成績' , title = '區域')

## `geom_smooth()` using formula 'y ~ x'

# 將變項以總分均置中，亦即，減去總平均。Like "center"。
dta$化學置中 <- scale(dta$化學, scale = F)

#選取25個學校與區域 
#levels代表在這個變數裡，存在哪些類別
set.seed(1225)
ns25 <- sample(levels(dta$學校), 25)
set.seed(1225)
nr25 <- sample(levels(dta$區域), 25)

#重看一次各校以化學預測總分時的截距與斜率
#圖5.5
#Utilizing the %in% Operator to Subset Data 
ggplot(data = dta[dta$學校 %in% ns25, ], aes(x = 化學置中, y = 總成績, color = 性別))+
  geom_point(size = 1) +
  stat_smooth(method = 'lm', formula = y ~ x, se = F) +
  facet_wrap( ~ 學校 )

#重看一次各區域以化學預測總分時的截距與斜率
#圖5.6
ggplot(data = dta[dta$區域 %in% nr25, ], aes(x = 化學置中, y = 總成績, color = 性別))+
  geom_point(size = 1) +
  stat_smooth(method = 'lm', formula = y ~ x, se = F) +
  facet_wrap( ~ 區域 )

#多層次分析

#載入 lme4 套件，用來分析多層次資料
library(lme4)

# 將變項以總分均置中，亦即，減去總平均。
dta$化學置中 <- scale(dta$化學, scale = F)

#先以完整模型嘗試
#程式報表5.5
summary(m0 <- lmer(總成績 ~ 化學置中 + 性別 + 化學置中:性別 + 
                  ( 1 | 學校 ) + ( 1 | 區域 ), data = dta ) )

## Linear mixed model fit by REML ['lmerMod']
## Formula: 總成績 ~ 化學置中 + 性別 + 化學置中:性別 + (1 | 學校) + (1 |  
##     區域)
##    Data: dta
## 
## REML criterion at convergence: 55708.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -9.2306 -0.5671  0.0638  0.6527  4.0788 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  學校     (Intercept) 0.08763  0.2960  
##  區域     (Intercept) 0.01064  0.1031  
##  Residual             0.31811  0.5640  
## Number of obs: 31022, groups:  學校, 2410; 區域, 131
## 
## Fixed effects:
##                 Estimate Std. Error t value
## (Intercept)     6.080254   0.013263 458.434
## 化學置中        0.168078   0.001408 119.333
## 性別F           0.334091   0.007518  44.440
## 化學置中:性別F -0.012266   0.002091  -5.866
## 
## Correlation of Fixed Effects:

## Warning in abbreviate(rn, minlength = 6): abbreviate 不適用於非 ASCII 字元

##                (Intr) 化學置中 性別F 
## 化學置中        0.051                
## 性別F          -0.265 -0.065         
## 化學置中:性別F -0.023 -0.630    0.067

# 試著去除區域的隨機效果，並看看去除隨機效果是否顯著。
m1 <- update(m0, . ~ . - ( 1 | 區域 ) )

#程式報表5.6
anova(m0, m1)

## refitting model(s) with ML (instead of REML)

## Data: dta
## Models:
## m1: 總成績 ~ 化學置中 + 性別 + (1 | 學校) + 化學置中:性別
## m0: 總成績 ~ 化學置中 + 性別 + 化學置中:性別 + (1 | 學校) + (1 | 區域)
##    npar   AIC   BIC logLik deviance  Chisq Df Pr(>Chisq)    
## m1    6 55743 55793 -27866    55731                         
## m0    7 55685 55744 -27836    55671 59.744  1   1.08e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#去除交互作用項
#程式報表5.7
#drop1逐步回歸的優化
#Chisq>卡方檢定
drop1(m0, test = 'Chisq')

## Single term deletions
## 
## Model:
## 總成績 ~ 化學置中 + 性別 + 化學置中:性別 + (1 | 學校) + (1 | 
##     區域)
##               npar   AIC    LRT   Pr(Chi)    
## <none>             55685                     
## 化學置中:性別    1 55718 34.388 4.515e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(m0)

## Linear mixed model fit by REML ['lmerMod']
## Formula: 總成績 ~ 化學置中 + 性別 + 化學置中:性別 + (1 | 學校) + (1 |  
##     區域)
##    Data: dta
## 
## REML criterion at convergence: 55708.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -9.2306 -0.5671  0.0638  0.6527  4.0788 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  學校     (Intercept) 0.08763  0.2960  
##  區域     (Intercept) 0.01064  0.1031  
##  Residual             0.31811  0.5640  
## Number of obs: 31022, groups:  學校, 2410; 區域, 131
## 
## Fixed effects:
##                 Estimate Std. Error t value
## (Intercept)     6.080254   0.013263 458.434
## 化學置中        0.168078   0.001408 119.333
## 性別F           0.334091   0.007518  44.440
## 化學置中:性別F -0.012266   0.002091  -5.866
## 
## Correlation of Fixed Effects:

## Warning in abbreviate(rn, minlength = 6): abbreviate 不適用於非 ASCII 字元

##                (Intr) 化學置中 性別F 
## 化學置中        0.051                
## 性別F          -0.265 -0.065         
## 化學置中:性別F -0.023 -0.630    0.067

#固定效果

#載入 coefplot2 套件，繪製固定效果
if (!require("coefplot2")) install.packages("coefplot2",
    repos="http://www.math.mcmaster.ca/bolker/R")

## 載入需要的套件：coefplot2

library(coefplot2)
coefplot2(m0)

#抽取變異成分，計算學校與區域可以解釋部分（ICCs）
print(vc <- lme4::VarCorr(m0), comp = 'Variance' )

##  Groups   Name        Variance
##  學校     (Intercept) 0.087635
##  區域     (Intercept) 0.010637
##  Residual             0.318108

vc <- as.data.frame(vc)
vc[vc$grp=='學校', 'vcov']/ sum(vc$vcov)

## [1] 0.2104676

vc[vc$grp=='區域', 'vcov']/ sum(vc$vcov)

## [1] 0.02554734

1 - (vc[vc$grp=='Residual', 'vcov']/ sum(vc$vcov))

## [1] 0.2360149

檢驗模型

#ranef: Extract the modes of the random effects
#學校與區域層次的隨機效果QQ圖，檢查隨機效果是否呈常態
qq_r21 <- qqmath(~ ranef(m0)$學校, type = c('p', 'g', 'r'), pch = '.',
                xlab = '常態分位數', ylab = '學校隨機效果分位數')

qq_r22 <- qqmath(~ ranef(m0)$區域, type = c('p', 'g', 'r'), pch = '.',
                xlab = '常態分位數', ylab = '區域隨機效果分位數')

#殘差的QQ圖，檢驗殘差是否呈常態
qq_r0 <- qqmath(~ resid(m0, scale = T), type = c('p', 'g', 'r'), pch = '.',
                xlab = '標準化常態分位數', ylab = '標準化殘差')

#預測值對殘差圖
m0_f <-  data.frame(yhat=fitted(m0), zres=resid(m0, scaled=TRUE))
r_m0 <- ggplot(data = m0_f, aes(x = yhat, y = zres)) + 
          geom_point(pch = '.') +
          stat_smooth(method = 'gam', formula = y ~ s(x, bs='cs'),
                      se = FALSE) +
          labs( x = '預測值', y = '標準化殘差')

#把圖放在一起呈現
#圖5.7
#nrow()、 ncol() 與 dim() 這三個內建函數可以幫助我們暸解所輸入資料框的列數、欄數與維度資訊
library(gridExtra)
grid.arrange(qq_r21, qq_r22, qq_r0, r_m0, nrow = 2, ncol = 2)

#複雜的多層次模型

#顯示複雜的多層次模型，包括隨機斜率效果、學校層次的預測變項以及跨層次交互作用
#先要將不同層次資料併在一起
dta <- merge(dta, dta_m, by="學校") 

#lmer takes as its first two arguments a formula spec- ifying the model and the data with which to evaluate the formula
rslt <- lmer(總成績 ~ 化學置中 + 性別 + 化學置中:性別 +
        女性比率 + 女性比率:性別+
        (1+化學置中+性別|學校) + (1|區域), data=dta)

## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.0157101 (tol = 0.002, component 1)

summary(rslt)

## Linear mixed model fit by REML ['lmerMod']
## Formula: 總成績 ~ 化學置中 + 性別 + 化學置中:性別 + 女性比率 + 女性比率:性別 +  
##     (1 + 化學置中 + 性別 | 學校) + (1 | 區域)
##    Data: dta
## 
## REML criterion at convergence: 55393
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -9.3493 -0.5613  0.0682  0.6500  4.1232 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. Corr       
##  學校     (Intercept) 0.0981685 0.31332             
##           化學置中    0.0005489 0.02343  -0.56      
##           性別F       0.0181129 0.13458  -0.63  0.07
##  區域     (Intercept) 0.0106142 0.10303             
##  Residual             0.3103827 0.55712             
## Number of obs: 31022, groups:  學校, 2410; 區域, 131
## 
## Fixed effects:
##                 Estimate Std. Error t value
## (Intercept)     6.140229   0.019251 318.952
## 化學置中        0.169861   0.001557 109.102
## 性別F           0.146337   0.022260   6.574
## 女性比率       -0.222825   0.040026  -5.567
## 化學置中:性別F -0.011094   0.002156  -5.146
## 性別F:女性比率  0.430795   0.048239   8.930
## 
## Correlation of Fixed Effects:

## Warning in abbreviate(rn, minlength = 6): abbreviate 不適用於非 ASCII 字元

##                (Intr) 化學置中 性別F  女性比率 化中:性別F
## 化學置中       -0.095                                    
## 性別F          -0.482 -0.004                             
## 女性比率       -0.705  0.069    0.645                    
## 化學置中:性別F  0.066 -0.608    0.004 -0.102             
## 性別F:女性比率  0.494 -0.026   -0.915 -0.797    0.030    
## optimizer (nloptwrap) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 0.0157101 (tol = 0.002, component 1)

#結束 ##顯示演練單元信息

sessionInfo()

## R version 4.1.1 (2021-08-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22000)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=Chinese (Traditional)_Taiwan.950 
## [2] LC_CTYPE=Chinese (Traditional)_Taiwan.950   
## [3] LC_MONETARY=Chinese (Traditional)_Taiwan.950
## [4] LC_NUMERIC=C                                
## [5] LC_TIME=Chinese (Traditional)_Taiwan.950    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] coefplot2_0.1.3.2 lattice_0.20-44   reshape_0.8.9     coda_0.19-4      
##  [5] forcats_0.5.2     stringr_1.4.1     dplyr_1.0.10      purrr_0.3.4      
##  [9] readr_2.1.2       tidyr_1.2.1       tibble_3.1.8      ggplot2_3.3.6    
## [13] tidyverse_1.3.2   gridExtra_2.3     mlmRev_1.0-8      lme4_1.1-30      
## [17] Matrix_1.5-1      pacman_0.5.1     
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.4          sass_0.4.2          jsonlite_1.8.0     
##  [4] splines_4.1.1       modelr_0.1.9        bslib_0.4.0        
##  [7] assertthat_0.2.1    highr_0.9           googlesheets4_1.0.1
## [10] cellranger_1.1.0    pillar_1.8.1        backports_1.4.1    
## [13] glue_1.6.2          digest_0.6.29       rvest_1.0.3        
## [16] minqa_1.2.4         colorspace_2.0-3    plyr_1.8.7         
## [19] htmltools_0.5.3     pkgconfig_2.0.3     broom_1.0.1        
## [22] haven_2.5.1         scales_1.2.1        tzdb_0.3.0         
## [25] googledrive_2.0.0   mgcv_1.8-36         farver_2.1.1       
## [28] generics_0.1.3      ellipsis_0.3.2      cachem_1.0.6       
## [31] withr_2.5.0         cli_3.4.0           crayon_1.5.1       
## [34] magrittr_2.0.3      readxl_1.4.1        evaluate_0.16      
## [37] fs_1.5.2            fansi_1.0.3         nlme_3.1-152       
## [40] MASS_7.3-54         xml2_1.3.3          tools_4.1.1        
## [43] hms_1.1.2           gargle_1.2.1        lifecycle_1.0.2    
## [46] munsell_0.5.0       reprex_2.0.2        compiler_4.1.1     
## [49] jquerylib_0.1.4     rlang_1.0.5         grid_4.1.1         
## [52] nloptr_2.0.3        rstudioapi_0.14     labeling_0.4.2     
## [55] rmarkdown_2.16      boot_1.3-28         gtable_0.3.1       
## [58] DBI_1.1.3           R6_2.5.1            lubridate_1.8.0    
## [61] knitr_1.40          fastmap_1.1.0       utf8_1.2.2         
## [64] stringi_1.7.6       Rcpp_1.0.9          vctrs_0.4.1        
## [67] dbplyr_2.2.1        tidyselect_1.1.2    xfun_0.33