1 install packages

pacman::p_load(mlmRev, tidyverse, lme4)

2 data from mlmRev package

#從"mlmRev"這個package中取得Hsb82這個資料集
data(Hsb82, "mlmRev")

## Warning in data(Hsb82, "mlmRev"): data set 'mlmRev' not found

3 show structure of data

str(Hsb82)

## 'data.frame':    7185 obs. of  8 variables:
##  $ school : Ord.factor w/ 160 levels "8367"<"8854"<..: 59 59 59 59 59 59 59 59 59 59 ...
##  $ minrty : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sx     : Factor w/ 2 levels "Male","Female": 2 2 1 1 1 1 2 1 2 1 ...
##  $ ses    : num  -1.528 -0.588 -0.528 -0.668 -0.158 ...
##  $ mAch   : num  5.88 19.71 20.35 8.78 17.9 ...
##  $ meanses: num  -0.434 -0.434 -0.434 -0.434 -0.434 ...
##  $ sector : Factor w/ 2 levels "Public","Catholic": 1 1 1 1 1 1 1 1 1 1 ...
##  $ cses   : num  -1.0936 -0.1536 -0.0936 -0.2336 0.2764 ...

有四個類別變數：$ school、 $ minrty、$ sx、$ sector

4 make a copy of data to the dta object

dta <- Hsb82  #命Hsb82等於dta

5 use sensible variable names

names(dta) <- c("School", "Minority", "Gender", "SES", "Math",
                "Ave_SES", "Sector", "C_SES") #將變數改為較容易了解的名稱

#show first 10 lines

head(dta,10)

##    School Minority Gender    SES   Math   Ave_SES Sector       C_SES
## 1    1224       No Female -1.528  5.876 -0.434383 Public -1.09361702
## 2    1224       No Female -0.588 19.708 -0.434383 Public -0.15361702
## 3    1224       No   Male -0.528 20.349 -0.434383 Public -0.09361702
## 4    1224       No   Male -0.668  8.781 -0.434383 Public -0.23361702
## 5    1224       No   Male -0.158 17.898 -0.434383 Public  0.27638298
## 6    1224       No   Male  0.022  4.583 -0.434383 Public  0.45638298
## 7    1224       No Female -0.618 -2.832 -0.434383 Public -0.18361702
## 8    1224       No   Male -0.998  0.523 -0.434383 Public -0.56361702
## 9    1224       No Female -0.888  1.527 -0.434383 Public -0.45361702
## 10   1224       No   Male -0.458 21.521 -0.434383 Public -0.02361702

5.1

ot <- theme_set(theme_bw()) #theme_bw將背景主題設定為白色

6 freedman-diaconis rule for binwidth

bw <- 2*IQR(dta$Math)/(dim(dta)[1]^(1/3)) #用freedman-diaconis rule來決定直方圖中box的寬度

#install.packages("ggplot")
library(ggplot2)

7 histograms

ggplot(dta, aes(Math)) +
  stat_bin(aes(fill = ..count..), binwidth=bw) +
  labs(x="Math achievement scores")

8 seed the random number to reproduce the sample

set.seed(20160923) #隨機抽樣1/3的raw data

9 pick 60 out of 160

n <- sample(160, 60) #sample是取出放回，重複60次，這有可能重複選取到同一所學校，加入replace = FALSE則為一次選取60個。

10 box plots for a sample of 60 schools

ggplot(filter(dta, School %in% levels(School)[n]), #filter用來篩選要分析的觀察值(Row)，%in% 包含在那60所學校的level
       aes(reorder(School, Math, mean), Math, color=Sector)) + #依據各校數學平均分數畫圖，低到高，不同sector顯示不同顏色
 geom_boxplot() +
 geom_point(size= rel(0.6)) +
 labs(x="School ID", y="Math achievement score")+
 theme(axis.text.x=element_text(size=6))+
 ggtitle("A sample of 60 schools")

從圖看來，在60所學校中，左邊橘色佔多數，表示多數公立學校的平均分數較低。

11 Analysis

math mean by school 計算各校數學平均

dta_mmath <- dta %>%  
             group_by(School) %>% #group_by(School)依據學校分群分析
             mutate( ave_math = mean(Math)) %>% #mutate新增變數，計算各校數學平均分數ave_math
             select( School, ave_math ) %>%  #select選要分析的欄位(Column)
             unique %>% #丟掉重複選取的學校
             as.data.frame

summary(dta_mmath$ave_math)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.24   10.47   12.90   12.62   14.65   19.72

#看看這60所學校數學平均成績的集中趨勢與離散程度

sd(dta_mmath$ave_math) #計算60所學校數學平均分數的標準差(離散程度)

## [1] 3.117651

12 grand mean

summary(dta$Math) #全部學校數學分數的集中趨勢與離散程度

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -2.832   7.275  13.131  12.748  18.317  24.993

全部學校的數學平均12.748 > 抽樣的60所學校的平均分數。

13 sd for overall Math scores

sd(dta$Math) #全部學校的數學成績標準差(離散程度)

## [1] 6.878246

14 freedman-diaconis rule for binwidth

bw <- 2*IQR(dta_mmath$ave_math)/(160^(1/3))

15

ggplot(dta_mmath, aes(ave_math)) + #60所學校數學平均分數的機率密度
 geom_histogram(aes(y= ..density.., alpha= .2), binwidth=bw) + 
 geom_density(fill="NA") +
 geom_vline(xintercept=mean(dta_mmath$ave_math), linetype="longdash") +
 geom_text(aes(7, 0.17, label= "ave = 12.62, sd = 3.12")) +
 labs(x="Math achievement school means", y="Density") +
 theme(legend.position="NONE")

16 baseline model

summary(m0 <- lmer(Math ~ (1 | School), data = dta)) #估計不同學校對數學成績的影響，(1 | School)是隨機截距項的R語法，意思是假設每個學校的截距不同，其中"1"代表截距項。

## Linear mixed model fit by REML ['lmerMod']
## Formula: Math ~ (1 | School)
##    Data: dta
## 
## REML criterion at convergence: 47116.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.0631 -0.7539  0.0267  0.7606  2.7426 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  School   (Intercept)  8.614   2.935   
##  Residual             39.148   6.257   
## Number of obs: 7185, groups:  School, 160
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  12.6370     0.2444   51.71

m0 <- lme4::lmer(Math ~ (1 | School), data=dta)
sjPlot::tab_model(m0, show.p=FALSE, show.r2=FALSE)

	Math
Predictors	Estimates	CI
(Intercept)	12.64	12.16 – 13.12
Random Effects
σ²	39.15
τ₀₀ _School	8.61
ICC	0.18
N _School	160
Observations	7185

17 CIs for model parameters

confint(m0, method="boot")

## Computing bootstrap confidence intervals ...

##                 2.5 %    97.5 %
## .sig01       2.596281  3.333500
## .sigma       6.151525  6.360499
## (Intercept) 12.128943 13.114645

confint(m0) #The default method assumes normality

## Computing profile confidence intervals ...

##                 2.5 %    97.5 %
## .sig01       2.594729  3.315880
## .sigma       6.154803  6.361786
## (Intercept) 12.156289 13.117121

18 default residual plot

plot(m0, xlab="Fitted values", ylab="Pearson residuals", 
     pch=20, cex=.5, type = c("p","g"))

診斷模型：殘差圖顯示模型符合常態假設。

in-class exercise 1-1

Ching-Fang Wu

Fri Oct 09 14:52:39 2020

1 install packages

2 data from mlmRev package

3 show structure of data

4 make a copy of data to the dta object

5 use sensible variable names

5.1

6 freedman-diaconis rule for binwidth

7 histograms

8 seed the random number to reproduce the sample

9 pick 60 out of 160

10 box plots for a sample of 60 schools

11 Analysis

12 grand mean

13 sd for overall Math scores

14 freedman-diaconis rule for binwidth

15

16 baseline model

17 CIs for model parameters

18 default residual plot

19 The end