pacman::p_load(mlmRev, tidyverse, lme4)
#從"mlmRev"這個package中取得Hsb82這個資料集
data(Hsb82, "mlmRev")
## Warning in data(Hsb82, "mlmRev"): data set 'mlmRev' not found
str(Hsb82)
## 'data.frame': 7185 obs. of 8 variables:
## $ school : Ord.factor w/ 160 levels "8367"<"8854"<..: 59 59 59 59 59 59 59 59 59 59 ...
## $ minrty : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ sx : Factor w/ 2 levels "Male","Female": 2 2 1 1 1 1 2 1 2 1 ...
## $ ses : num -1.528 -0.588 -0.528 -0.668 -0.158 ...
## $ mAch : num 5.88 19.71 20.35 8.78 17.9 ...
## $ meanses: num -0.434 -0.434 -0.434 -0.434 -0.434 ...
## $ sector : Factor w/ 2 levels "Public","Catholic": 1 1 1 1 1 1 1 1 1 1 ...
## $ cses : num -1.0936 -0.1536 -0.0936 -0.2336 0.2764 ...
有四個類別變數:$ school、 $ minrty、$ sx、$ sector
dta <- Hsb82 #命Hsb82等於dta
names(dta) <- c("School", "Minority", "Gender", "SES", "Math",
"Ave_SES", "Sector", "C_SES") #將變數改為較容易了解的名稱
#show first 10 lines
head(dta,10)
## School Minority Gender SES Math Ave_SES Sector C_SES
## 1 1224 No Female -1.528 5.876 -0.434383 Public -1.09361702
## 2 1224 No Female -0.588 19.708 -0.434383 Public -0.15361702
## 3 1224 No Male -0.528 20.349 -0.434383 Public -0.09361702
## 4 1224 No Male -0.668 8.781 -0.434383 Public -0.23361702
## 5 1224 No Male -0.158 17.898 -0.434383 Public 0.27638298
## 6 1224 No Male 0.022 4.583 -0.434383 Public 0.45638298
## 7 1224 No Female -0.618 -2.832 -0.434383 Public -0.18361702
## 8 1224 No Male -0.998 0.523 -0.434383 Public -0.56361702
## 9 1224 No Female -0.888 1.527 -0.434383 Public -0.45361702
## 10 1224 No Male -0.458 21.521 -0.434383 Public -0.02361702
ot <- theme_set(theme_bw()) #theme_bw將背景主題設定為白色
bw <- 2*IQR(dta$Math)/(dim(dta)[1]^(1/3)) #用freedman-diaconis rule來決定直方圖中box的寬度
#install.packages("ggplot")
library(ggplot2)
ggplot(dta, aes(Math)) +
stat_bin(aes(fill = ..count..), binwidth=bw) +
labs(x="Math achievement scores")
set.seed(20160923) #隨機抽樣1/3的raw data
n <- sample(160, 60) #sample是取出放回,重複60次,這有可能重複選取到同一所學校,加入replace = FALSE則為一次選取60個。
ggplot(filter(dta, School %in% levels(School)[n]), #filter用來篩選要分析的觀察值(Row),%in% 包含在那60所學校的level
aes(reorder(School, Math, mean), Math, color=Sector)) + #依據各校數學平均分數畫圖,低到高,不同sector顯示不同顏色
geom_boxplot() +
geom_point(size= rel(0.6)) +
labs(x="School ID", y="Math achievement score")+
theme(axis.text.x=element_text(size=6))+
ggtitle("A sample of 60 schools")
從圖看來,在60所學校中,左邊橘色佔多數,表示多數公立學校的平均分數較低。
math mean by school 計算各校數學平均
dta_mmath <- dta %>%
group_by(School) %>% #group_by(School)依據學校分群分析
mutate( ave_math = mean(Math)) %>% #mutate新增變數,計算各校數學平均分數ave_math
select( School, ave_math ) %>% #select選要分析的欄位(Column)
unique %>% #丟掉重複選取的學校
as.data.frame
summary(dta_mmath$ave_math)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.24 10.47 12.90 12.62 14.65 19.72
#看看這60所學校數學平均成績的集中趨勢與離散程度
sd(dta_mmath$ave_math) #計算60所學校數學平均分數的標準差(離散程度)
## [1] 3.117651
summary(dta$Math) #全部學校數學分數的集中趨勢與離散程度
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.832 7.275 13.131 12.748 18.317 24.993
全部學校的數學平均12.748 > 抽樣的60所學校的平均分數。
sd(dta$Math) #全部學校的數學成績標準差(離散程度)
## [1] 6.878246
bw <- 2*IQR(dta_mmath$ave_math)/(160^(1/3))
ggplot(dta_mmath, aes(ave_math)) + #60所學校數學平均分數的機率密度
geom_histogram(aes(y= ..density.., alpha= .2), binwidth=bw) +
geom_density(fill="NA") +
geom_vline(xintercept=mean(dta_mmath$ave_math), linetype="longdash") +
geom_text(aes(7, 0.17, label= "ave = 12.62, sd = 3.12")) +
labs(x="Math achievement school means", y="Density") +
theme(legend.position="NONE")
summary(m0 <- lmer(Math ~ (1 | School), data = dta)) #估計不同學校對數學成績的影響,(1 | School)是隨機截距項的R語法,意思是假設每個學校的截距不同,其中"1"代表截距項。
## Linear mixed model fit by REML ['lmerMod']
## Formula: Math ~ (1 | School)
## Data: dta
##
## REML criterion at convergence: 47116.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0631 -0.7539 0.0267 0.7606 2.7426
##
## Random effects:
## Groups Name Variance Std.Dev.
## School (Intercept) 8.614 2.935
## Residual 39.148 6.257
## Number of obs: 7185, groups: School, 160
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 12.6370 0.2444 51.71
m0 <- lme4::lmer(Math ~ (1 | School), data=dta)
sjPlot::tab_model(m0, show.p=FALSE, show.r2=FALSE)
Math | ||
---|---|---|
Predictors | Estimates | CI |
(Intercept) | 12.64 | 12.16 – 13.12 |
Random Effects | ||
σ2 | 39.15 | |
τ00 School | 8.61 | |
ICC | 0.18 | |
N School | 160 | |
Observations | 7185 |
confint(m0, method="boot")
## Computing bootstrap confidence intervals ...
## 2.5 % 97.5 %
## .sig01 2.596281 3.333500
## .sigma 6.151525 6.360499
## (Intercept) 12.128943 13.114645
confint(m0) #The default method assumes normality
## Computing profile confidence intervals ...
## 2.5 % 97.5 %
## .sig01 2.594729 3.315880
## .sigma 6.154803 6.361786
## (Intercept) 12.156289 13.117121
plot(m0, xlab="Fitted values", ylab="Pearson residuals",
pch=20, cex=.5, type = c("p","g"))
診斷模型:殘差圖顯示模型符合常態假設。