#
pacman::p_load(mlmRev, tidyverse, lme4)

1 Data Input

# data from mlmRev package
data(Hsb82, "mlmRev")

## Warning in data(Hsb82, "mlmRev"): data set 'mlmRev' not found

str(Hsb82)

## 'data.frame':    7185 obs. of  8 variables:
##  $ school : Ord.factor w/ 160 levels "8367"<"8854"<..: 59 59 59 59 59 59 59 59 59 59 ...
##  $ minrty : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sx     : Factor w/ 2 levels "Male","Female": 2 2 1 1 1 1 2 1 2 1 ...
##  $ ses    : num  -1.528 -0.588 -0.528 -0.668 -0.158 ...
##  $ mAch   : num  5.88 19.71 20.35 8.78 17.9 ...
##  $ meanses: num  -0.434 -0.434 -0.434 -0.434 -0.434 ...
##  $ sector : Factor w/ 2 levels "Public","Catholic": 1 1 1 1 1 1 1 1 1 1 ...
##  $ cses   : num  -1.0936 -0.1536 -0.0936 -0.2336 0.2764 ...

2 copy the data

# make a copy of data to the dta object 
dta <- Hsb82

3 use sensible variable names

# use sensible variable names
names(dta) <- c("School", "Minority", "Gender", "SES", "Math",
                "Ave_SES", "Sector", "C_SES")
#重新命名變數名稱

head(dta)

##   School Minority Gender    SES   Math   Ave_SES Sector       C_SES
## 1   1224       No Female -1.528  5.876 -0.434383 Public -1.09361702
## 2   1224       No Female -0.588 19.708 -0.434383 Public -0.15361702
## 3   1224       No   Male -0.528 20.349 -0.434383 Public -0.09361702
## 4   1224       No   Male -0.668  8.781 -0.434383 Public -0.23361702
## 5   1224       No   Male -0.158 17.898 -0.434383 Public  0.27638298
## 6   1224       No   Male  0.022  4.583 -0.434383 Public  0.45638298

##設定背景主題
ot <- theme_set(theme_bw())

# freedman-diaconis rule for binwidth
bw <- 2*IQR(dta$Math)/(dim(dta)[1]^(1/3))
# histograms大小設定

4 histograms

ggplot(dta, aes(Math)) +
  stat_bin(aes(fill = ..count..), binwidth=bw) +
  labs(x="Math achievement scores")

5 seed the random number to reproduce the sample

set.seed(20160923)
#隨機抽樣

6 pick 60 out of 160

# pick 60 out of 160
n <- sample(160, 60)
#從160個中重複抽取樣本60次。

7 box plots for a sample of 60 schools

ggplot(filter(dta, School %in% levels(School)[n]),
       aes(reorder(School, Math, mean), Math, color=Sector)) +
 geom_boxplot() +
 geom_point(size= rel(0.6)) +
 labs(x="School ID", y="Math achievement score")+
 theme(axis.text.x=element_text(size=6))+
 ggtitle("A sample of 60 schools")

依各校分數低到高繪圖，color=Sector表示不同部門是不同顏色，紅色是公立學校，藍色是catholic學校。圖顯示紅色處於較低分區域，亦即公立學校均分較低。

8 Data Analysis

library(dplyr)
# math mean by school(學校的數學平均分數)
dta_mmath <- dta %>% 
             group_by(School) %>% 
             mutate( ave_math = mean(Math)) %>% 
             select( School, ave_math ) %>%  
             unique %>%
             as.data.frame

summary(dta_mmath$ave_math)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.24   10.47   12.90   12.62   14.65   19.72

#呈現ave_math概要
#抽取的60所學校的平均數是12.62
sd(dta_mmath$ave_math)

## [1] 3.117651

#抽取的60所學校的標準差

9 grand mean

summary(dta$Math)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -2.832   7.275  13.131  12.748  18.317  24.993

所有學校的分數的資料分析平均數是12.748

# sd for overall Math scores
sd(dta$Math)

## [1] 6.878246

所有學校的分數的標準差

10 freedman-diaconis rule for binwidth

bw <- 2*IQR(dta_mmath$ave_math)/(160^(1/3))

#Plot

ggplot(dta_mmath, aes(ave_math)) +
 geom_histogram(aes(y= ..density.., alpha= .2), binwidth=bw) + 
 geom_density(fill="NA") +
 geom_vline(xintercept=mean(dta_mmath$ave_math), linetype="longdash") +
 geom_text(aes(7, 0.17, label= "ave = 12.62, sd = 3.12")) +
 labs(x="Math achievement school means", y="Density") +
 theme(legend.position="NONE")

11 baseline model

summary(m0 <- lmer(Math ~ (1 | School), data = dta))

## Linear mixed model fit by REML ['lmerMod']
## Formula: Math ~ (1 | School)
##    Data: dta
## 
## REML criterion at convergence: 47116.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.0631 -0.7539  0.0267  0.7606  2.7426 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  School   (Intercept)  8.614   2.935   
##  Residual             39.148   6.257   
## Number of obs: 7185, groups:  School, 160
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  12.6370     0.2444   51.71

12 CIs for model parameters

confint(m0, method="boot")

## Computing bootstrap confidence intervals ...

##                 2.5 %    97.5 %
## .sig01       2.596281  3.333500
## .sigma       6.151525  6.360499
## (Intercept) 12.128943 13.114645

# default residual plot
plot(m0, xlab="Fitted values", ylab="Pearson residuals", 
     pch=20, cex=.5, type = c("p","g"))

In class Exercise 1

Pei-Yun,Li

Tue Oct 06 12:56:41 2020

1 Data Input

2 copy the data

3 use sensible variable names

4 histograms

5 seed the random number to reproduce the sample

6 pick 60 out of 160

7 box plots for a sample of 60 schools

8 Data Analysis

9 grand mean

10 freedman-diaconis rule for binwidth

11 baseline model

12 CIs for model parameters

13 The end