1 Exercise 1

Assess how similar family members are in their ratings of liberalism in the family example using the intraclass correlation.

pacman::p_load(tidyverse, lme4)

data input and check

dta1 <- read.table("~/data/family.txt", header = T)
str(dta1)
'data.frame':   12 obs. of  2 variables:
 $ family    : int  1 1 1 2 2 2 2 3 3 4 ...
 $ liberalism: int  25 29 34 18 21 21 26 31 35 30 ...
head(dta1)
  family liberalism
1      1         25
2      1         29
3      1         34
4      2         18
5      2         21
6      2         21

family 變項的資料屬性轉成 factor

dta1$family <- factor(dta1$family)

family 作 fixed effect 做 anova

summary(m0 <- aov(liberalism ~ family, data = dta1))
            Df Sum Sq Mean Sq F value Pr(>F)
family       3  244.3    81.4    7.73 0.0095
Residuals    8   84.3    10.5               

由上述 fixed effect 的 anova 結果可以看到,識字率受家庭因子影響下是有顯著差異的。

family作 random effect 做 anova

summary(m1 <- aov(liberalism ~ Error(family), data = dta1))

Error: family
          Df Sum Sq Mean Sq F value Pr(>F)
Residuals  3    244    81.4               

Error: Within
          Df Sum Sq Mean Sq F value Pr(>F)
Residuals  8   84.3    10.5               

去掉 intercept 做迴歸

summary(m2 <- lm(liberalism ~ family - 1, data = dta1))

Call:
lm(formula = liberalism ~ family - 1, data = dta1)

Residuals:
   Min     1Q Median     3Q    Max 
 -4.33  -1.00  -0.50   1.50   4.67 

Coefficients:
        Estimate Std. Error t value Pr(>|t|)
family1    29.33       1.87    15.6  2.8e-07
family2    21.50       1.62    13.2  1.0e-06
family3    33.00       2.30    14.4  5.4e-07
family4    30.67       1.87    16.4  2.0e-07

Residual standard error: 3.25 on 8 degrees of freedom
Multiple R-squared:  0.991, Adjusted R-squared:  0.987 
F-statistic:  224 on 4 and 8 DF,  p-value: 3.06e-08
anova(m2)
Analysis of Variance Table

Response: liberalism
          Df Sum Sq Mean Sq F value  Pr(>F)
family     4   9430    2357     224 3.1e-08
Residuals  8     84      11                

去掉截距看 4 個家庭分組與識字率的關係,其共變數分析結果顯示,識字率與 4 個家庭分組間個字都有顯著的共變關係,而 anova 結果也顯示之間具有顯著關係。

使用 lme package 做有 intercept 的 randeom effect 迴歸分析

summary(m3 <- lmer(liberalism ~ 1 + (1 | family), data = dta1))
Linear mixed model fit by REML ['lmerMod']
Formula: liberalism ~ 1 + (1 | family)
   Data: dta1

REML criterion at convergence: 65.4

Scaled residuals: 
   Min     1Q Median     3Q    Max 
-1.312 -0.385 -0.114  0.600  1.477 

Random effects:
 Groups   Name        Variance Std.Dev.
 family   (Intercept) 21.9     4.68    
 Residual             10.5     3.24    
Number of obs: 12, groups:  family, 4

Fixed effects:
            Estimate Std. Error t value
(Intercept)    28.49       2.53    11.3
ranef(m3)
$family
  (Intercept)
1     0.72715
2    -6.24380
3     3.63949
4     1.87716

with conditional variances for "family" 

family 分組的資料及各組與整體平均

dta1_fy <- dta1 %>% 
    mutate( ID = paste0("S", 11:22), g_ave = mean(liberalism )) %>% 
    group_by(family) %>% 
    mutate( grp_ave = mean(liberalism) ) %>%
    as.data.frame %>%
    mutate( grp_ave_uw = mean(unique(grp_ave)) ) 
dta1_fy
   family liberalism  ID  g_ave grp_ave grp_ave_uw
1       1         25 S11 27.667  29.333     28.625
2       1         29 S12 27.667  29.333     28.625
3       1         34 S13 27.667  29.333     28.625
4       2         18 S14 27.667  21.500     28.625
5       2         21 S15 27.667  21.500     28.625
6       2         21 S16 27.667  21.500     28.625
7       2         26 S17 27.667  21.500     28.625
8       3         31 S18 27.667  33.000     28.625
9       3         35 S19 27.667  33.000     28.625
10      4         30 S20 27.667  30.667     28.625
11      4         30 S21 27.667  30.667     28.625
12      4         32 S22 27.667  30.667     28.625

以有 intercept 的 randeom effect 迴歸模型取資料

dta1_m3 <- data.frame(ID = paste0("S",11:22), fe = rep(summary(m3)$coef[1], 12), 
            re = rep(as.numeric(t(as.data.frame(ranef(m3)[1]))), c(3, 4, 2, 3)), 
            pred = fitted(m3))

合併兩組資料

merge(dta1_fy, dta1_m3, by = "ID")
    ID family liberalism  g_ave grp_ave grp_ave_uw    fe       re   pred
1  S11      1         25 27.667  29.333     28.625 28.49  0.72715 29.217
2  S12      1         29 27.667  29.333     28.625 28.49  0.72715 29.217
3  S13      1         34 27.667  29.333     28.625 28.49  0.72715 29.217
4  S14      2         18 27.667  21.500     28.625 28.49 -6.24380 22.246
5  S15      2         21 27.667  21.500     28.625 28.49 -6.24380 22.246
6  S16      2         21 27.667  21.500     28.625 28.49 -6.24380 22.246
7  S17      2         26 27.667  21.500     28.625 28.49 -6.24380 22.246
8  S18      3         31 27.667  33.000     28.625 28.49  3.63949 32.130
9  S19      3         35 27.667  33.000     28.625 28.49  3.63949 32.130
10 S20      4         30 27.667  30.667     28.625 28.49  1.87716 30.367
11 S21      4         30 27.667  30.667     28.625 28.49  1.87716 30.367
12 S22      4         32 27.667  30.667     28.625 28.49  1.87716 30.367

畫圖

ggplot(data = dta1, aes(x = liberalism, y = family)) +
  geom_point() +
  geom_point(aes(x = dta1_fy$grp_ave), shape = 3, size = rel(.5))+
  geom_segment(aes(yend = family, x = dta1_fy$grp_ave, xend = mean(liberalism)), lty = 3, size = rel(.3))+
  geom_vline(aes(xintercept = mean(liberalism)), lty = 2, size = rel(.8))+
  geom_vline(aes(xintercept = summary(m3)$coef[1]), lty = 2, color = "red")+
  geom_vline(aes(xintercept = mean(unique(dta1_fy$grp_ave))), lty = 2, color = "darkgreen")+
  labs(x = "Rating of Liberalism", y = "Family")+
  theme_bw()

黑色直虛線:識字率總平均 紅色虛線:Fixed effect 下的統計檢定值 (來自m3) 綠色虛線:randeom effect 的不同家庭識字率加權平均 黑色水平虛線:不同家庭的識字率平均

The end

2 Exercise 2

Replicate the Chicano example where the numerical results using SAS are available, and add comment for each code chunk.

資料讀取

dta2 <- read.csv("~/data/Chicano.csv", stringsAsFactors = TRUE)
str(dta2)
'data.frame':   24 obs. of  4 variables:
 $ Pupil: Factor w/ 24 levels "P01","P02","P03",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ Class: Factor w/ 6 levels "C1","C2","C3",..: 1 1 1 1 2 2 2 2 3 3 ...
 $ Trt  : Factor w/ 2 levels "C","T": 2 2 2 2 2 2 2 2 2 2 ...
 $ Score: int  5 2 10 11 10 15 11 8 7 12 ...
pacman::p_load(tidyverse, VCA, lme4, nlme)

畫出分數對應由上而下三層次分組的資料與各組分布圖

VCA::varPlot(Score ~ Trt/Class/Pupil, 
             Data = dta2,
             YLabel = list(text="Score", 
                         side=2, 
                         cex=1),
             MeanLine = list(var=c("Trt", "Class"),
                           col=c("darkred", "salmon"), 
                           lwd=c(1, 2)))

random effect regression model 的 anova

summary(m1 <- aov(Score ~ Trt + Error(Class), data=dta2))

Error: Class
          Df Sum Sq Mean Sq F value Pr(>F)
Trt        1    216     216    9.82  0.035
Residuals  4     88      22               

Error: Within
          Df Sum Sq Mean Sq F value Pr(>F)
Residuals 18    198      11               

由此看出,在 Treatment 層級的分數是有顯著差異的,class 間的 randeom effect 並不會被檢驗。

變異數估計,包含 fixed effect 和 random effect

faraway::sumary(m2 <- lmer(Score ~ Trt + (1 | Class), data=dta2))
Fixed Effects:
            coef.est coef.se
(Intercept) 4.00     1.35   
TrtT        6.00     1.91   

Random Effects:
 Groups   Name        Std.Dev.
 Class    (Intercept) 1.66    
 Residual             3.32    
---
number of obs: 24, groups: Class, 6
AIC = 130.9, DIC = 131.8
deviance = 127.4 

信賴區間

confint(m2, method="boot")
             2.5 % 97.5 %
.sig01      0.0000 3.5916
.sigma      2.2544 4.3945
(Intercept) 1.3497 6.6566
TrtT        2.4026 9.6106

The end

3 Exercise 3

Perform the analysis in the finger ridges example to assess the resemblance of the dermal ridge counts among twins. Comment on your code chunks.

pacman::p_load(tidyverse, lme4)

輸入資料

dta3 <- read.table("~/data/finger_ridges.txt", header = T)
str(dta3)
'data.frame':   12 obs. of  3 variables:
 $ Pair  : int  1 2 3 4 5 6 7 8 9 10 ...
 $ Twin_1: int  71 79 105 115 76 83 114 57 114 94 ...
 $ Twin_2: int  71 82 99 114 70 82 113 44 113 91 ...

將Twin ID的變量 Pair 轉為 factor屬性

dta3$Pair <- factor(dta3$Pair)
head(dta3)
  Pair Twin_1 Twin_2
1    1     71     71
2    2     79     82
3    3    105     99
4    4    115    114
5    5     76     70
6    6     83     82

組內相關

cor(c(dta3$Twin_1, dta3$Twin_2), c(dta3$Twin_2, dta3$Twin_1))
[1] 0.96256

資料由寬型轉為長型

dta3L <- dta3 %>% 
        gather(key = Twin, value = Ridges, Twin_1, Twin_2) %>%
        mutate( Twin = as.factor(Twin)) 
head(dta3L)
  Pair   Twin Ridges
1    1 Twin_1     71
2    2 Twin_1     79
3    3 Twin_1    105
4    4 Twin_1    115
5    5 Twin_1     76
6    6 Twin_1     83

計算個對雙胞胎的平均與標準差,並以平均數由大至小排列

dta3L %>% group_by(Pair) %>%
         summarize( ridge_ave = mean(Ridges), ridge_sd = sd(Ridges)) %>%
         arrange(desc(ridge_ave))
# A tibble: 12 x 3
   Pair  ridge_ave ridge_sd
   <fct>     <dbl>    <dbl>
 1 4         114.     0.707
 2 7         114.     0.707
 3 9         114.     0.707
 4 3         102      4.24 
 5 10         92.5    2.12 
 6 6          82.5    0.707
 7 2          80.5    2.12 
 8 11         79      5.66 
 9 12         74      2.83 
10 5          73      4.24 
11 1          71      0    
12 8          50.5    9.19 

繪製各對雙胞胎平均的散布圖

ggplot(dta3L, aes(reorder(Pair, Ridges, mean), Ridges, color = Pair)) + 
  geom_point() + 
  geom_hline(yintercept = mean(dta3L$Ridges), linetype = "dashed") +
  labs(y = "Number of finger ridges", x = "Twin ID") +
  theme_bw() +
  theme(legend.position = "NONE")

random effect anova for balanced designs

summary(m0 <- aov(Ridges ~ Error(Pair), data = dta3L))

Error: Pair
          Df Sum Sq Mean Sq F value Pr(>F)
Residuals 11   8990     817               

Error: Within
          Df Sum Sq Mean Sq F value Pr(>F)
Residuals 12    172    14.3               

random effects anova

summary(m1 <- lmer(Ridges ~ 1 + (1 | Pair), data = dta3L))
Linear mixed model fit by REML ['lmerMod']
Formula: Ridges ~ 1 + (1 | Pair)
   Data: dta3L

REML criterion at convergence: 174.1

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-1.8892 -0.3862 -0.0083  0.3796  1.5496 

Random effects:
 Groups   Name        Variance Std.Dev.
 Pair     (Intercept) 401.5    20.04   
 Residual              14.3     3.78   
Number of obs: 24, groups:  Pair, 12

Fixed effects:
            Estimate Std. Error t value
(Intercept)    87.21       5.84    14.9
ranef(m1)
$Pair
   (Intercept)
1     -15.9249
2      -6.5910
3      14.5330
4      26.8144
5     -13.9599
6      -4.6260
7      25.8319
8     -36.0664
9      25.8319
10      5.1991
11     -8.0648
12    -12.9774

with conditional variances for "Pair" 

The end