1 In-class exercise 1

1.1 Problem

Assess how similar family members are in their ratings of liberalism in the family example using the intraclass correlation(組內相關).
Data into: Researchers were interested in determining the correlation between family members on a measure of liberalism. They took a simple random sample of families and obtained the liberalism ratings for all family members of each family selected.

1.2 Setting and input data

# load package
pacman::p_load(tidyverse, lme4)
# input data
dta <- read.table("family.txt", header = T)
# data structure
str(dta)

## 'data.frame':    12 obs. of  2 variables:
##  $ family    : int  1 1 1 2 2 2 2 3 3 4 ...
##  $ liberalism: int  25 29 34 18 21 21 26 31 35 30 ...

# first 6 lines
head(dta)

##   family liberalism
## 1      1         25
## 2      1         29
## 3      1         34
## 4      2         18
## 5      2         21
## 6      2         21

# variable family to be the factor type
dta$family <- factor(dta$family)

1.3 Correlation between family on liberalism rate

# aov with fixed family effects
summary(m0 <- aov(liberalism ~ family, data = dta))

##             Df Sum Sq Mean Sq F value  Pr(>F)   
## family       3 244.33   81.44   7.726 0.00951 **
## Residuals    8  84.33   10.54                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#aov(liberalism ~ family, data = dta)

aov(liberalism ~ family, data = dta) showed Residual standard error: 3.246793, Estimated effects may be unbalanced (因為各組間(family)樣本 unbalance / family 1: 3位, family 2: 4位, family 3: 2位, family 4:1位)
the result indicates a significant differences of the rating of liberalism have been observed between family.

# aov with random family effects (不會有 F value 跟 Pr(>F))
summary(m1 <- aov(liberalism ~ Error(family), data = dta))

## 
## Error: family
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  3  244.3   81.44               
## 
## Error: Within
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals  8  84.33   10.54

# regression without intercept term (-1)
summary(m2 <- lm(liberalism ~ family - 1, data = dta))

## 
## Call:
## lm(formula = liberalism ~ family - 1, data = dta)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.333 -1.000 -0.500  1.500  4.667 
## 
## Coefficients:
##         Estimate Std. Error t value Pr(>|t|)    
## family1   29.333      1.875   15.65 2.77e-07 ***
## family2   21.500      1.623   13.24 1.01e-06 ***
## family3   33.000      2.296   14.37 5.36e-07 ***
## family4   30.667      1.875   16.36 1.96e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.247 on 8 degrees of freedom
## Multiple R-squared:  0.9911, Adjusted R-squared:  0.9867 
## F-statistic: 223.6 on 4 and 8 DF,  p-value: 3.065e-08

# compute analysis of variance tables for m2
anova(m2)

## Analysis of Variance Table
## 
## Response: liberalism
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## family     4 9429.7 2357.42  223.63 3.065e-08 ***
## Residuals  8   84.3   10.54                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# lme4 package/ m3(linear mixed-effects model fit by REML)
summary(m3 <- lmer(liberalism ~ 1 + (1 | family), data = dta))

## Linear mixed model fit by REML ['lmerMod']
## Formula: liberalism ~ 1 + (1 | family)
##    Data: dta
## 
## REML criterion at convergence: 65.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.3116 -0.3850 -0.1135  0.5998  1.4772 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  family   (Intercept) 21.92    4.682   
##  Residual             10.48    3.238   
## Number of obs: 12, groups:  family, 4
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)    28.49       2.53   11.26

The correlation between any two liberalism ratings from the same family is 21.919/(21.919+10.482) = 0.677

# show random effects: extract the conditional modes of the random effects from m3
ranef(m3)

## $family
##   (Intercept)
## 1   0.7271474
## 2  -6.2437983
## 3   3.6394896
## 4   1.8771613
## 
## with conditional variances for "family"

# augument data with group mean and ID
# ID = paste0("S", 11:22), 在ID前加S, ?? 11:22 ??
# 新增 g_ave: mean of liberalism, grp_ave: mean of liberalism by family, grp_ave_uw: mean of liberalism by sum of each family
# 不確定unique語法使用的邏輯(unique returns a vector, data frame or array like x but with duplicate elements/rows removed.)，取grp_ave的值，但重複的值則丟棄：mean(unique(grp_ave))取每組的平均值再平均
dta_fy <- dta %>% 
  mutate( ID = paste0("S", 11:22), g_ave = mean(liberalism) ) %>%
  group_by(family) %>% 
  mutate( grp_ave = mean(liberalism) ) %>%
  as.data.frame %>%
  mutate( grp_ave_uw = mean(unique(grp_ave)) ) 

# output from model m3
# fe= fixed effects of each family, re= random effects of each family, pred= predicted mean of each family=(fe+re)
dta_m3 <- data.frame (ID = paste0("S",11:22), fe = rep(summary(m3)$coef[1], 12), re = rep(as.numeric(t(as.data.frame(ranef(m3)[1]))), c(3, 4, 2, 3)), pred = fitted(m3))
# join two data frames
merge(dta_fy, dta_m3, by = "ID")

##     ID family liberalism    g_ave  grp_ave grp_ave_uw       fe         re
## 1  S11      1         25 27.66667 29.33333     28.625 28.49027  0.7271474
## 2  S12      1         29 27.66667 29.33333     28.625 28.49027  0.7271474
## 3  S13      1         34 27.66667 29.33333     28.625 28.49027  0.7271474
## 4  S14      2         18 27.66667 21.50000     28.625 28.49027 -6.2437983
## 5  S15      2         21 27.66667 21.50000     28.625 28.49027 -6.2437983
## 6  S16      2         21 27.66667 21.50000     28.625 28.49027 -6.2437983
## 7  S17      2         26 27.66667 21.50000     28.625 28.49027 -6.2437983
## 8  S18      3         31 27.66667 33.00000     28.625 28.49027  3.6394896
## 9  S19      3         35 27.66667 33.00000     28.625 28.49027  3.6394896
## 10 S20      4         30 27.66667 30.66667     28.625 28.49027  1.8771613
## 11 S21      4         30 27.66667 30.66667     28.625 28.49027  1.8771613
## 12 S22      4         32 27.66667 30.66667     28.625 28.49027  1.8771613
##        pred
## 1  29.21742
## 2  29.21742
## 3  29.21742
## 4  22.24648
## 5  22.24648
## 6  22.24648
## 7  22.24648
## 8  32.12976
## 9  32.12976
## 10 30.36744
## 11 30.36744
## 12 30.36744

The predicted mean for family 4 is 28.49(fe) + 1.877(re) = 30.368(pred) < 30.667(grp_ave)

2 In-class exercise 2

2.1 Problem

Replicate the Chicano example (A random effect nested within a fixed effect.) where the numerical results using SAS are available, and add comment for each code chunk.
Data info: A study was conducted to evaluate the effectiveness of a pattern-practice approach in modifying the nonstandard dialect of the Chicano living in San Antonio, Texas. The essential elements of the approach involved pattern drill in imitating audio recorded speech models and immediate feedback by a teacher. The control condition involved reading stories aloud to the teacher. The children were praised for a good performance, but no correction or guidance were given. Twenty-four 12-year old Chicano children with similar scores on a standardized speech test were randomly assigned to 6 classes, with 4 children in each. The six classes were randomly assigned to the two treatment conditions. The differences between the children’s post- and pre-test scores on the test were obtained.

2.2 Setting and input data

# load package
pacman::p_load(tidyverse, VCA, lme4, nlme)
# input data
dta2 <- read.csv("Chicano.csv", stringsAsFactors = TRUE)
str(dta2)

## 'data.frame':    24 obs. of  4 variables:
##  $ Pupil: Factor w/ 24 levels "P01","P02","P03",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Class: Factor w/ 6 levels "C1","C2","C3",..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ Trt  : Factor w/ 2 levels "C","T": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Score: int  5 2 10 11 10 15 11 8 7 12 ...

2.3 Variability Chart for Hierarchical Models.

# Function varPlot determines the sequence of variables in the model formula and uses this information to construct the variability chart.
# 24 children(P01-P24) were randomly assigned to 6 classes(each for 4 children) with 2 treatment condition(treat/control)
# MeanLine (deepskyblue4: 治療組的平均值, coral color: 各班的平均值), Score = (後測-前測)
VCA::varPlot(Score ~ Trt/Class/Pupil, 
             Data=dta2,
             Points = list(pch = 1, cex = 1, col = "gray"),
             YLabel=list(text="Score", 
                         side=2, #指定坐標軸位置:1下2左3上4右
                         cex=1), #cex符號的大小
             MeanLine=list(var=c("Trt", "Class"),
                           col=c("deepskyblue4", "coral"), 
                           lwd=c(1, 2))) #lwd:The line width= c(第一個變項,第二個變項的線寬度)

2.4 Mixed Model Analysis of Variance

# Fixed-effects(ANOVA) approach (the result indicates significant difference of mean score among different treatments)
summary(m1 <- aov(Score ~ Trt + Error(Class), data=dta2) )

## 
## Error: Class
##           Df Sum Sq Mean Sq F value Pr(>F)  
## Trt        1    216     216   9.818 0.0351 *
## Residuals  4     88      22                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Error: Within
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 18    198      11

# Random effects approach
# 治療對分數的影響, 班級為Random-effects terms
faraway::sumary(m2 <- lmer(Score ~ Trt + (1 | Class), data=dta2))

## Fixed Effects:
##             coef.est coef.se
## (Intercept) 4.00     1.35   
## TrtT        6.00     1.91   
## 
## Random Effects:
##  Groups   Name        Std.Dev.
##  Class    (Intercept) 1.66    
##  Residual             3.32    
## ---
## number of obs: 24, groups: Class, 6
## AIC = 130.9, DIC = 131.8
## deviance = 127.4

# Variance estimates by nlme::lme
m2_old<- nlme::lme(Score ~ Trt, random = ~ 1 | Class, data=dta2, method="REML")
summary(m2_old)

## Linear mixed-effects model fit by REML
##  Data: dta2 
##        AIC      BIC   logLik
##   130.9294 135.2936 -61.4647
## 
## Random effects:
##  Formula: ~1 | Class
##         (Intercept) Residual
## StdDev:    1.658318 3.316624
## 
## Fixed effects: Score ~ Trt 
##             Value Std.Error DF  t-value p-value
## (Intercept)     4  1.354008 18 2.954191  0.0085
## TrtT            6  1.914857  4 3.133393  0.0351
##  Correlation: 
##      (Intr)
## TrtT -0.707
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -1.95982274 -0.79146768  0.07537759  0.79146851  1.50755612 
## 
## Number of Observations: 24
## Number of Groups: 6

# without intercept term
faraway::sumary(m2_1 <- lmer(Score ~ Trt -1 + (1 | Class), data=dta2))

## Fixed Effects:
##      coef.est coef.se
## TrtC  4.00     1.35  
## TrtT 10.00     1.35  
## 
## Random Effects:
##  Groups   Name        Std.Dev.
##  Class    (Intercept) 1.66    
##  Residual             3.32    
## ---
## number of obs: 24, groups: Class, 6
## AIC = 130.9, DIC = 131.8
## deviance = 127.4

# computes confidence intervals for parameters in m2 by bootstrap.
confint(m2_1, oldNames=F, method="boot")

## Computing bootstrap confidence intervals ...

## 
## 138 message(s): boundary (singular) fit: see ?isSingular

##                         2.5 %    97.5 %
## sd_(Intercept)|Class 0.000000  3.524811
## sigma                2.243276  4.343242
## TrtC                 1.342984  6.698088
## TrtT                 7.389098 12.708448

#          2.5 %  coef.est     97.5 %
#TrtC   1.466666      4.00   6.807770
#TrtT   7.150976     10.00  12.560913
#The end

3 In-class exercise 3

Intraclass correlation example

3.1 Problem

Perform the analysis in the finger ridges example to assess the resemblance of the dermal ridge counts among twins. Comment on your code chunks.
Data info: Researchers collected data on the number of finger ridges on both hands of individuals in 12 pairs of female identical twins. The aim was to measure the “similarity” between the numbers of finger ridges for the two members of the twins (classes).

3.2 Setting and input data

# load package
pacman::p_load(tidyverse, lme4)
# input data
dta3 <- read.table("finger_ridges.txt", header = T)
# coerce pair variable to factor type
dta3$Pair <- factor(dta3$Pair)
# show first 6 lines
head(dta3)

##   Pair Twin_1 Twin_2
## 1    1     71     71
## 2    2     79     82
## 3    3    105     99
## 4    4    115    114
## 5    5     76     70
## 6    6     83     82

3.3 Intraclass correlation

# historical idea of intraclass correlation
cor(c(dta3$Twin_1, dta3$Twin_2), c(dta3$Twin_2, dta3$Twin_1))

## [1] 0.9625626

The correlation between each twins is 0.96 (24 pairs?)

# turn data from wide format to long format
dta3L <- dta3 %>% 
        gather( key = Twin, value = Ridges, Twin_1, Twin_2) %>%
        mutate( Twin = as.factor(Twin) ) 
head(dta3L)

##   Pair   Twin Ridges
## 1    1 Twin_1     71
## 2    2 Twin_1     79
## 3    3 Twin_1    105
## 4    4 Twin_1    115
## 5    5 Twin_1     76
## 6    6 Twin_1     83

# show mean and sd of ridges by twin pair, Ridges_ave in descending order 
dta3t <- dta3L %>%
  group_by(Pair) %>%
  summarize( ridge_ave = mean(Ridges), ridge_sd = sd(Ridges)) %>%
  arrange( desc(ridge_ave))

## `summarise()` ungrouping output (override with `.groups` argument)

# dot plot 
# black dashed line is the mean of all Ridges
ggplot(dta3L, aes(reorder(Pair, Ridges, mean), Ridges, color = Pair)) +
  geom_point() + 
  geom_hline(yintercept = mean(dta3L$Ridges), linetype = "dashed") +
  labs(y = "Number of finger ridges", x = "Twin ID") +
  theme_bw() +
  theme(legend.position = "NONE")

3.4 Mixed Model Analysis of Variance

# random effect anova for balanced designs
summary(m0 <- aov(Ridges ~ Error(Pair), data = dta3L))

## 
## Error: Pair
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 11   8990   817.3               
## 
## Error: Within
##           Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 12  171.5   14.29

# same as Note page 11
sort(var(dta3t$ridge_ave) - dta3t$ridge_sd)

##  [1] 399.4648 403.0003 404.4146 404.4146 405.8288 406.5359 406.5359 407.9501
##  [9] 407.9501 407.9501 407.9501 408.6572

quantile(var(dta3t$ridge_ave) - dta3t$ridge_sd)

##       0%      25%      50%      75%     100% 
## 399.4648 404.4146 406.5359 407.9501 408.6572

(MSB-MSW)/n=variance(Estimated Variance), (817.3-14.29)/2=401.505, but far from 406???

# random effects anova
summary(m1 <- lmer(Ridges ~ 1 + (1 | Pair), data = dta3L))

## Linear mixed model fit by REML ['lmerMod']
## Formula: Ridges ~ 1 + (1 | Pair)
##    Data: dta3L
## 
## REML criterion at convergence: 174.1
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.88917 -0.38618 -0.00834  0.37963  1.54959 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Pair     (Intercept) 401.51   20.04   
##  Residual              14.29    3.78   
## Number of obs: 24, groups:  Pair, 12
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   87.208      5.836   14.94

Estimated Var(twin) = (817.3 - 14.29) /2 = 401.51
The estimate of intraclass correlation = 401.51/(401.51+14.29) = 0.966

# random effects from m1 a fitted model object.
ranef(m1)

## $Pair
##    (Intercept)
## 1   -15.924912
## 2    -6.591031
## 3    14.533018
## 4    26.814441
## 5   -13.959885
## 6    -4.626003
## 7    25.831927
## 8   -36.066447
## 9    25.831927
## 10    5.199136
## 11   -8.064801
## 12  -12.977371
## 
## with conditional variances for "Pair"

# The predicted mean for all
pred1 = fitted(m1)
mean(pred1) #same as Fixed effects Estimates = 87.208

## [1] 87.20833

In-class Exercises:Analysis of variance

Chang-Chun Chen

2020-10-01