hw2:Regression with dependent errors

0.1 trial by trial reaction time data

# load packages
pacman::p_load(languageR, lattice, tidyverse, nlme, lmtest)

# load data
data(heid, package="languageR")

# see first 6 lines of data
head(heid)

##   Subject         Word   RT BaseFrequency
## 1     pp1   basaalheid 6.69          3.56
## 2     pp1  markantheid 6.81          5.16
## 3     pp1 ontroerdheid 6.51          5.55
## 4     pp1  contentheid 6.58          4.50
## 5     pp1    riantheid 6.86          4.53
## 6     pp1  tembaarheid 6.35          0.00

# check data structure
str(heid)

## 'data.frame':    832 obs. of  4 variables:
##  $ Subject      : Factor w/ 26 levels "pp1","pp10","pp11",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Word         : Factor w/ 40 levels "aftandsheid",..: 4 24 28 10 33 38 25 5 16 30 ...
##  $ RT           : num  6.69 6.81 6.51 6.58 6.86 6.35 6.69 7.17 6.58 6.98 ...
##  $ BaseFrequency: num  3.56 5.16 5.55 4.5 4.53 0 1.61 3.61 2.56 5.86 ...

0.2 Create a Trial variable

heid$Trial <- as.numeric(unlist(apply(as.matrix(table(heid$Subject)), 1, seq)))

str(heid)

## 'data.frame':    832 obs. of  5 variables:
##  $ Subject      : Factor w/ 26 levels "pp1","pp10","pp11",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Word         : Factor w/ 40 levels "aftandsheid",..: 4 24 28 10 33 38 25 5 16 30 ...
##  $ RT           : num  6.69 6.81 6.51 6.58 6.86 6.35 6.69 7.17 6.58 6.98 ...
##  $ BaseFrequency: num  3.56 5.16 5.55 4.5 4.53 0 1.61 3.61 2.56 5.86 ...
##  $ Trial        : num  1 2 3 4 5 6 7 8 9 10 ...

變項Trial針對每個樣本重複測量進行序位編號

0.3 Create a Subset data

# select a subset of variables for current example
dta <- heid %>% select(Subject, Trial, RT, BaseFrequency)

# set plot theme
ot <- theme_set(theme_bw())

0.4 plot the varible effect

# word frequency effect
ggplot(data = dta, aes(x = BaseFrequency, y = RT)) +
 geom_point() +
 stat_smooth(method="lm") +
 labs(x = "Word Frequency", y = "Reaction Time (transformed)")

## `geom_smooth()` using formula 'y ~ x'

word frequency 對 Reaction Time的ordinary regression。由於未考量樣本重複測量的效果，word frequency 對 Reaction Time散佈圖與回歸線明顯不fit

# trial effect by subject
ggplot(data = dta, aes(x = Trial, y = RT)) +
 geom_point(pch =1, size = rel(.5)) +
 geom_line() +
 facet_wrap(~ Subject) +
 labs(x = "Trial ID", y = "Reaction Time (transformed)")

每個樣本在word frequency 對 Reaction Time的效果

0.5 simple linear ignoring trial dependence

# simple linear model ignoring trial dependence - least-squares
summary(m0 <- lm(RT ~ BaseFrequency, data = dta))

## 
## Call:
## lm(formula = RT ~ BaseFrequency, data = dta)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.51664 -0.21576 -0.05251  0.15991  0.89027 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.641720   0.020730 320.392  < 2e-16 ***
## BaseFrequency -0.015470   0.004701  -3.291  0.00104 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.282 on 830 degrees of freedom
## Multiple R-squared:  0.01288,    Adjusted R-squared:  0.01169 
## F-statistic: 10.83 on 1 and 830 DF,  p-value: 0.001041

# gls Model method use ML
summary(m0a <- gls(RT ~ BaseFrequency, data = dta, method = "ML"))

## Generalized least squares fit by maximum likelihood
##   Model: RT ~ BaseFrequency 
##   Data: dta 
##        AIC      BIC   logLik
##   258.4621 272.6336 -126.231
## 
## Coefficients:
##                  Value   Std.Error  t-value p-value
## (Intercept)    6.64172 0.020730012 320.3915   0.000
## BaseFrequency -0.01547 0.004701192  -3.2908   0.001
## 
##  Correlation: 
##               (Intr)
## BaseFrequency -0.882
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -1.8345863 -0.7661546 -0.1864644  0.5678517  3.1613281 
## 
## Residual standard error: 0.2816138 
## Degrees of freedom: 832 total; 830 residual

0.6 Considering trial autocorrelation

# simple linear model with autocorrelation error using gls
summary(m1 <- update(m0a, correlation = corAR1(form = ~ Trial | Subject)))

## Generalized least squares fit by maximum likelihood
##   Model: RT ~ BaseFrequency 
##   Data: dta 
##        AIC      BIC    logLik
##   64.26212 83.15745 -28.13106
## 
## Correlation Structure: AR(1)
##  Formula: ~Trial | Subject 
##  Parameter estimate(s):
##       Phi 
## 0.4621332 
## 
## Coefficients:
##                   Value   Std.Error   t-value p-value
## (Intercept)    6.635164 0.021934244 302.50253   0e+00
## BaseFrequency -0.013854 0.003868156  -3.58149   4e-04
## 
##  Correlation: 
##               (Intr)
## BaseFrequency -0.699
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -1.8345660 -0.7652167 -0.1837375  0.5862265  3.1590365 
## 
## Residual standard error: 0.2811809 
## Degrees of freedom: 832 total; 830 residual

0.7 compare models

anova(m0a, m1)

##     Model df       AIC       BIC     logLik   Test L.Ratio p-value
## m0a     1  3 258.46209 272.63359 -126.23105                       
## m1      2  4  64.26212  83.15745  -28.13106 1 vs 2   196.2  <.0001

m0a未考慮自相關的效果，m1把自相關的效果考慮進去後，兩個模型比較，m1的AIC,BIC的模型配適比較好。

0.8 autocorrelation by Subject

acf.fnc(dta, group = "Subject", time = "Trial", x = "RT", plot = TRUE)

acf圖結果顯示，大部分樣本在word frequency 對 Reaction Time的時間序列的效果上，還算平穩，除了pp5、pp12比較動盪些。

0.9 residuals and Durbin-Watson test

# collect residuals from the two models above
dta <- dta %>% mutate(res0 = rstandard(m0),
                      res1 = resid(m1, type = "normalized"))

# Durbin-Watson test of serial correlation
dwtest(res0 ~ BaseFrequency, data = dta)

## 
##  Durbin-Watson test
## 
## data:  res0 ~ BaseFrequency
## DW = 1.064, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0

#
dwtest(res1 ~ BaseFrequency, data = dta)

## 
##  Durbin-Watson test
## 
## data:  res1 ~ BaseFrequency
## DW = 2.1854, p-value = 0.9963
## alternative hypothesis: true autocorrelation is greater than 0

# check acf for residuals
acf.fnc(dta, group = "Subject", time = "Trial", x = "res1", plot = TRUE)