# load packages
pacman::p_load(languageR, lattice, tidyverse, nlme, lmtest)
# load data
data(heid, package="languageR")
# see first 6 lines of data
head(heid)
## Subject Word RT BaseFrequency
## 1 pp1 basaalheid 6.69 3.56
## 2 pp1 markantheid 6.81 5.16
## 3 pp1 ontroerdheid 6.51 5.55
## 4 pp1 contentheid 6.58 4.50
## 5 pp1 riantheid 6.86 4.53
## 6 pp1 tembaarheid 6.35 0.00
# check data structure
str(heid)
## 'data.frame': 832 obs. of 4 variables:
## $ Subject : Factor w/ 26 levels "pp1","pp10","pp11",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Word : Factor w/ 40 levels "aftandsheid",..: 4 24 28 10 33 38 25 5 16 30 ...
## $ RT : num 6.69 6.81 6.51 6.58 6.86 6.35 6.69 7.17 6.58 6.98 ...
## $ BaseFrequency: num 3.56 5.16 5.55 4.5 4.53 0 1.61 3.61 2.56 5.86 ...
heid$Trial <- as.numeric(unlist(apply(as.matrix(table(heid$Subject)), 1, seq)))
str(heid)
## 'data.frame': 832 obs. of 5 variables:
## $ Subject : Factor w/ 26 levels "pp1","pp10","pp11",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Word : Factor w/ 40 levels "aftandsheid",..: 4 24 28 10 33 38 25 5 16 30 ...
## $ RT : num 6.69 6.81 6.51 6.58 6.86 6.35 6.69 7.17 6.58 6.98 ...
## $ BaseFrequency: num 3.56 5.16 5.55 4.5 4.53 0 1.61 3.61 2.56 5.86 ...
## $ Trial : num 1 2 3 4 5 6 7 8 9 10 ...
變項Trial針對每個樣本重複測量進行序位編號
# select a subset of variables for current example
dta <- heid %>% select(Subject, Trial, RT, BaseFrequency)
# set plot theme
ot <- theme_set(theme_bw())
# word frequency effect
ggplot(data = dta, aes(x = BaseFrequency, y = RT)) +
geom_point() +
stat_smooth(method="lm") +
labs(x = "Word Frequency", y = "Reaction Time (transformed)")
## `geom_smooth()` using formula 'y ~ x'
word frequency 對 Reaction Time的ordinary regression。由於未考量樣本重複測量的效果,word frequency 對 Reaction Time散佈圖與回歸線明顯不fit
# trial effect by subject
ggplot(data = dta, aes(x = Trial, y = RT)) +
geom_point(pch =1, size = rel(.5)) +
geom_line() +
facet_wrap(~ Subject) +
labs(x = "Trial ID", y = "Reaction Time (transformed)")
每個樣本在word frequency 對 Reaction Time的效果
# simple linear model ignoring trial dependence - least-squares
summary(m0 <- lm(RT ~ BaseFrequency, data = dta))
##
## Call:
## lm(formula = RT ~ BaseFrequency, data = dta)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.51664 -0.21576 -0.05251 0.15991 0.89027
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.641720 0.020730 320.392 < 2e-16 ***
## BaseFrequency -0.015470 0.004701 -3.291 0.00104 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.282 on 830 degrees of freedom
## Multiple R-squared: 0.01288, Adjusted R-squared: 0.01169
## F-statistic: 10.83 on 1 and 830 DF, p-value: 0.001041
# gls Model method use ML
summary(m0a <- gls(RT ~ BaseFrequency, data = dta, method = "ML"))
## Generalized least squares fit by maximum likelihood
## Model: RT ~ BaseFrequency
## Data: dta
## AIC BIC logLik
## 258.4621 272.6336 -126.231
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 6.64172 0.020730012 320.3915 0.000
## BaseFrequency -0.01547 0.004701192 -3.2908 0.001
##
## Correlation:
## (Intr)
## BaseFrequency -0.882
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.8345863 -0.7661546 -0.1864644 0.5678517 3.1613281
##
## Residual standard error: 0.2816138
## Degrees of freedom: 832 total; 830 residual
# simple linear model with autocorrelation error using gls
summary(m1 <- update(m0a, correlation = corAR1(form = ~ Trial | Subject)))
## Generalized least squares fit by maximum likelihood
## Model: RT ~ BaseFrequency
## Data: dta
## AIC BIC logLik
## 64.26212 83.15745 -28.13106
##
## Correlation Structure: AR(1)
## Formula: ~Trial | Subject
## Parameter estimate(s):
## Phi
## 0.4621332
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 6.635164 0.021934244 302.50253 0e+00
## BaseFrequency -0.013854 0.003868156 -3.58149 4e-04
##
## Correlation:
## (Intr)
## BaseFrequency -0.699
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.8345660 -0.7652167 -0.1837375 0.5862265 3.1590365
##
## Residual standard error: 0.2811809
## Degrees of freedom: 832 total; 830 residual
anova(m0a, m1)
## Model df AIC BIC logLik Test L.Ratio p-value
## m0a 1 3 258.46209 272.63359 -126.23105
## m1 2 4 64.26212 83.15745 -28.13106 1 vs 2 196.2 <.0001
m0a未考慮自相關的效果,m1把自相關的效果考慮進去後,兩個模型比較,m1的AIC,BIC的模型配適比較好。
acf.fnc(dta, group = "Subject", time = "Trial", x = "RT", plot = TRUE)
acf圖結果顯示,大部分樣本在word frequency 對 Reaction Time的時間序列的效果上,還算平穩,除了pp5、pp12比較動盪些。
# collect residuals from the two models above
dta <- dta %>% mutate(res0 = rstandard(m0),
res1 = resid(m1, type = "normalized"))
# Durbin-Watson test of serial correlation
dwtest(res0 ~ BaseFrequency, data = dta)
##
## Durbin-Watson test
##
## data: res0 ~ BaseFrequency
## DW = 1.064, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
#
dwtest(res1 ~ BaseFrequency, data = dta)
##
## Durbin-Watson test
##
## data: res1 ~ BaseFrequency
## DW = 2.1854, p-value = 0.9963
## alternative hypothesis: true autocorrelation is greater than 0
# check acf for residuals
acf.fnc(dta, group = "Subject", time = "Trial", x = "res1", plot = TRUE)