Like the last homework, we are interested in determing what behavioral factors might determine ones ability to elicit an ERP component indicative of predictive language processing. This data was collected from older Spanish speaking adults in the San Antonio Area.
From the behavioral data, we have demogrpahic information such as age and education. We also tested them on various cogntive assays to determine their language fluency, memory abilities and other cognitive abilities.
We load our behavioral data dems.csv and our ERP data ERP.csv. We will fit all of these variables into a linear model to determine any variance inflation factors.
library(car)
## Loading required package: carData
dems<-read.csv("ESPS_dems.csv",header=TRUE,sep="\t")
ERP<-read.csv("ESPS_ERP_Dist2.csv",header=TRUE,sep = "\t")
ESPS<-merge(ERP,dems,all.x=TRUE,by.x="Subject")
linfit<-lm(Data~Education+Age+female+WJ7_SS+WJ9_SS+VFE_cat_raw+VFE_let_raw+VFS_cat_raw+VFS_let_raw+MMSE+Span_Read_Raw+Digit_Raw+BNTS_raw+BNTE_raw,data=ESPS)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Recode demographics and behavioral data
dems$Education<-scale(dems$Education,center=TRUE,scale=TRUE)
dems$Age<-scale(dems$Age,center=TRUE,scale=TRUE)
dems$female<-dplyr::recode_factor(dems$female, f = 1, m = 0)
dems$WJ7<-scale(dems$WJ7_SS,center=TRUE,scale=TRUE)
dems$WJ9<-scale(dems$WJ9_SS,center=TRUE,scale=TRUE)
dems$VFE_cat<-scale(dems$VFE_cat_raw,center=TRUE,scale=TRUE)
dems$VFE_let<-scale(dems$VFE_let_raw,center=TRUE,scale=TRUE)
dems$VFS_cat<-scale(dems$VFS_cat_raw,center=TRUE,scale=TRUE)
dems$VFS_let<-scale(dems$VFS_let_raw,center=TRUE,scale=TRUE)
dems$MMSE<-scale(dems$MMSE,center=TRUE,scale=TRUE)
dems$Span_Read<-scale(dems$Span_Read_Raw,center=TRUE,scale=TRUE)
dems$Digit<-scale(dems$Digit_Raw,center=TRUE,scale=TRUE)
dems$BNTS<-scale(dems$BNTS_raw,center=TRUE,scale=TRUE)
dems$BNTE<-scale(dems$BNTE_raw,center=TRUE,scale=TRUE)
#Recoding ERP data
ERP$Gen_Effect<-NA #initialize gender effect variable
ERP$Gen_Effect[ERP$bins_1==2]<-ERP$Data[ERP$bins_1==1]-ERP$Data[ERP$bins_1==2] #calculate the gender effect in half the variables
ERP$Laterality[is.na(ERP$horiz)]<-NA
ERP$Lat<-as.factor(ERP$horiz) # 1 - Left Lat. ; 2 - Left Med. ; 3 - Med. ; 4- Right Med. ; 5 - Right Lat.
ERP$Anteriority[is.na(ERP$ante)]<-NA
ERP$Ant<-as.factor(ERP$ante) # 1 - prefrontal; 2 - frontal; 3 - central; 4 - occipital
#Merge data sets
dems<-dplyr::select(dems,Subject,Education,Age,female,WJ7,WJ9,VFE_cat,VFE_let,VFS_cat,VFS_let,MMSE,Span_Read,Digit,BNTS,BNTE)
ERP<-dplyr::select(ERP,Subject,Gen_Effect,Lat,Ant)
ESPS<-merge(ERP,dems,all.y=TRUE)
ESPS<-ESPS[complete.cases(ESPS),]
vif(linfit)
## Education Age female WJ7_SS WJ9_SS
## 4.281792 3.989441 2.446150 4.001824 6.596070
## VFE_cat_raw VFE_let_raw VFS_cat_raw VFS_let_raw MMSE
## 3.610352 3.905663 3.306244 5.941388 2.199270
## Span_Read_Raw Digit_Raw BNTS_raw BNTE_raw
## 1.824761 1.987265 2.999248 4.574751
Because we have so many variables that are testing cognitive abilities, it’s not surprising that we have inflated variables. I will use PCA to reduce the number of variables going into my model and reduce my inflated variance.
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(ggplot2)
library(pander)
ESPS.pc<-prcomp(~WJ7+WJ9+VFE_cat+VFE_let+VFS_cat+VFS_let+MMSE+Span_Read+Digit+BNTS+BNTE,data=ESPS,retx=T)
summary(ESPS.pc)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.1063 1.3497 1.00995 0.93844 0.80373 0.74440
## Proportion of Variance 0.4179 0.1716 0.09608 0.08295 0.06085 0.05219
## Cumulative Proportion 0.4179 0.5895 0.68552 0.76847 0.82932 0.88151
## PC7 PC8 PC9 PC10 PC11
## Standard deviation 0.64366 0.54658 0.49895 0.4134 0.35358
## Proportion of Variance 0.03902 0.02814 0.02345 0.0161 0.01178
## Cumulative Proportion 0.92054 0.94868 0.97213 0.9882 1.00000
ESPS.pc$rotation
## PC1 PC2 PC3 PC4 PC5
## WJ7 0.3301543 -0.17867244 -0.01539182 0.11890995 -0.3469868
## WJ9 0.3960508 -0.06813436 0.16236907 0.16008439 -0.0410340
## VFE_cat 0.2700245 -0.43606585 -0.35183078 -0.15553662 0.2575752
## VFE_let 0.3783753 -0.06300577 -0.27444318 0.12713420 -0.2220177
## VFS_cat 0.3007764 0.21167505 0.02116009 -0.57452020 0.4282242
## VFS_let 0.3721898 0.12138719 0.23592087 -0.40269113 -0.1127410
## MMSE 0.1301607 0.42889643 -0.50516425 -0.22105932 -0.5080909
## Span_Read 0.2342503 0.44873991 0.07231054 0.20415179 0.3895891
## Digit 0.2402000 0.29169455 -0.28218557 0.55581373 0.2805395
## BNTS 0.2707186 0.10391043 0.61475046 0.15384459 -0.2527572
## BNTE 0.2937166 -0.47808661 -0.01491489 0.06710313 0.1169120
## PC6 PC7 PC8 PC9 PC10
## WJ7 0.703104165 -0.053904590 0.37670985 -0.10925994 0.26814215
## WJ9 0.134273113 -0.004388282 -0.48022883 0.71999130 -0.04931573
## VFE_cat -0.184788441 0.057000923 0.33871763 0.08825211 -0.25771769
## VFE_let -0.129881976 -0.150954019 -0.56876061 -0.57390614 -0.01055544
## VFS_cat 0.086833916 0.172575522 -0.13020049 -0.06756285 0.53972770
## VFS_let 0.104729872 0.194539836 0.06453896 -0.12989785 -0.66785583
## MMSE -0.280180144 -0.098970254 0.19467095 0.30473726 0.11221545
## Span_Read 0.056639927 -0.690026468 0.19114437 -0.03568590 -0.15141966
## Digit -0.006293101 0.584259419 0.11678832 -0.03598994 -0.02143706
## BNTS -0.472718817 0.146802484 0.25022442 -0.10675266 0.21038280
## BNTE -0.335974725 -0.233600261 0.13209669 0.06519365 0.20386227
## PC11
## WJ7 -0.03589700
## WJ9 -0.12385132
## VFE_cat -0.54431561
## VFE_let -0.13951213
## VFS_cat -0.02616729
## VFS_let 0.31674329
## MMSE 0.07239911
## Span_Read -0.07064449
## Digit 0.18211319
## BNTS -0.29522570
## BNTE 0.66035020
The PCA analysis shows 3 components with eigenvalues over 1, suggesting at least 3 real components.
For PC1, there exists only positive associations. Because all the measures put into analysis can be classified as general “cognitive performance”, this could be what the component is capturing.
For PC2, the All the English measures are negatively associated (VFE and BNTE), but the Woodcock Johnson studies are also slightly negatively associated. The WJ measure language comprehension and vocabulary, and were tested in Spanish. The strongest positive associations are for the MMSE, which is a battery of tests for assessing cognitnive impairment due to neurodegeneration, and the Span_Read, which measuring working memory in Spanish sentences. These two assessments share some overlap in working memory, as does Digit, which is a number working memory assessment.
PC3 looks like more of a mixed bag. The strongest negative associations are the MMSE The strongest positive association is for the BNTS (Boston Naming Test: Spanish), where participants name increasingly complex pictureslike
cor(ESPS.pc$x[,1:3])
## PC1 PC2 PC3
## PC1 1.000000e+00 4.439159e-16 1.163639e-15
## PC2 4.439159e-16 1.000000e+00 5.805172e-16
## PC3 1.163639e-15 5.805172e-16 1.000000e+00
The components have pretty much 0 correlation with one another, suggesting they are indeed orthogonal
ESPS<-cbind(ESPS,data.frame(ESPS.pc$x[,1:3]))
round(cor(ESPS[,c("WJ7","WJ9","VFE_cat","VFE_let","VFS_cat","VFS_let","MMSE","Span_Read","Digit","BNTS","BNTE","PC1","PC2","PC3")],method="spearman"),3)
## WJ7 WJ9 VFE_cat VFE_let VFS_cat VFS_let MMSE Span_Read
## WJ7 1.000 0.533 0.335 0.511 0.358 0.505 0.107 0.342
## WJ9 0.533 1.000 0.455 0.563 0.295 0.623 0.045 0.359
## VFE_cat 0.335 0.455 1.000 0.628 0.257 0.398 -0.016 0.126
## VFE_let 0.511 0.563 0.628 1.000 0.305 0.515 0.352 0.237
## VFS_cat 0.358 0.295 0.257 0.305 1.000 0.639 0.327 0.474
## VFS_let 0.505 0.623 0.398 0.515 0.639 1.000 0.268 0.353
## MMSE 0.107 0.045 -0.016 0.352 0.327 0.268 1.000 0.270
## Span_Read 0.342 0.359 0.126 0.237 0.474 0.353 0.270 1.000
## Digit 0.312 0.311 0.508 0.467 0.447 0.345 0.282 0.464
## BNTS 0.247 0.455 0.112 0.188 0.224 0.521 -0.087 0.223
## BNTE 0.388 0.551 0.761 0.581 0.093 0.333 -0.184 -0.001
## PC1 0.719 0.746 0.658 0.789 0.588 0.802 0.246 0.475
## PC2 -0.043 -0.082 -0.437 -0.076 0.389 0.243 0.582 0.554
## PC3 -0.024 0.134 -0.342 -0.396 -0.019 0.127 -0.571 0.044
## Digit BNTS BNTE PC1 PC2 PC3
## WJ7 0.312 0.247 0.388 0.719 -0.043 -0.024
## WJ9 0.311 0.455 0.551 0.746 -0.082 0.134
## VFE_cat 0.508 0.112 0.761 0.658 -0.437 -0.342
## VFE_let 0.467 0.188 0.581 0.789 -0.076 -0.396
## VFS_cat 0.447 0.224 0.093 0.588 0.389 -0.019
## VFS_let 0.345 0.521 0.333 0.802 0.243 0.127
## MMSE 0.282 -0.087 -0.184 0.246 0.582 -0.571
## Span_Read 0.464 0.223 -0.001 0.475 0.554 0.044
## Digit 1.000 0.176 0.169 0.555 0.329 -0.386
## BNTS 0.176 1.000 0.270 0.465 0.141 0.615
## BNTE 0.169 0.270 1.000 0.612 -0.648 -0.031
## PC1 0.555 0.465 0.612 1.000 0.043 -0.106
## PC2 0.329 0.141 -0.648 0.043 1.000 -0.027
## PC3 -0.386 0.615 -0.031 -0.106 -0.027 1.000
Now I am somewhat confident in saying that PC1 reflects general cognitive processes, PC2 reflects working memory, and its possible that PC3 reflects language fluency.
Using these components should reduce the inflated variance in my measures. I will test this again
comfit<-lm(Gen_Effect~Education+Age+female+PC1+PC2+PC3,data=ESPS)
print("VIF for Individual Measures")
## [1] "VIF for Individual Measures"
vif(linfit)
## Education Age female WJ7_SS WJ9_SS
## 4.281792 3.989441 2.446150 4.001824 6.596070
## VFE_cat_raw VFE_let_raw VFS_cat_raw VFS_let_raw MMSE
## 3.610352 3.905663 3.306244 5.941388 2.199270
## Span_Read_Raw Digit_Raw BNTS_raw BNTE_raw
## 1.824761 1.987265 2.999248 4.574751
print("VIF for Components")
## [1] "VIF for Components"
vif(comfit)
## Education Age female PC1 PC2 PC3
## 2.533410 1.370043 1.153172 2.628885 1.378155 1.076915
library(lme4)
fit.lm<-lmer(Gen_Effect~Education+Age+female+PC1+PC2+PC3+(1|Subject),data=ESPS)
summary(fit.lm)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Gen_Effect ~ Education + Age + female + PC1 + PC2 + PC3 + (1 |
## Subject)
## Data: ESPS
##
## REML criterion at convergence: 3691.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.6129 -0.5719 -0.0031 0.5484 3.5107
##
## Random effects:
## Groups Name Variance Std.Dev.
## Subject (Intercept) 0.473 0.6877
## Residual 1.471 1.2129
## Number of obs: 1120, groups: Subject, 28
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.13698 0.16778 0.816
## Education -0.01874 0.21870 -0.086
## Age -0.29108 0.16083 -1.810
## female0 0.08416 0.31025 0.271
## PC1 -0.08138 0.10391 -0.783
## PC2 -0.20673 0.11741 -1.761
## PC3 0.03106 0.13871 0.224
##
## Correlation of Fixed Effects:
## (Intr) Eductn Age femal0 PC1 PC2
## Education -0.060
## Age 0.003 0.166
## female0 -0.594 0.101 -0.005
## PC1 0.114 -0.765 -0.266 -0.192
## PC2 0.145 -0.201 0.395 -0.244 0.118
## PC3 0.111 0.101 0.168 -0.187 -0.076 0.088
I will be creating a Bayesian model using default priors with a 95% CI
#Using default priors
library(brms)
## Loading required package: Rcpp
## Loading 'brms' package (version 2.5.0). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
## Run theme_set(theme_default()) to use the default bayesplot theme.
##
## Attaching package: 'brms'
## The following object is masked from 'package:lme4':
##
## ngrps
## The following object is masked from 'package:survival':
##
## kidney
library(rstan)
## Loading required package: StanHeaders
## rstan (Version 2.18.1, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
Sys.time()
## [1] "2018-11-05 16:55:42 CST"
fit.1<-brm(Gen_Effect~Education+Age+female+PC1+PC2+PC3,data=ESPS,family=gaussian(),warmup=5000,iter=20000,chains=4,cores=2,seed=1992,refresh=0,thin=5)
## Compiling the C++ model
## Start sampling
Sys.time()
## [1] "2018-11-05 16:57:16 CST"
summary(fit.1)
## Family: gaussian
## Links: mu = identity; sigma = identity
## Formula: Gen_Effect ~ Education + Age + female + PC1 + PC2 + PC3
## Data: ESPS (Number of observations: 1120)
## Samples: 4 chains, each with iter = 20000; warmup = 5000; thin = 5;
## total post-warmup samples = 12000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## Intercept 0.14 0.05 0.04 0.23 11620 1.00
## Education -0.02 0.07 -0.15 0.11 12487 1.00
## Age -0.29 0.05 -0.39 -0.20 11418 1.00
## female0 0.08 0.09 -0.10 0.26 11707 1.00
## PC1 -0.08 0.03 -0.14 -0.02 12013 1.00
## PC2 -0.21 0.04 -0.28 -0.14 12233 1.00
## PC3 0.03 0.04 -0.05 0.11 11892 1.00
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sigma 1.35 0.03 1.30 1.41 11736 1.00
##
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample
## is a crude measure of effective sample size, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(fit.1,N=2)
It looks like our factors converged nicely given the largely uninformative priors. I just used to default priors, however. I do want to be able to feel more confident in choosing what prior distributions would best work with my data.
It appear that the expectancy effect (our depenent measure) decreases as people get older, which does seem to make sense with the larger literature suggesting that older adults have greater difficulty rapidly using sentence context to predict upcoming words.
The effect also decreased with our first two components, general cognitive ability (PC1) and working memory (PC2). However, it does not make sense that a the effect would increase as these factors declined, as it is believed that prediction is a resource and processing intensive procedure, so the opposite relationship would be expected. A similar pattern was seen when I previously performed simple pair correlations between my cognitive measures and the same effect.
I will run the same bayeisan model with a 90% CI
posterior_interval(fit.1,prob = 0.9,type = "central")
## 5% 95%
## b_Intercept 5.582512e-02 2.195780e-01
## b_Education -1.280609e-01 8.933661e-02
## b_Age -3.703838e-01 -2.119423e-01
## b_female0 -6.971473e-02 2.347382e-01
## b_PC1 -1.329364e-01 -2.993327e-02
## b_PC2 -2.646913e-01 -1.488955e-01
## b_PC3 -3.822170e-02 9.939491e-02
## sigma 1.307564e+00 1.401242e+00
## lp__ -1.936928e+03 -1.930507e+03
Here we see the same pattern with the 95% credible interval.
Now I will run the model again with alternate priors to test for robustisity
altpriors<-c(set_prior("normal(0,1)",class="b"),
set_prior("normal(0,1)",class="Intercept"),
set_prior("inv_gamma(0.5,0.5)",class="sigma"))
Sys.time()
## [1] "2018-11-05 16:57:22 CST"
fit.alt<-brm(Gen_Effect~Education+Age+female+PC1+PC2+PC3,data=ESPS,family=gaussian(),prior=altpriors,warmup=5000,iter=20000,chains=4,cores=2,seed=1992,refresh=0,thin=5)
## Compiling the C++ model
## Start sampling
Sys.time()
## [1] "2018-11-05 16:58:14 CST"
summary(fit.1)
## Family: gaussian
## Links: mu = identity; sigma = identity
## Formula: Gen_Effect ~ Education + Age + female + PC1 + PC2 + PC3
## Data: ESPS (Number of observations: 1120)
## Samples: 4 chains, each with iter = 20000; warmup = 5000; thin = 5;
## total post-warmup samples = 12000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## Intercept 0.14 0.05 0.04 0.23 11620 1.00
## Education -0.02 0.07 -0.15 0.11 12487 1.00
## Age -0.29 0.05 -0.39 -0.20 11418 1.00
## female0 0.08 0.09 -0.10 0.26 11707 1.00
## PC1 -0.08 0.03 -0.14 -0.02 12013 1.00
## PC2 -0.21 0.04 -0.28 -0.14 12233 1.00
## PC3 0.03 0.04 -0.05 0.11 11892 1.00
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sigma 1.35 0.03 1.30 1.41 11736 1.00
##
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample
## is a crude measure of effective sample size, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(fit.1,N=2)
Again, the model seemed to converge correctly again, which is a good first step.
We also see the same pattern of signiticant factors in the model with different priors, suggesting the model is robust.
Determining model specifications using WAI and loo.
#WAIC first
WAIC1<-WAIC(fit.1)
WAICalt<-WAIC(fit.alt)
compare_ic(WAIC1,WAICalt,ic=c("waic"))
## WAIC SE
## fit.1 3862.45 55.95
## fit.alt 3862.41 56.02
## fit.1 - fit.alt 0.05 0.10
#Loo #2
brms::loo(fit.1,fit.alt)
## LOOIC SE
## fit.1 3862.46 55.95
## fit.alt 3862.42 56.02
## fit.1 - fit.alt 0.05 0.10
There is very little difference in the WAIC and LOO scores between these two models, suggesting the model is indeed robust.