Data Preprocessing

library(dplyr)
library(readr)
library(Hmisc)
library(data.table) #subset rows based on partial match
library(lme4)
library(knitr)

Combine all .csv files into one giant .csv file

data <- list.files(full.names = TRUE) %>% 
  lapply(read_csv) %>% 
  lapply(function(d) d[-1,]) %>% #remove first row from each file because it's empty
  bind_rows 
# Check data dimention to see if merging was succeful
dim(data)
## [1] 51347    29

Data Cleaning

# Work only with relevant columns
dat<-data[,c("stimuli", "participant","key_resp_3.corr", "EgD or Cont")]

# let R know our factor variables that looked like numerics
dat[c('participant','key_resp_3.corr')]<-lapply(dat[c('participant','key_resp_3.corr')],factor)

# Column value in EgD/Cont was inconsistent due to casing (e.g., CONT or cont)
# Create uniform entries
dat<-data.frame(lapply(dat, function(v) {
  if (is.character(v)) return(toupper(v))
  else return(v)
}))

Descriptives

# Get an idea of number of participant, unique stimuli, and repetition of each stimuli for each Ss 
describe(dat)
## dat 
## 
##  4  Variables      51347  Observations
## ---------------------------------------------------------------------------
## stimuli 
##        n  missing distinct 
##    50560      787       80 
## 
## lowest : CJA1_1.JPG CJA1_2.JPG CJA1_3.JPG CJA1_4.JPG CJA1_5.JPG
## highest: CJB1_5.JPG CJB1_6.JPG CJB1_7.JPG CJB1_8.JPG CJB1_9.JPG
## ---------------------------------------------------------------------------
## participant 
##        n  missing distinct 
##    50560      787       79 
## 
## lowest : 1   2   3   4   5  , highest: 100 101 102 103 104
## ---------------------------------------------------------------------------
## key_resp_3.corr 
##        n  missing distinct 
##    25280    26067        2 
##                       
## Value          0     1
## Frequency   8138 17142
## Proportion 0.322 0.678
## ---------------------------------------------------------------------------
## EgD.or.Cont 
##        n  missing distinct 
##    46720     4627        2 
##                       
## Value       CONT   EGD
## Frequency  22400 24320
## Proportion 0.479 0.521
## ---------------------------------------------------------------------------
# Find out the number of repetitions per stimuli was seen by each subject
sorted <-dat %>%
  group_by(participant, stimuli) %>%
  summarise(no_rows = length(stimuli))
## Warning: Factor `participant` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `stimuli` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(sorted)

We have 79 subjects, 80 unique stimuli.
It seems that each stimuli was being seen 8 times by each subject, HOWEVER, that’s how PsychoPy recorded data, in fact, 4 out of the 8 entries are NAs, each subject only see each stimuli 4 times.
It would have been nice to incorporate learning curve into our model, but I don’t have the information of the order each record was created for the 8 repeatitions of the same stimuli, therefore it has to be left out of the model.

Selection only Category A stimuli to work with

# Let's only look at stimuli that belong in category A for simplicity
dat2<-dat[dat$stimuli %like% "CJA", ] #Categories are labelled as CJA_xxx or CJB_xxx
head(dat2)
describe(dat2)
## dat2 
## 
##  4  Variables      25280  Observations
## ---------------------------------------------------------------------------
## stimuli 
##        n  missing distinct 
##    25280        0       40 
## 
## lowest : CJA1_1.JPG CJA1_2.JPG CJA1_3.JPG CJA1_4.JPG CJA1_5.JPG
## highest: CJA3_5.JPG CJA3_6.JPG CJA3_7.JPG CJA3_8.JPG CJA3_9.JPG
## ---------------------------------------------------------------------------
## participant 
##        n  missing distinct 
##    25280        0       79 
## 
## lowest : 1   2   3   4   5  , highest: 100 101 102 103 104
## ---------------------------------------------------------------------------
## key_resp_3.corr 
##        n  missing distinct 
##    12640    12640        2 
##                       
## Value          0     1
## Frequency   4439  8201
## Proportion 0.351 0.649
## ---------------------------------------------------------------------------
## EgD.or.Cont 
##        n  missing distinct 
##    23360     1920        2 
##                       
## Value       CONT   EGD
## Frequency  11200 12160
## Proportion 0.479 0.521
## ---------------------------------------------------------------------------

Now we are only working with Category A stimuli, and seems our Ego-depletion and control split is rather 50/50. Let’s proceed to GLMM analysis.

Dummy code categorical predictor

# dummy code ego-depletion condition (Ego_depletion = 1, Control = 0)
levels(dat2$EgD.or.Cont)<-c(0,1)
colnames(dat2)[colnames(dat2)=="EgD.or.Cont"] <- "IsEgD"

Generalized Linear Mixed Effects Model Analysis

Model without ‘stimuli’ as a predictor

# estimate the model and store results in m
m <- glmer(key_resp_3.corr ~ IsEgD +
    (1 | participant) + (1 | stimuli), 
    data = dat2, family = binomial, control = glmerControl(optimizer = "bobyqa"),
    nAGQ = 1)

# print the mod results without correlations among fixed effects
print(m, corr = FALSE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: key_resp_3.corr ~ IsEgD + (1 | participant) + (1 | stimuli)
##    Data: dat2
##       AIC       BIC    logLik  deviance  df.resid 
## 13890.115 13919.577 -6941.057 13882.115     11676 
## Random effects:
##  Groups      Name        Std.Dev.
##  participant (Intercept) 0.5098  
##  stimuli     (Intercept) 0.6546  
## Number of obs: 11680, groups:  participant, 73; stimuli, 40
## Fixed Effects:
## (Intercept)       IsEgD1  
##      0.7519      -0.1009

Model with ‘stimuli’ as predictor

This code took about 2 minutes to run on my Linux 16.04 distribution, so don’t panic if it doesn’t finish right away.

# bobyqa optimize does not work with high-dimensional data, need to change optimizer to incorate 'stimuli' as predictor which has 40 levels.
library(nloptr)
defaultControl <- list(algorithm="NLOPT_LN_BOBYQA",xtol_rel=1e-6,maxeval=1e5)
nloptwrap2 <- function(fn,par,lower,upper,control=list(),...) {
    for (n in names(defaultControl)) 
      if (is.null(control[[n]])) control[[n]] <- defaultControl[[n]]
    res <- nloptr(x0=par,eval_f=fn,lb=lower,ub=upper,opts=control,...)
    with(res,list(par=solution,
                  fval=objective,
                  feval=iterations,
                  conv=if (status>0) 0 else status,
                  message=message))
}

system.time(test.model <- glmer(key_resp_3.corr ~ IsEgD + stimuli +
    (1 | participant) + (1 | stimuli), 
    data = dat2, 
family='binomial', control=glmerControl(optimizer="nloptwrap2")))
## boundary (singular) fit: see ?isSingular
##    user  system elapsed 
## 114.192   0.016 114.208
print(test.model, corr = FALSE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: key_resp_3.corr ~ IsEgD + stimuli + (1 | participant) + (1 |  
##     stimuli)
##    Data: dat2
##       AIC       BIC    logLik  deviance  df.resid 
## 13800.589 14117.312 -6857.295 13714.589     11637 
## Random effects:
##  Groups      Name        Std.Dev. 
##  participant (Intercept) 5.092e-01
##  stimuli     (Intercept) 6.803e-09
## Number of obs: 11680, groups:  participant, 73; stimuli, 40
## Fixed Effects:
##        (Intercept)              IsEgD1   stimuliCJA1_2.JPG  
##             1.6876             -0.1012             -0.3137  
##  stimuliCJA1_3.JPG   stimuliCJA1_4.JPG   stimuliCJA1_5.JPG  
##            -0.3746             -0.4141             -0.3137  
##  stimuliCJA1_6.JPG   stimuliCJA1_7.JPG   stimuliCJA1_8.JPG  
##            -0.2509             -0.2295             -0.2509  
## stimuliCJA2_10.JPG  stimuliCJA2_11.JPG  stimuliCJA2_12.JPG  
##            -0.5472             -0.3945             -0.5838  
## stimuliCJA2_13.JPG  stimuliCJA2_14.JPG  stimuliCJA2_15.JPG  
##            -0.4722             -0.6019             -0.3137  
## stimuliCJA2_16.JPG   stimuliCJA2_1.JPG   stimuliCJA2_2.JPG  
##            -0.6019             -0.2509             -0.3746  
##  stimuliCJA2_3.JPG   stimuliCJA2_4.JPG   stimuliCJA2_5.JPG  
##            -0.5100             -0.7593             -0.6730  
##  stimuliCJA2_6.JPG   stimuliCJA2_7.JPG   stimuliCJA2_8.JPG  
##            -0.8264             -0.3342             -0.1859  
##  stimuliCJA2_9.JPG  stimuliCJA3_10.JPG  stimuliCJA3_11.JPG  
##            -0.6019             -1.4126             -1.3684  
## stimuliCJA3_12.JPG  stimuliCJA3_13.JPG  stimuliCJA3_14.JPG  
##            -2.0444             -1.1589             -1.8508  
## stimuliCJA3_15.JPG  stimuliCJA3_16.JPG   stimuliCJA3_1.JPG  
##            -2.1052             -1.9098             -1.9843  
##  stimuliCJA3_2.JPG   stimuliCJA3_3.JPG   stimuliCJA3_4.JPG  
##            -1.6461             -1.5150             -1.5734  
##  stimuliCJA3_5.JPG   stimuliCJA3_6.JPG   stimuliCJA3_7.JPG  
##            -1.5879             -1.7044             -1.7482  
##  stimuliCJA3_8.JPG   stimuliCJA3_9.JPG  
##            -1.5588             -2.0143  
## convergence code 0; 1 optimizer warnings; 0 lme4 warnings

Get Confidence intervals for this model

se <- sqrt(diag(vcov(test.model)))
# table of estimates with 95% CI
tab <- as.data.frame(cbind(Est = fixef(test.model), 
             LL = fixef(test.model) - 1.96 * se, 
             UL = fixef(test.model) + 1.96 * se))

# sort by descending Estimation of regression coefficient
kable(tab[order(tab$Est),])
Est LL UL
stimuliCJA3_15.JPG -2.1051734 -2.4949498 -1.7153969
stimuliCJA3_12.JPG -2.0444008 -2.4333259 -1.6554757
stimuliCJA3_9.JPG -2.0142608 -2.4028019 -1.6257197
stimuliCJA3_1.JPG -1.9842673 -2.3723999 -1.5961347
stimuliCJA3_16.JPG -1.9098469 -2.2971872 -1.5225066
stimuliCJA3_14.JPG -1.8507923 -2.2376389 -1.4639457
stimuliCJA3_7.JPG -1.7481944 -2.1344941 -1.3618947
stimuliCJA3_6.JPG -1.7044188 -2.0905482 -1.3182894
stimuliCJA3_2.JPG -1.6461498 -2.0321793 -1.2601203
stimuliCJA3_5.JPG -1.5879145 -1.9738404 -1.2019886
stimuliCJA3_4.JPG -1.5733526 -1.9593195 -1.1873858
stimuliCJA3_8.JPG -1.5587847 -1.9447289 -1.1728406
stimuliCJA3_3.JPG -1.5150451 -1.9011239 -1.1289662
stimuliCJA3_10.JPG -1.4125895 -1.7991097 -1.0260693
stimuliCJA3_11.JPG -1.3684232 -1.7551534 -0.9816931
stimuliCJA3_13.JPG -1.1589209 -1.5480339 -0.7698080
stimuliCJA2_6.JPG -0.8264096 -1.2221883 -0.4306309
stimuliCJA2_4.JPG -0.7592594 -1.1568841 -0.3616347
stimuliCJA2_5.JPG -0.6730131 -1.0731437 -0.2728826
stimuliCJA2_16.JPG -0.6019088 -1.0044233 -0.1993943
stimuliCJA2_9.JPG -0.6019086 -1.0044187 -0.1993986
stimuliCJA2_14.JPG -0.6019077 -1.0044933 -0.1993222
stimuliCJA2_12.JPG -0.5838082 -0.9869391 -0.1806773
stimuliCJA2_10.JPG -0.5471901 -0.9517227 -0.1426574
stimuliCJA2_3.JPG -0.5099837 -0.9158950 -0.1040724
stimuliCJA2_13.JPG -0.4721549 -0.8796712 -0.0646385
stimuliCJA1_4.JPG -0.4141474 -0.8240567 -0.0042381
stimuliCJA2_11.JPG -0.3944532 -0.8052696 0.0163633
stimuliCJA1_3.JPG -0.3745687 -0.7862796 0.0371423
stimuliCJA2_2.JPG -0.3745674 -0.7862275 0.0370927
stimuliCJA2_7.JPG -0.3342051 -0.7477663 0.0793560
stimuliCJA1_2.JPG -0.3137142 -0.7283204 0.1008920
stimuliCJA1_5.JPG -0.3137126 -0.7282934 0.1008681
stimuliCJA2_15.JPG -0.3137126 -0.7282854 0.1008601
stimuliCJA2_1.JPG -0.2509015 -0.6687080 0.1669049
stimuliCJA1_6.JPG -0.2509013 -0.6686418 0.1668392
stimuliCJA1_8.JPG -0.2509007 -0.6685904 0.1667891
stimuliCJA1_7.JPG -0.2294964 -0.6483248 0.1893320
stimuliCJA2_8.JPG -0.1859219 -0.6070896 0.2352457
IsEgD1 -0.1012429 -0.3491373 0.1466515
(Intercept) 1.6875543 1.3360443 2.0390643

Things to consider, we did not consider learning that happened across the 4 blocks for each subject, in this analysis we consider the 4 repetitions of the same stimuli as equal. I also need to do a back transformation to transform linear regression coefficent to proportion terms, then the interpretation could be simpler such that the estimate would be the change in the probability of getting to a correct answer.