Data Preprocessing

library(dplyr)
library(readr)
library(Hmisc)
library(data.table) #subset rows based on partial match
library(lme4)
library(knitr)

Combine all .csv files into one giant .csv file

data <- list.files(full.names = TRUE) %>% 
  lapply(read_csv) %>% 
  lapply(function(d) d[-1,]) %>% #remove first row from each file because it's empty
  bind_rows

# Check data dimention to see if merging was succeful
dim(data)

## [1] 51347    29

Data Cleaning

# Work only with relevant columns
dat<-data[,c("stimuli", "participant","key_resp_3.corr", "EgD or Cont")]

# let R know our factor variables that looked like numerics
dat[c('participant','key_resp_3.corr')]<-lapply(dat[c('participant','key_resp_3.corr')],factor)

# Column value in EgD/Cont was inconsistent due to casing (e.g., CONT or cont)
# Create uniform entries
dat<-data.frame(lapply(dat, function(v) {
  if (is.character(v)) return(toupper(v))
  else return(v)
}))

Descriptives

# Get an idea of number of participant, unique stimuli, and repetition of each stimuli for each Ss 
describe(dat)

## dat 
## 
##  4  Variables      51347  Observations
## ---------------------------------------------------------------------------
## stimuli 
##        n  missing distinct 
##    50560      787       80 
## 
## lowest : CJA1_1.JPG CJA1_2.JPG CJA1_3.JPG CJA1_4.JPG CJA1_5.JPG
## highest: CJB1_5.JPG CJB1_6.JPG CJB1_7.JPG CJB1_8.JPG CJB1_9.JPG
## ---------------------------------------------------------------------------
## participant 
##        n  missing distinct 
##    50560      787       79 
## 
## lowest : 1   2   3   4   5  , highest: 100 101 102 103 104
## ---------------------------------------------------------------------------
## key_resp_3.corr 
##        n  missing distinct 
##    25280    26067        2 
##                       
## Value          0     1
## Frequency   8138 17142
## Proportion 0.322 0.678
## ---------------------------------------------------------------------------
## EgD.or.Cont 
##        n  missing distinct 
##    46720     4627        2 
##                       
## Value       CONT   EGD
## Frequency  22400 24320
## Proportion 0.479 0.521
## ---------------------------------------------------------------------------

# Find out the number of repetitions per stimuli was seen by each subject
sorted <-dat %>%
  group_by(participant, stimuli) %>%
  summarise(no_rows = length(stimuli))

## Warning: Factor `participant` contains implicit NA, consider using
## `forcats::fct_explicit_na`

## Warning: Factor `stimuli` contains implicit NA, consider using
## `forcats::fct_explicit_na`

head(sorted)

We have 79 subjects, 80 unique stimuli.
It seems that each stimuli was being seen 8 times by each subject, HOWEVER, that’s how PsychoPy recorded data, in fact, 4 out of the 8 entries are NAs, each subject only see each stimuli 4 times.
It would have been nice to incorporate learning curve into our model, but I don’t have the information of the order each record was created for the 8 repeatitions of the same stimuli, therefore it has to be left out of the model.

Selection only Category A stimuli to work with

# Let's only look at stimuli that belong in category A for simplicity
dat2<-dat[dat$stimuli %like% "CJA", ] #Categories are labelled as CJA_xxx or CJB_xxx
head(dat2)

describe(dat2)

## dat2 
## 
##  4  Variables      25280  Observations
## ---------------------------------------------------------------------------
## stimuli 
##        n  missing distinct 
##    25280        0       40 
## 
## lowest : CJA1_1.JPG CJA1_2.JPG CJA1_3.JPG CJA1_4.JPG CJA1_5.JPG
## highest: CJA3_5.JPG CJA3_6.JPG CJA3_7.JPG CJA3_8.JPG CJA3_9.JPG
## ---------------------------------------------------------------------------
## participant 
##        n  missing distinct 
##    25280        0       79 
## 
## lowest : 1   2   3   4   5  , highest: 100 101 102 103 104
## ---------------------------------------------------------------------------
## key_resp_3.corr 
##        n  missing distinct 
##    12640    12640        2 
##                       
## Value          0     1
## Frequency   4439  8201
## Proportion 0.351 0.649
## ---------------------------------------------------------------------------
## EgD.or.Cont 
##        n  missing distinct 
##    23360     1920        2 
##                       
## Value       CONT   EGD
## Frequency  11200 12160
## Proportion 0.479 0.521
## ---------------------------------------------------------------------------

Now we are only working with Category A stimuli, and seems our Ego-depletion and control split is rather 50/50. Let’s proceed to GLMM analysis.

Dummy code categorical predictor

# dummy code ego-depletion condition (Ego_depletion = 1, Control = 0)
levels(dat2$EgD.or.Cont)<-c(0,1)
colnames(dat2)[colnames(dat2)=="EgD.or.Cont"] <- "IsEgD"

Generalized Linear Mixed Effects Model Analysis

Model without ‘stimuli’ as a predictor

# estimate the model and store results in m
m <- glmer(key_resp_3.corr ~ IsEgD +
    (1 | participant) + (1 | stimuli), 
    data = dat2, family = binomial, control = glmerControl(optimizer = "bobyqa"),
    nAGQ = 1)

# print the mod results without correlations among fixed effects
print(m, corr = FALSE)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: key_resp_3.corr ~ IsEgD + (1 | participant) + (1 | stimuli)
##    Data: dat2
##       AIC       BIC    logLik  deviance  df.resid 
## 13890.115 13919.577 -6941.057 13882.115     11676 
## Random effects:
##  Groups      Name        Std.Dev.
##  participant (Intercept) 0.5098  
##  stimuli     (Intercept) 0.6546  
## Number of obs: 11680, groups:  participant, 73; stimuli, 40
## Fixed Effects:
## (Intercept)       IsEgD1  
##      0.7519      -0.1009

Model with ‘stimuli’ as predictor

This code took about 2 minutes to run on my Linux 16.04 distribution, so don’t panic if it doesn’t finish right away.

# bobyqa optimize does not work with high-dimensional data, need to change optimizer to incorate 'stimuli' as predictor which has 40 levels.
library(nloptr)
defaultControl <- list(algorithm="NLOPT_LN_BOBYQA",xtol_rel=1e-6,maxeval=1e5)
nloptwrap2 <- function(fn,par,lower,upper,control=list(),...) {
    for (n in names(defaultControl)) 
      if (is.null(control[[n]])) control[[n]] <- defaultControl[[n]]
    res <- nloptr(x0=par,eval_f=fn,lb=lower,ub=upper,opts=control,...)
    with(res,list(par=solution,
                  fval=objective,
                  feval=iterations,
                  conv=if (status>0) 0 else status,
                  message=message))
}

system.time(test.model <- glmer(key_resp_3.corr ~ IsEgD + stimuli +
    (1 | participant) + (1 | stimuli), 
    data = dat2, 
family='binomial', control=glmerControl(optimizer="nloptwrap2")))

## boundary (singular) fit: see ?isSingular

##    user  system elapsed 
## 114.192   0.016 114.208

print(test.model, corr = FALSE)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: key_resp_3.corr ~ IsEgD + stimuli + (1 | participant) + (1 |  
##     stimuli)
##    Data: dat2
##       AIC       BIC    logLik  deviance  df.resid 
## 13800.589 14117.312 -6857.295 13714.589     11637 
## Random effects:
##  Groups      Name        Std.Dev. 
##  participant (Intercept) 5.092e-01
##  stimuli     (Intercept) 6.803e-09
## Number of obs: 11680, groups:  participant, 73; stimuli, 40
## Fixed Effects:
##        (Intercept)              IsEgD1   stimuliCJA1_2.JPG  
##             1.6876             -0.1012             -0.3137  
##  stimuliCJA1_3.JPG   stimuliCJA1_4.JPG   stimuliCJA1_5.JPG  
##            -0.3746             -0.4141             -0.3137  
##  stimuliCJA1_6.JPG   stimuliCJA1_7.JPG   stimuliCJA1_8.JPG  
##            -0.2509             -0.2295             -0.2509  
## stimuliCJA2_10.JPG  stimuliCJA2_11.JPG  stimuliCJA2_12.JPG  
##            -0.5472             -0.3945             -0.5838  
## stimuliCJA2_13.JPG  stimuliCJA2_14.JPG  stimuliCJA2_15.JPG  
##            -0.4722             -0.6019             -0.3137  
## stimuliCJA2_16.JPG   stimuliCJA2_1.JPG   stimuliCJA2_2.JPG  
##            -0.6019             -0.2509             -0.3746  
##  stimuliCJA2_3.JPG   stimuliCJA2_4.JPG   stimuliCJA2_5.JPG  
##            -0.5100             -0.7593             -0.6730  
##  stimuliCJA2_6.JPG   stimuliCJA2_7.JPG   stimuliCJA2_8.JPG  
##            -0.8264             -0.3342             -0.1859  
##  stimuliCJA2_9.JPG  stimuliCJA3_10.JPG  stimuliCJA3_11.JPG  
##            -0.6019             -1.4126             -1.3684  
## stimuliCJA3_12.JPG  stimuliCJA3_13.JPG  stimuliCJA3_14.JPG  
##            -2.0444             -1.1589             -1.8508  
## stimuliCJA3_15.JPG  stimuliCJA3_16.JPG   stimuliCJA3_1.JPG  
##            -2.1052             -1.9098             -1.9843  
##  stimuliCJA3_2.JPG   stimuliCJA3_3.JPG   stimuliCJA3_4.JPG  
##            -1.6461             -1.5150             -1.5734  
##  stimuliCJA3_5.JPG   stimuliCJA3_6.JPG   stimuliCJA3_7.JPG  
##            -1.5879             -1.7044             -1.7482  
##  stimuliCJA3_8.JPG   stimuliCJA3_9.JPG  
##            -1.5588             -2.0143  
## convergence code 0; 1 optimizer warnings; 0 lme4 warnings

Get Confidence intervals for this model

se <- sqrt(diag(vcov(test.model)))
# table of estimates with 95% CI
tab <- as.data.frame(cbind(Est = fixef(test.model), 
             LL = fixef(test.model) - 1.96 * se, 
             UL = fixef(test.model) + 1.96 * se))

# sort by descending Estimation of regression coefficient
kable(tab[order(tab$Est),])

	Est	LL	UL
stimuliCJA3_15.JPG	-2.1051734	-2.4949498	-1.7153969
stimuliCJA3_12.JPG	-2.0444008	-2.4333259	-1.6554757
stimuliCJA3_9.JPG	-2.0142608	-2.4028019	-1.6257197
stimuliCJA3_1.JPG	-1.9842673	-2.3723999	-1.5961347
stimuliCJA3_16.JPG	-1.9098469	-2.2971872	-1.5225066
stimuliCJA3_14.JPG	-1.8507923	-2.2376389	-1.4639457
stimuliCJA3_7.JPG	-1.7481944	-2.1344941	-1.3618947
stimuliCJA3_6.JPG	-1.7044188	-2.0905482	-1.3182894
stimuliCJA3_2.JPG	-1.6461498	-2.0321793	-1.2601203
stimuliCJA3_5.JPG	-1.5879145	-1.9738404	-1.2019886
stimuliCJA3_4.JPG	-1.5733526	-1.9593195	-1.1873858
stimuliCJA3_8.JPG	-1.5587847	-1.9447289	-1.1728406
stimuliCJA3_3.JPG	-1.5150451	-1.9011239	-1.1289662
stimuliCJA3_10.JPG	-1.4125895	-1.7991097	-1.0260693
stimuliCJA3_11.JPG	-1.3684232	-1.7551534	-0.9816931
stimuliCJA3_13.JPG	-1.1589209	-1.5480339	-0.7698080
stimuliCJA2_6.JPG	-0.8264096	-1.2221883	-0.4306309
stimuliCJA2_4.JPG	-0.7592594	-1.1568841	-0.3616347
stimuliCJA2_5.JPG	-0.6730131	-1.0731437	-0.2728826
stimuliCJA2_16.JPG	-0.6019088	-1.0044233	-0.1993943
stimuliCJA2_9.JPG	-0.6019086	-1.0044187	-0.1993986
stimuliCJA2_14.JPG	-0.6019077	-1.0044933	-0.1993222
stimuliCJA2_12.JPG	-0.5838082	-0.9869391	-0.1806773
stimuliCJA2_10.JPG	-0.5471901	-0.9517227	-0.1426574
stimuliCJA2_3.JPG	-0.5099837	-0.9158950	-0.1040724
stimuliCJA2_13.JPG	-0.4721549	-0.8796712	-0.0646385
stimuliCJA1_4.JPG	-0.4141474	-0.8240567	-0.0042381
stimuliCJA2_11.JPG	-0.3944532	-0.8052696	0.0163633
stimuliCJA1_3.JPG	-0.3745687	-0.7862796	0.0371423
stimuliCJA2_2.JPG	-0.3745674	-0.7862275	0.0370927
stimuliCJA2_7.JPG	-0.3342051	-0.7477663	0.0793560
stimuliCJA1_2.JPG	-0.3137142	-0.7283204	0.1008920
stimuliCJA1_5.JPG	-0.3137126	-0.7282934	0.1008681
stimuliCJA2_15.JPG	-0.3137126	-0.7282854	0.1008601
stimuliCJA2_1.JPG	-0.2509015	-0.6687080	0.1669049
stimuliCJA1_6.JPG	-0.2509013	-0.6686418	0.1668392
stimuliCJA1_8.JPG	-0.2509007	-0.6685904	0.1667891
stimuliCJA1_7.JPG	-0.2294964	-0.6483248	0.1893320
stimuliCJA2_8.JPG	-0.1859219	-0.6070896	0.2352457
IsEgD1	-0.1012429	-0.3491373	0.1466515
(Intercept)	1.6875543	1.3360443	2.0390643

Things to consider, we did not consider learning that happened across the 4 blocks for each subject, in this analysis we consider the 4 repetitions of the same stimuli as equal. I also need to do a back transformation to transform linear regression coefficent to proportion terms, then the interpretation could be simpler such that the estimate would be the change in the probability of getting to a correct answer.

GLMM analysis of L-shaped task stimuli learning

Generalized Linear Mixed Effects Model/Multilevel Logistic Regression