library(tidyverse)
library(zoo)
library(knitr) #kable() for tables
library(tibble)
library(plyr) #rbind.fill() for stacking the Pavlovia csv files
library(dplyr)
library(data.table) #fread() and setnames()
library(Hmisc) #describe()
library(grt) #glc(), gcjc(), grg() category learning model fits
library(reshape) #reshape data from long to wide
library(MASS) #backward elimination of non-significant predictors in linear models
library(nnet) #multinomial logistic regression
#logLik method for grg fits: extract the stored log-likelihood and give it class "logLik" so AIC()/BIC() can use it (needed for this to work on Mac)
logLik.grg <- function(object, ...)
{
  val <- object$logLik
  class(val) <- "logLik"
  val
}

Qualtrics Data

df <- read.csv("qualtrics_pilot.csv", header=T)
df<-df %>% drop_na("exist_in_Pav") #drop people who did not complete Pavlovia part

Cleaning - Extract Demographic and ATQ items

df<-df %>% mutate(across(where(is.numeric), ~ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x))) #fill NAs in numeric columns with the column average

# which(colnames(df)=="NA.1_1") #get column number to extract
# which(colnames(df)=="EX.77R_1")
cols <- c(1, 5:12, 18:79)
df<-df[,cols] #extract only ATQ columns
cols.num <- c(10:71) #find ATQ columns
df[cols.num] <- sapply(df[cols.num],as.numeric) #convert to numerics
#sapply(df, class)

names(df)[2:9] <- c("Ethnicity", "Age", "Gender", "Education_Completed","Country_origin","Country_residence", "Disability_status","Disability_spec")

Reverse coding

indx <- grepl('R', colnames(df)) #find reverse-keyed items (their column names contain 'R')
df[,indx]<-abs(df[,indx]-8) #reverse code the 1-7 scale: x becomes 8 - x
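
A minimal check of the 8 - x reverse coding, using made-up responses on the 1-7 scale:

x <- c(1, 4, 7) #hypothetical raw responses
abs(x - 8) #reverse coded: 7 4 1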

Get average factor scale scores

NA_indx <- grepl('NA', colnames(df)) #find Neg_Affect items
NA_sum <- rowSums(df[, NA_indx]) #total score of NA items for each participant
df$NA_score <- NA_sum/25 #append average NA scores to df
EC_indx <- grepl('EC', colnames(df)) #find EC items
EC_sum <- rowSums(df[, EC_indx]) #total score of EC items for each participant
df$EC_score <- EC_sum/19 #append EC scores to df
EX_indx <- grepl('EX', colnames(df)) #find EX items
EX_sum <- rowSums(df[, EX_indx]) #total score of EX items for each participant
df$EX_score <- EX_sum/17 #append EX scores to df
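
The same scale means can be cross-checked with rowMeans(), which avoids hard-coding the item counts (a sketch; assumes NA_indx, EC_indx, and EX_indx flag exactly the 25, 19, and 17 scale items used above):

all.equal(unname(df$NA_score), unname(rowMeans(df[, which(NA_indx)]))) #TRUE if NA_indx flags the 25 NA items
all.equal(unname(df$EC_score), unname(rowMeans(df[, which(EC_indx)]))) #TRUE if EC_indx flags the 19 EC items
all.equal(unname(df$EX_score), unname(rowMeans(df[, which(EX_indx)]))) #TRUE if EX_indx flags the 17 EX items
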
#subj 3302ma completed 11 blocks out of 12, will be removed
df <- df %>% filter(!UniqueID == "3302ma")

df_temp <-df[,c(1:9, 72:74)] #keep UniqueID, demographics, and the three temperament scale scores

Pavlovia Data

setwd( "/Users/tzhu9/Desktop/pilotData_analysis/pilot_Pavlovia_data/completed_pilot_data")

csv <- list.files(full.names = TRUE, pattern = '\\.csv$') #all completed Pavlovia csv files in the working directory

df_Pav <- as_tibble(rbind.fill(lapply(csv, fread))) #read each participant's file and stack them into one tibble
#subset dataset

# which(colnames(df_Pav)=="exp_Sentence") 
# which(colnames(df_Pav)=="catlearn_corrAns")
cols1 <- c(1, 85:132)
df_Pav<-df_Pav[,cols1]
# which(colnames(df_Pav)=="exp_typedWord")
# which(colnames(df_Pav)=="numberWords_recalled")
# which(colnames(df_Pav)=="key_resp_20.keys")
# which(colnames(df_Pav)=="stim_file")
cols2 <- c(1:6,12,20:21,31:33,40:47, 49)
df_Pav<-df_Pav[,cols2] 
#fill empty cells with NA
df_Pav <- df_Pav %>%
  mutate(across(everything(), ~ifelse(.x == "", NA, as.character(.x))))

#remove empty rows
df_Pav<-df_Pav[!is.na(df_Pav$exp_Sentence) | !is.na(df_Pav$exp_typedWord) | !is.na(df_Pav$key_resp_20.keys),]

Create RSpan DataFrame

#which(colnames(df_Pav)=="sentence_num_total")
df_rspan<-df_Pav[, 1:9] #subset relevant cols
df_rspan<-df_rspan[!is.na(df_rspan$exp_Sentence) | !is.na(df_rspan$exp_typedWord),] #keep rows with either judgment or recall responses
cols4 <- c(4, 6, 8:9)
df_rspan[cols4] <- sapply(df_rspan[cols4],as.numeric)

RSpan Data Cleaning

#keep only judgment related rows
accuracy <- df_rspan %>% 
  filter(!is.na(exp_isCorrect))

#late responses on judgment are filled with rt of 4.5 
accuracy$key_resp_10.rt[is.na(accuracy$key_resp_10.rt)] <- 4.5
#keep only recalled_word related rows
recall_num <- df_rspan %>% 
  filter(!is.na(numberWords_recalled))

RSpan Performance data ready for later merging

#mean judgment accuracy and RT per participant
df1<-aggregate(accuracy[, c(4,6)], list(accuracy$UniqueID), mean)
#total number of words recalled per participant
df2<-aggregate(recall_num[, 8], list(recall_num$UniqueID), sum)
df_rspan_perf <- merge(df1,df2,by="Group.1")
names(df_rspan_perf)[1] <- "UniqueID"
head(df_rspan_perf)
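
The aggregate()/merge() steps above can also be written with dplyr as a cross-check (a sketch; df_rspan_perf_check should match df_rspan_perf up to row order):

df_rspan_perf_check <- accuracy %>%
  group_by(UniqueID) %>%
  dplyr::summarise(exp_isCorrect = mean(exp_isCorrect),
                   key_resp_10.rt = mean(key_resp_10.rt)) %>%
  left_join(recall_num %>%
              group_by(UniqueID) %>%
              dplyr::summarise(numberWords_recalled = sum(numberWords_recalled)),
            by = "UniqueID")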

Merge temperament df with Rspan df

df_tem_rspan<-merge(df_temp, df_rspan_perf, by = "UniqueID")
kable(head(df_tem_rspan))
| UniqueID | Ethnicity | Age | Gender | Education_Completed | Country_origin | Country_residence | Disability_status | Disability_spec | NA_score | EC_score | EX_score | exp_isCorrect | key_resp_10.rt | numberWords_recalled |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0202ch | White or Caucasian | 18 | Male | High school | Canada | Canada | No |  | 3.56 | 5.473684 | 4.823529 | 0.9833333 | 1.519284 | 48 |
| 0602ly | Asian or Pacific Islander | 18 | Male | High school | Pakistan | Canada | No |  | 3.64 | 3.736842 | 6.117647 | 0.9333333 | 2.737075 | 42 |
| 0902be | Asian or Pacific Islander | 18 | Male | High school | China | Canada | No |  | 4.28 | 4.157895 | 4.235294 | 0.9000000 | 1.172533 | 35 |
| 1001re | Not listed here | 19 | Female | High school | Canada | Canada | No |  | 4.68 | 3.105263 | 4.529412 | 0.9000000 | 1.774508 | 44 |
| 1502ha | South Asian | 18 | Male | High school | Sri Lanka | Canada | No |  | 4.00 | 5.210526 | 3.823529 | 0.9500000 | 2.029060 | 55 |
| 2202li | White or Caucasian | 18 | Female | High school | Canada | Canada | No |  | 4.56 | 4.526316 | 4.588235 | 0.9500000 | 1.197451 | 41 |

Create Catlearn DataFrame

cols3<- c(1, 10:21)
df_catlearn<-df_Pav[, cols3] #remove irrelevant cols
df_catlearn<-df_catlearn[!is.na(df_catlearn$key_resp_20.keys),] #remove empty rows

df_catlearn <- df_catlearn %>% filter(!UniqueID == "3302ma")#incomplete catlearn data
df_catlearn$block <- rep(1:12, times=25, each = 30) #12 blocks x 30 trials for each of the 25 participants (assumes rows are in presentation order)
df_catlearn<- df_catlearn[, c(1, 14, 2:13)] #move block next to UniqueID

#describe(df_catlearn)
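
The block labels above rely on the rows being in a fixed order (25 participants x 12 blocks x 30 trials); a sketch of a within-participant consistency check under that assumption:

chk <- df_catlearn %>%
  group_by(UniqueID) %>%
  mutate(block_check = ceiling(dplyr::row_number() / 30)) %>% #30 trials per block
  ungroup()
all(chk$block_check == chk$block) #TRUE if every participant's trials are in presentation order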

Category Learning Strategy Fitting

#relabel catlearn_corrAns (a/b) as 1 and 2
df_catlearn <- df_catlearn %>% mutate(catlearn_corrAns = case_when(
  catlearn_corrAns == "a" ~ 1,
  catlearn_corrAns == "b" ~ 2))

#relabel key_resp_20.keys (a/b) as 1 and 2
df_catlearn <- df_catlearn %>% mutate(key_resp_20.keys = case_when(
  key_resp_20.keys == "a" ~ 1,
  key_resp_20.keys == "b" ~ 2))

cols5<-c(4, 5, 7:13)
df_catlearn[cols5] <- sapply(df_catlearn[cols5],as.numeric)
dat<-df_catlearn

# #template code for combining blocks 
# # cols6<-c(1, 3:14)
# # dat<-df_catlearn[, cols6]
# # dat$block <-rep(1:6, times = 24, each=60)
# # source("plot.gcjc.R")
# 
blocks <- levels(factor(dat$block))
sbj <- levels(factor(dat$UniqueID))
models <- c("Orientation", "Length", "Information Integration", "Conjunctive Rule", "Fixed Random", "General Random")
#results table: one row per participant x block, with AIC and BIC for each model plus the winning model under each criterion
res <- matrix(nrow=length(sbj) * length(blocks),ncol=(length(models) * 2 + 2))
res <- cbind(expand.grid(sbj,blocks), res)
colnames(res)[1:2] <- c("UniqueID","block")
pdf(file="25ss_individual_plots.pdf", width=12, height=9)
op <- par(mfrow = c(3, 4), pty = "m") #3 x 4 panels per page, one panel per block, one page per participant

for(i in sbj)
{
  for(j in blocks)
  {
    data <- subset(dat, dat$block == j & dat$UniqueID == i)
    fit.ori <- glc(key_resp_20.keys ~ ori_transform, category=data$catlearn_corrAns, data=data)
    fit.length <- glc(key_resp_20.keys ~ length, category=data$catlearn_corrAns, data=data)
    fit.2d <- glc(key_resp_20.keys ~ length + ori_transform, category=data$catlearn_corrAns, data=data)
    fit.cj <- gcjc(key_resp_20.keys ~ length + ori_transform,
                   category=data$catlearn_corrAns, data=data,
                   equal.noise = TRUE, config = 2,
                   fixed = list(noise = c(TRUE, FALSE), bias = c(FALSE, FALSE)))
    fit.frg <- grg(data$catlearn_corrAns, fixed=T)
    fit.grg <- grg(data$catlearn_corrAns, fixed=F)
    AICs <- AIC(fit.ori, fit.length, fit.2d, fit.cj, fit.frg, fit.grg)
    BICs <- AIC(fit.ori, fit.length, fit.2d, fit.cj, fit.frg, fit.grg, k=log(80)) #BIC via AIC() with k = log(n); n is set to 80 here, although each block contains 30 trials
    winner <- which.min(AICs$AIC) #best-fitting model by AIC
    winner2 <- which.min(BICs$AIC) #best-fitting model by BIC (AIC() labels the column 'AIC' regardless of k)
    res[res$UniqueID==i & res$block == j,3:ncol(res)] <- c(round(AICs$AIC,2), models[winner], round(BICs$AIC,2), models[winner2])
  ##### now plot them: decision bounds are colored by the BIC winner ('winner2') rather than the AIC winner
    col <- rep("black",4)
    col[winner2] <- "red" #highlight the winning model's bound (the random models have no bound to highlight)
    bg <- c("blue","red")[factor(data$key_resp_20.keys)] #point fill by response
    pch <- c(21,24)[factor(data$catlearn_corrAns)] #point shape by correct category
    title <- paste(i, ": block", j, ": ", models[winner2])
    plot(ori_transform ~ length, data=data, xlab="Length", ylab="Orientation", bg=bg, pch=pch, xlim=c(0,300), ylim=c(0, 110), main=title)
    abline(h=coef(fit.ori), col = col[1])
    abline(v=coef(fit.length), col = col[2])
    abline(coef(fit.2d), col = col[3])
    cj.coef <- coef(fit.cj)
    abline(h=cj.coef[["ori_transform"]], col = col[4], lty = 3)
    abline(v=cj.coef[["length"]], col = col[4], lty = 3)
  }
}
## Warning in glc(key_resp_20.keys ~ length + ori_transform, category = data$catlearn_corrAns, : nlminb problem, convergence error code = 1
##   message = iteration limit reached without convergence (10)
dev.off()
## quartz_off_screen 
##                 2
res.nam <- c("ori","length","2d","cj","frg","grg","winner")
colnames(res)[3:16] <- c(paste(res.nam, "AIC", sep="."), paste(res.nam, "BIC", sep="."))
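
Because the model labels and fit statistics are combined into a single character vector when filling each row of res, the AIC/BIC columns end up stored as text; a small sketch for converting them back to numeric if they are needed as numbers later:

fit_cols <- setdiff(grep("\\.(AIC|BIC)$", colnames(res), value = TRUE),
                    c("winner.AIC", "winner.BIC"))
res[fit_cols] <- lapply(res[fit_cols], as.numeric)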

Catlearn Accuracy Performance

catlearn_perf<- dat %>%
    group_by(UniqueID, block) %>%
    dplyr::summarize(Mean = mean(key_resp_20.corr, na.rm=TRUE))

head(catlearn_perf)
wide1<-cast(catlearn_perf, UniqueID~block) #catlearn_perf long to wide for later merging

Descriptives of Participants' Strategy Use

# count the number of times each model was the best-fitting model for each participant
model_block <- res %>% 
  group_by(UniqueID, winner.BIC) %>%
  dplyr::summarise(Count = n())

#how many blocks for each S were best fitted by the CR model
head(model_block[with(model_block, winner.BIC == "Conjunctive Rule" ), ])
# find the first block in which each S used the optimal CR strategy
first_cj<- res %>% 
  group_by(UniqueID, block, winner.BIC) %>%
  dplyr::summarise(Count = n())

a<-first_cj[first_cj$winner.BIC == "Conjunctive Rule",]
first<-a %>% 
    group_by(UniqueID) %>% 
    slice(which.min(block))

head(first)

Combine Temperament and RSpan data with Catlearn data

#merge df_tem_rspan that has temperament scores and rspan performance with catlearn model fit descriptives
b<-merge(first, df_tem_rspan, all = TRUE)
cols<-c(1:2, 5:18)
b<-b[, cols]
colnames(b)[2] <- "first_CR_block"
#reshape model_block from long to wide; this df contains the number of blocks best fitted by each strategy
wide<-cast(model_block, UniqueID ~ winner.BIC)
b<-merge(wide,b,all =TRUE)
#fill NAs with 0 so that models that never won a block for a participant get a count of 0
b[is.na(b)] <- 0
complete<-setnames(b, old = c('Conjunctive Rule','Fixed Random','Information Integration', 'Length','Orientation'), new = c('total_CR','total_Rand','total_II','total_length','total_ori'))
# generate a column that keeps count of the total number of blocks best fitted by rule-based models, whether 1D or 2D
complete$total_rule<-complete$total_CR+complete$total_length + complete$total_ori

# merge with catlearn accuracy performance of each block
complete<-merge(complete, wide1, by = "UniqueID")

Find and remove random respondents

# remove Ss based on performance: participants with more than 6 of the 12 blocks below 60% accuracy are dropped
dt<-complete[,23:34] #the 12 block-accuracy columns
complete$num_low_perf<-rowSums(dt < 0.60) #number of blocks below 60% accuracy
complete$avg_cat_perf <- rowSums(dt)/12 #average accuracy across the 12 blocks
complete<-complete[!complete$num_low_perf > 6,]
#generate columns that split each temperament factor at the median into low/high levels for potential analyses
complete<-complete %>% mutate(NA_level = cut(NA_score, quantile(NA_score, c(0, 0.5, 1)), labels = c('Low', 'High'), include.lowest = TRUE))

complete<-complete %>% mutate(EC_level = cut(EC_score, quantile(EC_score, c(0, 0.5, 1)), labels = c('Low', 'High'), include.lowest = TRUE))

complete<-complete %>% mutate(EX_level = cut(EX_score, quantile(EX_score, c(0, 0.5, 1)), labels = c('Low', 'High'), include.lowest = TRUE))

cols2<-c(1,8:18, 37:39, 19:21, 23:34, 2:7, 22)
complete<-complete[,cols2]

complete<-setnames(complete, old = c('1','2','3','4', '5','6','7','8','9','10',"11","12"), new = c('block_1','block_2','block_3','block_4', 'block_5','block_6','block_7','block_8','block_9','block_10','block_11','block_12'))

head(complete)

Proposed Analyses

Use first strategy block as default strategy

#find the first block with a non-random winning strategy and use it as the default strategy

winnerBIC<-res[,c(1, 2, 16)] #UniqueID, block, and the BIC-winning model
default_strat<-cast(winnerBIC, UniqueID ~ block)
default_strat<-setnames(default_strat, old = c('1','2','3','4', '5','6','7','8','9','10',"11","12"), new = c('block_1','block_2','block_3','block_4', 'block_5','block_6','block_7','block_8','block_9','block_10','block_11','block_12'))

# find the first non-random block and track its winning model and block number
func <- function(..., val = "Fixed Random") {
  dat <- unlist(list(...))
  ind <- which.max(dat != val) #first block whose winner is not 'Fixed Random' (falls back to block 1 if all are)
  list(dat[[ind]], ind)
}
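
To make func() concrete, a toy call with hypothetical block winners (if every block were 'Fixed Random', which.max() would fall back to block 1):

func("Fixed Random", "Fixed Random", "Length", "Conjunctive Rule")
#returns list("Length", 3): the first non-random winner and its block number
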
new<-setNames(
  do.call(rbind.data.frame, do.call(Map, c(list(f=func), default_strat[,-1]))),
  c("default_strat", "first_nonRand_block"))

default_strat <- cbind(new$default_strat, new$first_nonRand_block, default_strat)

default_strat<-setnames(default_strat, old = c("new$default_strat", "new$first_nonRand_block"), new = c("default_strat", "first_nonRand_block"))

#code the default strategy as rule-based/explicit (1: Length, Orientation, Conjunctive Rule) vs. implicit (0: Information Integration)
default_strat$default_strat <- ifelse(default_strat$default_strat=="Length", "1",
        ifelse(default_strat$default_strat=="Orientation", "1",
        ifelse(default_strat$default_strat=="Conjunctive Rule", "1",
        ifelse(default_strat$default_strat=="Information Integration", "0",
                        NA  ))))

default_strat$default_strat<-as.numeric(default_strat$default_strat)
default_strat<-merge(default_strat[,1:3], complete, by="UniqueID")
default_strat<-default_strat[, c(1, 4:39, 2:3)]

head(default_strat)

Binomial Logistic Regression with Implicit/Explicit Strategy as DV

Only 3 of the 23 Ss used II as their default strategy, so this analysis can only be run once more data are available.

#model did not converge with very few IIs in the column
# fit_first_nonrand <-glm(default_strat ~ EX_score + NA_score + EC_score+ key_resp_10.rt + numberWords_recalled + exp_isCorrect, default_strat, family="binomial")

Strategy transition point

# look at which block Ss started using the optimal strategy, i.e., their learning rate
pot_df<-default_strat[!is.na(default_strat[, 36]), ] #removes Ss who never used CR

pot_df[,c(10:12,36)] <- sapply(pot_df[,c(10:12,36)], as.numeric)

pot_fit <- lm(first_CR_block ~ EC_score + EX_score + NA_score + key_resp_10.rt + exp_isCorrect + numberWords_recalled, data = pot_df) 

step_pot<-step(pot_fit)
summary(step_pot)
## 
## Call:
## lm(formula = first_CR_block ~ EC_score + numberWords_recalled, 
##     data = pot_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4347 -1.4709 -0.0765  1.4958  3.5948 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           4.60359    3.38440   1.360  0.19261   
## EC_score             -1.89674    0.68024  -2.788  0.01315 * 
## numberWords_recalled  0.22093    0.06841   3.229  0.00524 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.054 on 16 degrees of freedom
## Multiple R-squared:  0.4629, Adjusted R-squared:  0.3958 
## F-statistic: 6.896 on 2 and 16 DF,  p-value: 0.006921

Effortful control significantly predicted learning rate: higher EC was associated with faster learning of the optimal strategy. WMC also significantly predicted learning rate; interestingly, the results suggest that recalling more words was associated with a slower learning rate.

Ability to discover optimal strategy

#create a column that tracks whether participants ever used optimal strategy in any of the 12 blocks
default_strat$first_CR_block<-as.numeric(default_strat$first_CR_block)
default_strat$ifOpti[is.na(default_strat$first_CR_block)] <- 0
default_strat$ifOpti[default_strat$first_CR_block > 0] <-1
default_strat$ifOpti <- as.factor(default_strat$ifOpti)
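
A quick tally of how the sample splits on this variable (a sketch):

table(default_strat$ifOpti) #0 = never best fit by the Conjunctive Rule model, 1 = optimal strategy used in at least one block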

m <-glm(ifOpti ~ EX_score + NA_score + EC_score+ key_resp_10.rt + numberWords_recalled + exp_isCorrect, default_strat, family="binomial")

step_m <- stepAIC(m, direction="both")
summary(step_m)
## 
## Call:
## glm(formula = ifOpti ~ NA_score, family = "binomial", data = default_strat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7229   0.2454   0.3756   0.6276   1.1619  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   12.041      7.672   1.570    0.117
## NA_score      -2.480      1.764  -1.406    0.160
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 21.254  on 22  degrees of freedom
## Residual deviance: 18.621  on 21  degrees of freedom
## AIC: 22.621
## 
## Number of Fisher Scoring iterations: 5

No predictor significantly predicted whether participants discovered the optimal strategy, although the stepwise model retained NA_score. The current data suggest a trend in which higher Negative Affect is associated with lower log odds of discovering the optimal strategy.

Strategy Consistency

Total number of blocks using optimal strategy

m3<-lm(total_CR ~ EX_score + NA_score + EC_score+ key_resp_10.rt + numberWords_recalled + exp_isCorrect, default_strat)

step_m3<-step(m3)
summary(step_m3)
## 
## Call:
## lm(formula = total_CR ~ NA_score + EC_score + key_resp_10.rt + 
##     numberWords_recalled, data = default_strat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6302 -1.1980  0.4832  0.9290  1.9637 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           1.66779    4.76838   0.350  0.73058   
## NA_score             -1.05711    0.70462  -1.500  0.15088   
## EC_score              1.81141    0.51931   3.488  0.00262 **
## key_resp_10.rt        1.09765    0.60786   1.806  0.08771 . 
## numberWords_recalled -0.09771    0.04340  -2.252  0.03708 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.401 on 18 degrees of freedom
## Multiple R-squared:  0.5308, Adjusted R-squared:  0.4265 
## F-statistic: 5.091 on 4 and 18 DF,  p-value: 0.006367

Effortful control significantly predicted how consistently participants applied the optimal strategy throughout the task: higher EC was associated with more use of the optimal strategy. NA was not removed from the model and suggests a negative relationship with use of the optimal CR strategy (higher NA, less CR use). WMC was also significant: higher WMC was associated with less CR strategy use.

Performance after discovery of optimal strategy

# find the first block in which the optimal strategy was used, then calculate mean accuracy from that block through the end of the task
cols <- grep('block_', names(pot_df), value = TRUE) #use the df with first_CR_block == NA rows removed
n <- length(cols)

pot_df$per_afterOpti <- mapply(function(x, y) mean(unlist(pot_df[x, cols[y:n]])), seq(nrow(pot_df)), pot_df$first_CR_block)
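
The same quantity can be recomputed rowwise with dplyr as a cross-check (a sketch; assumes cols lists block_1 through block_12 in block order and that first_CR_block indexes into them):

check <- pot_df %>%
  rowwise() %>%
  mutate(per_afterOpti_check = mean(c_across(all_of(cols))[first_CR_block:length(cols)])) %>%
  ungroup()
all.equal(check$per_afterOpti_check, pot_df$per_afterOpti) #should be TRUE
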
m4 <- lm(per_afterOpti ~ EC_score+ key_resp_10.rt + numberWords_recalled + exp_isCorrect, pot_df)

step_m4 <- step(m4)
summary(step_m4)
## 
## Call:
## lm(formula = per_afterOpti ~ EC_score + key_resp_10.rt + numberWords_recalled, 
##     data = pot_df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.148345 -0.051042  0.006609  0.042259  0.121577 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)          0.269830   0.173607   1.554   0.1410  
## EC_score             0.057372   0.028243   2.031   0.0603 .
## key_resp_10.rt       0.058327   0.034602   1.686   0.1125  
## numberWords_recalled 0.003619   0.002427   1.491   0.1567  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07283 on 15 degrees of freedom
## Multiple R-squared:  0.3641, Adjusted R-squared:  0.2369 
## F-statistic: 2.863 on 3 and 15 DF,  p-value: 0.07181

Effortful control approached significance: higher EC was associated with higher accuracy after discovering the optimal strategy, i.e., more consistent application of the optimal strategy.