library(tidyverse)
library(zoo)
library(knitr)
library(tibble)
library(plyr)
library(dplyr)
library(data.table)
library(Hmisc)
library(grt)
library(reshape) #reshape data from long to wide
library(MASS) #backward elimination of non-significant predictors in linear models
#define an S3 logLik method for grg fits so that AIC()/BIC() work (needed for this to run on Mac)
logLik.grg <- function(object, ...)
{
  val <- object$logLik
  class(val) <- "logLik"
  val
}

Qualtrics Data

df <- read.csv("qualtrics_ii.csv", header=T)

Cleaning - Extract Demographic and ATQ items

cols.num <- c(12:73) #ATQ item columns
df[cols.num] <- sapply(df[cols.num], as.numeric) #convert ATQ items to numeric before imputing
df <- df %>% mutate(across(where(is.numeric), ~ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x))) #fill NAs in numeric columns with the column mean
#sapply(df, class)

Reverse coding

indx <- grepl('R', colnames(df)) #find reverse-coded items (column names containing 'R')
df[,indx] <- abs(df[,indx] - 8) #reverse code: 8 - x flips the 1-7 response scale
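#quick sanity check of the transform on hypothetical 7-point ATQ responses: 1 -> 7, 4 -> 4, 7 -> 1
abs(c(1, 4, 7) - 8)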

Get average factor scale scores

NA_indx <- grepl('NA', colnames(df)) #find Neg_Affect items
NA_sum <- rowSums(df[, NA_indx]) #total score of NA items for each participant
# df$NA_score <- NA_sum/25 #append average NA scores to df
df$NA_score <- NA_sum
EC_indx <- grepl('EC', colnames(df)) #find EC items
EC_sum <- rowSums(df[, EC_indx]) #total score of EC items for each participant
# df$EC_score <- EC_sum/19 #append EC scores to df
df$EC_score <- EC_sum
EX_indx <- grepl('EX', colnames(df)) #find EX items
EX_sum <- rowSums(df[, EX_indx]) #total score of EX items for each participant
# df$EX_score <- EX_sum/17 #append EX scores to df
df$EX_score <- EX_sum
df_temp <- df[,c(1:11, 74:76)] #keep demographics plus the three factor scores
#split each temperament factor at the median into Low/High levels for potential analyses (a Low/Medium/High tertile version is sketched after this chunk)
df_temp<-df_temp %>% mutate(NA_level = cut(NA_score, quantile(NA_score, c(0, .5, 1)), labels = c('Low',  'High'), include.lowest = TRUE))

df_temp<-df_temp %>% mutate(EC_level = cut(EC_score, quantile(EC_score, c(0, .5, 1)), labels = c('Low',  'High'), include.lowest = TRUE))

df_temp<-df_temp %>% mutate(EX_level = cut(EX_score, quantile(EX_score, c(0, .5, 1)), labels = c('Low',  'High'), include.lowest = TRUE))
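#if a true Low/Medium/High split is wanted later, a tertile version would look like this (sketch, not run here; NA_level3 is a hypothetical column name):
# df_temp <- df_temp %>% mutate(NA_level3 = cut(NA_score, quantile(NA_score, c(0, 1/3, 2/3, 1)), labels = c('Low', 'Medium', 'High'), include.lowest = TRUE))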

Pavlovia Data

setwd( "/Users/tzhu9/Desktop/pilotData_ii/completed_rspan_ii")

csv <- list.files(full.names = TRUE, pattern = '\\.csv$') #all Pavlovia csv files in the working directory

df_Pav <- as_tibble(rbind.fill(lapply(csv, fread))) #read and row-bind all files; rbind.fill pads missing columns with NA
#subset dataset

# which(colnames(df_Pav)=="exp_Sentence")
# which(colnames(df_Pav)=="catlearn_corrAns")
cols1 <- c(1, 85:131)
df_Pav<-df_Pav[,cols1]
# which(colnames(df_Pav)=="exp_typedWord")
# which(colnames(df_Pav)=="numberWords_recalled")
# which(colnames(df_Pav)=="key_resp_20.keys")
# which(colnames(df_Pav)=="II_stim_file")
cols2 <- c(1:6,12,20:21,31:33,40:48)
df_Pav<-df_Pav[,cols2] 
#fill empty cells with NA
df_Pav <- df_Pav %>%
  mutate(across(everything(), ~ifelse(.x == "", NA, as.character(.x))))

#remove empty rows
df_Pav<-df_Pav[!is.na(df_Pav$exp_Sentence) | !is.na(df_Pav$exp_typedWord) | !is.na(df_Pav$key_resp_20.keys),]

Create RSpan DataFrame

# which(colnames(df_Pav)=="sentence_num_total")
df_rspan<-df_Pav[, 1:9] #subset relevant cols
df_rspan<-df_rspan[!is.na(df_rspan$exp_Sentence) | !is.na(df_rspan$exp_typedWord),] #keep rows with either judgment or recall responses
cols4 <- c(4, 6, 8:9)
df_rspan[cols4] <- sapply(df_rspan[cols4],as.numeric)

RSpan Data Cleaning

#keep only judgment related rows
accuracy <- df_rspan %>% 
  filter(!is.na(exp_isCorrect))

#late responses on judgment are filled with rt of 4.5 
accuracy$key_resp_10.rt[is.na(accuracy$key_resp_10.rt)] <- 4.5
#keep only recalled_word related rows
recall_num <- df_rspan %>% 
  filter(!is.na(numberWords_recalled))

RSpan Performance data ready for later merging

#judgment dataframe: mean accuracy and rt for each participant
df1 <- aggregate(accuracy[, c(4,6)], list(accuracy$UniqueID), mean)
#recall dataframe: total number of words recalled by each participant
df2 <- aggregate(recall_num[, 8], list(recall_num$UniqueID), sum)
df_rspan_perf <- merge(df1,df2,by="Group.1")
names(df_rspan_perf)[1] <- "UniqueID"
head(df_rspan_perf)

Merge temperament df with Rspan df

df_tem_rspan<-merge(df_temp, df_rspan_perf, by = "UniqueID")
head(df_tem_rspan)

Create Catlearn DataFrame

cols3<- c(1, 10:21)
df_catlearn<-df_Pav[, cols3] #remove irrelevant cols
df_catlearn<-df_catlearn[!is.na(df_catlearn$key_resp_20.keys),] #remove empty rows

df_catlearn$block <- rep(1:12, times=26, each = 30) #assign block numbers: 26 participants x 12 blocks of 30 trials (hardcoded; see the sketch below for an alternative)
df_catlearn <- df_catlearn[, c(1, 14, 2:13)] #move the block column next to UniqueID
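#a less brittle alternative (sketch, not run here): derive block from each participant's trial position,
#assuming rows are in trial order with 30 trials per block, instead of hardcoding 26 participants
# df_catlearn <- df_catlearn %>% group_by(UniqueID) %>% mutate(block = (row_number() - 1) %/% 30 + 1) %>% ungroup()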

#describe(df_catlearn)

Category Learning Strategy Fitting

#recode the correct answers (catlearn_corrAns) from a/b to 1/2
df_catlearn <- (df_catlearn %>% mutate(catlearn_corrAns = case_when(
  catlearn_corrAns == "a" ~ 1,
  catlearn_corrAns == "b" ~ 2))
)

#recode the responses (key_resp_20.keys) from a/b to 1/2
df_catlearn <- (df_catlearn %>% mutate(key_resp_20.keys = case_when(
  key_resp_20.keys == "a" ~ 1,
  key_resp_20.keys == "b" ~ 2))
)

cols5<-c(4, 5, 7:13)
df_catlearn[cols5] <- sapply(df_catlearn[cols5],as.numeric)
dat<-df_catlearn

#template code for combining blocks
# cols6<-c(1, 3:14)
# dat<-df_catlearn[, cols6]
# dat$block <-rep(1:6, times = 24, each=60)
# source("plot.gcjc.R")

blocks <- levels(factor(dat$block))
sbj <- levels(factor(dat$UniqueID))
models <- c("Orientation", "Length", "Information Integration", "Conjunctive Rule", "Fixed Random", "General Random")
res <- matrix(nrow=length(sbj) * length(blocks),ncol=(length(models) * 2 + 2))
res <- cbind(expand.grid(sbj,blocks), res)
colnames(res)[1:2] <- c("UniqueID","block")
pdf(file="26s_ii_individual_plots.pdf", width=12, height=9)
op <- par(mfrow = c(3, 4), pty = "m")

for(i in sbj)
{
  for(j in blocks)
  {
    data <- subset(dat, dat$block == j & dat$UniqueID == i)
    fit.ori <- glc(key_resp_20.keys ~ ori_transform, category=data$catlearn_corrAns, data=data)
    fit.length <- glc(key_resp_20.keys ~ length, category=data$catlearn_corrAns, data=data)
    fit.2d <- glc(key_resp_20.keys ~ length + ori_transform, category=data$catlearn_corrAns, data=data)
    fit.cj <- gcjc(key_resp_20.keys ~ length + ori_transform,
                   category=data$catlearn_corrAns, data=data,
                   equal.noise = TRUE, config = 2,
                   fixed = list(noise = c(TRUE, FALSE), bias = c(FALSE, FALSE)))
    fit.frg <- grg(data$catlearn_corrAns, fixed=T)
    fit.grg <- grg(data$catlearn_corrAns, fixed=F)
    AICs <- AIC(fit.ori, fit.length, fit.2d, fit.cj, fit.frg, fit.grg)
    BICs <- AIC(fit.ori, fit.length, fit.2d, fit.cj, fit.frg, fit.grg, k=log(80)) #BIC via AIC() with penalty k = log(n); n is hardcoded as 80 here even though each block has 30 trials
    winner <- which.min(AICs$AIC)
    winner2 <- which.min(BICs$AIC)
    res[res$UniqueID==i & res$block == j,3:ncol(res)] <- c(round(AICs$AIC,2), models[winner], round(BICs$AIC,2), models[winner2])
  # plot each block: decision bounds in black, with the BIC-winning model's bound (winner2) highlighted in red
    col <- rep("black",4)
    col[winner2] <- "red"
    bg <- c("blue","red")[factor(data$key_resp_20.keys)]
    pch <- c(21,24)[factor(data$catlearn_corrAns)]
    title <- paste(i, ": block", j, ": ", models[winner2])
    plot(ori_transform ~ length, data=data, xlab="Length", ylab="Orientation", bg=bg, pch=pch, xlim=c(0,300), ylim=c(0, 110), main=title)
    abline(h=coef(fit.ori), col = col[1])
    abline(v=coef(fit.length), col = col[2])
    abline(coef(fit.2d), col = col[3])
    cj.coef <- coef(fit.cj)
    abline(h=cj.coef[["ori_transform"]], col = col[4], lty = 3)
    abline(v=cj.coef[["length"]], col = col[4], lty = 3)
  }
}

dev.off()
## quartz_off_screen 
##                 2
res.nam <- c("ori","length","2d","cj","frg","grg","winner")
colnames(res)[3:16] <- c(paste(res.nam, "AIC", sep="."), paste(res.nam, "BIC", sep="."))
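#quick look at the per-block winning models before computing descriptives
head(res[, c("UniqueID", "block", "winner.AIC", "winner.BIC")])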

Catlearn Accuracy Performance

catlearn_perf<- dat %>%
    group_by(UniqueID, block) %>%
    dplyr::summarize(Mean = mean(key_resp_20.corr, na.rm=TRUE))

head(catlearn_perf)
wide1 <- cast(catlearn_perf, UniqueID~block) #reshape to wide: one mean-accuracy column per block

Descriptives of Participants' Strategy Use

# count the number of blocks best fitted by each model for each participant
model_block <- res %>% 
  group_by(UniqueID, winner.BIC) %>%
  dplyr::summarise(Count = n())

#how many blocks for each participant were best fitted by the II model
head(model_block[with(model_block, winner.BIC == "Information Integration" ), ])
# find the first block in which each participant used the optimal II strategy
first_ii<- res %>% 
  group_by(UniqueID, block, winner.BIC) %>%
  dplyr::summarise(Count = n())

a<-first_ii[first_ii$winner.BIC == "Information Integration",]
first_ii<-a %>% 
    group_by(UniqueID) %>% 
    slice(which.min(block))

head(first_ii)

Combine Temperament Rspan data with Catlearn data

#merge df_tem_rspan that has temperament scores and rspan performance with catlearn model fit descriptives
b<-merge(first_ii, df_tem_rspan, all = TRUE)
cols<-c(1:2, 5:23)
b<-b[, cols]
colnames(b)[2] <- "first_II_block"
b[,2] <- as.numeric(as.character(b[,2]))
#reshape model_block from long to wide; this df contains the number of blocks best fitted by each strategy
wide<-cast(model_block, UniqueID ~ winner.BIC)
b<-merge(wide,b,all =TRUE)

complete<-setnames(b, old = c('Conjunctive Rule','Fixed Random','General Random','Information Integration', 'Length','Orientation'), new = c('total_CR','total_fixedRand','total_genRand','total_II','total_length','total_ori'))
# total number of blocks best fitted by rule-based models (1D or conjunctive) and by random models
complete$total_rule <- complete$total_CR + complete$total_length + complete$total_ori
complete$total_Rand <- complete$total_fixedRand + complete$total_genRand

complete <- complete[,c(1, 12:16,11,9:10,17:27, 2:8, 28:29)]#rearrange cols

# merge with catlearn accuracy performance of each block
complete<-merge(complete, wide1, by = "UniqueID")

Find and remove random respondents

# # find random respondents - randomly responded to more than 3 blocks
# complete$UniqueID[complete$total_Rand > 3]
# # remove subjects that had more than 3 blocks best fitted by random guess
# complete<-complete[!complete$total_Rand > 3,]

# remove Ss based on performance - more than 6 blocks below 60% accuracy
dt <- complete[,30:41] #the 12 per-block accuracy columns
complete$num_low_perf <- rowSums(dt < 0.60) #number of blocks below 60% accuracy
complete$avg_cat_perf <- rowSums(dt)/12 #overall mean category-learning accuracy
complete <- complete[!complete$num_low_perf > 6,]
complete <- complete[which(!complete$UniqueID == "tokatest"), ] #remove the test run "tokatest"
head(complete)
complete<-setnames(complete, old = c('1','2','3','4', '5','6','7','8','9','10', '11','12'), new = c('block_1','block_2','block_3','block_4', 'block_5','block_6','block_7','block_8','block_9','block_10', 'block_11','block_12'))

Proposed Analyses

Use first strategy block as default strategy

#find the first block with a non-random best-fitting model and use it as the default strategy
winnerBIC <- res[,c(1, 2, 16)] #keep UniqueID, block, and winner.BIC
default_strat<-cast(winnerBIC, UniqueID ~ block)
default_strat<-setnames(default_strat, old = c('1','2','3','4', '5','6','7','8','9','10',"11","12"), new = c('block_1','block_2','block_3','block_4', 'block_5','block_6','block_7','block_8','block_9','block_10','block_11','block_12'))

# find the first non-random block for each participant and track its winning model and block number
# (note: only "Fixed Random" is treated as random here; a "General Random" block would count as non-random)
func <- function(..., val = "Fixed Random") {
  dat <- unlist(list(...))
  ind <- which.max(dat != val)
  list(dat[[ind]], ind)
}
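# illustrative check on a hypothetical block sequence: the first non-"Fixed Random" entry is
# "Length" at position 2, so this returns list("Length", 2)
func("Fixed Random", "Length", "Information Integration")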
new<-setNames(
  do.call(rbind.data.frame, do.call(Map, c(list(f=func), default_strat[,-1]))),
  c("default_strat", "first_nonRand_block"))

default_strat <- cbind(new$default_strat, new$first_nonRand_block, default_strat)

default_strat<-setnames(default_strat, old = c("new$default_strat", "new$first_nonRand_block"), new = c("default_strat", "first_nonRand_block"))

#code rule-based defaults (Length, Orientation, Conjunctive Rule) as 1 and Information Integration as 0
default_strat$default_strat <- ifelse(default_strat$default_strat %in% c("Length", "Orientation", "Conjunctive Rule"), "1",
                               ifelse(default_strat$default_strat == "Information Integration", "0", NA))

default_strat$default_strat<-as.numeric(default_strat$default_strat)
default_strat<-merge(default_strat[,1:3], complete, by="UniqueID")
default_strat<-default_strat[, c(1, 4:45, 2:3)]

#convert ATQ factor sums to scale means (25 NA items, 19 EC items, 17 EX items)
default_strat$NA_score <- default_strat$NA_score/25
default_strat$EC_score <- default_strat$EC_score/19
default_strat$EX_score <- default_strat$EX_score/17


#fill NAs with 0 in the block count for each model
default_strat[, c(22:26, 28:29)][is.na(default_strat[, c(22:26, 28:29)])] <- 0
head(default_strat)

Binomial logistic regression with implicit/explicit strategy as DV

Only 5 of the 20 participants used II as their default strategy, so this analysis can only be run once more data are available.

#the model did not converge with so few II defaults in the DV (see the class-balance check below)
# fit_first_nonrand <- glm(default_strat ~ EX_score + NA_score + EC_score+ key_resp_10.rt + numberWords_recalled + exp_isCorrect, default_strat, family="binomial")
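A quick check of the class balance in the DV (a minimal sketch using the default_strat column created above) makes the convergence problem visible.

table(default_strat$default_strat, useNA = "ifany") #1 = rule-based default, 0 = II default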

Strategy transition point

# look at which block each participant first used the optimal strategy - in other words, their learning rate
pot_df<-default_strat[!is.na(default_strat[, 27]), ] #removes Ss who never used II

pot_df[,c(12:14,27)] <- sapply(pot_df[,c(12:14,27)], as.numeric)

pot_fit <- lm(first_II_block ~ EC_score + EX_score + NA_score + key_resp_10.rt + exp_isCorrect + numberWords_recalled, data = pot_df) 

step_pot<-step(pot_fit)
summary(step_pot)
## 
## Call:
## lm(formula = first_II_block ~ EC_score + key_resp_10.rt + numberWords_recalled, 
##     data = pot_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5555 -0.5436  0.1234  0.6690  2.9310 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           8.22882    2.51657   3.270 0.005169 ** 
## EC_score              0.89366    0.40232   2.221 0.042147 *  
## key_resp_10.rt       -1.22932    0.72699  -1.691 0.111511    
## numberWords_recalled -0.13467    0.03086  -4.364 0.000555 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.46 on 15 degrees of freedom
## Multiple R-squared:  0.6301, Adjusted R-squared:  0.5561 
## F-statistic: 8.516 on 3 and 15 DF,  p-value: 0.001534

Effortful Control was positively related to the number of blocks required to discover the optimal strategy: the higher the EC score, the slower the learning. Participants high in EC may spend more time searching for verbalizable rules. WMC (number of words recalled) was also a significant predictor: the better the WMC, the faster the learning.
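To gauge the precision of these estimates, confidence intervals for the retained predictors can be inspected (a small follow-up sketch on the fitted model above).

confint(step_pot) #95% CIs for the predictors retained by step()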

Ability to discover optimal strategy

#create a column that tracks whether participants ever used optimal strategy in any of the 12 blocks
default_strat$first_II_block<-as.numeric(default_strat$first_II_block)
default_strat$ifOpti[is.na(default_strat$first_II_block)] <- 0
default_strat$ifOpti[default_strat$first_II_block > 0] <-1
default_strat$ifOpti <- as.factor(default_strat$ifOpti)
#It seems that most participants had at least one block best fitted by II; with only one participant who never used II, the model cannot converge on the current data (see the sanity check below)

# m <-glm(ifOpti ~ EX_score + NA_score + EC_score+ key_resp_10.rt + numberWords_recalled + exp_isCorrect, default_strat, family="binomial")
# 
# step_m <- stepAIC(m, direction="both")
# summary(step_m)
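# sanity check on how unbalanced ifOpti is (sketch): per the note above, only one participant never had an II block
table(default_strat$ifOpti)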

Strategy consistency

Total number of blocks using optimal strategy

m3<-lm(total_II ~ EX_score + NA_score + EC_score+ key_resp_10.rt + numberWords_recalled + exp_isCorrect, default_strat)

step_m3<-step(m3)
summary(step_m3)
## 
## Call:
## lm(formula = total_II ~ EX_score + key_resp_10.rt + numberWords_recalled, 
##     data = default_strat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9245 -1.0078  0.0805  1.3490  3.3912 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           1.21511    4.94554   0.246   0.8090  
## EX_score             -0.98614    0.74019  -1.332   0.2014  
## key_resp_10.rt        1.75355    1.17817   1.488   0.1561  
## numberWords_recalled  0.10177    0.05122   1.987   0.0643 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.488 on 16 degrees of freedom
## Multiple R-squared:  0.3524, Adjusted R-squared:  0.231 
## F-statistic: 2.903 on 3 and 16 DF,  p-value: 0.06708

Higher WMC was marginally associated with more blocks best fitted by the optimal (II) strategy (p = .064), though the overall model was not significant (p = .067).
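As a simple follow-up (a sketch, not one of the planned analyses), the zero-order association between WMC and the number of II-best-fit blocks can be checked directly.

cor.test(default_strat$numberWords_recalled, default_strat$total_II)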

Performance after discovery of optimal strategy

# find the first block in which the optimal strategy was used, then calculate mean accuracy from that block through the end of the task
cols <- grep('block_', names(pot_df), value = TRUE) #use pot_df, i.e. the Ss who never used II are already removed
n <- length(cols)

pot_df$per_afterOpti <- mapply(function(x, y) mean(unlist(pot_df[x, cols[y:n]])), seq(nrow(pot_df)), pot_df$first_II_block)
m4 <- lm(per_afterOpti ~ EC_score+ key_resp_10.rt + numberWords_recalled + exp_isCorrect, pot_df)

step_m4 <- step(m4)
summary(step_m4)
## 
## Call:
## lm(formula = per_afterOpti ~ numberWords_recalled, data = pot_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.14559 -0.02541  0.01131  0.03750  0.08217 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.752671   0.054767  13.743 1.23e-10 ***
## numberWords_recalled 0.002073   0.001176   1.762    0.096 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05711 on 17 degrees of freedom
## Multiple R-squared:  0.1545, Adjusted R-squared:  0.1047 
## F-statistic: 3.106 on 1 and 17 DF,  p-value: 0.09597

WMC approached significance (p = .096): higher WMC trended toward more consistent application of the optimal strategy after its discovery.
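A quick visualization of this trend (a sketch using ggplot2, already loaded via tidyverse).

ggplot(pot_df, aes(x = numberWords_recalled, y = per_afterOpti)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  labs(x = "Words recalled (WMC)", y = "Accuracy after first II block")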