library(tidyverse)
library(zoo)
library(knitr)
library(tibble)
library(plyr)
library(dplyr)
library(data.table)
library(Hmisc)
library(grt)
library(reshape) #reshape data from long to wide
library(MASS) #backward elimination of non-significant predictors in linear models
#define a logLik() method for grg fits so that AIC()/BIC() can dispatch on them (needed for this to work on Mac)
logLik.grg <- function(object, ...) {
  val <- object$logLik
  class(val) <- "logLik"
  val
}
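AIC() and BIC() locate a model's log-likelihood by calling the generic logLik() on the fitted object, so defining this S3 method lets them dispatch on grg fits. A minimal sketch of that dispatch, using a fabricated stand-in for a grg fit (the value is made up):
#hypothetical stand-in for a grg fit; the logLik value is fabricated for illustration
fake_fit <- structure(list(logLik = -42.5), class = "grg")
logLik(fake_fit) #dispatches to logLik.grg and returns an object of class "logLik"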
df <- read.csv("qualtrics_ii.csv", header=T)
df <- df %>% mutate(across(everything(), ~ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x))) #impute NAs with the column mean
cols.num <- c(12:73) #ATQ item columns
df[cols.num] <- sapply(df[cols.num], as.numeric) #convert to numeric
#sapply(df, class)
indx <- grepl('R', colnames(df)) #find reverse-coded items (column names containing 'R')
df[,indx] <- abs(df[,indx]-8) #reverse code on the 1-7 scale (x becomes 8 - x)
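Since the ATQ items are on a 1-7 scale, abs(x - 8) maps 1 to 7, 2 to 6, and so on. A quick sanity check of the transform:
#sanity check: reverse coding on the 1-7 scale
x <- 1:7
abs(x - 8) #returns 7 6 5 4 3 2 1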
NA_indx <- grepl('NA', colnames(df)) #find Neg_Affect items
NA_sum <- rowSums(df[, NA_indx]) #total score of NA items for each participant
# df$NA_score <- NA_sum/25 #append average NA scores to df
df$NA_score <- NA_sum
EC_indx <- grepl('EC', colnames(df)) #find EC items
EC_sum <- rowSums(df[, EC_indx]) #total score of EC items for each participant
# df$EC_score <- EC_sum/19 #append EC scores to df
df$EC_score <- EC_sum
EX_indx <- grepl('EX', colnames(df)) #find EX items
EX_sum <- rowSums(df[, EX_indx]) #total score of EX items for each participant
# df$EX_score <- EX_sum/17 #append EX scores to df
df$EX_score <- EX_sum
df_temp <- df[, c(1:11, 74:76)] #keep the non-ATQ columns plus the three appended temperament scores
#generate columns that median-split each temperament score into Low/High levels for potential analyses
df_temp<-df_temp %>% mutate(NA_level = cut(NA_score, quantile(NA_score, c(0, .5, 1)), labels = c('Low', 'High'), include.lowest = TRUE))
df_temp<-df_temp %>% mutate(EC_level = cut(EC_score, quantile(EC_score, c(0, .5, 1)), labels = c('Low', 'High'), include.lowest = TRUE))
df_temp<-df_temp %>% mutate(EX_level = cut(EX_score, quantile(EX_score, c(0, .5, 1)), labels = c('Low', 'High'), include.lowest = TRUE))
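In each of the three mutates above, cut() with the 0, .5, and 1 quantiles as breaks implements a median split, and include.lowest = TRUE keeps the minimum score in the Low bin. A toy illustration with fabricated scores:
#toy median split on fabricated scores
scores <- c(10, 20, 30, 40, 50)
cut(scores, quantile(scores, c(0, .5, 1)), labels = c("Low", "High"), include.lowest = TRUE)
#returns Low Low Low High High (values at the median fall in the Low bin)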
setwd( "/Users/tzhu9/Desktop/pilotData_ii/completed_rspan_ii")
csv <- list.files(full.names = TRUE, pattern = '\\.csv$') #pattern is a regex, not a glob
df_Pav <- as_tibble(rbind.fill(lapply(csv, fread))) #read every Ss file and stack, filling missing columns with NA
#subset dataset
# which(colnames(df_Pav)=="exp_Sentence")
# which(colnames(df_Pav)=="catlearn_corrAns")
cols1 <- c(1, 85:131)
df_Pav<-df_Pav[,cols1]
# which(colnames(df_Pav)=="exp_typedWord")
# which(colnames(df_Pav)=="numberWords_recalled")
# which(colnames(df_Pav)=="key_resp_20.keys")
# which(colnames(df_Pav)=="II_stim_file")
cols2 <- c(1:6,12,20:21,31:33,40:48)
df_Pav<-df_Pav[,cols2]
#fill empty cells with NA
df_Pav <- df_Pav %>%
  mutate(across(everything(), ~ifelse(.x == "", NA, as.character(.x))))
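dplyr's na_if() expresses the same empty-string-to-NA replacement more directly; an equivalent alternative (commented out so the chunk above remains the one that runs), assuming coercing every column to character is acceptable:
# df_Pav <- df_Pav %>% mutate(across(everything(), ~na_if(as.character(.x), "")))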
#remove empty rows
df_Pav<-df_Pav[!is.na(df_Pav$exp_Sentence) | !is.na(df_Pav$exp_typedWord) | !is.na(df_Pav$key_resp_20.keys),]
# which(colnames(df_Pav)=="sentence_num_total")
df_rspan<-df_Pav[, 1:9] #subset relevant cols
df_rspan<-df_rspan[!is.na(df_rspan$exp_Sentence) | !is.na(df_rspan$exp_typedWord),] #keep rows with either judgment or recall responses
cols4 <- c(4, 6, 8:9)
df_rspan[cols4] <- sapply(df_rspan[cols4],as.numeric)
#keep only judgment related rows
accuracy <- df_rspan %>%
filter(!is.na(exp_isCorrect))
#late responses on judgment are filled with rt of 4.5
accuracy$key_resp_10.rt[is.na(accuracy$key_resp_10.rt)] <- 4.5
#keep only recalled_word related rows
recall_num <- df_rspan %>%
filter(!is.na(numberWords_recalled))
#accuracy dataframe
df1 <- aggregate(accuracy[, c(4,6)], list(accuracy$UniqueID), mean) #average each Ss' judgment performance (accuracy and rt)
#recalled number dataframe
df2<-aggregate(recall_num[, 8], list(recall_num$UniqueID), sum)
df_rspan_perf <- merge(df1,df2,by="Group.1")
names(df_rspan_perf)[1] <- "UniqueID"
head(df_rspan_perf)
df_tem_rspan<-merge(df_temp, df_rspan_perf, by = "UniqueID")
head(df_tem_rspan)
cols3<- c(1, 10:21)
df_catlearn<-df_Pav[, cols3] #remove irrelevant cols
df_catlearn<-df_catlearn[!is.na(df_catlearn$key_resp_20.keys),] #remove empty rows
df_catlearn$block <- rep(1:12, times=26, each = 30) #label 12 blocks of 30 trials for each of the 26 Ss
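rep(1:12, times = 26, each = 30) assumes the rows are ordered as 26 participants by 12 blocks by 30 trials; a defensive check that documents that assumption:
#defensive check: block labels only line up if the trial count matches 26 Ss x 12 blocks x 30 trials
stopifnot(nrow(df_catlearn) == 26 * 12 * 30)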
df_catlearn<- df_catlearn[, c(1, 14, 2:13)]
#describe(df_catlearn)
#relabel catlearn_corrAns as 1 and 2
df_catlearn <- df_catlearn %>% mutate(catlearn_corrAns = case_when(
  catlearn_corrAns == "a" ~ 1,
  catlearn_corrAns == "b" ~ 2))
#relabel key_resp_20.keys as 1 and 2
df_catlearn <- df_catlearn %>% mutate(key_resp_20.keys = case_when(
  key_resp_20.keys == "a" ~ 1,
  key_resp_20.keys == "b" ~ 2))
cols5<-c(4, 5, 7:13)
df_catlearn[cols5] <- sapply(df_catlearn[cols5],as.numeric)
dat<-df_catlearn
#template code for combining blocks
# cols6<-c(1, 3:14)
# dat<-df_catlearn[, cols6]
# dat$block <-rep(1:6, times = 24, each=60)
# source("plot.gcjc.R")
blocks <- levels(factor(dat$block))
sbj <- levels(factor(dat$UniqueID))
models <- c("Orientation", "Length", "Information Integration", "Conjunctive Rule", "Fixed Random", "General Random")
res <- matrix(nrow=length(sbj) * length(blocks),ncol=(length(models) * 2 + 2))
res <- cbind(expand.grid(sbj,blocks), res)
colnames(res)[1:2] <- c("UniqueID","block")
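expand.grid() crosses every subject with every block, so res should hold one row per subject-block cell (with an AIC and a BIC column per model plus the two winner labels). A quick sanity check of that shape:
#sanity check: one row per subject x block cell
stopifnot(nrow(res) == length(sbj) * length(blocks))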
pdf(file="26s_ii_individual_plots.pdf", width=12, height=9)
op <- par(mfrow = c(3, 4), pty = "m")
for(i in sbj)
{
  for(j in blocks)
  {
    data <- subset(dat, dat$block == j & dat$UniqueID == i)
    #fit the four decision-bound models plus the two random-responder models
    fit.ori <- glc(key_resp_20.keys ~ ori_transform, category=data$catlearn_corrAns, data=data)
    fit.length <- glc(key_resp_20.keys ~ length, category=data$catlearn_corrAns, data=data)
    fit.2d <- glc(key_resp_20.keys ~ length + ori_transform, category=data$catlearn_corrAns, data=data)
    fit.cj <- gcjc(key_resp_20.keys ~ length + ori_transform,
                   category=data$catlearn_corrAns, data=data,
                   equal.noise = TRUE, config = 2,
                   fixed = list(noise = c(TRUE, FALSE), bias = c(FALSE, FALSE)))
    fit.frg <- grg(data$catlearn_corrAns, fixed=T)
    fit.grg <- grg(data$catlearn_corrAns, fixed=F)
    AICs <- AIC(fit.ori, fit.length, fit.2d, fit.cj, fit.frg, fit.grg)
    BICs <- AIC(fit.ori, fit.length, fit.2d, fit.cj, fit.frg, fit.grg, k=log(80)) #BIC via AIC() with penalty k = log(80); the returned column is still named AIC
    winner <- which.min(AICs$AIC)
    winner2 <- which.min(BICs$AIC)
    res[res$UniqueID==i & res$block == j, 3:ncol(res)] <- c(round(AICs$AIC,2), models[winner], round(BICs$AIC,2), models[winner2])
    ##### now plot them ###RACHEL CHANGED 'winner' to 'winner2' to reflect BIC instead of AIC
    col <- rep("black",4) #only the four decision-bound models get a boundary line; the random models have none
    col[winner2] <- "red"
    bg <- c("blue","red")[factor(data$key_resp_20.keys)]
    pch <- c(21,24)[factor(data$catlearn_corrAns)]
    title <- paste(i, ": block", j, ": ", models[winner2])
    plot(ori_transform ~ length, data=data, xlab="Length", ylab="Orientation", bg=bg, pch=pch, xlim=c(0,300), ylim=c(0, 110), main=title)
    abline(h=coef(fit.ori), col = col[1])
    abline(v=coef(fit.length), col = col[2])
    abline(coef(fit.2d), col = col[3])
    cj.coef <- coef(fit.cj)
    abline(h=cj.coef[["ori_transform"]], col = col[4], lty = 3)
    abline(v=cj.coef[["length"]], col = col[4], lty = 3)
  }
}
dev.off()
## quartz_off_screen
## 2
res.nam <- c("ori","length","2d","cj","frg","grg","winner")
colnames(res)[3:16] <- c(paste(res.nam, "AIC", sep="."), paste(res.nam, "BIC", sep="."))
catlearn_perf<- dat %>%
group_by(UniqueID, block) %>%
dplyr::summarize(Mean = mean(key_resp_20.corr, na.rm=TRUE))
head(catlearn_perf)
wide1<-cast(catlearn_perf, UniqueID~block)
# count the number of time each model is used by each participant
model_block <- res %>%
group_by(UniqueID, winner.BIC) %>%
dplyr::summarise(Count = n())
#how many blocks for each Ss were best fit by the II model
head(model_block[with(model_block, winner.BIC == "Information Integration" ), ])
# find the first block in which each Ss used the optimal II strategy
first_ii<- res %>%
group_by(UniqueID, block, winner.BIC) %>%
dplyr::summarise(Count = n())
a<-first_ii[first_ii$winner.BIC == "Information Integration",]
first_ii<-a %>%
group_by(UniqueID) %>%
slice(which.min(block))
head(first_ii)
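slice(which.min(block)) keeps, within each UniqueID group, the single row with the smallest block value. A toy illustration with fabricated data:
#toy illustration: earliest II block per participant
tibble(UniqueID = c("s1", "s1", "s2"), block = c(5, 2, 7)) %>%
  group_by(UniqueID) %>%
  slice(which.min(block))
#keeps block 2 for s1 and block 7 for s2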
#merge df_tem_rspan that has temperament scores and rspan performance with catlearn model fit descriptives
b<-merge(first_ii, df_tem_rspan, all = TRUE)
cols<-c(1:2, 5:23)
b<-b[, cols]
colnames(b)[2] <- "first_II_block"
b[,2] <- as.numeric(as.character(b[,2]))
#reshape model_block from long to wide, this df contains number of blocks best fitted by each strategy
wide<-cast(model_block, UniqueID ~ winner.BIC)
b<-merge(wide,b,all =TRUE)
complete<-setnames(b, old = c('Conjunctive Rule','Fixed Random','General Random','Information Integration', 'Length','Orientation'), new = c('total_CR','total_fixedRand','total_genRand','total_II','total_length','total_ori'))
# generate a column that keeps count of the total number of blocks best fit by rule-based models, regardless of 1d or 2d
complete$total_rule<-complete$total_CR + complete$total_length + complete$total_ori
complete$total_Rand<-complete$total_fixedRand + complete$total_genRand
complete <- complete[,c(1, 12:16,11,9:10,17:27, 2:8, 28:29)]#rearrange cols
# merge with catlearn accuracy performance of each block
complete<-merge(complete, wide1, by = "UniqueID")
# # find random respondents - randomly responded to more than 3 blocks
# complete$UniqueID[complete$total_Rand > 3]
# # remove subjects that had more than 3 blocks best fitted by random guess
# complete<-complete[!complete$total_Rand > 3,]
# remove Ss based on performance - more than 6 blocks < 60% accuracy
dt <- complete[, 30:41] #the 12 block-accuracy columns
complete$num_low_perf <- rowSums(dt < 0.60) #number of blocks below 60% accuracy
complete$avg_cat_perf <- rowSums(dt)/12 #mean accuracy across the 12 blocks
complete <- complete[!(complete$num_low_perf > 6), ]
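rowSums(dt < 0.60) counts, for each participant, how many of the 12 block accuracies fall below the 60% criterion. A toy check of that logic:
#toy check: counting blocks below the 60% accuracy criterion
acc_mat <- rbind(c(0.5, 0.7, 0.9), c(0.4, 0.5, 0.8)) #fabricated block accuracies
rowSums(acc_mat < 0.60) #returns 1 2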
complete <- complete[complete$UniqueID != "tokatest", ] #drop the test run
head(complete)
complete<-setnames(complete, old = c('1','2','3','4', '5','6','7','8','9','10', '11','12'), new = c('block_1','block_2','block_3','block_4', 'block_5','block_6','block_7','block_8','block_9','block_10', 'block_11','block_12'))
#find the first block of non-random responding and use it as the default strategy
winnerBIC<-res[,c(1, 2, 16)]
default_strat<-cast(winnerBIC, UniqueID ~ block)
default_strat<-setnames(default_strat, old = c('1','2','3','4', '5','6','7','8','9','10',"11","12"), new = c('block_1','block_2','block_3','block_4', 'block_5','block_6','block_7','block_8','block_9','block_10','block_11','block_12'))
# find the first non-random block and track its value and block number
# note: only "Fixed Random" counts as random here; if every block matches val, the first block is returned
func <- function(..., val = "Fixed Random") {
  dat <- unlist(list(...))
  ind <- which.max(dat != val) #index of the first winner that differs from val
  list(dat[[ind]], ind)
}
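A quick worked example of func on a fabricated winner sequence - the first entry that is not "Fixed Random" is "Length", in position 2:
#toy illustration of func on made-up winners
func("Fixed Random", "Length", "Information Integration")
#returns list("Length", 2)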
new <- setNames(
  do.call(rbind.data.frame, do.call(Map, c(list(f=func), default_strat[,-1]))),
  c("default_strat", "first_nonRand_block"))
default_strat <- cbind(new$default_strat, new$first_nonRand_block, default_strat)
default_strat<-setnames(default_strat, old = c("new$default_strat", "new$first_nonRand_block"), new = c("default_strat", "first_nonRand_block"))
#code rule-based defaults (Length, Orientation, Conjunctive Rule) as 1 and the optimal II default as 0
default_strat$default_strat <- ifelse(default_strat$default_strat %in% c("Length", "Orientation", "Conjunctive Rule"), "1",
                               ifelse(default_strat$default_strat == "Information Integration", "0", NA))
default_strat$default_strat<-as.numeric(default_strat$default_strat)
default_strat<-merge(default_strat[,1:3], complete, by="UniqueID")
default_strat<-default_strat[, c(1, 4:45, 2:3)]
#convert summed temperament scores to per-item averages
default_strat$NA_score <- default_strat$NA_score/25
default_strat$EC_score <- default_strat$EC_score/19
default_strat$EX_score <- default_strat$EX_score/17
#fill NAs with 0 in the block count for each model
default_strat[, c(22:26, 28:29)][is.na(default_strat[, c(22:26, 28:29)])] <- 0
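tidyr's replace_na() gives a more readable equivalent of the zero-fill above; an alternative form (commented out, with the same column positions assumed):
# default_strat <- default_strat %>% mutate(across(c(22:26, 28:29), ~replace_na(.x, 0)))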
head(default_strat)
Only 5 of the 20 Ss used II as their default strategy, so this analysis can only be done when more data are available.
#the model did not converge with so few II defaults in the column
# fit_first_nonrand <- glm(default_strat ~ EX_score + NA_score + EC_score+ key_resp_10.rt + numberWords_recalled + exp_isCorrect, default_strat, family="binomial")
# look at the block in which Ss started using the optimal strategy - in other words, their learning rate
pot_df<-default_strat[!is.na(default_strat[, 27]), ] #removes Ss who never used II
pot_df[,c(12:14,27)] <- sapply(pot_df[,c(12:14,27)], as.numeric)
pot_fit <- lm(first_II_block ~ EC_score + EX_score + NA_score + key_resp_10.rt + exp_isCorrect + numberWords_recalled, data = pot_df)
step_pot<-step(pot_fit)
summary(step_pot)
##
## Call:
## lm(formula = first_II_block ~ EC_score + key_resp_10.rt + numberWords_recalled,
## data = pot_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5555 -0.5436 0.1234 0.6690 2.9310
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.22882 2.51657 3.270 0.005169 **
## EC_score 0.89366 0.40232 2.221 0.042147 *
## key_resp_10.rt -1.22932 0.72699 -1.691 0.111511
## numberWords_recalled -0.13467 0.03086 -4.364 0.000555 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.46 on 15 degrees of freedom
## Multiple R-squared: 0.6301, Adjusted R-squared: 0.5561
## F-statistic: 8.516 on 3 and 15 DF, p-value: 0.001534
Effortful Control was positively related to the number of blocks required to learn the optimal strategy: the higher the EC score, the slower the learning. High-EC participants may spend more time searching for verbalizable rules. WMC was also a significant predictor: the better the WMC, the faster the learning.
#create a column that tracks whether participants ever used optimal strategy in any of the 12 blocks
default_strat$first_II_block<-as.numeric(default_strat$first_II_block)
default_strat$ifOpti[is.na(default_strat$first_II_block)] <- 0
default_strat$ifOpti[default_strat$first_II_block > 0] <-1
default_strat$ifOpti <- as.factor(default_strat$ifOpti)
#It seems that most people used II in at least one block; with only one participant who never used II, the model cannot converge on the current data
# m <-glm(ifOpti ~ EX_score + NA_score + EC_score+ key_resp_10.rt + numberWords_recalled + exp_isCorrect, default_strat, family="binomial")
#
# step_m <- stepAIC(m, direction="both")
# summary(step_m)
m3<-lm(total_II ~ EX_score + NA_score + EC_score+ key_resp_10.rt + numberWords_recalled + exp_isCorrect, default_strat)
step_m3<-step(m3)
summary(step_m3)
##
## Call:
## lm(formula = total_II ~ EX_score + key_resp_10.rt + numberWords_recalled,
## data = default_strat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9245 -1.0078 0.0805 1.3490 3.3912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.21511 4.94554 0.246 0.8090
## EX_score -0.98614 0.74019 -1.332 0.2014
## key_resp_10.rt 1.75355 1.17817 1.488 0.1561
## numberWords_recalled 0.10177 0.05122 1.987 0.0643 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.488 on 16 degrees of freedom
## Multiple R-squared: 0.3524, Adjusted R-squared: 0.231
## F-statistic: 2.903 on 3 and 16 DF, p-value: 0.06708
The higher the WMC, the more blocks were best fit by the optimal strategy, though this effect only approached significance.
# find the first block that used the optimal strategy, then calculate accuracy from that block through the end of the task
cols <- grep('block_', names(pot_df), value = TRUE) #use the df with first_II_block == NA rows removed
n <- length(cols)
pot_df$per_afterOpti <- mapply(function(x, y) mean(unlist(pot_df[x, cols[y:n]])), seq(nrow(pot_df)), pot_df$first_II_block)
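The mapply call takes, for each participant, the block-accuracy columns from their first II block through block 12 and averages them. A toy check of the indexing logic with fabricated accuracies:
#toy check: mean accuracy from the first II block (here block 3) through the end
acc <- c(0.5, 0.6, 0.7, 0.8) #fabricated block accuracies for 4 blocks
first_block <- 3
mean(acc[first_block:length(acc)]) #returns 0.75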
m4 <- lm(per_afterOpti ~ EC_score+ key_resp_10.rt + numberWords_recalled + exp_isCorrect, pot_df)
step_m4 <- step(m4)
summary(step_m4)
##
## Call:
## lm(formula = per_afterOpti ~ numberWords_recalled, data = pot_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.14559 -0.02541 0.01131 0.03750 0.08217
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.752671 0.054767 13.743 1.23e-10 ***
## numberWords_recalled 0.002073 0.001176 1.762 0.096 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05711 on 17 degrees of freedom
## Multiple R-squared: 0.1545, Adjusted R-squared: 0.1047
## F-statistic: 3.106 on 1 and 17 DF, p-value: 0.09597
WMC approached significance: better WMC trended toward applying the optimal strategy more consistently after its discovery.