Data Preprocessing
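#packages assumed loaded earlier in the full script (not shown in this
#section); judging from the functions used below, the minimal set is presumably:
library(dplyr)    # mutate_if, distinct, count, %>%
library(ez)       # ezANOVA
library(plotrix)  # plotCI, axis.break
library(mgcv)     # bam, gam.check
library(itsadug)  # compareML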
#rename useful variables
data$acc<-data$Accuracy
data$sbj<-data$participant
data$block<-data$block.thisN+1
#convert to factors
data <- mutate_if(data, is.character, as.factor)
data$sbj<-as.factor(data$sbj)
#exclude one participant (20335) for not following instructions
data<-droplevels(data[data$sbj!="20335",])
str(data)
## 'data.frame': 21120 obs. of 37 variables:
## $ st_exp : int 4 30 48 18 9 55 24 62 12 34 ...
## $ p : int 48 21 35 13 6 40 17 45 50 25 ...
## $ hue : num -0.05 0.15 0.35 -0.25 -0.35 0.25 0.35 0.15 -0.05 -0.25 ...
## $ border : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 2 1 ...
## $ size : num 2.1 2.7 3.1 2.5 2.3 3.3 2.5 3.5 2.3 2.9 ...
## $ correctAns : Factor w/ 2 levels "a","b": 2 1 1 2 2 1 1 1 2 2 ...
## $ practice_trials.thisRepN : logi NA NA NA NA NA NA ...
## $ practice_trials.thisTrialN: logi NA NA NA NA NA NA ...
## $ practice_trials.thisN : logi NA NA NA NA NA NA ...
## $ practice_trials.thisIndex : logi NA NA NA NA NA NA ...
## $ block.thisRepN : int 0 0 0 0 0 0 0 0 0 0 ...
## $ block.thisTrialN : int 0 0 0 0 0 0 0 0 0 0 ...
## $ block.thisN : int 0 0 0 0 0 0 0 0 0 0 ...
## $ block.thisIndex : int 0 0 0 0 0 0 0 0 0 0 ...
## $ trials.thisRepN : int 0 0 0 0 0 0 0 0 0 0 ...
## $ trials.thisTrialN : int 0 1 2 3 4 5 6 7 8 9 ...
## $ trials.thisN : int 0 1 2 3 4 5 6 7 8 9 ...
## $ trials.thisIndex : int 60 36 29 5 34 9 23 33 50 19 ...
## $ thisRow.t : num 111 115 117 119 121 ...
## $ notes : logi NA NA NA NA NA NA ...
## $ Accuracy : int 0 0 1 1 1 1 1 1 1 1 ...
## $ RT : num 2.2247 0.3665 0.3849 0.4082 0.0632 ...
## $ participant : int 17202 17202 17202 17202 17202 17202 17202 17202 17202 17202 ...
## $ categories : Factor w/ 2 levels "AB","BA": 1 1 1 1 1 1 1 1 1 1 ...
## $ group : Factor w/ 4 levels "1100","1600",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ date : Factor w/ 66 levels "2025-02-25_12h00.07.372",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ expName : Factor w/ 1 level "Categorization": 1 1 1 1 1 1 1 1 1 1 ...
## $ psychopyVersion : Factor w/ 1 level "2023.2.3": 1 1 1 1 1 1 1 1 1 1 ...
## $ frameRate : num 60.1 60.1 60.1 60.1 60.1 ...
## $ expStart : Factor w/ 66 levels "2025-02-25 12h00.22.380300 +0200",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ X : logi NA NA NA NA NA NA ...
## $ X.1 : logi NA NA NA NA NA NA ...
## $ X.2 : logi NA NA NA NA NA NA ...
## $ X.3 : logi NA NA NA NA NA NA ...
## $ acc : int 0 0 1 1 1 1 1 1 1 1 ...
## $ sbj : Factor w/ 66 levels "341","1003","1206",..: 10 10 10 10 10 10 10 10 10 10 ...
## $ block : num 1 1 1 1 1 1 1 1 1 1 ...
#calculate participant's average accuracy (pre-exclusion)
data_av<-aggregate(data$acc, list(data$sbj, data$group), mean)
colnames(data_av) <-c("sbj","group" ,"acc")
# check learning criterion
# criterion was set to >=22/32 for one or more blocks of the task
cr<-22/32
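#the 22/32 (~69%) cutoff is consistent with performance significantly above
#chance; a quick check of that reading (my assumption, the rationale for the
#cutoff is not stated here):
binom.test(22, 32, p = 0.5, alternative = "greater")
#one-sided p is approximately .025, i.e. 22/32 correct is unlikely under guessing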
dat_b<-droplevels(data[data$border=="yes",])
dat_b_av<- aggregate(dat_b$acc, list(dat_b$sbj, dat_b$group, dat_b$block), mean)
colnames(dat_b_av)<-c("sbj","group", "block", "acc")
dat_b_av<-dat_b_av[order(dat_b_av$sbj, dat_b_av$block),]
#new column, "yes" if acc>=cr
dat_b_av$cr<-ifelse(dat_b_av$acc>=cr,"yes", "no")
#Create a list of sbj ids that never met the learning criterion ("no" in all 4 blocks)
only_no<- dat_b_av[dat_b_av$cr == "no", ]
no_counts<- table(only_no$sbj)
total_blocks <- table(dat_b_av$sbj)
only_no_sbj<- names(no_counts[no_counts == 4 & total_blocks[names(no_counts)] == 4])
only_no_sbj
## [1] "43423" "48888" "63018" "77337" "77623"
rm(only_no,no_counts, total_blocks, cr)
excl<-only_no_sbj
#save exclude_list, for use in similarity data analysis
saveRDS(excl, file = "exclude_list_learnersonly.rds")
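#the similarity analysis script can restore the list with readRDS, e.g.:
#excl <- readRDS("exclude_list_learnersonly.rds")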
#Keep data only from those participants not in exclude list
data<-droplevels(data[!(data$sbj %in% excl), ])
#Check how many sbjs remain per group
data %>%
distinct(sbj, group) %>%
count(group)
## group n
## 1 1100 15
## 2 1600 15
## 3 600 16
## 4 RD 15
#recalculate participant's average accuracy after exclusions
data_av<-aggregate(data$acc, list(data$sbj, data$group), mean)
colnames(data_av) <-c("sbj","group" ,"acc")
Inferential Statistics - ANOVA & GAM
#one-way anova
ezANOVA(data=data_av, dv=acc, wid=sbj, between=group, type=3)
## Warning: Data is unbalanced (unequal N per group). Make sure you specified a
## well-considered value for the type argument to ezANOVA().
## Coefficient covariances computed by hccm()
## $ANOVA
## Effect DFn DFd F p p<.05 ges
## 2 group 3 57 2.762903 0.05020963 0.1269547
##
## $`Levene's Test for Homogeneity of Variance`
## DFn DFd SSn SSd F p p<.05
## 1 3 57 0.000535345 0.02731222 0.3724178 0.7731975
# No significant effect of group (p = .050)
# Learning Curves
##################
# calculate average performance per sbj and block
data_bl<-aggregate(data$acc, list(data$sbj, data$group, data$block), mean)
colnames(data_bl) <-c("sbj","group", "block", "acc")
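#se() used below is not a base R function; it is presumably defined earlier in
#the full script. A minimal definition consistent with its use here:
se <- function(x) sd(x)/sqrt(length(x))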
#calculate average accuracy and SE per block
data_gr<-aggregate(data_bl$acc, list(data_bl$group, data_bl$block), mean)
colnames(data_gr)<-c("group","block", "acc")
temp<- aggregate(data_bl$acc, list(data_bl$group, data_bl$block), se)
data_gr$se<-temp$x
rm(temp)
data_gr<-data_gr[order(data_gr$group, data_gr$block),]
#xlb="Block\n Error Bars: +/- 1 SE"
xlb="Block"
ylb="Proportion Correct"
mn="Categorization Accuracy"
off=0.05
pch=c(15,16,17,18)
lty=c(2,3,4,5)
col=c("grey70","grey60","grey50", "grey40" )
#"600"
plotCI(x=1:4, y=data_gr[data_gr$group=="600",]$acc, uiw=data_gr[data_gr$group=="600",]$se, bty="n", ylim=c(0.6,1),xlim=c(1, 4.5), xlab=xlb, ylab=ylb, main=mn, las=1, xaxt="n", yaxt="n", pch=pch[1], col=col[1], gap=0)
lines(x=1:4, y=data_gr[data_gr$group=="600",]$acc, lty=lty[1], col=col[1], lwd=2)
#"1100"
plotCI(x=1:4+off, y=data_gr[data_gr$group=="1100",]$acc, uiw=data_gr[data_gr$group=="1100",]$se, add=T, pch=pch[2], col=col[2], gap=0)
lines(x=1:4+off, y=data_gr[data_gr$group=="1100",]$acc, lty=lty[2], col=col[2], lwd=2)
#"1600"
plotCI(x=1:4+2*off, y=data_gr[data_gr$group=="1600",]$acc, uiw=data_gr[data_gr$group=="1600",]$se, add=T, pch=pch[3], col=col[3], gap=0)
lines(x=1:4+2*off, y=data_gr[data_gr$group=="1600",]$acc, lty=lty[3], col=col[3], lwd=2)
#"RD"
plotCI(x=1:4+3*off, y=data_gr[data_gr$group=="RD",]$acc, uiw=data_gr[data_gr$group=="RD",]$se, add=T, pch=pch[4], col=col[4], gap=0)
lines(x=1:4+3*off, y=data_gr[data_gr$group=="RD",]$acc, lty=lty[4], col=col[4], lwd=2)
axis(side=1, at=c(1,2,3,4))
axis(side=2, at=c(0.6, 0.7, 0.8, 0.9, 1.0),labels=c("0.0", "0.7", "0.8", "0.9", "1.0") , las=2)
axis.break(2, 0.65, style="slash")
legend(x=2.5, y=0.8, legend=c("600 ms", "1100 ms", "1600 ms", "RD"), col=col, lty=lty, pch=pch, bty="n", seg.len=4, lwd=2)

#########################
####gam analysis
#########################
#first create the trial variable (1:320 is recycled across participants; this
#assumes rows are ordered by trial within sbj, with 320 contiguous rows each)
tr<-1:320
data$trial<-tr
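#a group-safe equivalent that does not rely on recycling or on participants
#occupying contiguous blocks of rows (a sketch; ave() numbers rows 1..320
#within each sbj, reproducing the same values for data sorted as here):
data$trial <- ave(integer(nrow(data)), data$sbj, FUN = seq_along)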
#Null model: overall smooth of trial plus per-participant factor smooths
#(bs="fs"), which act as nonlinear random effects
m0<-bam(acc~1 + s(trial) + s(trial, sbj, bs="fs", m=1), data=data, family=binomial)
## Warning in gam.side(sm, X, tol = .Machine$double.eps^0.5): model has repeated
## 1-d smooths of same variable.
gam.check(m0)

##
## Method: fREML Optimizer: perf newton
## full convergence after 6 iterations.
## Gradient range [-6.778873e-08,1.3506e-09]
## (score 27493.91 & scale 1).
## Hessian positive definite, eigenvalue range [2.556301,12.76212].
## Model rank = 559 / 559
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(trial) 9.00 7.97 0.95 0.43
## s(trial,sbj) 549.00 113.89 0.95 0.50
#edf is not close to k', so we are ok. We do not need to increase k
#add a fixed effect of group
m1<- bam(acc~ 1 + group + s(trial) + s(trial, sbj, bs="fs", m=1), data=data, family=binomial )
## Warning in gam.side(sm, X, tol = .Machine$double.eps^0.5): model has repeated
## 1-d smooths of same variable.
gam.check(m1)

##
## Method: fREML Optimizer: perf newton
## full convergence after 6 iterations.
## Gradient range [-7.119537e-08,1.309818e-09]
## (score 27508.01 & scale 1).
## Hessian positive definite, eigenvalue range [2.556747,12.40878].
## Model rank = 562 / 562
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(trial) 9.00 7.97 0.93 0.055 .
## s(trial,sbj) 549.00 109.96 0.93 0.085 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#edf is not close to k', so we are ok. We do not need to increase k
compareML(m0,m1)
## m0: acc ~ 1 + s(trial) + s(trial, sbj, bs = "fs", m = 1)
##
## m1: acc ~ 1 + group + s(trial) + s(trial, sbj, bs = "fs", m = 1)
##
## Model m0 preferred: lower fREML score (14.109), and lower df (3.000).
## -----
## Model Score Edf Difference Df
## 1 m1 27508.01 8
## 2 m0 27493.91 5 14.109 -3.000
##
## AIC difference: 0.34, model m1 has lower AIC.
#m0 preferred (the AIC difference of 0.34 is too small to be taken into consideration)
#add by-group smooths of trial: one smooth term per group level
m2<-bam(acc~ 1 + group + s(trial) + s(trial, by=group) +s(trial, sbj, bs="fs", m=1), data=data, family=binomial )
## Warning in gam.side(sm, X, tol = .Machine$double.eps^0.5): model has repeated
## 1-d smooths of same variable.
gam.check(m2)

##
## Method: fREML Optimizer: perf newton
## full convergence after 10 iterations.
## Gradient range [-0.0001628131,5.341441e-05]
## (score 27515.34 & scale 1).
## Hessian positive definite, eigenvalue range [2.012957e-05,12.05895].
## Model rank = 597 / 598
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(trial) 9.00e+00 7.96e+00 0.94 0.42
## s(trial):group1100 9.00e+00 1.81e+00 0.94 0.46
## s(trial):group1600 9.00e+00 1.01e-04 0.94 0.40
## s(trial):group600 9.00e+00 1.00e+00 0.94 0.41
## s(trial):groupRD 9.00e+00 1.00e+00 0.94 0.46
## s(trial,sbj) 5.49e+02 1.08e+02 0.94 0.48
#edf is not close to k', so we are ok. We do not need to increase k
compareML(m2, m0)
## m2: acc ~ 1 + group + s(trial) + s(trial, by = group) + s(trial,
## sbj, bs = "fs", m = 1)
##
## m0: acc ~ 1 + s(trial) + s(trial, sbj, bs = "fs", m = 1)
##
## Model m0 preferred: lower fREML score (21.431), and lower df (11.000).
## -----
## Model Score Edf Difference Df
## 1 m2 27515.34 16
## 2 m0 27493.91 5 -21.431 11.000
##
## AIC difference: 2.53, model m0 has lower AIC.
#m0 preferred
m3<-bam(acc~ 1 + group + s(trial, by=group) +s(trial, sbj, bs="fs", m=1), data=data, family=binomial )
gam.check(m3)

##
## Method: fREML Optimizer: perf newton
## full convergence after 8 iterations.
## Gradient range [-7.383876e-07,1.515882e-08]
## (score 27502.37 & scale 1).
## Hessian positive definite, eigenvalue range [0.5842266,11.83839].
## Model rank = 589 / 589
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(trial):group1100 9.00 6.07 0.96 0.89
## s(trial):group1600 9.00 4.10 0.96 0.89
## s(trial):group600 9.00 5.54 0.96 0.94
## s(trial):groupRD 9.00 7.18 0.96 0.90
## s(trial,sbj) 549.00 107.42 0.96 0.88
compareML(m3,m0)
## m3: acc ~ 1 + group + s(trial, by = group) + s(trial, sbj, bs = "fs",
## m = 1)
##
## m0: acc ~ 1 + s(trial) + s(trial, sbj, bs = "fs", m = 1)
##
## Model m0 preferred: lower fREML score (8.469), and lower df (9.000).
## -----
## Model Score Edf Difference Df
## 1 m3 27502.37 14
## 2 m0 27493.91 5 -8.469 9.000
##
## AIC difference: 23.89, model m0 has lower AIC.
#m0 preferred again; finally test the by-group smooths without the parametric group term
m4<-bam(acc~ 1 + s(trial, by=group) +s(trial, sbj, bs="fs", m=1), data=data, family=binomial )
gam.check(m4)

##
## Method: fREML Optimizer: perf newton
## full convergence after 8 iterations.
## Gradient range [-1.8855e-06,4.177544e-08]
## (score 27487.44 & scale 1).
## Hessian positive definite, eigenvalue range [0.5721259,12.38955].
## Model rank = 586 / 586
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(trial):group1100 9.00 6.07 0.93 0.12
## s(trial):group1600 9.00 4.05 0.93 0.14
## s(trial):group600 9.00 5.62 0.93 0.16
## s(trial):groupRD 9.00 7.15 0.93 0.14
## s(trial,sbj) 549.00 111.40 0.93 0.12
compareML(m4,m0)
## m4: acc ~ 1 + s(trial, by = group) + s(trial, sbj, bs = "fs", m = 1)
##
## m0: acc ~ 1 + s(trial) + s(trial, sbj, bs = "fs", m = 1)
##
## Chi-square test of fREML scores
## -----
## Model Score Edf Difference Df p.value Sig.
## 1 m0 27493.91 5
## 2 m4 27487.44 11 6.464 6.000 0.044 *
##
## AIC difference: 24.26, model m0 has lower AIC.
#final model: m0. The fREML test marginally favors m4 (p = .044), but the AIC
#difference (24.26) clearly favors m0, so m0 is retained
fm<-m0
summary(fm)
##
## Family: binomial
## Link function: logit
##
## Formula:
## acc ~ 1 + s(trial) + s(trial, sbj, bs = "fs", m = 1)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.92712 0.03571 53.97 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(trial) 7.965 8.701 141.5 <2e-16 ***
## s(trial,sbj) 113.891 548.000 256.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0266 Deviance explained = 3.5%
## fREML = 27494 Scale est. = 1 n = 19520
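#to visualize the population-level learning curve implied by m0 on the
#probability scale (a sketch using itsadug, not part of the original analysis;
#rm.ranef drops the per-sbj factor smooths, plogis back-transforms the logit):
plot_smooth(fm, view = "trial", rm.ranef = TRUE, transform = plogis,
            xlab = "Trial", ylab = "Proportion Correct")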