library(peekbankr)
library(tidyverse)
library(lme4)
library(lmerTest)
library(tictoc)
library(langcog)
library(here)
library(DT)
figure_path <- here("vignettes","figures")
FIRST_TIME = FALSE # set to true first time to download data from DB
knitr::opts_chunk$set(cache = TRUE, warn = FALSE,warning=FALSE, message = FALSE,cache.lazy = FALSE)
con <- connect_to_peekbank()
datasets <- get_datasets(connection = con) %>% collect()
administrations <- get_administrations(connection = con) %>% collect()
subjects <- get_subjects(connection = con) %>% collect()
tic()
aoi_timepoints <- get_aoi_timepoints(connection = con) %>% collect()
toc()
stimuli <- get_stimuli(connection = con) %>% collect()
trial_types <- get_trial_types(connection = con) %>% collect()
trials <- get_trials(connection = con) %>% collect()
dataset_info <- administrations %>%
right_join(datasets) %>%
right_join(subjects)
aoi_data_joined <- aoi_timepoints %>%
right_join(administrations) %>%
right_join(trials) %>%
right_join(trial_types) %>%
right_join(datasets) %>%
mutate(stimulus_id = target_id) %>%
right_join(stimuli) %>%
filter(t_norm > t_range[1],
t_norm < t_range[2])
save(file = here("brm/data/aoi_data_joined.Rds"), aoi_data_joined)
save(file = here("brm/data/dataset_info.Rds"), dataset_info)
load(file = here("vignettes", "data","aoi_data_joined.Rds"))
dataset_name_mapping <- read_csv(here("vignettes","data","dataset_name_mapping.csv"))
aoi_data_joined <- aoi_data_joined %>%
left_join(dataset_name_mapping)
We’re summarizing missingness information on the trial level here for two windows: 1. the entire trial window 2. a crtical test window (here, a traditional 300-2000ms window - could be varied)
#overall trial missingness and stickiness patterns
trial_missingness <- aoi_data_joined %>% group_by(dataset_name,dataset_rename,administration_id, subject_id,age, trial_id,trial_type_id) %>%
summarize(
num_obs=n(),
window_size=max(t_norm)-min(t_norm),
accuracy=case_when(
sum(aoi %in% c("target","distractor"))==0 ~ NA_real_,
TRUE ~ sum(aoi=="target")/(sum(aoi %in% c("target","distractor")))),
percent_missing=sum(aoi %in% c("missing","other"))/num_obs,
total_looking=sum(aoi %in% c("target","distractor")),
percent_looking=total_looking/num_obs,
zoner_trial=ifelse(accuracy ==0 | accuracy==1,1,0),
zoner_type=case_when(
accuracy==1 ~ "target",
accuracy==0 ~ "distracter",
TRUE ~ "none"
)
)
window_start <- 300
window_end <- 2000
#mean accuracy per trial
trial_accuracy <- aoi_data_joined %>% group_by(dataset_name,dataset_rename,administration_id, subject_id,age, trial_id,trial_type_id) %>%
filter(t_norm>=window_start & t_norm <=window_end) %>%
summarize(
num_obs=n(),
accuracy=case_when(
sum(aoi %in% c("target","distractor"))==0 ~ NA_real_,
TRUE ~ sum(aoi=="target")/(sum(aoi %in% c("target","distractor")))),
percent_missing=sum(aoi %in% c("missing","other"))/num_obs,
total_looking=sum(aoi %in% c("target","distractor")),
percent_looking=total_looking/num_obs,
zoner_trial=ifelse(accuracy ==0 | accuracy==1,1,0),
zoner_type=case_when(
accuracy==1 ~ "target",
accuracy==0 ~ "distracter",
TRUE ~ "none"
)
)
Percent zoners (either looking exclusively at target or distractor throughout entire trial).
trial_missingness %>%
group_by(dataset_rename) %>%
summarize(
percent_zoner_trials=mean(zoner_trial,na.rm=TRUE),
percent_target_zoner=mean(accuracy==1,na.rm=TRUE),
percent_distracter_zoner=mean(accuracy==0,na.rm=TRUE)) %>%
DT::datatable()
Distribution of percent missing within the critical test window.
trial_accuracy %>%
filter(dataset_rename!="yoursmy") %>% #ignore yoursmy until it is updated
ggplot(aes(x=percent_missing)) +
geom_density() +
facet_wrap(~dataset_rename,nrow=3)
Upshot: Percent missing and zoning across the entire trial tends to decrease across age.
Percent missing across age
#Percent missing
trial_accuracy %>%
ggplot(aes(x=age,y=percent_missing)) +
geom_point(alpha=0.2)+
geom_smooth(method="lm")+
facet_wrap(~dataset_rename,nrow=3)
m <- lmer(percent_missing ~ age + (1|subject_id)+(1|dataset_rename)+(1|trial_type_id),data=trial_accuracy)
summary(m)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: percent_missing ~ age + (1 | subject_id) + (1 | dataset_rename) +
## (1 | trial_type_id)
## Data: trial_accuracy
##
## REML criterion at convergence: 14774
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.4457 -0.5426 -0.1683 0.3753 3.4524
##
## Random effects:
## Groups Name Variance Std.Dev.
## trial_type_id (Intercept) 0.002969 0.05449
## subject_id (Intercept) 0.033525 0.18310
## dataset_rename (Intercept) 0.026866 0.16391
## Residual 0.080410 0.28357
## Number of obs: 35023, groups:
## trial_type_id, 1664; subject_id, 1302; dataset_rename, 15
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 3.443e-01 4.517e-02 1.711e+01 7.622 6.74e-07 ***
## age -4.695e-03 5.177e-04 1.426e+03 -9.069 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age -0.306
Nice example that it’s important to account for data set non-independence. If we ignore it, it looks like percent missing actually increases with age.
trial_accuracy %>%
ggplot(aes(x=age,y=percent_missing)) +
geom_point(alpha=0.2)+
geom_smooth(method="lm")
Distribution of zoners across age - “zoning” tends to decrease with age
trial_missingness %>%
ggplot(aes(x=age,y=zoner_trial)) +
geom_point(alpha=0.2)+
geom_smooth(method="lm")+
facet_wrap(~dataset_rename,nrow=3)
#zoners
m <- glmer(zoner_trial ~ age + (1|subject_id)+(1|dataset_rename)+(1|trial_type_id),data=trial_accuracy,family=binomial)
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: zoner_trial ~ age + (1 | subject_id) + (1 | dataset_rename) +
## (1 | trial_type_id)
## Data: trial_accuracy
##
## AIC BIC logLik deviance df.resid
## 36688.0 36729.4 -18339.0 36678.0 29118
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.9806 -0.7293 -0.5670 1.0640 3.1435
##
## Random effects:
## Groups Name Variance Std.Dev.
## trial_type_id (Intercept) 0.1482 0.3849
## subject_id (Intercept) 0.2617 0.5116
## dataset_rename (Intercept) 0.1402 0.3744
## Number of obs: 29123, groups:
## trial_type_id, 1633; subject_id, 1291; dataset_rename, 15
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.335649 0.116430 -2.883 0.00394 **
## age -0.011825 0.002115 -5.591 2.26e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age -0.488
It appears that having a higher percentage of zoning trials in general is related to slightly lower accuracy
#compute percent zoner trials for each participant
subj_zoners <- trial_missingness %>%
group_by(dataset_name,dataset_rename, subject_id) %>%
summarize(
num_trials=n(),
num_zoner_trials=sum(zoner_trial,na.rm=TRUE),
percent_zoner_trials=num_zoner_trials/num_trials
)
#compute accuracy for a 300-2000 target window
subj_accuracy <- trial_accuracy %>%
group_by(dataset_name,dataset_rename,subject_id) %>%
mutate(
weighted_age=mean(age,na.rm=TRUE)
) %>%
group_by(dataset_name,dataset_rename,subject_id,weighted_age) %>%
summarize(
num_trials=n(),
mean_accuracy=mean(accuracy,na.rm=TRUE)
)
#combine
subj_accuracy <- subj_accuracy %>%
left_join(subj_zoners, by=c("dataset_rename","subject_id"))
#Plot relationship between percent zoner trials and accuracy
ggplot(subj_accuracy,aes(percent_zoner_trials, mean_accuracy,color=dataset_rename))+
geom_point(alpha=0.2)+
geom_smooth(method="lm",se=FALSE)
Number of trials and relation to reliability?
TO DO