library(peekbankr)
library(tidyverse)
library(lme4)
library(lmerTest)
library(tictoc)
library(langcog)
library(here)
figure_path <- here("vignettes","figures")
FIRST_TIME = FALSE # set to true first time to download data from DB
knitr::opts_chunk$set(cache = TRUE, warn = FALSE,warning=FALSE, message = FALSE,cache.lazy = FALSE)
(only on first time loading)
con <- connect_to_peekbank()
datasets <- get_datasets(connection = con) %>% collect()
administrations <- get_administrations(connection = con) %>% collect()
subjects <- get_subjects(connection = con) %>% collect()
tic()
aoi_timepoints <- get_aoi_timepoints(connection = con) %>% collect()
toc()
stimuli <- get_stimuli(connection = con) %>% collect()
trial_types <- get_trial_types(connection = con) %>% collect()
trials <- get_trials(connection = con) %>% collect()
dataset_info <- administrations %>%
right_join(datasets) %>%
right_join(subjects)
aoi_data_joined <- aoi_timepoints %>%
right_join(administrations) %>%
right_join(trials) %>%
right_join(trial_types) %>%
right_join(datasets) %>%
mutate(stimulus_id = target_id) %>%
right_join(stimuli) %>%
filter(t_norm > t_range[1],
t_norm < t_range[2])
save(file = here("brm/data/aoi_data_joined.Rds"), aoi_data_joined)
save(file = here("brm/data/dataset_info.Rds"), dataset_info)
load(file = here("vignettes", "data","aoi_data_joined.Rds"))
dataset_name_mapping <- read_csv(here("vignettes","data","dataset_name_mapping.csv"))
aoi_data_joined <- aoi_data_joined %>%
left_join(dataset_name_mapping)
Add the onset location (t_norm=0) for each trial
#determine onset location for each trial
onset_locations <- aoi_data_joined %>%
filter(t_norm==0) %>%
select(administration_id, subject_id, trial_id,trial_type_id,aoi) %>%
rename(onset_location=aoi)
#rejoin with aoi_data_joined
aoi_data_joined <- aoi_data_joined %>%
right_join(onset_locations)
We’re computing accuracy (percent target looking) over a 300-2000 window here, could check alternate windows.
window_start=300
window_end=2000
#mean accuracy per trial
trial_accuracy <- aoi_data_joined %>%
mutate(
onset_location_tdm=case_when(
onset_location=="other" ~ "missing",
TRUE ~ onset_location
)
) %>%
group_by(dataset_name,dataset_rename,administration_id, subject_id,age, trial_id,trial_type_id,onset_location_tdm) %>%
filter(t_norm>=window_start & t_norm <=window_end) %>%
summarize(
num_obs=n(),
accuracy=case_when(
sum(aoi %in% c("target","distractor"))==0 ~ NA_real_,
TRUE ~ sum(aoi=="target")/(sum(aoi %in% c("target","distractor")))),
percent_missing=sum(aoi %in% c("missing","other"))/num_obs,
total_looking=sum(aoi %in% c("target","distractor")),
percent_looking=total_looking/num_obs,
)
We’re plotting the difference in accuracy depending on where you are looking at the onset of the label (target, distracter, or somewhere else/ missing), and its relationship across age. The plot suggests an interaction between age and target onset location: if you are looking at the target at the onset of the label, you are generally more accurate than otherwise, but this difference is attenuated across age (with also generally increasing accuracy scores).
We’re plotting this first for all trials, and then after filtering based on a relatively strict missingness criterion (if more than 33% of the window is missing gaze data, the trial is excluded).
#Plot relationship between start location and accuracy (and age)
##all data points
ggplot(trial_accuracy,aes(age,accuracy,color=onset_location_tdm,group=onset_location_tdm)) +
geom_point(alpha=0.2)+
geom_smooth(method="lm")+
scale_color_brewer(
name="Onset Location",
type="qual",
palette="Set3"
)
ggsave(here(figure_path,"peekbank_item_onset_location_alldata.png"),width=10,height=6,dpi=600)
#Histogram of percent missing
hist(trial_accuracy$percent_missing)
#exclusions >33% missing
mean(trial_accuracy$percent_missing==0) # ~17% of trials are completely missing (no looks to target or distractor)
## [1] 0.1696212
mean(trial_accuracy$percent_missing>1/3) # excludes about 30% of total trials, so an additional ~13%
## [1] 0.3031601
trial_accuracy %>%
filter(percent_missing>1/3) %>%
ggplot(aes(age,accuracy,color=onset_location_tdm,group=onset_location_tdm)) +
geom_point(alpha=0.2)+
geom_smooth(method="lm")+
scale_color_brewer(
name="Onset Location",
type="qual",
palette="Set3"
)
ggsave(here(figure_path,"peekbank_item_onset_location_33perc_exclusion.png"),width=10,height=6,dpi=600)
The above plot does not take variation across datasets into account. We can try to see whether we see similar difference in accuracy depending on onset locationn across datasets.
TO DO: look at the interaction with age.
#summarize subject accuracy and dataset
subj_accuracy_dataset <- trial_accuracy %>%
filter(percent_missing>1/3) %>%
group_by(dataset_name,dataset_rename,subject_id) %>%
mutate(
weighted_age=mean(age,na.rm=TRUE)
) %>%
group_by(dataset_name,dataset_rename,subject_id,weighted_age,onset_location_tdm) %>%
summarize(
num_trials=n(),
mean_accuracy=mean(accuracy,na.rm=TRUE)
)
subj_accuracy_dataset %>%
ggplot(aes(onset_location_tdm,mean_accuracy,color=onset_location_tdm)) +
geom_boxplot()+
scale_color_brewer(
name="Onset Location",
type="qual",
palette="Set3"
)+
facet_wrap(~dataset_rename)
ggsave(here(figure_path,"peekbank_item_onset_location_33perc_exclusion_bydataset.png"),width=10,height=6,dpi=600)
Checking here whether a similar pattern holds for different windows. Here, adding an extra 1000ms to the end of the window (we do see similar patterns).
window_start=300
window_end=3000
#mean accuracy per trial
trial_accuracy <- aoi_data_joined %>%
mutate(
onset_location_tdm=case_when(
onset_location=="other" ~ "missing",
TRUE ~ onset_location
)
) %>%
group_by(dataset_name,dataset_rename,administration_id, subject_id,age, trial_id,trial_type_id,onset_location_tdm) %>%
filter(t_norm>=window_start & t_norm <=window_end) %>%
summarize(
num_obs=n(),
accuracy=case_when(
sum(aoi %in% c("target","distractor"))==0 ~ NA_real_,
TRUE ~ sum(aoi=="target")/(sum(aoi %in% c("target","distractor")))),
percent_missing=sum(aoi %in% c("missing","other"))/num_obs,
total_looking=sum(aoi %in% c("target","distractor")),
percent_looking=total_looking/num_obs,
)
#Plot relationship between start location and accuracy (and age)
##all data points
ggplot(trial_accuracy,aes(age,accuracy,color=onset_location_tdm,group=onset_location_tdm)) +
geom_point(alpha=0.2)+
geom_smooth(method="lm")+
scale_color_brewer(
name="Onset Location",
type="qual",
palette="Set3"
)
ggsave(here(figure_path,"peekbank_item_onset_location_alldata_3000end.png"),width=10,height=6,dpi=600)
#Histogram of percent missing
hist(trial_accuracy$percent_missing)
#exclusions >33% missing
mean(trial_accuracy$percent_missing==0) # ~17% of trials are completely missing (no looks to target or distractor)
## [1] 0.1037938
mean(trial_accuracy$percent_missing>1/3) # excludes about 30% of total trials, so an additional ~13%
## [1] 0.3263966
trial_accuracy %>%
filter(percent_missing>1/3) %>%
ggplot(aes(age,accuracy,color=onset_location_tdm,group=onset_location_tdm)) +
geom_point(alpha=0.2)+
geom_smooth(method="lm")+
scale_color_brewer(
name="Onset Location",
type="qual",
palette="Set3"
)
ggsave(here(figure_path,"peekbank_item_onset_location_33perc_exclusion_3000end.png"),width=10,height=6,dpi=600)