Peekbank Missingness and Zoning Exploration

library(peekbankr)
library(tidyverse)
library(lme4)
library(lmerTest)
library(tictoc)
library(langcog)
library(here)
library(DT)

figure_path <- here("vignettes","figures")

FIRST_TIME = FALSE # set to true first time to download data from DB

knitr::opts_chunk$set(cache = TRUE, warn = FALSE,warning=FALSE, message = FALSE,cache.lazy = FALSE)

Get data

con <- connect_to_peekbank()
datasets <- get_datasets(connection = con) %>% collect()
administrations <- get_administrations(connection = con) %>% collect()
subjects <- get_subjects(connection = con) %>% collect()
tic()
aoi_timepoints <- get_aoi_timepoints(connection = con) %>% collect()
toc()
stimuli <- get_stimuli(connection = con) %>% collect()
trial_types <- get_trial_types(connection = con) %>% collect()
trials <- get_trials(connection = con)  %>% collect()

dataset_info <- administrations %>%
  right_join(datasets) %>%
  right_join(subjects) 

aoi_data_joined <- aoi_timepoints %>%
  right_join(administrations) %>%
  right_join(trials) %>%
  right_join(trial_types) %>%
  right_join(datasets) %>%
  mutate(stimulus_id = target_id) %>%
  right_join(stimuli) %>%
  filter(t_norm > t_range[1],
         t_norm < t_range[2])

save(file = here("brm/data/aoi_data_joined.Rds"), aoi_data_joined)
save(file = here("brm/data/dataset_info.Rds"), dataset_info)

Load data

load(file = here("vignettes", "data","aoi_data_joined.Rds"))
dataset_name_mapping <- read_csv(here("vignettes","data","dataset_name_mapping.csv"))
aoi_data_joined <- aoi_data_joined %>%
  left_join(dataset_name_mapping)

Compute missingness on trial level

We’re summarizing missingness information on the trial level here for two windows: 1. the entire trial window 2. a crtical test window (here, a traditional 300-2000ms window - could be varied)

#overall trial missingness and stickiness patterns
trial_missingness <- aoi_data_joined %>% group_by(dataset_name,dataset_rename,administration_id, subject_id,age, trial_id,trial_type_id) %>%
  summarize(
    num_obs=n(),
    window_size=max(t_norm)-min(t_norm),
    accuracy=case_when(
      sum(aoi %in% c("target","distractor"))==0 ~ NA_real_,
      TRUE ~ sum(aoi=="target")/(sum(aoi %in% c("target","distractor")))),
    percent_missing=sum(aoi %in% c("missing","other"))/num_obs,
    total_looking=sum(aoi %in% c("target","distractor")),
    percent_looking=total_looking/num_obs,
    zoner_trial=ifelse(accuracy ==0 | accuracy==1,1,0),
    zoner_type=case_when(
      accuracy==1 ~ "target",
      accuracy==0 ~ "distracter",
      TRUE ~ "none"
    )
  )


window_start <- 300
window_end <- 2000

#mean accuracy per trial
trial_accuracy <- aoi_data_joined %>% group_by(dataset_name,dataset_rename,administration_id, subject_id,age, trial_id,trial_type_id) %>%
  filter(t_norm>=window_start & t_norm <=window_end) %>%
  summarize(
    num_obs=n(),
    accuracy=case_when(
      sum(aoi %in% c("target","distractor"))==0 ~ NA_real_,
      TRUE ~ sum(aoi=="target")/(sum(aoi %in% c("target","distractor")))),
    percent_missing=sum(aoi %in% c("missing","other"))/num_obs,
    total_looking=sum(aoi %in% c("target","distractor")),
    percent_looking=total_looking/num_obs,
    zoner_trial=ifelse(accuracy ==0 | accuracy==1,1,0),
    zoner_type=case_when(
      accuracy==1 ~ "target",
      accuracy==0 ~ "distracter",
      TRUE ~ "none"
    )
  )

What percentage of trials are missing & zoners, across dataset?

Percent zoners (either looking exclusively at target or distractor throughout entire trial).

trial_missingness %>%
  group_by(dataset_rename) %>%
  summarize(
    percent_zoner_trials=mean(zoner_trial,na.rm=TRUE),
    percent_target_zoner=mean(accuracy==1,na.rm=TRUE),
    percent_distracter_zoner=mean(accuracy==0,na.rm=TRUE)) %>%
  DT::datatable()

Distribution of percent missing within the critical test window.

trial_accuracy %>%
  filter(dataset_rename!="yoursmy") %>% #ignore yoursmy until it is updated
  ggplot(aes(x=percent_missing)) +
  geom_density() +
  facet_wrap(~dataset_rename,nrow=3)

Relationship between percent missing and zoners to age

Upshot: Percent missing and zoning across the entire trial tends to decrease across age.

Percent missing across age

#Percent missing
trial_accuracy %>%
  ggplot(aes(x=age,y=percent_missing)) +
  geom_point(alpha=0.2)+
  geom_smooth(method="lm")+
  facet_wrap(~dataset_rename,nrow=3)

m <- lmer(percent_missing ~ age + (1|subject_id)+(1|dataset_rename)+(1|trial_type_id),data=trial_accuracy)
summary(m)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: percent_missing ~ age + (1 | subject_id) + (1 | dataset_rename) +  
##     (1 | trial_type_id)
##    Data: trial_accuracy
## 
## REML criterion at convergence: 14774
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.4457 -0.5426 -0.1683  0.3753  3.4524 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  trial_type_id  (Intercept) 0.002969 0.05449 
##  subject_id     (Intercept) 0.033525 0.18310 
##  dataset_rename (Intercept) 0.026866 0.16391 
##  Residual                   0.080410 0.28357 
## Number of obs: 35023, groups:  
## trial_type_id, 1664; subject_id, 1302; dataset_rename, 15
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)  3.443e-01  4.517e-02  1.711e+01   7.622 6.74e-07 ***
## age         -4.695e-03  5.177e-04  1.426e+03  -9.069  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##     (Intr)
## age -0.306

Nice example that it’s important to account for data set non-independence. If we ignore it, it looks like percent missing actually increases with age.

trial_accuracy %>%
  ggplot(aes(x=age,y=percent_missing)) +
  geom_point(alpha=0.2)+
  geom_smooth(method="lm")

Distribution of zoners across age - “zoning” tends to decrease with age

trial_missingness %>%
  ggplot(aes(x=age,y=zoner_trial)) +
  geom_point(alpha=0.2)+
  geom_smooth(method="lm")+
  facet_wrap(~dataset_rename,nrow=3)

#zoners
m <- glmer(zoner_trial ~ age + (1|subject_id)+(1|dataset_rename)+(1|trial_type_id),data=trial_accuracy,family=binomial)
summary(m)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: zoner_trial ~ age + (1 | subject_id) + (1 | dataset_rename) +  
##     (1 | trial_type_id)
##    Data: trial_accuracy
## 
##      AIC      BIC   logLik deviance df.resid 
##  36688.0  36729.4 -18339.0  36678.0    29118 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.9806 -0.7293 -0.5670  1.0640  3.1435 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  trial_type_id  (Intercept) 0.1482   0.3849  
##  subject_id     (Intercept) 0.2617   0.5116  
##  dataset_rename (Intercept) 0.1402   0.3744  
## Number of obs: 29123, groups:  
## trial_type_id, 1633; subject_id, 1291; dataset_rename, 15
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.335649   0.116430  -2.883  0.00394 ** 
## age         -0.011825   0.002115  -5.591 2.26e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##     (Intr)
## age -0.488

Relationship between “Zoning” (subjects) and accuracy

It appears that having a higher percentage of zoning trials in general is related to slightly lower accuracy

#compute percent zoner trials for each participant
subj_zoners <- trial_missingness %>%
  group_by(dataset_name,dataset_rename, subject_id) %>%
  summarize(
    num_trials=n(),
    num_zoner_trials=sum(zoner_trial,na.rm=TRUE),
    percent_zoner_trials=num_zoner_trials/num_trials
  )
#compute accuracy for a 300-2000 target window
subj_accuracy <- trial_accuracy %>%
  group_by(dataset_name,dataset_rename,subject_id) %>%
  mutate(
    weighted_age=mean(age,na.rm=TRUE)
  ) %>%
  group_by(dataset_name,dataset_rename,subject_id,weighted_age) %>%
  summarize(
    num_trials=n(),
    mean_accuracy=mean(accuracy,na.rm=TRUE)
  )  

#combine
subj_accuracy <- subj_accuracy %>%
  left_join(subj_zoners, by=c("dataset_rename","subject_id"))

#Plot relationship between percent zoner trials and accuracy
ggplot(subj_accuracy,aes(percent_zoner_trials, mean_accuracy,color=dataset_rename))+
  geom_point(alpha=0.2)+
  geom_smooth(method="lm",se=FALSE)

Number of trials and relation to reliability?

TO DO