Peekbank image import

Author

Tarun Sepuri

Published

March 16, 2026

Download peekbankr

If you haven’t downloaded peekbankr (https://github.com/langcog/peekbankr) yet, be sure to do so first by uncommenting the lines below.

knitr::opts_chunk$set(cache = FALSE, arn = FALSE,warning=FALSE, message = FALSE,cache.lazy = FALSE)

Preliminaries and data loading

Load packages. Since it takes a while to download and join the data, you probably want to just do that once, and then save the resulting dataset. Setting the parameter FIRST_TIME to FALSE after you run the script the first time allows you to bypass the data download process on subsequent runs. You can also use the most recent data file uploaded to GitHub.

FIRST_TIME = FALSE # set to true first time to download data from DB

if (FIRST_TIME) {
  install.packages("remotes") # can also use devtools
  remotes::install_github("peekbank/peekbankr", force=TRUE)
}

library(peekbankr)
library(tidyverse)
library(here)
library(RColorBrewer)
library(cowplot)
library(magrittr)
theme_set(theme_cowplot())
source("helpers.R")
if (FIRST_TIME) {
  
  conn = connect_to_peekbank("2026.1",port=3307)
  administrations <- peekbankr::get_administrations(connection = conn) %>% dplyr::collect()
  subjects <- peekbankr::get_subjects(connection = conn) %>% dplyr::collect()
  aoi_timepoints <- peekbankr::get_aoi_timepoints(connection = conn) %>% dplyr::collect()
  
  # need to reconnect since aoi_timepoints takes a long time and causes our connection to drop
  conn <- connect_to_peekbank("2026.1", port=3307)
  all_stimuli <- peekbankr::get_stimuli(connection = conn) %>% dplyr::collect()
  trial_types <- peekbankr::get_trial_types(connection = conn) %>% dplyr::collect()
  trials <- peekbankr::get_trials(connection = conn) %>% dplyr::collect()
  saveRDS(administrations, here("intermediates/administrations.Rds"))
  saveRDS(subjects, here("intermediates/subjects.Rds"))
  saveRDS(aoi_timepoints, here("intermediates/aoi_timepoints.Rds"))
  saveRDS(all_stimuli, here("intermediates/all_stimuli.Rds"))
  saveRDS(trial_types, here("intermediates/trial_types.Rds"))
  saveRDS(trials, here("intermediates/trials.Rds"))
  DBI::dbDisconnect(conn)
} else {
  administrations <- readRDS(here("intermediates/administrations.Rds"))
  subjects <- readRDS(here("intermediates/subjects.Rds"))
  aoi_timepoints <- readRDS(here("intermediates/aoi_timepoints.Rds"))
  all_stimuli <- readRDS(here("intermediates/all_stimuli.Rds"))
  trial_types <- readRDS(here("intermediates/trial_types.Rds"))
  trials <- readRDS(here("intermediates/trials.Rds"))
}

dataset level subject counts - more on this in the Descriptives section

subj_counts <- administrations |> left_join(subjects) |>
  distinct(subject_id, .keep_all=TRUE) |>
  summarize(n = n(), .by=c("dataset_name", "dataset_id"))

Downloading stimuli files

if (FIRST_TIME) {
  conn = connect_to_peekbank("2026.1",port=3307)
  downloaded_stimuli <- peekbankr::download_stimuli(conn, local_base_dir = "stimulus_data")
  DBI::dbDisconnect(conn)
  saveRDS(downloaded_stimuli, here("intermediates/downloaded_stimuli.Rds"))
} else {
  downloaded_stimuli <- readRDS(here("intermediates/downloaded_stimuli.Rds"))
}
stimuli_dataset_information <- downloaded_stimuli |>
  group_by(dataset_id, dataset_name) |>
  summarise(
    stimuli_with_path = n(),  # Total stimuli with defined paths in this dataset
    total_stimuli = length(all_stimuli$stimulus_id[all_stimuli$dataset_id == dataset_id]),
    prop_with_defined_path = (stimuli_with_path / total_stimuli),  # Percentage of total stimuli
  )

stimuli_dataset_information
# A tibble: 26 × 5
# Groups:   dataset_id [26]
   dataset_id dataset_name          stimuli_with_path total_stimuli
        <int> <chr>                             <int>         <int>
 1          0 potter_remix                         16            16
 2          1 pomper_salientme                     28            28
 3          3 baumgartner_2014                     10            10
 4          4 sander-montant_2022                  12            47
 5          6 luchkina_waxman_2024                 24            24
 6          7 pomper_dimy                         102           102
 7          8 casillas_tseltal_2015                30            30
 8         10 perry_cowpig                         24            24
 9         12 mahr_coartic                         28            28
10         14 fernald_marchman_2012               104           104
# ℹ 16 more rows
# ℹ 1 more variable: prop_with_defined_path <dbl>

Just plotting dataset stimuli information

ggplot(stimuli_dataset_information, aes(x=prop_with_defined_path, y=stimuli_with_path, color=dataset_name)) +
  xlab("Proportion of stimuli available") +
  ylab("Number of stimuli available") +
  geom_point(size = 4, alpha=0.3) 

Create a CSV file of all of the pairs of images in all of the trials that we have stimuli files for

stimuli_trial_info <-  downloaded_stimuli |> 
      select(stimulus_id, image_description, local_stimulus_path)

downloaded_stimuli_trials <- trial_types |>
  left_join(
    stimuli_trial_info |> 
      rename_with(~ paste0(.x, "_target"), everything()),
    by = c("target_id" = "stimulus_id_target")
  ) |>
  left_join(
    stimuli_trial_info |> 
      rename_with(~ paste0(.x, "_distractor"), everything()), 
    by = c("distractor_id" = "stimulus_id_distractor")
  ) |>
  # This ensures that both the target and distractor do exist.
  filter(!is.na(local_stimulus_path_distractor) & !is.na(local_stimulus_path_target)) |>
  mutate(
    unique_pair = pmap_chr(list(target_id, distractor_id), ~ paste(sort(c(.x, .y)), collapse = "_"))
  ) |>
  select(-condition, -lab_trial_id, -aoi_region_set_id, -point_of_disambiguation)

# unique pairs keep trial-distractor pairing, we only need to find similarities for the trials that we but we will probably need to do pair-wise comparisons down the line
unique_stimuli_pairs <- downloaded_stimuli_trials |>
  distinct(unique_pair, .keep_all=TRUE) |>
  rename(text1=image_description_target, text2=image_description_distractor, image1=local_stimulus_path_target, image2=local_stimulus_path_distractor,dataset_name=dataset_name)

write.csv(unique_stimuli_pairs,file=here("peekbank_stimuli.csv"))
paste("Datasets")
[1] "Datasets"
unique(downloaded_stimuli_trials$dataset_name)
 [1] "adams_marchman_2018"       "bacon_gendercues"         
 [3] "baumgartner_2014"          "byers-heinlein_2017"      
 [5] "casillas_tseltal_2015"     "fernald_marchman_2012"    
 [7] "frank_tablet_2016"         "garrison_bergelson_2020"  
 [9] "kartushina_2019"           "kremin_2021"              
[11] "luchkina_waxman_2024"      "mahr_coartic"             
[13] "moore_bergelson_2022_verb" "perry_cowpig"             
[15] "pomper_dimy"               "pomper_prime"             
[17] "pomper_saffran_2016"       "pomper_salientme"         
[19] "pomper_yumme"              "potter_canine"            
[21] "potter_remix"              "reflook_v4"               
[23] "sander-montant_2022"       "weaver_saffran_2026"      
[25] "weaver_zettersten_2024"    "yoon_simpimp_2015"        

Processing stimuli AOIs

downloaded_aois <- trials |>
  filter(excluded == 0 & trial_type_id %in% downloaded_stimuli_trials$trial_type_id) |>
  left_join(aoi_timepoints) |>
  mutate(accuracy = ifelse(aoi == "target", 1, 0),
    not_looking_away = aoi == "target" | aoi == "distractor",
    accuracy = ifelse(not_looking_away, accuracy, NA))
count(trials |> left_join(trial_types) |>
  distinct(target_id, distractor_id))
# A tibble: 1 × 1
      n
  <int>
1  2277

Summarize usable trials with stimuli

helpers

# Function to summarize whether a trial is usable based on whether the subject is looking at the screen for greater than 50% of the critical window
summarize_subj_usable_trials <- function(data, critical_window, suffix, additional_fields=NULL) {
  additional_fields <- additional_fields %||% list()
  
  data %>%
    filter(t_norm >= critical_window[1] &
             t_norm <= critical_window[2]) %>%
    group_by(administration_id, trial_id, trial_type_id) %>%
    summarize(
      length = n(),
      usable_frames = sum(not_looking_away, na.rm = TRUE),
      percent_usable = usable_frames / length,
      usable = ifelse(percent_usable >= 0.5, 1, 0), # usable if at least 50% looking
      mean_target_looking = mean(accuracy, na.rm = TRUE),
      !!!additional_fields,
    ) %>%
    rename_with(~ paste0(., "_", suffix), -c(administration_id, trial_id, trial_type_id))
}

# Function to compute whether a trial is usable based on whether both the critical window and the baseline window are usable
compute_usable_trial <- function(baseline_col, critical_col) {
  case_when(
    is.na(baseline_col) ~ 0,
    is.na(critical_col) ~ 0,
    baseline_col == 1 & critical_col == 1 ~ 1,
    TRUE ~ 0
  )
}

# Calculate mean, standard deviation, standard error and confidence intervals for data grouped across two variables
summarized_data <- function(data, x_var, y_var, group_var) {
  return(data |>
           group_by(across(all_of(c(x_var, group_var)))) |>
           summarize(
                   #across(everything(), ~ if (n_distinct(.) == 1) first(.) else NA),
                    mean_value = mean(.data[[y_var]], na.rm = TRUE),
                     sd_value = sd(.data[[y_var]], na.rm = TRUE),
                     N = n(),
                     se = sd_value / sqrt(n()),
                     ci=qt(0.975, N-1)*sd_value/sqrt(N),
                     lower_ci=mean_value-ci,
                     upper_ci=mean_value+ci,
                     .groups = 'drop') |>
           select(where(~ !all(is.na(.))))
  )
}

using same settings as visvocab

critical_window <- c(300,3500)
critical_window_short <- c(300,1800)
baseline_window <- c(-2000,0)

# summarize critical window
summarize_subj_usable_trials_critical_window <- summarize_subj_usable_trials(
  data = downloaded_aois,
  critical_window = critical_window,
  suffix = "critical_window"
)
# summarize short critical window
summarize_subj_usable_trials_critical_window_short <- summarize_subj_usable_trials(
  data = downloaded_aois,
  critical_window = critical_window_short,
  suffix = "critical_window_short"
)

# combine into one dataset
summarize_subj_usable_trials_critical_window <- summarize_subj_usable_trials_critical_window %>%
  left_join(summarize_subj_usable_trials_critical_window_short)

# summarize baseline window information
summarize_subj_usable_trials_baseline_window <- summarize_subj_usable_trials(
  data = downloaded_aois,
  critical_window = baseline_window,
  suffix = "baseline_window")

#overall usable trials
summarize_subj_trials <- downloaded_aois %>%
  distinct(administration_id, trial_id, trial_type_id) %>%
  left_join(summarize_subj_usable_trials_critical_window) %>%
  left_join(summarize_subj_usable_trials_baseline_window) %>%
  mutate(
    usable_window = compute_usable_trial(usable_baseline_window, usable_critical_window),
    usable_window_short = compute_usable_trial(usable_baseline_window, usable_critical_window_short),
    corrected_target_looking = mean_target_looking_critical_window - mean_target_looking_baseline_window,
    corrected_target_looking_short = mean_target_looking_critical_window_short - mean_target_looking_baseline_window
  )
usable_trials_summarized <- summarize_subj_trials %>%
  filter(usable_window == 1)

Overall timecourse plot of proportion target looking

#summarizing within subject for each time point
summarize_subj_aois <- summarized_data(downloaded_aois |> filter(!is.na(accuracy) & trial_id %in% usable_trials_summarized$trial_id), "t_norm", "accuracy", "administration_id") |> rename(mean_accuracy = mean_value)

#summarizing across subjects for each time point
summarize_across_subj_aois <- summarized_data(summarize_subj_aois, "t_norm", "mean_accuracy", "t_norm") |> rename(accuracy = mean_value)

looking_times <- ggplot(summarize_across_subj_aois,aes(t_norm,accuracy))+
  xlim(-2000,4000)+
  geom_errorbar(aes(ymin=accuracy-ci,ymax=accuracy+ci),width=0, alpha=0.2)+
  #geom_point(alpha=0.2)+
    geom_smooth(method="gam")+
  geom_vline(xintercept=0,size=1.5)+
  geom_hline(yintercept=0.5,size=1.2,linetype="dashed")+
  geom_vline(xintercept=300,linetype="dotted")+
  ylim(0,1)+
  xlab("Time (normalized to target word onset) in ms")+
  ylab("Proportion Target Looking")
looking_times

ggsave(here("figures","prop_looking_across_time.png"),looking_times,width=9,height=6,bg = "white")

Age-related effects

summarize_subjects <- summarized_data(usable_trials_summarized |> 
                                        left_join(administrations), "administration_id", "corrected_target_looking", "age")
ggplot(summarize_subjects, aes(x = age, y = mean_value)) +
  geom_hline(yintercept = 0.0, linetype = "dashed") +
  geom_point(size = 4, alpha = 0.1,
             position = position_jitter(width = 0.1, seed = 123)) +
  geom_smooth(method="lm") +
  scale_x_continuous(breaks = seq(5, 75, by = 10)) +
  labs(x = "Age in months", y = "Corrected prop. of target looking by age") +
  ggtitle("Corrected proportion of target looking by age") +
  theme_minimal() +
  ggpubr::stat_cor(method = "pearson")

#Loading CLIP similarities

clip_similarities <- read.csv(here("data", "similarities-clip_data.csv")) |> 
  mutate(text_similarity = ifelse(grepl("/", text1), NA, text_similarity))

usable_trials_summarized_with_sims <- usable_trials_summarized |>
  left_join(downloaded_stimuli_trials |> select(trial_type_id, unique_pair, local_stimulus_path_target, local_stimulus_path_distractor, image_description_target, image_description_distractor, target_id, distractor_id), by=c("trial_type_id")) |>
  left_join(clip_similarities |> select(-trial_type_id, -target_id, -distractor_id), by = c("unique_pair")) |>
  left_join(administrations |> select("administration_id", "age") )|>
  group_by(unique_pair) |>
  # similarity scores are only generated for a single trial type ID so here we're mapping those scores to other trials that have the same image pair 
  mutate(
    across(
      ends_with("similarity"),
      ~ifelse(is.na(.), first(.[!is.na(.)]), .)
    )
  ) |>
  ungroup() 

cols_to_update <- colnames(clip_similarities|> select(-trial_type_id, -target_id, -distractor_id)) 
usable_trials_imagepair_sims <- usable_trials_summarized_with_sims |>
  mutate(unique_imagepair = pmap_chr(list(local_stimulus_path_target, local_stimulus_path_distractor), ~ paste(sort(c(.x, .y)), collapse = "_")),
         unique_textpair = pmap_chr(list(image_description_target, image_description_distractor), ~ paste(sort(c(.x, .y)), collapse = "_"))) |>
  group_by(unique_imagepair, unique_textpair) |>
  mutate(
    across(
      all_of(cols_to_update),
      ~ifelse(is.na(.), first(.[!is.na(.)]), .)
    )) |>
  ungroup() 

Filtering out trials which on the Peekbank-side allegedly had images but did not in actuality

unusable_trials <- usable_trials_imagepair_sims |> filter(is.na(image_similarity)) |> distinct(unique_pair, .keep_all=TRUE) |> select(dataset_name, unique_pair, target_id, distractor_id, image1, image2)

also filtering out non-vanilla trials and datasets

usable_trials_summarized_with_sims <- usable_trials_imagepair_sims |> filter(!is.na(image_similarity))
non_vanilla_trials <- usable_trials_summarized_with_sims |> filter(vanilla_trial == 0)
usable_trials_summarized_with_sims <- usable_trials_summarized_with_sims |> filter(vanilla_trial == 1)
non_vanilla_datasets <- non_vanilla_trials |> distinct(dataset_name) |> anti_join(usable_trials_summarized_with_sims |> distinct(dataset_name))
non_vanilla_datasets
# A tibble: 2 × 1
  dataset_name        
  <chr>               
1 baumgartner_2014    
2 luchkina_waxman_2024

Exporting trial-level data

write.csv(usable_trials_summarized_with_sims, here("data/usable_trials_with_similarities.csv"))

Printing data counts

# trial types for which all of the trials were marked as unusable
clip_data_summarized <- summarize_similarity_data_collapsed(usable_trials_summarized_with_sims, extra_fields = c("dataset_name", "vanilla_trial")) 
non_usable_unique_pairs <- clip_similarities |>
  anti_join(usable_trials_summarized |>
  left_join(downloaded_stimuli_trials |> select(trial_type_id, unique_pair, local_stimulus_path_target, local_stimulus_path_distractor), by=c("trial_type_id")), by = c("unique_pair")) |>
  distinct(unique_pair, .keep_all = TRUE)

trial_types <- trial_types |>
 mutate(unique_pair = pmap_chr(list(target_id, distractor_id), ~ paste(sort(c(.x, .y)), collapse = "_")))

print(paste("Number of datasets:", count(usable_trials_summarized_with_sims |> distinct(dataset_name))))
[1] "Number of datasets: 24"
print(paste("Total subjects:", count(usable_trials_summarized_with_sims|> left_join(administrations |> select(administration_id, subject_id)) |> distinct(subject_id))))
[1] "Total subjects: 1472"
print(paste("Total trials:", count(usable_trials_summarized_with_sims)))
[1] "Total trials: 23064"
cat("\nUnique label-target-distractor pairs\n")

Unique label-target-distractor pairs
print(paste("Number of unique label-target-distractor pairs:", count(usable_trials_summarized_with_sims |> distinct(image_description_target, image_description_distractor, dataset_name))))
[1] "Number of unique label-target-distractor pairs: 499"
print(paste("Number of unique label-target-distractor pairs with unique image-target distractor pairs:", count(usable_trials_summarized_with_sims |> distinct(image_description_target, image_description_distractor, image1, image2, dataset_name, full_phrase_language))))
[1] "Number of unique label-target-distractor pairs with unique image-target distractor pairs: 854"
print(paste("Number of unique label-target-distractor ids:", count(usable_trials_summarized_with_sims |> distinct(target_id, distractor_id, dataset_name))))
[1] "Number of unique label-target-distractor ids: 887"
print(paste("Total label-target-distractor pairs in Peekbank:", count(trial_types |> distinct(target_id, distractor_id))))
[1] "Total label-target-distractor pairs in Peekbank: 2277"
cat("\nUnique stimulus pairs\n")

Unique stimulus pairs
print(paste("Number of unique image and text pairs:", count(usable_trials_summarized_with_sims |> distinct(unique_textpair, unique_imagepair, full_phrase_language))))
[1] "Number of unique image and text pairs: 482"
print(paste("Number of unique stimulus id pairs:", count(usable_trials_summarized_with_sims |> distinct(unique_pair))))
[1] "Number of unique stimulus id pairs: 522"
print(paste("Number of unique image pairs with no usable trials:", count(non_usable_unique_pairs)))
[1] "Number of unique image pairs with no usable trials: 149"
print(paste("Total unique stimulus id pairs in Peekbank:", count(trial_types |> distinct(unique_pair))))
[1] "Total unique stimulus id pairs in Peekbank: 1518"
cat("\nTrial types\n")

Trial types
print(paste("Number of trial types:", count(usable_trials_summarized_with_sims |> distinct(trial_type_id))))
[1] "Number of trial types: 1847"
print(paste("Total trial types in Peekbank:", count(trial_types |> distinct(trial_type_id))))
[1] "Total trial types in Peekbank: 6918"

Descriptive plots

label-target-distractor combinations

trial_counts <- clip_data_summarized |>
  #filter(N > 10) |>
  distinct(image_description_target, image_description_distractor, dataset_name) |>
  summarize(count = n(), .by="dataset_name") |>
  arrange(count)

# Create stacked bar chart
ggplot(trial_counts, aes(x = reorder(dataset_name, -count), y = count)) +
  geom_col(fill="skyblue") +
  labs(title = "Number of unique label-target-distractor combinations",
       x = "Dataset Name", 
       y = "Count",
       fill = "Trial Type") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

raw trial counts

# Function to calculate Pearson's correlation per epoch
round_to_nearest <- function(x, round_to=3) {
  round(x / round_to) * round_to
}

# rounding each participant to the closest 5
age_based_trials <- usable_trials_summarized_with_sims |> mutate(
  rounded_age = round_to_nearest(age, round_to=5)
)

clip_age_stacked_bar <- age_based_trials |>
  #filter(N > 10) |>
  group_by(rounded_age, dataset_name) |>
  summarize(trial_count = n(), .groups = "drop")

ggplot(clip_age_stacked_bar, aes(x = factor(rounded_age), y = trial_count, fill = dataset_name)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Number of Trials by Dataset and Age",
    x = "Age in months (rounded to nearest 5 month)",
    y = "Number of Trials",
    fill = "Dataset"
  ) +
  theme_minimal()

Raw participant counts

Per dataset

trials_with_subject_info <- usable_trials_summarized_with_sims |> left_join(administrations |> select(administration_id, subject_id)) |>
  left_join(subjects |> select(subject_id, subject_aux_data)) 

subject_counts <- trials_with_subject_info |>
  mutate(has_cdi = grepl("cdi", subject_aux_data)) |>
  summarize(trial_count = n(), .by=c(has_cdi, subject_id, dataset_name))

dataset_subject_counts <- subject_counts |>
  summarize(subject_count = n(), average_trial_count = mean(trial_count), cdi_count = sum(has_cdi == TRUE), non_cdi_count = (sum(has_cdi == FALSE)), .by=dataset_name)  |>
  pivot_longer(cols = c(cdi_count, non_cdi_count), 
               names_to = "cdi", 
               values_to = "count")

# Create stacked bar chart
ggplot(dataset_subject_counts, aes(x = reorder(paste0(dataset_name,"(subj trials M=", round(average_trial_count, 1), ")"), -count), y = count)) +
  geom_bar(stat = "identity", position = "stack", fill="skyblue") +
  labs(title = "Number of participants",
       x = "Dataset Name", 
       y = "Count") +
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))