knitr::opts_chunk$set(cache = FALSE, arn = FALSE,warning=FALSE, message = FALSE,cache.lazy = FALSE)Peekbank image import
Download peekbankr
If you haven’t downloaded peekbankr (https://github.com/langcog/peekbankr) yet, be sure to do so first by uncommenting the lines below.
Preliminaries and data loading
Load packages. Since it takes a while to download and join the data, you probably want to just do that once, and then save the resulting dataset. Setting the parameter FIRST_TIME to FALSE after you run the script the first time allows you to bypass the data download process on subsequent runs. You can also use the most recent data file uploaded to GitHub.
FIRST_TIME = FALSE # set to true first time to download data from DB
if (FIRST_TIME) {
install.packages("remotes") # can also use devtools
remotes::install_github("peekbank/peekbankr", force=TRUE)
}
library(peekbankr)
library(tidyverse)
library(here)
library(RColorBrewer)
library(cowplot)
library(magrittr)
theme_set(theme_cowplot())
source("helpers.R")if (FIRST_TIME) {
conn = connect_to_peekbank("2026.1",port=3307)
administrations <- peekbankr::get_administrations(connection = conn) %>% dplyr::collect()
subjects <- peekbankr::get_subjects(connection = conn) %>% dplyr::collect()
aoi_timepoints <- peekbankr::get_aoi_timepoints(connection = conn) %>% dplyr::collect()
# need to reconnect since aoi_timepoints takes a long time and causes our connection to drop
conn <- connect_to_peekbank("2026.1", port=3307)
all_stimuli <- peekbankr::get_stimuli(connection = conn) %>% dplyr::collect()
trial_types <- peekbankr::get_trial_types(connection = conn) %>% dplyr::collect()
trials <- peekbankr::get_trials(connection = conn) %>% dplyr::collect()
saveRDS(administrations, here("intermediates/administrations.Rds"))
saveRDS(subjects, here("intermediates/subjects.Rds"))
saveRDS(aoi_timepoints, here("intermediates/aoi_timepoints.Rds"))
saveRDS(all_stimuli, here("intermediates/all_stimuli.Rds"))
saveRDS(trial_types, here("intermediates/trial_types.Rds"))
saveRDS(trials, here("intermediates/trials.Rds"))
DBI::dbDisconnect(conn)
} else {
administrations <- readRDS(here("intermediates/administrations.Rds"))
subjects <- readRDS(here("intermediates/subjects.Rds"))
aoi_timepoints <- readRDS(here("intermediates/aoi_timepoints.Rds"))
all_stimuli <- readRDS(here("intermediates/all_stimuli.Rds"))
trial_types <- readRDS(here("intermediates/trial_types.Rds"))
trials <- readRDS(here("intermediates/trials.Rds"))
}dataset level subject counts - more on this in the Descriptives section
subj_counts <- administrations |> left_join(subjects) |>
distinct(subject_id, .keep_all=TRUE) |>
summarize(n = n(), .by=c("dataset_name", "dataset_id"))Downloading stimuli files
if (FIRST_TIME) {
conn = connect_to_peekbank("2026.1",port=3307)
downloaded_stimuli <- peekbankr::download_stimuli(conn, local_base_dir = "stimulus_data")
DBI::dbDisconnect(conn)
saveRDS(downloaded_stimuli, here("intermediates/downloaded_stimuli.Rds"))
} else {
downloaded_stimuli <- readRDS(here("intermediates/downloaded_stimuli.Rds"))
}
stimuli_dataset_information <- downloaded_stimuli |>
group_by(dataset_id, dataset_name) |>
summarise(
stimuli_with_path = n(), # Total stimuli with defined paths in this dataset
total_stimuli = length(all_stimuli$stimulus_id[all_stimuli$dataset_id == dataset_id]),
prop_with_defined_path = (stimuli_with_path / total_stimuli), # Percentage of total stimuli
)
stimuli_dataset_information# A tibble: 26 × 5
# Groups: dataset_id [26]
dataset_id dataset_name stimuli_with_path total_stimuli
<int> <chr> <int> <int>
1 0 potter_remix 16 16
2 1 pomper_salientme 28 28
3 3 baumgartner_2014 10 10
4 4 sander-montant_2022 12 47
5 6 luchkina_waxman_2024 24 24
6 7 pomper_dimy 102 102
7 8 casillas_tseltal_2015 30 30
8 10 perry_cowpig 24 24
9 12 mahr_coartic 28 28
10 14 fernald_marchman_2012 104 104
# ℹ 16 more rows
# ℹ 1 more variable: prop_with_defined_path <dbl>
Just plotting dataset stimuli information
ggplot(stimuli_dataset_information, aes(x=prop_with_defined_path, y=stimuli_with_path, color=dataset_name)) +
xlab("Proportion of stimuli available") +
ylab("Number of stimuli available") +
geom_point(size = 4, alpha=0.3) Create a CSV file of all of the pairs of images in all of the trials that we have stimuli files for
stimuli_trial_info <- downloaded_stimuli |>
select(stimulus_id, image_description, local_stimulus_path)
downloaded_stimuli_trials <- trial_types |>
left_join(
stimuli_trial_info |>
rename_with(~ paste0(.x, "_target"), everything()),
by = c("target_id" = "stimulus_id_target")
) |>
left_join(
stimuli_trial_info |>
rename_with(~ paste0(.x, "_distractor"), everything()),
by = c("distractor_id" = "stimulus_id_distractor")
) |>
# This ensures that both the target and distractor do exist.
filter(!is.na(local_stimulus_path_distractor) & !is.na(local_stimulus_path_target)) |>
mutate(
unique_pair = pmap_chr(list(target_id, distractor_id), ~ paste(sort(c(.x, .y)), collapse = "_"))
) |>
select(-condition, -lab_trial_id, -aoi_region_set_id, -point_of_disambiguation)
# unique pairs keep trial-distractor pairing, we only need to find similarities for the trials that we but we will probably need to do pair-wise comparisons down the line
unique_stimuli_pairs <- downloaded_stimuli_trials |>
distinct(unique_pair, .keep_all=TRUE) |>
rename(text1=image_description_target, text2=image_description_distractor, image1=local_stimulus_path_target, image2=local_stimulus_path_distractor,dataset_name=dataset_name)
write.csv(unique_stimuli_pairs,file=here("peekbank_stimuli.csv"))paste("Datasets")[1] "Datasets"
unique(downloaded_stimuli_trials$dataset_name) [1] "adams_marchman_2018" "bacon_gendercues"
[3] "baumgartner_2014" "byers-heinlein_2017"
[5] "casillas_tseltal_2015" "fernald_marchman_2012"
[7] "frank_tablet_2016" "garrison_bergelson_2020"
[9] "kartushina_2019" "kremin_2021"
[11] "luchkina_waxman_2024" "mahr_coartic"
[13] "moore_bergelson_2022_verb" "perry_cowpig"
[15] "pomper_dimy" "pomper_prime"
[17] "pomper_saffran_2016" "pomper_salientme"
[19] "pomper_yumme" "potter_canine"
[21] "potter_remix" "reflook_v4"
[23] "sander-montant_2022" "weaver_saffran_2026"
[25] "weaver_zettersten_2024" "yoon_simpimp_2015"
Processing stimuli AOIs
downloaded_aois <- trials |>
filter(excluded == 0 & trial_type_id %in% downloaded_stimuli_trials$trial_type_id) |>
left_join(aoi_timepoints) |>
mutate(accuracy = ifelse(aoi == "target", 1, 0),
not_looking_away = aoi == "target" | aoi == "distractor",
accuracy = ifelse(not_looking_away, accuracy, NA))count(trials |> left_join(trial_types) |>
distinct(target_id, distractor_id))# A tibble: 1 × 1
n
<int>
1 2277
Summarize usable trials with stimuli
helpers
# Function to summarize whether a trial is usable based on whether the subject is looking at the screen for greater than 50% of the critical window
summarize_subj_usable_trials <- function(data, critical_window, suffix, additional_fields=NULL) {
additional_fields <- additional_fields %||% list()
data %>%
filter(t_norm >= critical_window[1] &
t_norm <= critical_window[2]) %>%
group_by(administration_id, trial_id, trial_type_id) %>%
summarize(
length = n(),
usable_frames = sum(not_looking_away, na.rm = TRUE),
percent_usable = usable_frames / length,
usable = ifelse(percent_usable >= 0.5, 1, 0), # usable if at least 50% looking
mean_target_looking = mean(accuracy, na.rm = TRUE),
!!!additional_fields,
) %>%
rename_with(~ paste0(., "_", suffix), -c(administration_id, trial_id, trial_type_id))
}
# Function to compute whether a trial is usable based on whether both the critical window and the baseline window are usable
compute_usable_trial <- function(baseline_col, critical_col) {
case_when(
is.na(baseline_col) ~ 0,
is.na(critical_col) ~ 0,
baseline_col == 1 & critical_col == 1 ~ 1,
TRUE ~ 0
)
}
# Calculate mean, standard deviation, standard error and confidence intervals for data grouped across two variables
summarized_data <- function(data, x_var, y_var, group_var) {
return(data |>
group_by(across(all_of(c(x_var, group_var)))) |>
summarize(
#across(everything(), ~ if (n_distinct(.) == 1) first(.) else NA),
mean_value = mean(.data[[y_var]], na.rm = TRUE),
sd_value = sd(.data[[y_var]], na.rm = TRUE),
N = n(),
se = sd_value / sqrt(n()),
ci=qt(0.975, N-1)*sd_value/sqrt(N),
lower_ci=mean_value-ci,
upper_ci=mean_value+ci,
.groups = 'drop') |>
select(where(~ !all(is.na(.))))
)
}using same settings as visvocab
critical_window <- c(300,3500)
critical_window_short <- c(300,1800)
baseline_window <- c(-2000,0)
# summarize critical window
summarize_subj_usable_trials_critical_window <- summarize_subj_usable_trials(
data = downloaded_aois,
critical_window = critical_window,
suffix = "critical_window"
)
# summarize short critical window
summarize_subj_usable_trials_critical_window_short <- summarize_subj_usable_trials(
data = downloaded_aois,
critical_window = critical_window_short,
suffix = "critical_window_short"
)
# combine into one dataset
summarize_subj_usable_trials_critical_window <- summarize_subj_usable_trials_critical_window %>%
left_join(summarize_subj_usable_trials_critical_window_short)
# summarize baseline window information
summarize_subj_usable_trials_baseline_window <- summarize_subj_usable_trials(
data = downloaded_aois,
critical_window = baseline_window,
suffix = "baseline_window")
#overall usable trials
summarize_subj_trials <- downloaded_aois %>%
distinct(administration_id, trial_id, trial_type_id) %>%
left_join(summarize_subj_usable_trials_critical_window) %>%
left_join(summarize_subj_usable_trials_baseline_window) %>%
mutate(
usable_window = compute_usable_trial(usable_baseline_window, usable_critical_window),
usable_window_short = compute_usable_trial(usable_baseline_window, usable_critical_window_short),
corrected_target_looking = mean_target_looking_critical_window - mean_target_looking_baseline_window,
corrected_target_looking_short = mean_target_looking_critical_window_short - mean_target_looking_baseline_window
)usable_trials_summarized <- summarize_subj_trials %>%
filter(usable_window == 1)Overall timecourse plot of proportion target looking
#summarizing within subject for each time point
summarize_subj_aois <- summarized_data(downloaded_aois |> filter(!is.na(accuracy) & trial_id %in% usable_trials_summarized$trial_id), "t_norm", "accuracy", "administration_id") |> rename(mean_accuracy = mean_value)
#summarizing across subjects for each time point
summarize_across_subj_aois <- summarized_data(summarize_subj_aois, "t_norm", "mean_accuracy", "t_norm") |> rename(accuracy = mean_value)
looking_times <- ggplot(summarize_across_subj_aois,aes(t_norm,accuracy))+
xlim(-2000,4000)+
geom_errorbar(aes(ymin=accuracy-ci,ymax=accuracy+ci),width=0, alpha=0.2)+
#geom_point(alpha=0.2)+
geom_smooth(method="gam")+
geom_vline(xintercept=0,size=1.5)+
geom_hline(yintercept=0.5,size=1.2,linetype="dashed")+
geom_vline(xintercept=300,linetype="dotted")+
ylim(0,1)+
xlab("Time (normalized to target word onset) in ms")+
ylab("Proportion Target Looking")
looking_timesggsave(here("figures","prop_looking_across_time.png"),looking_times,width=9,height=6,bg = "white")Age-related effects
summarize_subjects <- summarized_data(usable_trials_summarized |>
left_join(administrations), "administration_id", "corrected_target_looking", "age")
ggplot(summarize_subjects, aes(x = age, y = mean_value)) +
geom_hline(yintercept = 0.0, linetype = "dashed") +
geom_point(size = 4, alpha = 0.1,
position = position_jitter(width = 0.1, seed = 123)) +
geom_smooth(method="lm") +
scale_x_continuous(breaks = seq(5, 75, by = 10)) +
labs(x = "Age in months", y = "Corrected prop. of target looking by age") +
ggtitle("Corrected proportion of target looking by age") +
theme_minimal() +
ggpubr::stat_cor(method = "pearson")#Loading CLIP similarities
clip_similarities <- read.csv(here("data", "similarities-clip_data.csv")) |>
mutate(text_similarity = ifelse(grepl("/", text1), NA, text_similarity))
usable_trials_summarized_with_sims <- usable_trials_summarized |>
left_join(downloaded_stimuli_trials |> select(trial_type_id, unique_pair, local_stimulus_path_target, local_stimulus_path_distractor, image_description_target, image_description_distractor, target_id, distractor_id), by=c("trial_type_id")) |>
left_join(clip_similarities |> select(-trial_type_id, -target_id, -distractor_id), by = c("unique_pair")) |>
left_join(administrations |> select("administration_id", "age") )|>
group_by(unique_pair) |>
# similarity scores are only generated for a single trial type ID so here we're mapping those scores to other trials that have the same image pair
mutate(
across(
ends_with("similarity"),
~ifelse(is.na(.), first(.[!is.na(.)]), .)
)
) |>
ungroup()
cols_to_update <- colnames(clip_similarities|> select(-trial_type_id, -target_id, -distractor_id))
usable_trials_imagepair_sims <- usable_trials_summarized_with_sims |>
mutate(unique_imagepair = pmap_chr(list(local_stimulus_path_target, local_stimulus_path_distractor), ~ paste(sort(c(.x, .y)), collapse = "_")),
unique_textpair = pmap_chr(list(image_description_target, image_description_distractor), ~ paste(sort(c(.x, .y)), collapse = "_"))) |>
group_by(unique_imagepair, unique_textpair) |>
mutate(
across(
all_of(cols_to_update),
~ifelse(is.na(.), first(.[!is.na(.)]), .)
)) |>
ungroup() Filtering out trials which on the Peekbank-side allegedly had images but did not in actuality
unusable_trials <- usable_trials_imagepair_sims |> filter(is.na(image_similarity)) |> distinct(unique_pair, .keep_all=TRUE) |> select(dataset_name, unique_pair, target_id, distractor_id, image1, image2)also filtering out non-vanilla trials and datasets
usable_trials_summarized_with_sims <- usable_trials_imagepair_sims |> filter(!is.na(image_similarity))
non_vanilla_trials <- usable_trials_summarized_with_sims |> filter(vanilla_trial == 0)
usable_trials_summarized_with_sims <- usable_trials_summarized_with_sims |> filter(vanilla_trial == 1)
non_vanilla_datasets <- non_vanilla_trials |> distinct(dataset_name) |> anti_join(usable_trials_summarized_with_sims |> distinct(dataset_name))
non_vanilla_datasets# A tibble: 2 × 1
dataset_name
<chr>
1 baumgartner_2014
2 luchkina_waxman_2024
Exporting trial-level data
write.csv(usable_trials_summarized_with_sims, here("data/usable_trials_with_similarities.csv"))Printing data counts
# trial types for which all of the trials were marked as unusable
clip_data_summarized <- summarize_similarity_data_collapsed(usable_trials_summarized_with_sims, extra_fields = c("dataset_name", "vanilla_trial"))
non_usable_unique_pairs <- clip_similarities |>
anti_join(usable_trials_summarized |>
left_join(downloaded_stimuli_trials |> select(trial_type_id, unique_pair, local_stimulus_path_target, local_stimulus_path_distractor), by=c("trial_type_id")), by = c("unique_pair")) |>
distinct(unique_pair, .keep_all = TRUE)
trial_types <- trial_types |>
mutate(unique_pair = pmap_chr(list(target_id, distractor_id), ~ paste(sort(c(.x, .y)), collapse = "_")))
print(paste("Number of datasets:", count(usable_trials_summarized_with_sims |> distinct(dataset_name))))[1] "Number of datasets: 24"
print(paste("Total subjects:", count(usable_trials_summarized_with_sims|> left_join(administrations |> select(administration_id, subject_id)) |> distinct(subject_id))))[1] "Total subjects: 1472"
print(paste("Total trials:", count(usable_trials_summarized_with_sims)))[1] "Total trials: 23064"
cat("\nUnique label-target-distractor pairs\n")
Unique label-target-distractor pairs
print(paste("Number of unique label-target-distractor pairs:", count(usable_trials_summarized_with_sims |> distinct(image_description_target, image_description_distractor, dataset_name))))[1] "Number of unique label-target-distractor pairs: 499"
print(paste("Number of unique label-target-distractor pairs with unique image-target distractor pairs:", count(usable_trials_summarized_with_sims |> distinct(image_description_target, image_description_distractor, image1, image2, dataset_name, full_phrase_language))))[1] "Number of unique label-target-distractor pairs with unique image-target distractor pairs: 854"
print(paste("Number of unique label-target-distractor ids:", count(usable_trials_summarized_with_sims |> distinct(target_id, distractor_id, dataset_name))))[1] "Number of unique label-target-distractor ids: 887"
print(paste("Total label-target-distractor pairs in Peekbank:", count(trial_types |> distinct(target_id, distractor_id))))[1] "Total label-target-distractor pairs in Peekbank: 2277"
cat("\nUnique stimulus pairs\n")
Unique stimulus pairs
print(paste("Number of unique image and text pairs:", count(usable_trials_summarized_with_sims |> distinct(unique_textpair, unique_imagepair, full_phrase_language))))[1] "Number of unique image and text pairs: 482"
print(paste("Number of unique stimulus id pairs:", count(usable_trials_summarized_with_sims |> distinct(unique_pair))))[1] "Number of unique stimulus id pairs: 522"
print(paste("Number of unique image pairs with no usable trials:", count(non_usable_unique_pairs)))[1] "Number of unique image pairs with no usable trials: 149"
print(paste("Total unique stimulus id pairs in Peekbank:", count(trial_types |> distinct(unique_pair))))[1] "Total unique stimulus id pairs in Peekbank: 1518"
cat("\nTrial types\n")
Trial types
print(paste("Number of trial types:", count(usable_trials_summarized_with_sims |> distinct(trial_type_id))))[1] "Number of trial types: 1847"
print(paste("Total trial types in Peekbank:", count(trial_types |> distinct(trial_type_id))))[1] "Total trial types in Peekbank: 6918"
Descriptive plots
label-target-distractor combinations
trial_counts <- clip_data_summarized |>
#filter(N > 10) |>
distinct(image_description_target, image_description_distractor, dataset_name) |>
summarize(count = n(), .by="dataset_name") |>
arrange(count)
# Create stacked bar chart
ggplot(trial_counts, aes(x = reorder(dataset_name, -count), y = count)) +
geom_col(fill="skyblue") +
labs(title = "Number of unique label-target-distractor combinations",
x = "Dataset Name",
y = "Count",
fill = "Trial Type") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))raw trial counts
# Function to calculate Pearson's correlation per epoch
round_to_nearest <- function(x, round_to=3) {
round(x / round_to) * round_to
}
# rounding each participant to the closest 5
age_based_trials <- usable_trials_summarized_with_sims |> mutate(
rounded_age = round_to_nearest(age, round_to=5)
)
clip_age_stacked_bar <- age_based_trials |>
#filter(N > 10) |>
group_by(rounded_age, dataset_name) |>
summarize(trial_count = n(), .groups = "drop")
ggplot(clip_age_stacked_bar, aes(x = factor(rounded_age), y = trial_count, fill = dataset_name)) +
geom_bar(stat = "identity") +
labs(
title = "Number of Trials by Dataset and Age",
x = "Age in months (rounded to nearest 5 month)",
y = "Number of Trials",
fill = "Dataset"
) +
theme_minimal()Raw participant counts
Per dataset
trials_with_subject_info <- usable_trials_summarized_with_sims |> left_join(administrations |> select(administration_id, subject_id)) |>
left_join(subjects |> select(subject_id, subject_aux_data))
subject_counts <- trials_with_subject_info |>
mutate(has_cdi = grepl("cdi", subject_aux_data)) |>
summarize(trial_count = n(), .by=c(has_cdi, subject_id, dataset_name))
dataset_subject_counts <- subject_counts |>
summarize(subject_count = n(), average_trial_count = mean(trial_count), cdi_count = sum(has_cdi == TRUE), non_cdi_count = (sum(has_cdi == FALSE)), .by=dataset_name) |>
pivot_longer(cols = c(cdi_count, non_cdi_count),
names_to = "cdi",
values_to = "count")
# Create stacked bar chart
ggplot(dataset_subject_counts, aes(x = reorder(paste0(dataset_name,"(subj trials M=", round(average_trial_count, 1), ")"), -count), y = count)) +
geom_bar(stat = "identity", position = "stack", fill="skyblue") +
labs(title = "Number of participants",
x = "Dataset Name",
y = "Count") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 90, hjust = 1))