── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1.9000 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
here() starts at /Users/visuallearninglab/Documents/visvocab
library(dotenv)
library(readr)
library(rlang)
Attaching package: 'rlang'
The following objects are masked from 'package:purrr':
%@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
flatten_raw, invoke, splice
METADATA_PATH = here("data", "metadata")
DATA_TO_ANALYZE_PATH = here("data", "data_to_analyze")

# Importing trial and participant information from Lookit
trial_timing_info = read.csv(file.path(METADATA_PATH, "lookit_trial_timing_info.csv"))
stimuli_metadata = read.csv(here("stimuli", "lookit", "pilot_stimuli.csv"))
similarities <- read.csv(here("data", "metadata", "similarities.csv"))
# Function to summarize whether a trial is usable based on whether the subject is looking at the screen for at least 50% of the critical window
summarize_subj_usable_trials <- function(data, critical_window, suffix, additional_fields = NULL) {
  additional_fields <- additional_fields %||% list()
  data %>%
    filter(time_normalized_corrected >= critical_window[1] & time_normalized_corrected <= critical_window[2]) %>%
    group_by(SubjectInfo.subjID, Trials.trialID, Trials.ordinal, Trials.trialType) %>%
    summarize(
      length = n(),
      usable_frames = sum(not_looking_away, na.rm = TRUE),
      percent_usable = usable_frames / length,
      usable = ifelse(percent_usable >= 0.5, 1, 0), # usable if at least 50% looking
      mean_target_looking = mean(accuracy, na.rm = TRUE),
      !!!additional_fields,
    ) %>%
    rename_with(~ paste0(., "_", suffix), -c(SubjectInfo.subjID, Trials.trialID, Trials.ordinal, Trials.trialType))
}

# Function to compute whether a trial is usable based on whether both the critical window and the baseline window are usable
compute_usable_trial <- function(baseline_col, critical_col) {
  case_when(
    is.na(baseline_col) ~ 0,
    is.na(critical_col) ~ 0,
    baseline_col == 1 & critical_col == 1 ~ 1,
    TRUE ~ 0
  )
}

# Calculate mean, standard deviation, standard error, and confidence intervals for data grouped across two variables
summarized_data <- function(data, x_var, y_var, group_var) {
  return(
    data |>
      group_by(across(all_of(c(x_var, group_var)))) |>
      summarize(
        #across(everything(), ~ if (n_distinct(.) == 1) first(.) else NA),
        mean_value = mean(.data[[y_var]], na.rm = TRUE),
        sd_value = sd(.data[[y_var]], na.rm = TRUE),
        N = n(),
        se = sd_value / sqrt(n()),
        ci = qt(0.975, N - 1) * sd_value / sqrt(N),
        lower_ci = mean_value - ci,
        upper_ci = mean_value + ci,
        .groups = 'drop'
      ) |>
      select(where(~ !all(is.na(.))))
  )
}

# make aesthetics-aware size scale, also use better scaling
scale_size_c <- function(name = waiver(), breaks = waiver(), labels = waiver(),
                         limits = NULL, range = c(1, 6), trans = "identity",
                         guide = "legend", aesthetics = "size") {
  continuous_scale(aesthetics, "area", scales::rescale_pal(range),
                   name = name, breaks = breaks, labels = labels,
                   limits = limits, trans = trans, guide = guide)
}

# summarize target looking by input condition
summarize_data <- function(data, summary_field) {
  return(
    data |>
      summarize(
        N = n(),
        #mean_age = mean(age),
        #mean_age_mo = mean(age_mo),
        average_corrected_target_looking = mean(corrected_target_looking, na.rm = TRUE),
        se = sd(corrected_target_looking, na.rm = T) / sqrt(N),
        ci = qt(0.975, N - 1) * sd(corrected_target_looking, na.rm = T) / sqrt(N),
        lower_ci = average_corrected_target_looking - ci,
        upper_ci = average_corrected_target_looking + ci,
        lower_se = average_corrected_target_looking - se,
        upper_se = average_corrected_target_looking + se,
        average_critical_window_looking = mean(mean_target_looking_critical_window, na.rm = TRUE),
        critical_window_ci = qt(0.975, N - 1) * sd(mean_target_looking_critical_window, na.rm = T) / sqrt(N),
        critical_window_lower_ci = average_critical_window_looking - critical_window_ci,
        critical_window_upper_ci = average_critical_window_looking + critical_window_ci
      ) |>
      rename_with(~ paste0(., "_", suffix), -c(SubjectInfo.subjID, Trials.trialID, Trials.ordinal, Trials.trialType))
  )
}

# adapted from peekbank/peekds
# https://github.com/langcog/peekds/blob/master/R/generate_aoi.R
resample_aoi_trial <- function(df_trial, sample_duration = 1000/30) {
  #print(paste0("Subject Number: ", unique(df_trial$sub_num), "; Trial Number: ", unique(df_trial$Trials.ordinal)))
  t_origin <- df_trial$t_norm
  data_origin <- df_trial$aoi
  # create the new timestamps for resampling
  t_start <- min(t_origin) - (min(t_origin) %% sample_duration)
  t_resampled <- seq(from = t_start, to = max(t_origin), by = sample_duration)
  # exchange string values for integers for resampling
  # this step is critical for interpolating missing vals quickly and correctly
  aoi_num <- data_origin %>% dplyr::recode(target = 1, distractor = 2, other = 3, missing = 4)
  # start resampling with approx
  aoi_resampled <- stats::approx(
    x = t_origin, y = aoi_num, xout = t_resampled,
    method = "constant", rule = 2, ties = "ordered"
  )$y
  aoi_resampled_recoded <- aoi_resampled %>%
    dplyr::recode("1" = "target", "2" = "distractor", "3" = "other", "4" = "missing")
  # adding back the columns to match schema
  dplyr::tibble(
    t_norm = t_resampled,
    aoi = aoi_resampled_recoded,
    trial_id = df_trial$trial_id[1],
    administration_id = df_trial$administration_id[1]
  )
}

resample_times <- function(df_table, sample_duration) {
  # first check if this data frame has all the correct columns required for re-sampling
  required_columns <- c("trial_id", "administration_id", "t_norm", "aoi")
  # re-zero and normalize times first
  # this is mandatory, comes from our decision that not linking resampling and centering causes a lot of problems
  if (!all(required_columns %in% colnames(df_table))) {
    stop(.msg("Resample times function requires the following columns to be present in the dataframe: {paste(required_columns, collapse = ', ')}. Times should be re-zeroed and normalized first before being resampled!"))
  }
  # main resampling call
  # start resampling process by iterating through every trial within every administration
  df_out <- df_table %>%
    dplyr::mutate(admin_trial_id = paste(.data$administration_id, .data$trial_id, sep = "_")) %>%
    split(.$admin_trial_id) %>%
    purrr::map_df(resample_aoi_trial, sample_duration = sample_duration) %>%
    dplyr::arrange(.data$administration_id, .data$trial_id)
  return(df_out)
}
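The summarized_data() helper is not called within this section; as a purely illustrative usage sketch (the column names are assumptions drawn from the cleaned, resampled data constructed further below), it would be applied along these lines:

# hypothetical usage sketch of summarized_data(): mean target looking per time bin and trial type
timecourse_summary <- summarized_data(
  looking_time_resampled_clean,         # assumed input: cleaned, resampled looking data (built below)
  x_var = "time_normalized_corrected",  # time bin
  y_var = "accuracy_transformed",       # 1 = target, 0 = distractor
  group_var = "Trials.trialType"        # grouping variable (e.g., trial type)
)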
Importing looking time data after initial preprocessing of the iCatcher+ data in Python.
all_looking_times <- trial_timing_info |>
  group_by(SubjectInfo.subjID) |>
  # Importing looking time data from iCatcher+
  mutate(timestamp_data = map(file.path(DATA_TO_ANALYZE_PATH, paste("1", "processed", "icatcher.csv", sep = "_")), read_csv, show_col_types = FALSE)) |>
  unnest(timestamp_data) |>
  ungroup() |>
  # Excluding duplicate column
  filter(Trials.trialID == trial_id) |>
  select(-trial_id) |>
  filter(SubjectInfo.subjID == child_id) |>
  select(-child_id) |>
  rowwise() |>
  mutate(
    Trials.imagePair = gsub("(easy-|hard-|-distractor)", "", Trials.trialID),
    Trials.distractorImage = setdiff(strsplit(Trials.trialID, "-")[[1]], c("easy", "hard", "distractor", Trials.targetImage))[1],
    # Normalizing all times to be 0 ms when the target word starts. Target words start at the same time across carrier phrase lengths and conditions; subtracting 225 ms to account for the use of the article 'the'
    time_normalized_corrected = time_ms - Trials.audio_lag_vs_video_lag - 225 - Trials.target_onset * 1000,
    # 'left' is coded as 1, 'right' as 2; we need to switch those since the video coming in from iCatcher+ is mirrored
    accuracy = ifelse((lookType_coded == 2 & Trials.targetImage == Trials.leftImage) | (lookType_coded == 1 & Trials.targetImage == Trials.rightImage), 1, 0),
    confident_frame = confidence > 0.6,
    not_looking_away = lookType_coded == 1 | lookType_coded == 2,
    lookType = case_when(lookType_coded == 1 ~ "right", lookType_coded == 2 ~ "left", lookType_coded == 0 ~ "away"),
    accuracy = ifelse(not_looking_away, accuracy, NA)
  )
Qualitatively looking at the range of lag across participants
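The chunk that inspects lag is not reproduced here; a minimal, hypothetical sketch of such a check, using the Trials.audio_lag_vs_video_lag column already present in all_looking_times, might look like:

# hypothetical sketch: distribution of audio-vs-video lag, one panel per participant
all_looking_times %>%
  distinct(SubjectInfo.subjID, Trials.trialID, Trials.audio_lag_vs_video_lag) %>%
  ggplot(aes(x = Trials.audio_lag_vs_video_lag)) +
  geom_histogram() +
  facet_wrap(~ SubjectInfo.subjID)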
In order for a trial to be included, participants must contribute at least 50% looking during the windows of interest when computing baseline-corrected proportion target looking: the critical window (300 ms - 3500 ms relative to target word onset) and the baseline window (-2000 ms - 0 ms relative to target word onset). We also create columns to track exclusions for an alternate critical window (300 ms - 1800 ms).
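The chunk that builds these per-window summaries is not shown in this section; a sketch of how the summarize_subj_usable_trials() helper defined above would presumably be applied follows. The "critical_window" and "baseline_window" suffixes match names used elsewhere in the pipeline, the "short_window" suffix and the output object names are assumptions, and the real baseline-window summary presumably also carries additional per-trial fields (e.g., the side-bias flags used later).

# assumed application of the usability helper to the three windows of interest (times in ms relative to target word onset)
summarize_subj_usable_trials_critical_window <- summarize_subj_usable_trials(all_looking_times, c(300, 3500), "critical_window")
summarize_subj_usable_trials_baseline_window <- summarize_subj_usable_trials(all_looking_times, c(-2000, 0), "baseline_window")
summarize_subj_usable_trials_short_window    <- summarize_subj_usable_trials(all_looking_times, c(300, 1800), "short_window")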
# create a list to track any overall subject-level exclusions (prior to excluding trials)
subject_exclusions <- c()
Age-based exclusions
age_excluded_infant_sessions <- all_looking_times %>%
  distinct(SubjectInfo.subjID, SubjectInfo.testAge) %>%
  # filter out children outside of the age bounds (younger than 14 months or older than 24 months)
  filter(!(424 <= SubjectInfo.testAge & SubjectInfo.testAge <= 761)) %>%
  mutate(age_exclusion = 1)

# reintegrate with data because this exclusion decision has implications for data-contribution-based exclusions
all_looking_times <- all_looking_times %>%
  left_join(age_excluded_infant_sessions) %>%
  mutate(age_exclusion = case_when(is.na(age_exclusion) ~ 0, TRUE ~ age_exclusion))

summarize_subj_trials <- summarize_subj_trials %>%
  left_join(age_excluded_infant_sessions) %>%
  mutate(age_exclusion = case_when(is.na(age_exclusion) ~ 0, TRUE ~ age_exclusion))
English Language Exposure
Infants who had no exposure to English were excluded.
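No code chunk accompanies this criterion in this section; a hypothetical sketch, following the same pattern as the other subject-level exclusions, is shown below. The SubjectInfo.englishExposure column name and its coding are assumptions for illustration only.

# hypothetical sketch: collect infants reported to have no exposure to English
no_english_infants <- all_looking_times %>%
  distinct(SubjectInfo.subjID, SubjectInfo.englishExposure) %>%   # assumed column name
  filter(SubjectInfo.englishExposure == 0) %>%                    # assumed coding: 0 = no English exposure
  pull(SubjectInfo.subjID)
subject_exclusions <- c(subject_exclusions, no_english_infants)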
Infants born at 35 weeks gestation or earlier (i.e., 28 or more days before their due date) were excluded from the final sample.
# infants born at 36 weeks or later are not considered preterm
# (equivalent to less than 28 days before the due date)
not_pre_term <- c("36 weeks", "37 weeks", "38 weeks", "39 weeks", "40 or more weeks")

preterm_infants <- all_looking_times %>%
  distinct(SubjectInfo.subjID, SubjectInfo.age_at_birth) %>%
  # also ignore infants where we do not have this information
  filter(!(SubjectInfo.age_at_birth %in% c(not_pre_term, "Not sure or prefer not to answer"))) %>%
  pull(SubjectInfo.subjID)

subject_exclusions <- c(subject_exclusions, preterm_infants)
Developmental Concerns
Any infants noted as having a developmental concern were excluded.
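As with English exposure, the corresponding chunk is not shown here; a hypothetical sketch follows, in which the SubjectInfo.developmentalConcerns column name and its coding are assumptions for illustration only.

# hypothetical sketch: collect infants whose parents reported a developmental concern
developmental_concern_infants <- all_looking_times %>%
  distinct(SubjectInfo.subjID, SubjectInfo.developmentalConcerns) %>%   # assumed column name
  filter(SubjectInfo.developmentalConcerns == "yes") %>%                # assumed coding
  pull(SubjectInfo.subjID)
subject_exclusions <- c(subject_exclusions, developmental_concern_infants)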
We analyzed whether infants preferentially fixated a single side of the screen prior to the onset of the target word (baseline window: -2000 ms to 0 ms relative to target word onset). Infants who fixated one side of the screen for more than 80% of the baseline window on 80% or more of trials in a given testing session were excluded from further analyses (including data from their other test session).
side_bias_evaluation <- summarize_subj_usable_trials_baseline_window %>%
  group_by(SubjectInfo.subjID) %>%
  summarize(
    total_trials = n(),
    side_bias_right = sum(side_bias_right),
    side_bias_left = sum(side_bias_left),
    percent_side_bias_right = side_bias_right / total_trials,
    percent_side_bias_left = side_bias_left / total_trials
  )

# Any infants with a persistent side bias to the left or right?
side_bias_infants <- side_bias_evaluation %>%
  filter(percent_side_bias_right >= 0.8 | percent_side_bias_left >= 0.8) %>%
  pull(SubjectInfo.subjID)

subject_exclusions <- c(subject_exclusions, side_bias_infants)
No infants with a persistent side bias were identified.
Data contribution
Subject-level data exclusions are handled below, after handling all trial-level exclusions.
Technical issues
We assessed technical concerns at the trial level and then excluded participants with insufficient usable trials; no participant-level technical exclusions were made beyond that.
Trial-level exclusions
Technical error
We excluded trials for which experiment logs indicated a technical issue (unusual recording or audio times), trials that parents paused, and trials with an average frame rate below 15 Hz (rounded up).
# trial exclusions for unusual timing issues and/or pauses
technical_trial_issues <- all_looking_times |>
  group_by(SubjectInfo.subjID, Trials.trialID) |>
  mutate(
    video_capture_time = (max(time_ms) - min(time_ms)) / 1000,
    framerate = mean(1000 / abs(diff(time_ms))),
    # trials excluded if the average framerate was below 15 Hz (rounded up)
    framerate_issue = framerate < 14.5,
    # mirror exclusion assignment
    technical_issue_exclusion_reason = case_when(
      #!is.na(pauseStudy) ~ "study paused",
      video_capture_time > 10 ~ "unusually long trial",
      video_capture_time < 6 ~ "unusually short trial",
      framerate_issue ~ "framerate issue",
      #audio_time_s > 10 ~ "unusual audio time"
    ),
    exclude_technical_issue = !is.na(technical_issue_exclusion_reason)
  ) |>
  distinct(SubjectInfo.subjID, Trials.trialID, .keep_all = TRUE) |>
  select(SubjectInfo.subjID, Trials.trialID, technical_issue_exclusion_reason, exclude_technical_issue, video_capture_time, framerate)
# integrate with central tracking file
summarize_subj_trials <- summarize_subj_trials %>%
  left_join(technical_trial_issues)
Auditory interference/ parent interference
This is handled during prescreening/ coding.
Codability of the video
This is handled during prescreening/ coding.
Insufficient looking data
Trials are excluded if the child was fixating either object for less than 50% of the critical window (300 ms to 3500 ms) or less than 50% of the baseline window (-2000 ms to 0 ms).
This is handled in the section aggregating and summarizing trial-level information above.
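For reference, the per-trial usable_window flag is presumably built by combining the two window-level usability columns with compute_usable_trial() (defined above); a hypothetical sketch, in which the suffixed column names are assumptions:

# hypothetical sketch: combine baseline- and critical-window usability into a single per-trial flag
summarize_subj_trials <- summarize_subj_trials %>%
  mutate(usable_window = compute_usable_trial(usable_baseline_window, usable_critical_window))  # assumed column names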
# participants to keep
print(paste("Excluded participants:", sum(participant_total_usable_trials$exclude_participant_insufficient_data == 0, na.rm = T)))
[1] "Excluded participants: 4"
# join with main data frame
all_looking_times <- all_looking_times %>% left_join(participant_total_usable_trials)
all_looking_times <- all_looking_times %>% left_join(summarize_usable_trials)
all_looking_times <- all_looking_times %>% left_join(summarize_subj_trials)
In order to plot participants’ average proportion of looking to the target across the trial, we resample time to a constant rate. This is necessary when plotting the timecourses given the variable sampling rate in the data (otherwise the mean observations “jump around” because a different composition of data contributes at each time point).
target_ms_per_frame <- 1000/30

# resample times
looking_times_resampled <- all_looking_times %>%
  # rename and transform variables for resampling function
  mutate(
    aoi = case_when(
      accuracy == 1 ~ "target",
      accuracy == 0 ~ "distractor",
      is.na(accuracy) ~ "other"
    ),
    trial_id = Trials.trialID,
    sub_num = SubjectInfo.subjID
  ) %>%
  mutate(
    t_norm = time_normalized_corrected,
    administration_id = paste(SubjectInfo.subjID, Trials.trialID, Trials.order, sep = "_")
  ) %>%
  resample_times(sample_duration = target_ms_per_frame) # time resampling is set to 30 Hz
# clean up resampled data
looking_time_resampled_clean <- looking_times_resampled %>%
  mutate(
    accuracy_transformed = case_when(
      aoi == "target" ~ 1,
      aoi == "distractor" ~ 0,
      aoi == "other" ~ NA
    )
  ) %>%
  separate(administration_id, into = c("SubjectInfo.subjID", "Trials.trialID", "Trials.ordinal"), sep = "_", remove = F) %>%
  rename(time_normalized_corrected = t_norm) %>%
  mutate(time_normalized_corrected = round(time_normalized_corrected, 0)) %>%
  select(-administration_id, -aoi) %>%
  select(SubjectInfo.subjID, Trials.trialID, time_normalized_corrected, accuracy_transformed)

# check number of rows to make sure resampling rate is correct
print(paste("Number of rows after resampling:", nrow(looking_time_resampled_clean)))
[1] "Number of rows after resampling: 33587"
# extract key information from full dataset
looking_times_info <- all_looking_times %>%
  distinct(SubjectInfo.subjID, Trials.trialID, exclude_participant, exclude_participant_insufficient_data, SubjectInfo.testAge, trial_exclusion, trial_exclusion_reason, usable_window, Trials.targetImage, Trials.distractorImage, Trials.trialType, Trials.imagePair)

# join with resampled data
looking_time_resampled_clean <- looking_time_resampled_clean %>%
  left_join(looking_times_info)
Joining with `by = join_by(SubjectInfo.subjID, Trials.trialID)`
Write data
# write a dataset with exclusions processed
write_csv(all_looking_times, here("data", "processed_data", "looking_times_with_exclusion_info.csv"))
write_csv(looking_time_resampled_clean, here("data", "processed_data", "looking_times_resampled_clean.csv"))
saveRDS(all_looking_times, here("data", "processed_data", "looking_times_with_exclusion_info.rds"))

# write out usable trial summary
write_csv(participant_total_usable_trials, here("data", "processed_data", "participant_usable_trial_summary.csv"))
write_csv(summarize_subj_trials, here("data", "processed_data", "trial_summary_data.csv"))
write_csv(trial_metadata, here("data", "metadata", "trial_metadata.csv"))