1 Document Overview

In this document, we:

  1. Create the infant-level data CSV by combining the infant-level data from the individual datasets into a single file.

  2. Complete the meta-analysis data (including means and SDs for each type of trial) using the infant-level data CSV.

  3. Complete the mega-analysis data by adding moderators to the infant-level data CSV.

2 Imports

options(scipen = 1, digits = 3)
library(pacman)

pacman::p_load(plyr,
               tidyverse,
               dplyr,
               stringr,
               metafor,
               lme4)

sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Monterey 12.6.7
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] lme4_1.1-34         metafor_4.4-0       numDeriv_2016.8-1.1
##  [4] metadat_1.2-0       Matrix_1.6-1.1      lubridate_1.9.3    
##  [7] forcats_1.0.0       stringr_1.5.0       dplyr_1.1.3        
## [10] purrr_1.0.2         readr_2.1.4         tidyr_1.3.0        
## [13] tibble_3.2.1        ggplot2_3.4.4       tidyverse_2.0.0    
## [16] plyr_1.8.9          pacman_0.5.1       
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.7        utf8_1.2.4        generics_0.1.3    stringi_1.7.12   
##  [5] lattice_0.21-9    hms_1.1.3         digest_0.6.33     magrittr_2.0.3   
##  [9] evaluate_0.22     grid_4.3.2        timechange_0.2.0  fastmap_1.1.1    
## [13] jsonlite_1.8.7    fansi_1.0.5       scales_1.2.1      jquerylib_0.1.4  
## [17] cli_3.6.1         rlang_1.1.1       splines_4.3.2     munsell_0.5.0    
## [21] withr_2.5.2       cachem_1.0.8      yaml_2.3.7        tools_4.3.2      
## [25] tzdb_0.4.0        nloptr_2.0.3      minqa_1.2.6       colorspace_2.1-0 
## [29] mathjaxr_1.6-0    boot_1.3-28.1     vctrs_0.6.4       R6_2.5.1         
## [33] lifecycle_1.0.3   MASS_7.3-60       pkgconfig_2.0.3   pillar_1.9.0     
## [37] bslib_0.5.1       gtable_0.3.4      glue_1.6.2        Rcpp_1.0.11      
## [41] xfun_0.40         tidyselect_1.2.0  rstudioapi_0.15.0 knitr_1.45       
## [45] nlme_3.1-163      htmltools_0.5.6.1 rmarkdown_2.25    compiler_4.3.2

3 Loading all datasets and combining data

# Read the per-study infant-level CSVs from outputs_(3)/. One data frame per
# study is kept in the workspace under the study's name.
brandone_2009 <- read.csv("outputs_(3)/brandone_2009.csv")
choi_2018 <- read.csv("outputs_(3)/choi_2018.csv")
chuey_2021 <- read.csv("outputs_(3)/chuey_2021.csv")
gerson_2014a <- read.csv("outputs_(3)/gerson_2014a.csv")
gerson_2014b <- read.csv("outputs_(3)/gerson_2014b.csv")
hespos_2009 <- read.csv("outputs_(3)/hespos_2009.csv")
liu_2017a <- read.csv("outputs_(3)/liu_2017a.csv")
liu_2017b <- read.csv("outputs_(3)/liu_2017b.csv")
liu_2019 <- read.csv("outputs_(3)/liu_2019.csv")
liu_2022 <- read.csv("outputs_(3)/liu_2022.csv")
liu_unpublisheda <- read.csv("outputs_(3)/liu_unpublisheda.csv")
liu_unpublishedb <- read.csv("outputs_(3)/liu_unpublishedb.csv")
liu_unpublishedc <- read.csv("outputs_(3)/liu_unpublishedc.csv")
liu_unpublishedd <- read.csv("outputs_(3)/liu_unpublishedd.csv")
luo_2005a <- read.csv("outputs_(3)/luo_2005a.csv")
luo_2005b <- read.csv("outputs_(3)/luo_2005b.csv")
luo_2009a <- read.csv("outputs_(3)/luo_2009a.csv")
luo_2009b <- read.csv("outputs_(3)/luo_2009b.csv")
luo_2010 <- read.csv("outputs_(3)/luo_2010.csv")
luo_2011 <- read.csv("outputs_(3)/luo_2011.csv")
martin_2017 <- read.csv("outputs_(3)/martin_2017.csv")
powell_unpublished <- read.csv("outputs_(3)/powell_unpublished.csv")
# The file name uses a hyphen; the R object name uses an underscore.
sanal_hayes_2022 <- read.csv("outputs_(3)/sanal-hayes_2022.csv")
skerry_2013 <- read.csv("outputs_(3)/skerry_2013.csv")
stojnic_2023 <- read.csv("outputs_(3)/stojnic_2023.csv")
thoermer_2013 <- read.csv("outputs_(3)/thoermer_2013.csv")
woo_2021 <- read.csv("outputs_(3)/woo_2021.csv")

# Stack all studies; rbind.fill() pads columns a study lacks with NA.
all_data <- rbind.fill(
  brandone_2009, choi_2018, chuey_2021, gerson_2014a, gerson_2014b,
  hespos_2009, liu_2017a, liu_2017b, liu_2019, liu_2022, liu_unpublisheda,
  liu_unpublishedb, liu_unpublishedc, liu_unpublishedd, luo_2005a, luo_2005b,
  luo_2009a, luo_2009b, luo_2010, luo_2011, martin_2017, powell_unpublished,
  sanal_hayes_2022, skerry_2013, stojnic_2023, thoermer_2013, woo_2021
)

# Convert every column to character and strip all spaces, then build a
# per-infant identifier.
#
# This is done column by column with as.character() rather than with
# apply(all_data, 2, ...): apply() first calls as.matrix() on the data frame,
# which runs format() on every non-character column. format() honours
# getOption("digits") (set to 3 at the top of this document), so numeric
# values would be silently rounded before being turned into text.
# as.character() keeps up to 15 significant digits instead.
all_data <- all_data %>%
  dplyr::mutate(
    dplyr::across(
      dplyr::everything(),
      ~ str_remove_all(as.character(.x), " ")
    )
  ) %>%
  dplyr::mutate(specific = paste(paper, expt_num, expt_cond, subj, sep = "."))

4 Get last training trial

# Columns that must be numeric for the summaries further down: the training
# trials, the expected/unexpected test trials, and age in days.
numeric_cols <- c(
  paste0("train", 1:14),
  paste0("expected", 1:3),
  paste0("unexpected", 1:3),
  "agedays"
)

# Entries that cannot be parsed as numbers become NA; R reports this with an
# "NAs introduced by coercion" warning for each affected column.
all_data[numeric_cols] <- lapply(
  all_data[numeric_cols],
  function(column) as.numeric(as.character(column))
)

# For each infant (`specific`), find the last training trial that has a
# looking time recorded.
#
# The trial is chosen by its numeric index (the digits after "train"), not by
# the position of the column in all_data, so the result does not depend on the
# order in which rbind.fill() happened to lay out the train# columns.
trainings <- all_data %>%
  dplyr::select(specific, starts_with("train"), -training_type) %>%
  pivot_longer(
    cols = starts_with("train"),
    names_to = "last_train_trial",
    values_to = "last_train_looking"
  ) %>%
  filter(!is.na(last_train_looking)) %>%
  mutate(trial_index = as.integer(str_remove(last_train_trial, "train"))) %>%
  group_by(specific) %>%
  slice_max(trial_index, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  dplyr::select(specific, last_train_trial, last_train_looking)

# Attach the last-training-trial columns to every infant row and build the
# condition-level key (paper.expt_num.expt_cond) used for later joins.
all_data_w_train <- all_data %>%
  left_join(trainings, by = c("specific" = "specific")) %>%
  mutate(paper_expt_info = paste(paper, expt_num, expt_cond, sep = "."))

5 Get Moderators

# Paper-level (condition-level) data. Adds the join key and, where an SD is
# missing but an SE was reported, derives the SD as SE * sqrt(n_1).
# NOTE(review): n_1 is used for all three trial types; presumably the same
# infants contribute to each -- confirm.
moderators <- read.csv('processed_data/meta-analysis_only_paper_data.csv') %>%
  mutate(
    paper_expt_info = paste(study_ID, expt_num, expt_condition, sep = "."),
    SD_1 = ifelse(
      !is.na(SD_1),
      SD_1,
      ifelse(!is.na(SE_1), SE_1 * sqrt(n_1), NA)
    ),
    SD_2 = ifelse(
      !is.na(SD_2),
      SD_2,
      ifelse(!is.na(SE_2), SE_2 * sqrt(n_1), NA)
    ),
    SD_3 = ifelse(
      !is.na(SD_3),
      SD_3,
      ifelse(!is.na(SE_3), SE_3 * sqrt(n_1), NA)
    )
  )

6 Meta-analysis data

6.1 Complete meta-analysis data using raw data for means/sds

# Condition-level means and SDs computed from the infant-level data:
# last training trial, first expected trial, first unexpected trial, and age.
looking_time_per_condition_raw <- all_data_w_train %>%
  group_by(paper_expt_info) %>%
  dplyr::summarize(
    mean_hab = mean(last_train_looking, na.rm = TRUE),
    sd_hab = sd(last_train_looking, na.rm = TRUE),
    mean_expected = mean(expected1, na.rm = TRUE),
    sd_expected = sd(expected1, na.rm = TRUE),
    mean_unexpected = mean(unexpected1, na.rm = TRUE),
    sd_unexpected = sd(unexpected1, na.rm = TRUE),
    mean_age = mean(agedays, na.rm = TRUE),
    sd_age = sd(agedays, na.rm = TRUE)
  )

# One row per moderators row. Values reported in the paper take precedence;
# the raw-data summaries only fill in what the paper data lacks.
raw_data_mean_sd <- moderators %>%
  left_join(
    looking_time_per_condition_raw,
    by = c("paper_expt_info" = "paper_expt_info")
  ) %>%
  mutate(
    x_1 = ifelse(!is.na(x_1), x_1, mean_hab),
    x_2 = ifelse(!is.na(x_2), x_2, mean_expected),
    x_3 = ifelse(!is.na(x_3), x_3, mean_unexpected),
    SD_1 = ifelse(!is.na(SD_1), SD_1, sd_hab),
    SD_2 = ifelse(!is.na(SD_2), SD_2, sd_expected),
    SD_3 = ifelse(!is.na(SD_3), SD_3, sd_unexpected),
    mean_age = ifelse(!is.na(mean_age_1), mean_age_1, mean_age),
    sd_age = ifelse(!is.na(sd_age_1), sd_age_1, sd_age)
  ) %>%
  dplyr::select(
    paper_expt_info,
    x_1, SD_1,
    x_2, SD_2,
    x_3, SD_3,
    mean_age, sd_age
  )

# Copy the completed means/SDs/ages back onto the moderators table, then fill
# any missing SE from the (now completed) SD as SD / sqrt(n_1).
# The column-wise copy relies on raw_data_mean_sd having the same rows, in the
# same order, as moderators (it was built by a left join from moderators).
moderators_complete_means_sd <- moderators %>%
  mutate(
    x_1 = raw_data_mean_sd$x_1,
    x_2 = raw_data_mean_sd$x_2,
    x_3 = raw_data_mean_sd$x_3,
    SD_1 = raw_data_mean_sd$SD_1,
    SD_2 = raw_data_mean_sd$SD_2,
    SD_3 = raw_data_mean_sd$SD_3,
    mean_age_1 = raw_data_mean_sd$mean_age,
    sd_age_1 = raw_data_mean_sd$sd_age
  ) %>%
  mutate(
    SE_1 = ifelse(
      !is.na(SE_1),
      SE_1,
      ifelse(!is.na(SD_1), SD_1 / sqrt(n_1), NA)
    ),
    SE_2 = ifelse(
      !is.na(SE_2),
      SE_2,
      ifelse(!is.na(SD_2), SD_2 / sqrt(n_1), NA)
    ),
    SE_3 = ifelse(
      !is.na(SE_3),
      SE_3,
      ifelse(!is.na(SD_3), SD_3 / sqrt(n_1), NA)
    )
  )

7 Mega-analysis data

7.1 Combine moderators with raw data

# Infant-level data with condition-level moderators attached.
#
# Columns added here:
# - data_contains_all_trials: "yes" when the raw file holds every training
#   trial, so train# is the infant's #-th training trial. Otherwise a
#   "no: ..." description of which trials the train# columns hold, in order.
# - last_train_trial: index of the last training trial with a looking time.
#   Set to NA when data_contains_all_trials is not "yes", because the train#
#   column index is then not the infant's trial number.
# - sex: recoded to "m" / "f", NA when missing.
raw_data_w_moderators <- all_data_w_train %>%
  left_join(
    moderators_complete_means_sd,
    by = c("paper_expt_info" = "paper_expt_info")
  ) %>%
  mutate(
    # expt_num exists in both tables, so the join suffixed it; keep the
    # infant-level copy.
    expt_num = expt_num.x,
    specific_subject_id = specific
  ) %>%
  dplyr::select(
    paper, long_cite, short_cite, doi, expt_num, expt_cond, subj,
    specific_subject_id, sex, agedays, order, exposure_phase,
    starts_with("train"), last_train_trial, last_train_looking,
    starts_with("expected"), starts_with("unexpected"),
    num_traintrials, num_trials, exp_or_control, equal_per_nov, domain,
    principle, PI_group, stim_loop, paper_expt_info
  ) %>%
  mutate(
    data_contains_all_trials = case_when(
      paper %in% c("hespos_2009", "gerson_2014a", "gerson_2014b") ~
        "no: first and last 3 hab trials",
      paper == "thoermer_2013" ~
        "no: first 6 and last 3 hab trials",
      # NOTE(review): the source file and study_ID are spelled
      # "sanal-hayes_2022" elsewhere in this document, so both spellings are
      # accepted here -- confirm which one the `paper` column actually holds.
      paper %in% c("sanal_hayes_2022", "sanal-hayes_2022") ~
        "no: trials 7 through 10",
      paper == "liu_unpublisheda" ~
        "no: only fam6 (last training trial)",
      TRUE ~ "yes"
    ),
    # Missing sex (NA or an empty string) stays missing instead of falling
    # through to "f". Any other value that is not a recognised male code is
    # treated as female, as before.
    sex = case_when(
      is.na(sex) | sex == "" ~ NA_character_,
      sex %in% c("Male", "male", "1", "M", "m") ~ "m",
      TRUE ~ "f"
    )
  ) %>%
  mutate(
    # "train7" -> "7"
    last_train_trial = gsub("train", "", as.character(last_train_trial)),
    last_train_trial = ifelse(
      data_contains_all_trials == "yes",
      last_train_trial,
      NA
    )
  ) %>%
  dplyr::select(-c(training_type, num_traintrials))

8 Gender info

# Proportion of female infants per condition, computed from the infant-level
# data (infants with missing sex are excluded). summarise() yields exactly one
# row per condition and leaves no grouping behind.
gender_proportion <- raw_data_w_moderators %>%
  filter(!is.na(sex)) %>%
  group_by(paper_expt_info) %>%
  dplyr::summarise(
    gender_from_raw = round(sum(sex == "f") / n(), 3),
    .groups = "drop"
  )

# Prefer the proportion computed from raw data; fall back to the value already
# in gender_1 when no raw-data proportion exists for the condition.
meta_analysis_data <- moderators_complete_means_sd %>%
  left_join(gender_proportion, by = c("paper_expt_info" = "paper_expt_info")) %>%
  mutate(gender_1 = ifelse(is.na(gender_from_raw), gender_1, gender_from_raw)) %>%
  dplyr::select(-c(gender_from_raw))

# Gender in infant-level data: rows with sex, order and age all present.
# total_f holds the overall proportion of "f" (same value on every row).
gender_f_raw <- raw_data_w_moderators %>%
  filter(!(is.na(sex) | is.na(order) | is.na(agedays))) %>%
  mutate(total_f = sum(sex == "f") / n()) # 51.2% female

# Gender in condition-level data: estimated number of female infants per
# condition (proportion female * n_1), leaving out sanal-hayes_2022.
num_f_cond <- meta_analysis_data %>%
  filter(study_ID != "sanal-hayes_2022", !is.na(gender_1)) %>%
  mutate(num_f = gender_1 * n_1)

# Overall proportion female across those conditions.
proportion_f_condition_level <- with(num_f_cond, sum(num_f) / sum(n_1)) # 50.3% female

9 Save Datasets

9.1 Write dataset meta-analysis

# Condition-level dataset, written without row names.
write.csv(
  x = meta_analysis_data,
  file = "processed_data/meta-analysis_data.csv",
  row.names = FALSE
)

9.2 Write dataset mega-analysis

# Infant-level dataset with moderators, written without row names.
write.csv(
  x = raw_data_w_moderators,
  file = "processed_data/mega-analysis_data.csv",
  row.names = FALSE
)