In this document, we:
1. Create the infant-level data CSV by combining the infant-level data from the individual datasets into a single file.
2. Complete the meta-analysis data (including means and SDs for each type of trial) using the infant-level data CSV.
3. Complete the mega-analysis data by adding moderators to the infant-level data CSV.
options(scipen = 1, digits = 3)
library(pacman)
pacman::p_load(plyr,
tidyverse,
dplyr,
stringr,
metafor,
lme4)
sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Monterey 12.6.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] lme4_1.1-34 metafor_4.4-0 numDeriv_2016.8-1.1
## [4] metadat_1.2-0 Matrix_1.6-1.1 lubridate_1.9.3
## [7] forcats_1.0.0 stringr_1.5.0 dplyr_1.1.3
## [10] purrr_1.0.2 readr_2.1.4 tidyr_1.3.0
## [13] tibble_3.2.1 ggplot2_3.4.4 tidyverse_2.0.0
## [16] plyr_1.8.9 pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.7 utf8_1.2.4 generics_0.1.3 stringi_1.7.12
## [5] lattice_0.21-9 hms_1.1.3 digest_0.6.33 magrittr_2.0.3
## [9] evaluate_0.22 grid_4.3.2 timechange_0.2.0 fastmap_1.1.1
## [13] jsonlite_1.8.7 fansi_1.0.5 scales_1.2.1 jquerylib_0.1.4
## [17] cli_3.6.1 rlang_1.1.1 splines_4.3.2 munsell_0.5.0
## [21] withr_2.5.2 cachem_1.0.8 yaml_2.3.7 tools_4.3.2
## [25] tzdb_0.4.0 nloptr_2.0.3 minqa_1.2.6 colorspace_2.1-0
## [29] mathjaxr_1.6-0 boot_1.3-28.1 vctrs_0.6.4 R6_2.5.1
## [33] lifecycle_1.0.3 MASS_7.3-60 pkgconfig_2.0.3 pillar_1.9.0
## [37] bslib_0.5.1 gtable_0.3.4 glue_1.6.2 Rcpp_1.0.11
## [41] xfun_0.40 tidyselect_1.2.0 rstudioapi_0.15.0 knitr_1.45
## [45] nlme_3.1-163 htmltools_0.5.6.1 rmarkdown_2.25 compiler_4.3.2
# Load the infant-level CSV exported for each paper and stack them into one table.
# The stems below are the CSV file names inside outputs_(3)/ (note the hyphen in
# "sanal-hayes_2022").
dataset_stems <- c(
  "brandone_2009", "choi_2018", "chuey_2021", "gerson_2014a", "gerson_2014b",
  "hespos_2009", "liu_2017a", "liu_2017b", "liu_2019", "liu_2022",
  "liu_unpublisheda", "liu_unpublishedb", "liu_unpublishedc", "liu_unpublishedd",
  "luo_2005a", "luo_2005b", "luo_2009a", "luo_2009b", "luo_2010", "luo_2011",
  "martin_2017", "powell_unpublished", "sanal-hayes_2022", "skerry_2013",
  "stojnic_2023", "thoermer_2013", "woo_2021"
)
per_paper_data <- lapply(
  paste0("outputs_(3)/", dataset_stems, ".csv"),
  read.csv
)
# R object names cannot contain "-", so each table is exposed under an
# underscore name (e.g. sanal_hayes_2022), exactly as before.
names(per_paper_data) <- gsub("-", "_", dataset_stems, fixed = TRUE)
invisible(list2env(per_paper_data, envir = environment()))
# rbind.fill() stacks the tables and fills columns missing from a dataset with NA.
all_data <- rbind.fill(unname(per_paper_data))
# Remove every space from every value and build a per-infant key (`specific`).
# All columns end up as character, as before; the numeric ones are converted
# back to numeric further down.
# The conversion is done column by column with as.character() instead of
# apply(): apply() first turns the data frame into a matrix, and that step
# passes numeric columns through format(), which pads them with spaces and
# rounds them according to getOption("digits") (set to 3 at the top of this
# script), silently losing precision.
all_data <- all_data %>%
  mutate(across(everything(), ~ str_remove_all(as.character(.x), " "))) %>%
  mutate(specific = paste(paper, expt_num, expt_cond, subj, sep = "."))
# The looking-time and age columns are character at this point; coerce them to
# numeric. Entries that are not numbers become NA (R reports this with
# "NAs introduced by coercion" warnings).
numeric_columns <- c(
  paste0("train", 1:14),
  paste0("expected", 1:3),
  paste0("unexpected", 1:3),
  "agedays"
)
all_data[numeric_columns] <- lapply(
  all_data[numeric_columns],
  function(values) as.numeric(as.character(values))
)
# For every infant (keyed by `specific`), keep the last training-trial column
# that holds a non-missing looking time, together with that looking time.
trainings <- all_data %>%
  dplyr::select(specific, starts_with("train"), -training_type) %>%
  pivot_longer(
    cols = starts_with("train"),
    names_to = "last_train_trial",
    values_to = "last_train_looking"
  ) %>%
  filter(!is.na(last_train_looking)) %>%
  group_by(specific) %>%
  summarise(across(everything(), last))

# Attach the last-training-trial info to the infant-level data and add the
# condition-level key used to join with the paper-level moderators.
all_data_w_train <- all_data %>%
  left_join(trainings, by = "specific") %>%
  mutate(paper_expt_info = paste(paper, expt_num, expt_cond, sep = "."))
# Paper-level (condition-level) data. Adds the condition key and, where a
# standard deviation is missing but a standard error was reported, derives the
# SD as SE * sqrt(n). If the SE is missing as well, the SD stays NA.
# NOTE(review): n_1 is used for all three trial types - presumably because the
# same infants contribute to each; confirm.
moderators <- read.csv("processed_data/meta-analysis_only_paper_data.csv") %>%
  mutate(
    paper_expt_info = paste(study_ID, expt_num, expt_condition, sep = "."),
    SD_1 = ifelse(is.na(SD_1), SE_1 * sqrt(n_1), SD_1),
    SD_2 = ifelse(is.na(SD_2), SE_2 * sqrt(n_1), SD_2),
    SD_3 = ifelse(is.na(SD_3), SE_3 * sqrt(n_1), SD_3)
  )
# Per-condition means and SDs computed from the infant-level data: last
# training trial, first expected trial, first unexpected trial, and age in days.
looking_time_per_condition_raw <- all_data_w_train %>%
  group_by(paper_expt_info) %>%
  dplyr::summarize(
    mean_hab = mean(last_train_looking, na.rm = TRUE),
    sd_hab = sd(last_train_looking, na.rm = TRUE),
    mean_expected = mean(expected1, na.rm = TRUE),
    sd_expected = sd(expected1, na.rm = TRUE),
    mean_unexpected = mean(unexpected1, na.rm = TRUE),
    sd_unexpected = sd(unexpected1, na.rm = TRUE),
    mean_age = mean(agedays, na.rm = TRUE),
    sd_age = sd(agedays, na.rm = TRUE)
  )
# One row per moderators row. Reported means/SDs (x_*, SD_*) are kept when
# present and filled from the infant-level summaries otherwise. For age, the
# reported values (mean_age_1, sd_age_1) take precedence over the raw summaries.
raw_data_mean_sd <- moderators %>%
  left_join(looking_time_per_condition_raw, by = "paper_expt_info") %>%
  mutate(
    x_1 = ifelse(is.na(x_1), mean_hab, x_1),
    x_2 = ifelse(is.na(x_2), mean_expected, x_2),
    x_3 = ifelse(is.na(x_3), mean_unexpected, x_3),
    SD_1 = ifelse(is.na(SD_1), sd_hab, SD_1),
    SD_2 = ifelse(is.na(SD_2), sd_expected, SD_2),
    SD_3 = ifelse(is.na(SD_3), sd_unexpected, SD_3),
    mean_age = ifelse(is.na(mean_age_1), mean_age, mean_age_1),
    sd_age = ifelse(is.na(sd_age_1), sd_age, sd_age_1)
  ) %>%
  dplyr::select(
    paper_expt_info,
    x_1, SD_1,
    x_2, SD_2,
    x_3, SD_3,
    mean_age, sd_age
  )
# Write the completed means, SDs and ages back into the moderators table.
# The copy is positional: raw_data_mean_sd was built by a left join starting
# from `moderators`, so its rows are expected to line up with `moderators`.
# Missing standard errors are then derived as SD / sqrt(n_1); they stay NA when
# the SD is missing too.
moderators_complete_means_sd <- moderators %>%
  mutate(
    x_1 = raw_data_mean_sd$x_1,
    x_2 = raw_data_mean_sd$x_2,
    x_3 = raw_data_mean_sd$x_3,
    SD_1 = raw_data_mean_sd$SD_1,
    SD_2 = raw_data_mean_sd$SD_2,
    SD_3 = raw_data_mean_sd$SD_3,
    mean_age_1 = raw_data_mean_sd$mean_age,
    sd_age_1 = raw_data_mean_sd$sd_age
  ) %>%
  mutate(
    SE_1 = ifelse(is.na(SE_1), SD_1 / sqrt(n_1), SE_1),
    SE_2 = ifelse(is.na(SE_2), SD_2 / sqrt(n_1), SE_2),
    SE_3 = ifelse(is.na(SE_3), SD_3 / sqrt(n_1), SE_3)
  )
# Build the mega-analysis table: infant-level data joined with the
# condition-level moderators, restricted to the columns that are published.
raw_data_w_moderators <- all_data_w_train %>%
  left_join(moderators_complete_means_sd, by = "paper_expt_info") %>%
  mutate(
    # expt_num exists in both tables; keep the infant-level one
    expt_num = expt_num.x,
    specific_subject_id = specific
  ) %>%
  dplyr::select(
    paper, long_cite, short_cite, doi, expt_num, expt_cond, subj,
    specific_subject_id, sex, agedays, order, exposure_phase,
    starts_with("train"), last_train_trial, last_train_looking,
    starts_with("expected"), starts_with("unexpected"),
    num_traintrials, num_trials, exp_or_control, equal_per_nov, domain,
    principle, PI_group, stim_loop, paper_expt_info
  ) %>%
  mutate(
    # "yes" when the train# columns hold every training trial; otherwise a
    # description of which trials the dataset contains.
    # The sanal-hayes study is matched under both spellings: the CSV file name
    # and the study_ID filter used for the gender summary spell it with a
    # hyphen, so an underscore-only test could silently never match.
    data_contains_all_trials = case_when(
      is.na(paper) ~ NA_character_,
      paper %in% c("hespos_2009", "gerson_2014a", "gerson_2014b") ~
        "no: first and last 3 hab trials",
      paper == "thoermer_2013" ~ "no: first 6 and last 3 hab trials",
      paper %in% c("sanal_hayes_2022", "sanal-hayes_2022") ~
        "no: trials 7 through 10",
      paper == "liu_unpublisheda" ~ "no: only fam6 (last training trial)",
      TRUE ~ "yes"
    ),
    # Harmonise sex coding to "m" / "f". Missing values (NA, blank cells, or a
    # literal "NA") stay NA instead of falling through to "f", so they are
    # excluded by the later !is.na(sex) filters.
    sex = case_when(
      is.na(sex) | sex %in% c("", "NA") ~ NA_character_,
      sex %in% c("Male", "male", "1", "M", "m") ~ "m",
      TRUE ~ "f"
    ),
    # "train7" -> "7"
    last_train_trial = gsub("train", "", as.character(last_train_trial)),
    # The trial number is only meaningful when all training trials are present
    last_train_trial = ifelse(
      data_contains_all_trials == "yes", last_train_trial, NA
    )
  ) %>%
  dplyr::select(-c(training_type, num_traintrials))
# data_contains_all_trials states whether the data include all training trials. When they do not, the train# columns do not correspond to the infant's actual training trial numbers, and the value describes which trials are included (in order).
# last_train_trial gives the number of training trials for each infant (NA when the data do not contain all trials).
# Proportion of female infants per condition, computed from the infant-level
# data (infants with missing sex are excluded), rounded to 3 decimals.
gender_proportion <- raw_data_w_moderators %>%
  group_by(paper_expt_info) %>%
  filter(!is.na(sex)) %>%
  mutate(gender_from_raw = round(sum(sex == "f") / n(), 3)) %>%
  dplyr::select(paper_expt_info, gender_from_raw) %>%
  distinct()

# Prefer the proportion computed from raw data; fall back to the reported
# gender_1 when no raw-data value is available for a condition.
meta_analysis_data <- moderators_complete_means_sd %>%
  left_join(gender_proportion, by = "paper_expt_info") %>%
  mutate(
    gender_1 = ifelse(is.na(gender_from_raw), gender_1, gender_from_raw)
  ) %>%
  dplyr::select(-gender_from_raw)
# Gender in infant-level data: share of "f" among infants with non-missing
# sex, order and age (stored in every row as total_f)
gender_f_raw <- raw_data_w_moderators %>%
  filter(!is.na(sex), !is.na(order), !is.na(agedays)) %>%
  mutate(total_f = sum(sex == "f") / n()) # 51.2% female

# Gender in condition-level data: estimated number of females per condition
# (proportion * n), excluding sanal-hayes_2022 and conditions without gender info
num_f_cond <- meta_analysis_data %>%
  filter(study_ID != "sanal-hayes_2022", !is.na(gender_1)) %>%
  mutate(num_f = gender_1 * n_1)
proportion_f_condition_level <- sum(num_f_cond$num_f) / sum(num_f_cond$n_1) # 50.3% female
# Save the condition-level (meta-analysis) and infant-level (mega-analysis) tables
write.csv(
  meta_analysis_data,
  file = "processed_data/meta-analysis_data.csv",
  row.names = FALSE
)
write.csv(
  raw_data_w_moderators,
  file = "processed_data/mega-analysis_data.csv",
  row.names = FALSE
)