library(tidyverse)
library(here)
library(ggthemes)
library(metafor)
library(kableExtra)
library(ggrepel)

d <- read_csv(here("data/metalab_data_mini.csv"))
author_d <- read_csv(here("data/clean_author.csv"))

source(here("helper/model_comparison_help.r"))
source(here("helper/method_mod_help.r"))
source(here("helper/author_help.r"))



best_fit_bmeasure_model_df <- readRDS(here("cached_data/best_fit_behavioral_measure_df.Rds"))
best_fit_ep_model_df <- readRDS(here("cached_data/best_fit_exposure_phase.Rds"))
best_fit_sn_model_df <- readRDS(here("cached_data/best_fit_stimuli_naturalness.Rds"))
best_fit_major_author_df <- readRDS(here("cached_data/best_fit_major_author_df.Rds"))

age_modesl_df <- readRDS(here("cached_data/age_models_df.Rds"))

major_author_fit_df <- readRDS(here("cached_data/major_author_model_ds.Rds"))

1 1. behavioral_measure

1.1 running models for comparison

1.2 model stats for significant values

best_fit_bmeasure_model_df %>% 
  filter(grepl("behavioral_measure", term)) %>% 
  filter(p.value < .05) %>% 
  kableExtra::kable(digits = .5) %>%  
  kableExtra::kable_styling() %>%
  kableExtra::scroll_box(width = "100%", height = "500px")
term type estimate std.error statistic p.value model_spec dataset model_spec_clean ic ML REML model_type
behavioral_measuremanual summary 0.8 0.2 3.6 0 d_calc ~ 1 + behavioral_measure Sound symbolism Const AICc 51.0 51.2 bmeasure
behavioral_measurelooking summary -0.8 0.2 -5.1 0 d_calc ~ 1 + behavioral_measure Vowel discrimination (native) Const AICc 210.2 206.8 bmeasure
behavioral_measurelooking summary -1.5 0.4 -3.4 0 d_calc ~ 1 + behavioral_measure Vowel discrimination (non-native) Const AICc 60.5 54.5 bmeasure

1.3 visualization

d %>% 
  filter(mean_age_months < 36) %>% 
  filter(ds_clean %in% unique(best_fit_bmeasure_model_df$dataset)) %>% 
  ggplot(aes(x = mean_age_months, y = d_calc, color = behavioral_measure)) + 
  geom_point(alpha = .5) + 
  geom_smooth(method = "lm") + 
  facet_wrap(~ds_clean)+ 
  theme_few()

2 2. Exposure phase

2.1 running models for comparison

2.2 model stats for significant values

best_fit_ep_model_df %>% 
  filter(grepl("exposure_phase", term)) %>% 
  filter(p.value < .05) %>% 
  kableExtra::kable(digits = .2) %>%  
  kable_styling() %>%
  kableExtra::scroll_box(width = "100%", height = "500px")
term type estimate std.error statistic p.value model_spec dataset model_spec_clean ic ML REML model_type
exposure_phasehabituation summary 0 0 -3 0 d_calc ~ log(mean_age_months) + exposure_phase Language discrimination and preference Log AICc 155 151 ep
exposure_phasefamiliarization summary 2 1 2 0 d_calc ~ 1 + exposure_phase Natural speech preference Const AICc 110 106 ep
exposure_phasenot_applicable summary 0 0 -2 0 d_calc ~ 1 + exposure_phase Neonatal Imitation Const AICc 58 53 ep
exposure_phasefamiliarization summary -1 0 -2 0 d_calc ~ 1 + exposure_phase Vowel discrimination (native) Const AICc 230 223 ep
exposure_phasehabituation summary -1 0 -3 0 d_calc ~ 1 + exposure_phase Vowel discrimination (native) Const AICc 230 223 ep

2.3 visualization

d %>% 
  filter(mean_age_months < 36) %>% 
  filter(ds_clean %in% unique(best_fit_ep_model_df$dataset)) %>% 
  ggplot(aes(x = mean_age_months, y = d_calc, color = exposure_phase)) + 
  geom_point(alpha = .5) + 
  geom_smooth(method = "lm") + 
  facet_wrap(~ds_clean)+ 
  theme_few()

3 3. Stimuli naturalness

## get subset of data
sn_d <- d %>% 
  distinct(ds_clean, stimuli_naturalness) %>% 
  filter(!is.na(stimuli_naturalness)) %>% 
  group_by(ds_clean) %>% 
  count() %>% 
  filter(!n == 1) %>% 
  pull(ds_clean)

## get model comparison stats
sn_models_df <- lapply(sn_d, 
       function(name){
         print(name)
         get_compare_IC_df(d %>% filter(!is.na(stimuli_naturalness)), name, c("stimuli_naturalness"))
       }) %>% 
  bind_rows() %>% 
  mutate(model_type = "sn")
## [1] "Abstract rule learning"
## [1] "full!"
## [1] "full!"
## [1] "full!"
## [1] "full!"
## [1] "Categorization bias"
## [1] "full!"
## [1] "full!"
## [1] "full!"
## [1] "full!"
## [1] "Label advantage in concept learning"
## [1] "full!"
## [1] "full!"
## [1] "full!"
## [1] "full!"
## [1] "Mutual exclusivity"
## [1] "full!"
## [1] "full!"
## [1] "full!"
## [1] "full!"
## [1] "Prosocial agents"
## [1] "full!"
## [1] "full!"
## [1] "full!"
## [1] "full!"
## [1] "Statistical word segmentation"
## [1] "full!"
## [1] "full!"
## [1] "full!"
## [1] "full!"
## [1] "Vowel discrimination (native)"
## [1] "full!"
## [1] "full!"
## [1] "full!"
## [1] "full!"
## [1] "Vowel discrimination (non-native)"
## [1] "full!"
## [1] "full!"
## [1] "full!"
## [1] "full!"
## find best fitting models 
min_AICc_sn_models_df <- sn_models_df %>% 
  filter(ic == "AICc") %>% 
  group_by(dataset) %>% 
  filter(REML == min(REML)) 

## finding the stats of the models 
sn_models_fits_df <- lapply(sn_d, 
       function(name){
         print(name)
         get_model_fit_df(d %>% filter(!is.na(stimuli_naturalness)), name, c("stimuli_naturalness"))
       }) %>% 
  bind_rows()
## [1] "Abstract rule learning"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "Categorization bias"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "Label advantage in concept learning"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "Mutual exclusivity"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "Prosocial agents"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "Statistical word segmentation"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "Vowel discrimination (native)"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "Vowel discrimination (non-native)"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## [1] "~ 1 | short_cite/same_infant/unique_row"
## find the best fit model's stats
best_fit_sn_model_df <- sn_models_fits_df %>% 
  left_join(min_AICc_sn_models_df, 
            by =  c("model_spec", "dataset", "model_spec_clean")) %>% 
  filter(!is.na(REML)) 

saveRDS(best_fit_sn_model_df, here("cached_data/best_fit_stimuli_naturalness.Rds"))

3.0.1 significant models

best_fit_sn_model_df %>% 
  filter(grepl("naturalness", term)) %>% 
  filter(p.value < 0.05) %>% 
  kableExtra::kable(digits = .2) %>%  
  kable_styling() %>%
  kableExtra::scroll_box(width = "100%", height = "500px")
term type estimate std.error statistic p.value model_spec dataset model_spec_clean ic ML REML model_type
stimuli_naturalnessnatural summary 0 0 2 0 d_calc ~ I(mean_age_months^2) + stimuli_naturalness Abstract rule learning Quadratic AICc 140 137 sn
stimuli_naturalnessnatural summary 0 0 2 0 d_calc ~ 1 + stimuli_naturalness Label advantage in concept learning Const AICc 170 169 sn
stimuli_naturalnessnatural summary 0 0 4 0 d_calc ~ 1 + stimuli_naturalness Statistical word segmentation Const AICc 118 118 sn

4 4. Major author

all_nested_d <-lapply(distinct(d, ds_clean) %>% pull(), 
       function(x){
         get_major_author_effect(d, x, author_d)
       }) %>% 
  bind_rows() %>% 
  group_by(ds_clean, major_author) %>% 
  nest() %>% 
  filter(!major_author %in% c("…SweigWilson", "deVilliers", "Fernald","GrafEstes", "HillairetdeBoisferon"
                              )) %>% 
  filter(!is.na(major_author) & ds_clean != "Prosocial Agents")

4.1 get the best fit major author models

major_author_fit_df <- readRDS(here("cached_data/major_author_model_ds.Rds"))

major_author_model_fit_df <- readRDS(here("cached_data/major_author_fits_ds.Rds"))

4.2 get the significant value

best_fit_major_author_model <- readRDS(here("cached_data/best_fit_major_author_df.Rds"))

best_fit_major_author_model %>% 
  filter(grepl("by_major_author", term), 
         p.value < 0.05) 
## # A tibble: 12 × 13
##    term    type  estim…¹ std.e…² stati…³ p.value model…⁴ dataset major…⁵ model…⁶
##    <chr>   <chr>   <dbl>   <dbl>   <dbl>   <dbl> <chr>   <chr>   <chr>   <chr>  
##  1 by_maj… summ…   2.04   0.924     2.21 2.72e-2 d_calc… Catego… Bauer   Log    
##  2 by_maj… summ…   2.04   0.924     2.21 2.72e-2 d_calc… Catego… Mandler Log    
##  3 by_maj… summ…  -0.388  0.190    -2.05 4.08e-2 d_calc… Cross-… Vlach   Quadra…
##  4 by_maj… summ…  -0.423  0.0731   -5.79 6.92e-9 d_calc… Famili… Keren-… Linear 
##  5 by_maj… summ…   0.536  0.252     2.13 3.35e-2 d_calc… Gaze f… Mundy   Quadra…
##  6 by_maj… summ…   0.650  0.230     2.83 4.66e-3 d_calc… Mispro… Swingl… Quadra…
##  7 by_maj… summ…   1.11   0.564     1.97 4.84e-2 d_calc… Online… Weisle… Const  
##  8 by_maj… summ…  -0.243  0.106    -2.29 2.20e-2 d_calc… Statis… Thiess… Const  
##  9 by_maj… summ…   0.542  0.274     1.98 4.82e-2 d_calc… Vowel … Bohn    Const  
## 10 by_maj… summ…   0.542  0.274     1.98 4.82e-2 d_calc… Vowel … Polka   Const  
## 11 by_maj… summ…   0.241  0.0819    2.94 3.24e-3 d_calc… Word S… Singh   Const  
## 12 by_maj… summ…   0.522  0.170     3.07 2.13e-3 d_calc… Infant… Werker  Const  
## # … with 3 more variables: ic <chr>, ML <dbl>, REML <dbl>, and abbreviated
## #   variable names ¹​estimate, ²​std.error, ³​statistic, ⁴​model_spec,
## #   ⁵​major_author, ⁶​model_spec_clean
best_fit_major_author_model %>% 
  filter(grepl("by_major_author", term), 
         p.value < 0.05) %>% 
  kableExtra::kable(digits = .2) %>%  
  kable_styling() %>%
  kableExtra::scroll_box(width = "100%", height = "500px") 
term type estimate std.error statistic p.value model_spec dataset major_author model_spec_clean ic ML REML
by_major_authoryes summary 2 1 2 0 d_calc ~ log(mean_age_months) + by_major_author Categorization bias Bauer Log AICc 284 273
by_major_authoryes summary 2 1 2 0 d_calc ~ log(mean_age_months) + by_major_author Categorization bias Mandler Log AICc 284 273
by_major_authoryes summary 0 0 -2 0 d_calc ~ I(mean_age_months^2) + by_major_author Cross-situational word learning Vlach Quadratic AICc 79 78
by_major_authoryes summary 0 0 -6 0 d_calc ~ mean_age_months + by_major_author Familiar word recognition Keren-Portnoy Linear AICc 16 21
by_major_authoryes summary 1 0 2 0 d_calc ~ I(mean_age_months^2) + by_major_author Gaze following (combined) Mundy Quadratic AICc 151 146
by_major_authoryes summary 1 0 3 0 d_calc ~ I(mean_age_months^2) + by_major_author Mispronunciation sensitivity Swingley Quadratic AICc 616 609
by_major_authoryes summary 1 1 2 0 d_calc ~ 1 + by_major_author Online word recognition Weisleder Const AICc 52 51
by_major_authoryes summary 0 0 -2 0 d_calc ~ 1 + by_major_author Statistical word segmentation Thiessen Const AICc 125 125
by_major_authoryes summary 1 0 2 0 d_calc ~ 1 + by_major_author Vowel discrimination (native) Bohn Const AICc 255 252
by_major_authoryes summary 1 0 2 0 d_calc ~ 1 + by_major_author Vowel discrimination (native) Polka Const AICc 255 252
by_major_authoryes summary 0 0 3 0 d_calc ~ 1 + by_major_author Word Segmentation (combined) Singh Const AICc 322 320
by_major_authoryes summary 1 0 3 0 d_calc ~ 1 + by_major_author Infant directed speech preference Werker Const AICc 67 66

4.3 visualization: not sure what’s the best way to do it