library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4          ✔ readr     2.1.5     
✔ forcats   1.0.0          ✔ stringr   1.5.1     
✔ ggplot2   3.5.1.9000     ✔ tibble    3.2.1     
✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
✔ purrr     1.0.4          
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(here)

here() starts at /Users/visuallearninglab/Documents/vedi_survey/vedi

library(knitr)
library(cowplot)


Attaching package: 'cowplot'

The following object is masked from 'package:lubridate':

    stamp

library(grid)
library(ggthemes)


Attaching package: 'ggthemes'

The following object is masked from 'package:cowplot':

    theme_map

library(lmerTest)

Loading required package: lme4
Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack


Attaching package: 'lmerTest'

The following object is masked from 'package:lme4':

    lmer

The following object is masked from 'package:stats':

    step

library(mirt)

Loading required package: stats4
Loading required package: lattice

Attaching package: 'mirt'

The following object is masked from 'package:lme4':

    fixef

library(grid)
library(viridis)

Loading required package: viridisLite

#devtools::install_github("langcog/wordbankr")
library(wordbankr)
library(glue)
library(broom)

source("helpers.R")
base_size_print=10
label_size=3

#devtools::install_github("mikabr/jglmm"): https://github.com/mikabr/jglmm
# TODO: switch to just using lmer
options(JULIA_HOME = "/Users/visuallearninglab/.juliaup/bin")
library(jglmm)


Attaching package: 'jglmm'

The following objects are masked from 'package:lme4':

    cbpp, grouseticks, sleepstudy

jglmm_setup()

Julia version 1.11.3 at location /Users/visuallearninglab/.julia/juliaup/julia-1.11.3+0.aarch64.apple.darwin14/bin will be used.

Loading setup script for JuliaCall...

Finish loading setup script for JuliaCall.

# TODO: switch to more efficiently using scale(), I did not know that was a function 
center_vars <- function(data) {
  return(data |>
           mutate(visual_frequency_centered = scale(Frequency_numeric_val),
         total_count_centered = scale(TotalCount_numeric_val),
         interaction_count_centered = scale(InteractionCount_numeric_val),
         formats_seen_centered = scale(FormatsSeen_numeric_val),
         babiness_centered = scale(babiness_mean),
         age_centered = scale(childAge)))
}

combined_normed <- read_csv(file.path(here('data','main', 'processed'),"processed-responses.csv"))

Rows: 2351 Columns: 22
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr   (6): object, globalID, category, childSiblingAges, feedback, age_half
dbl  (14): aoa, Frequency_normalized_val, TotalCount_normalized_val, Interac...
dttm  (2): startTimestamp, endTimestamp

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

combined_normed_summary <- read_csv(file.path(here('data','main', 'processed'),"processed-responses-summarized.csv"))

Rows: 49 Columns: 16
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (2): object, category
dbl (14): aoa, Frequency_normalized_val, TotalCount_normalized_val, Interact...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# can't use this directly since there are multiple elements with the same AoA dependent variable value, can't add item-level random effects
centered_vars <- combined_normed |>
  filter(!is.na(InteractionCount_numeric_val) & !is.na(aoa)) |>
  center_vars()

# can use this but it does not include anything age-related and we are just predicting object
centered_summarized_vars <- combined_normed_summary |>
  center_vars()

# loaded from https://github.com/mikabr/aoa-prediction
load(here("analysis/main/coef_data/uni_joined.Rdata"))
eng_joined <- uni_joined |> filter(language == "English (American)")

vedi_for_lm <- centered_vars |> filter(7.5 < childAge & childAge < 31) |>
  mutate(rounded_age = round(childAge)) |>
  select(-globalID) |>
  summarize(participant_count = n(), across(where(is.numeric), ~ mean(.x, na.rm = TRUE)), .by=c("object", "category", "rounded_age")) |> 
  center_vars() |>
  mutate(babiness_centered = ifelse(is.nan(babiness_centered), NA, babiness_centered)) |>
  select(-matches("normalized|mean|numeric")) |>
  # joining mean values for each item as fixed item effects 
  left_join(combined_normed_summary |> select("object", "Frequency_numeric_val", "FormatsSeen_numeric_val",
                                              "TotalCount_numeric_val", "InteractionCount_numeric_val"))

Joining with `by = join_by(object)`

eng_vars <- vedi_for_lm |>
  left_join(eng_joined, by=c("object"="uni_lemma","rounded_age"="age")) |>
  filter(!is.na(final_frequency))

nrow(eng_vars |> distinct(object))

[1] 42

nrow(eng_joined |> distinct(uni_lemma))

[1] 393

nrow(combined_normed |> distinct(globalID))

[1] 61

#eng_vars <- eng_joined |> rename(object=uni_lemma, rounded_age=age)

Initial models with only babiness as an additional predictor

main_effect_per_object <- lmer(aoa ~ (1 | category) + babiness_centered + formats_seen_centered +
                      interaction_count_centered + total_count_centered + visual_frequency_centered, data=centered_summarized_vars)

main_effect <- lmer(aoa ~ (1 | globalID) + babiness_centered * age_centered + age_centered * formats_seen_centered +
                      age_centered * interaction_count_centered + age_centered * total_count_centered + age_centered * visual_frequency_centered, data=centered_vars)
simpler_model <- lmer(aoa ~ (1 | globalID), data=centered_vars)

boundary (singular) fit: see help('isSingular')

summary(main_effect)

Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: 
aoa ~ (1 | globalID) + babiness_centered * age_centered + age_centered *  
    formats_seen_centered + age_centered * interaction_count_centered +  
    age_centered * total_count_centered + age_centered * visual_frequency_centered
   Data: centered_vars

REML criterion at convergence: 7637.7

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.5326 -0.7199  0.1041  0.7140  2.6252 

Random effects:
 Groups   Name        Variance Std.Dev.
 globalID (Intercept) 0.6145   0.7839  
 Residual             7.5422   2.7463  
Number of obs: 1553, groups:  globalID, 61

Fixed effects:
                                          Estimate Std. Error         df
(Intercept)                               22.62806    0.12432   45.40676
babiness_centered                         -0.82004    0.07370 1494.05709
age_centered                               0.16394    0.12143   47.32315
formats_seen_centered                     -1.14473    0.08219 1219.87944
interaction_count_centered                 0.02668    0.08595 1520.05056
total_count_centered                      -0.42895    0.08559 1483.51853
visual_frequency_centered                 -0.34800    0.09181 1483.14829
babiness_centered:age_centered            -0.09864    0.07175 1488.91347
age_centered:formats_seen_centered         0.08923    0.07911  676.71247
age_centered:interaction_count_centered    0.02157    0.09002 1486.88152
age_centered:total_count_centered          0.05605    0.07556 1498.89029
age_centered:visual_frequency_centered    -0.07090    0.08931 1462.97999
                                        t value Pr(>|t|)    
(Intercept)                             182.020  < 2e-16 ***
babiness_centered                       -11.127  < 2e-16 ***
age_centered                              1.350 0.183413    
formats_seen_centered                   -13.928  < 2e-16 ***
interaction_count_centered                0.310 0.756322    
total_count_centered                     -5.011 6.05e-07 ***
visual_frequency_centered                -3.791 0.000156 ***
babiness_centered:age_centered           -1.375 0.169411    
age_centered:formats_seen_centered        1.128 0.259753    
age_centered:interaction_count_centered   0.240 0.810646    
age_centered:total_count_centered         0.742 0.458285    
age_centered:visual_frequency_centered   -0.794 0.427381    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
             (Intr) bbnss_ ag_cnt frmt__ intr__ ttl_c_ vsl_f_ bbn_:_
bbnss_cntrd   0.005                                                 
age_centerd   0.010  0.009                                          
frmts_sn_cn   0.040  0.071 -0.015                                   
intrctn_cn_   0.009 -0.093 -0.009 -0.027                            
ttl_cnt_cnt   0.008  0.044 -0.068 -0.179 -0.181                     
vsl_frqncy_  -0.036 -0.220  0.040 -0.158 -0.412 -0.236              
bbnss_cnt:_   0.009 -0.003  0.012 -0.003  0.014 -0.005  0.004       
ag_cntrd:f__ -0.017 -0.008 -0.056 -0.018  0.013  0.077 -0.002  0.019
ag_cntrd:n__ -0.015  0.017 -0.021  0.020 -0.086 -0.002  0.026 -0.082
ag_cntrd:t__ -0.062  0.013 -0.005  0.083  0.005 -0.256 -0.005  0.086
ag_cntrd:v__  0.040  0.000  0.003 -0.003  0.034 -0.004  0.004 -0.163
             ag_cntrd:f__ ag_cntrd:n__ ag_cntrd:t__
bbnss_cntrd                                        
age_centerd                                        
frmts_sn_cn                                        
intrctn_cn_                                        
ttl_cnt_cnt                                        
vsl_frqncy_                                        
bbnss_cnt:_                                        
ag_cntrd:f__                                       
ag_cntrd:n__  0.003                                
ag_cntrd:t__ -0.150       -0.180                   
ag_cntrd:v__ -0.117       -0.483       -0.228

summary(main_effect_per_object)

Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: aoa ~ (1 | category) + babiness_centered + formats_seen_centered +  
    interaction_count_centered + total_count_centered + visual_frequency_centered
   Data: centered_summarized_vars

REML criterion at convergence: 152.8

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-2.24210 -0.65315  0.04542  0.65079  2.07593 

Random effects:
 Groups   Name        Variance Std.Dev.
 category (Intercept) 0.4037   0.6354  
 Residual             3.5587   1.8864  
Number of obs: 38, groups:  category, 7

Fixed effects:
                           Estimate Std. Error       df t value Pr(>|t|)    
(Intercept)                22.87712    0.40068  2.46881  57.096 6.46e-05 ***
babiness_centered          -0.92937    0.38611 29.33707  -2.407   0.0226 *  
formats_seen_centered      -2.58112    0.43207 15.07343  -5.974 2.50e-05 ***
interaction_count_centered -0.22555    0.59758 28.27300  -0.377   0.7087    
total_count_centered        0.14401    0.49031 30.86503   0.294   0.7709    
visual_frequency_centered   0.06727    0.79719 30.64012   0.084   0.9333    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation of Fixed Effects:
            (Intr) bbnss_ frmt__ intr__ ttl_c_
bbnss_cntrd -0.029                            
frmts_sn_cn  0.086  0.077                     
intrctn_cn_ -0.087 -0.024 -0.007              
ttl_cnt_cnt -0.114  0.257 -0.393  0.142       
vsl_frqncy_  0.093 -0.371 -0.100 -0.708 -0.534

summary(simpler_model)

Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: aoa ~ (1 | globalID)
   Data: centered_vars

REML criterion at convergence: 8869.5

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-1.54579 -0.90387  0.05901  0.70093  1.98478 

Random effects:
 Groups   Name        Variance  Std.Dev. 
 globalID (Intercept) 8.049e-17 8.972e-09
 Residual             9.707e+00 3.116e+00
Number of obs: 1735, groups:  globalID, 61

Fixed effects:
             Estimate Std. Error        df t value Pr(>|t|)    
(Intercept)   22.8161     0.0748 1734.0000     305   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
optimizer (nloptwrap) convergence code: 0 (OK)
boundary (singular) fit: see help('isSingular')

Formats seen, frequency, total count and babiness all show separate positive predictivity which is encouraging.

Implementing Mika’s code

Pulled code below from https://github.com/mikabr/aoa-prediction – joined AoA data and predictor data.

#load("../saved_data/uni_joined.RData")

predictors <- c("frequency", "MLU", "final_frequency", "solo_frequency",
                "num_phons", "concreteness", "valence", "arousal", "babiness", "Frequency_numeric_val", "FormatsSeen_numeric_val",
                                              "TotalCount_numeric_val", "InteractionCount_numeric_val")
.alpha <- 0.05
set.seed(42)

##Impute and scale data for input into models.

model_data <- eng_vars %>%
  select(object, all_of(predictors)) %>%
  distinct() #%>%
 # group_by(language) %>%
  # mutate_at(vars(!!predictors), funs(as.numeric(scale(.)))) %>%
  #nest()

pred_sources <- list(
  c("frequency", "MLU", "final_frequency", "solo_frequency"),
  c("valence", "arousal"),
  c("Frequency_numeric_val", "FormatsSeen_numeric_val", "TotalCount_numeric_val", "InteractionCount_numeric_val"),
  "concreteness", "babiness", "num_phons"
)

 #c("interaction_count_centered", "formats_seen_centered",
    #            "total_count_centered", "visual_frequency_centered"),

fit_predictor <- function(pred, d) {
   if (!"object" %in% colnames(d)) stop("Missing 'object' column in data")
  xs <- pred_sources %>% discard(~pred %in% .x) %>% unlist()
  if (length(xs) == 0) stop(glue("No valid predictors for {pred}"))
  print(length(xs))
  x_str <- xs %>% paste(collapse = " + ")
  formula_str <- glue("{pred} ~ {x_str}")
  print(glue("Fitting model: {formula_str}"))  # Debugging
  lm(as.formula(glue("{pred} ~ {x_str}")), data = d) %>%
    augment(newdata = d) %>%
    select(object, .fitted)
}

max_steps <- 10
iterate_predictors <- function(lang_data) {
  missing <- lang_data %>%
    pivot_longer(names_to="predictor", values_to="value", all_of(predictors)) %>%
    mutate(missing = is.na(value)) %>%
    select(-value) %>%
    pivot_wider(names_from=predictor, values_from=missing)
  predictor_order <- lang_data %>%
    gather(predictor, value, all_of(predictors)) %>%
    group_by(predictor) %>%
    summarise(num_na = sum(is.na(value))) %>%
    filter(num_na != 0) %>%
    arrange(num_na) %>%
    pull(predictor)
  imputation_data <- lang_data %>%
   mutate(across(all_of(predictors), ~ as.numeric(Hmisc::impute(., fun = "random"))))

  for (i in 0:max_steps) {
    pred <- predictor_order[(i %% length(predictor_order)) + 1]
    imputation_fits <- fit_predictor(pred, imputation_data)
    imputation_data <- missing %>%
      select(object, !!pred) %>%
      rename(missing = !!pred) %>%
      right_join(imputation_data) %>%
      left_join(imputation_fits) %>%
      mutate(across(pred, ~ if_else(is.na(.), .fitted, .))) %>%
      select(-.fitted, -missing)
  }
  return(imputation_data)
}

model_data_imputed <- iterate_predictors(model_data)

[1] 12
Fitting model: babiness ~ frequency + MLU + final_frequency + solo_frequency + valence + arousal + Frequency_numeric_val + FormatsSeen_numeric_val + TotalCount_numeric_val + InteractionCount_numeric_val + concreteness + num_phons

Joining with `by = join_by(object)`
Joining with `by = join_by(object)`

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `across(pred, ~if_else(is.na(.), .fitted, .))`.
Caused by warning:
! Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(pred)

  # Now:
  data %>% select(all_of(pred))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.

[1] 12
Fitting model: babiness ~ frequency + MLU + final_frequency + solo_frequency + valence + arousal + Frequency_numeric_val + FormatsSeen_numeric_val + TotalCount_numeric_val + InteractionCount_numeric_val + concreteness + num_phons

Joining with `by = join_by(object)`
Joining with `by = join_by(object)`

[1] 12
Fitting model: babiness ~ frequency + MLU + final_frequency + solo_frequency + valence + arousal + Frequency_numeric_val + FormatsSeen_numeric_val + TotalCount_numeric_val + InteractionCount_numeric_val + concreteness + num_phons

Joining with `by = join_by(object)`
Joining with `by = join_by(object)`

[1] 12
Fitting model: babiness ~ frequency + MLU + final_frequency + solo_frequency + valence + arousal + Frequency_numeric_val + FormatsSeen_numeric_val + TotalCount_numeric_val + InteractionCount_numeric_val + concreteness + num_phons

Joining with `by = join_by(object)`
Joining with `by = join_by(object)`

[1] 12
Fitting model: babiness ~ frequency + MLU + final_frequency + solo_frequency + valence + arousal + Frequency_numeric_val + FormatsSeen_numeric_val + TotalCount_numeric_val + InteractionCount_numeric_val + concreteness + num_phons

Joining with `by = join_by(object)`
Joining with `by = join_by(object)`

[1] 12
Fitting model: babiness ~ frequency + MLU + final_frequency + solo_frequency + valence + arousal + Frequency_numeric_val + FormatsSeen_numeric_val + TotalCount_numeric_val + InteractionCount_numeric_val + concreteness + num_phons

Joining with `by = join_by(object)`
Joining with `by = join_by(object)`

[1] 12
Fitting model: babiness ~ frequency + MLU + final_frequency + solo_frequency + valence + arousal + Frequency_numeric_val + FormatsSeen_numeric_val + TotalCount_numeric_val + InteractionCount_numeric_val + concreteness + num_phons

Joining with `by = join_by(object)`
Joining with `by = join_by(object)`

[1] 12
Fitting model: babiness ~ frequency + MLU + final_frequency + solo_frequency + valence + arousal + Frequency_numeric_val + FormatsSeen_numeric_val + TotalCount_numeric_val + InteractionCount_numeric_val + concreteness + num_phons

Joining with `by = join_by(object)`
Joining with `by = join_by(object)`

[1] 12
Fitting model: babiness ~ frequency + MLU + final_frequency + solo_frequency + valence + arousal + Frequency_numeric_val + FormatsSeen_numeric_val + TotalCount_numeric_val + InteractionCount_numeric_val + concreteness + num_phons

Joining with `by = join_by(object)`
Joining with `by = join_by(object)`

[1] 12
Fitting model: babiness ~ frequency + MLU + final_frequency + solo_frequency + valence + arousal + Frequency_numeric_val + FormatsSeen_numeric_val + TotalCount_numeric_val + InteractionCount_numeric_val + concreteness + num_phons

Joining with `by = join_by(object)`
Joining with `by = join_by(object)`

[1] 12
Fitting model: babiness ~ frequency + MLU + final_frequency + solo_frequency + valence + arousal + Frequency_numeric_val + FormatsSeen_numeric_val + TotalCount_numeric_val + InteractionCount_numeric_val + concreteness + num_phons

Joining with `by = join_by(object)`
Joining with `by = join_by(object)`

uni_model_data <- model_data_imputed %>%
  mutate(across(all_of(predictors), ~ as.numeric(scale(.)))) %>%  
  right_join(
    eng_vars %>% 
      select(all_of(c("measure", "object", "rounded_age", "num_true", "num_false" 
                      ,"interaction_count_centered", "formats_seen_centered",
                      "total_count_centered", "visual_frequency_centered", "participant_count"
                      )))
  ) %>%
  group_by(measure) %>%
  mutate(
    unscaled_age = rounded_age, 
    age = scale(rounded_age), 
    total = as.double(num_true + num_false), 
    prop = num_true / total
)

Joining with `by = join_by(object)`

Fitting models

create_formula <- function(response = "prop", interaction_prefix = "age", predictors_list, random_effect = "(age | object)") {
  # Construct interaction terms for each list of predictors
  effects <- paste(interaction_prefix, predictors_list, sep = " * ")
  
  # Combine terms into a full formula string
  formula_str <- glue("{response} ~ {random_effect} + {paste(effects, collapse = ' + ')}")
  
  # Return as a formula object
  return(as.formula(formula_str))
}

# TODO: use glmer instead of jglmm for reproducibility
fit_group_model <- function(group_data, group_formula, contrasts = NULL) {
  group <- unique(group_data$group)
  message(glue("Fitting model for {group}..."))
    jglmm(formula = group_formula, data = group_data, family = "binomial",
          weights = group_data$total, contrasts = contrasts)
}

varying_vedi_predictors <- c("interaction_count_centered", "formats_seen_centered",
                "total_count_centered", "visual_frequency_centered")

by_measure_data <- uni_model_data %>%
  mutate(group = paste(measure)) %>%
  select(measure, group, object, prop,
         total, age, !!predictors, varying_vedi_predictors) %>%
  group_by(measure) %>%
  nest()

Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(varying_vedi_predictors)

  # Now:
  data %>% select(all_of(varying_vedi_predictors))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.

non_vedi_predictors <- predictors[!grepl("numeric", predictors)]
varying_predictors = c(non_vedi_predictors, varying_vedi_predictors)
effects_formula <- create_formula(predictors_list = predictors)
varying_effects_formula <- create_formula(predictors_list = varying_predictors)
non_vedi_effects_formula <- create_formula(predictors_list = non_vedi_predictors)

  measure_models <- by_measure_data %>%
  mutate(model = data %>%
           map(~fit_group_model(.x, effects_formula)))

Fitting model for produces...

Fitting model for understands...

  measure_fits <- measure_models %>%
  mutate(results = map(model, tidy))

measure_levels <- c("understands", "produces")

measure_coefs <- measure_fits %>%
  select(measure, results) %>%
  unnest() %>%
  rename(std_error = std.error,
         #z_value = z.value,
         p_value = p.value) %>%
  separate(term, c("term", "effect"), sep = " & ", fill = "right") %>%
  mutate(effect = if_else(is.na(effect), "main effect", effect),
         term = if_else(term == "age" & effect != "main effect",
                        effect, term),
         effect = if_else(term == effect, "age", effect),
         effect = if_else(effect == "main effect", effect,
                          paste("interaction with", effect)),
         effect = effect %>%
           fct_relevel("main effect",
                       "interaction with age"),
         measure = factor(measure, levels = measure_levels),
         signif = p_value < .alpha) %>%
  group_by(measure, term, effect) %>%
  # TODO:added in this line?
  filter(!is.na(std_error)) %>%
  nest()

Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(results)`.

predictor_effects <- measure_coefs %>%
  filter(effect == "main effect", term %in% predictors) %>%
  rename(predictor_effect = data) %>%
  select(-effect)

Adding missing grouping variables: `effect`

mean_term_coefs <- measure_coefs %>%
  unnest(cols = c(data)) %>%
  filter(effect == "main effect") %>%
  group_by(term) %>%
  summarise(mean_estimate = mean(estimate),
            n_sig = sum(signif==TRUE),
            n_pos = sum(estimate > 0),
            n_neg = sum(estimate < 0)) %>%
  arrange(desc(abs(mean_estimate)))

coef_order <- mean_term_coefs %>% pull(term)

display_predictors <- function(predictors) {
  predictors %>%
    str_replace("Count", " count") %>%
    str_replace("Seen", " seen") %>%
    str_replace("Frequency_numeric_val", "Visual frequency") %>%
    str_replace("_numeric_val", "") %>%
    str_replace("num", "number") %>% str_replace("phons", "phonemes") %>%
    map_chr(label_caps) %>% str_replace("MLU", "MLU-w")
}

label_caps <- function(value) {
  if_else(toupper(value) == value, value,
          paste0(toupper(substr(value, 1, 1)),
                 tolower(substr(value, 2, nchar(value))))) %>%
    str_replace_all("_", " ")
}

plt_fixed_coefs <- measure_coefs %>%
  unnest() %>%
  mutate(term = term %>% factor(levels = rev(coef_order)) %>%
           fct_relabel(display_predictors),
         signif = ifelse(signif, "significant", "non-significant") %>%
           fct_rev(),
         measure = fct_recode(measure, "comprehension" = "understands",
                              "production" = "produces"))

Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(data)`.

ggplot(plt_fixed_coefs, aes(x = estimate, y = term)) +
  facet_grid(measure ~ effect, scales = "free",
             labeller = as_labeller(label_caps)) +
  geom_pointrange(aes(colour = term, shape = signif,
                      xmin = estimate - 1.96 * std_error,
                      xmax = estimate + 1.96 * std_error)) +
  geom_vline(xintercept = 0, color = "grey", linetype = "dotted") +
  scale_colour_viridis(discrete = TRUE, guide = FALSE) +
  scale_shape_manual(values = c(19, 21), guide = FALSE) +
  labs(y = "", x = "Coefficient estimate", title="Fixed VEDI item effects")

Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
ggplot2 3.3.4.
ℹ Please use "none" instead.

Estimates of coefficients in predicting words’ developmental trajectories for English comprehension and production data. Larger coefficient values indicate a greater effect of the predictor on acquisition: positive main effects indicate that words with higher values of the predictor tend to be understood/produced by more children, while negative main effects indicate that words with lower values of the predictor tend to be understood/produced by more children; positive age interactions indicate that the predictor’s effect increases with age, while negative age interactions indicate the predictor’s effect decreases with age. Line ranges indicates 95% confidence intervals; filled in points indicate coefficients for which \(p < 0.05\).

This isn’t exactly what I expected so I’m going to have to try to re-run this without our predictors too

plot_effects_data <- function(formula, plot_title="Fixed VEDI item effects") {
measure_models <- by_measure_data %>%
  mutate(model = data %>%
           map(~fit_group_model(.x, formula)))
  measure_fits <- measure_models %>%
  mutate(results = map(model, tidy))

measure_levels <- c("understands", "produces")

measure_coefs <- measure_fits %>%
  select(measure, results) %>%
  unnest() %>%
  rename(std_error = std.error,
         #z_value = z.value,
         p_value = p.value) %>%
  separate(term, c("term", "effect"), sep = " & ", fill = "right") %>%
  mutate(effect = if_else(is.na(effect), "main effect", effect),
         term = if_else(term == "age" & effect != "main effect",
                        effect, term),
         effect = if_else(term == effect, "age", effect),
         effect = if_else(effect == "main effect", effect,
                          paste("interaction with", effect)),
         effect = effect %>%
           fct_relevel("main effect",
                       "interaction with age"),
         measure = factor(measure, levels = measure_levels),
         signif = p_value < .alpha) %>%
  group_by(measure, term, effect) %>%
  # TODO:added in this line?
  filter(!is.na(std_error)) %>%
  nest()
predictor_effects <- measure_coefs %>%
  filter(effect == "main effect", term %in% predictors) %>%
  rename(predictor_effect = data) %>%
  select(-effect)
mean_term_coefs <- measure_coefs %>%
  unnest(cols = c(data)) %>%
  filter(effect == "main effect") %>%
  group_by(term) %>%
  summarise(mean_estimate = mean(estimate),
            n_sig = sum(signif==TRUE),
            n_pos = sum(estimate > 0),
            n_neg = sum(estimate < 0)) %>%
  arrange(desc(abs(mean_estimate)))

coef_order <- mean_term_coefs %>% pull(term)
plt_fixed_coefs <- measure_coefs %>%
  unnest() %>%
  mutate(term = term %>% factor(levels = rev(coef_order)) %>%
           fct_relabel(display_predictors),
         signif = ifelse(signif, "significant", "non-significant") %>%
           fct_rev(),
         measure = fct_recode(measure, "comprehension" = "understands",
                              "production" = "produces"))
ggplot(plt_fixed_coefs, aes(x = estimate, y = term)) +
  facet_grid(measure ~ effect, scales = "free",
             labeller = as_labeller(label_caps)) +
  geom_pointrange(aes(colour = term, shape = signif,
                      xmin = estimate - 1.96 * std_error,
                      xmax = estimate + 1.96 * std_error)) +
  geom_vline(xintercept = 0, color = "grey", linetype = "dotted") +
  scale_colour_viridis(discrete = TRUE, guide = FALSE) +
  scale_shape_manual(values = c(19, 21), guide = FALSE) +
  labs(y = "", x = "Coefficient estimate", title=plot_title)

}

Fitting model for produces...

Fitting model for understands...

Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(results)`.

Adding missing grouping variables: `effect`

Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(data)`.

Fitting model for produces...
Fitting model for understands...

Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(results)`.

Adding missing grouping variables: `effect`

Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(data)`.

Fitting model for produces...
Fitting model for understands...

Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(results)`.

Adding missing grouping variables: `effect`

Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(data)`.

Got rid of this code but I was basically able to replicate the original plot with the expected negative MLU correlation, indicating that we might not have enough items to reliably plot our effects. Lack of age-based effects should not be surprising given that each point only had 1 or 2 different values. How do we interpret that the less items that infants interact with look at the faster they learn words?