Most of the code here is borrowed from https://github.com/mikabr/aoa-prediction

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4          ✔ readr     2.1.5     
✔ forcats   1.0.0          ✔ stringr   1.5.1     
✔ ggplot2   3.5.1.9000     ✔ tibble    3.2.1     
✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
✔ purrr     1.0.4          
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
here() starts at /Users/visuallearninglab/Documents/babyview-object
library(knitr)
library(cowplot)

Attaching package: 'cowplot'

The following object is masked from 'package:lubridate':

    stamp
library(grid)
library(ggthemes)

Attaching package: 'ggthemes'

The following object is masked from 'package:cowplot':

    theme_map
library(lmerTest)
Loading required package: lme4
Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack


Attaching package: 'lmerTest'

The following object is masked from 'package:lme4':

    lmer

The following object is masked from 'package:stats':

    step
library(mirt)
Loading required package: stats4
Loading required package: lattice

Attaching package: 'mirt'

The following object is masked from 'package:lme4':

    fixef
library(grid)
library(viridis)
Loading required package: viridisLite
library(broom.mixed)
#devtools::install_github("langcog/wordbankr")
library(wordbankr)
library(glue)
library(broom)
library(MuMIn)
Registered S3 method overwritten by 'MuMIn':
  method        from 
  nobs.multinom broom
source("helpers.R")
base_size_print=10
label_size=3

1. Import object detection data

object_counts <- read_csv(here("data/overall_category_distribution.csv"))
Rows: 304 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): class_name
dbl (8): n_frame_video_subject_age_detections, n_unique_videos, total_frame_...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
aoa_values <- read_csv(here("data/coef_data/eng_aoas.csv")) %>%
  mutate(
    item_definition = case_when(
      item_definition == "TV" ~ "tv",
      item_definition %in% c("couch") ~ "couch, sofa",
      item_definition %in% c("trash") ~ "garbage, trash",
      item_definition %in% c("backyard") ~ "backyard, yard",
      item_definition %in% c("noodles") ~ "noodles, spaghetti",
      TRUE ~ item_definition
    )
  ) %>%
  pivot_wider(
    id_cols = "item_definition",
    names_from = measure,
    values_from = aoa,
    names_prefix = "aoa_"
  )
Rows: 1077 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): item_definition, measure
dbl (3): intercept, slope, aoa

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
object_counts_cleaned <- object_counts |>
  distinct(class_name, .keep_all=TRUE) |>
  # filtering out more spurious classes
  filter(class_name != "person" & class_name != "picture") |>
  mutate(proportion_cleaned = round(10000* proportion,5),
         percentage = 100*proportion,
         log_proportion = log(percentage),
         visual_frequency = scale(log_proportion)) |>
  rename(object=class_name)

2. Load existing word trajectory predictors

# loaded from https://github.com/mikabr/aoa-prediction
original_uni_joined <- load(here("data/coef_data/original_uni_joined.RData"))
original_uni_joined <- get(original_uni_joined)
load(here("data/coef_data/uni_joined.Rdata"))
eng_joined <- uni_joined |> filter(language == "English (American)") |> left_join(aoa_values, by=c("words"="item_definition"))
original_eng_joined <- original_uni_joined |> filter(language == "English (American)") |> left_join(aoa_values, by=c("words"="item_definition"))

Checking if the proportions from the new preprocessing code I incorporated correlates with the old ones pulled directly from Mika’s code.

added_coefs <- eng_joined |>
  left_join(original_eng_joined |> rename(original_prop = prop, original_frequency = frequency) |> select(original_prop, uni_lemma, measure, original_frequency, age)) |>
  filter(!is.na(original_prop) & !is.na(prop) & !is.na(original_prop) & !is.na(original_frequency))
Joining with `by = join_by(measure, uni_lemma, age)`
cor(added_coefs |> select(prop, original_prop, frequency, original_frequency))
                         prop original_prop  frequency original_frequency
prop               1.00000000    0.96592140 0.07433446         0.07476677
original_prop      0.96592140    1.00000000 0.06895806         0.06986208
frequency          0.07433446    0.06895806 1.00000000         0.99808364
original_frequency 0.07476677    0.06986208 0.99808364         1.00000000

Phew.

# based on this model just going to join on any row that contains the object even if that means matching with multiple words, since that's what it looks like is being done with MLU etc.
longer_cdi_objects <- object_counts_cleaned |>
  select(object, visual_frequency, log_proportion, AoA) |>
  rowwise() |>
  mutate(matching = list(
    eng_joined |> filter(str_starts(uni_lemma, fixed(paste0(object, " ")))) |> filter(lexical_classes == "nouns")
  )) |>
  unnest(matching) |>
  rename(original_object=object) |>
  filter(uni_lemma != "ice cream") |>
  rename(object=uni_lemma) |>
  filter(!is.na(language))

shorter_cdi_objects <- object_counts_cleaned |>
  select(object, visual_frequency, log_proportion, AoA)  |>
  mutate(original_object=object) |>
  left_join(eng_joined, by=c("object"="uni_lemma")) |>
  # getting rid of words that do not have a match
  filter(!is.na(language))
  
eng_vars <- bind_rows(longer_cdi_objects, shorter_cdi_objects) |>
  # re-calculating visual frequency after getting rid of all of the CDI words that don't exist in the model
  mutate(visual_frequency = scale(log_proportion))

And now looking at whether we missed any objects that were detected:

anti_join(object_counts_cleaned |> distinct(object), eng_vars |> distinct(original_object), by = c("object"="original_object"))
# A tibble: 3 × 1
  object
  <chr> 
1 penis 
2 butt  
3 vagina

Right.

Looking at whether any of our words don’t have AoAs and adding in AoAs to the object counts

eng_vars |> filter(is.na(aoa_produces) & is.na(aoa_understands)) |> distinct(words)
# A tibble: 0 × 1
# ℹ 1 variable: words <chr>
object_counts_cleaned <- object_counts_cleaned |> left_join(eng_vars |> distinct(original_object, aoa_produces, aoa_understands),
                                                            by = c("object"="original_object"))

Sanity checks:

nrow(eng_vars |> distinct(object))
[1] 288
nrow(eng_vars |> distinct(original_object))
[1] 285
nrow(eng_joined |> distinct(uni_lemma))
[1] 661

3. Straightforward AoA predictor

object_counts_long <- object_counts_cleaned %>%
  pivot_longer(
    cols = c(aoa_produces, aoa_understands),
    names_to = "aoa_type",
    values_to = "aoa_value"
  ) %>%
  mutate(aoa_type = recode(aoa_type,
                           "aoa_produces" = "AoA Production",
                           "aoa_understands" = "AoA Comprehension"))

# Create the faceted plot
aoa_objects_plot <- ggplot(object_counts_long, aes(x = aoa_value, y = log_proportion)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~ aoa_type, scales = "free_x") +
   ggrepel::geom_label_repel(
    data = object_counts_long |> filter(object %in% c("book", "dog", "milk", "penny", "chair", "table")),
    aes(label = object)
  ) +
  ylim(-10, 8) +  
  ggpubr::stat_cor(vjust = 2) +  
  ylab("Log (percentage of frames with object detected)") +
  xlab("Age of Acquisition") +
  theme_minimal()

aoa_objects_plot
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 107 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 107 rows containing non-finite outside the scale range
(`stat_cor()`).
Warning: Removed 107 rows containing missing values or values outside the scale range
(`geom_point()`).

ggsave(here("data/figures/aoa_plot.png"), aoa_objects_plot)
Saving 7 x 5 in image
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 107 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 107 rows containing non-finite outside the scale range
(`stat_cor()`).
Warning: Removed 107 rows containing missing values or values outside the scale range
(`geom_point()`).

3a. Linear modeling

simple_effect <- lm(aoa_understands ~ visual_frequency, data=object_counts_cleaned)

summary(simple_effect)

Call:
lm(formula = aoa_understands ~ visual_frequency, data = object_counts_cleaned)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.4271 -1.8977  0.2143  1.8240  5.9583 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)       16.5143     0.2069  79.834  < 2e-16 ***
visual_frequency  -0.7446     0.2120  -3.511  0.00056 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.809 on 185 degrees of freedom
  (104 observations deleted due to missingness)
Multiple R-squared:  0.06248,   Adjusted R-squared:  0.05742 
F-statistic: 12.33 on 1 and 185 DF,  p-value: 0.00056
all_effects <- lm(aoa_understands ~ visual_frequency + scale(MLU) + scale(frequency) + scale(babiness) + scale(concreteness) + scale(final_frequency) + scale(solo_freq) + scale(valence) + scale(arousal), data=eng_vars |> filter(!is.na(aoa_understands)))
summary(all_effects)

Call:
lm(formula = aoa_understands ~ visual_frequency + scale(MLU) + 
    scale(frequency) + scale(babiness) + scale(concreteness) + 
    scale(final_frequency) + scale(solo_freq) + scale(valence) + 
    scale(arousal), data = filter(eng_vars, !is.na(aoa_understands)))

Residuals:
    Min      1Q  Median      3Q     Max 
-5.0944 -1.3730  0.2687  1.3691  4.3914 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)            16.46332    0.02310 712.616  < 2e-16 ***
visual_frequency       -0.53989    0.02614 -20.653  < 2e-16 ***
scale(MLU)              0.30832    0.03443   8.955  < 2e-16 ***
scale(frequency)       -1.11273    0.06296 -17.674  < 2e-16 ***
scale(babiness)        -1.26348    0.02492 -50.704  < 2e-16 ***
scale(concreteness)    -0.24537    0.02476  -9.911  < 2e-16 ***
scale(final_frequency) -0.10040    0.04314  -2.327     0.02 *  
scale(solo_freq)       -0.24691    0.05229  -4.722 2.38e-06 ***
scale(valence)          0.12422    0.02407   5.160 2.53e-07 ***
scale(arousal)          0.24314    0.02492   9.756  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.996 on 7580 degrees of freedom
  (1012 observations deleted due to missingness)
Multiple R-squared:  0.5428,    Adjusted R-squared:  0.5423 
F-statistic: 999.9 on 9 and 7580 DF,  p-value: < 2.2e-16
r.squaredGLMM(all_effects)
           R2m       R2c
[1,] 0.5424989 0.5424989
car::vif(all_effects)
      visual_frequency             scale(MLU)       scale(frequency) 
              1.325997               2.327931               7.601051 
       scale(babiness)    scale(concreteness) scale(final_frequency) 
              1.150909               1.175413               3.660791 
      scale(solo_freq)         scale(valence)         scale(arousal) 
              4.858893               1.152224               1.181066 
#plot(all_effects)

The main models we’re using for CCN:

smaller_model_understands <- lm(aoa_understands ~ visual_frequency + scale(frequency) + scale(concreteness), data=eng_vars)
summary(smaller_model_understands)

Call:
lm(formula = aoa_understands ~ visual_frequency + scale(frequency) + 
    scale(concreteness), data = eng_vars)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.8108 -1.5604  0.0679  1.7191  5.1765 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)         16.89479    0.02762 611.579   <2e-16 ***
visual_frequency    -0.53941    0.02716 -19.858   <2e-16 ***
scale(frequency)    -1.60476    0.03086 -52.004   <2e-16 ***
scale(concreteness) -0.04485    0.03063  -1.464    0.143    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.442 on 8506 degrees of freedom
  (3122 observations deleted due to missingness)
Multiple R-squared:  0.2881,    Adjusted R-squared:  0.2879 
F-statistic:  1148 on 3 and 8506 DF,  p-value: < 2.2e-16
smaller_model_produces <- lm(aoa_produces ~ visual_frequency + scale(frequency) + scale(concreteness), data=eng_vars)
summary(smaller_model_produces)

Call:
lm(formula = aoa_produces ~ visual_frequency + scale(frequency) + 
    scale(concreteness), data = eng_vars)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.0127 -1.2800 -0.1222  1.4095 11.4814 

Coefficients:
                     Estimate Std. Error  t value Pr(>|t|)    
(Intercept)         23.871026   0.021252 1123.233   <2e-16 ***
visual_frequency     0.005206   0.021615    0.241     0.81    
scale(frequency)    -2.231432   0.021629 -103.169   <2e-16 ***
scale(concreteness) -0.312489   0.021234  -14.716   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.28 on 11506 degrees of freedom
  (122 observations deleted due to missingness)
Multiple R-squared:  0.4926,    Adjusted R-squared:  0.4925 
F-statistic:  3724 on 3 and 11506 DF,  p-value: < 2.2e-16

Which words might be causing this disconnect

anti_join(eng_vars |> distinct(object), eng_vars |> filter(!is.na(aoa_understands)) |> distinct(object))
Joining with `by = join_by(object)`
# A tibble: 101 × 1
   object       
   <chr>        
 1 can (object) 
 2 tape (object)
 3 nail (object)
 4 glue (object)
 5 bat (object) 
 6 mop (object) 
 7 crayon       
 8 bucket       
 9 napkin       
10 basket       
# ℹ 91 more rows

just looking at correlations

eng_vars_filtered <- eng_vars |> 
  filter(!is.na(frequency) & !is.na(visual_frequency) & !is.na(concreteness))
Warning: Using one column matrices in `filter()` was deprecated in dplyr 1.1.0.
ℹ Please use one dimensional logical vectors instead.
cor(eng_vars_filtered |> distinct(visual_frequency, frequency, final_frequency, concreteness, object) |> select(-object), use = "pairwise.complete.obs")
                 visual_frequency   frequency final_frequency concreteness
visual_frequency       1.00000000  0.18575190       0.2703418   0.04820423
frequency              0.18575190  1.00000000       0.6907779  -0.03236825
final_frequency        0.27034184  0.69077787       1.0000000   0.11109767
concreteness           0.04820423 -0.03236825       0.1110977   1.00000000

Formats seen, frequency, total count and babiness all show separate positive predictivity which is encouraging.

4. Implementing Mika’s code

Pulled code below from https://github.com/mikabr/aoa-prediction – joined AoA data and predictor data.

predictors <- c("frequency", "MLU", "final_frequency", "solo_frequency",
                "num_phons", "concreteness", "valence", "arousal", "babiness", "visual_frequency")
.alpha <- 0.05
set.seed(42)

4a. Impute and scale data for input into models.

model_data <- eng_vars %>%
  select(object, all_of(predictors)) %>%
  distinct() 

pred_sources <- list(
  c("frequency", "MLU", "final_frequency", "solo_frequency"),
  c("valence", "arousal"),
  c("visual_frequency"),
  "concreteness", "babiness", "num_phons"
)

fit_predictor <- function(pred, d) {
   if (!"object" %in% colnames(d)) stop("Missing 'object' column in data")
  xs <- predictors[predictors != pred]
  if (length(xs) == 0) stop(glue("No valid predictors for {pred}"))
  print(length(xs))
  x_str <- xs %>% paste(collapse = " + ")
  formula_str <- glue("{pred} ~ {x_str}")
  print(glue("Fitting model: {formula_str}"))  # Debugging
  lm(as.formula(glue("{pred} ~ {x_str}")), data = d) %>%
    augment(newdata = d) %>%
    select(object, .fitted)
}

max_steps <- 10
iterate_predictors <- function(lang_data) {
  missing <- lang_data %>%
    pivot_longer(names_to="predictor", values_to="value", all_of(predictors)) %>%
    mutate(missing = is.na(value)) %>%
    select(-value) %>%
    pivot_wider(names_from=predictor, values_from=missing)
  predictor_order <- lang_data %>%
    gather(predictor, value, all_of(predictors)) %>%
    group_by(predictor) %>%
    summarise(num_na = sum(is.na(value))) %>%
    filter(num_na != 0) %>%
    arrange(num_na) %>%
    pull(predictor)
  imputation_data <- lang_data %>%
   mutate(across(all_of(predictors), ~ as.numeric(Hmisc::impute(., fun = "random"))))

  for (i in 0:max_steps) {
    pred <- predictor_order[(i %% length(predictor_order)) + 1]
    imputation_fits <- fit_predictor(pred, imputation_data)
    imputation_data <- missing %>%
      select(object, !!pred) %>%
      rename(missing = !!pred) %>%
      right_join(imputation_data) %>%
      left_join(imputation_fits) %>%
      mutate(across(pred, ~ if_else(is.na(.), .fitted, .))) %>%
      select(-.fitted, -missing)
  }
  return(imputation_data)
}

model_data_imputed <- iterate_predictors(model_data)
Warning: attributes are not identical across measure variables; they will be
dropped
[1] 9
Fitting model: MLU ~ frequency + final_frequency + solo_frequency + num_phons + concreteness + valence + arousal + babiness + visual_frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `across(pred, ~if_else(is.na(.), .fitted, .))`.
Caused by warning:
! Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(pred)

  # Now:
  data %>% select(all_of(pred))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
[1] 9
Fitting model: final_frequency ~ frequency + MLU + solo_frequency + num_phons + concreteness + valence + arousal + babiness + visual_frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 9
Fitting model: frequency ~ MLU + final_frequency + solo_frequency + num_phons + concreteness + valence + arousal + babiness + visual_frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 9
Fitting model: solo_frequency ~ frequency + MLU + final_frequency + num_phons + concreteness + valence + arousal + babiness + visual_frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 9
Fitting model: concreteness ~ frequency + MLU + final_frequency + solo_frequency + num_phons + valence + arousal + babiness + visual_frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 9
Fitting model: arousal ~ frequency + MLU + final_frequency + solo_frequency + num_phons + concreteness + valence + babiness + visual_frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 9
Fitting model: valence ~ frequency + MLU + final_frequency + solo_frequency + num_phons + concreteness + arousal + babiness + visual_frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 9
Fitting model: babiness ~ frequency + MLU + final_frequency + solo_frequency + num_phons + concreteness + valence + arousal + visual_frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 9
Fitting model: MLU ~ frequency + final_frequency + solo_frequency + num_phons + concreteness + valence + arousal + babiness + visual_frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 9
Fitting model: final_frequency ~ frequency + MLU + solo_frequency + num_phons + concreteness + valence + arousal + babiness + visual_frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 9
Fitting model: frequency ~ MLU + final_frequency + solo_frequency + num_phons + concreteness + valence + arousal + babiness + visual_frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
uni_model_data <- model_data_imputed %>%
  mutate(across(all_of(predictors), ~ as.numeric(scale(.)))) %>%  
  right_join(
    eng_vars %>% 
      select(all_of(c("measure", "object", "age", "num_true", "num_false"
                      )))
  ) %>%
  group_by(measure) %>%
  mutate(
    unscaled_age = age, age = scale(age),
    total = as.double(num_true + num_false), 
    prop = num_true / total
)
Joining with `by = join_by(object)`

4b. Fitting models

create_formula <- function(response = "prop", interaction_prefix = "age", predictors_list, random_effect = "(age | object)") {
  # Construct interaction terms for each list of predictors
  effects <- paste(interaction_prefix, predictors_list, sep = " * ")
  
  # Combine terms into a full formula string
  formula_str <- glue("{response} ~ {random_effect} + {paste(effects, collapse = ' + ')}")
  
  # Return as a formula object
  return(as.formula(formula_str))
}

fit_group_model <- function(group_data, group_formula, contrasts = NULL) {
  group <- unique(group_data$group)
  message(glue("Fitting model for {group}..."))
    glmer(formula = group_formula,
      data = group_data,
      family = binomial,
      weights = total,
      contrasts = contrasts)
}


by_measure_data <- uni_model_data %>%
  mutate(group = paste(measure)) %>%
  select(measure, group, object, prop,
         total, age, !!predictors) %>%
  group_by(measure) %>%
  nest()

effects_formula <- create_formula(predictors_list = predictors)

  measure_models <- by_measure_data %>%
  mutate(model = data %>%
           map(~fit_group_model(.x, effects_formula)))
Fitting model for produces...
Fitting model for understands...
  measure_fits <- measure_models %>%
  mutate(results = map(model, tidy))

measure_levels <- c("understands", "produces")

measure_coefs <- measure_fits %>%
  select(measure, results) %>%
  unnest() %>%
  rename(std_error = std.error,
         #z_value = z.value,
         p_value = p.value) %>%
  separate(term, c("term", "effect"), sep = " & ", fill = "right") %>%
 mutate(
   effect = if_else(grepl(":", term), "interaction with age", effect),
    effect = if_else(is.na(effect), "main effect", effect),
    term = if_else(grepl(":", term),
                   sub(".*:", "", term), term),
    effect = fct_relevel(effect, "main effect", "interaction with age"),
    measure = factor(measure, levels = measure_levels),
         signif = p_value < .alpha) %>%
  group_by(measure, term, effect) %>%
  # TODO:added in this line?
  filter(!is.na(std_error)) %>%
  nest()
Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(results)`.

4c. Plotting model effects

predictor_effects <- measure_coefs %>%
  filter(effect == "main effect", term %in% predictors) %>%
  rename(predictor_effect = data) %>%
  select(-effect)
Adding missing grouping variables: `effect`
mean_term_coefs <- measure_coefs %>%
  unnest(cols = c(data)) %>%
  filter(effect == "main effect") %>%
  group_by(term) %>%
  summarise(mean_estimate = mean(estimate),
            n_sig = sum(signif==TRUE),
            n_pos = sum(estimate > 0),
            n_neg = sum(estimate < 0)) %>%
  arrange(desc(abs(mean_estimate)))

coef_order <- mean_term_coefs %>% pull(term)

display_predictors <- function(predictors) {
  predictors %>%
    str_replace("num", "number") %>% str_replace("phons", "phonemes") %>%
    map_chr(label_caps) %>% str_replace("MLU", "MLU-w")
}

label_caps <- function(value) {
  if_else(toupper(value) == value, value,
          paste0(toupper(substr(value, 1, 1)),
                 tolower(substr(value, 2, nchar(value))))) %>%
    str_replace_all("_", " ")
}

plt_fixed_coefs <- measure_coefs %>%
  unnest() %>%
  mutate(term = term %>% factor(levels = rev(coef_order)) %>%
           fct_relabel(display_predictors),
         signif = ifelse(signif, "significant", "non-significant") %>%
           fct_rev(),
         measure = fct_recode(measure, "comprehension" = "understands",
                              "production" = "produces")) |>
  filter(!(term %in% c("Age", "(intercept)")))
Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(data)`.
ggplot(plt_fixed_coefs, aes(x = estimate, y = term)) +
  facet_grid(measure ~ effect, scales = "free",
             labeller = as_labeller(label_caps)) +
  geom_pointrange(aes(colour = term, shape = signif,
                      xmin = estimate - 1.96 * std_error,
                      xmax = estimate + 1.96 * std_error)) +
  geom_vline(xintercept = 0, color = "grey", linetype = "dotted") +
  scale_colour_viridis(discrete = TRUE, guide = FALSE) +
  scale_shape_manual(values = c(19, 21), guide = FALSE) +
  labs(y = "", x = "Coefficient estimate", title="Word developmental trajectory predictors")
Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
ggplot2 3.3.4.
ℹ Please use "none" instead.

Estimates of coefficients in predicting words’ developmental trajectories for English comprehension and production data. Larger coefficient values indicate a greater effect of the predictor on acquisition: positive main effects indicate that words with higher values of the predictor tend to be understood/produced by more children, while negative main effects indicate that words with lower values of the predictor tend to be understood/produced by more children; positive age interactions indicate that the predictor’s effect increases with age, while negative age interactions indicate the predictor’s effect decreases with age. Line ranges indicates 95% confidence intervals; filled in points indicate coefficients for which \(p < 0.05\).
write.csv(plt_fixed_coefs, "fixed_coefs.csv", row.names = FALSE)

This isn’t exactly what I expected so I’m going to try out different models quickly by collapsing our code into helper funcs ## 4d. Testing out different models by collapsing into functions

get_model_data <- function(eng_vars=eng_vars, curr_predictors=predictors) {
  model_data <- eng_vars %>%
  select(object, all_of(curr_predictors)) %>%
  distinct() 

fit_predictor <- function(pred, d) {
   if (!"object" %in% colnames(d)) stop("Missing 'object' column in data")
  xs <-xs <- curr_predictors[curr_predictors != pred]
  if (length(xs) == 0) stop(glue("No valid predictors for {pred}"))
  print(length(xs))
  x_str <- xs %>% paste(collapse = " + ")
  formula_str <- glue("{pred} ~ {x_str}")
  print(glue("Fitting model: {formula_str}"))  # Debugging
  lm(as.formula(glue("{pred} ~ {x_str}")), data = d) %>%
    augment(newdata = d) %>%
    select(object, .fitted)
}

max_steps <- 10
iterate_predictors <- function(lang_data) {
  missing <- lang_data %>%
    pivot_longer(names_to="predictor", values_to="value", all_of(curr_predictors)) %>%
    mutate(missing = is.na(value)) %>%
    select(-value) %>%
    pivot_wider(names_from=predictor, values_from=missing)
  predictor_order <- lang_data %>%
    gather(predictor, value, all_of(curr_predictors)) %>%
    group_by(predictor) %>%
    summarise(num_na = sum(is.na(value))) %>%
    filter(num_na != 0) %>%
    arrange(num_na) %>%
    pull(predictor)
  imputation_data <- lang_data %>%
   mutate(across(all_of(curr_predictors), ~ as.numeric(Hmisc::impute(., fun = "random"))))

  for (i in 0:max_steps) {
    pred <- predictor_order[(i %% length(predictor_order)) + 1]
    imputation_fits <- fit_predictor(pred, imputation_data)
    imputation_data <- missing %>%
      select(object, !!pred) %>%
      rename(missing = !!pred) %>%
      right_join(imputation_data) %>%
      left_join(imputation_fits) %>%
      mutate(across(pred, ~ if_else(is.na(.), .fitted, .))) %>%
      select(-.fitted, -missing)
  }
  return(imputation_data)
}

model_data_imputed <- iterate_predictors(model_data)

uni_model_data <- model_data_imputed %>%
  mutate(across(all_of(curr_predictors), ~ as.numeric(scale(.)))) %>%  
  right_join(
    eng_vars %>% 
      select(all_of(c("measure", "object", "age", "num_true", "num_false"
                      )))
  ) %>%
  group_by(measure) %>%
  mutate(
    unscaled_age = age, age = scale(age),
    total = as.double(num_true + num_false), 
    prop = num_true / total
)
return(uni_model_data)
}
get_measure_coefs <- function(formula,model_data=uni_model_data, curr_predictors=predictors) {
   by_measure_data <- model_data %>%
  mutate(group = paste(measure)) %>%
  select(measure, group, object, prop,
         total, age, !!curr_predictors) %>%
  group_by(measure) %>%
  nest()
  
measure_models <- by_measure_data %>%
  mutate(model = data %>%
           map(~fit_group_model(.x, formula)))
  measure_fits <- measure_models %>%
  mutate(results = map(model, tidy))

measure_levels <- c("understands", "produces")

measure_coefs <- measure_fits %>%
  select(measure, results) %>%
  unnest() %>%
  rename(std_error = std.error,
         #z_value = z.value,
         p_value = p.value) %>%
  separate(term, c("term", "effect"), sep = " & ", fill = "right") %>%
 mutate(
   effect = if_else(grepl(":", term), "interaction with age", effect),
    effect = if_else(is.na(effect), "main effect", effect),
    term = if_else(grepl(":", term),
                   sub(".*:", "", term), term),
    effect = fct_relevel(effect, "main effect", "interaction with age"),
    measure = factor(measure, levels = measure_levels),
         signif = p_value < .alpha) %>%
  group_by(measure, term, effect) %>%
  # TODO:added in this line?
  filter(!is.na(std_error)) %>%
  nest()
return(measure_coefs)
}

plot_effects_data <- function(measure_coefs, plot_title="Word developmental trajectory predictors", curr_predictors=predictors) {
predictor_effects <- measure_coefs %>%
  filter(effect == "main effect", term %in% curr_predictors) %>%
  rename(predictor_effect = data) %>%
  select(-effect)
mean_term_coefs <- measure_coefs %>%
  unnest(cols = c(data)) %>%
  filter(effect == "main effect") %>%
  group_by(term) %>%
  summarise(mean_estimate = mean(estimate),
            n_sig = sum(signif==TRUE),
            n_pos = sum(estimate > 0),
            n_neg = sum(estimate < 0)) %>%
  arrange(desc(abs(mean_estimate)))

coef_order <- mean_term_coefs %>% pull(term)
effects_colors <- viridisLite::viridis(length(coef_order), option = "D")[-1]
plt_fixed_coefs <- measure_coefs %>%
  unnest() %>%
  mutate(term = term %>% factor(levels = rev(coef_order)) %>%
           fct_relabel(display_predictors),
         signif = ifelse(signif, "significant", "non-significant") %>%
           fct_rev(),
         measure = fct_recode(measure, "comprehension" = "understands",
                              "production" = "produces")) |>
  filter(!(term %in% c("Age", "(intercept)")))
ggplot(plt_fixed_coefs, aes(x = estimate, y = term)) +
  facet_grid(measure ~ effect, scales = "free",
             labeller = as_labeller(label_caps)) +
  geom_pointrange(aes(colour = term, shape = signif,
                      xmin = estimate - 1.96 * std_error,
                      xmax = estimate + 1.96 * std_error)) +
  geom_vline(xintercept = 0, color = "grey", linetype = "dotted") +
  scale_colour_manual(values = effects_colors, guide = FALSE) +
  scale_shape_manual(values = c(19, 21), guide = FALSE) +
  labs(y = "", x = "Coefficient estimate", title=plot_title)
}

4e. Finally testing out different models

Fitting model for produces...
Fitting model for understands...
Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(results)`.
Adding missing grouping variables: `effect`
Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(data)`.

Just removing visual frequency and seeing what the effects look like for our set of items

non_viz_predictors <- predictors[!grepl("visual", predictors)]
non_viz_effects_formula <- create_formula(predictors_list = non_viz_predictors)
non_viz_model_measure_coefs <- get_measure_coefs(non_viz_effects_formula) 
Fitting model for produces...
Fitting model for understands...
Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(results)`.
plot_effects_data(non_viz_model_measure_coefs, plot_title="Non visual frequency effects")
Adding missing grouping variables: `effect`
Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(data)`.

The predictors we probably want for the CCN paper

smaller_set_of_predictors <- c("visual_frequency", "frequency", "concreteness")
input_model_data <- eng_vars
smaller_effects_formula <- create_formula(predictors_list = smaller_set_of_predictors)
smaller_model_data <- get_model_data(eng_vars=input_model_data, curr_predictors=smaller_set_of_predictors)
Warning: attributes are not identical across measure variables; they will be
dropped
[1] 2
Fitting model: frequency ~ visual_frequency + concreteness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: concreteness ~ visual_frequency + frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: frequency ~ visual_frequency + concreteness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: concreteness ~ visual_frequency + frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: frequency ~ visual_frequency + concreteness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: concreteness ~ visual_frequency + frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: frequency ~ visual_frequency + concreteness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: concreteness ~ visual_frequency + frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: frequency ~ visual_frequency + concreteness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: concreteness ~ visual_frequency + frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: frequency ~ visual_frequency + concreteness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
smaller_model_measure_coefs <- get_measure_coefs(smaller_effects_formula, model_data=smaller_model_data, curr_predictors=smaller_set_of_predictors) 
Fitting model for produces...
Fitting model for understands...
Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(results)`.
plot_effects_data(smaller_model_measure_coefs, plot_title="Word developmental trajectory predictors", curr_predictors=smaller_set_of_predictors)
Adding missing grouping variables: `effect`
Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(data)`.

Trying to get rid of low object detections to see if that makes a difference

filtered_model_data <- eng_vars |> filter(log_proportion>-5) |> mutate(visual_frequency=scale(log_proportion))
smaller_model_data <- get_model_data(eng_vars=filtered_model_data, curr_predictors=smaller_set_of_predictors)
Warning: attributes are not identical across measure variables; they will be
dropped
[1] 2
Fitting model: frequency ~ visual_frequency + concreteness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: concreteness ~ visual_frequency + frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: frequency ~ visual_frequency + concreteness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: concreteness ~ visual_frequency + frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: frequency ~ visual_frequency + concreteness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: concreteness ~ visual_frequency + frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: frequency ~ visual_frequency + concreteness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: concreteness ~ visual_frequency + frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: frequency ~ visual_frequency + concreteness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: concreteness ~ visual_frequency + frequency
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 2
Fitting model: frequency ~ visual_frequency + concreteness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
smaller_model_measure_coefs <- get_measure_coefs(smaller_effects_formula, model_data=smaller_model_data, curr_predictors=smaller_set_of_predictors) 
Fitting model for produces...
Fitting model for understands...
Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(results)`.
plot_effects_data(smaller_model_measure_coefs, plot_title="Word developmental trajectory predictors", curr_predictors=smaller_set_of_predictors)
Adding missing grouping variables: `effect`
Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(data)`.

Seeing how closely I can replicate the original Wordbank plot with the caveat that I’m not using lexical category as an additional predictor

original_predictors = predictors[!(predictors %in% c("visual_frequency"))]
original_model_data <- get_model_data(eng_vars=eng_joined |> rename("object"="uni_lemma"), curr_predictors=original_predictors)
Warning: attributes are not identical across measure variables; they will be
dropped
[1] 8
Fitting model: final_frequency ~ frequency + MLU + solo_frequency + num_phons + concreteness + valence + arousal + babiness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 8
Fitting model: frequency ~ MLU + final_frequency + solo_frequency + num_phons + concreteness + valence + arousal + babiness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 8
Fitting model: solo_frequency ~ frequency + MLU + final_frequency + num_phons + concreteness + valence + arousal + babiness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 8
Fitting model: MLU ~ frequency + final_frequency + solo_frequency + num_phons + concreteness + valence + arousal + babiness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 8
Fitting model: concreteness ~ frequency + MLU + final_frequency + solo_frequency + num_phons + valence + arousal + babiness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 8
Fitting model: babiness ~ frequency + MLU + final_frequency + solo_frequency + num_phons + concreteness + valence + arousal
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 8
Fitting model: arousal ~ frequency + MLU + final_frequency + solo_frequency + num_phons + concreteness + valence + babiness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 8
Fitting model: valence ~ frequency + MLU + final_frequency + solo_frequency + num_phons + concreteness + arousal + babiness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 8
Fitting model: final_frequency ~ frequency + MLU + solo_frequency + num_phons + concreteness + valence + arousal + babiness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 8
Fitting model: frequency ~ MLU + final_frequency + solo_frequency + num_phons + concreteness + valence + arousal + babiness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
[1] 8
Fitting model: solo_frequency ~ frequency + MLU + final_frequency + num_phons + concreteness + valence + arousal + babiness
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
Joining with `by = join_by(object)`
original_model_measure_coefs <- get_measure_coefs(non_viz_effects_formula, model_data=original_model_data, curr_predictors=original_predictors) 
Fitting model for produces...
Fitting model for understands...
Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(results)`.
plot_effects_data(original_model_measure_coefs, plot_title="Original model data", curr_predictors=original_predictors)
Adding missing grouping variables: `effect`
Warning: `cols` is now required when using `unnest()`.
ℹ Please use `cols = c(data)`.

5. Correlation with other predictors

# Compute correlation matrix

# Compute correlation matrix
cor_matrix <- cor(model_data |> select(-object, -num_phons, -valence, -arousal), use = "pairwise.complete.obs")

# Convert correlation matrix to long format
cor_melted <- reshape::melt(cor_matrix)
Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
the caller; using TRUE
Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
the caller; using TRUE
# Reverse order of labels
cor_melted$X1 <- factor(cor_melted$X1, levels = rev(unique(cor_melted$X1)))
cor_melted$X2 <- factor(cor_melted$X2, levels = rev(unique(cor_melted$X2)))

# Define color mapping for axis labels
axis_label_colors <- ifelse(grepl("visual", levels(cor_melted$X1)), "red", "black")

# Plot heatmap with updated label ordering and coloring
ggplot(cor_melted, aes(X1, X2, fill = value)) +
  geom_tile() +
  scale_fill_viridis(option = "mako", direction = 1) + 
  theme_minimal() +
  geom_text(aes(label = round(value, 2)), color = "white", size = 3) +  # Label each square
  labs(title = "Heatmap of Predictor Correlations",
       fill = "Correlation") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, color = axis_label_colors),
        axis.text.y = element_text(color = axis_label_colors))
Warning: Vectorized input to `element_text()` is not officially supported.
ℹ Results may be unexpected or may change in future versions of ggplot2.
Warning: Vectorized input to `element_text()` is not officially supported.
ℹ Results may be unexpected or may change in future versions of ggplot2.