1 Zero-order correlations among predictors & outcomes
2 First step: predict AoAs
3 Get a sense of what words are most different for each measure
4 Get a sense of difference distributions
5 Predict Wordbank-Kuperman
- 5.1 Plot Wordbank-Kuperman models
6 Predict Kuperman - Picture-naming
- 6.1 Plot Kuperman - Picture-naming models
7 Predict Wordbank - Picture-naming
- 7.1 Plot Wordbank - Picture-naming models
8 Predict Wordbank - SONA
- 8.1 Plot Wordbank - SONA models
- 8.2 Plot frequency vs. difference
9 Compare models for subjective - ground truth measures (K - pn and Wb - pn)

# Per-word generality ratings. Two bookkeeping columns are dropped and `word`
# is renamed (presumably so it does not collide with the `word` column of the
# main data set after the join below).
generality <- rename(
  select(
    read_csv("generality_ratings_byWord_combined.csv"),
    -X1, -num_item_id_wrong
  ),
  word_generality = word
)

# Main regression data, with generality ratings attached by item id and two
# difference scores computed relative to picture-naming (Morrison) AoA.
data <- mutate(
  left_join(
    read_csv("all_variables_for_aoa_regressions_clean.csv"),
    generality,
    by = "num_item_id"
  ),
  kuperman_minus_pn = KupermanAoA - morrison_aoa_threshold_years,
  parent_minus_pn = parentreport_calculated_aoa_years - morrison_aoa_threshold_years
)

# Rows that have a generality rating.
complete_generality <- filter(data, !is.na(mean_generality))

### CHILDES is deliberately left out as a filter here: when it is included,
### only 23 words remain.
# Rows that have a picture-naming AoA.
complete_predictors_nowordbank <- filter(data, !is.na(morrison_aoa_threshold_years))

# word info: where are we losing things?
# picture-naming: have for 128 words - but there are 300 originally tested. It turns out only 128 words overlap with the parent-report measures (10 of our words + 118 on CDI)

# get CDI threshold info (e.g. Thill & Twomey 2016) to look at parent-report AoA that way
# Threshold AoA = the youngest age at which at least 50% of children are
# reported to produce the word.
wordbank_items <- get_item_data(language = "English (American)", form = "WS")
wordbank_threshold_aoa <- summarise_items(wordbank_items) %>%
  # NOTE(review): grouping is by uni_lemma although the join key built below is
  # item-level; items sharing a uni_lemma (or with a missing one) end up in the
  # same group -- confirm this is intended.
  group_by(uni_lemma) %>%
  filter(production >= .5) %>%
  # Keep the *youngest* qualifying age. Selecting on min(production) only gives
  # the youngest age when production rises strictly with age, which is not
  # guaranteed; this also matches how naive_sona picks min(age_tested).
  filter(age == min(age)) %>%
  # Drop the grouping so it cannot leak into later steps.
  ungroup() %>%
  # Split item_id on "_" into item_id_1 / item_id_2; the second piece is the
  # numeric part used to build the join key.
  cSplit(., "item_id", sep = "_") %>%
  # Stored as character so the key type matches the left_join() by
  # "num_item_id" downstream (same conversion as before, done in place).
  # NOTE(review): the +1 offset is kept as in the original -- confirm against
  # the item numbering of the regression data.
  mutate(num_item_id = as.character(item_id_2 + 1)) %>%
  select(num_item_id, uni_lemma, wordbank_threshold_age = age, n_wordbank_children = n_children, wordbank_pct_producing_at_aoa = production)


# Naive-adult AoA per word: among the ages at which at least half of the
# raters said the child would say the word, keep the youngest and express it
# in years (age_tested / 12, i.e. presumably recorded in months).
# The result stays grouped by `word`, as before.
naive_sona <- read_csv("../6_naive_adult_cdi/naive_sona_aoas.csv") %>%
  filter(prop_say >= .5) %>%
  group_by(word) %>%
  filter(age_tested == min(age_tested)) %>%
  mutate(sona_aoa = age_tested / 12) %>%
  select(word, sona_aoa)

# Analysis data set: picture-naming words joined with the Wordbank threshold
# AoA and the naive-adult AoA, restricted to words that have a Wordbank
# threshold age, plus pairwise AoA differences and mean-centered predictors.
complete_predictors <- complete_predictors_nowordbank %>%
  left_join(wordbank_threshold_aoa, by = "num_item_id") %>%
  left_join(naive_sona, by = "word") %>%
  # Words without a naive-adult AoA after the join are assigned 4 years.
  mutate(sona_aoa_years = ifelse(is.na(sona_aoa), 4, sona_aoa)) %>%
  select(-sona_aoa) %>%
  filter(!is.na(wordbank_threshold_age)) %>%
  mutate(wordbank_aoa_years = wordbank_threshold_age / 12,
         wordbank_minus_pn = wordbank_aoa_years - morrison_aoa_threshold_years,
         wordbank_minus_kuperman = wordbank_aoa_years - KupermanAoA,
         wordbank_minus_sona = wordbank_aoa_years - sona_aoa_years,
         sona_minus_pn = sona_aoa_years - morrison_aoa_threshold_years,
         sona_minus_kuperman = sona_aoa_years - KupermanAoA,
         # scale() returns a one-column matrix carrying a "scaled:center"
         # attribute; as.numeric() stores a plain numeric vector instead, so
         # the columns behave like ordinary variables when written or plotted.
         # Centering is done after the filter above, i.e. on the analysed words.
         concreteness_centered = as.numeric(scale(concreteness, scale = FALSE)),
         preschoolness_centered = as.numeric(scale(preschoolness, scale = FALSE)),
         frequency_centered = as.numeric(scale(childes_adult_log_freq, scale = FALSE)),
         helpfulness_centered = as.numeric(scale(helpfulness, scale = FALSE)))
#13 words don't meet 50% production criteria by 30 months

#write csv for loading into viz script
#write_csv(complete_predictors, "complete_predictors.csv")

# Raw and mean-centered names of the four word-level predictors.
full_predictor_list <- c("preschoolness", "helpfulness", "childes_adult_log_freq", "concreteness")
centered_predictor_list <- c("preschoolness_centered", "helpfulness_centered", "concreteness_centered", "frequency_centered")

#' Build a model formula `outcome ~ predictor1 + predictor2 + ...`
#'
#' @param outcome_var Name of the outcome column (a single string).
#' @param predictors Character vector of predictor names; defaults to
#'   `full_predictor_list`, which reproduces the previous behaviour.
#' @return A formula object, returned visibly.
write_model_formula <- function(outcome_var, predictors = full_predictor_list) {
  outcome_var <- as.character(outcome_var)
  stopifnot(length(outcome_var) == 1L, length(predictors) > 0L)
  as.formula(paste(outcome_var, "~", paste(predictors, collapse = " + ")))
}

1 Zero-order correlations among predictors & outcomes

Codebook:
wordbank_thresh_aoa: AoA, determined by youngest age in Wordbank (CDI) at which 50% of kids produce word
KupermanAoA: AoA from Kuperman norms (adult retrospective self-report, unconstrained)
picture_naming_aoa: AoA from Morrison et al. (1997), in which children actually named pictures. AoA is the youngest age at which 75% of children named the picture correctly. The youngest age tested was 3 years, so AoAs below 3 are extrapolated from CDI norms (a known limitation).
naive_adult_aoa: AoA from a survey of college students (N=36 so far). Were randomized to complete CDI checklist for either an 18-month-old or 24-month-old. AoA is youngest age at which 50% of raters said child would produce word so possible values are 1.5 or 2 years. For words that didn’t reach 50% threshold, put AoA as 4 years.
babiness and preschoolness: on a scale of 1-5, how much is the word associated with babies/preschoolers (MTurk)
helpfulness: on a scale of 1-5, how helpful would it be for a preschooler to know this word (MTurk)
frequency: log frequency based on adult speech in CHILDES
concreteness: concreteness norms from Brysbaert et al. - adults asked to rate on a scale of 1-5
generality: on a scale of 1-5, how general is this word (MTurk)

# Columns entering the correlation matrix, renamed to the codebook labels.
data_forcorr <- select(
  complete_predictors,
  KupermanAoA,
  wordbank_thresh_aoa = wordbank_aoa_years,
  picture_naming_aoa = morrison_aoa_threshold_years,
  naive_adult_aoa = sona_aoa_years,
  babiness,
  preschoolness,
  helpfulness,
  frequency = childes_adult_log_freq,
  concreteness,
  generality = mean_generality
)

# Pearson correlations on pairwise-complete observations, plus the matching
# matrix of p-values used to blank out non-significant cells in the plot.
corrs <- cor(x = data_forcorr, method = "pearson", use = "pairwise.complete.obs")
pmat <- cor.mtest(data_forcorr, method = "pearson")
pval <- pmat[["p"]]

# Lower-triangle correlation heat map with coefficients printed in each cell;
# cells with p >= .05 are left blank.
# addCoef.col expects a colour: the previous TRUE only worked because it is
# coerced to palette index 1 (black under the default palette).
corrplot(corrs, method = "color", type = "lower", addCoef.col = "black",
         tl.col = "black", diag = FALSE, p.mat = pval, sig.level = .05,
         insig = "blank", number.cex = .7)

Note: although generality is correlated with other variables, it wasn’t a significant predictor in any models tested (raw AoAs or differences between different AoA measures), so it’s not included in any further analyses here.

2 First step: predict AoAs

2.1 Wordbank AoA

Kuperman and naive adults predict Wordbank, but picture-naming doesn’t.

# Wordbank AoA regressed on the four word-level predictors plus Kuperman AoA.
pr_kup <- lm(
  formula = wordbank_aoa_years ~ preschoolness + helpfulness +
    childes_adult_log_freq + concreteness + KupermanAoA,
  data = complete_predictors
)
summary(pr_kup)

## 
## Call:
## lm(formula = wordbank_aoa_years ~ preschoolness + helpfulness + 
##     childes_adult_log_freq + concreteness + KupermanAoA, data = complete_predictors)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.41163 -0.13177  0.00245  0.11739  0.51277 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             4.294873   0.845642   5.079 1.58e-06 ***
## preschoolness          -0.030126   0.027151  -1.110 0.269623    
## helpfulness             0.009449   0.029107   0.325 0.746083    
## childes_adult_log_freq -0.144526   0.018367  -7.869 2.81e-12 ***
## concreteness           -0.352973   0.169164  -2.087 0.039260 *  
## KupermanAoA             0.093548   0.023625   3.960 0.000134 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1838 on 109 degrees of freedom
## Multiple R-squared:  0.6205, Adjusted R-squared:  0.6031 
## F-statistic: 35.65 on 5 and 109 DF,  p-value: < 2.2e-16

# Coefficient table for the Kuperman model, tagged with its legend label and
# sorted by term for the coefficient plot.
pr_kup_plot <- arrange(
  mutate(tidy(pr_kup), model = "kuperman, N = 115, R2 = .6"),
  term
)

# Same outcome and covariates, with picture-naming AoA as the extra predictor.
pr_picturename <- lm(
  formula = wordbank_aoa_years ~ preschoolness + helpfulness +
    childes_adult_log_freq + concreteness + morrison_aoa_threshold_years,
  data = complete_predictors
)
summary(pr_picturename)

## 
## Call:
## lm(formula = wordbank_aoa_years ~ preschoolness + helpfulness + 
##     childes_adult_log_freq + concreteness + morrison_aoa_threshold_years, 
##     data = complete_predictors)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45553 -0.11640 -0.00363  0.11843  0.46944 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   4.59665    0.92577   4.965 2.55e-06 ***
## preschoolness                -0.07361    0.02762  -2.665  0.00887 ** 
## helpfulness                  -0.01219    0.03201  -0.381  0.70412    
## childes_adult_log_freq       -0.15940    0.02150  -7.415 2.79e-11 ***
## concreteness                 -0.29029    0.18410  -1.577  0.11775    
## morrison_aoa_threshold_years  0.01838    0.01411   1.303  0.19540    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.195 on 109 degrees of freedom
## Multiple R-squared:  0.5726, Adjusted R-squared:  0.553 
## F-statistic:  29.2 on 5 and 109 DF,  p-value: < 2.2e-16

# Coefficient table for the picture-naming model, labelled for the plot legend.
pr_picturename_plot <- arrange(
  mutate(tidy(pr_picturename), model = "picture-naming, N = 115, R2 = .55"),
  term
)

# Same outcome and covariates, with naive-adult AoA as the extra predictor.
pr_naive <- lm(
  formula = wordbank_aoa_years ~ preschoolness + helpfulness +
    childes_adult_log_freq + concreteness + sona_aoa_years,
  data = complete_predictors
)
summary(pr_naive)

## 
## Call:
## lm(formula = wordbank_aoa_years ~ preschoolness + helpfulness + 
##     childes_adult_log_freq + concreteness + sona_aoa_years, data = complete_predictors)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.41282 -0.09905  0.00566  0.11030  0.42198 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             4.61422    0.82091   5.621 1.48e-07 ***
## preschoolness          -0.05608    0.02531  -2.216   0.0287 *  
## helpfulness             0.01555    0.02884   0.539   0.5909    
## childes_adult_log_freq -0.12694    0.01980  -6.412 3.79e-09 ***
## concreteness           -0.39581    0.16728  -2.366   0.0197 *  
## sona_aoa_years          0.08135    0.01860   4.373 2.81e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1813 on 109 degrees of freedom
## Multiple R-squared:  0.6307, Adjusted R-squared:  0.6138 
## F-statistic: 37.24 on 5 and 109 DF,  p-value: < 2.2e-16

# Coefficient table for the naive-adult model, labelled for the plot legend.
pr_naive_plot <- arrange(
  mutate(tidy(pr_naive), model = "naive_adults, N = 115, R2 = .61"),
  term
)

# Side-by-side residual sums of squares for the three Wordbank models. They
# differ only in which AoA measure is added, so they have equal residual df
# and the table reports no F test.
pr_kup %>%
  anova(pr_picturename, pr_naive) %>%
  print()

## Analysis of Variance Table
## 
## Model 1: wordbank_aoa_years ~ preschoolness + helpfulness + childes_adult_log_freq + 
##     concreteness + KupermanAoA
## Model 2: wordbank_aoa_years ~ preschoolness + helpfulness + childes_adult_log_freq + 
##     concreteness + morrison_aoa_threshold_years
## Model 3: wordbank_aoa_years ~ preschoolness + helpfulness + childes_adult_log_freq + 
##     concreteness + sona_aoa_years
##   Res.Df    RSS Df Sum of Sq F Pr(>F)
## 1    109 3.6806                      
## 2    109 4.1455  0  -0.46489         
## 3    109 3.5816  0   0.56391

# pr_childes <- lm(wordbank_threshold_age ~ preschoolness + helpfulness + childes_adult_log_freq + concreteness + 
#                 childes_est_aoa_years, complete_predictors)
# summary(pr_childes)
# pr_childes_plot <- tidy(pr_childes) %>% 
#   mutate(model = "childes, N = 23, R2 = .34") %>% 
#   arrange(term)

2.1.1 Plot Wordbank AoA

# Stack the three coefficient tables; the `model` column drives the legend.
pr_plots <- bind_rows(pr_kup_plot, pr_picturename_plot, pr_naive_plot)

# Dot-and-whisker plot of the coefficients with a dashed reference line at 0.
dwplot(
  pr_plots,
  vline = geom_vline(xintercept = 0, colour = "grey60", linetype = 2)
) %>%
  relabel_predictors(childes_adult_log_freq = "frequency") +
  theme_bw() +
  labs(x = "Coefficient Estimate", y = "", title = "Predicting Wordbank AoA") +
  geom_vline(xintercept = 0, colour = "grey60", linetype = 2) +
  theme(
    legend.title = element_blank(),
    legend.background = element_rect(colour = "grey80"),
    legend.justification = c(0, 0),
    legend.position = c(0.007, 0.01),
    plot.title = element_text(face = "bold")
  )

2.2 Kuperman AoA

Wordbank, naive-adult, and picture-naming AoAs are all significant predictors of Kuperman AoA (though the estimate for picture-naming is much smaller).

# Kuperman AoA regressed on the four word-level predictors plus Wordbank AoA.
kup_pr <- lm(
  formula = KupermanAoA ~ preschoolness + helpfulness +
    childes_adult_log_freq + concreteness + wordbank_aoa_years,
  data = complete_predictors
)
summary(kup_pr)

## 
## Call:
## lm(formula = KupermanAoA ~ preschoolness + helpfulness + childes_adult_log_freq + 
##     concreteness + wordbank_aoa_years, data = complete_predictors)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.58950 -0.41479 -0.00181  0.40851  2.64383 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.59585    3.56436   0.167 0.867548    
## preschoolness          -0.30093    0.09941  -3.027 0.003080 ** 
## helpfulness            -0.10741    0.10991  -0.977 0.330600    
## childes_adult_log_freq -0.09414    0.08672  -1.086 0.280048    
## concreteness            0.56282    0.65172   0.864 0.389707    
## wordbank_aoa_years      1.34429    0.33949   3.960 0.000134 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6966 on 109 degrees of freedom
## Multiple R-squared:  0.4366, Adjusted R-squared:  0.4107 
## F-statistic: 16.89 on 5 and 109 DF,  p-value: 2.462e-12

# Coefficient table for the Wordbank model, labelled for the plot legend.
kup_pr_plot <- arrange(
  mutate(tidy(kup_pr), model = "Wordbank, N = 115, R2 = .41"),
  term
)

# Same outcome and covariates, with picture-naming AoA as the extra predictor.
kup_picturename <- lm(
  formula = KupermanAoA ~ preschoolness + helpfulness +
    childes_adult_log_freq + concreteness + morrison_aoa_threshold_years,
  data = complete_predictors
)
summary(kup_picturename)

## 
## Call:
## lm(formula = KupermanAoA ~ preschoolness + helpfulness + childes_adult_log_freq + 
##     concreteness + morrison_aoa_threshold_years, data = complete_predictors)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.35676 -0.45309 -0.05726  0.42251  3.04352 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   4.84537    3.45784   1.401  0.16397    
## preschoolness                -0.43518    0.10316  -4.218  5.1e-05 ***
## helpfulness                  -0.18225    0.11957  -1.524  0.13036    
## childes_adult_log_freq       -0.22720    0.08029  -2.830  0.00555 ** 
## concreteness                  0.44308    0.68765   0.644  0.52071    
## morrison_aoa_threshold_years  0.11809    0.05269   2.241  0.02705 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7284 on 109 degrees of freedom
## Multiple R-squared:  0.3839, Adjusted R-squared:  0.3557 
## F-statistic: 13.59 on 5 and 109 DF,  p-value: 2.667e-10

# Coefficient table for the picture-naming model, labelled for the plot legend.
# The label reports adjusted R-squared like the other model labels: the fit
# above gives 0.3557 (multiple R-squared 0.3839), so the previous ".44" did
# not correspond to this model.
kup_picturename_plot <- tidy(kup_picturename) %>% 
  mutate(model = "picture-naming, N = 115, R2 = .36") %>% 
  arrange(term)

# Kuperman AoA regressed on the four word-level predictors plus naive-adult AoA.
kup_naive <- lm(
  formula = KupermanAoA ~ preschoolness + helpfulness +
    childes_adult_log_freq + concreteness + sona_aoa_years,
  data = complete_predictors
)
summary(kup_naive)

## 
## Call:
## lm(formula = KupermanAoA ~ preschoolness + helpfulness + childes_adult_log_freq + 
##     concreteness + sona_aoa_years, data = complete_predictors)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.81816 -0.39035 -0.07628  0.42448  2.87536 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             6.28208    3.23659   1.941 0.054845 .  
## preschoolness          -0.36124    0.09977  -3.621 0.000447 ***
## helpfulness            -0.06336    0.11371  -0.557 0.578546    
## childes_adult_log_freq -0.19567    0.07805  -2.507 0.013654 *  
## concreteness           -0.04383    0.65955  -0.066 0.947132    
## sona_aoa_years          0.22538    0.07334   3.073 0.002675 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7147 on 109 degrees of freedom
## Multiple R-squared:  0.4069, Adjusted R-squared:  0.3797 
## F-statistic: 14.96 on 5 and 109 DF,  p-value: 3.644e-11

# Coefficient table for the naive-adult model, labelled for the plot legend.
kup_naive_plot <- arrange(
  mutate(tidy(kup_naive), model = "naive_adults, N = 115, R2 = .38"),
  term
)

# Side-by-side residual sums of squares for the three Kuperman models. They
# have equal residual df, so the table reports no F test.
kup_pr %>%
  anova(kup_picturename, kup_naive) %>%
  print()

## Analysis of Variance Table
## 
## Model 1: KupermanAoA ~ preschoolness + helpfulness + childes_adult_log_freq + 
##     concreteness + wordbank_aoa_years
## Model 2: KupermanAoA ~ preschoolness + helpfulness + childes_adult_log_freq + 
##     concreteness + morrison_aoa_threshold_years
## Model 3: KupermanAoA ~ preschoolness + helpfulness + childes_adult_log_freq + 
##     concreteness + sona_aoa_years
##   Res.Df    RSS Df Sum of Sq F Pr(>F)
## 1    109 52.890                      
## 2    109 57.833  0   -4.9432         
## 3    109 55.674  0    2.1593

# kup_childes <- lm(KupermanAoA ~ preschoolness + helpfulness + childes_adult_log_freq + concreteness + 
#                 childes_est_aoa_years, complete_predictors)
# summary(kup_childes)
# kup_childes_plot <- tidy(kup_childes) %>% 
#   mutate(model = "childes, N = 23, R2 = .35") %>% 
#   arrange(term)

2.2.1 Plot Kuperman AoA

# Stack the three coefficient tables; the `model` column drives the legend.
kuperman_plot <- bind_rows(kup_pr_plot, kup_picturename_plot, kup_naive_plot)

# Dot-and-whisker plot of the coefficients with a dashed reference line at 0.
dwplot(
  kuperman_plot,
  vline = geom_vline(xintercept = 0, colour = "grey60", linetype = 2)
) %>%
  relabel_predictors(childes_adult_log_freq = "frequency") +
  theme_bw() +
  labs(x = "Coefficient Estimate", y = "", title = "Predicting Kuperman AoA") +
  geom_vline(xintercept = 0, colour = "grey60", linetype = 2) +
  theme(
    legend.title = element_blank(),
    legend.background = element_rect(colour = "grey80"),
    legend.justification = c(0, 0),
    legend.position = c(0.525, 0.01),
    plot.title = element_text(face = "bold")
  )

2.3 Picture-naming AoA

Kuperman predicts picture-naming; Wordbank & naive adult ratings do not.

# Picture-naming AoA regressed on the four word-level predictors plus
# Wordbank AoA.
pn_pr <- lm(
  formula = morrison_aoa_threshold_years ~ preschoolness + helpfulness +
    childes_adult_log_freq + concreteness + wordbank_aoa_years,
  data = complete_predictors
)
summary(pn_pr)

## 
## Call:
## lm(formula = morrison_aoa_threshold_years ~ preschoolness + helpfulness + 
##     childes_adult_log_freq + concreteness + wordbank_aoa_years, 
##     data = complete_predictors)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4996 -0.6993 -0.1218  0.6529  3.8487 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             16.5126     6.7228   2.456  0.01562 *  
## preschoolness            0.4336     0.1875   2.312  0.02263 *  
## helpfulness              0.6265     0.2073   3.022  0.00313 ** 
## childes_adult_log_freq  -0.7235     0.1636  -4.423 2.31e-05 ***
## concreteness            -2.6099     1.2292  -2.123  0.03600 *  
## wordbank_aoa_years       0.8342     0.6403   1.303  0.19540    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.314 on 109 degrees of freedom
## Multiple R-squared:  0.3694, Adjusted R-squared:  0.3405 
## F-statistic: 12.77 on 5 and 109 DF,  p-value: 8.982e-10

# Coefficient table for the Wordbank model, labelled for the plot legend.
# N is 115, not 123: the fit above has 109 residual df plus 6 estimated
# coefficients, the same sample as the other models on complete_predictors.
pn_pr_plot <- tidy(pn_pr) %>% 
  mutate(model = "Wordbank, N = 115, R2 = .34") %>% 
  arrange(term)

# Picture-naming AoA regressed on the four word-level predictors plus
# Kuperman AoA.
pn_kup <- lm(
  formula = morrison_aoa_threshold_years ~ preschoolness + helpfulness +
    childes_adult_log_freq + concreteness + KupermanAoA,
  data = complete_predictors
)
summary(pn_kup)

## 
## Call:
## lm(formula = morrison_aoa_threshold_years ~ preschoolness + helpfulness + 
##     childes_adult_log_freq + concreteness + KupermanAoA, data = complete_predictors)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4615 -0.8758 -0.1451  0.5611  3.7114 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             17.9464     5.9574   3.012  0.00322 ** 
## preschoolness            0.5236     0.1913   2.738  0.00723 ** 
## helpfulness              0.6663     0.2051   3.249  0.00154 ** 
## childes_adult_log_freq  -0.7468     0.1294  -5.771 7.49e-08 ***
## concreteness            -2.9342     1.1917  -2.462  0.01538 *  
## KupermanAoA              0.3730     0.1664   2.241  0.02705 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.295 on 109 degrees of freedom
## Multiple R-squared:  0.3878, Adjusted R-squared:  0.3597 
## F-statistic: 13.81 on 5 and 109 DF,  p-value: 1.919e-10

# Coefficient table for the Kuperman model, labelled for the plot legend.
# N is 115, not 123: the fit above has 109 residual df plus 6 estimated
# coefficients.
pn_kup_plot <- tidy(pn_kup) %>% 
  mutate(model = "kuperman, N = 115, R2 = .36") %>% 
  arrange(term)

# Picture-naming AoA regressed on the four word-level predictors plus
# naive-adult AoA.
pn_naive <- lm(
  formula = morrison_aoa_threshold_years ~ preschoolness + helpfulness +
    childes_adult_log_freq + concreteness + sona_aoa_years,
  data = complete_predictors
)
summary(pn_naive)

## 
## Call:
## lm(formula = morrison_aoa_threshold_years ~ preschoolness + helpfulness + 
##     childes_adult_log_freq + concreteness + sona_aoa_years, data = complete_predictors)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8199 -0.6746 -0.1121  0.6761  3.8652 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            20.60917    5.99591   3.437 0.000833 ***
## preschoolness           0.37955    0.18483   2.054 0.042412 *  
## helpfulness             0.62836    0.21065   2.983 0.003524 ** 
## childes_adult_log_freq -0.86250    0.14459  -5.965 3.08e-08 ***
## concreteness           -2.90437    1.22183  -2.377 0.019195 *  
## sona_aoa_years          0.01228    0.13586   0.090 0.928139    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.324 on 109 degrees of freedom
## Multiple R-squared:  0.3596, Adjusted R-squared:  0.3303 
## F-statistic: 12.24 on 5 and 109 DF,  p-value: 1.998e-09

# Coefficient table for the naive-adult model, labelled for the plot legend.
# N is 115, not 123: the fit above has 109 residual df plus 6 estimated
# coefficients.
pn_naive_plot <- tidy(pn_naive) %>% 
  mutate(model = "naive_adults, N = 115, R2 = .33") %>% 
  arrange(term)

# Residual sums of squares for the Wordbank and Kuperman models (equal
# residual df, so no F test is reported).
# NOTE(review): pn_naive is not part of this comparison, unlike the
# three-model comparisons run for the Wordbank and Kuperman outcomes --
# confirm whether that is intentional.
print(anova(pn_pr, pn_kup))

## Analysis of Variance Table
## 
## Model 1: morrison_aoa_threshold_years ~ preschoolness + helpfulness + 
##     childes_adult_log_freq + concreteness + wordbank_aoa_years
## Model 2: morrison_aoa_threshold_years ~ preschoolness + helpfulness + 
##     childes_adult_log_freq + concreteness + KupermanAoA
##   Res.Df    RSS Df Sum of Sq F Pr(>F)
## 1    109 188.15                      
## 2    109 182.66  0    5.4872

2.3.1 Plot picture-naming AoA

# Stack the three coefficient tables; the `model` column drives the legend.
pn_plot <- bind_rows(pn_pr_plot, pn_kup_plot, pn_naive_plot)

# Dot-and-whisker plot of the coefficients with a dashed reference line at 0.
dwplot(
  pn_plot,
  vline = geom_vline(xintercept = 0, colour = "grey60", linetype = 2)
) %>%
  relabel_predictors(childes_adult_log_freq = "frequency") +
  theme_bw() +
  labs(x = "Coefficient Estimate", y = "", title = "Predicting picture-naming AoA") +
  geom_vline(xintercept = 0, colour = "grey60", linetype = 2) +
  theme(
    legend.title = element_blank(),
    legend.background = element_rect(colour = "grey80"),
    legend.justification = c(0, 0),
    legend.position = c(0.007, 0.01),
    plot.title = element_text(face = "bold")
  )

3 Get a sense of what words are most different for each measure

3.1 Wordbank/Kuperman

# Labelled scatterplot of Kuperman AoA against Wordbank AoA; words far from
# the identity line are the ones on which the two measures disagree most.
ggplot(
  data = complete_predictors,
  mapping = aes(x = wordbank_aoa_years, y = KupermanAoA, label = word)
) +
  geom_point() +
  geom_label() +
  geom_abline(intercept = 0, slope = 1) +
  theme_classic() +
  lims(x = c(0, 10), y = c(0, 10))

3.2 Picture-naming/Kuperman

# Labelled scatterplot of Kuperman AoA against picture-naming AoA, with the
# identity line for reference.
ggplot(
  data = complete_predictors,
  mapping = aes(x = morrison_aoa_threshold_years, y = KupermanAoA, label = word)
) +
  geom_point() +
  geom_label() +
  geom_abline(intercept = 0, slope = 1) +
  theme_classic() +
  lims(x = c(0, 10), y = c(0, 10))

3.3 Picture-naming/Wordbank

# Labelled scatterplot of Wordbank AoA against picture-naming AoA, with the
# identity line for reference.
ggplot(
  data = complete_predictors,
  mapping = aes(x = morrison_aoa_threshold_years, y = wordbank_aoa_years, label = word)
) +
  geom_point() +
  geom_label() +
  geom_abline(intercept = 0, slope = 1) +
  theme_classic() +
  lims(x = c(0, 10), y = c(0, 10))

3.4 Naive/picture-naming

# Labelled scatterplot of picture-naming AoA against naive-adult AoA, with the
# identity line for reference.
ggplot(
  data = complete_predictors,
  mapping = aes(x = sona_aoa_years, y = morrison_aoa_threshold_years, label = word)
) +
  geom_point() +
  geom_label() +
  geom_abline(intercept = 0, slope = 1) +
  theme_classic() +
  lims(x = c(0, 10), y = c(0, 10))

3.5 Wordbank/Naive

# Labelled scatterplot of naive-adult AoA against Wordbank AoA, with the
# identity line for reference.
ggplot(
  data = complete_predictors,
  mapping = aes(x = wordbank_aoa_years, y = sona_aoa_years, label = word)
) +
  geom_point() +
  geom_label() +
  geom_abline(intercept = 0, slope = 1) +
  theme_classic() +
  lims(x = c(0, 10), y = c(0, 10))

3.6 Naive/Kuperman

# Labelled scatterplot of Kuperman AoA against naive-adult AoA, with the
# identity line for reference.
ggplot(
  data = complete_predictors,
  mapping = aes(x = sona_aoa_years, y = KupermanAoA, label = word)
) +
  geom_point() +
  geom_label() +
  geom_abline(intercept = 0, slope = 1) +
  theme_classic() +
  lims(x = c(0, 10), y = c(0, 10))

4 Get a sense of difference distributions

4.1 Wordbank/Kuperman

# Distribution of wordbank_aoa_years - KupermanAoA across words.
hist(complete_predictors$wordbank_minus_kuperman)

4.2 Picture-naming/Kuperman

# Distribution of KupermanAoA - morrison_aoa_threshold_years across words.
hist(complete_predictors$kuperman_minus_pn)

4.3 Picture-naming/Wordbank

# Distribution of wordbank_aoa_years - morrison_aoa_threshold_years across words.
hist(complete_predictors$wordbank_minus_pn)

4.4 Wordbank/Naive

# Distribution of wordbank_aoa_years - sona_aoa_years across words.
hist(complete_predictors$wordbank_minus_sona)

4.5 Naive/picture-naming

# Distribution of sona_aoa_years - morrison_aoa_threshold_years across words.
hist(complete_predictors$sona_minus_pn)

4.6 Naive/Kuperman

# Distribution of sona_aoa_years - KupermanAoA across words.
hist(complete_predictors$sona_minus_kuperman)

5 Predict Wordbank-Kuperman

# Formula with the Wordbank-minus-Kuperman difference as outcome and the
# standard predictor set on the right-hand side.
pr_k_full_fm <- write_model_formula("wordbank_minus_kuperman")

# Fit the difference-score model and print its summary.
pr_k_full <- lm(formula = pr_k_full_fm, data = complete_predictors)
summary(pr_k_full)

## 
## Call:
## lm(formula = pr_k_full_fm, data = complete_predictors)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.71841 -0.41582  0.02148  0.42687  1.56469 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)            -2.30918    3.13894  -0.736  0.46351   
## preschoolness           0.32388    0.09681   3.346  0.00112 **
## helpfulness             0.10765    0.10992   0.979  0.32958   
## childes_adult_log_freq  0.15453    0.06305   2.451  0.01583 * 
## concreteness           -0.44455    0.64128  -0.693  0.48963   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6967 on 110 degrees of freedom
## Multiple R-squared:  0.2016, Adjusted R-squared:  0.1725 
## F-statistic: 6.943 on 4 and 110 DF,  p-value: 5.071e-05

# Coefficient table for the Wordbank - Kuperman model, labelled for the plot.
pr_k_full_plot <- arrange(
  mutate(tidy(pr_k_full), model = "Full model, R2 = .17"),
  term
)

5.1 Plot Wordbank-Kuperman models

#pr_k_allmodels_plot <- bind_rows(pr_k_full_plot, pr_k_nohyper_plot, pr_k_reduced_plot) %>% filter(term != "(Intercept)")

# Dot-and-whisker plot of the Wordbank - Kuperman full-model coefficients.
# The intercept row is dropped explicitly, as the other coefficient plots in
# this file do, so the figure does not depend on dwplot()'s own intercept
# handling and stays on the same footing as its siblings.
dwplot(filter(pr_k_full_plot, term != "(Intercept)"),
       vline = geom_vline(xintercept = 0, colour = "grey60", linetype = 2)) %>%
    relabel_predictors(childes_adult_log_freq = "frequency") +
     theme_bw() + xlab("Coefficient Estimate") + ylab("") +
     # second zero line is drawn on top of the points (kept from the original)
     geom_vline(xintercept = 0, colour = "grey60", linetype = 2) +
     ggtitle("Predicting Wordbank - Kuperman") +
     theme(plot.title = element_text(face="bold"),
           legend.position = c(0.007, 0.01),
           legend.justification = c(0, 0), 
           legend.background = element_rect(colour="grey80"),
           legend.title = element_blank())

6 Predict Kuperman - Picture-naming

# Full model for the kuperman_minus_pn difference score: the formula comes
# from write_model_formula() and is fitted by ordinary least squares on
# complete_predictors; the summary is printed for inspection.
pn_k_full_fm <- write_model_formula("kuperman_minus_pn")
pn_k_full <- lm(formula = pn_k_full_fm, data = complete_predictors)
summary(pn_k_full)

## 
## Call:
## lm(formula = pn_k_full_fm, data = complete_predictors)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7753 -0.8172  0.1200  0.8702  3.8537 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -13.3782     6.1725  -2.167  0.03236 *  
## preschoolness           -0.7685     0.1904  -4.037  0.00010 ***
## helpfulness             -0.7342     0.2162  -3.397  0.00095 ***
## childes_adult_log_freq   0.5399     0.1240   4.354 3.01e-05 ***
## concreteness             2.9975     1.2610   2.377  0.01918 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.37 on 110 degrees of freedom
## Multiple R-squared:  0.2585, Adjusted R-squared:  0.2315 
## F-statistic: 9.586 on 4 and 110 DF,  p-value: 1.094e-06

# Coefficient table for plotting: tidy the fit, tag every row with a model
# label (the .23 is the adjusted R-squared from the summary), sort by term.
pn_k_full_plot <- arrange(
  mutate(tidy(pn_k_full), model = "Full model, R2 = .23"),
  term
)

# pn_k_backwards <- lm(kuperman_minus_pn ~ preschoolness+helpfulness+childes_adult_log_freq, complete_predictors)
# summary(pn_k_backwards)
# pn_k_backwards_plot <- tidy(pn_k_backwards) %>% 
#   mutate(model = "Backward, R2 = .20") %>% 
#   arrange(term)

# Same outcome as the full Kuperman - picture-naming model, but on the
# centered predictors and with a preschoolness x frequency interaction added.
k_pn_interact <- lm(
  formula = kuperman_minus_pn ~
    preschoolness_centered * frequency_centered +
    helpfulness_centered +
    concreteness_centered,
  data = complete_predictors
)
summary(k_pn_interact)

## 
## Call:
## lm(formula = kuperman_minus_pn ~ preschoolness_centered * frequency_centered + 
##     helpfulness_centered + concreteness_centered, data = complete_predictors)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9775 -0.8005  0.0971  0.8564  3.8618 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                 0.9971     0.1353   7.367 3.55e-11
## preschoolness_centered                     -0.7584     0.1933  -3.924 0.000153
## frequency_centered                          0.5425     0.1247   4.350 3.07e-05
## helpfulness_centered                       -0.7202     0.2206  -3.265 0.001465
## concreteness_centered                       2.9392     1.2767   2.302 0.023227
## preschoolness_centered:frequency_centered  -0.0582     0.1643  -0.354 0.723850
##                                              
## (Intercept)                               ***
## preschoolness_centered                    ***
## frequency_centered                        ***
## helpfulness_centered                      ** 
## concreteness_centered                     *  
## preschoolness_centered:frequency_centered    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.375 on 109 degrees of freedom
## Multiple R-squared:  0.2593, Adjusted R-squared:  0.2254 
## F-statistic: 7.633 on 5 and 109 DF,  p-value: 3.5e-06

# Tidy coefficient table for the preschoolness x frequency interaction model.
# It is labelled as the interaction model (adjusted R2 = .23) rather than
# reusing the "Full model, R2 = .23" label carried by pn_k_full_plot: with
# identical `model` values the two tables could not be told apart, and would
# be treated as one model if they were ever combined for a coefficient plot.
k_pn_interact_plot <- tidy(k_pn_interact) %>% 
  mutate(model = "Interaction model, R2 = .23") %>% 
  arrange(term)

6.1 Plot Kuperman - Picture-naming models

# pn_k_allmodels_plot <- bind_rows(pn_k_full_plot, pn_k_backwards_plot) %>% 
#   filter(term != "(Intercept)")

# Dot-and-whisker plot of the Kuperman - picture-naming full-model
# coefficients, intercept excluded. The pos_scale_* entries in the relabel
# map match no term in the fitted model above; they are kept unchanged.
pn_k_full_plot %>%
  filter(term != "(Intercept)") %>%
  dwplot(vline = geom_vline(xintercept = 0, colour = "grey60", linetype = 2)) %>%
  relabel_predictors(c(
    pos_scale_n_defs = "N defs",
    pos_scale_hypernyms = "hypernyms",
    pos_scale_hyponyms = "hyponyms",
    pos_scale_n_synsets = "N synsets",
    childes_adult_log_freq = "frequency"
  )) +
  theme_bw() +
  xlab("Coefficient Estimate") +
  ylab("") +
  geom_vline(xintercept = 0, colour = "grey60", linetype = 2) +
  ggtitle("Predicting Kuperman - Picture-naming") +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = c(0.007, 0.01),
    legend.justification = c(0, 0),
    legend.background = element_rect(colour = "grey80"),
    legend.title = element_blank()
  )

7 Predict Wordbank - Picture-naming

# Full model for the wordbank_minus_pn difference score: the formula comes
# from write_model_formula() and is fitted by ordinary least squares on
# complete_predictors; the summary is printed for inspection.
pn_pr_full_fm <- write_model_formula("wordbank_minus_pn")
pn_pr_full <- lm(formula = pn_pr_full_fm, data = complete_predictors)
summary(pn_pr_full)

## 
## Call:
## lm(formula = pn_pr_full_fm, data = complete_predictors)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8428 -0.6910  0.1156  0.7098  2.4371 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -15.6874     5.8945  -2.661  0.00895 ** 
## preschoolness           -0.4446     0.1818  -2.446  0.01604 *  
## helpfulness             -0.6266     0.2064  -3.035  0.00300 ** 
## childes_adult_log_freq   0.6944     0.1184   5.865  4.8e-08 ***
## concreteness             2.5530     1.2042   2.120  0.03626 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.308 on 110 degrees of freedom
## Multiple R-squared:  0.2805, Adjusted R-squared:  0.2544 
## F-statistic: 10.72 on 4 and 110 DF,  p-value: 2.243e-07

# Coefficient table for plotting: tidy the fit, tag every row with a model
# label (the .25 is the adjusted R-squared from the summary), sort by term.
pn_pr_full_plot <- arrange(
  mutate(tidy(pn_pr_full), model = "Full model, R2 = .25"),
  term
)

# pn_pr_backwards <- lm(pn_minus_parentreport ~ helpfulness+childes_adult_log_freq+concreteness, complete_predictors)
# summary(pn_pr_backwards)
# pn_pr_backwards_plot <- tidy(pn_pr_backwards) %>% 
#   mutate(model = "Backward, R2 = .34") %>% 
#   arrange(term)

7.1 Plot Wordbank - Picture-naming models

# pn_pr_allmodels_plot <- bind_rows(pn_pr_full_plot, pn_pr_backwards_plot) %>%
#   filter(term != "(Intercept)")

# Dot-and-whisker plot of the Wordbank - picture-naming full-model
# coefficients, intercept excluded. The pos_scale_* entries in the relabel
# map match no term in the fitted model above; they are kept unchanged.
pn_pr_full_plot %>%
  filter(term != "(Intercept)") %>%
  dwplot(vline = geom_vline(xintercept = 0, colour = "grey60", linetype = 2)) %>%
  relabel_predictors(c(
    pos_scale_n_defs = "N defs",
    pos_scale_hypernyms = "hypernyms",
    pos_scale_hyponyms = "hyponyms",
    pos_scale_n_synsets = "N synsets",
    childes_adult_log_freq = "frequency"
  )) +
  theme_bw() +
  xlab("Coefficient Estimate") +
  ylab("") +
  geom_vline(xintercept = 0, colour = "grey60", linetype = 2) +
  ggtitle("Predicting Wordbank - Picture-naming") +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = c(0.007, 0.01),
    legend.justification = c(0, 0),
    legend.background = element_rect(colour = "grey80"),
    legend.title = element_blank()
  )

8 Predict Wordbank - SONA

# Full model for the wordbank_minus_sona difference score: the formula comes
# from write_model_formula() and is fitted by ordinary least squares on
# complete_predictors; the summary is printed for inspection.
pr_sona_full_fm <- write_model_formula("wordbank_minus_sona")
pr_sona_full <- lm(formula = pr_sona_full_fm, data = complete_predictors)
summary(pr_sona_full)

## 
## Call:
## lm(formula = pr_sona_full_fm, data = complete_predictors)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9935 -0.6768  0.1137  0.5900  1.9150 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.52377    3.93096   0.133    0.894    
## preschoolness           0.06335    0.12124   0.523    0.602    
## helpfulness             0.19887    0.13766   1.445    0.151    
## childes_adult_log_freq  0.42026    0.07896   5.322 5.47e-07 ***
## concreteness           -0.98628    0.80309  -1.228    0.222    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8725 on 110 degrees of freedom
## Multiple R-squared:  0.2688, Adjusted R-squared:  0.2423 
## F-statistic: 10.11 on 4 and 110 DF,  p-value: 5.233e-07

# Coefficient table for plotting: tidy the fit, tag every row with a model
# label (the .24 is the adjusted R-squared from the summary), sort by term.
pr_sona_full_plot <- arrange(
  mutate(tidy(pr_sona_full), model = "Full model, R2 = .24"),
  term
)

# pn_pr_backwards <- lm(pn_minus_parentreport ~ helpfulness+childes_adult_log_freq+concreteness, complete_predictors)
# summary(pn_pr_backwards)
# pn_pr_backwards_plot <- tidy(pn_pr_backwards) %>% 
#   mutate(model = "Backward, R2 = .34") %>% 
#   arrange(term)

8.1 Plot Wordbank - SONA models

# Dot-and-whisker plot of the Wordbank - SONA (naive adult) full-model
# coefficients, intercept excluded. The pos_scale_* entries in the relabel
# map match no term in the fitted model above; they are kept unchanged.
pr_sona_full_plot %>%
  filter(term != "(Intercept)") %>%
  dwplot(vline = geom_vline(xintercept = 0, colour = "grey60", linetype = 2)) %>%
  relabel_predictors(c(
    pos_scale_n_defs = "N defs",
    pos_scale_hypernyms = "hypernyms",
    pos_scale_hyponyms = "hyponyms",
    pos_scale_n_synsets = "N synsets",
    childes_adult_log_freq = "frequency"
  )) +
  theme_bw() +
  xlab("Coefficient Estimate") +
  ylab("") +
  geom_vline(xintercept = 0, colour = "grey60", linetype = 2) +
  ggtitle("Predicting Wordbank - Naive adult") +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = c(0.007, 0.01),
    legend.justification = c(0, 0),
    legend.background = element_rect(colour = "grey80"),
    legend.title = element_blank()
  )

8.2 Plot frequency vs. difference

# ggrepel supplies geom_text_repel(), used below for non-overlapping labels.
library(ggrepel)

# childes_adult_log_freq (y) against the wordbank_minus_sona difference (x),
# with repelled word labels and a linear fit of y on x.
ggplot(
  complete_predictors,
  aes(x = wordbank_minus_sona, y = childes_adult_log_freq, label = word)
) +
  geom_point() +
  geom_text_repel(size = 3, segment.alpha = .3) +
  geom_smooth(method = "lm") +
  theme_classic()

9 Compare models for subjective - ground truth measures (K - pn and Wb - pn)

# Coefficient tables for the two "subjective minus picture-naming" models,
# each tagged with its own model label (adjusted R-squared from the
# corresponding summary) and sorted by term.
k_pn_comparison_plot <- arrange(
  mutate(tidy(pn_k_full), model = "Kuperman-PN, R2 = .23"),
  term
)

w_pn_comparison_plot <- arrange(
  mutate(tidy(pn_pr_full), model = "Wordbank-PN, R2 = .25"),
  term
)

# Stack both tables and leave the intercept rows out of the figure.
comparison_plots <- filter(
  bind_rows(k_pn_comparison_plot, w_pn_comparison_plot),
  term != "(Intercept)"
)

# Side-by-side dot-and-whisker plot of the two models; the legend (one entry
# per `model` label) is placed inside the panel near the bottom right.
comparison_plots %>%
  dwplot(vline = geom_vline(xintercept = 0, colour = "grey60", linetype = 2)) %>%
  relabel_predictors(childes_adult_log_freq = "frequency") +
  theme_bw() +
  xlab("Coefficient Estimate") +
  ylab("") +
  geom_vline(xintercept = 0, colour = "grey60", linetype = 2) +
  ggtitle("Predicting subj - ground truth measures (N = 115)") +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = c(0.69, 0.01),
    legend.justification = c(0, 0),
    legend.background = element_rect(colour = "grey80"),
    legend.title = element_blank()
  )

Why are adults wrong about what words kids know? (and estimating AoAs)

Christina

2020-10-13