Variables

calculated_aoa_years: AoA, calculated using Braginsky & Yurovsky Wordbank script
KupermanAoA: adult-estimated AoA, from Kuperman norms
hypernyms: how many words are superordinate to this word? (per wordnet)
hyponyms: how many words are subordinate to this word? (per wordnet)
concreteness: how concrete (i.e. tangible) is the word’s referent, scale of 1 (abstract) to 5 (concrete), from Brysbaert norms
frequency_subtlex: word frequency from SUBTLEX
neighbour_concentration: measure of semantic density (how concentrated is local word neighborhood), from Thompson (unpublished)
arc: measure of semantic density (average radius of co-occurrence), from HiDEx (Shaoul & Westbury)
ncount: measure of semantic density (neighbor count), from HiDEx (Shaoul & Westbury)
helpfulness: how helpful would it be for a preschooler to know this word, scale of 1 (not helpful) to 5 (very helpful), collected on mturk
babiness: how much is this word associated with babies, scale of 1 to 10, from Perry et al. (2015)
n_synsets: how many synsets are there for this word on wordnet?
n_definitions: how many definitions are there for this word (as this part of speech) on wordnet? dog.n.01, dog.n.02, etc.

Correlations among predictors

# Variables entering the predictor correlation plot. hyponyms is
# log-transformed (after adding 1) before correlating.
corr_data <- filtered_calculated_aoas %>%
  mutate(log_hyponyms = log(hyponyms + 1)) %>%
  select(all_ages_says, all_ages_understands, calculated_aoa_years, KupermanAoA,
         hypernyms, log_hyponyms, concreteness,
         frequency_subtlex, childes_adult_log_freq, childes_kid_log_freq,
         n_synsets, n_definitions,
         neighbour_concentration, arc, ncount, helpfulness, babiness)

# Pairwise Pearson correlations between the selected variables.
corr_vars <- cor(corr_data, use = "pairwise.complete.obs", method = "pearson")

# The significance tests have to be run on the raw observations, not on the
# correlation matrix: cor.mtest() applies cor.test() to every pair of columns
# of its input, so passing corr_vars would test correlations between columns
# of correlation coefficients rather than between the variables themselves.
p.mat <- cor.mtest(corr_data)
pMatrix <- p.mat$p

# Lower-triangle heat map with coefficients printed in each cell; cells that
# are not significant at the .05 level are left blank.
corrplot(corr_vars, method = "color", type = "lower", diag = TRUE,
         addCoef.col = "black", tl.col = "black",
         number.font = 2, number.cex = .6,
         p.mat = pMatrix, sig.level = 0.05, insig = "blank")

Models: all data (CDI + our survey)

Generate differences for calculated_aoa_years - Kuperman (all data)

# Signed gap between the calculated AoA and the adult-estimated (Kuperman) AoA
# for every word; positive values mean the calculated AoA is the later one.
all_words_kuperman_diff <- mutate(
  filtered_calculated_aoas,
  diff_aoas = calculated_aoa_years - KupermanAoA
)

#base_mod_scaled_data <- lm(calculated_aoa_years ~ KupermanAoA, data = awaf_scaled)
#scaled_pos_diff <- awaf_scaled %>% 
#  ungroup() %>% 
#  mutate(diff_aoas = resid(base_mod_scaled_data))

Relation between difference and hypernyms, helpfulness?

Hypernyms

# AoA difference against hypernym count, with a linear fit overlaid.
all_words_kuperman_diff %>%
  ggplot(aes(x = hypernyms, y = diff_aoas)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Helpfulness

# AoA difference against rated helpfulness, with a linear fit overlaid.
all_words_kuperman_diff %>%
  ggplot(aes(x = helpfulness, y = diff_aoas)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Model all words/predictors (all data)

Base model

# Base model over all words (nouns, verbs and adjectives only): AoA difference
# predicted from the WordNet counts, SUBTLEX frequency, concreteness and the
# two semantic-density measures.
all_words_no_pos <- filter(
  all_words_kuperman_diff,
  dom_pos %in% c("Noun", "Verb", "Adjective")
) %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc,
    data = .
  )
summary(all_words_no_pos)
## 
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.9633 -0.7186 -0.0772  0.7078 10.3312 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)   
## (Intercept)               0.28858    1.53280   0.188  0.85072   
## scale(hypernyms)         -0.15959    0.05971  -2.673  0.00770 **
## scale(log(hyponyms + 1)) -0.04604    0.05613  -0.820  0.41231   
## frequency_subtlex         0.40153    0.12396   3.239  0.00126 **
## concreteness             -0.26725    0.08258  -3.236  0.00127 **
## neighbour_concentration  -5.83633    3.91925  -1.489  0.13691   
## arc                      -1.61192    0.78267  -2.059  0.03983 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.307 on 679 degrees of freedom
##   (42 observations deleted due to missingness)
## Multiple R-squared:  0.1112, Adjusted R-squared:  0.1034 
## F-statistic: 14.16 on 6 and 679 DF,  p-value: 3.127e-15
# Same base model, but with CHILDES adult log frequency in place of the
# SUBTLEX frequency.
all_words_no_pos_childes <- filter(
  all_words_kuperman_diff,
  dom_pos %in% c("Noun", "Verb", "Adjective")
) %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      childes_adult_log_freq + concreteness + neighbour_concentration + arc,
    data = .
  )
summary(all_words_no_pos_childes)
## 
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + childes_adult_log_freq + concreteness + neighbour_concentration + 
##     arc, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.9274 -0.6867 -0.1044  0.6770 10.6128 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               0.87458    1.49061   0.587  0.55758    
## scale(hypernyms)         -0.17240    0.05855  -2.945  0.00334 ** 
## scale(log(hyponyms + 1)) -0.07404    0.05694  -1.300  0.19389    
## childes_adult_log_freq    0.15559    0.03869   4.021 6.44e-05 ***
## concreteness             -0.35594    0.07867  -4.524 7.15e-06 ***
## neighbour_concentration  -5.71449    3.90352  -1.464  0.14368    
## arc                      -0.67208    0.57405  -1.171  0.24211    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.302 on 679 degrees of freedom
##   (42 observations deleted due to missingness)
## Multiple R-squared:  0.1185, Adjusted R-squared:  0.1107 
## F-statistic: 15.21 on 6 and 679 DF,  p-value: < 2.2e-16

Include PoS

# Base model plus dominant part of speech.
# dom_pos_factor codes 1 = Noun (reference level), 2 = Verb, 3 = Adjective;
# the preceding filter guarantees dom_pos is one of these three.
all_words_with_pos <- all_words_kuperman_diff %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(
    dom_pos_factor = factor(match(dom_pos, c("Noun", "Verb", "Adjective")))
  ) %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor,
    data = .
  )
summary(all_words_with_pos)
## 
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.9850 -0.7163 -0.0659  0.6800 10.3210 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)   
## (Intercept)              -0.20868    1.55623  -0.134  0.89337   
## scale(hypernyms)         -0.09491    0.06766  -1.403  0.16119   
## scale(log(hyponyms + 1)) -0.03696    0.05622  -0.657  0.51110   
## frequency_subtlex         0.37000    0.12641   2.927  0.00354 **
## concreteness             -0.21875    0.08695  -2.516  0.01211 * 
## neighbour_concentration  -5.19785    3.92778  -1.323  0.18616   
## arc                      -1.40396    0.79635  -1.763  0.07835 . 
## dom_pos_factor2           0.33949    0.16773   2.024  0.04337 * 
## dom_pos_factor3           0.10241    0.28214   0.363  0.71674   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.305 on 677 degrees of freedom
##   (42 observations deleted due to missingness)
## Multiple R-squared:  0.1166, Adjusted R-squared:  0.1061 
## F-statistic: 11.16 on 8 and 677 DF,  p-value: 6.754e-15

Include babiness, helpfulness, n_synsets, n_definitions

# PoS model extended with babiness, helpfulness and the two WordNet sense
# counts (n_synsets, n_definitions).
# dom_pos_factor codes 1 = Noun (reference level), 2 = Verb, 3 = Adjective.
all_words_with_pos_helpful <- all_words_kuperman_diff %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(
    dom_pos_factor = factor(match(dom_pos, c("Noun", "Verb", "Adjective")))
  ) %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + babiness + helpfulness + n_synsets + n_definitions,
    data = .
  )
summary(all_words_with_pos_helpful)
## 
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor + babiness + helpfulness + n_synsets + 
##     n_definitions, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3225 -0.5466 -0.0237  0.5439  7.3469 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -3.750875   1.899809  -1.974   0.0491 *  
## scale(hypernyms)          0.034111   0.079547   0.429   0.6683    
## scale(log(hyponyms + 1)) -0.035697   0.062119  -0.575   0.5659    
## frequency_subtlex         0.864104   0.155573   5.554 5.18e-08 ***
## concreteness              0.126892   0.133301   0.952   0.3417    
## neighbour_concentration  -2.887957   4.541251  -0.636   0.5252    
## arc                      -2.146080   1.014993  -2.114   0.0351 *  
## dom_pos_factor2           0.258929   0.237857   1.089   0.2770    
## babiness                  0.125603   0.031038   4.047 6.27e-05 ***
## helpfulness              -0.366708   0.089081  -4.117 4.70e-05 ***
## n_synsets                 0.023027   0.009996   2.304   0.0218 *  
## n_definitions            -0.025683   0.020657  -1.243   0.2145    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.078 on 388 degrees of freedom
##   (328 observations deleted due to missingness)
## Multiple R-squared:  0.2282, Adjusted R-squared:  0.2063 
## F-statistic: 10.43 on 11 and 388 DF,  p-value: < 2.2e-16

This model is not great! So much variance unaccounted for… why?

# Collapse the word source into two categories: CDI vs. everything else.
filtered_calculated_aoas_typed <- mutate(
  filtered_calculated_aoas,
  type_broad = ifelse(type == "CDI", "CDI", "non-CDI")
)

# Calculated AoA against Kuperman AoA, with a separate linear fit per category.
filtered_calculated_aoas_typed %>%
  ggplot(aes(x = KupermanAoA, y = calculated_aoa_years, color = type_broad)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Seems like we should look at CDI and non-CDI words separately, as very different things are going on.

CDI words only

# CDI words only.
cdi_words_all_features <- filter(filtered_calculated_aoas, type == "CDI")

# Regress calculated AoA on Kuperman AoA; the residuals are the part of the
# calculated AoA that the adult estimate does not capture.
# na.exclude pads resid() with NA for rows missing either AoA, so the residual
# vector always has one entry per row of cdi_words_all_features. With the
# default na.omit it would be shorter than the data and mutate() would fail.
# When no AoA is missing, the fit and residuals are the same as before.
base_mod_CDI <- lm(calculated_aoa_years ~ KupermanAoA,
                   data = cdi_words_all_features,
                   na.action = na.exclude)

CDI_kuperman_diff <- cdi_words_all_features %>%
  mutate(diff_aoas = resid(base_mod_CDI))

Relation between residual and hypernyms, helpfulness?

Hypernyms

# CDI words: residual AoA against hypernym count, with a linear fit overlaid.
CDI_kuperman_diff %>%
  ggplot(aes(x = hypernyms, y = diff_aoas)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Helpfulness

# CDI words: residual AoA against rated helpfulness, with a linear fit overlaid.
CDI_kuperman_diff %>%
  ggplot(aes(x = helpfulness, y = diff_aoas)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Models: CDI data

Base model

# Modelling data for the CDI words: nouns, verbs and adjectives only.
# dom_pos_factor codes 1 = Noun (reference level), 2 = Verb, 3 = Adjective;
# the preceding filter guarantees dom_pos is one of these three.
CDI_kuperman_diff_filtered <- CDI_kuperman_diff %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(
    dom_pos_factor = factor(match(dom_pos, c("Noun", "Verb", "Adjective")))
  )

# Base model: residual AoA predicted from the WordNet counts, SUBTLEX
# frequency, concreteness and the two semantic-density measures.
CDI_kuperman_diff_base <- CDI_kuperman_diff_filtered %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc,
    data = .
  )

summary(CDI_kuperman_diff_base)
## 
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.03694 -0.18233  0.00858  0.17994  0.80625 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)  
## (Intercept)               0.0468630  0.4676315   0.100   0.9202  
## scale(hypernyms)         -0.0378966  0.0196006  -1.933   0.0539 .
## scale(log(hyponyms + 1)) -0.0004546  0.0167676  -0.027   0.9784  
## frequency_subtlex         0.0781497  0.0397402   1.967   0.0500 *
## concreteness             -0.0086657  0.0345472  -0.251   0.8021  
## neighbour_concentration  -0.0205250  1.1104706  -0.018   0.9853  
## arc                      -0.6140758  0.2416764  -2.541   0.0114 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.287 on 387 degrees of freedom
##   (29 observations deleted due to missingness)
## Multiple R-squared:  0.04746,    Adjusted R-squared:  0.03269 
## F-statistic: 3.214 on 6 and 387 DF,  p-value: 0.004312
# Base model with CHILDES adult log frequency in place of the SUBTLEX
# frequency.
CDI_kuperman_diff_childes <- CDI_kuperman_diff_filtered %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      childes_adult_log_freq + concreteness + neighbour_concentration + arc,
    data = .
  )

summary(CDI_kuperman_diff_childes)
## 
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + childes_adult_log_freq + concreteness + neighbour_concentration + 
##     arc, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.02428 -0.18568  0.00169  0.17340  0.78069 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)  
## (Intercept)               0.257130   0.460926   0.558   0.5773  
## scale(hypernyms)         -0.045622   0.019270  -2.368   0.0184 *
## scale(log(hyponyms + 1))  0.002409   0.017033   0.141   0.8876  
## childes_adult_log_freq    0.006734   0.014182   0.475   0.6352  
## concreteness             -0.026375   0.033584  -0.785   0.4327  
## neighbour_concentration  -0.038208   1.116624  -0.034   0.9727  
## arc                      -0.298512   0.181972  -1.640   0.1017  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2883 on 387 degrees of freedom
##   (29 observations deleted due to missingness)
## Multiple R-squared:  0.0385, Adjusted R-squared:  0.02359 
## F-statistic: 2.583 on 6 and 387 DF,  p-value: 0.01824
# NOTE(review): these two models are not nested -- each swaps in a different
# frequency predictor and both have the same residual df -- so the table below
# reports Df = 0 and no F statistic or p-value. Only the RSS values can be read
# off it; an information criterion (e.g. AIC) would be the appropriate
# comparison for non-nested models.
anova(CDI_kuperman_diff_base, CDI_kuperman_diff_childes)
## Analysis of Variance Table
## 
## Model 1: diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex + 
##     concreteness + neighbour_concentration + arc
## Model 2: diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) + childes_adult_log_freq + 
##     concreteness + neighbour_concentration + arc
##   Res.Df    RSS Df Sum of Sq F Pr(>F)
## 1    387 31.867                      
## 2    387 32.167  0   -0.2997

Include PoS

# Base model plus dominant part of speech (dom_pos_factor).
CDI_kuperman_diff_pos <- CDI_kuperman_diff_filtered %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor,
    data = .
  )

summary(CDI_kuperman_diff_pos)
## 
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.05096 -0.17667  0.01129  0.18169  0.78824 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)   
## (Intercept)               0.1655816  0.4848964   0.341  0.73293   
## scale(hypernyms)         -0.0493641  0.0236466  -2.088  0.03749 * 
## scale(log(hyponyms + 1)) -0.0009144  0.0167736  -0.055  0.95655   
## frequency_subtlex         0.0817653  0.0399881   2.045  0.04156 * 
## concreteness             -0.0225603  0.0373953  -0.603  0.54667   
## neighbour_concentration  -0.1599139  1.1171573  -0.143  0.88625   
## arc                      -0.6317641  0.2420696  -2.610  0.00941 **
## dom_pos_factor2          -0.0654387  0.0623090  -1.050  0.29427   
## dom_pos_factor3           0.1472005  0.1713815   0.859  0.39093   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2869 on 385 degrees of freedom
##   (29 observations deleted due to missingness)
## Multiple R-squared:  0.05303,    Adjusted R-squared:  0.03336 
## F-statistic: 2.695 on 8 and 385 DF,  p-value: 0.006781

Include helpfulness

# PoS model plus rated helpfulness.
CDI_kuperman_diff_pos_helpful <- CDI_kuperman_diff_filtered %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + helpfulness,
    data = .
  )

summary(CDI_kuperman_diff_pos_helpful)
## 
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor + helpfulness, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.89267 -0.12702  0.00387  0.15013  0.73962 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               0.96512    0.42312   2.281 0.023097 *  
## scale(hypernyms)         -0.06206    0.02039  -3.044 0.002498 ** 
## scale(log(hyponyms + 1)) -0.01939    0.01453  -1.335 0.182789    
## frequency_subtlex         0.12278    0.03461   3.548 0.000437 ***
## concreteness             -0.03751    0.03222  -1.164 0.245116    
## neighbour_concentration  -0.43911    0.96218  -0.456 0.648380    
## arc                      -0.66265    0.20844  -3.179 0.001597 ** 
## dom_pos_factor2          -0.02744    0.05375  -0.510 0.610005    
## dom_pos_factor3           0.05639    0.14777   0.382 0.702954    
## helpfulness              -0.23209    0.01995 -11.633  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.247 on 384 degrees of freedom
##   (29 observations deleted due to missingness)
## Multiple R-squared:  0.2998, Adjusted R-squared:  0.2834 
## F-statistic: 18.27 on 9 and 384 DF,  p-value: < 2.2e-16

Include babiness, n_synsets and n_definitions

# Helpfulness model extended with babiness and the two WordNet sense counts
# (n_synsets, n_definitions).
CDI_kuperman_diff_pos_helpful_babiness_synsets <- CDI_kuperman_diff_filtered %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + helpfulness + babiness + n_synsets + n_definitions,
    data = .
  )

summary(CDI_kuperman_diff_pos_helpful_babiness_synsets)
## 
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor + helpfulness + babiness + n_synsets + 
##     n_definitions, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.88297 -0.13110  0.00425  0.14903  0.75059 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               0.7734898  0.4665918   1.658  0.09831 .  
## scale(hypernyms)         -0.0597568  0.0229642  -2.602  0.00968 ** 
## scale(log(hyponyms + 1)) -0.0238444  0.0156846  -1.520  0.12940    
## frequency_subtlex         0.1256374  0.0405908   3.095  0.00213 ** 
## concreteness             -0.0315968  0.0346254  -0.913  0.36215    
## neighbour_concentration   0.0390665  1.1036098   0.035  0.97178    
## arc                      -0.6937365  0.2538741  -2.733  0.00662 ** 
## dom_pos_factor2          -0.0203306  0.0638641  -0.318  0.75043    
## helpfulness              -0.2348867  0.0222147 -10.573  < 2e-16 ***
## babiness                 -0.0018235  0.0077741  -0.235  0.81469    
## n_synsets                 0.0010570  0.0031582   0.335  0.73807    
## n_definitions             0.0006282  0.0059635   0.105  0.91616    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2526 on 332 degrees of freedom
##   (79 observations deleted due to missingness)
## Multiple R-squared:  0.2882, Adjusted R-squared:  0.2646 
## F-statistic: 12.22 on 11 and 332 DF,  p-value: < 2.2e-16

Compare models

# Sequential F-tests over the nested CDI models:
# base -> + part of speech -> + helpfulness.
anova(
  CDI_kuperman_diff_base,
  CDI_kuperman_diff_pos,
  CDI_kuperman_diff_pos_helpful
)
## Analysis of Variance Table
## 
## Model 1: diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex + 
##     concreteness + neighbour_concentration + arc
## Model 2: diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex + 
##     concreteness + neighbour_concentration + arc + dom_pos_factor
## Model 3: diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex + 
##     concreteness + neighbour_concentration + arc + dom_pos_factor + 
##     helpfulness
##   Res.Df    RSS Df Sum of Sq        F Pr(>F)    
## 1    387 31.867                                 
## 2    385 31.681  2    0.1865   1.5283 0.2182    
## 3    384 23.425  1    8.2559 135.3361 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

For CDI words, including PoS doesn’t improve model fit, but including helpfulness does

If we look just at non-CDI words, there seem to be 2 clusters - AoAs overestimate some and underestimate others

# Mean split on concreteness: 0 = below the mean, 1 = at or above it.
# The mean is taken over every row of filtered_calculated_aoas_typed, not only
# over the non-CDI rows plotted below.
filtered_calculated_aoas_typed <- mutate(
  filtered_calculated_aoas_typed,
  concreteness_meansplit = as.numeric(
    concreteness >= mean(concreteness, na.rm = TRUE)
  )
)

# Non-CDI words only: calculated vs. Kuperman AoA, points coloured by the
# concreteness split, with a single linear fit across all points.
filtered_calculated_aoas_typed %>%
  filter(type_broad == "non-CDI") %>%
  ggplot(aes(x = KupermanAoA, y = calculated_aoa_years)) +
  geom_point(aes(color = as.factor(concreteness_meansplit))) +
  geom_smooth(method = "lm") +
  theme_classic()

Non-CDI words only: production

# Non-CDI words only.
non_CDI_all_features <- filter(filtered_calculated_aoas, type != "CDI")

# Regress calculated AoA on Kuperman AoA; the residuals are the part of the
# calculated AoA that the adult estimate does not capture.
# na.exclude pads resid() with NA for rows missing either AoA, so the residual
# vector always has one entry per row of non_CDI_all_features. With the
# default na.omit it would be shorter than the data and mutate() would fail.
# When no AoA is missing, the fit and residuals are the same as before.
base_mod_nonCDI <- lm(calculated_aoa_years ~ KupermanAoA,
                      data = non_CDI_all_features,
                      na.action = na.exclude)

# concreteness_meansplit: 0 = below the mean concreteness of the non-CDI
# words, 1 = at or above it.
nonCDI_kuperman_diff <- non_CDI_all_features %>%
  mutate(
    diff_aoas = resid(base_mod_nonCDI),
    concreteness_meansplit = ifelse(
      concreteness < mean(concreteness, na.rm = TRUE), 0, 1
    )
  )

Relation between residual and hypernyms, helpfulness?

Hypernyms

# Non-CDI words: residual AoA against hypernym count, with a linear fit.
nonCDI_kuperman_diff %>%
  ggplot(aes(x = hypernyms, y = diff_aoas)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Helpfulness

# Non-CDI words: residual AoA against rated helpfulness, with a linear fit.
nonCDI_kuperman_diff %>%
  ggplot(aes(x = helpfulness, y = diff_aoas)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Models

Base model

# Base model for non-CDI nouns, verbs and adjectives. Rows without a
# helpfulness rating are dropped up front, although helpfulness is not a
# predictor here, so this model is fitted to rows that all have a rating.
base_diff_nonCDI <- filter(
  nonCDI_kuperman_diff,
  dom_pos %in% c("Noun", "Verb", "Adjective"),
  !is.na(helpfulness)
) %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc,
    data = .
  )
summary(base_diff_nonCDI)
## 
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2708 -0.8760 -0.2008  0.7648 10.2389 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               4.51198    2.78761   1.619  0.10664    
## scale(hypernyms)         -0.44253    0.09759  -4.535 8.51e-06 ***
## scale(log(hyponyms + 1)) -0.28382    0.09867  -2.876  0.00433 ** 
## frequency_subtlex         0.02061    0.21277   0.097  0.92291    
## concreteness             -0.60670    0.13105  -4.629 5.58e-06 ***
## neighbour_concentration  -3.91530    7.33987  -0.533  0.59415    
## arc                      -1.23298    1.37411  -0.897  0.37032    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.527 on 285 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.2016, Adjusted R-squared:  0.1848 
## F-statistic:    12 on 6 and 285 DF,  p-value: 5.155e-12

Including PoS

# Base model plus dominant part of speech, on the same filtered rows as the
# base model (helpfulness rating present).
# dom_pos_factor codes 1 = Noun (reference level), 2 = Verb, 3 = Adjective.
pos_diff_nonCDI <- nonCDI_kuperman_diff %>%
  filter(
    dom_pos %in% c("Noun", "Verb", "Adjective"),
    !is.na(helpfulness)
  ) %>%
  mutate(
    dom_pos_factor = factor(match(dom_pos, c("Noun", "Verb", "Adjective")))
  ) %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor,
    data = .
  )

summary(pos_diff_nonCDI)
## 
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3849 -0.8224 -0.2603  0.6685 10.1474 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.315632   2.778860   1.193 0.233805    
## scale(hypernyms)         -0.304285   0.103574  -2.938 0.003577 ** 
## scale(log(hyponyms + 1)) -0.224735   0.098392  -2.284 0.023108 *  
## frequency_subtlex        -0.132037   0.215200  -0.614 0.540003    
## concreteness             -0.496752   0.134419  -3.696 0.000263 ***
## neighbour_concentration  -2.203285   7.242030  -0.304 0.761172    
## arc                      -0.225332   1.389417  -0.162 0.871282    
## dom_pos_factor2           0.871128   0.250032   3.484 0.000572 ***
## dom_pos_factor3          -0.002668   0.353214  -0.008 0.993979    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.499 on 283 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.2353, Adjusted R-squared:  0.2137 
## F-statistic: 10.89 on 8 and 283 DF,  p-value: 2.243e-13

Adding helpfulness

# PoS model plus rated helpfulness. There is no explicit filter on missing
# helpfulness in this chunk; helpfulness is a predictor, and the summary
# reports the observations that lm() deleted due to missingness.
# dom_pos_factor codes 1 = Noun (reference level), 2 = Verb, 3 = Adjective.
helpful_pos_diff_nonCDI <- nonCDI_kuperman_diff %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(
    dom_pos_factor = factor(match(dom_pos, c("Noun", "Verb", "Adjective")))
  ) %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + helpfulness,
    data = .
  )

summary(helpful_pos_diff_nonCDI)
## 
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor + helpfulness, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.5637 -0.8283 -0.2162  0.7413  9.8701 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.59213    2.77043   1.297 0.195829    
## scale(hypernyms)         -0.30104    0.10313  -2.919 0.003794 ** 
## scale(log(hyponyms + 1)) -0.18578    0.10012  -1.855 0.064569 .  
## frequency_subtlex         0.04347    0.23372   0.186 0.852579    
## concreteness             -0.47475    0.13433  -3.534 0.000478 ***
## neighbour_concentration  -2.45067    7.21107  -0.340 0.734224    
## arc                      -0.75676    1.41187  -0.536 0.592380    
## dom_pos_factor2           0.92216    0.25040   3.683 0.000276 ***
## dom_pos_factor3           0.13020    0.35869   0.363 0.716886    
## helpfulness              -0.25304    0.13467  -1.879 0.061282 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.493 on 282 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.2448, Adjusted R-squared:  0.2207 
## F-statistic: 10.15 on 9 and 282 DF,  p-value: 1.53e-13

Adding n_synsets

# PoS model plus number of WordNet synsets.
# NOTE(review): despite "helpful" in the object name, helpfulness is not in
# this formula -- confirm whether it was meant to be included.
# dom_pos_factor codes 1 = Noun (reference level), 2 = Verb, 3 = Adjective.
synsets_helpful_pos_diff_nonCDI <- nonCDI_kuperman_diff %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(
    dom_pos_factor = factor(match(dom_pos, c("Noun", "Verb", "Adjective")))
  ) %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + n_synsets,
    data = .
  )

summary(synsets_helpful_pos_diff_nonCDI)
## 
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor + n_synsets, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3124 -0.7830 -0.1418  0.7013 10.3394 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               4.44081    3.01108   1.475  0.14156    
## scale(hypernyms)         -0.21622    0.11643  -1.857  0.06452 .  
## scale(log(hyponyms + 1)) -0.11108    0.10728  -1.035  0.30152    
## frequency_subtlex        -0.31057    0.24587  -1.263  0.20776    
## concreteness             -0.59897    0.14477  -4.137 4.85e-05 ***
## neighbour_concentration  -1.79178    7.91223  -0.226  0.82104    
## arc                      -0.93559    1.47345  -0.635  0.52605    
## dom_pos_factor2           0.84434    0.28885   2.923  0.00379 ** 
## n_synsets                 0.04602    0.01643   2.801  0.00551 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.511 on 242 degrees of freedom
##   (54 observations deleted due to missingness)
## Multiple R-squared:  0.2911, Adjusted R-squared:  0.2677 
## F-statistic: 12.42 on 8 and 242 DF,  p-value: 6.757e-15

Compare models

# Sequential F-tests over the nested non-CDI models:
# base -> + part of speech -> + helpfulness.
anova(
  base_diff_nonCDI,
  pos_diff_nonCDI,
  helpful_pos_diff_nonCDI
)
## Analysis of Variance Table
## 
## Model 1: diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex + 
##     concreteness + neighbour_concentration + arc
## Model 2: diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex + 
##     concreteness + neighbour_concentration + arc + dom_pos_factor
## Model 3: diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex + 
##     concreteness + neighbour_concentration + arc + dom_pos_factor + 
##     helpfulness
##   Res.Df    RSS Df Sum of Sq      F   Pr(>F)   
## 1    285 664.27                                
## 2    283 636.25  2    28.013 6.2857 0.002134 **
## 3    282 628.39  1     7.867 3.5305 0.061282 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Accounting for PoS improves model fit for non-CDI words; adding helpfulness gives only a marginal further improvement (p = .061).

What words do adults think children learn earlier than they actually do? (calculated AoA > Kuperman AoA)

# Words whose calculated AoA exceeds the Kuperman AoA by more than half a
# year, largest gap first.
pos_diffs <- all_words_kuperman_diff %>%
  filter(diff_aoas > 0.5) %>%
  arrange(desc(diff_aoas)) %>%
  select(
    word = word.x, calculated_aoa_years, KupermanAoA,
    aoa_difference = diff_aoas,
    dom_pos, hypernyms, hyponyms, concreteness, helpfulness, babiness,
    childes_adult_log_freq, arc
  )

DT::datatable(pos_diffs)

What words do adults think children learn later than they actually do? (Kuperman AoA > calculated AoA)

# Words whose Kuperman AoA exceeds the calculated AoA by more than half a
# year, most negative difference first.
neg_diffs <- all_words_kuperman_diff %>%
  filter(diff_aoas < -0.5) %>%
  arrange(diff_aoas) %>%
  select(
    word = word.x, calculated_aoa_years, KupermanAoA,
    aoa_difference = diff_aoas,
    dom_pos, hypernyms, hyponyms, concreteness, helpfulness, babiness,
    childes_adult_log_freq, arc
  )

DT::datatable(neg_diffs)

What words are reasonably close? (Kuperman AoA ~= calculated AoA)

# Words whose calculated and Kuperman AoAs agree to within half a year.
# The bounds are inclusive so that, together with pos_diffs (> .5) and
# neg_diffs (< -.5), every word with a non-missing difference lands in exactly
# one table; strict inequalities here would silently drop words whose
# difference is exactly +/- .5.
pretty_good <- all_words_kuperman_diff %>%
  filter(abs(diff_aoas) <= .5) %>%
  select(word = word.x, calculated_aoa_years, KupermanAoA,
         aoa_difference = diff_aoas,
         dom_pos, hypernyms, hyponyms, concreteness, helpfulness, babiness,
         childes_adult_log_freq, arc) %>%
  arrange(desc(aoa_difference))

DT::datatable(pretty_good)