Variables

calculated_aoa_years: AoA, calculated using Braginsky & Yurovsky Wordbank script
KupermanAoA: adult-estimated AoA, from Kuperman norms
hypernyms: how many words are superordinate to this word? (per wordnet)
hyponyms: how many words are subordinate to this word? (per wordnet)
concreteness: how concrete (i.e. tangible) is the word’s referent, scale of 1 (abstract) to 5 (concrete), from Brysbaert norms
frequency_subtlex: word frequency from SUBTLEX
neighbour_concentration: measure of semantic density (how concentrated is local word neighborhood), from Thompson (unpublished)
arc: measure of semantic density (average radius of co-occurrence), from HiDEx (Shaoul & Westbury)
ncount: measure of semantic density (neighbor count), from HiDEx (Shaoul & Westbury)
helpfulness: how helpful would it be for a preschooler to know this word, scale of 1 (not helpful) to 5 (very helpful), collected on mturk
babiness: how much is this word associated with babies, scale of 1 to 10, from Perry et al. (2015)
n_synsets: how many synsets are there for this word on wordnet?
n_definitions: how many definitions are there for this word (as this part of speech) on wordnet? e.g. dog.n.01, dog.n.02, etc.

Correlations among predictors

# Raw predictor columns (one row per word). Kept as its own object so that the
# correlation coefficients and their significance tests are both computed from
# the same observations.
corr_data <- filtered_calculated_aoas %>%
  mutate(log_hyponyms = log(hyponyms + 1)) %>%
  select(all_ages_says, all_ages_understands, calculated_aoa_years, KupermanAoA,
         hypernyms, log_hyponyms, concreteness,
         frequency_subtlex, childes_adult_log_freq, childes_kid_log_freq,
         n_synsets, n_definitions,
         neighbour_concentration, arc, ncount, helpfulness, babiness)

# Pairwise Pearson correlations, using every complete pair of observations.
corr_vars <- cor(corr_data, use = "pairwise.complete.obs", method = "pearson")

# cor.mtest() runs cor.test() on each pair of columns, so it must be given the
# observations, not the correlation matrix. Passing the correlation matrix
# would test correlations between columns of coefficients, giving p-values
# that say nothing about the plotted correlations.
p.mat <- cor.mtest(corr_data)
pMatrix <- p.mat$p

# Lower-triangle heat map; cells with p >= 0.05 are left blank.
corrplot(corr_vars, method = "color", type = "lower", diag = TRUE,
         addCoef.col = "black", tl.col = "black",
         number.font = 2, number.cex = .6,
         p.mat = pMatrix, sig.level = 0.05, insig = "blank")

How does helpfulness correlate with Kuperman, hypernyms, and actual production?

All words

Kuperman

# All words: helpfulness against Kuperman AoA, with a linear fit.
filtered_calculated_aoas %>%
  ggplot(aes(x = KupermanAoA, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Hypernyms

# All words: helpfulness against number of hypernyms, with a linear fit.
filtered_calculated_aoas %>%
  ggplot(aes(x = hypernyms, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Production

# All words: helpfulness against calculated AoA (years), with a linear fit.
filtered_calculated_aoas %>%
  ggplot(aes(x = calculated_aoa_years, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Non-CDI words only

Kuperman

# Non-CDI words only: helpfulness against Kuperman AoA, with a linear fit.
filtered_calculated_aoas %>%
  filter(type != "CDI") %>%
  ggplot(aes(x = KupermanAoA, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Hypernyms

# Non-CDI words only: helpfulness against number of hypernyms, with a linear fit.
filtered_calculated_aoas %>%
  filter(type != "CDI") %>%
  ggplot(aes(x = hypernyms, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Production

# Non-CDI words only: helpfulness against all_ages_says, with a linear fit.
# NOTE(review): the all-words "Production" plot puts calculated_aoa_years on
# the x-axis, whereas this one uses all_ages_says -- confirm that is intended.
filtered_calculated_aoas %>%
  filter(type != "CDI") %>%
  ggplot(aes(x = all_ages_says, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Models: all data (CDI + our survey)

Generate residuals for calculated_aoa_years ~ Kuperman (all data)

# Regress calculated AoA on Kuperman (adult-estimated) AoA across all words.
# na.action = na.exclude makes resid() return one value per input row (NA
# where a row could not be used in the fit). With the default na.omit, any
# missing value would shorten the residual vector and break the mutate() below.
base_mod <- lm(calculated_aoa_years ~ KupermanAoA,
               data = filtered_calculated_aoas,
               na.action = na.exclude)

# Attach the residual (calculated AoA not explained by Kuperman AoA) to each word.
all_words_kuperman_resid <- filtered_calculated_aoas %>%
  mutate(calculated_aoa_years.Kuperman = resid(base_mod))

#base_mod_scaled_data <- lm(calculated_aoa_years ~ KupermanAoA, data = awaf_scaled)
#scaled_pos_resid <- awaf_scaled %>% 
#  ungroup() %>% 
#  mutate(calculated_aoa_years.Kuperman = resid(base_mod_scaled_data))

Relation between residual and hypernyms, helpfulness?

Hypernyms

# All words: Kuperman residual against number of hypernyms, with a linear fit.
all_words_kuperman_resid %>%
  ggplot(aes(x = hypernyms, y = calculated_aoa_years.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Helpfulness

# All words: Kuperman residual against helpfulness, with a linear fit.
all_words_kuperman_resid %>%
  ggplot(aes(x = helpfulness, y = calculated_aoa_years.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Model all words/predictors (all data)

Base model

# Base model (nouns, verbs and adjectives only): Kuperman residual predicted
# from hypernyms, log hyponyms, SUBTLEX frequency, concreteness and the two
# semantic-density measures.
all_words_no_pos <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex +
      concreteness + neighbour_concentration + arc,
    data = .
  )
summary(all_words_no_pos)
## 
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) + 
##     scale(log(hyponyms + 1)) + frequency_subtlex + concreteness + 
##     neighbour_concentration + arc, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.8586 -0.6677 -0.1402  0.5504 10.2412 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.58167    1.42250   2.518  0.01204 *  
## scale(hypernyms)         -0.17046    0.05541  -3.076  0.00218 ** 
## scale(log(hyponyms + 1)) -0.10982    0.05209  -2.108  0.03537 *  
## frequency_subtlex         0.03512    0.11504   0.305  0.76021    
## concreteness             -0.46912    0.07664  -6.121 1.57e-09 ***
## neighbour_concentration  -4.13646    3.63722  -1.137  0.25583    
## arc                      -0.36300    0.72635  -0.500  0.61741    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.213 on 679 degrees of freedom
##   (42 observations deleted due to missingness)
## Multiple R-squared:  0.1406, Adjusted R-squared:  0.133 
## F-statistic: 18.51 on 6 and 679 DF,  p-value: < 2.2e-16
# Same base model, but with CHILDES adult log frequency in place of the
# SUBTLEX frequency predictor.
all_words_no_pos_childes <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) + scale(log(hyponyms + 1)) + childes_adult_log_freq +
      concreteness + neighbour_concentration + arc,
    data = .
  )
summary(all_words_no_pos_childes)
## 
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) + 
##     scale(log(hyponyms + 1)) + childes_adult_log_freq + concreteness + 
##     neighbour_concentration + arc, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9169 -0.6692 -0.1311  0.5596 10.1467 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.77298    1.38868   2.717  0.00676 ** 
## scale(hypernyms)         -0.17826    0.05454  -3.268  0.00114 ** 
## scale(log(hyponyms + 1)) -0.09919    0.05304  -1.870  0.06192 .  
## childes_adult_log_freq   -0.02367    0.03605  -0.657  0.51167    
## concreteness             -0.47442    0.07329  -6.473 1.84e-10 ***
## neighbour_concentration  -4.23147    3.63661  -1.164  0.24500    
## arc                      -0.05691    0.53480  -0.106  0.91528    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.213 on 679 degrees of freedom
##   (42 observations deleted due to missingness)
## Multiple R-squared:  0.141,  Adjusted R-squared:  0.1334 
## F-statistic: 18.58 on 6 and 679 DF,  p-value: < 2.2e-16

Include PoS

# Base model plus dominant part of speech, coded as a factor with levels
# 1 = Noun, 2 = Verb, 3 = anything else left after the filter (Adjective).
all_words_with_pos <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  mutate(
    dom_pos_factor = as.factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      TRUE ~ 3
    ))
  ) %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex +
      concreteness + neighbour_concentration + arc + dom_pos_factor,
    data = .
  )
summary(all_words_with_pos)
## 
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) + 
##     scale(log(hyponyms + 1)) + frequency_subtlex + concreteness + 
##     neighbour_concentration + arc + dom_pos_factor, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.8734 -0.6630 -0.1419  0.5387 10.2348 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.19984    1.44576   2.213   0.0272 *  
## scale(hypernyms)         -0.12208    0.06286  -1.942   0.0525 .  
## scale(log(hyponyms + 1)) -0.10298    0.05223  -1.972   0.0491 *  
## frequency_subtlex         0.01282    0.11744   0.109   0.9131    
## concreteness             -0.43186    0.08078  -5.346 1.23e-07 ***
## neighbour_concentration  -3.64991    3.64895  -1.000   0.3175    
## arc                      -0.21451    0.73982  -0.290   0.7719    
## dom_pos_factor2           0.25445    0.15583   1.633   0.1030    
## dom_pos_factor3           0.09488    0.26211   0.362   0.7175    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.212 on 677 degrees of freedom
##   (42 observations deleted due to missingness)
## Multiple R-squared:  0.144,  Adjusted R-squared:  0.1339 
## F-statistic: 14.23 on 8 and 677 DF,  p-value: < 2.2e-16

Include babiness, helpfulness, n_synsets, n_definitions

# PoS model extended with babiness, helpfulness, n_synsets and n_definitions.
# dom_pos_factor levels: 1 = Noun, 2 = Verb, 3 = Adjective.
all_words_with_pos_helpful <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  mutate(
    dom_pos_factor = as.factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      TRUE ~ 3
    ))
  ) %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex +
      concreteness + neighbour_concentration + arc + dom_pos_factor +
      babiness + helpfulness + n_synsets + n_definitions,
    data = .
  )
summary(all_words_with_pos_helpful)
## 
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) + 
##     scale(log(hyponyms + 1)) + frequency_subtlex + concreteness + 
##     neighbour_concentration + arc + dom_pos_factor + babiness + 
##     helpfulness + n_synsets + n_definitions, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0886 -0.4341 -0.0438  0.3891  7.3779 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.880417   1.622888  -0.543  0.58779    
## scale(hypernyms)          0.004936   0.067952   0.073  0.94213    
## scale(log(hyponyms + 1)) -0.067475   0.053064  -1.272  0.20429    
## frequency_subtlex         0.697439   0.132897   5.248 2.54e-07 ***
## concreteness              0.037764   0.113871   0.332  0.74034    
## neighbour_concentration  -2.014398   3.879308  -0.519  0.60387    
## arc                      -1.915973   0.867046  -2.210  0.02770 *  
## dom_pos_factor2           0.220237   0.203186   1.084  0.27908    
## babiness                  0.078121   0.026514   2.946  0.00341 ** 
## helpfulness              -0.364523   0.076097  -4.790 2.37e-06 ***
## n_synsets                 0.024889   0.008539   2.915  0.00377 ** 
## n_definitions            -0.029496   0.017646  -1.672  0.09542 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9206 on 388 degrees of freedom
##   (328 observations deleted due to missingness)
## Multiple R-squared:  0.2365, Adjusted R-squared:  0.2148 
## F-statistic: 10.93 on 11 and 388 DF,  p-value: < 2.2e-16

This model is not great! So much variance unaccounted for… why?

# Collapse `type` into two groups: "CDI" versus everything else ("non-CDI").
filtered_calculated_aoas_typed <- mutate(
  filtered_calculated_aoas,
  type_broad = ifelse(type == "CDI", "CDI", "non-CDI")
)

# Calculated AoA against Kuperman AoA, with a separate colour and linear fit
# for each group.
filtered_calculated_aoas_typed %>%
  ggplot(aes(x = KupermanAoA, y = calculated_aoa_years, color = type_broad)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Seems like we should look at CDI and non-CDI words separately, as very different things are going on.

CDI words only

# Restrict to CDI words and regress calculated AoA on Kuperman AoA.
cdi_words_all_features <- filter(filtered_calculated_aoas, type == "CDI")

# na.action = na.exclude makes resid() return one value per input row (NA
# where a row could not be used in the fit). With the default na.omit, any
# missing value would shorten the residual vector and break the mutate() below.
base_mod_CDI <- lm(calculated_aoa_years ~ KupermanAoA,
                   data = cdi_words_all_features,
                   na.action = na.exclude)

# Attach the residual (calculated AoA not explained by Kuperman AoA) to each word.
CDI_kuperman_resid <- cdi_words_all_features %>%
  mutate(calculated_aoa_years.Kuperman = resid(base_mod_CDI))

Relation between residual and hypernyms, helpfulness?

Hypernyms

# CDI words: Kuperman residual against number of hypernyms, with a linear fit.
CDI_kuperman_resid %>%
  ggplot(aes(x = hypernyms, y = calculated_aoa_years.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Helpfulness

# CDI words: Kuperman residual against helpfulness, with a linear fit.
CDI_kuperman_resid %>%
  ggplot(aes(x = helpfulness, y = calculated_aoa_years.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Models: CDI data

Base model

# Shared modelling data for the CDI models: nouns, verbs and adjectives only,
# with part of speech coded as a factor (1 = Noun, 2 = Verb, 3 = Adjective).
CDI_kuperman_resid_filtered <- CDI_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  mutate(
    dom_pos_factor = as.factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      TRUE ~ 3
    ))
  )

# Base model with SUBTLEX frequency.
CDI_kuperman_resid_base <- CDI_kuperman_resid_filtered %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex +
      concreteness + neighbour_concentration + arc,
    data = .
  )

summary(CDI_kuperman_resid_base)
## 
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) + 
##     scale(log(hyponyms + 1)) + frequency_subtlex + concreteness + 
##     neighbour_concentration + arc, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.03694 -0.18233  0.00858  0.17994  0.80625 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)  
## (Intercept)               0.0468630  0.4676315   0.100   0.9202  
## scale(hypernyms)         -0.0378966  0.0196006  -1.933   0.0539 .
## scale(log(hyponyms + 1)) -0.0004546  0.0167676  -0.027   0.9784  
## frequency_subtlex         0.0781497  0.0397402   1.967   0.0500 *
## concreteness             -0.0086657  0.0345472  -0.251   0.8021  
## neighbour_concentration  -0.0205250  1.1104706  -0.018   0.9853  
## arc                      -0.6140758  0.2416764  -2.541   0.0114 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.287 on 387 degrees of freedom
##   (29 observations deleted due to missingness)
## Multiple R-squared:  0.04746,    Adjusted R-squared:  0.03269 
## F-statistic: 3.214 on 6 and 387 DF,  p-value: 0.004312
# Base model with CHILDES adult log frequency in place of SUBTLEX frequency.
CDI_kuperman_resid_childes <- CDI_kuperman_resid_filtered %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) + scale(log(hyponyms + 1)) + childes_adult_log_freq +
      concreteness + neighbour_concentration + arc,
    data = .
  )

summary(CDI_kuperman_resid_childes)
## 
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) + 
##     scale(log(hyponyms + 1)) + childes_adult_log_freq + concreteness + 
##     neighbour_concentration + arc, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.02428 -0.18568  0.00169  0.17340  0.78069 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)  
## (Intercept)               0.257130   0.460926   0.558   0.5773  
## scale(hypernyms)         -0.045622   0.019270  -2.368   0.0184 *
## scale(log(hyponyms + 1))  0.002409   0.017033   0.141   0.8876  
## childes_adult_log_freq    0.006734   0.014182   0.475   0.6352  
## concreteness             -0.026375   0.033584  -0.785   0.4327  
## neighbour_concentration  -0.038208   1.116624  -0.034   0.9727  
## arc                      -0.298512   0.181972  -1.640   0.1017  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2883 on 387 degrees of freedom
##   (29 observations deleted due to missingness)
## Multiple R-squared:  0.0385, Adjusted R-squared:  0.02359 
## F-statistic: 2.583 on 6 and 387 DF,  p-value: 0.01824
# Compare the SUBTLEX-frequency and CHILDES-frequency base models.
# NOTE(review): the two models swap one predictor for another, so they are not
# nested and have the same residual df; anova() therefore reports Df 0 and no
# F test. Consider AIC()/BIC() if a formal comparison is wanted.
anova(
  CDI_kuperman_resid_base,
  CDI_kuperman_resid_childes
)
## Analysis of Variance Table
## 
## Model 1: calculated_aoa_years.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc
## Model 2: calculated_aoa_years.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + childes_adult_log_freq + concreteness + neighbour_concentration + 
##     arc
##   Res.Df    RSS Df Sum of Sq F Pr(>F)
## 1    387 31.867                      
## 2    387 32.167  0   -0.2997

Include PoS

# CDI base model plus dominant part of speech.
CDI_kuperman_resid_pos <- CDI_kuperman_resid_filtered %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex +
      concreteness + neighbour_concentration + arc + dom_pos_factor,
    data = .
  )

summary(CDI_kuperman_resid_pos)
## 
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) + 
##     scale(log(hyponyms + 1)) + frequency_subtlex + concreteness + 
##     neighbour_concentration + arc + dom_pos_factor, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.05096 -0.17667  0.01129  0.18169  0.78824 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)   
## (Intercept)               0.1655816  0.4848964   0.341  0.73293   
## scale(hypernyms)         -0.0493641  0.0236466  -2.088  0.03749 * 
## scale(log(hyponyms + 1)) -0.0009144  0.0167736  -0.055  0.95655   
## frequency_subtlex         0.0817653  0.0399881   2.045  0.04156 * 
## concreteness             -0.0225603  0.0373953  -0.603  0.54667   
## neighbour_concentration  -0.1599139  1.1171573  -0.143  0.88625   
## arc                      -0.6317641  0.2420696  -2.610  0.00941 **
## dom_pos_factor2          -0.0654387  0.0623090  -1.050  0.29427   
## dom_pos_factor3           0.1472005  0.1713815   0.859  0.39093   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2869 on 385 degrees of freedom
##   (29 observations deleted due to missingness)
## Multiple R-squared:  0.05303,    Adjusted R-squared:  0.03336 
## F-statistic: 2.695 on 8 and 385 DF,  p-value: 0.006781

Include helpfulness

# CDI PoS model plus helpfulness.
CDI_kuperman_resid_pos_helpful <- CDI_kuperman_resid_filtered %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex +
      concreteness + neighbour_concentration + arc + dom_pos_factor +
      helpfulness,
    data = .
  )

summary(CDI_kuperman_resid_pos_helpful)
## 
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) + 
##     scale(log(hyponyms + 1)) + frequency_subtlex + concreteness + 
##     neighbour_concentration + arc + dom_pos_factor + helpfulness, 
##     data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.89267 -0.12702  0.00387  0.15013  0.73962 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               0.96512    0.42312   2.281 0.023097 *  
## scale(hypernyms)         -0.06206    0.02039  -3.044 0.002498 ** 
## scale(log(hyponyms + 1)) -0.01939    0.01453  -1.335 0.182789    
## frequency_subtlex         0.12278    0.03461   3.548 0.000437 ***
## concreteness             -0.03751    0.03222  -1.164 0.245116    
## neighbour_concentration  -0.43911    0.96218  -0.456 0.648380    
## arc                      -0.66265    0.20844  -3.179 0.001597 ** 
## dom_pos_factor2          -0.02744    0.05375  -0.510 0.610005    
## dom_pos_factor3           0.05639    0.14777   0.382 0.702954    
## helpfulness              -0.23209    0.01995 -11.633  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.247 on 384 degrees of freedom
##   (29 observations deleted due to missingness)
## Multiple R-squared:  0.2998, Adjusted R-squared:  0.2834 
## F-statistic: 18.27 on 9 and 384 DF,  p-value: < 2.2e-16

Include babiness, n_synsets, and n_definitions

# CDI helpfulness model plus babiness, n_synsets and n_definitions.
CDI_kuperman_resid_pos_helpful_babiness_synsets <- CDI_kuperman_resid_filtered %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex +
      concreteness + neighbour_concentration + arc + dom_pos_factor +
      helpfulness + babiness + n_synsets + n_definitions,
    data = .
  )

summary(CDI_kuperman_resid_pos_helpful_babiness_synsets)
## 
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) + 
##     scale(log(hyponyms + 1)) + frequency_subtlex + concreteness + 
##     neighbour_concentration + arc + dom_pos_factor + helpfulness + 
##     babiness + n_synsets + n_definitions, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.88297 -0.13110  0.00425  0.14903  0.75059 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               0.7734898  0.4665918   1.658  0.09831 .  
## scale(hypernyms)         -0.0597568  0.0229642  -2.602  0.00968 ** 
## scale(log(hyponyms + 1)) -0.0238444  0.0156846  -1.520  0.12940    
## frequency_subtlex         0.1256374  0.0405908   3.095  0.00213 ** 
## concreteness             -0.0315968  0.0346254  -0.913  0.36215    
## neighbour_concentration   0.0390665  1.1036098   0.035  0.97178    
## arc                      -0.6937365  0.2538741  -2.733  0.00662 ** 
## dom_pos_factor2          -0.0203306  0.0638641  -0.318  0.75043    
## helpfulness              -0.2348867  0.0222147 -10.573  < 2e-16 ***
## babiness                 -0.0018235  0.0077741  -0.235  0.81469    
## n_synsets                 0.0010570  0.0031582   0.335  0.73807    
## n_definitions             0.0006282  0.0059635   0.105  0.91616    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2526 on 332 degrees of freedom
##   (79 observations deleted due to missingness)
## Multiple R-squared:  0.2882, Adjusted R-squared:  0.2646 
## F-statistic: 12.22 on 11 and 332 DF,  p-value: < 2.2e-16

Compare models

# Sequential F tests across the nested CDI models: base, then + PoS, then
# + helpfulness.
anova(
  CDI_kuperman_resid_base,
  CDI_kuperman_resid_pos,
  CDI_kuperman_resid_pos_helpful
)
## Analysis of Variance Table
## 
## Model 1: calculated_aoa_years.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc
## Model 2: calculated_aoa_years.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor
## Model 3: calculated_aoa_years.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor + helpfulness
##   Res.Df    RSS Df Sum of Sq        F Pr(>F)    
## 1    387 31.867                                 
## 2    385 31.681  2    0.1865   1.5283 0.2182    
## 3    384 23.425  1    8.2559 135.3361 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

For CDI words, including PoS doesn’t improve model fit, but including helpfulness does

If we look just at non-CDI words, there seem to be 2 clusters - AoAs overestimate some and underestimate others

# Flag each word as below (0) or at/above (1) mean concreteness. The mean is
# taken over the rows of filtered_calculated_aoas_typed before the non-CDI
# filter below is applied.
filtered_calculated_aoas_typed <- mutate(
  filtered_calculated_aoas_typed,
  concreteness_meansplit = ifelse(
    concreteness < mean(concreteness, na.rm = TRUE), 0, 1
  )
)

# Non-CDI words: calculated AoA against Kuperman AoA, points coloured by the
# concreteness split, with a single linear fit over all points.
filtered_calculated_aoas_typed %>%
  filter(type_broad == "non-CDI") %>%
  ggplot(aes(x = KupermanAoA, y = calculated_aoa_years)) +
  geom_point(aes(color = as.factor(concreteness_meansplit))) +
  geom_smooth(method = lm) +
  theme_classic()

Non-CDI words only: production

# Restrict to non-CDI words and regress calculated AoA on Kuperman AoA.
non_CDI_all_features <- filter(filtered_calculated_aoas, type != "CDI")

# na.action = na.exclude makes resid() return one value per input row (NA
# where a row could not be used in the fit). With the default na.omit, any
# missing value would shorten the residual vector and break the mutate() below.
base_mod_nonCDI <- lm(calculated_aoa_years ~ KupermanAoA,
                      data = non_CDI_all_features,
                      na.action = na.exclude)

# Attach the residual, and flag words below (0) or at/above (1) the mean
# concreteness of the non-CDI words.
nonCDI_kuperman_resid <- non_CDI_all_features %>%
  mutate(calculated_aoa_years.Kuperman = resid(base_mod_nonCDI),
         concreteness_meansplit = ifelse(concreteness < mean(concreteness, na.rm = TRUE), 0, 1))

Relation between residual and hypernyms, helpfulness?

Hypernyms

# Non-CDI words: Kuperman residual against number of hypernyms, with a linear fit.
nonCDI_kuperman_resid %>%
  ggplot(aes(x = hypernyms, y = calculated_aoa_years.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Helpfulness

# Non-CDI words: Kuperman residual against helpfulness, with a linear fit.
nonCDI_kuperman_resid %>%
  ggplot(aes(x = helpfulness, y = calculated_aoa_years.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Models

Base model

# Non-CDI base model. Rows without a helpfulness rating are dropped up front,
# even though helpfulness is not a predictor in this model.
base_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(
    dom_pos %in% c("Verb", "Noun", "Adjective"),
    !(is.na(helpfulness))
  ) %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex +
      concreteness + neighbour_concentration + arc,
    data = .
  )
summary(base_resid_nonCDI)
## 
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) + 
##     scale(log(hyponyms + 1)) + frequency_subtlex + concreteness + 
##     neighbour_concentration + arc, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2708 -0.8760 -0.2008  0.7648 10.2389 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               4.51198    2.78761   1.619  0.10664    
## scale(hypernyms)         -0.44253    0.09759  -4.535 8.51e-06 ***
## scale(log(hyponyms + 1)) -0.28382    0.09867  -2.876  0.00433 ** 
## frequency_subtlex         0.02061    0.21277   0.097  0.92291    
## concreteness             -0.60670    0.13105  -4.629 5.58e-06 ***
## neighbour_concentration  -3.91530    7.33987  -0.533  0.59415    
## arc                      -1.23298    1.37411  -0.897  0.37032    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.527 on 285 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.2016, Adjusted R-squared:  0.1848 
## F-statistic:    12 on 6 and 285 DF,  p-value: 5.155e-12

Including PoS

# Non-CDI base model plus dominant part of speech (1 = Noun, 2 = Verb,
# 3 = Adjective). Rows without a helpfulness rating are dropped up front.
pos_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(
    dom_pos %in% c("Verb", "Noun", "Adjective"),
    !(is.na(helpfulness))
  ) %>%
  mutate(
    dom_pos_factor = as.factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      TRUE ~ 3
    ))
  ) %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex +
      concreteness + neighbour_concentration + arc + dom_pos_factor,
    data = .
  )

summary(pos_resid_nonCDI)
## 
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) + 
##     scale(log(hyponyms + 1)) + frequency_subtlex + concreteness + 
##     neighbour_concentration + arc + dom_pos_factor, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3849 -0.8224 -0.2603  0.6685 10.1474 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.315632   2.778860   1.193 0.233805    
## scale(hypernyms)         -0.304285   0.103574  -2.938 0.003577 ** 
## scale(log(hyponyms + 1)) -0.224735   0.098392  -2.284 0.023108 *  
## frequency_subtlex        -0.132037   0.215200  -0.614 0.540003    
## concreteness             -0.496752   0.134419  -3.696 0.000263 ***
## neighbour_concentration  -2.203285   7.242030  -0.304 0.761172    
## arc                      -0.225332   1.389417  -0.162 0.871282    
## dom_pos_factor2           0.871128   0.250032   3.484 0.000572 ***
## dom_pos_factor3          -0.002668   0.353214  -0.008 0.993979    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.499 on 283 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.2353, Adjusted R-squared:  0.2137 
## F-statistic: 10.89 on 8 and 283 DF,  p-value: 2.243e-13

Adding helpfulness

# Non-CDI PoS model plus helpfulness. No explicit NA filter on helpfulness
# here; lm() drops rows with missing predictors itself.
helpful_pos_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  mutate(
    dom_pos_factor = as.factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      TRUE ~ 3
    ))
  ) %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex +
      concreteness + neighbour_concentration + arc + dom_pos_factor +
      helpfulness,
    data = .
  )

summary(helpful_pos_resid_nonCDI)
## 
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) + 
##     scale(log(hyponyms + 1)) + frequency_subtlex + concreteness + 
##     neighbour_concentration + arc + dom_pos_factor + helpfulness, 
##     data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.5637 -0.8283 -0.2162  0.7413  9.8701 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.59213    2.77043   1.297 0.195829    
## scale(hypernyms)         -0.30104    0.10313  -2.919 0.003794 ** 
## scale(log(hyponyms + 1)) -0.18578    0.10012  -1.855 0.064569 .  
## frequency_subtlex         0.04347    0.23372   0.186 0.852579    
## concreteness             -0.47475    0.13433  -3.534 0.000478 ***
## neighbour_concentration  -2.45067    7.21107  -0.340 0.734224    
## arc                      -0.75676    1.41187  -0.536 0.592380    
## dom_pos_factor2           0.92216    0.25040   3.683 0.000276 ***
## dom_pos_factor3           0.13020    0.35869   0.363 0.716886    
## helpfulness              -0.25304    0.13467  -1.879 0.061282 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.493 on 282 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.2448, Adjusted R-squared:  0.2207 
## F-statistic: 10.15 on 9 and 282 DF,  p-value: 1.53e-13

Adding n_synsets

# Non-CDI PoS model plus n_synsets.
# NOTE(review): the object name mentions "helpful", but the formula has no
# helpfulness term -- confirm whether helpfulness was meant to be included.
synsets_helpful_pos_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  mutate(
    dom_pos_factor = as.factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      TRUE ~ 3
    ))
  ) %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex +
      concreteness + neighbour_concentration + arc + dom_pos_factor +
      n_synsets,
    data = .
  )

summary(synsets_helpful_pos_resid_nonCDI)
## 
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) + 
##     scale(log(hyponyms + 1)) + frequency_subtlex + concreteness + 
##     neighbour_concentration + arc + dom_pos_factor + n_synsets, 
##     data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3124 -0.7830 -0.1418  0.7013 10.3394 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               4.44081    3.01108   1.475  0.14156    
## scale(hypernyms)         -0.21622    0.11643  -1.857  0.06452 .  
## scale(log(hyponyms + 1)) -0.11108    0.10728  -1.035  0.30152    
## frequency_subtlex        -0.31057    0.24587  -1.263  0.20776    
## concreteness             -0.59897    0.14477  -4.137 4.85e-05 ***
## neighbour_concentration  -1.79178    7.91223  -0.226  0.82104    
## arc                      -0.93559    1.47345  -0.635  0.52605    
## dom_pos_factor2           0.84434    0.28885   2.923  0.00379 ** 
## n_synsets                 0.04602    0.01643   2.801  0.00551 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.511 on 242 degrees of freedom
##   (54 observations deleted due to missingness)
## Multiple R-squared:  0.2911, Adjusted R-squared:  0.2677 
## F-statistic: 12.42 on 8 and 242 DF,  p-value: 6.757e-15

Compare models

# Sequential F tests across the nested non-CDI models: base, then + PoS, then
# + helpfulness.
anova(
  base_resid_nonCDI,
  pos_resid_nonCDI,
  helpful_pos_resid_nonCDI
)
## Analysis of Variance Table
## 
## Model 1: calculated_aoa_years.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc
## Model 2: calculated_aoa_years.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor
## Model 3: calculated_aoa_years.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor + helpfulness
##   Res.Df    RSS Df Sum of Sq      F   Pr(>F)   
## 1    285 664.27                                
## 2    283 636.25  2    28.013 6.2857 0.002134 **
## 3    282 628.39  1     7.867 3.5305 0.061282 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

For non-CDI words, accounting for PoS improves model fit; adding helpfulness yields only a marginal further improvement (p = 0.061)

How do CDI and non-CDI words compare on these characteristics we’re looking at?

# Two-sample t-test: concreteness, CDI vs non-CDI words.
t.test(
  concreteness ~ type_broad,
  data = filtered_calculated_aoas_typed
)
## 
##  Welch Two Sample t-test
## 
## data:  concreteness by type_broad
## t = 10.333, df = 639.23, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.4503704 0.6617004
## sample estimates:
##     mean in group CDI mean in group non-CDI 
##              4.518625              3.962589
# Two-sample t-test: SUBTLEX frequency, CDI vs non-CDI words.
t.test(
  frequency_subtlex ~ type_broad,
  data = filtered_calculated_aoas_typed
)
## 
##  Welch Two Sample t-test
## 
## data:  frequency_subtlex by type_broad
## t = -0.15526, df = 695.69, p-value = 0.8767
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1275906  0.1088896
## sample estimates:
##     mean in group CDI mean in group non-CDI 
##              4.484470              4.493821
# Two-sample t-test: helpfulness, CDI vs non-CDI words.
t.test(
  helpfulness ~ type_broad,
  data = filtered_calculated_aoas_typed
)
## 
##  Welch Two Sample t-test
## 
## data:  helpfulness by type_broad
## t = 6.8864, df = 650.54, p-value = 1.351e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.2675637 0.4810183
## sample estimates:
##     mean in group CDI mean in group non-CDI 
##              3.463996              3.089705
# Two-sample t-test: arc, CDI vs non-CDI words.
t.test(
  arc ~ type_broad,
  data = filtered_calculated_aoas_typed
)
## 
##  Welch Two Sample t-test
## 
## data:  arc by type_broad
## t = -1.4592, df = 658.92, p-value = 0.145
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.028983681  0.004270751
## sample estimates:
##     mean in group CDI mean in group non-CDI 
##             0.5712578             0.5836143
# Two-sample t-test: Kuperman AoA, CDI vs non-CDI words.
t.test(
  KupermanAoA ~ type_broad,
  data = filtered_calculated_aoas_typed
)
## 
##  Welch Two Sample t-test
## 
## data:  KupermanAoA by type_broad
## t = -12.157, df = 518.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.3647365 -0.9850257
## sample estimates:
##     mean in group CDI mean in group non-CDI 
##              4.378858              5.553739
# Two-sample t-test: number of hypernyms, CDI vs non-CDI words.
t.test(
  hypernyms ~ type_broad,
  data = filtered_calculated_aoas_typed
)
## 
##  Welch Two Sample t-test
## 
## data:  hypernyms by type_broad
## t = 6.2654, df = 741.47, p-value = 6.303e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.001287 1.915097
## sample estimates:
##     mean in group CDI mean in group non-CDI 
##              7.327628              5.869436
# Two-sample t-test: number of hyponyms (untransformed), CDI vs non-CDI words.
t.test(
  hyponyms ~ type_broad,
  data = filtered_calculated_aoas_typed
)
## 
##  Welch Two Sample t-test
## 
## data:  hyponyms by type_broad
## t = 2.0145, df = 451.54, p-value = 0.04455
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   1.092282 88.233033
## sample estimates:
##     mean in group CDI mean in group non-CDI 
##              63.73193              19.06928
# Two-sample t-test: number of synsets, CDI vs non-CDI words.
t.test(
  n_synsets ~ type_broad,
  data = filtered_calculated_aoas_typed
)
## 
##  Welch Two Sample t-test
## 
## data:  n_synsets by type_broad
## t = -1.9109, df = 583.2, p-value = 0.05651
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.37151242  0.03253687
## sample estimates:
##     mean in group CDI mean in group non-CDI 
##              6.028986              7.198473