# Load the word-level feature table and drop its `X` column
# (presumably the row index written out with the CSV -- confirm).
all_words_all_features <- read.csv("all_words_with_norms_and_helpfulness_no_duplicate_words.csv") %>%
  select(-X)

# Store `word` as character *before* any derived table is built, so derived
# tables carry the same column type as the base table.
all_words_all_features$word <- as.character(all_words_all_features$word)

# Nouns, verbs and adjectives only, with each predictor z-scored within its
# dominant part of speech. `scale()` returns a one-column matrix, so each call
# is wrapped in `as.numeric()` to keep the new columns plain numeric vectors.
# NOTE: the result is still grouped by `dom_pos`; callers that need an
# ungrouped table must call `ungroup()` themselves.
awaf_scaled <- all_words_all_features %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  group_by(dom_pos) %>%
  mutate(
    pos_scale_hypernyms = as.numeric(scale(hypernyms)),
    pos_scale_hyponyms = as.numeric(scale(hyponyms)),
    pos_scale_concreteness = as.numeric(scale(Conc.M)),
    pos_scale_freq = as.numeric(scale(Zipf.value)),
    pos_scale_helpful = as.numeric(scale(resp_mean))
  )

# Rows whose `type` is anything other than "CDI".
nonCDI_only <- filter(all_words_all_features, type != "CDI")

Correlations among predictors

# Observation-level table of the variables to correlate (hyponym counts are
# log-transformed after adding 1).
corr_data <- all_words_all_features %>%
  mutate(log_hyponyms = log(hyponyms + 1)) %>%
  select(
    all_ages_says, KupermanAoA, hypernyms, log_hyponyms,
    concreteness = Conc.M, frequency = Zipf.value,
    neighbour_concentration, helpfulness = resp_mean
  )

# Pairwise Pearson correlations between those variables.
corr_vars <- cor(corr_data, use = "pairwise.complete.obs", method = "pearson")

# Significance tests must be run on the observations themselves, not on the
# correlation matrix: `cor.mtest()` treats the rows of its input as samples.
p.mat <- cor.mtest(corr_data)
pMatrix <- p.mat$p

# Lower-triangle heat map; cells with p > 0.05 are left blank.
corrplot(corr_vars, method = "color", type = "lower", diag = TRUE,
         addCoef.col = "black", tl.col = "black", number.font = 2,
         number.cex = .8, p.mat = pMatrix, sig.level = 0.05, insig = "blank")

How does helpfulness correlate with Kuperman, hypernyms, and actual production?

All words

Kuperman

# All words: mean helpfulness rating against Kuperman AoA, with a linear fit.
all_words_all_features %>%
  ggplot(aes(x = KupermanAoA, y = resp_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Hypernyms

# All words: mean helpfulness rating against hypernym count, with a linear fit.
all_words_all_features %>%
  ggplot(aes(x = hypernyms, y = resp_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Production

# All words: mean helpfulness rating against `all_ages_says`, with a linear fit.
all_words_all_features %>%
  ggplot(aes(x = all_ages_says, y = resp_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Non-CDI words only

Kuperman

# Non-CDI words: mean helpfulness rating against Kuperman AoA, linear fit.
all_words_all_features %>%
  filter(type != "CDI") %>%
  ggplot(aes(x = KupermanAoA, y = resp_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Hypernyms

# Non-CDI words: mean helpfulness rating against hypernym count, linear fit.
all_words_all_features %>%
  filter(type != "CDI") %>%
  ggplot(aes(x = hypernyms, y = resp_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Production

# Non-CDI words: mean helpfulness rating against `all_ages_says`, linear fit.
all_words_all_features %>%
  filter(type != "CDI") %>%
  ggplot(aes(x = all_ages_says, y = resp_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Models: all data (CDI + our survey)

Generate residuals for all_ages_says ~ Kuperman (all data)

# Regress production (`all_ages_says`) on Kuperman AoA over all words.
# `na.exclude` makes `resid()` return one value per input row (NA where a model
# variable is missing), so the residuals always line up with the data frame
# they are attached to. With the default `na.omit`, rows with missing values
# would make the residual vector shorter than the table and `mutate()` fail.
base_mod <- lm(all_ages_says ~ KupermanAoA, data = all_words_all_features,
               na.action = na.exclude)

# Attach the residual (production not explained by Kuperman AoA) to each word.
all_words_kuperman_resid <- all_words_all_features %>%
  mutate(all_ages_says.Kuperman = resid(base_mod))

# Same residualisation on the noun/verb/adjective table with PoS-scaled
# predictors; the table is ungrouped before the residual column is added.
base_mod_scaled_data <- lm(all_ages_says ~ KupermanAoA, data = awaf_scaled,
                           na.action = na.exclude)
scaled_pos_resid <- awaf_scaled %>%
  ungroup() %>%
  mutate(all_ages_says.Kuperman = resid(base_mod_scaled_data))

Relation between residual and hypernyms, helpfulness?

Hypernyms

# All words: Kuperman-adjusted production residual against hypernym count.
all_words_kuperman_resid %>%
  ggplot(aes(x = hypernyms, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Helpfulness

# All words: Kuperman-adjusted production residual against mean helpfulness.
all_words_kuperman_resid %>%
  ggplot(aes(x = resp_mean, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Model all words/predictors (all data)

Base model

# Pooled baseline model: Kuperman-adjusted production residual regressed on the
# lexical predictors, restricted to nouns, verbs and adjectives.
# `scale()` inside the formula standardises over the rows that survive the
# filter. The formula stays inline with `data = .` so the printed Call is
# unchanged.
all_words_no_pos <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      Zipf.value +
      Conc.M +
      neighbour_concentration,
    data = .
  )
summary(all_words_no_pos)
## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + Zipf.value + Conc.M + neighbour_concentration, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43260 -0.14684 -0.03654  0.13315  0.51495 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)  
## (Intercept)              -0.222360   0.246081  -0.904   0.3665  
## scale(hypernyms)          0.022692   0.009502   2.388   0.0172 *
## scale(log(hyponyms + 1))  0.005514   0.008967   0.615   0.5389  
## Zipf.value                0.007834   0.013154   0.596   0.5517  
## Conc.M                    0.016098   0.013209   1.219   0.2234  
## neighbour_concentration   0.313034   0.626853   0.499   0.6177  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2064 on 664 degrees of freedom
##   (37 observations deleted due to missingness)
## Multiple R-squared:  0.02069,    Adjusted R-squared:  0.01332 
## F-statistic: 2.806 on 5 and 664 DF,  p-value: 0.01615

Include PoS

# Pooled model with part of speech added as a factor.
# Coding: 1 = Noun (reference level), 2 = Verb, 3 = Adjective; the filter
# guarantees no other value reaches the fallback branch.
all_words_with_pos <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(
    dom_pos_factor = as.factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      TRUE ~ 3
    ))
  ) %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      dom_pos_factor +
      Zipf.value +
      Conc.M +
      neighbour_concentration,
    data = .
  )
summary(all_words_with_pos)
## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + dom_pos_factor + Zipf.value + Conc.M + neighbour_concentration, 
##     data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.47455 -0.14552 -0.01863  0.13471  0.54103 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.2233074  0.2446068  -0.913 0.361615    
## scale(hypernyms)          0.0005508  0.0108786   0.051 0.959631    
## scale(log(hyponyms + 1))  0.0029732  0.0088188   0.337 0.736119    
## dom_pos_factor2          -0.0910299  0.0268306  -3.393 0.000733 ***
## dom_pos_factor3           0.1677100  0.0452126   3.709 0.000225 ***
## Zipf.value                0.0142485  0.0129325   1.102 0.270965    
## Conc.M                    0.0127926  0.0136361   0.938 0.348514    
## neighbour_concentration   0.3159452  0.6158518   0.513 0.608107    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2021 on 662 degrees of freedom
##   (37 observations deleted due to missingness)
## Multiple R-squared:  0.06387,    Adjusted R-squared:  0.05397 
## F-statistic: 6.452 on 7 and 662 DF,  p-value: 2.278e-07

Include helpfulness

# Pooled model with part of speech and mean helpfulness (`resp_mean`) added.
# Coding: 1 = Noun (reference level), 2 = Verb, 3 = Adjective; the filter
# guarantees no other value reaches the fallback branch.
all_words_with_pos_helpful <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(
    dom_pos_factor = as.factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      TRUE ~ 3
    ))
  ) %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      dom_pos_factor +
      Zipf.value +
      Conc.M +
      neighbour_concentration +
      resp_mean,
    data = .
  )
summary(all_words_with_pos_helpful)
## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + dom_pos_factor + Zipf.value + Conc.M + neighbour_concentration + 
##     resp_mean, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.48535 -0.13558 -0.02497  0.12890  0.62788 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.3472242  0.2359209  -1.472 0.141560    
## scale(hypernyms)         -0.0015503  0.0104728  -0.148 0.882362    
## scale(log(hyponyms + 1))  0.0007269  0.0084795   0.086 0.931715    
## dom_pos_factor2          -0.1144256  0.0260788  -4.388 1.33e-05 ***
## dom_pos_factor3           0.1483719  0.0434479   3.415 0.000677 ***
## Zipf.value               -0.0168297  0.0131192  -1.283 0.200005    
## Conc.M                    0.0038806  0.0131855   0.294 0.768617    
## neighbour_concentration   0.4140774  0.5933213   0.698 0.485489    
## resp_mean                 0.0821740  0.0107465   7.647 7.38e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1938 on 655 degrees of freedom
##   (43 observations deleted due to missingness)
## Multiple R-squared:  0.1422, Adjusted R-squared:  0.1317 
## F-statistic: 13.58 on 8 and 655 DF,  p-value: < 2.2e-16

This model is not great! So much variance unaccounted for… why?

# Collapse `type` into two classes: "CDI" versus everything else ("non-CDI").
# Rows with a missing `type` stay NA.
all_words_all_features_typed <- mutate(
  all_words_all_features,
  type_broad = if_else(type == "CDI", "CDI", "non-CDI")
)

# Production against Kuperman AoA, with a separate linear fit per word class.
all_words_all_features_typed %>%
  ggplot(aes(x = KupermanAoA, y = all_ages_says, color = type_broad)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Seems like we should look at CDI and non-CDI words separately, as very different things are going on.

CDI words only

# CDI words only.
cdi_words_all_features <- filter(all_words_all_features, type == "CDI")

# Regress production on Kuperman AoA within the CDI words. `na.exclude` pads
# the residuals with NA for rows with a missing model variable, so `resid()`
# always has one value per row of `cdi_words_all_features`; the default
# `na.omit` would return a shorter vector and break the `mutate()` below.
base_mod_CDI <- lm(all_ages_says ~ KupermanAoA, data = cdi_words_all_features,
                   na.action = na.exclude)

# Attach the Kuperman-adjusted production residual to each CDI word.
CDI_kuperman_resid <- cdi_words_all_features %>%
  mutate(all_ages_says.Kuperman = resid(base_mod_CDI))

Relation between residual and hypernyms, helpfulness?

Hypernyms

# CDI words: Kuperman-adjusted production residual against hypernym count.
CDI_kuperman_resid %>%
  ggplot(aes(x = hypernyms, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Helpfulness

# CDI words: Kuperman-adjusted production residual against mean helpfulness.
CDI_kuperman_resid %>%
  ggplot(aes(x = resp_mean, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Models: CDI data

Base model

# Shared modelling table for the CDI models: nouns, verbs and adjectives only,
# with part of speech coded as a factor
# (1 = Noun, the reference level; 2 = Verb; 3 = Adjective).
CDI_kuperman_resid_filtered <- CDI_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(
    dom_pos_factor = as.factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      TRUE ~ 3
    ))
  )

# CDI baseline model: lexical predictors only (no PoS, no helpfulness).
CDI_kuperman_resid_base <- CDI_kuperman_resid_filtered %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      Zipf.value +
      Conc.M +
      neighbour_concentration,
    data = .
  )

summary(CDI_kuperman_resid_base)
## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + Zipf.value + Conc.M + neighbour_concentration, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.36509 -0.11983 -0.00431  0.11049  0.48936 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)  
## (Intercept)              -0.036110   0.278892  -0.129   0.8970  
## scale(hypernyms)          0.029495   0.011627   2.537   0.0116 *
## scale(log(hyponyms + 1))  0.004650   0.009912   0.469   0.6392  
## Zipf.value                0.003637   0.015620   0.233   0.8160  
## Conc.M                    0.015923   0.020342   0.783   0.4343  
## neighbour_concentration  -0.143816   0.661743  -0.217   0.8281  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1711 on 388 degrees of freedom
##   (28 observations deleted due to missingness)
## Multiple R-squared:  0.04077,    Adjusted R-squared:  0.02841 
## F-statistic: 3.299 on 5 and 388 DF,  p-value: 0.006259

Include PoS

# CDI model with the part-of-speech factor added to the baseline predictors.
CDI_kuperman_resid_pos <- CDI_kuperman_resid_filtered %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      Zipf.value +
      dom_pos_factor +
      Conc.M +
      neighbour_concentration,
    data = .
  )

summary(CDI_kuperman_resid_pos)
## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + Zipf.value + dom_pos_factor + Conc.M + neighbour_concentration, 
##     data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.35410 -0.11657 -0.00737  0.10965  0.48680 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)   
## (Intercept)              -0.099825   0.289129  -0.345  0.73008   
## scale(hypernyms)          0.036754   0.014166   2.595  0.00983 **
## scale(log(hyponyms + 1))  0.004709   0.009930   0.474  0.63565   
## Zipf.value                0.004139   0.015714   0.263  0.79237   
## dom_pos_factor2           0.034858   0.038020   0.917  0.35980   
## dom_pos_factor3          -0.058153   0.172233  -0.338  0.73582   
## Conc.M                    0.024192   0.022228   1.088  0.27710   
## neighbour_concentration  -0.095383   0.665752  -0.143  0.88615   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1713 on 386 degrees of freedom
##   (28 observations deleted due to missingness)
## Multiple R-squared:  0.04317,    Adjusted R-squared:  0.02581 
## F-statistic: 2.488 on 7 and 386 DF,  p-value: 0.0165

Include helpfulness

# CDI model with both the part-of-speech factor and mean helpfulness
# (`resp_mean`) added to the baseline predictors.
CDI_kuperman_resid_pos_helpful <- CDI_kuperman_resid_filtered %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      Zipf.value +
      dom_pos_factor +
      Conc.M +
      neighbour_concentration +
      resp_mean,
    data = .
  )

summary(CDI_kuperman_resid_pos_helpful)
## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + Zipf.value + dom_pos_factor + Conc.M + neighbour_concentration + 
##     resp_mean, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.42930 -0.09641 -0.00274  0.08358  0.37673 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.631164   0.249443  -2.530 0.011794 *  
## scale(hypernyms)          0.044370   0.012051   3.682 0.000264 ***
## scale(log(hyponyms + 1))  0.015076   0.008479   1.778 0.076188 .  
## Zipf.value               -0.019896   0.013494  -1.474 0.141181    
## dom_pos_factor2           0.024519   0.032312   0.759 0.448420    
## dom_pos_factor3          -0.108785   0.146383  -0.743 0.457843    
## Conc.M                    0.038314   0.018919   2.025 0.043542 *  
## neighbour_concentration   0.141186   0.565935   0.249 0.803128    
## resp_mean                 0.141600   0.011570  12.239  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1455 on 385 degrees of freedom
##   (28 observations deleted due to missingness)
## Multiple R-squared:  0.3112, Adjusted R-squared:  0.2969 
## F-statistic: 21.74 on 8 and 385 DF,  p-value: < 2.2e-16

Compare models

# Sequential F tests across the nested CDI models:
# baseline -> + PoS -> + PoS + helpfulness.
anova(
  CDI_kuperman_resid_base,
  CDI_kuperman_resid_pos,
  CDI_kuperman_resid_pos_helpful
)
## Analysis of Variance Table
## 
## Model 1: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + Zipf.value + Conc.M + neighbour_concentration
## Model 2: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + Zipf.value + dom_pos_factor + Conc.M + neighbour_concentration
## Model 3: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + Zipf.value + dom_pos_factor + Conc.M + neighbour_concentration + 
##     resp_mean
##   Res.Df     RSS Df Sum of Sq        F Pr(>F)    
## 1    388 11.3567                                 
## 2    386 11.3284  2    0.0283   0.6688 0.5129    
## 3    385  8.1554  1    3.1730 149.7923 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

For CDI words, including PoS doesn’t improve model fit, but including helpfulness does.

If we look just at non-CDI words, there seem to be 2 clusters - AoAs overestimate some and underestimate others

# Non-CDI words: production against Kuperman AoA, with a linear fit.
all_words_all_features_typed %>%
  filter(type_broad == "non-CDI") %>%
  ggplot(aes(x = KupermanAoA, y = all_ages_says)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Non-CDI words only

# Rows whose `type` is anything other than "CDI".
non_CDI_all_features <- filter(all_words_all_features, type != "CDI")

# Regress production on Kuperman AoA within the non-CDI words. `na.exclude`
# pads the residuals with NA for rows with a missing model variable, so
# `resid()` always has one value per row of `non_CDI_all_features`; the default
# `na.omit` would return a shorter vector and break the `mutate()` below.
base_mod_nonCDI <- lm(all_ages_says ~ KupermanAoA, data = non_CDI_all_features,
                      na.action = na.exclude)

# Attach the Kuperman-adjusted production residual to each non-CDI word.
nonCDI_kuperman_resid <- non_CDI_all_features %>%
  mutate(all_ages_says.Kuperman = resid(base_mod_nonCDI))

Relation between residual and hypernyms, helpfulness?

Hypernyms

# Non-CDI words: Kuperman-adjusted production residual against hypernym count.
nonCDI_kuperman_resid %>%
  ggplot(aes(x = hypernyms, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Helpfulness

# Non-CDI words: Kuperman-adjusted production residual against mean helpfulness.
nonCDI_kuperman_resid %>%
  ggplot(aes(x = resp_mean, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Models

Base model

# Non-CDI baseline model: lexical predictors only. Rows without a helpfulness
# rating are removed up front even though `resp_mean` is not in this formula.
base_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  filter(!is.na(resp_mean)) %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      Zipf.value +
      Conc.M +
      neighbour_concentration,
    data = .
  )
summary(base_resid_nonCDI)
## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + Zipf.value + Conc.M + neighbour_concentration, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50003 -0.11004 -0.00252  0.10841  0.72808 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.62384    0.32217  -1.936  0.05389 .  
## scale(hypernyms)          0.07237    0.01118   6.472 4.67e-10 ***
## scale(log(hyponyms + 1))  0.03647    0.01132   3.222  0.00143 ** 
## Zipf.value                0.03752    0.01578   2.378  0.01814 *  
## Conc.M                    0.10663    0.01476   7.226 5.35e-12 ***
## neighbour_concentration   0.06275    0.85159   0.074  0.94131    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1686 on 264 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.3315, Adjusted R-squared:  0.3188 
## F-statistic: 26.18 on 5 and 264 DF,  p-value: < 2.2e-16

Including PoS

# Non-CDI model with part of speech added. Rows without a helpfulness rating
# are removed up front even though `resp_mean` is not in this formula.
# PoS coding: 1 = Noun (reference level), 2 = Verb, 3 = Adjective.
pos_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  filter(!is.na(resp_mean)) %>%
  mutate(
    dom_pos_factor = as.factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      TRUE ~ 3
    ))
  ) %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      dom_pos_factor +
      Zipf.value +
      Conc.M +
      neighbour_concentration,
    data = .
  )

summary(pos_resid_nonCDI)
## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + dom_pos_factor + Zipf.value + Conc.M + neighbour_concentration, 
##     data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49037 -0.08904  0.00136  0.08419  0.74659 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.56984    0.30952  -1.841 0.066743 .  
## scale(hypernyms)          0.04432    0.01180   3.754 0.000214 ***
## scale(log(hyponyms + 1))  0.02702    0.01091   2.477 0.013892 *  
## dom_pos_factor2          -0.13282    0.02793  -4.755 3.28e-06 ***
## dom_pos_factor3           0.07647    0.03765   2.031 0.043269 *  
## Zipf.value                0.04634    0.01505   3.079 0.002295 ** 
## Conc.M                    0.09337    0.01458   6.406 6.90e-10 ***
## neighbour_concentration   0.03416    0.81178   0.042 0.966469    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1599 on 262 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.4034, Adjusted R-squared:  0.3875 
## F-statistic: 25.31 on 7 and 262 DF,  p-value: < 2.2e-16

Including PoS and helpfulness

# Non-CDI model with part of speech and mean helpfulness (`resp_mean`).
# Rows without a helpfulness rating are dropped *before* fitting, exactly as in
# the base and PoS-only non-CDI models. `scale()` inside the formula
# standardises over whatever rows reach `lm()`, so without this filter the
# scaled predictors here would be centred/scaled on a different row set than in
# the models this one is compared against, and their coefficients would not be
# directly comparable. The fitted values and residual sum of squares are
# unaffected.
# PoS coding: 1 = Noun (reference level), 2 = Verb, 3 = Adjective.
helpful_pos_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective"), !is.na(resp_mean)) %>%
  mutate(dom_pos_factor = as.factor(ifelse(dom_pos == "Noun", 1, ifelse(dom_pos == "Verb", 2, 3)))) %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      dom_pos_factor + Zipf.value + Conc.M + neighbour_concentration +
      resp_mean,
    data = .
  )

summary(helpful_pos_resid_nonCDI)
## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + dom_pos_factor + Zipf.value + Conc.M + neighbour_concentration + 
##     resp_mean, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43060 -0.09404 -0.00433  0.08122  0.78549 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.57774    0.30370  -1.902 0.058223 .  
## scale(hypernyms)          0.04618    0.01156   3.996 8.40e-05 ***
## scale(log(hyponyms + 1))  0.02222    0.01071   2.074 0.039089 *  
## dom_pos_factor2          -0.13981    0.02749  -5.087 6.97e-07 ***
## dom_pos_factor3           0.05482    0.03751   1.462 0.145035    
## Zipf.value                0.02356    0.01626   1.448 0.148708    
## Conc.M                    0.09093    0.01432   6.350 9.50e-10 ***
## neighbour_concentration  -0.03000    0.79672  -0.038 0.969997    
## resp_mean                 0.04769    0.01427   3.341 0.000958 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1569 on 261 degrees of freedom
##   (15 observations deleted due to missingness)
## Multiple R-squared:  0.4279, Adjusted R-squared:  0.4104 
## F-statistic:  24.4 on 8 and 261 DF,  p-value: < 2.2e-16

Compare models

# Sequential F tests across the nested non-CDI models:
# baseline -> + PoS -> + PoS + helpfulness.
anova(
  base_resid_nonCDI,
  pos_resid_nonCDI,
  helpful_pos_resid_nonCDI
)
## Analysis of Variance Table
## 
## Model 1: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + Zipf.value + Conc.M + neighbour_concentration
## Model 2: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + dom_pos_factor + Zipf.value + Conc.M + neighbour_concentration
## Model 3: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + dom_pos_factor + Zipf.value + Conc.M + neighbour_concentration + 
##     resp_mean
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    264 7.5056                                  
## 2    262 6.6979  2   0.80762 16.408 1.939e-07 ***
## 3    261 6.4233  1   0.27467 11.161 0.0009576 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Accounting for PoS and helpfulness also improves model fit for non-CDI words.