# Load the per-word table of norms and helpfulness ratings, discarding the `X`
# column (presumably the row-index column left by an earlier write.csv --
# TODO confirm).
all_words_all_features <- select(
  read.csv("all_words_with_norms_and_helpfulness.csv"),
  -X
)

Correlations among predictors

# Variables entering the correlation plot. hyponyms is log-transformed with a
# +1 offset so that words with zero hyponyms stay finite; resp_mean is relabelled
# "helpfulness" for display.
corr_data <- all_words_all_features %>%
  mutate(log_hyponyms = log(hyponyms + 1)) %>%
  select(
    all_ages_says, KupermanAoA, hypernyms, log_hyponyms, Conc.M, Zipf.value,
    neighbour_concentration, helpfulness = resp_mean
  )

# Pairwise Pearson correlations (each pair uses the rows where both are present).
corr_vars <- cor(corr_data, use = "pairwise.complete.obs", method = "pearson")

# Significance tests have to be computed from the raw observations. Passing the
# correlation matrix itself (as was done previously) tests correlations between
# the columns of that 8 x 8 matrix, which yields meaningless p-values.
p.mat <- cor.mtest(corr_data)
pMatrix <- p.mat$p

# Lower-triangle heat map with coefficients printed; cells whose p-value is
# above 0.05 are left blank.
corrplot(
  corr_vars,
  method = "color", type = "lower", diag = TRUE, addCoef.col = "black",
  tl.col = "black", number.font = 2, number.cex = .8,
  p.mat = pMatrix, sig.level = 0.05, insig = "blank"
)

How does helpfulness correlate with Kuperman, hypernyms, and actual production?

All words

Kuperman

# Helpfulness rating (resp_mean) against Kuperman AoA for every word, with a
# straight-line fit overlaid.
all_words_all_features %>%
  ggplot(mapping = aes(x = KupermanAoA, y = resp_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Hypernyms

# Helpfulness rating (resp_mean) against hypernym count for every word, with a
# straight-line fit overlaid.
all_words_all_features %>%
  ggplot(mapping = aes(x = hypernyms, y = resp_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Production

# Helpfulness rating (resp_mean) against the production measure (all_ages_says)
# for every word, with a straight-line fit overlaid.
all_words_all_features %>%
  ggplot(mapping = aes(x = all_ages_says, y = resp_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Non-CDI words only

Kuperman

# Same Kuperman AoA vs. helpfulness scatter, restricted to rows whose type is
# not "CDI".
all_words_all_features %>%
  filter(type != "CDI") %>%
  ggplot(mapping = aes(x = KupermanAoA, y = resp_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Hypernyms

# Same hypernyms vs. helpfulness scatter, restricted to rows whose type is not
# "CDI".
all_words_all_features %>%
  filter(type != "CDI") %>%
  ggplot(mapping = aes(x = hypernyms, y = resp_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Production

# Same production (all_ages_says) vs. helpfulness scatter, restricted to rows
# whose type is not "CDI".
all_words_all_features %>%
  filter(type != "CDI") %>%
  ggplot(mapping = aes(x = all_ages_says, y = resp_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Generate residuals for all_ages_says ~ Kuperman (all data)

# Baseline model: production (all_ages_says) predicted from Kuperman AoA alone,
# fitted on every word.
# na.exclude keeps resid() aligned with the input rows: any row dropped for a
# missing value gets an NA residual instead of shortening the vector, which
# would otherwise make the mutate() below fail on a length mismatch.
base_mod <- lm(
  all_ages_says ~ KupermanAoA,
  data = all_words_all_features,
  na.action = na.exclude
)

# Attach the baseline residual (production not explained by Kuperman AoA) to
# each word.
all_words_kuperman_resid <- all_words_all_features %>%
  mutate(all_ages_says.Kuperman = resid(base_mod))

Relation between residual and hypernyms, helpfulness?

Hypernyms

# Kuperman-residualised production against hypernym count (all words), with a
# straight-line fit overlaid.
all_words_kuperman_resid %>%
  ggplot(mapping = aes(x = hypernyms, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Helpfulness

# Kuperman-residualised production against helpfulness rating (all words), with
# a straight-line fit overlaid.
all_words_kuperman_resid %>%
  ggplot(mapping = aes(x = resp_mean, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Model all words/predictors

# Regress the Kuperman-residualised production score on all predictors, using
# every word. hypernyms and log(hyponyms + 1) are z-scored via scale(); the
# remaining predictors enter on their original scale.
# `data = .` is kept so the printed Call matches the recorded output.
all_words_kuperman_resid %>%
  lm(
    formula = all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      Zipf.value +
      Conc.M +
      neighbour_concentration +
      resp_mean,
    data = .
  ) %>%
  summary()
## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + Zipf.value + Conc.M + neighbour_concentration + resp_mean, 
##     data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43726 -0.13292 -0.03494  0.13428  0.73275 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.779794   0.207793  -3.753 0.000188 ***
## scale(hypernyms)          0.029195   0.008124   3.594 0.000347 ***
## scale(log(hyponyms + 1)) -0.014061   0.007699  -1.826 0.068167 .  
## Zipf.value                0.028729   0.011958   2.402 0.016522 *  
## Conc.M                    0.015243   0.011563   1.318 0.187782    
## neighbour_concentration   1.381596   0.531050   2.602 0.009457 ** 
## resp_mean                 0.025948   0.009874   2.628 0.008765 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1945 on 766 degrees of freedom
##   (54 observations deleted due to missingness)
## Multiple R-squared:  0.05239,    Adjusted R-squared:  0.04496 
## F-statistic: 7.058 on 6 and 766 DF,  p-value: 2.499e-07

CDI words only

# Keep only the rows whose type is "CDI".
cdi_words_all_features <- all_words_all_features %>%
  filter(type == "CDI")

# Baseline model refitted on the CDI subset: production predicted from Kuperman
# AoA alone.
# na.exclude keeps resid() aligned with the input rows: any row dropped for a
# missing value gets an NA residual instead of shortening the vector, which
# would otherwise make the mutate() below fail on a length mismatch.
base_mod_CDI <- lm(
  all_ages_says ~ KupermanAoA,
  data = cdi_words_all_features,
  na.action = na.exclude
)

# Attach the subset-specific baseline residual to each CDI word.
CDI_kuperman_resid <- cdi_words_all_features %>%
  mutate(all_ages_says.Kuperman = resid(base_mod_CDI))

Relation between residual and hypernyms, helpfulness?

Hypernyms

# Kuperman-residualised production against hypernym count (CDI words), with a
# straight-line fit overlaid.
CDI_kuperman_resid %>%
  ggplot(mapping = aes(x = hypernyms, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Helpfulness

# Kuperman-residualised production against helpfulness rating (CDI words), with
# a straight-line fit overlaid.
CDI_kuperman_resid %>%
  ggplot(mapping = aes(x = resp_mean, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Model

# Regress the Kuperman-residualised production score on all predictors, using
# CDI words only. hypernyms and log(hyponyms + 1) are z-scored via scale(); the
# remaining predictors enter on their original scale.
# `data = .` is kept so the printed Call matches the recorded output.
CDI_kuperman_resid %>%
  lm(
    formula = all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      Zipf.value +
      Conc.M +
      neighbour_concentration +
      resp_mean,
    data = .
  ) %>%
  summary()
## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + Zipf.value + Conc.M + neighbour_concentration + resp_mean, 
##     data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.30794 -0.08980 -0.00372  0.07845  0.53471 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.577548   0.201070  -2.872 0.004278 ** 
## scale(hypernyms)          0.024454   0.008264   2.959 0.003259 ** 
## scale(log(hyponyms + 1))  0.001960   0.007280   0.269 0.787926    
## Zipf.value                0.017334   0.011759   1.474 0.141209    
## Conc.M                    0.054509   0.014833   3.675 0.000268 ***
## neighbour_concentration   0.401475   0.477596   0.841 0.401034    
## resp_mean                 0.030259   0.009822   3.081 0.002198 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1324 on 426 degrees of freedom
##   (33 observations deleted due to missingness)
## Multiple R-squared:  0.1212, Adjusted R-squared:  0.1088 
## F-statistic: 9.787 on 6 and 426 DF,  p-value: 4.089e-10

Non-CDI words only

# Keep only the rows whose type is not "CDI".
non_CDI_all_features <- all_words_all_features %>%
  filter(type != "CDI")

# Baseline model refitted on the non-CDI subset: production predicted from
# Kuperman AoA alone.
# na.exclude keeps resid() aligned with the input rows: any row dropped for a
# missing value gets an NA residual instead of shortening the vector, which
# would otherwise make the mutate() below fail on a length mismatch.
base_mod_nonCDI <- lm(
  all_ages_says ~ KupermanAoA,
  data = non_CDI_all_features,
  na.action = na.exclude
)

# Attach the subset-specific baseline residual to each non-CDI word.
nonCDI_kuperman_resid <- non_CDI_all_features %>%
  mutate(all_ages_says.Kuperman = resid(base_mod_nonCDI))

Relation between residual and hypernyms, helpfulness?

Hypernyms

# Kuperman-residualised production against hypernym count (non-CDI words), with
# a straight-line fit overlaid.
nonCDI_kuperman_resid %>%
  ggplot(mapping = aes(x = hypernyms, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Helpfulness

# Kuperman-residualised production against helpfulness rating (non-CDI words),
# with a straight-line fit overlaid.
nonCDI_kuperman_resid %>%
  ggplot(mapping = aes(x = resp_mean, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Model

# Regress the Kuperman-residualised production score on all predictors, using
# non-CDI words only. hypernyms and log(hyponyms + 1) are z-scored via scale();
# the remaining predictors enter on their original scale.
# `data = .` is kept so the printed Call matches the recorded output.
nonCDI_kuperman_resid %>%
  lm(
    formula = all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      Zipf.value +
      Conc.M +
      neighbour_concentration +
      resp_mean,
    data = .
  ) %>%
  summary()
## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + Zipf.value + Conc.M + neighbour_concentration + resp_mean, 
##     data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.46904 -0.11695  0.00317  0.10453  0.78451 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -1.274945   0.267397  -4.768 2.79e-06 ***
## scale(hypernyms)          0.065575   0.009688   6.769 5.87e-11 ***
## scale(log(hyponyms + 1))  0.021557   0.009605   2.244  0.02546 *  
## Zipf.value                0.026740   0.014432   1.853  0.06478 .  
## Conc.M                    0.095098   0.013210   7.199 4.05e-12 ***
## neighbour_concentration   1.873870   0.707358   2.649  0.00846 ** 
## resp_mean                 0.031359   0.012322   2.545  0.01138 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1673 on 333 degrees of freedom
##   (21 observations deleted due to missingness)
## Multiple R-squared:  0.3173, Adjusted R-squared:  0.305 
## F-statistic: 25.79 on 6 and 333 DF,  p-value: < 2.2e-16