# Load the word-level data set and drop the column named `X`
# (presumably the row-index column added when the CSV was written -- confirm).
all_words_all_features <-
  read.csv(file = "all_words_with_norms_and_helpfulness.csv") %>%
  select(-X)
Correlations among predictors
# Columns entering the correlation plot. hyponyms is transformed as
# log(hyponyms + 1) and resp_mean is relabelled "helpfulness" for display.
corr_data <- all_words_all_features %>%
  mutate(log_hyponyms = log(hyponyms + 1)) %>%
  select(
    all_ages_says, KupermanAoA, hypernyms, log_hyponyms, Conc.M,
    Zipf.value, neighbour_concentration, helpfulness = resp_mean
  )

# Pairwise Pearson correlations, using every complete pair of observations.
corr_vars <- cor(corr_data, use = "pairwise.complete.obs", method = "pearson")

# The significance tests must be run on the raw observations, not on the
# correlation matrix: cor.mtest() calls cor.test() on each pair of columns of
# its input, so passing `corr_vars` would test correlations among the columns
# of the 8 x 8 correlation matrix rather than among the variables themselves.
p.mat <- cor.mtest(corr_data)
pMatrix <- p.mat$p

# Lower-triangle heat map with coefficients printed in each cell; cells whose
# p-value exceeds sig.level are left blank.
corrplot(
  corr_vars,
  method = "color", type = "lower", diag = TRUE, addCoef.col = "black",
  tl.col = "black", number.font = 2, number.cex = .8,
  p.mat = pMatrix, sig.level = 0.05, insig = "blank"
)

How does helpfulness correlate with Kuperman, hypernyms, and actual production?
All words
Kuperman
# Scatter plot of helpfulness (resp_mean) against KupermanAoA for all words,
# with a linear fit overlaid.
all_words_all_features %>%
  ggplot(mapping = aes(x = KupermanAoA, y = resp_mean)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Hypernyms
# Scatter plot of helpfulness (resp_mean) against hypernyms for all words,
# with a linear fit overlaid.
all_words_all_features %>%
  ggplot(mapping = aes(x = hypernyms, y = resp_mean)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Production
# Scatter plot of helpfulness (resp_mean) against all_ages_says for all words,
# with a linear fit overlaid.
all_words_all_features %>%
  ggplot(mapping = aes(x = all_ages_says, y = resp_mean)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Non-CDI words only
Kuperman
# Same helpfulness ~ KupermanAoA plot, restricted to rows whose type is not
# "CDI".
all_words_all_features %>%
  filter(type != "CDI") %>%
  ggplot(mapping = aes(x = KupermanAoA, y = resp_mean)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Hypernyms
# Same helpfulness ~ hypernyms plot, restricted to rows whose type is not
# "CDI".
all_words_all_features %>%
  filter(type != "CDI") %>%
  ggplot(mapping = aes(x = hypernyms, y = resp_mean)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Production
# Same helpfulness ~ all_ages_says plot, restricted to rows whose type is not
# "CDI".
all_words_all_features %>%
  filter(type != "CDI") %>%
  ggplot(mapping = aes(x = all_ages_says, y = resp_mean)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Generate residuals for all_ages_says ~ Kuperman (all data)
# Regress all_ages_says on KupermanAoA over all words and store the residuals
# as a new column, all_ages_says.Kuperman.
#
# na.action = na.exclude makes resid() return one value per input row (NA
# where all_ages_says or KupermanAoA is missing), so the residuals always line
# up with the rows of the data frame. Under the usual default (na.omit) rows
# with missing values are dropped from the residual vector and mutate() fails
# on the length mismatch. When nothing is missing the result is unchanged.
base_mod <- lm(
  all_ages_says ~ KupermanAoA,
  data = all_words_all_features,
  na.action = na.exclude
)
all_words_kuperman_resid <- all_words_all_features %>%
  mutate(all_ages_says.Kuperman = resid(base_mod))
Relation between residual and hypernyms, helpfulness?
Hypernyms
# Residual of all_ages_says ~ KupermanAoA plotted against hypernyms (all
# words), with a linear fit overlaid.
all_words_kuperman_resid %>%
  ggplot(mapping = aes(x = hypernyms, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Helpfulness
# Residual of all_ages_says ~ KupermanAoA plotted against helpfulness
# (resp_mean) for all words, with a linear fit overlaid.
all_words_kuperman_resid %>%
  ggplot(mapping = aes(x = resp_mean, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Model all words/predictors
# Linear model of the KupermanAoA-adjusted residual on all predictors, fitted
# to every word. hypernyms and log(hyponyms + 1) are standardised inside the
# formula; the other predictors enter on their original scale.
# `data = .` is kept so the Call printed by summary() stays the same.
all_words_kuperman_resid %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) + scale(log(hyponyms + 1)) + Zipf.value +
      Conc.M + neighbour_concentration + resp_mean,
    data = .
  ) %>%
  summary()
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + Zipf.value + Conc.M + neighbour_concentration + resp_mean,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43726 -0.13292 -0.03494 0.13428 0.73275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.779794 0.207793 -3.753 0.000188 ***
## scale(hypernyms) 0.029195 0.008124 3.594 0.000347 ***
## scale(log(hyponyms + 1)) -0.014061 0.007699 -1.826 0.068167 .
## Zipf.value 0.028729 0.011958 2.402 0.016522 *
## Conc.M 0.015243 0.011563 1.318 0.187782
## neighbour_concentration 1.381596 0.531050 2.602 0.009457 **
## resp_mean 0.025948 0.009874 2.628 0.008765 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1945 on 766 degrees of freedom
## (54 observations deleted due to missingness)
## Multiple R-squared: 0.05239, Adjusted R-squared: 0.04496
## F-statistic: 7.058 on 6 and 766 DF, p-value: 2.499e-07
CDI words only
# Keep only rows whose type is "CDI", regress all_ages_says on KupermanAoA
# within that subset, and store the residuals as all_ages_says.Kuperman.
cdi_words_all_features <- filter(all_words_all_features, type == "CDI")

# na.action = na.exclude makes resid() return one value per input row (NA
# where all_ages_says or KupermanAoA is missing), so the residuals always line
# up with the rows of the data frame. Under the usual default (na.omit) rows
# with missing values are dropped from the residual vector and mutate() fails
# on the length mismatch. When nothing is missing the result is unchanged.
base_mod_CDI <- lm(
  all_ages_says ~ KupermanAoA,
  data = cdi_words_all_features,
  na.action = na.exclude
)
CDI_kuperman_resid <- cdi_words_all_features %>%
  mutate(all_ages_says.Kuperman = resid(base_mod_CDI))
Relation between residual and hypernyms, helpfulness?
Hypernyms
# CDI subset: residual of all_ages_says ~ KupermanAoA plotted against
# hypernyms, with a linear fit overlaid.
CDI_kuperman_resid %>%
  ggplot(mapping = aes(x = hypernyms, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Helpfulness
# CDI subset: residual of all_ages_says ~ KupermanAoA plotted against
# helpfulness (resp_mean), with a linear fit overlaid.
CDI_kuperman_resid %>%
  ggplot(mapping = aes(x = resp_mean, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Model
# Linear model of the KupermanAoA-adjusted residual on all predictors, fitted
# to the CDI subset. hypernyms and log(hyponyms + 1) are standardised inside
# the formula; the other predictors enter on their original scale.
# `data = .` is kept so the Call printed by summary() stays the same.
CDI_kuperman_resid %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) + scale(log(hyponyms + 1)) + Zipf.value +
      Conc.M + neighbour_concentration + resp_mean,
    data = .
  ) %>%
  summary()
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + Zipf.value + Conc.M + neighbour_concentration + resp_mean,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30794 -0.08980 -0.00372 0.07845 0.53471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.577548 0.201070 -2.872 0.004278 **
## scale(hypernyms) 0.024454 0.008264 2.959 0.003259 **
## scale(log(hyponyms + 1)) 0.001960 0.007280 0.269 0.787926
## Zipf.value 0.017334 0.011759 1.474 0.141209
## Conc.M 0.054509 0.014833 3.675 0.000268 ***
## neighbour_concentration 0.401475 0.477596 0.841 0.401034
## resp_mean 0.030259 0.009822 3.081 0.002198 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1324 on 426 degrees of freedom
## (33 observations deleted due to missingness)
## Multiple R-squared: 0.1212, Adjusted R-squared: 0.1088
## F-statistic: 9.787 on 6 and 426 DF, p-value: 4.089e-10
Non-CDI words only
# Keep only rows whose type is not "CDI", regress all_ages_says on
# KupermanAoA within that subset, and store the residuals as
# all_ages_says.Kuperman.
non_CDI_all_features <- filter(all_words_all_features, type != "CDI")

# na.action = na.exclude makes resid() return one value per input row (NA
# where all_ages_says or KupermanAoA is missing), so the residuals always line
# up with the rows of the data frame. Under the usual default (na.omit) rows
# with missing values are dropped from the residual vector and mutate() fails
# on the length mismatch. When nothing is missing the result is unchanged.
base_mod_nonCDI <- lm(
  all_ages_says ~ KupermanAoA,
  data = non_CDI_all_features,
  na.action = na.exclude
)
nonCDI_kuperman_resid <- non_CDI_all_features %>%
  mutate(all_ages_says.Kuperman = resid(base_mod_nonCDI))
Relation between residual and hypernyms, helpfulness?
Hypernyms
# Non-CDI subset: residual of all_ages_says ~ KupermanAoA plotted against
# hypernyms, with a linear fit overlaid.
nonCDI_kuperman_resid %>%
  ggplot(mapping = aes(x = hypernyms, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Helpfulness
# Non-CDI subset: residual of all_ages_says ~ KupermanAoA plotted against
# helpfulness (resp_mean), with a linear fit overlaid.
nonCDI_kuperman_resid %>%
  ggplot(mapping = aes(x = resp_mean, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

Model
# Linear model of the KupermanAoA-adjusted residual on all predictors, fitted
# to the non-CDI subset. hypernyms and log(hyponyms + 1) are standardised
# inside the formula; the other predictors enter on their original scale.
# `data = .` is kept so the Call printed by summary() stays the same.
nonCDI_kuperman_resid %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) + scale(log(hyponyms + 1)) + Zipf.value +
      Conc.M + neighbour_concentration + resp_mean,
    data = .
  ) %>%
  summary()
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + Zipf.value + Conc.M + neighbour_concentration + resp_mean,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.46904 -0.11695 0.00317 0.10453 0.78451
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.274945 0.267397 -4.768 2.79e-06 ***
## scale(hypernyms) 0.065575 0.009688 6.769 5.87e-11 ***
## scale(log(hyponyms + 1)) 0.021557 0.009605 2.244 0.02546 *
## Zipf.value 0.026740 0.014432 1.853 0.06478 .
## Conc.M 0.095098 0.013210 7.199 4.05e-12 ***
## neighbour_concentration 1.873870 0.707358 2.649 0.00846 **
## resp_mean 0.031359 0.012322 2.545 0.01138 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1673 on 333 degrees of freedom
## (21 observations deleted due to missingness)
## Multiple R-squared: 0.3173, Adjusted R-squared: 0.305
## F-statistic: 25.79 on 6 and 333 DF, p-value: < 2.2e-16