# Load the word-level feature table and drop the `X` column
# (presumably the row-index column written by write.csv -- confirm).
all_words_all_features <- read.csv("all_words_with_norms_and_helpfulness_no_duplicate_words.csv") %>%
  select(-X)

# Z-score each predictor within its dominant part of speech (nouns, verbs and
# adjectives only).
# scale() returns a one-column matrix carrying centring/scaling attributes;
# as.numeric() flattens it so every new column is a plain numeric vector
# instead of a matrix embedded in the data frame.
# NOTE: the result is still grouped by dom_pos, as before; call ungroup()
# before any whole-table operation.
awaf_scaled <- all_words_all_features %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  group_by(dom_pos) %>%
  mutate(
    pos_scale_hypernyms = as.numeric(scale(hypernyms)),
    pos_scale_hyponyms = as.numeric(scale(hyponyms)),
    pos_scale_concreteness = as.numeric(scale(Conc.M)),
    pos_scale_freq = as.numeric(scale(Zipf.value)),
    pos_scale_helpful = as.numeric(scale(resp_mean))
  )

# Store words as character strings (read.csv may have produced a factor).
all_words_all_features$word <- as.character(all_words_all_features$word)

# Every word whose `type` is anything other than "CDI".
nonCDI_only <- filter(all_words_all_features, type != "CDI")
# Raw observations for the correlation analysis: one row per word, one column
# per variable (hyponym counts are log-transformed after adding 1).
corr_data <- all_words_all_features %>%
  mutate(log_hyponyms = log(hyponyms + 1)) %>%
  select(
    all_ages_says, KupermanAoA, hypernyms, log_hyponyms,
    concreteness = Conc.M, frequency = Zipf.value,
    neighbour_concentration, helpfulness = resp_mean
  )

# Pairwise Pearson correlations, using every complete pair of observations.
corr_vars <- cor(corr_data, use = "pairwise.complete.obs", method = "pearson")

# Significance tests must be computed from the raw data, not from the
# correlation matrix: passing `corr_vars` to cor.mtest() would test
# correlations between columns of the 8 x 8 correlation matrix itself.
p.mat <- cor.mtest(corr_data)
pMatrix <- p.mat$p

# Lower-triangle heat map with coefficients printed; cells whose p-value
# exceeds 0.05 are left blank.
corrplot(
  corr_vars,
  method = "color", type = "lower", diag = TRUE, addCoef.col = "black",
  tl.col = "black", number.font = 2, number.cex = .8,
  p.mat = pMatrix, sig.level = 0.05, insig = "blank"
)
# Helpfulness (resp_mean) plotted against three variables in turn, all words
# pooled. Each panel is a scatter plot with a fitted linear-model line.

# ... against KupermanAoA
all_words_all_features %>%
  ggplot(aes(KupermanAoA, resp_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

# ... against hypernyms
all_words_all_features %>%
  ggplot(aes(hypernyms, resp_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

# ... against all_ages_says
all_words_all_features %>%
  ggplot(aes(all_ages_says, resp_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
# The same three helpfulness scatter plots, restricted to words whose `type`
# is not "CDI".

# ... against KupermanAoA
all_words_all_features %>%
  filter(type != "CDI") %>%
  ggplot(aes(KupermanAoA, resp_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

# ... against hypernyms
all_words_all_features %>%
  filter(type != "CDI") %>%
  ggplot(aes(hypernyms, resp_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

# ... against all_ages_says
all_words_all_features %>%
  filter(type != "CDI") %>%
  ggplot(aes(all_ages_says, resp_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
# Residualise all_ages_says on KupermanAoA; the residual is stored as
# `all_ages_says.Kuperman` and used as the outcome in the models below.
# na.action = na.exclude makes resid() return one value per input row (NA for
# rows with a missing outcome or predictor). With the default na.omit the
# residual vector is shorter than the data frame whenever any row is
# incomplete, and the mutate() below would fail.
base_mod <- lm(
  all_ages_says ~ KupermanAoA,
  data = all_words_all_features,
  na.action = na.exclude
)
all_words_kuperman_resid <- all_words_all_features %>%
  mutate(all_ages_says.Kuperman = resid(base_mod))

# Same residualisation on the table with within-PoS scaled predictors.
base_mod_scaled_data <- lm(
  all_ages_says ~ KupermanAoA,
  data = awaf_scaled,
  na.action = na.exclude
)
scaled_pos_resid <- awaf_scaled %>%
  ungroup() %>%
  mutate(all_ages_says.Kuperman = resid(base_mod_scaled_data))
# Residualised all_ages_says (all words) against hypernyms, with a fitted
# linear-model line.
all_words_kuperman_resid %>%
  ggplot(aes(hypernyms, all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

# Residualised all_ages_says (all words) against helpfulness (resp_mean).
all_words_kuperman_resid %>%
  ggplot(aes(resp_mean, all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
# All words (nouns, verbs, adjectives): residualised all_ages_says predicted
# from the lexical predictors only, with no part-of-speech term.
all_words_no_pos <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      Zipf.value +
      Conc.M +
      neighbour_concentration,
    data = .
  )
summary(all_words_no_pos)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + Zipf.value + Conc.M + neighbour_concentration, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43260 -0.14684 -0.03654 0.13315 0.51495
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.222360 0.246081 -0.904 0.3665
## scale(hypernyms) 0.022692 0.009502 2.388 0.0172 *
## scale(log(hyponyms + 1)) 0.005514 0.008967 0.615 0.5389
## Zipf.value 0.007834 0.013154 0.596 0.5517
## Conc.M 0.016098 0.013209 1.219 0.2234
## neighbour_concentration 0.313034 0.626853 0.499 0.6177
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2064 on 664 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.02069, Adjusted R-squared: 0.01332
## F-statistic: 2.806 on 5 and 664 DF, p-value: 0.01615
# All words: the lexical-predictor model plus part of speech.
# dom_pos_factor codes Noun = 1 (reference level), Verb = 2, Adjective = 3;
# match() yields those codes directly because the filter leaves only these
# three values.
all_words_with_pos <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  mutate(
    dom_pos_factor = as.factor(match(dom_pos, c("Noun", "Verb", "Adjective")))
  ) %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      dom_pos_factor +
      Zipf.value +
      Conc.M +
      neighbour_concentration,
    data = .
  )
summary(all_words_with_pos)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + dom_pos_factor + Zipf.value + Conc.M + neighbour_concentration,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.47455 -0.14552 -0.01863 0.13471 0.54103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.2233074 0.2446068 -0.913 0.361615
## scale(hypernyms) 0.0005508 0.0108786 0.051 0.959631
## scale(log(hyponyms + 1)) 0.0029732 0.0088188 0.337 0.736119
## dom_pos_factor2 -0.0910299 0.0268306 -3.393 0.000733 ***
## dom_pos_factor3 0.1677100 0.0452126 3.709 0.000225 ***
## Zipf.value 0.0142485 0.0129325 1.102 0.270965
## Conc.M 0.0127926 0.0136361 0.938 0.348514
## neighbour_concentration 0.3159452 0.6158518 0.513 0.608107
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2021 on 662 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.06387, Adjusted R-squared: 0.05397
## F-statistic: 6.452 on 7 and 662 DF, p-value: 2.278e-07
# All words: lexical predictors, part of speech, and helpfulness (resp_mean).
# dom_pos_factor codes Noun = 1 (reference level), Verb = 2, Adjective = 3.
all_words_with_pos_helpful <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  mutate(
    dom_pos_factor = as.factor(match(dom_pos, c("Noun", "Verb", "Adjective")))
  ) %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      dom_pos_factor +
      Zipf.value +
      Conc.M +
      neighbour_concentration +
      resp_mean,
    data = .
  )
summary(all_words_with_pos_helpful)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + dom_pos_factor + Zipf.value + Conc.M + neighbour_concentration +
## resp_mean, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.48535 -0.13558 -0.02497 0.12890 0.62788
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.3472242 0.2359209 -1.472 0.141560
## scale(hypernyms) -0.0015503 0.0104728 -0.148 0.882362
## scale(log(hyponyms + 1)) 0.0007269 0.0084795 0.086 0.931715
## dom_pos_factor2 -0.1144256 0.0260788 -4.388 1.33e-05 ***
## dom_pos_factor3 0.1483719 0.0434479 3.415 0.000677 ***
## Zipf.value -0.0168297 0.0131192 -1.283 0.200005
## Conc.M 0.0038806 0.0131855 0.294 0.768617
## neighbour_concentration 0.4140774 0.5933213 0.698 0.485489
## resp_mean 0.0821740 0.0107465 7.647 7.38e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1938 on 655 degrees of freedom
## (43 observations deleted due to missingness)
## Multiple R-squared: 0.1422, Adjusted R-squared: 0.1317
## F-statistic: 13.58 on 8 and 655 DF, p-value: < 2.2e-16
# Collapse `type` into two classes: "CDI" versus everything else ("non-CDI").
all_words_all_features_typed <- mutate(
  all_words_all_features,
  type_broad = ifelse(type == "CDI", "CDI", "non-CDI")
)

# all_ages_says against KupermanAoA, with a separate colour and fitted line
# for each broad type.
all_words_all_features_typed %>%
  ggplot(aes(KupermanAoA, all_ages_says, color = type_broad)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
It seems we should analyse CDI and non-CDI words separately, since the relationship between KupermanAoA and all_ages_says looks very different in the two groups.
# CDI words only: residualise all_ages_says on KupermanAoA within this subset.
cdi_words_all_features <- filter(all_words_all_features, type == "CDI")

# na.action = na.exclude keeps resid() the same length as the data (NA for
# rows with a missing outcome or predictor), so the residual column always
# lines up with its rows. With the default na.omit, any incomplete row makes
# the vector too short and the mutate() fails.
base_mod_CDI <- lm(
  all_ages_says ~ KupermanAoA,
  data = cdi_words_all_features,
  na.action = na.exclude
)
CDI_kuperman_resid <- cdi_words_all_features %>%
  mutate(all_ages_says.Kuperman = resid(base_mod_CDI))
# CDI words: residualised all_ages_says against hypernyms, with a fitted
# linear-model line.
CDI_kuperman_resid %>%
  ggplot(aes(hypernyms, all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

# CDI words: residualised all_ages_says against helpfulness (resp_mean).
CDI_kuperman_resid %>%
  ggplot(aes(resp_mean, all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
# CDI nouns, verbs and adjectives, with part of speech coded as a factor:
# Noun = 1 (reference level), Verb = 2, Adjective = 3.
CDI_kuperman_resid_filtered <- CDI_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  mutate(
    dom_pos_factor = as.factor(match(dom_pos, c("Noun", "Verb", "Adjective")))
  )

# Baseline CDI model: lexical predictors only.
CDI_kuperman_resid_base <- CDI_kuperman_resid_filtered %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      Zipf.value +
      Conc.M +
      neighbour_concentration,
    data = .
  )
summary(CDI_kuperman_resid_base)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + Zipf.value + Conc.M + neighbour_concentration, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.36509 -0.11983 -0.00431 0.11049 0.48936
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.036110 0.278892 -0.129 0.8970
## scale(hypernyms) 0.029495 0.011627 2.537 0.0116 *
## scale(log(hyponyms + 1)) 0.004650 0.009912 0.469 0.6392
## Zipf.value 0.003637 0.015620 0.233 0.8160
## Conc.M 0.015923 0.020342 0.783 0.4343
## neighbour_concentration -0.143816 0.661743 -0.217 0.8281
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1711 on 388 degrees of freedom
## (28 observations deleted due to missingness)
## Multiple R-squared: 0.04077, Adjusted R-squared: 0.02841
## F-statistic: 3.299 on 5 and 388 DF, p-value: 0.006259
# CDI model with part of speech (dom_pos_factor) added to the lexical
# predictors.
CDI_kuperman_resid_pos <- CDI_kuperman_resid_filtered %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      Zipf.value +
      dom_pos_factor +
      Conc.M +
      neighbour_concentration,
    data = .
  )
summary(CDI_kuperman_resid_pos)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + Zipf.value + dom_pos_factor + Conc.M + neighbour_concentration,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.35410 -0.11657 -0.00737 0.10965 0.48680
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.099825 0.289129 -0.345 0.73008
## scale(hypernyms) 0.036754 0.014166 2.595 0.00983 **
## scale(log(hyponyms + 1)) 0.004709 0.009930 0.474 0.63565
## Zipf.value 0.004139 0.015714 0.263 0.79237
## dom_pos_factor2 0.034858 0.038020 0.917 0.35980
## dom_pos_factor3 -0.058153 0.172233 -0.338 0.73582
## Conc.M 0.024192 0.022228 1.088 0.27710
## neighbour_concentration -0.095383 0.665752 -0.143 0.88615
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1713 on 386 degrees of freedom
## (28 observations deleted due to missingness)
## Multiple R-squared: 0.04317, Adjusted R-squared: 0.02581
## F-statistic: 2.488 on 7 and 386 DF, p-value: 0.0165
# CDI model with part of speech and helpfulness (resp_mean) added to the
# lexical predictors.
CDI_kuperman_resid_pos_helpful <- CDI_kuperman_resid_filtered %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      Zipf.value +
      dom_pos_factor +
      Conc.M +
      neighbour_concentration +
      resp_mean,
    data = .
  )
summary(CDI_kuperman_resid_pos_helpful)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + Zipf.value + dom_pos_factor + Conc.M + neighbour_concentration +
## resp_mean, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.42930 -0.09641 -0.00274 0.08358 0.37673
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.631164 0.249443 -2.530 0.011794 *
## scale(hypernyms) 0.044370 0.012051 3.682 0.000264 ***
## scale(log(hyponyms + 1)) 0.015076 0.008479 1.778 0.076188 .
## Zipf.value -0.019896 0.013494 -1.474 0.141181
## dom_pos_factor2 0.024519 0.032312 0.759 0.448420
## dom_pos_factor3 -0.108785 0.146383 -0.743 0.457843
## Conc.M 0.038314 0.018919 2.025 0.043542 *
## neighbour_concentration 0.141186 0.565935 0.249 0.803128
## resp_mean 0.141600 0.011570 12.239 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1455 on 385 degrees of freedom
## (28 observations deleted due to missingness)
## Multiple R-squared: 0.3112, Adjusted R-squared: 0.2969
## F-statistic: 21.74 on 8 and 385 DF, p-value: < 2.2e-16
# Sequential F-tests across the three nested CDI models:
# baseline -> + part of speech -> + helpfulness.
anova(
  CDI_kuperman_resid_base,
  CDI_kuperman_resid_pos,
  CDI_kuperman_resid_pos_helpful
)
## Analysis of Variance Table
##
## Model 1: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + Zipf.value + Conc.M + neighbour_concentration
## Model 2: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + Zipf.value + dom_pos_factor + Conc.M + neighbour_concentration
## Model 3: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + Zipf.value + dom_pos_factor + Conc.M + neighbour_concentration +
## resp_mean
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 388 11.3567
## 2 386 11.3284 2 0.0283 0.6688 0.5129
## 3 385 8.1554 1 3.1730 149.7923 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
For CDI words, adding PoS does not improve model fit (F = 0.67, p = 0.51), but adding helpfulness does (F = 149.79, p < 2e-16).
# Non-CDI words only: all_ages_says against KupermanAoA, with a fitted
# linear-model line.
all_words_all_features_typed %>%
  filter(type_broad == "non-CDI") %>%
  ggplot(aes(KupermanAoA, all_ages_says)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
# Non-CDI words only: residualise all_ages_says on KupermanAoA within this
# subset.
non_CDI_all_features <- filter(all_words_all_features, type != "CDI")

# na.action = na.exclude keeps resid() the same length as the data (NA for
# rows with a missing outcome or predictor), so the residual column always
# lines up with its rows. With the default na.omit, any incomplete row makes
# the vector too short and the mutate() fails.
base_mod_nonCDI <- lm(
  all_ages_says ~ KupermanAoA,
  data = non_CDI_all_features,
  na.action = na.exclude
)
nonCDI_kuperman_resid <- non_CDI_all_features %>%
  mutate(all_ages_says.Kuperman = resid(base_mod_nonCDI))
# Non-CDI words: residualised all_ages_says against hypernyms, with a fitted
# linear-model line.
nonCDI_kuperman_resid %>%
  ggplot(aes(hypernyms, all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

# Non-CDI words: residualised all_ages_says against helpfulness (resp_mean).
nonCDI_kuperman_resid %>%
  ggplot(aes(resp_mean, all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
# Baseline non-CDI model: lexical predictors only.
# Rows without a helpfulness rating are dropped up front, although resp_mean
# is not a predictor here -- presumably so this model is fitted to the same
# rows as the helpfulness model it is later compared with.
base_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(
    dom_pos %in% c("Verb", "Noun", "Adjective"),
    !is.na(resp_mean)
  ) %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      Zipf.value +
      Conc.M +
      neighbour_concentration,
    data = .
  )
summary(base_resid_nonCDI)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + Zipf.value + Conc.M + neighbour_concentration, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.50003 -0.11004 -0.00252 0.10841 0.72808
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.62384 0.32217 -1.936 0.05389 .
## scale(hypernyms) 0.07237 0.01118 6.472 4.67e-10 ***
## scale(log(hyponyms + 1)) 0.03647 0.01132 3.222 0.00143 **
## Zipf.value 0.03752 0.01578 2.378 0.01814 *
## Conc.M 0.10663 0.01476 7.226 5.35e-12 ***
## neighbour_concentration 0.06275 0.85159 0.074 0.94131
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1686 on 264 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.3315, Adjusted R-squared: 0.3188
## F-statistic: 26.18 on 5 and 264 DF, p-value: < 2.2e-16
# Non-CDI model with part of speech added to the lexical predictors.
# dom_pos_factor codes Noun = 1 (reference level), Verb = 2, Adjective = 3.
# Rows without a helpfulness rating are excluded, as in the baseline model.
pos_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(
    dom_pos %in% c("Verb", "Noun", "Adjective"),
    !is.na(resp_mean)
  ) %>%
  mutate(
    dom_pos_factor = as.factor(match(dom_pos, c("Noun", "Verb", "Adjective")))
  ) %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      dom_pos_factor +
      Zipf.value +
      Conc.M +
      neighbour_concentration,
    data = .
  )
summary(pos_resid_nonCDI)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + dom_pos_factor + Zipf.value + Conc.M + neighbour_concentration,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.49037 -0.08904 0.00136 0.08419 0.74659
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.56984 0.30952 -1.841 0.066743 .
## scale(hypernyms) 0.04432 0.01180 3.754 0.000214 ***
## scale(log(hyponyms + 1)) 0.02702 0.01091 2.477 0.013892 *
## dom_pos_factor2 -0.13282 0.02793 -4.755 3.28e-06 ***
## dom_pos_factor3 0.07647 0.03765 2.031 0.043269 *
## Zipf.value 0.04634 0.01505 3.079 0.002295 **
## Conc.M 0.09337 0.01458 6.406 6.90e-10 ***
## neighbour_concentration 0.03416 0.81178 0.042 0.966469
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1599 on 262 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.4034, Adjusted R-squared: 0.3875
## F-statistic: 25.31 on 7 and 262 DF, p-value: < 2.2e-16
# Non-CDI model with part of speech and helpfulness (resp_mean) added to the
# lexical predictors.
# dom_pos_factor codes Noun = 1 (reference level), Verb = 2, Adjective = 3.
# There is no explicit NA filter on resp_mean here; lm() itself drops rows
# with a missing rating.
helpful_pos_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  mutate(
    dom_pos_factor = as.factor(match(dom_pos, c("Noun", "Verb", "Adjective")))
  ) %>%
  lm(
    all_ages_says.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      dom_pos_factor +
      Zipf.value +
      Conc.M +
      neighbour_concentration +
      resp_mean,
    data = .
  )
summary(helpful_pos_resid_nonCDI)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + dom_pos_factor + Zipf.value + Conc.M + neighbour_concentration +
## resp_mean, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43060 -0.09404 -0.00433 0.08122 0.78549
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.57774 0.30370 -1.902 0.058223 .
## scale(hypernyms) 0.04618 0.01156 3.996 8.40e-05 ***
## scale(log(hyponyms + 1)) 0.02222 0.01071 2.074 0.039089 *
## dom_pos_factor2 -0.13981 0.02749 -5.087 6.97e-07 ***
## dom_pos_factor3 0.05482 0.03751 1.462 0.145035
## Zipf.value 0.02356 0.01626 1.448 0.148708
## Conc.M 0.09093 0.01432 6.350 9.50e-10 ***
## neighbour_concentration -0.03000 0.79672 -0.038 0.969997
## resp_mean 0.04769 0.01427 3.341 0.000958 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1569 on 261 degrees of freedom
## (15 observations deleted due to missingness)
## Multiple R-squared: 0.4279, Adjusted R-squared: 0.4104
## F-statistic: 24.4 on 8 and 261 DF, p-value: < 2.2e-16
# Sequential F-tests across the three nested non-CDI models:
# baseline -> + part of speech -> + helpfulness.
anova(
  base_resid_nonCDI,
  pos_resid_nonCDI,
  helpful_pos_resid_nonCDI
)
## Analysis of Variance Table
##
## Model 1: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + Zipf.value + Conc.M + neighbour_concentration
## Model 2: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + dom_pos_factor + Zipf.value + Conc.M + neighbour_concentration
## Model 3: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + dom_pos_factor + Zipf.value + Conc.M + neighbour_concentration +
## resp_mean
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 264 7.5056
## 2 262 6.6979 2 0.80762 16.408 1.939e-07 ***
## 3 261 6.4233 1 0.27467 11.161 0.0009576 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
For non-CDI words, accounting for PoS improves model fit (F = 16.41, p = 1.9e-07), and adding helpfulness improves it further (F = 11.16, p = 0.00096).