all_ages_says: on average, what proportion of children sampled say this word?
KupermanAoA: adult-estimated AoA, from Kuperman norms
hypernyms: how many words are superordinate to this word? (per wordnet)
hyponyms: how many words are subordinate to this word? (per wordnet)
concreteness: how concrete (i.e. tangible) is the word’s referent, scale of 1 (abstract) to 5 (concrete), from Brysbaert norms
frequency_subtlex: word frequency from SUBTLEX
neighbour_concentration: measure of semantic density (how concentrated is local word neighborhood), from Thompson (unpublished)
arc: measure of semantic density (average radius of co-occurrence), from HiDEx (Shaoul & Westbury)
ncount: measure of semantic density (neighbor count), from HiDEx (Shaoul & Westbury)
helpfulness: how helpful would it be for a preschooler to know this word, scale of 1 (not helpful) to 5 (very helpful), collected on mturk
babiness: how much is this word associated with babies, scale of 1 to 10, from Perry et al. (2015)
n_synsets: how many synsets are there for this word on wordnet?
# Pairwise Pearson correlations among the acquisition measures and the
# lexical/semantic predictors. The selected word-level columns are kept in
# corr_data so the correlations and their significance tests are both computed
# from the same observations.
corr_data <- all_words_all_features %>%
  mutate(log_hyponyms = log(hyponyms + 1)) %>%
  select(all_ages_says, all_ages_understands, KupermanAoA, hypernyms, log_hyponyms,
         concreteness, frequency_subtlex, childes_adult_log_freq, childes_kid_log_freq,
         n_synsets, neighbour_concentration, arc, ncount, helpfulness, babiness)
corr_vars <- cor(corr_data, use = "pairwise.complete.obs", method = "pearson")
# cor.mtest() runs cor.test() on the columns of its input, so it must receive
# the observations. Passing the correlation matrix (as before) tests the
# correlation between columns of correlations, giving meaningless p-values.
p.mat <- cor.mtest(corr_data)
pMatrix <- p.mat$p
# Lower-triangle heat map; cells with p >= 0.05 are left blank.
corrplot(corr_vars, method = "color", type = "lower", diag = TRUE, addCoef.col = "black",
         tl.col = "black", number.font = 2, number.cex = .6,
         p.mat = pMatrix, sig.level = 0.05, insig = "blank")
# Helpfulness against KupermanAoA, hypernyms, and all_ages_says.
# Each figure is a scatterplot with a linear fit; first for all words ...
all_words_all_features %>%
  ggplot(aes(x = KupermanAoA, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
all_words_all_features %>%
  ggplot(aes(x = hypernyms, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
all_words_all_features %>%
  ggplot(aes(x = all_ages_says, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
# ... then the same three figures restricted to words whose type is not "CDI".
all_words_all_features %>%
  filter(type != "CDI") %>%
  ggplot(aes(x = KupermanAoA, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
all_words_all_features %>%
  filter(type != "CDI") %>%
  ggplot(aes(x = hypernyms, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
all_words_all_features %>%
  filter(type != "CDI") %>%
  ggplot(aes(x = all_ages_says, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
# Residualize production (all_ages_says) on adult-estimated AoA (KupermanAoA);
# the residual column all_ages_says.Kuperman is the outcome of the later models.
# na.action = na.exclude makes resid() return one value per input row (NA for
# rows lm() had to drop), so mutate() cannot fail on a length mismatch if
# either variable contains missing values. With complete data the residuals
# are unchanged.
base_mod <- lm(all_ages_says ~ KupermanAoA, data = all_words_all_features,
               na.action = na.exclude)
all_words_kuperman_resid <- all_words_all_features %>%
  mutate(all_ages_says.Kuperman = resid(base_mod))
# Same residualization on awaf_scaled (built elsewhere; presumably a scaled
# copy of the feature table -- confirm). ungroup() so the residual vector is
# assigned across all rows rather than per group.
base_mod_scaled_data <- lm(all_ages_says ~ KupermanAoA, data = awaf_scaled,
                           na.action = na.exclude)
scaled_pos_resid <- awaf_scaled %>%
  ungroup() %>%
  mutate(all_ages_says.Kuperman = resid(base_mod_scaled_data))
# Residualized production (all words) against hypernyms and helpfulness,
# each as a scatterplot with a linear fit.
all_words_kuperman_resid %>%
  ggplot(aes(x = hypernyms, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
all_words_kuperman_resid %>%
  ggplot(aes(x = helpfulness, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
# All words, no part-of-speech term: semantic and frequency predictors of
# residualized production. Only rows whose dom_pos is Noun, Verb, or Adjective
# are passed to lm().
all_words_no_pos <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc,
    data = .
  )
summary(all_words_no_pos)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.46675 -0.14650 -0.03747 0.12603 0.51788
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.346615 0.235225 -1.474 0.141051
## scale(hypernyms) 0.021040 0.009044 2.326 0.020280 *
## scale(log(hyponyms + 1)) -0.003147 0.008591 -0.366 0.714211
## frequency_subtlex -0.038439 0.018365 -2.093 0.036704 *
## concreteness 0.017649 0.012607 1.400 0.161975
## neighbour_concentration 0.492623 0.601144 0.819 0.412793
## arc 0.444512 0.117260 3.791 0.000163 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2032 on 704 degrees of freedom
## (42 observations deleted due to missingness)
## Multiple R-squared: 0.04397, Adjusted R-squared: 0.03582
## F-statistic: 5.396 on 6 and 704 DF, p-value: 1.828e-05
# All words, adding part of speech as a factor.
# dom_pos_factor coding: 1 = Noun (reference level), 2 = Verb, 3 = Adjective.
# match() gives the same codes as the nested ifelse() because the preceding
# filter() leaves only these three values.
all_words_with_pos <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(dom_pos_factor = as.factor(match(dom_pos, c("Noun", "Verb", "Adjective")))) %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor,
    data = .
  )
summary(all_words_with_pos)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.47507 -0.14257 -0.02117 0.12831 0.56068
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.313935 0.236507 -1.327 0.18481
## scale(hypernyms) 0.007503 0.010098 0.743 0.45771
## scale(log(hyponyms + 1)) -0.004623 0.008515 -0.543 0.58735
## frequency_subtlex -0.021529 0.018575 -1.159 0.24684
## concreteness 0.015231 0.013101 1.163 0.24540
## neighbour_concentration 0.413132 0.596873 0.692 0.48906
## arc 0.343181 0.118343 2.900 0.00385 **
## dom_pos_factor2 -0.066987 0.025082 -2.671 0.00775 **
## dom_pos_factor3 0.126002 0.043231 2.915 0.00367 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2008 on 702 degrees of freedom
## (42 observations deleted due to missingness)
## Multiple R-squared: 0.06881, Adjusted R-squared: 0.0582
## F-statistic: 6.484 on 8 and 702 DF, p-value: 3.641e-08
# All words, adding babiness, helpfulness, and n_synsets to the
# part-of-speech model.
# dom_pos_factor coding: 1 = Noun (reference level), 2 = Verb, 3 = Adjective;
# match() reproduces the nested ifelse() coding after the filter().
all_words_with_pos_helpful <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(dom_pos_factor = as.factor(match(dom_pos, c("Noun", "Verb", "Adjective")))) %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + babiness + helpfulness + n_synsets,
    data = .
  )
summary(all_words_with_pos_helpful)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor + babiness + helpfulness + n_synsets,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.47450 -0.11795 -0.01713 0.09326 0.65740
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.5926252 0.3107823 -1.907 0.05726 .
## scale(hypernyms) -0.0145690 0.0127941 -1.139 0.25551
## scale(log(hyponyms + 1)) -0.0160590 0.0100813 -1.593 0.11198
## frequency_subtlex -0.0693377 0.0250764 -2.765 0.00596 **
## concreteness 0.0246668 0.0214897 1.148 0.25173
## neighbour_concentration 0.2278351 0.7258777 0.314 0.75378
## arc 0.4574819 0.1631046 2.805 0.00528 **
## dom_pos_factor2 -0.1140070 0.0384902 -2.962 0.00324 **
## babiness -0.0055373 0.0050773 -1.091 0.27612
## helpfulness 0.1313714 0.0145795 9.011 < 2e-16 ***
## n_synsets 0.0001001 0.0013634 0.073 0.94153
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1775 on 394 degrees of freedom
## (348 observations deleted due to missingness)
## Multiple R-squared: 0.2262, Adjusted R-squared: 0.2065
## F-statistic: 11.52 on 10 and 394 DF, p-value: < 2.2e-16
# Collapse `type` into two groups: "CDI" versus everything else ("non-CDI").
all_words_all_features_typed <- all_words_all_features %>%
  mutate(type_broad = ifelse(type != "CDI", "non-CDI", "CDI"))
# Production against adult-estimated AoA, with a separate linear fit per group.
all_words_all_features_typed %>%
  ggplot(aes(x = KupermanAoA, y = all_ages_says, color = type_broad)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
It seems we should analyze CDI and non-CDI words separately, since the relationship between KupermanAoA and all_ages_says looks very different in the two groups.
# CDI words only: residualize production on adult-estimated AoA.
cdi_words_all_features <- filter(all_words_all_features, type == "CDI")
# na.exclude keeps resid() aligned row-for-row with the data (NA where lm()
# dropped a row), so mutate() cannot fail on a length mismatch if either
# variable has missing values. With complete data the residuals are unchanged.
base_mod_CDI <- lm(all_ages_says ~ KupermanAoA, data = cdi_words_all_features,
                   na.action = na.exclude)
CDI_kuperman_resid <- cdi_words_all_features %>%
  mutate(all_ages_says.Kuperman = resid(base_mod_CDI))
# CDI words: residualized production against hypernyms and helpfulness,
# each as a scatterplot with a linear fit.
CDI_kuperman_resid %>%
  ggplot(aes(x = hypernyms, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
CDI_kuperman_resid %>%
  ggplot(aes(x = helpfulness, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
# CDI rows with dom_pos in {Noun, Verb, Adjective}, plus a part-of-speech
# factor shared by all CDI models below.
# dom_pos_factor coding: 1 = Noun (reference level), 2 = Verb, 3 = Adjective;
# match() reproduces the nested ifelse() coding after the filter().
CDI_kuperman_resid_filtered <- CDI_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(dom_pos_factor = as.factor(match(dom_pos, c("Noun", "Verb", "Adjective"))))
# Baseline CDI model: semantic predictors plus SUBTLEX frequency.
CDI_kuperman_resid_base <- CDI_kuperman_resid_filtered %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc,
    data = .
  )
summary(CDI_kuperman_resid_base)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.39342 -0.12055 -0.00696 0.11161 0.52158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0340127 0.2766866 -0.123 0.9022
## scale(hypernyms) 0.0266763 0.0115631 2.307 0.0216 *
## scale(log(hyponyms + 1)) 0.0005019 0.0099173 0.051 0.9597
## frequency_subtlex -0.0367231 0.0231990 -1.583 0.1142
## concreteness 0.0109750 0.0203843 0.538 0.5906
## neighbour_concentration -0.1096482 0.6555909 -0.167 0.8673
## arc 0.3310336 0.1422894 2.326 0.0205 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1699 on 389 degrees of freedom
## (29 observations deleted due to missingness)
## Multiple R-squared: 0.05416, Adjusted R-squared: 0.03957
## F-statistic: 3.712 on 6 and 389 DF, p-value: 0.001333
# Variant of the baseline CDI model that swaps frequency_subtlex for
# childes_adult_log_freq; all other terms are the same.
CDI_kuperman_resid_childes <- CDI_kuperman_resid_filtered %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      childes_adult_log_freq + concreteness + neighbour_concentration + arc,
    data = .
  )
summary(CDI_kuperman_resid_childes)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + childes_adult_log_freq + concreteness + neighbour_concentration +
## arc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.38351 -0.11435 -0.00293 0.11239 0.48183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.1209326 0.2718342 -0.445 0.65666
## scale(hypernyms) 0.0301777 0.0113496 2.659 0.00816 **
## scale(log(hyponyms + 1)) -0.0004437 0.0100455 -0.044 0.96479
## childes_adult_log_freq -0.0054974 0.0083562 -0.658 0.51100
## concreteness 0.0182683 0.0198145 0.922 0.35712
## neighbour_concentration -0.0980764 0.6577419 -0.149 0.88154
## arc 0.1959196 0.1073796 1.825 0.06884 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1703 on 389 degrees of freedom
## (29 observations deleted due to missingness)
## Multiple R-squared: 0.04912, Adjusted R-squared: 0.03445
## F-statistic: 3.349 on 6 and 389 DF, p-value: 0.003139
# Compare the SUBTLEX-frequency and CHILDES-frequency versions of the CDI model.
# NOTE(review): the two models are not nested (each swaps one frequency term
# for the other, same residual df), so anova() reports only the RSS difference
# and no F test. An information criterion such as AIC() may be a better
# comparison -- confirm the intent.
anova(CDI_kuperman_resid_base, CDI_kuperman_resid_childes)
## Analysis of Variance Table
##
## Model 1: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc
## Model 2: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + childes_adult_log_freq + concreteness + neighbour_concentration +
## arc
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 389 11.223
## 2 389 11.283 0 -0.05974
# CDI model: baseline predictors plus part of speech (dom_pos_factor).
CDI_kuperman_resid_pos <- CDI_kuperman_resid_filtered %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor,
    data = .
  )
summary(CDI_kuperman_resid_pos)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.38584 -0.11597 -0.00807 0.11151 0.52852
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.084383 0.287071 -0.294 0.7690
## scale(hypernyms) 0.031420 0.013966 2.250 0.0250 *
## scale(log(hyponyms + 1)) 0.000832 0.009926 0.084 0.9332
## frequency_subtlex -0.038952 0.023358 -1.668 0.0962 .
## concreteness 0.017050 0.022106 0.771 0.4410
## neighbour_concentration -0.047967 0.659761 -0.073 0.9421
## arc 0.340952 0.142585 2.391 0.0173 *
## dom_pos_factor2 0.029159 0.036878 0.791 0.4296
## dom_pos_factor3 -0.095277 0.101494 -0.939 0.3484
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1699 on 387 degrees of freedom
## (29 observations deleted due to missingness)
## Multiple R-squared: 0.05867, Adjusted R-squared: 0.03921
## F-statistic: 3.015 on 8 and 387 DF, p-value: 0.002694
# CDI model: baseline predictors, part of speech, and helpfulness.
CDI_kuperman_resid_pos_helpful <- CDI_kuperman_resid_filtered %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + helpfulness,
    data = .
  )
summary(CDI_kuperman_resid_pos_helpful)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor + helpfulness, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.40919 -0.09689 -0.00054 0.08081 0.40882
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.587779 0.246332 -2.386 0.017508 *
## scale(hypernyms) 0.038163 0.011833 3.225 0.001366 **
## scale(log(hyponyms + 1)) 0.011926 0.008449 1.412 0.158878
## frequency_subtlex -0.066797 0.019897 -3.357 0.000866 ***
## concreteness 0.025943 0.018724 1.386 0.166692
## neighbour_concentration 0.176188 0.558707 0.315 0.752666
## arc 0.367297 0.120701 3.043 0.002502 **
## dom_pos_factor2 0.006019 0.031269 0.193 0.847449
## dom_pos_factor3 -0.040387 0.086017 -0.470 0.638961
## helpfulness 0.142944 0.011511 12.419 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1438 on 386 degrees of freedom
## (29 observations deleted due to missingness)
## Multiple R-squared: 0.3274, Adjusted R-squared: 0.3117
## F-statistic: 20.88 on 9 and 386 DF, p-value: < 2.2e-16
# CDI model: part of speech and helpfulness, plus babiness and n_synsets.
CDI_kuperman_resid_pos_helpful_babiness_synsets <- CDI_kuperman_resid_filtered %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + helpfulness + babiness + n_synsets,
    data = .
  )
summary(CDI_kuperman_resid_pos_helpful_babiness_synsets)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor + helpfulness + babiness + n_synsets,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.41283 -0.09745 0.00206 0.08341 0.39965
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.4509575 0.2681275 -1.682 0.09353 .
## scale(hypernyms) 0.0338070 0.0131756 2.566 0.01073 *
## scale(log(hyponyms + 1)) 0.0140630 0.0090256 1.558 0.12015
## frequency_subtlex -0.0638006 0.0228710 -2.790 0.00558 **
## concreteness 0.0235046 0.0197273 1.191 0.23431
## neighbour_concentration -0.1601826 0.6217865 -0.258 0.79686
## arc 0.3537708 0.1448743 2.442 0.01513 *
## dom_pos_factor2 -0.0015195 0.0356091 -0.043 0.96599
## helpfulness 0.1422203 0.0127649 11.142 < 2e-16 ***
## babiness -0.0001798 0.0044717 -0.040 0.96796
## n_synsets -0.0007588 0.0013440 -0.565 0.57273
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1457 on 334 degrees of freedom
## (80 observations deleted due to missingness)
## Multiple R-squared: 0.3061, Adjusted R-squared: 0.2853
## F-statistic: 14.73 on 10 and 334 DF, p-value: < 2.2e-16
# Sequential comparison of nested CDI models:
# baseline -> + dom_pos_factor -> + helpfulness.
anova(CDI_kuperman_resid_base, CDI_kuperman_resid_pos, CDI_kuperman_resid_pos_helpful)
## Analysis of Variance Table
##
## Model 1: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc
## Model 2: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor
## Model 3: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor + helpfulness
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 389 11.2230
## 2 387 11.1694 2 0.0536 1.296 0.2748
## 3 386 7.9808 1 3.1886 154.219 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
For CDI words, adding PoS does not significantly improve model fit (F = 1.30, p = .27), but adding helpfulness does (F = 154.2, p < .001).
# Mean split on concreteness: 0 = below the mean, 1 = at or above it
# (NA stays NA). The mean is taken over the non-missing concreteness values.
all_words_all_features_typed <- all_words_all_features_typed %>%
  mutate(
    concreteness_meansplit =
      as.numeric(concreteness >= mean(concreteness, na.rm = TRUE))
  )
# Non-CDI words: production against adult-estimated AoA, points colored by the
# concreteness split, with a single linear fit over all points.
all_words_all_features_typed %>%
  filter(type_broad == "non-CDI") %>%
  ggplot(aes(x = KupermanAoA, y = all_ages_says)) +
  geom_point(aes(color = as.factor(concreteness_meansplit))) +
  geom_smooth(method = lm) +
  theme_classic()
# Non-CDI words only: residualize production on adult-estimated AoA.
non_CDI_all_features <- filter(all_words_all_features, type != "CDI")
# na.exclude keeps resid() aligned row-for-row with the data (NA where lm()
# dropped a row), so mutate() cannot fail on a length mismatch if either
# variable has missing values. With complete data the residuals are unchanged.
base_mod_nonCDI <- lm(all_ages_says ~ KupermanAoA, data = non_CDI_all_features,
                      na.action = na.exclude)
# concreteness_meansplit: 0 = below the non-CDI mean concreteness, 1 otherwise.
nonCDI_kuperman_resid <- non_CDI_all_features %>%
  mutate(all_ages_says.Kuperman = resid(base_mod_nonCDI),
         concreteness_meansplit = ifelse(concreteness < mean(concreteness, na.rm = TRUE), 0, 1))
# Non-CDI words: residualized production against hypernyms and helpfulness,
# each as a scatterplot with a linear fit.
nonCDI_kuperman_resid %>%
  ggplot(aes(x = hypernyms, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
nonCDI_kuperman_resid %>%
  ggplot(aes(x = helpfulness, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
# Baseline non-CDI model. Rows without a helpfulness rating are removed up
# front even though helpfulness is not a predictor here.
base_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective"), !is.na(helpfulness)) %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc,
    data = .
  )
summary(base_resid_nonCDI)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.50481 -0.11560 0.00255 0.11031 0.74971
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.90099 0.29941 -3.009 0.00284 **
## scale(hypernyms) 0.06624 0.01039 6.374 6.91e-10 ***
## scale(log(hyponyms + 1)) 0.03713 0.01057 3.514 0.00051 ***
## frequency_subtlex 0.01913 0.02222 0.861 0.38986
## concreteness 0.10482 0.01403 7.474 8.61e-13 ***
## neighbour_concentration 0.90850 0.79180 1.147 0.25214
## arc 0.09575 0.14362 0.667 0.50549
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1672 on 300 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.3338, Adjusted R-squared: 0.3205
## F-statistic: 25.06 on 6 and 300 DF, p-value: < 2.2e-16
# Non-CDI model with part of speech added; rows lacking a helpfulness rating
# are removed up front, as in the baseline model.
# dom_pos_factor coding: 1 = Noun (reference level), 2 = Verb, 3 = Adjective;
# match() reproduces the nested ifelse() coding after the filter().
pos_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective"), !is.na(helpfulness)) %>%
  mutate(dom_pos_factor = as.factor(match(dom_pos, c("Noun", "Verb", "Adjective")))) %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor,
    data = .
  )
summary(pos_resid_nonCDI)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.48162 -0.09535 0.00603 0.09382 0.77676
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.78716 0.29089 -2.706 0.00720 **
## scale(hypernyms) 0.04594 0.01070 4.294 2.38e-05 ***
## scale(log(hyponyms + 1)) 0.03009 0.01023 2.943 0.00351 **
## frequency_subtlex 0.04869 0.02190 2.223 0.02697 *
## concreteness 0.09588 0.01392 6.887 3.39e-11 ***
## neighbour_concentration 0.70693 0.76143 0.928 0.35394
## arc -0.09687 0.14165 -0.684 0.49459
## dom_pos_factor2 -0.11425 0.02578 -4.431 1.32e-05 ***
## dom_pos_factor3 0.09587 0.03744 2.561 0.01094 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1598 on 298 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.3956, Adjusted R-squared: 0.3794
## F-statistic: 24.38 on 8 and 298 DF, p-value: < 2.2e-16
# Non-CDI model with part of speech and helpfulness.
# NOTE(review): unlike the base and PoS models, this pipeline does not filter
# on !is.na(helpfulness); lm() drops those rows instead. The fitted rows match,
# but scale() inside the formula is evaluated before rows with NAs are
# dropped, so the scaled-predictor coefficients are on a slightly different
# scale than in the models this is compared against -- confirm whether the
# filter should be added here.
# dom_pos_factor coding: 1 = Noun (reference level), 2 = Verb, 3 = Adjective.
helpful_pos_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(dom_pos_factor = as.factor(match(dom_pos, c("Noun", "Verb", "Adjective")))) %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + helpfulness,
    data = .
  )
summary(helpful_pos_resid_nonCDI)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor + helpfulness, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43119 -0.09495 0.00396 0.08811 0.80900
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.83451 0.28630 -2.915 0.003830 **
## scale(hypernyms) 0.04539 0.01046 4.341 1.95e-05 ***
## scale(log(hyponyms + 1)) 0.02305 0.01018 2.265 0.024258 *
## frequency_subtlex 0.01491 0.02375 0.628 0.530655
## concreteness 0.09210 0.01373 6.707 1.00e-10 ***
## neighbour_concentration 0.73950 0.74858 0.988 0.324018
## arc 0.00986 0.14280 0.069 0.944994
## dom_pos_factor2 -0.12208 0.02545 -4.796 2.56e-06 ***
## dom_pos_factor3 0.07100 0.03754 1.891 0.059532 .
## helpfulness 0.04692 0.01391 3.372 0.000845 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1571 on 297 degrees of freedom
## (21 observations deleted due to missingness)
## Multiple R-squared: 0.4179, Adjusted R-squared: 0.4002
## F-statistic: 23.69 on 9 and 297 DF, p-value: < 2.2e-16
# Non-CDI model with part of speech and n_synsets.
# NOTE(review): the object name says "helpful", but the formula does not
# include helpfulness (the parallel all_ages_understands model does include
# it) -- confirm whether helpfulness was dropped by accident.
# dom_pos_factor coding: 1 = Noun (reference level), 2 = Verb, 3 = Adjective.
synsets_helpful_pos_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(dom_pos_factor = as.factor(match(dom_pos, c("Noun", "Verb", "Adjective")))) %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + n_synsets,
    data = .
  )
summary(synsets_helpful_pos_resid_nonCDI)
##
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor + n_synsets, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.49041 -0.09457 -0.00371 0.09183 0.77780
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.863974 0.305278 -2.830 0.00501 **
## scale(hypernyms) 0.037150 0.011554 3.215 0.00147 **
## scale(log(hyponyms + 1)) 0.019794 0.010645 1.860 0.06407 .
## frequency_subtlex 0.072756 0.023443 3.104 0.00212 **
## concreteness 0.102250 0.014463 7.070 1.41e-11 ***
## neighbour_concentration 0.625596 0.800812 0.781 0.43539
## arc -0.091018 0.143999 -0.632 0.52789
## dom_pos_factor2 -0.121867 0.028915 -4.215 3.45e-05 ***
## n_synsets -0.003418 0.001605 -2.130 0.03411 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1574 on 262 degrees of freedom
## (57 observations deleted due to missingness)
## Multiple R-squared: 0.4223, Adjusted R-squared: 0.4047
## F-statistic: 23.94 on 8 and 262 DF, p-value: < 2.2e-16
# Sequential comparison of nested non-CDI production models:
# baseline -> + dom_pos_factor -> + helpfulness.
anova(base_resid_nonCDI, pos_resid_nonCDI, helpful_pos_resid_nonCDI)
## Analysis of Variance Table
##
## Model 1: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc
## Model 2: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor
## Model 3: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor + helpfulness
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 300 8.3860
## 2 298 7.6087 2 0.77733 15.752 3.148e-07 ***
## 3 297 7.3281 1 0.28059 11.372 0.0008445 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
For non-CDI words, adding PoS improves model fit (F = 15.75, p < .001), and adding helpfulness improves it further (F = 11.37, p < .001).
# Comprehension analogue of the non-CDI analysis: keep non-CDI words that have
# an all_ages_understands value, then residualize comprehension on KupermanAoA.
non_CDI_all_features_understands <- filter(all_words_all_features, type != "CDI" & !is.na(all_ages_understands))
# na.exclude keeps resid() aligned row-for-row with the data even if
# KupermanAoA has missing values (the outcome's NAs are already filtered out),
# so mutate() cannot fail on a length mismatch. With complete data the
# residuals are unchanged.
base_mod_nonCDI_understands <- lm(all_ages_understands ~ KupermanAoA,
                                  data = non_CDI_all_features_understands,
                                  na.action = na.exclude)
# concreteness_meansplit: 0 = below this subset's mean concreteness, 1 otherwise.
nonCDI_kuperman_resid_understands <- non_CDI_all_features_understands %>%
  mutate(all_ages_understands.Kuperman = resid(base_mod_nonCDI_understands),
         concreteness_meansplit = ifelse(concreteness < mean(concreteness, na.rm = TRUE), 0, 1))
# Non-CDI comprehension: raw all_ages_understands against KupermanAoA, with
# points colored by the concreteness mean split and one linear fit overall.
nonCDI_kuperman_resid_understands %>%
  ggplot(aes(x = KupermanAoA, y = all_ages_understands)) +
  geom_point(aes(color = as.factor(concreteness_meansplit))) +
  geom_smooth(method = lm) +
  theme_classic()
# Residualized comprehension against hypernyms and helpfulness.
nonCDI_kuperman_resid_understands %>%
  ggplot(aes(x = hypernyms, y = all_ages_understands.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
nonCDI_kuperman_resid_understands %>%
  ggplot(aes(x = helpfulness, y = all_ages_understands.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
# Baseline non-CDI comprehension model. Rows without a helpfulness rating are
# removed up front even though helpfulness is not a predictor here.
base_resid_nonCDI_understands <- nonCDI_kuperman_resid_understands %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective"), !is.na(helpfulness)) %>%
  lm(
    all_ages_understands.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc,
    data = .
  )
summary(base_resid_nonCDI_understands)
##
## Call:
## lm(formula = all_ages_understands.Kuperman ~ scale(hypernyms) +
## scale(log(hyponyms + 1)) + frequency_subtlex + concreteness +
## neighbour_concentration + arc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.57722 -0.12454 -0.00199 0.11143 0.72459
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.95695 0.31308 -3.057 0.002441 **
## scale(hypernyms) 0.05717 0.01090 5.245 2.96e-07 ***
## scale(log(hyponyms + 1)) 0.03976 0.01109 3.585 0.000393 ***
## frequency_subtlex 0.02646 0.02329 1.136 0.256749
## concreteness 0.10834 0.01468 7.383 1.55e-12 ***
## neighbour_concentration 1.20830 0.82806 1.459 0.145563
## arc -0.05358 0.15018 -0.357 0.721514
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1748 on 299 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.3056, Adjusted R-squared: 0.2917
## F-statistic: 21.93 on 6 and 299 DF, p-value: < 2.2e-16
# Non-CDI comprehension model with part of speech added; rows lacking a
# helpfulness rating are removed up front, as in the baseline model.
# dom_pos_factor coding: 1 = Noun (reference level), 2 = Verb, 3 = Adjective;
# match() reproduces the nested ifelse() coding after the filter().
pos_resid_nonCDI_understands <- nonCDI_kuperman_resid_understands %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective"), !is.na(helpfulness)) %>%
  mutate(dom_pos_factor = as.factor(match(dom_pos, c("Noun", "Verb", "Adjective")))) %>%
  lm(
    all_ages_understands.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor,
    data = .
  )
summary(pos_resid_nonCDI_understands)
##
## Call:
## lm(formula = all_ages_understands.Kuperman ~ scale(hypernyms) +
## scale(log(hyponyms + 1)) + frequency_subtlex + concreteness +
## neighbour_concentration + arc + dom_pos_factor, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.55766 -0.10936 0.01553 0.10139 0.74802
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.78821 0.30612 -2.575 0.01051 *
## scale(hypernyms) 0.03563 0.01131 3.149 0.00181 **
## scale(log(hyponyms + 1)) 0.03219 0.01079 2.983 0.00309 **
## frequency_subtlex 0.05415 0.02307 2.347 0.01960 *
## concreteness 0.09574 0.01465 6.533 2.79e-10 ***
## neighbour_concentration 0.92560 0.80133 1.155 0.24899
## arc -0.23916 0.14907 -1.604 0.10970
## dom_pos_factor2 -0.12824 0.02722 -4.712 3.77e-06 ***
## dom_pos_factor3 0.04921 0.03940 1.249 0.21263
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1682 on 297 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.3619, Adjusted R-squared: 0.3447
## F-statistic: 21.06 on 8 and 297 DF, p-value: < 2.2e-16
# Non-CDI comprehension model with part of speech and helpfulness.
# NOTE(review): no !is.na(helpfulness) filter here, unlike the base and PoS
# comprehension models; lm() drops those rows itself, but scale() in the
# formula is evaluated before that, so scaled-predictor coefficients are on a
# slightly different scale than in those models -- confirm whether the filter
# should be added.
# dom_pos_factor coding: 1 = Noun (reference level), 2 = Verb, 3 = Adjective.
helpful_pos_resid_nonCDI_understands <- nonCDI_kuperman_resid_understands %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(dom_pos_factor = as.factor(match(dom_pos, c("Noun", "Verb", "Adjective")))) %>%
  lm(
    all_ages_understands.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + helpfulness,
    data = .
  )
summary(helpful_pos_resid_nonCDI_understands)
##
## Call:
## lm(formula = all_ages_understands.Kuperman ~ scale(hypernyms) +
## scale(log(hyponyms + 1)) + frequency_subtlex + concreteness +
## neighbour_concentration + arc + dom_pos_factor + helpfulness,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.49081 -0.11171 -0.00079 0.09561 0.79075
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.850866 0.297855 -2.857 0.00458 **
## scale(hypernyms) 0.035333 0.011003 3.211 0.00147 **
## scale(log(hyponyms + 1)) 0.023011 0.010686 2.153 0.03211 *
## frequency_subtlex 0.009406 0.024722 0.380 0.70385
## concreteness 0.090714 0.014289 6.349 8.15e-10 ***
## neighbour_concentration 0.968333 0.778809 1.243 0.21472
## arc -0.097511 0.148569 -0.656 0.51212
## dom_pos_factor2 -0.138514 0.026557 -5.216 3.45e-07 ***
## dom_pos_factor3 0.016213 0.039053 0.415 0.67833
## helpfulness 0.062231 0.014477 4.299 2.33e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1634 on 296 degrees of freedom
## (14 observations deleted due to missingness)
## Multiple R-squared: 0.3994, Adjusted R-squared: 0.3811
## F-statistic: 21.87 on 9 and 296 DF, p-value: < 2.2e-16
# Non-CDI comprehension model with part of speech, helpfulness, and n_synsets.
# dom_pos_factor coding: 1 = Noun (reference level), 2 = Verb, 3 = Adjective;
# match() reproduces the nested ifelse() coding after the filter().
synsets_helpful_pos_resid_nonCDI_understands <- nonCDI_kuperman_resid_understands %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(dom_pos_factor = as.factor(match(dom_pos, c("Noun", "Verb", "Adjective")))) %>%
  lm(
    all_ages_understands.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + helpfulness + n_synsets,
    data = .
  )
summary(synsets_helpful_pos_resid_nonCDI_understands)
##
## Call:
## lm(formula = all_ages_understands.Kuperman ~ scale(hypernyms) +
## scale(log(hyponyms + 1)) + frequency_subtlex + concreteness +
## neighbour_concentration + arc + dom_pos_factor + helpfulness +
## n_synsets, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.50927 -0.11269 -0.00717 0.09274 0.78439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.925822 0.318141 -2.910 0.003935 **
## scale(hypernyms) 0.025540 0.012200 2.094 0.037296 *
## scale(log(hyponyms + 1)) 0.013898 0.011312 1.229 0.220380
## frequency_subtlex 0.030137 0.027383 1.101 0.272132
## concreteness 0.098996 0.015146 6.536 3.46e-10 ***
## neighbour_concentration 0.942313 0.839741 1.122 0.262864
## arc -0.089704 0.153575 -0.584 0.559670
## dom_pos_factor2 -0.151957 0.030481 -4.985 1.15e-06 ***
## helpfulness 0.054514 0.016001 3.407 0.000764 ***
## n_synsets -0.001969 0.001686 -1.168 0.244012
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1622 on 253 degrees of freedom
## (57 observations deleted due to missingness)
## Multiple R-squared: 0.4304, Adjusted R-squared: 0.4102
## F-statistic: 21.24 on 9 and 253 DF, p-value: < 2.2e-16
# Sequential F-tests across three nested models: the base predictors, then
# adding dom_pos_factor, then adding helpfulness.
# NOTE(review): the n_synsets model fitted just above is not part of this
# comparison. Its summary reports 57 rows dropped for missingness versus 14 in
# the summary printed before it, and anova() needs every model fitted to the
# same rows -- confirm the omission is intentional.
anova(base_resid_nonCDI_understands, pos_resid_nonCDI_understands, helpful_pos_resid_nonCDI_understands)
## Analysis of Variance Table
##
## Model 1: all_ages_understands.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc
## Model 2: all_ages_understands.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor
## Model 3: all_ages_understands.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor + helpfulness
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 299 9.1385
## 2 297 8.3978 2 0.74067 13.868 1.749e-06 ***
## 3 296 7.9043 1 0.49346 18.479 2.334e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Accounting for PoS and helpfulness also improves model fit for non-CDI words.
# Welch two-sample t-test: mean concreteness across the type_broad groups.
t.test(
  concreteness ~ type_broad,
  data = all_words_all_features_typed
)
##
## Welch Two Sample t-test
##
## data: concreteness by type_broad
## t = 10.264, df = 695.48, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.4323634 0.6369131
## sample estimates:
## mean in group CDI mean in group non-CDI
## 4.519722 3.985083
# Welch two-sample t-test: mean frequency_subtlex across the type_broad groups.
t.test(
  frequency_subtlex ~ type_broad,
  data = all_words_all_features_typed
)
##
## Welch Two Sample t-test
##
## data: frequency_subtlex by type_broad
## t = 0.35543, df = 747.14, p-value = 0.7224
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.09454249 0.13634481
## sample estimates:
## mean in group CDI mean in group non-CDI
## 4.488141 4.467240
# Welch two-sample t-test: mean helpfulness rating across the type_broad groups.
t.test(
  helpfulness ~ type_broad,
  data = all_words_all_features_typed
)
##
## Welch Two Sample t-test
##
## data: helpfulness by type_broad
## t = 7.4627, df = 685.51, p-value = 2.578e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.2953863 0.5063141
## sample estimates:
## mean in group CDI mean in group non-CDI
## 3.469448 3.068597
# Welch two-sample t-test: mean arc (semantic density) across the type_broad
# groups.
t.test(
  arc ~ type_broad,
  data = all_words_all_features_typed
)
##
## Welch Two Sample t-test
##
## data: arc by type_broad
## t = -1.3346, df = 709.91, p-value = 0.1824
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.027240481 0.005192645
## sample estimates:
## mean in group CDI mean in group non-CDI
## 0.5715478 0.5825717
# Welch two-sample t-test: mean adult-estimated AoA (KupermanAoA) across the
# type_broad groups.
t.test(
  KupermanAoA ~ type_broad,
  data = all_words_all_features_typed
)
##
## Welch Two Sample t-test
##
## data: KupermanAoA by type_broad
## t = -12.583, df = 550.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.410939 -1.029901
## sample estimates:
## mean in group CDI mean in group non-CDI
## 4.376311 5.596731
# Welch two-sample t-test: mean hypernym count across the type_broad groups.
t.test(
  hypernyms ~ type_broad,
  data = all_words_all_features_typed
)
##
## Welch Two Sample t-test
##
## data: hypernyms by type_broad
## t = 6.7084, df = 770, p-value = 3.812e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.087927 1.988038
## sample estimates:
## mean in group CDI mean in group non-CDI
## 7.335766 5.797784
# Welch two-sample t-test: mean raw hyponym count across the type_broad groups.
# NOTE(review): the regression models use log(hyponyms + 1); this test compares
# untransformed counts -- confirm that the raw scale is intended here.
t.test(
  hyponyms ~ type_broad,
  data = all_words_all_features_typed
)
##
## Welch Two Sample t-test
##
## data: hyponyms by type_broad
## t = 2.0635, df = 450.87, p-value = 0.03963
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.165934 88.765034
## sample estimates:
## mean in group CDI mean in group non-CDI
## 63.61717 18.15169
# Welch two-sample t-test: mean number of WordNet synsets across the type_broad
# groups.
t.test(
  n_synsets ~ type_broad,
  data = all_words_all_features_typed
)
##
## Welch Two Sample t-test
##
## data: n_synsets by type_broad
## t = -1.945, df = 624.45, p-value = 0.05223
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.3553737 0.0113339
## sample estimates:
## mean in group CDI mean in group non-CDI
## 6.033654 7.205674