calculated_aoa_years: AoA, calculated using Braginsky & Yurovsky Wordbank script
KupermanAoA: adult-estimated AoA, from Kuperman norms
hypernyms: how many words are superordinate to this word? (per wordnet)
hyponyms: how many words are subordinate to this word? (per wordnet)
concreteness: how concrete (i.e. tangible) is the word’s referent, scale of 1 (abstract) to 5 (concrete), from Brysbaert norms
frequency_subtlex: word frequency from SUBTLEX
neighbour_concentration: measure of semantic density (how concentrated is local word neighborhood), from Thompson (unpublished)
arc: measure of semantic density (average radius of co-occurrence), from HiDEx (Shaoul & Westbury)
ncount: measure of semantic density (neighbor count), from HiDEx (Shaoul & Westbury)
helpfulness: how helpful would it be for a preschooler to know this word, scale of 1 (not helpful) to 5 (very helpful), collected on mturk
babiness: how much is this word associated with babies, scale of 1 to 10, from Perry et al. (2015)
n_synsets: how many synsets are there for this word on wordnet?
n_definitions: how many definitions are there for this word (as this part of speech) on wordnet? e.g. dog.n.01, dog.n.02, etc.
# Correlation plot of the AoA measures and lexical predictors.
# Hyponym counts are log-transformed (log(x + 1)) before correlating.
corr_data <- filtered_calculated_aoas %>%
  mutate(log_hyponyms = log(hyponyms + 1)) %>%
  select(all_ages_says, all_ages_understands, calculated_aoa_years, KupermanAoA,
         hypernyms, log_hyponyms, concreteness,
         frequency_subtlex, childes_adult_log_freq, childes_kid_log_freq,
         n_synsets, n_definitions,
         neighbour_concentration, arc, ncount, helpfulness, babiness)

# Pearson correlations, each computed over the rows where both columns are
# non-missing.
corr_vars <- cor(corr_data, use = "pairwise.complete.obs", method = "pearson")

# cor.mtest() runs a correlation test on every pair of columns of its input,
# so it has to receive the underlying observations. Passing the correlation
# matrix itself would test "correlations of correlations" and yield
# meaningless p-values.
p.mat <- cor.mtest(corr_data)
pMatrix <- p.mat$p

# Lower triangle only; cells whose p-value exceeds sig.level are left blank.
corrplot(corr_vars, method = "color", type = "lower", diag = TRUE,
         addCoef.col = "black", tl.col = "black",
         number.font = 2, number.cex = .6,
         p.mat = pMatrix, sig.level = 0.05, insig = "blank")
# Signed gap between the calculated AoA and the adult-estimated (Kuperman)
# AoA for every word; positive values mean the calculated AoA is later.
all_words_kuperman_diff <- mutate(
  filtered_calculated_aoas,
  diff_aoas = calculated_aoa_years - KupermanAoA
)
# Alternative kept for reference: residuals from a regression on scaled data.
#base_mod_scaled_data <- lm(calculated_aoa_years ~ KupermanAoA, data = awaf_scaled)
#scaled_pos_diff <- awaf_scaled %>%
# ungroup() %>%
# mutate(diff_aoas = resid(base_mod_scaled_data))
# AoA difference against number of hypernyms, with a linear fit.
ggplot(all_words_kuperman_diff, mapping = aes(x = hypernyms, y = diff_aoas)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

# AoA difference against rated helpfulness, with a linear fit.
ggplot(all_words_kuperman_diff, mapping = aes(x = helpfulness, y = diff_aoas)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
# Baseline model over nouns, verbs and adjectives: AoA difference regressed on
# taxonomy depth/breadth, SUBTLEX frequency, concreteness and semantic density.
all_words_no_pos <- all_words_kuperman_diff %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc,
    data = .
  )
summary(all_words_no_pos)
##
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9633 -0.7186 -0.0772 0.7078 10.3312
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.28858 1.53280 0.188 0.85072
## scale(hypernyms) -0.15959 0.05971 -2.673 0.00770 **
## scale(log(hyponyms + 1)) -0.04604 0.05613 -0.820 0.41231
## frequency_subtlex 0.40153 0.12396 3.239 0.00126 **
## concreteness -0.26725 0.08258 -3.236 0.00127 **
## neighbour_concentration -5.83633 3.91925 -1.489 0.13691
## arc -1.61192 0.78267 -2.059 0.03983 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.307 on 679 degrees of freedom
## (42 observations deleted due to missingness)
## Multiple R-squared: 0.1112, Adjusted R-squared: 0.1034
## F-statistic: 14.16 on 6 and 679 DF, p-value: 3.127e-15
# Same baseline model, but with CHILDES adult log frequency in place of the
# SUBTLEX frequency measure.
all_words_no_pos_childes <- all_words_kuperman_diff %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      childes_adult_log_freq + concreteness + neighbour_concentration + arc,
    data = .
  )
summary(all_words_no_pos_childes)
##
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + childes_adult_log_freq + concreteness + neighbour_concentration +
## arc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9274 -0.6867 -0.1044 0.6770 10.6128
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.87458 1.49061 0.587 0.55758
## scale(hypernyms) -0.17240 0.05855 -2.945 0.00334 **
## scale(log(hyponyms + 1)) -0.07404 0.05694 -1.300 0.19389
## childes_adult_log_freq 0.15559 0.03869 4.021 6.44e-05 ***
## concreteness -0.35594 0.07867 -4.524 7.15e-06 ***
## neighbour_concentration -5.71449 3.90352 -1.464 0.14368
## arc -0.67208 0.57405 -1.171 0.24211
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.302 on 679 degrees of freedom
## (42 observations deleted due to missingness)
## Multiple R-squared: 0.1185, Adjusted R-squared: 0.1107
## F-statistic: 15.21 on 6 and 679 DF, p-value: < 2.2e-16
# Baseline model plus dominant part of speech as a factor
# (1 = Noun [reference level], 2 = Verb, 3 = Adjective).
all_words_with_pos <- all_words_kuperman_diff %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(
    dom_pos_factor = factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      dom_pos == "Adjective" ~ 3
    ))
  ) %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor,
    data = .
  )
summary(all_words_with_pos)
##
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9850 -0.7163 -0.0659 0.6800 10.3210
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.20868 1.55623 -0.134 0.89337
## scale(hypernyms) -0.09491 0.06766 -1.403 0.16119
## scale(log(hyponyms + 1)) -0.03696 0.05622 -0.657 0.51110
## frequency_subtlex 0.37000 0.12641 2.927 0.00354 **
## concreteness -0.21875 0.08695 -2.516 0.01211 *
## neighbour_concentration -5.19785 3.92778 -1.323 0.18616
## arc -1.40396 0.79635 -1.763 0.07835 .
## dom_pos_factor2 0.33949 0.16773 2.024 0.04337 *
## dom_pos_factor3 0.10241 0.28214 0.363 0.71674
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.305 on 677 degrees of freedom
## (42 observations deleted due to missingness)
## Multiple R-squared: 0.1166, Adjusted R-squared: 0.1061
## F-statistic: 11.16 on 8 and 677 DF, p-value: 6.754e-15
# Full model: baseline predictors + part of speech + babiness, helpfulness and
# the two WordNet sense counts. lm() drops every row with a missing value in
# any of these predictors, so this model is fitted on fewer words.
all_words_with_pos_helpful <- all_words_kuperman_diff %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(
    dom_pos_factor = factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      dom_pos == "Adjective" ~ 3
    ))
  ) %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + babiness + helpfulness + n_synsets + n_definitions,
    data = .
  )
summary(all_words_with_pos_helpful)
##
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor + babiness + helpfulness + n_synsets +
## n_definitions, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3225 -0.5466 -0.0237 0.5439 7.3469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.750875 1.899809 -1.974 0.0491 *
## scale(hypernyms) 0.034111 0.079547 0.429 0.6683
## scale(log(hyponyms + 1)) -0.035697 0.062119 -0.575 0.5659
## frequency_subtlex 0.864104 0.155573 5.554 5.18e-08 ***
## concreteness 0.126892 0.133301 0.952 0.3417
## neighbour_concentration -2.887957 4.541251 -0.636 0.5252
## arc -2.146080 1.014993 -2.114 0.0351 *
## dom_pos_factor2 0.258929 0.237857 1.089 0.2770
## babiness 0.125603 0.031038 4.047 6.27e-05 ***
## helpfulness -0.366708 0.089081 -4.117 4.70e-05 ***
## n_synsets 0.023027 0.009996 2.304 0.0218 *
## n_definitions -0.025683 0.020657 -1.243 0.2145
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.078 on 388 degrees of freedom
## (328 observations deleted due to missingness)
## Multiple R-squared: 0.2282, Adjusted R-squared: 0.2063
## F-statistic: 10.43 on 11 and 388 DF, p-value: < 2.2e-16
# Collapse word type into two groups: CDI words vs. everything else.
filtered_calculated_aoas_typed <- mutate(
  filtered_calculated_aoas,
  type_broad = ifelse(type == "CDI", "CDI", "non-CDI")
)

# Calculated AoA against Kuperman AoA, with a separate linear fit per group.
ggplot(
  filtered_calculated_aoas_typed,
  mapping = aes(x = KupermanAoA, y = calculated_aoa_years, color = type_broad)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
It seems we should analyze CDI and non-CDI words separately, since the relationship between Kuperman AoA and calculated AoA looks very different in the two groups.
# CDI words only: regress calculated AoA on Kuperman AoA and keep the
# residuals as the AoA "difference" for this subset.
cdi_words_all_features <- filter(filtered_calculated_aoas, type == "CDI")

# na.exclude makes resid() return one value per input row (NA where the row
# could not be used in the fit), so the residuals always line up with the data
# frame in mutate() below. With no missing values the fit is unchanged.
base_mod_CDI <- lm(calculated_aoa_years ~ KupermanAoA,
                   data = cdi_words_all_features,
                   na.action = na.exclude)

CDI_kuperman_diff <- cdi_words_all_features %>%
  mutate(diff_aoas = resid(base_mod_CDI))
# CDI words: residual AoA difference against number of hypernyms.
ggplot(CDI_kuperman_diff, mapping = aes(x = hypernyms, y = diff_aoas)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

# CDI words: residual AoA difference against rated helpfulness.
ggplot(CDI_kuperman_diff, mapping = aes(x = helpfulness, y = diff_aoas)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
# Restrict CDI words to nouns, verbs and adjectives, and code part of speech as
# a factor (1 = Noun [reference level], 2 = Verb, 3 = Adjective). All CDI
# models below are fitted on this data frame.
CDI_kuperman_diff_filtered <- CDI_kuperman_diff %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(
    dom_pos_factor = factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      dom_pos == "Adjective" ~ 3
    ))
  )

# Baseline CDI model with SUBTLEX frequency.
CDI_kuperman_diff_base <- CDI_kuperman_diff_filtered %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc,
    data = .
  )
summary(CDI_kuperman_diff_base)
##
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.03694 -0.18233 0.00858 0.17994 0.80625
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0468630 0.4676315 0.100 0.9202
## scale(hypernyms) -0.0378966 0.0196006 -1.933 0.0539 .
## scale(log(hyponyms + 1)) -0.0004546 0.0167676 -0.027 0.9784
## frequency_subtlex 0.0781497 0.0397402 1.967 0.0500 *
## concreteness -0.0086657 0.0345472 -0.251 0.8021
## neighbour_concentration -0.0205250 1.1104706 -0.018 0.9853
## arc -0.6140758 0.2416764 -2.541 0.0114 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.287 on 387 degrees of freedom
## (29 observations deleted due to missingness)
## Multiple R-squared: 0.04746, Adjusted R-squared: 0.03269
## F-statistic: 3.214 on 6 and 387 DF, p-value: 0.004312
# Baseline CDI model with CHILDES adult log frequency instead of SUBTLEX.
CDI_kuperman_diff_childes <- CDI_kuperman_diff_filtered %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      childes_adult_log_freq + concreteness + neighbour_concentration + arc,
    data = .
  )
summary(CDI_kuperman_diff_childes)
##
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + childes_adult_log_freq + concreteness + neighbour_concentration +
## arc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.02428 -0.18568 0.00169 0.17340 0.78069
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.257130 0.460926 0.558 0.5773
## scale(hypernyms) -0.045622 0.019270 -2.368 0.0184 *
## scale(log(hyponyms + 1)) 0.002409 0.017033 0.141 0.8876
## childes_adult_log_freq 0.006734 0.014182 0.475 0.6352
## concreteness -0.026375 0.033584 -0.785 0.4327
## neighbour_concentration -0.038208 1.116624 -0.034 0.9727
## arc -0.298512 0.181972 -1.640 0.1017
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2883 on 387 degrees of freedom
## (29 observations deleted due to missingness)
## Multiple R-squared: 0.0385, Adjusted R-squared: 0.02359
## F-statistic: 2.583 on 6 and 387 DF, p-value: 0.01824
# The two models swap frequency_subtlex for childes_adult_log_freq rather than
# adding a term, so they are not nested and have the same residual df. anova()
# therefore reports only the RSS difference (Df = 0), with no F statistic or
# p-value; compare the RSS values descriptively.
anova(CDI_kuperman_diff_base, CDI_kuperman_diff_childes)
## Analysis of Variance Table
##
## Model 1: diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex +
## concreteness + neighbour_concentration + arc
## Model 2: diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) + childes_adult_log_freq +
## concreteness + neighbour_concentration + arc
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 387 31.867
## 2 387 32.167 0 -0.2997
# CDI baseline model plus part of speech.
CDI_kuperman_diff_pos <- CDI_kuperman_diff_filtered %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor,
    data = .
  )
summary(CDI_kuperman_diff_pos)
##
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.05096 -0.17667 0.01129 0.18169 0.78824
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1655816 0.4848964 0.341 0.73293
## scale(hypernyms) -0.0493641 0.0236466 -2.088 0.03749 *
## scale(log(hyponyms + 1)) -0.0009144 0.0167736 -0.055 0.95655
## frequency_subtlex 0.0817653 0.0399881 2.045 0.04156 *
## concreteness -0.0225603 0.0373953 -0.603 0.54667
## neighbour_concentration -0.1599139 1.1171573 -0.143 0.88625
## arc -0.6317641 0.2420696 -2.610 0.00941 **
## dom_pos_factor2 -0.0654387 0.0623090 -1.050 0.29427
## dom_pos_factor3 0.1472005 0.1713815 0.859 0.39093
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2869 on 385 degrees of freedom
## (29 observations deleted due to missingness)
## Multiple R-squared: 0.05303, Adjusted R-squared: 0.03336
## F-statistic: 2.695 on 8 and 385 DF, p-value: 0.006781
# CDI baseline model plus part of speech and rated helpfulness.
CDI_kuperman_diff_pos_helpful <- CDI_kuperman_diff_filtered %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + helpfulness,
    data = .
  )
summary(CDI_kuperman_diff_pos_helpful)
##
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor + helpfulness, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.89267 -0.12702 0.00387 0.15013 0.73962
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.96512 0.42312 2.281 0.023097 *
## scale(hypernyms) -0.06206 0.02039 -3.044 0.002498 **
## scale(log(hyponyms + 1)) -0.01939 0.01453 -1.335 0.182789
## frequency_subtlex 0.12278 0.03461 3.548 0.000437 ***
## concreteness -0.03751 0.03222 -1.164 0.245116
## neighbour_concentration -0.43911 0.96218 -0.456 0.648380
## arc -0.66265 0.20844 -3.179 0.001597 **
## dom_pos_factor2 -0.02744 0.05375 -0.510 0.610005
## dom_pos_factor3 0.05639 0.14777 0.382 0.702954
## helpfulness -0.23209 0.01995 -11.633 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.247 on 384 degrees of freedom
## (29 observations deleted due to missingness)
## Multiple R-squared: 0.2998, Adjusted R-squared: 0.2834
## F-statistic: 18.27 on 9 and 384 DF, p-value: < 2.2e-16
# Largest CDI model: adds babiness and the two WordNet sense counts on top of
# part of speech and helpfulness. Rows missing any of these are dropped by
# lm(), so it is fitted on fewer words than the models above.
CDI_kuperman_diff_pos_helpful_babiness_synsets <- CDI_kuperman_diff_filtered %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + helpfulness + babiness + n_synsets + n_definitions,
    data = .
  )
summary(CDI_kuperman_diff_pos_helpful_babiness_synsets)
##
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor + helpfulness + babiness + n_synsets +
## n_definitions, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.88297 -0.13110 0.00425 0.14903 0.75059
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7734898 0.4665918 1.658 0.09831 .
## scale(hypernyms) -0.0597568 0.0229642 -2.602 0.00968 **
## scale(log(hyponyms + 1)) -0.0238444 0.0156846 -1.520 0.12940
## frequency_subtlex 0.1256374 0.0405908 3.095 0.00213 **
## concreteness -0.0315968 0.0346254 -0.913 0.36215
## neighbour_concentration 0.0390665 1.1036098 0.035 0.97178
## arc -0.6937365 0.2538741 -2.733 0.00662 **
## dom_pos_factor2 -0.0203306 0.0638641 -0.318 0.75043
## helpfulness -0.2348867 0.0222147 -10.573 < 2e-16 ***
## babiness -0.0018235 0.0077741 -0.235 0.81469
## n_synsets 0.0010570 0.0031582 0.335 0.73807
## n_definitions 0.0006282 0.0059635 0.105 0.91616
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2526 on 332 degrees of freedom
## (79 observations deleted due to missingness)
## Multiple R-squared: 0.2882, Adjusted R-squared: 0.2646
## F-statistic: 12.22 on 11 and 332 DF, p-value: < 2.2e-16
# Sequential F-tests over nested CDI models:
# baseline -> + dom_pos_factor -> + helpfulness.
anova(CDI_kuperman_diff_base, CDI_kuperman_diff_pos, CDI_kuperman_diff_pos_helpful)
## Analysis of Variance Table
##
## Model 1: diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex +
## concreteness + neighbour_concentration + arc
## Model 2: diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex +
## concreteness + neighbour_concentration + arc + dom_pos_factor
## Model 3: diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex +
## concreteness + neighbour_concentration + arc + dom_pos_factor +
## helpfulness
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 387 31.867
## 2 385 31.681 2 0.1865 1.5283 0.2182
## 3 384 23.425 1 8.2559 135.3361 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
For CDI words, adding PoS doesn’t improve model fit (p = .22), but adding helpfulness does (p < .001).
# Mean split on concreteness: 0 = below the mean, 1 = at or above it.
# The mean here is taken over all words in the data frame.
filtered_calculated_aoas_typed <- mutate(
  filtered_calculated_aoas_typed,
  concreteness_meansplit = as.numeric(
    concreteness >= mean(concreteness, na.rm = TRUE)
  )
)

# Non-CDI words only: calculated vs. Kuperman AoA, points coloured by the
# concreteness split, with a single linear fit across all points.
ggplot(
  filter(filtered_calculated_aoas_typed, type_broad == "non-CDI"),
  mapping = aes(x = KupermanAoA, y = calculated_aoa_years)
) +
  geom_point(aes(color = as.factor(concreteness_meansplit))) +
  geom_smooth(method = "lm") +
  theme_classic()
# Non-CDI words only: regress calculated AoA on Kuperman AoA and keep the
# residuals as the AoA "difference" for this subset.
non_CDI_all_features <- filter(filtered_calculated_aoas, type != "CDI")

# na.exclude makes resid() return one value per input row (NA where the row
# could not be used in the fit), so the residuals always line up with the data
# frame in mutate() below. With no missing values the fit is unchanged.
base_mod_nonCDI <- lm(calculated_aoa_years ~ KupermanAoA,
                      data = non_CDI_all_features,
                      na.action = na.exclude)

# concreteness_meansplit: 0 = below the non-CDI mean concreteness, 1 = at or
# above it.
nonCDI_kuperman_diff <- non_CDI_all_features %>%
  mutate(
    diff_aoas = resid(base_mod_nonCDI),
    concreteness_meansplit = ifelse(
      concreteness < mean(concreteness, na.rm = TRUE), 0, 1
    )
  )
# Non-CDI words: residual AoA difference against number of hypernyms.
ggplot(nonCDI_kuperman_diff, mapping = aes(x = hypernyms, y = diff_aoas)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

# Non-CDI words: residual AoA difference against rated helpfulness.
ggplot(nonCDI_kuperman_diff, mapping = aes(x = helpfulness, y = diff_aoas)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()
# Baseline non-CDI model. Rows without a helpfulness rating are removed up
# front even though helpfulness is not a predictor here -- presumably so this
# model uses the same words as the helpfulness model it is later compared with.
base_diff_nonCDI <- nonCDI_kuperman_diff %>%
  filter(
    dom_pos %in% c("Noun", "Verb", "Adjective"),
    !is.na(helpfulness)
  ) %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc,
    data = .
  )
summary(base_diff_nonCDI)
##
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2708 -0.8760 -0.2008 0.7648 10.2389
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.51198 2.78761 1.619 0.10664
## scale(hypernyms) -0.44253 0.09759 -4.535 8.51e-06 ***
## scale(log(hyponyms + 1)) -0.28382 0.09867 -2.876 0.00433 **
## frequency_subtlex 0.02061 0.21277 0.097 0.92291
## concreteness -0.60670 0.13105 -4.629 5.58e-06 ***
## neighbour_concentration -3.91530 7.33987 -0.533 0.59415
## arc -1.23298 1.37411 -0.897 0.37032
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.527 on 285 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.2016, Adjusted R-squared: 0.1848
## F-statistic: 12 on 6 and 285 DF, p-value: 5.155e-12
# Non-CDI baseline model plus part of speech
# (1 = Noun [reference level], 2 = Verb, 3 = Adjective).
# Rows without a helpfulness rating are excluded, as in the baseline model.
pos_diff_nonCDI <- nonCDI_kuperman_diff %>%
  filter(
    dom_pos %in% c("Noun", "Verb", "Adjective"),
    !is.na(helpfulness)
  ) %>%
  mutate(
    dom_pos_factor = factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      dom_pos == "Adjective" ~ 3
    ))
  ) %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor,
    data = .
  )
summary(pos_diff_nonCDI)
##
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3849 -0.8224 -0.2603 0.6685 10.1474
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.315632 2.778860 1.193 0.233805
## scale(hypernyms) -0.304285 0.103574 -2.938 0.003577 **
## scale(log(hyponyms + 1)) -0.224735 0.098392 -2.284 0.023108 *
## frequency_subtlex -0.132037 0.215200 -0.614 0.540003
## concreteness -0.496752 0.134419 -3.696 0.000263 ***
## neighbour_concentration -2.203285 7.242030 -0.304 0.761172
## arc -0.225332 1.389417 -0.162 0.871282
## dom_pos_factor2 0.871128 0.250032 3.484 0.000572 ***
## dom_pos_factor3 -0.002668 0.353214 -0.008 0.993979
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.499 on 283 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.2353, Adjusted R-squared: 0.2137
## F-statistic: 10.89 on 8 and 283 DF, p-value: 2.243e-13
# Non-CDI model with part of speech and rated helpfulness. No explicit NA
# filter is needed here: lm() drops rows with a missing helpfulness rating
# because helpfulness is a predictor.
helpful_pos_diff_nonCDI <- nonCDI_kuperman_diff %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(
    dom_pos_factor = factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      dom_pos == "Adjective" ~ 3
    ))
  ) %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + helpfulness,
    data = .
  )
summary(helpful_pos_diff_nonCDI)
##
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor + helpfulness, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.5637 -0.8283 -0.2162 0.7413 9.8701
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.59213 2.77043 1.297 0.195829
## scale(hypernyms) -0.30104 0.10313 -2.919 0.003794 **
## scale(log(hyponyms + 1)) -0.18578 0.10012 -1.855 0.064569 .
## frequency_subtlex 0.04347 0.23372 0.186 0.852579
## concreteness -0.47475 0.13433 -3.534 0.000478 ***
## neighbour_concentration -2.45067 7.21107 -0.340 0.734224
## arc -0.75676 1.41187 -0.536 0.592380
## dom_pos_factor2 0.92216 0.25040 3.683 0.000276 ***
## dom_pos_factor3 0.13020 0.35869 0.363 0.716886
## helpfulness -0.25304 0.13467 -1.879 0.061282 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.493 on 282 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.2448, Adjusted R-squared: 0.2207
## F-statistic: 10.15 on 9 and 282 DF, p-value: 1.53e-13
# Non-CDI model with part of speech and the WordNet synset count.
# NOTE(review): despite the object name, helpfulness is not a predictor in this
# formula -- confirm whether it was meant to be included.
synsets_helpful_pos_diff_nonCDI <- nonCDI_kuperman_diff %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(
    dom_pos_factor = factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      dom_pos == "Adjective" ~ 3
    ))
  ) %>%
  lm(
    diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + n_synsets,
    data = .
  )
summary(synsets_helpful_pos_diff_nonCDI)
##
## Call:
## lm(formula = diff_aoas ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor + n_synsets, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3124 -0.7830 -0.1418 0.7013 10.3394
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.44081 3.01108 1.475 0.14156
## scale(hypernyms) -0.21622 0.11643 -1.857 0.06452 .
## scale(log(hyponyms + 1)) -0.11108 0.10728 -1.035 0.30152
## frequency_subtlex -0.31057 0.24587 -1.263 0.20776
## concreteness -0.59897 0.14477 -4.137 4.85e-05 ***
## neighbour_concentration -1.79178 7.91223 -0.226 0.82104
## arc -0.93559 1.47345 -0.635 0.52605
## dom_pos_factor2 0.84434 0.28885 2.923 0.00379 **
## n_synsets 0.04602 0.01643 2.801 0.00551 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.511 on 242 degrees of freedom
## (54 observations deleted due to missingness)
## Multiple R-squared: 0.2911, Adjusted R-squared: 0.2677
## F-statistic: 12.42 on 8 and 242 DF, p-value: 6.757e-15
# Sequential F-tests over nested non-CDI models:
# baseline -> + dom_pos_factor -> + helpfulness.
anova(base_diff_nonCDI, pos_diff_nonCDI, helpful_pos_diff_nonCDI)
## Analysis of Variance Table
##
## Model 1: diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex +
## concreteness + neighbour_concentration + arc
## Model 2: diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex +
## concreteness + neighbour_concentration + arc + dom_pos_factor
## Model 3: diff_aoas ~ scale(hypernyms) + scale(log(hyponyms + 1)) + frequency_subtlex +
## concreteness + neighbour_concentration + arc + dom_pos_factor +
## helpfulness
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 285 664.27
## 2 283 636.25 2 28.013 6.2857 0.002134 **
## 3 282 628.39 1 7.867 3.5305 0.061282 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
For non-CDI words, accounting for PoS improves model fit (p = .002); adding helpfulness on top of PoS yields only a marginal, non-significant improvement (p = .061).
# Words whose calculated AoA exceeds the Kuperman AoA by more than half a
# year, largest gap first, shown as an interactive table.
pos_diffs <- all_words_kuperman_diff %>%
  filter(diff_aoas > .5) %>%
  arrange(desc(diff_aoas)) %>%
  select(
    word = word.x, calculated_aoa_years, KupermanAoA,
    aoa_difference = diff_aoas,
    dom_pos, hypernyms, hyponyms, concreteness, helpfulness, babiness,
    childes_adult_log_freq, arc
  )
DT::datatable(pos_diffs)
# Words whose calculated AoA falls below the Kuperman AoA by more than half a
# year, most negative gap first, shown as an interactive table.
neg_diffs <- all_words_kuperman_diff %>%
  filter(diff_aoas < -.5) %>%
  arrange(diff_aoas) %>%
  select(
    word = word.x, calculated_aoa_years, KupermanAoA,
    aoa_difference = diff_aoas,
    dom_pos, hypernyms, hyponyms, concreteness, helpfulness, babiness,
    childes_adult_log_freq, arc
  )
DT::datatable(neg_diffs)
# Words whose calculated AoA is within half a year of the Kuperman AoA,
# ordered from the largest positive gap downwards.
# The bounds are inclusive so that a word with a difference of exactly +/-0.5
# lands in this table instead of being missing from all three tables (the
# other two use strict > .5 and < -.5).
pretty_good <- all_words_kuperman_diff %>%
  filter(abs(diff_aoas) <= .5) %>%
  select(
    word = word.x, calculated_aoa_years, KupermanAoA,
    aoa_difference = diff_aoas,
    dom_pos, hypernyms, hyponyms, concreteness, helpfulness, babiness,
    childes_adult_log_freq, arc
  ) %>%
  arrange(desc(aoa_difference))
DT::datatable(pretty_good)