calculated_aoa_years: AoA, calculated using Braginsky & Yurovsky Wordbank script
KupermanAoA: adult-estimated AoA, from Kuperman norms
hypernyms: how many words are superordinate to this word? (per wordnet)
hyponyms: how many words are subordinate to this word? (per wordnet)
concreteness: how concrete (i.e. tangible) is the word’s referent, scale of 1 (abstract) to 5 (concrete), from Brysbaert norms
frequency: word frequency from SUBTLEX
neighbour_concentration: measure of semantic density (how concentrated is local word neighborhood), from Thompson (unpublished)
arc: measure of semantic density (average radius of co-occurrence), from HiDEx (Shaoul & Westbury)
ncount: measure of semantic density (neighbor count), from HiDEx (Shaoul & Westbury)
helpfulness: how helpful would it be for a preschooler to know this word, scale of 1 (not helpful) to 5 (very helpful), collected on mturk
babiness: how much is this word associated with babies, scale of 1 to 10, from Perry et al. (2015)
n_synsets: how many synsets are there for this word on wordnet?
n_definitions: how many definitions are there for this word (as this part of speech) on WordNet? e.g., dog.n.01, dog.n.02, etc.
# Raw predictor columns entering the correlation analysis. Hyponym counts are
# log-transformed (+1 so that words with zero hyponyms stay defined).
corr_data <- filtered_calculated_aoas %>%
  mutate(log_hyponyms = log(hyponyms + 1)) %>%
  select(
    all_ages_says, all_ages_understands, calculated_aoa_years, KupermanAoA,
    hypernyms, log_hyponyms, concreteness,
    frequency_subtlex, childes_adult_log_freq, childes_kid_log_freq,
    n_synsets, n_definitions,
    neighbour_concentration, arc, ncount, helpfulness, babiness
  )

# Pairwise Pearson correlations; each pair uses the rows where both are present.
corr_vars <- cor(corr_data, use = "pairwise.complete.obs", method = "pearson")

# Significance tests have to be computed from the raw observations.
# Passing the correlation matrix itself (as before) tests correlations between
# the *columns of the correlation matrix*, which is not what the plot reports.
p.mat <- cor.mtest(corr_data)
pMatrix <- p.mat$p

# Lower-triangle heat map; coefficients with p >= .05 are left blank.
corrplot(
  corr_vars,
  method = "color", type = "lower", diag = TRUE,
  addCoef.col = "black", tl.col = "black",
  number.font = 2, number.cex = .6,
  p.mat = pMatrix, sig.level = 0.05, insig = "blank"
)
# Helpfulness ratings against other measures, each with a fitted linear trend.
# The first three panels use every word in filtered_calculated_aoas.
ggplot(
  data = filtered_calculated_aoas,
  mapping = aes(x = KupermanAoA, y = helpfulness)
) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

ggplot(
  data = filtered_calculated_aoas,
  mapping = aes(x = hypernyms, y = helpfulness)
) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

ggplot(
  data = filtered_calculated_aoas,
  mapping = aes(x = calculated_aoa_years, y = helpfulness)
) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

# The remaining three panels drop the CDI words.
filtered_calculated_aoas %>%
  filter(type != "CDI") %>%
  ggplot(aes(x = KupermanAoA, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

filtered_calculated_aoas %>%
  filter(type != "CDI") %>%
  ggplot(aes(x = hypernyms, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

filtered_calculated_aoas %>%
  filter(type != "CDI") %>%
  ggplot(aes(x = all_ages_says, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
# Regress the Wordbank-derived AoA on the adult-estimated (Kuperman) AoA and
# keep the residuals as the outcome for the models that follow.
# na.exclude pads the residual vector with NA for rows the fit had to drop, so
# it always has one entry per row of the data and mutate() cannot fail on a
# length mismatch. The fit itself is unchanged.
base_mod <- lm(
  calculated_aoa_years ~ KupermanAoA,
  data = filtered_calculated_aoas,
  na.action = na.exclude
)
all_words_kuperman_resid <- filtered_calculated_aoas %>%
  mutate(calculated_aoa_years.Kuperman = resid(base_mod))
#base_mod_scaled_data <- lm(calculated_aoa_years ~ KupermanAoA, data = awaf_scaled)
#scaled_pos_resid <- awaf_scaled %>%
# ungroup() %>%
# mutate(calculated_aoa_years.Kuperman = resid(base_mod_scaled_data))
# Kuperman-residualised AoA (all words) against hypernym count and helpfulness.
all_words_kuperman_resid %>%
  ggplot(aes(x = hypernyms, y = calculated_aoa_years.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

all_words_kuperman_resid %>%
  ggplot(aes(x = helpfulness, y = calculated_aoa_years.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
# All words (nouns, verbs, adjectives only): residualised AoA predicted from
# taxonomic, SUBTLEX frequency, concreteness and semantic-density measures.
all_words_no_pos <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      frequency_subtlex +
      concreteness +
      neighbour_concentration +
      arc,
    data = .
  )
summary(all_words_no_pos)
##
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) +
## scale(log(hyponyms + 1)) + frequency_subtlex + concreteness +
## neighbour_concentration + arc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.8586 -0.6677 -0.1402 0.5504 10.2412
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.58167 1.42250 2.518 0.01204 *
## scale(hypernyms) -0.17046 0.05541 -3.076 0.00218 **
## scale(log(hyponyms + 1)) -0.10982 0.05209 -2.108 0.03537 *
## frequency_subtlex 0.03512 0.11504 0.305 0.76021
## concreteness -0.46912 0.07664 -6.121 1.57e-09 ***
## neighbour_concentration -4.13646 3.63722 -1.137 0.25583
## arc -0.36300 0.72635 -0.500 0.61741
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.213 on 679 degrees of freedom
## (42 observations deleted due to missingness)
## Multiple R-squared: 0.1406, Adjusted R-squared: 0.133
## F-statistic: 18.51 on 6 and 679 DF, p-value: < 2.2e-16
# Same model as all_words_no_pos, but with CHILDES adult log frequency in place
# of SUBTLEX frequency.
all_words_no_pos_childes <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      childes_adult_log_freq +
      concreteness +
      neighbour_concentration +
      arc,
    data = .
  )
summary(all_words_no_pos_childes)
##
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) +
## scale(log(hyponyms + 1)) + childes_adult_log_freq + concreteness +
## neighbour_concentration + arc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9169 -0.6692 -0.1311 0.5596 10.1467
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.77298 1.38868 2.717 0.00676 **
## scale(hypernyms) -0.17826 0.05454 -3.268 0.00114 **
## scale(log(hyponyms + 1)) -0.09919 0.05304 -1.870 0.06192 .
## childes_adult_log_freq -0.02367 0.03605 -0.657 0.51167
## concreteness -0.47442 0.07329 -6.473 1.84e-10 ***
## neighbour_concentration -4.23147 3.63661 -1.164 0.24500
## arc -0.05691 0.53480 -0.106 0.91528
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.213 on 679 degrees of freedom
## (42 observations deleted due to missingness)
## Multiple R-squared: 0.141, Adjusted R-squared: 0.1334
## F-statistic: 18.58 on 6 and 679 DF, p-value: < 2.2e-16
# Adds dominant part of speech as a factor (1 = Noun, 2 = Verb, 3 = Adjective)
# to the SUBTLEX-frequency model.
all_words_with_pos <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(
    dom_pos_factor = factor(
      case_when(
        dom_pos == "Noun" ~ 1,
        dom_pos == "Verb" ~ 2,
        TRUE ~ 3
      )
    )
  ) %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      frequency_subtlex +
      concreteness +
      neighbour_concentration +
      arc +
      dom_pos_factor,
    data = .
  )
summary(all_words_with_pos)
##
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) +
## scale(log(hyponyms + 1)) + frequency_subtlex + concreteness +
## neighbour_concentration + arc + dom_pos_factor, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.8734 -0.6630 -0.1419 0.5387 10.2348
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.19984 1.44576 2.213 0.0272 *
## scale(hypernyms) -0.12208 0.06286 -1.942 0.0525 .
## scale(log(hyponyms + 1)) -0.10298 0.05223 -1.972 0.0491 *
## frequency_subtlex 0.01282 0.11744 0.109 0.9131
## concreteness -0.43186 0.08078 -5.346 1.23e-07 ***
## neighbour_concentration -3.64991 3.64895 -1.000 0.3175
## arc -0.21451 0.73982 -0.290 0.7719
## dom_pos_factor2 0.25445 0.15583 1.633 0.1030
## dom_pos_factor3 0.09488 0.26211 0.362 0.7175
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.212 on 677 degrees of freedom
## (42 observations deleted due to missingness)
## Multiple R-squared: 0.144, Adjusted R-squared: 0.1339
## F-statistic: 14.23 on 8 and 677 DF, p-value: < 2.2e-16
# Full all-words model: part of speech plus babiness, helpfulness and the two
# WordNet sense counts on top of the base predictors.
all_words_with_pos_helpful <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(
    dom_pos_factor = factor(
      case_when(
        dom_pos == "Noun" ~ 1,
        dom_pos == "Verb" ~ 2,
        TRUE ~ 3
      )
    )
  ) %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      frequency_subtlex +
      concreteness +
      neighbour_concentration +
      arc +
      dom_pos_factor +
      babiness +
      helpfulness +
      n_synsets +
      n_definitions,
    data = .
  )
summary(all_words_with_pos_helpful)
##
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) +
## scale(log(hyponyms + 1)) + frequency_subtlex + concreteness +
## neighbour_concentration + arc + dom_pos_factor + babiness +
## helpfulness + n_synsets + n_definitions, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0886 -0.4341 -0.0438 0.3891 7.3779
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.880417 1.622888 -0.543 0.58779
## scale(hypernyms) 0.004936 0.067952 0.073 0.94213
## scale(log(hyponyms + 1)) -0.067475 0.053064 -1.272 0.20429
## frequency_subtlex 0.697439 0.132897 5.248 2.54e-07 ***
## concreteness 0.037764 0.113871 0.332 0.74034
## neighbour_concentration -2.014398 3.879308 -0.519 0.60387
## arc -1.915973 0.867046 -2.210 0.02770 *
## dom_pos_factor2 0.220237 0.203186 1.084 0.27908
## babiness 0.078121 0.026514 2.946 0.00341 **
## helpfulness -0.364523 0.076097 -4.790 2.37e-06 ***
## n_synsets 0.024889 0.008539 2.915 0.00377 **
## n_definitions -0.029496 0.017646 -1.672 0.09542 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9206 on 388 degrees of freedom
## (328 observations deleted due to missingness)
## Multiple R-squared: 0.2365, Adjusted R-squared: 0.2148
## F-statistic: 10.93 on 11 and 388 DF, p-value: < 2.2e-16
# Collapse word type into a two-level label: CDI vs. everything else.
filtered_calculated_aoas_typed <- mutate(
  filtered_calculated_aoas,
  type_broad = ifelse(type == "CDI", "CDI", "non-CDI")
)

# Wordbank AoA against Kuperman AoA, with a separate linear fit per word type.
filtered_calculated_aoas_typed %>%
  ggplot(aes(x = KupermanAoA, y = calculated_aoa_years, color = type_broad)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
Seems like we should look at CDI and non-CDI words separately, as very different things are going on.
# CDI words only: residualise Wordbank AoA on Kuperman AoA within this subset.
cdi_words_all_features <- filter(filtered_calculated_aoas, type == "CDI")
# na.exclude pads residuals with NA for rows dropped from the fit, so the
# residual vector always matches the number of rows handed to mutate().
base_mod_CDI <- lm(
  calculated_aoa_years ~ KupermanAoA,
  data = cdi_words_all_features,
  na.action = na.exclude
)
CDI_kuperman_resid <- cdi_words_all_features %>%
  mutate(calculated_aoa_years.Kuperman = resid(base_mod_CDI))
# CDI words: residualised AoA against hypernym count and helpfulness.
CDI_kuperman_resid %>%
  ggplot(aes(x = hypernyms, y = calculated_aoa_years.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

CDI_kuperman_resid %>%
  ggplot(aes(x = helpfulness, y = calculated_aoa_years.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
# Shared data for all CDI models: nouns, verbs and adjectives only, with part
# of speech coded as a factor (1 = Noun, 2 = Verb, 3 = Adjective).
CDI_kuperman_resid_filtered <- CDI_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(
    dom_pos_factor = factor(
      case_when(
        dom_pos == "Noun" ~ 1,
        dom_pos == "Verb" ~ 2,
        TRUE ~ 3
      )
    )
  )

# Base CDI model with SUBTLEX frequency.
CDI_kuperman_resid_base <- CDI_kuperman_resid_filtered %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      frequency_subtlex +
      concreteness +
      neighbour_concentration +
      arc,
    data = .
  )
summary(CDI_kuperman_resid_base)
##
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) +
## scale(log(hyponyms + 1)) + frequency_subtlex + concreteness +
## neighbour_concentration + arc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.03694 -0.18233 0.00858 0.17994 0.80625
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0468630 0.4676315 0.100 0.9202
## scale(hypernyms) -0.0378966 0.0196006 -1.933 0.0539 .
## scale(log(hyponyms + 1)) -0.0004546 0.0167676 -0.027 0.9784
## frequency_subtlex 0.0781497 0.0397402 1.967 0.0500 *
## concreteness -0.0086657 0.0345472 -0.251 0.8021
## neighbour_concentration -0.0205250 1.1104706 -0.018 0.9853
## arc -0.6140758 0.2416764 -2.541 0.0114 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.287 on 387 degrees of freedom
## (29 observations deleted due to missingness)
## Multiple R-squared: 0.04746, Adjusted R-squared: 0.03269
## F-statistic: 3.214 on 6 and 387 DF, p-value: 0.004312
# Base CDI model with CHILDES adult log frequency instead of SUBTLEX frequency.
CDI_kuperman_resid_childes <- CDI_kuperman_resid_filtered %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      childes_adult_log_freq +
      concreteness +
      neighbour_concentration +
      arc,
    data = .
  )
summary(CDI_kuperman_resid_childes)
##
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) +
## scale(log(hyponyms + 1)) + childes_adult_log_freq + concreteness +
## neighbour_concentration + arc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.02428 -0.18568 0.00169 0.17340 0.78069
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.257130 0.460926 0.558 0.5773
## scale(hypernyms) -0.045622 0.019270 -2.368 0.0184 *
## scale(log(hyponyms + 1)) 0.002409 0.017033 0.141 0.8876
## childes_adult_log_freq 0.006734 0.014182 0.475 0.6352
## concreteness -0.026375 0.033584 -0.785 0.4327
## neighbour_concentration -0.038208 1.116624 -0.034 0.9727
## arc -0.298512 0.181972 -1.640 0.1017
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2883 on 387 degrees of freedom
## (29 observations deleted due to missingness)
## Multiple R-squared: 0.0385, Adjusted R-squared: 0.02359
## F-statistic: 2.583 on 6 and 387 DF, p-value: 0.01824
# NOTE(review): these two models are not nested -- they swap frequency_subtlex
# for childes_adult_log_freq -- so they have the same residual df and anova()
# can only report the RSS difference, not an F test.
anova(
  CDI_kuperman_resid_base,
  CDI_kuperman_resid_childes
)
## Analysis of Variance Table
##
## Model 1: calculated_aoa_years.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc
## Model 2: calculated_aoa_years.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + childes_adult_log_freq + concreteness + neighbour_concentration +
## arc
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 387 31.867
## 2 387 32.167 0 -0.2997
# CDI model: base predictors plus dominant part of speech.
CDI_kuperman_resid_pos <- CDI_kuperman_resid_filtered %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      frequency_subtlex +
      concreteness +
      neighbour_concentration +
      arc +
      dom_pos_factor,
    data = .
  )
summary(CDI_kuperman_resid_pos)
##
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) +
## scale(log(hyponyms + 1)) + frequency_subtlex + concreteness +
## neighbour_concentration + arc + dom_pos_factor, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.05096 -0.17667 0.01129 0.18169 0.78824
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1655816 0.4848964 0.341 0.73293
## scale(hypernyms) -0.0493641 0.0236466 -2.088 0.03749 *
## scale(log(hyponyms + 1)) -0.0009144 0.0167736 -0.055 0.95655
## frequency_subtlex 0.0817653 0.0399881 2.045 0.04156 *
## concreteness -0.0225603 0.0373953 -0.603 0.54667
## neighbour_concentration -0.1599139 1.1171573 -0.143 0.88625
## arc -0.6317641 0.2420696 -2.610 0.00941 **
## dom_pos_factor2 -0.0654387 0.0623090 -1.050 0.29427
## dom_pos_factor3 0.1472005 0.1713815 0.859 0.39093
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2869 on 385 degrees of freedom
## (29 observations deleted due to missingness)
## Multiple R-squared: 0.05303, Adjusted R-squared: 0.03336
## F-statistic: 2.695 on 8 and 385 DF, p-value: 0.006781
# CDI model: base predictors, part of speech, and helpfulness rating.
CDI_kuperman_resid_pos_helpful <- CDI_kuperman_resid_filtered %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      frequency_subtlex +
      concreteness +
      neighbour_concentration +
      arc +
      dom_pos_factor +
      helpfulness,
    data = .
  )
summary(CDI_kuperman_resid_pos_helpful)
##
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) +
## scale(log(hyponyms + 1)) + frequency_subtlex + concreteness +
## neighbour_concentration + arc + dom_pos_factor + helpfulness,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.89267 -0.12702 0.00387 0.15013 0.73962
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.96512 0.42312 2.281 0.023097 *
## scale(hypernyms) -0.06206 0.02039 -3.044 0.002498 **
## scale(log(hyponyms + 1)) -0.01939 0.01453 -1.335 0.182789
## frequency_subtlex 0.12278 0.03461 3.548 0.000437 ***
## concreteness -0.03751 0.03222 -1.164 0.245116
## neighbour_concentration -0.43911 0.96218 -0.456 0.648380
## arc -0.66265 0.20844 -3.179 0.001597 **
## dom_pos_factor2 -0.02744 0.05375 -0.510 0.610005
## dom_pos_factor3 0.05639 0.14777 0.382 0.702954
## helpfulness -0.23209 0.01995 -11.633 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.247 on 384 degrees of freedom
## (29 observations deleted due to missingness)
## Multiple R-squared: 0.2998, Adjusted R-squared: 0.2834
## F-statistic: 18.27 on 9 and 384 DF, p-value: < 2.2e-16
# Full CDI model: adds babiness and the two WordNet sense counts to the
# part-of-speech + helpfulness model.
CDI_kuperman_resid_pos_helpful_babiness_synsets <- CDI_kuperman_resid_filtered %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      frequency_subtlex +
      concreteness +
      neighbour_concentration +
      arc +
      dom_pos_factor +
      helpfulness +
      babiness +
      n_synsets +
      n_definitions,
    data = .
  )
summary(CDI_kuperman_resid_pos_helpful_babiness_synsets)
##
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) +
## scale(log(hyponyms + 1)) + frequency_subtlex + concreteness +
## neighbour_concentration + arc + dom_pos_factor + helpfulness +
## babiness + n_synsets + n_definitions, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.88297 -0.13110 0.00425 0.14903 0.75059
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7734898 0.4665918 1.658 0.09831 .
## scale(hypernyms) -0.0597568 0.0229642 -2.602 0.00968 **
## scale(log(hyponyms + 1)) -0.0238444 0.0156846 -1.520 0.12940
## frequency_subtlex 0.1256374 0.0405908 3.095 0.00213 **
## concreteness -0.0315968 0.0346254 -0.913 0.36215
## neighbour_concentration 0.0390665 1.1036098 0.035 0.97178
## arc -0.6937365 0.2538741 -2.733 0.00662 **
## dom_pos_factor2 -0.0203306 0.0638641 -0.318 0.75043
## helpfulness -0.2348867 0.0222147 -10.573 < 2e-16 ***
## babiness -0.0018235 0.0077741 -0.235 0.81469
## n_synsets 0.0010570 0.0031582 0.335 0.73807
## n_definitions 0.0006282 0.0059635 0.105 0.91616
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2526 on 332 degrees of freedom
## (79 observations deleted due to missingness)
## Multiple R-squared: 0.2882, Adjusted R-squared: 0.2646
## F-statistic: 12.22 on 11 and 332 DF, p-value: < 2.2e-16
# Sequential comparison of the nested CDI models:
# base -> + part of speech -> + helpfulness.
anova(
  CDI_kuperman_resid_base,
  CDI_kuperman_resid_pos,
  CDI_kuperman_resid_pos_helpful
)
## Analysis of Variance Table
##
## Model 1: calculated_aoa_years.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc
## Model 2: calculated_aoa_years.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor
## Model 3: calculated_aoa_years.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor + helpfulness
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 387 31.867
## 2 385 31.681 2 0.1865 1.5283 0.2182
## 3 384 23.425 1 8.2559 135.3361 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
For CDI words, adding PoS does not significantly improve model fit (p = .22), but adding helpfulness does (p < .001).
# Binary concreteness indicator: 0 below the mean over all words in this table,
# 1 otherwise.
filtered_calculated_aoas_typed <- mutate(
  filtered_calculated_aoas_typed,
  concreteness_meansplit = ifelse(
    concreteness < mean(concreteness, na.rm = TRUE), 0, 1
  )
)

# Non-CDI words: Wordbank AoA against Kuperman AoA, points coloured by the
# concreteness split, with a single linear fit over all points.
filtered_calculated_aoas_typed %>%
  filter(type_broad == "non-CDI") %>%
  ggplot(aes(x = KupermanAoA, y = calculated_aoa_years)) +
  geom_point(aes(color = as.factor(concreteness_meansplit))) +
  geom_smooth(method = lm) +
  theme_classic()
# Non-CDI words only: residualise Wordbank AoA on Kuperman AoA within this subset.
non_CDI_all_features <- filter(filtered_calculated_aoas, type != "CDI")
# na.exclude pads residuals with NA for rows dropped from the fit, so the
# residual vector always matches the number of rows handed to mutate().
base_mod_nonCDI <- lm(
  calculated_aoa_years ~ KupermanAoA,
  data = non_CDI_all_features,
  na.action = na.exclude
)
# The concreteness split here uses the mean over non-CDI words only.
nonCDI_kuperman_resid <- non_CDI_all_features %>%
  mutate(
    calculated_aoa_years.Kuperman = resid(base_mod_nonCDI),
    concreteness_meansplit = ifelse(
      concreteness < mean(concreteness, na.rm = TRUE), 0, 1
    )
  )
# Non-CDI words: residualised AoA against hypernym count and helpfulness.
nonCDI_kuperman_resid %>%
  ggplot(aes(x = hypernyms, y = calculated_aoa_years.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()

nonCDI_kuperman_resid %>%
  ggplot(aes(x = helpfulness, y = calculated_aoa_years.Kuperman)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
# Base non-CDI model. Rows without a helpfulness rating are removed up front so
# this model is fit on the same words as the later helpfulness model.
base_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(
    dom_pos %in% c("Noun", "Verb", "Adjective"),
    !is.na(helpfulness)
  ) %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      frequency_subtlex +
      concreteness +
      neighbour_concentration +
      arc,
    data = .
  )
summary(base_resid_nonCDI)
##
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) +
## scale(log(hyponyms + 1)) + frequency_subtlex + concreteness +
## neighbour_concentration + arc, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2708 -0.8760 -0.2008 0.7648 10.2389
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.51198 2.78761 1.619 0.10664
## scale(hypernyms) -0.44253 0.09759 -4.535 8.51e-06 ***
## scale(log(hyponyms + 1)) -0.28382 0.09867 -2.876 0.00433 **
## frequency_subtlex 0.02061 0.21277 0.097 0.92291
## concreteness -0.60670 0.13105 -4.629 5.58e-06 ***
## neighbour_concentration -3.91530 7.33987 -0.533 0.59415
## arc -1.23298 1.37411 -0.897 0.37032
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.527 on 285 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.2016, Adjusted R-squared: 0.1848
## F-statistic: 12 on 6 and 285 DF, p-value: 5.155e-12
# Non-CDI model: base predictors plus dominant part of speech
# (1 = Noun, 2 = Verb, 3 = Adjective), on words that have a helpfulness rating.
pos_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(
    dom_pos %in% c("Noun", "Verb", "Adjective"),
    !is.na(helpfulness)
  ) %>%
  mutate(
    dom_pos_factor = factor(
      case_when(
        dom_pos == "Noun" ~ 1,
        dom_pos == "Verb" ~ 2,
        TRUE ~ 3
      )
    )
  ) %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      frequency_subtlex +
      concreteness +
      neighbour_concentration +
      arc +
      dom_pos_factor,
    data = .
  )
summary(pos_resid_nonCDI)
##
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) +
## scale(log(hyponyms + 1)) + frequency_subtlex + concreteness +
## neighbour_concentration + arc + dom_pos_factor, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3849 -0.8224 -0.2603 0.6685 10.1474
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.315632 2.778860 1.193 0.233805
## scale(hypernyms) -0.304285 0.103574 -2.938 0.003577 **
## scale(log(hyponyms + 1)) -0.224735 0.098392 -2.284 0.023108 *
## frequency_subtlex -0.132037 0.215200 -0.614 0.540003
## concreteness -0.496752 0.134419 -3.696 0.000263 ***
## neighbour_concentration -2.203285 7.242030 -0.304 0.761172
## arc -0.225332 1.389417 -0.162 0.871282
## dom_pos_factor2 0.871128 0.250032 3.484 0.000572 ***
## dom_pos_factor3 -0.002668 0.353214 -0.008 0.993979
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.499 on 283 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.2353, Adjusted R-squared: 0.2137
## F-statistic: 10.89 on 8 and 283 DF, p-value: 2.243e-13
# Non-CDI model: base predictors, part of speech, and helpfulness rating.
# Rows without a helpfulness rating are filtered out explicitly, exactly as in
# base_resid_nonCDI and pos_resid_nonCDI. This model is compared against those
# two with anova(), so all three must see the same rows; scale() is evaluated
# on the rows passed in, before lm() drops incomplete cases.
helpful_pos_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(
    dom_pos %in% c("Verb", "Noun", "Adjective"),
    !is.na(helpfulness)
  ) %>%
  mutate(
    dom_pos_factor = as.factor(
      ifelse(dom_pos == "Noun", 1, ifelse(dom_pos == "Verb", 2, 3))
    )
  ) %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      frequency_subtlex +
      concreteness +
      neighbour_concentration +
      arc +
      dom_pos_factor +
      helpfulness,
    data = .
  )
summary(helpful_pos_resid_nonCDI)
##
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) +
## scale(log(hyponyms + 1)) + frequency_subtlex + concreteness +
## neighbour_concentration + arc + dom_pos_factor + helpfulness,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.5637 -0.8283 -0.2162 0.7413 9.8701
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.59213 2.77043 1.297 0.195829
## scale(hypernyms) -0.30104 0.10313 -2.919 0.003794 **
## scale(log(hyponyms + 1)) -0.18578 0.10012 -1.855 0.064569 .
## frequency_subtlex 0.04347 0.23372 0.186 0.852579
## concreteness -0.47475 0.13433 -3.534 0.000478 ***
## neighbour_concentration -2.45067 7.21107 -0.340 0.734224
## arc -0.75676 1.41187 -0.536 0.592380
## dom_pos_factor2 0.92216 0.25040 3.683 0.000276 ***
## dom_pos_factor3 0.13020 0.35869 0.363 0.716886
## helpfulness -0.25304 0.13467 -1.879 0.061282 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.493 on 282 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.2448, Adjusted R-squared: 0.2207
## F-statistic: 10.15 on 9 and 282 DF, p-value: 1.53e-13
# Non-CDI model: base predictors, part of speech, and number of WordNet synsets.
# NOTE(review): despite "helpful" in the object name, this formula has no
# helpfulness term -- confirm whether that is intended.
synsets_helpful_pos_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(dom_pos %in% c("Noun", "Verb", "Adjective")) %>%
  mutate(
    dom_pos_factor = factor(
      case_when(
        dom_pos == "Noun" ~ 1,
        dom_pos == "Verb" ~ 2,
        TRUE ~ 3
      )
    )
  ) %>%
  lm(
    calculated_aoa_years.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      frequency_subtlex +
      concreteness +
      neighbour_concentration +
      arc +
      dom_pos_factor +
      n_synsets,
    data = .
  )
summary(synsets_helpful_pos_resid_nonCDI)
##
## Call:
## lm(formula = calculated_aoa_years.Kuperman ~ scale(hypernyms) +
## scale(log(hyponyms + 1)) + frequency_subtlex + concreteness +
## neighbour_concentration + arc + dom_pos_factor + n_synsets,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3124 -0.7830 -0.1418 0.7013 10.3394
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.44081 3.01108 1.475 0.14156
## scale(hypernyms) -0.21622 0.11643 -1.857 0.06452 .
## scale(log(hyponyms + 1)) -0.11108 0.10728 -1.035 0.30152
## frequency_subtlex -0.31057 0.24587 -1.263 0.20776
## concreteness -0.59897 0.14477 -4.137 4.85e-05 ***
## neighbour_concentration -1.79178 7.91223 -0.226 0.82104
## arc -0.93559 1.47345 -0.635 0.52605
## dom_pos_factor2 0.84434 0.28885 2.923 0.00379 **
## n_synsets 0.04602 0.01643 2.801 0.00551 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.511 on 242 degrees of freedom
## (54 observations deleted due to missingness)
## Multiple R-squared: 0.2911, Adjusted R-squared: 0.2677
## F-statistic: 12.42 on 8 and 242 DF, p-value: 6.757e-15
# Sequential comparison of the nested non-CDI models:
# base -> + part of speech -> + helpfulness.
anova(
  base_resid_nonCDI,
  pos_resid_nonCDI,
  helpful_pos_resid_nonCDI
)
## Analysis of Variance Table
##
## Model 1: calculated_aoa_years.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc
## Model 2: calculated_aoa_years.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor
## Model 3: calculated_aoa_years.Kuperman ~ scale(hypernyms) + scale(log(hyponyms +
## 1)) + frequency_subtlex + concreteness + neighbour_concentration +
## arc + dom_pos_factor + helpfulness
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 285 664.27
## 2 283 636.25 2 28.013 6.2857 0.002134 **
## 3 282 628.39 1 7.867 3.5305 0.061282 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
For non-CDI words, accounting for PoS significantly improves model fit (p = .002); adding helpfulness yields only a marginal further improvement (p = .061).
# Welch t-test: concreteness, CDI vs. non-CDI words.
t.test(concreteness ~ type_broad,
       data = filtered_calculated_aoas_typed)
##
## Welch Two Sample t-test
##
## data: concreteness by type_broad
## t = 10.333, df = 639.23, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.4503704 0.6617004
## sample estimates:
## mean in group CDI mean in group non-CDI
## 4.518625 3.962589
# Welch t-test: SUBTLEX frequency, CDI vs. non-CDI words.
t.test(frequency_subtlex ~ type_broad,
       data = filtered_calculated_aoas_typed)
##
## Welch Two Sample t-test
##
## data: frequency_subtlex by type_broad
## t = -0.15526, df = 695.69, p-value = 0.8767
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1275906 0.1088896
## sample estimates:
## mean in group CDI mean in group non-CDI
## 4.484470 4.493821
# Welch t-test: helpfulness rating, CDI vs. non-CDI words.
t.test(helpfulness ~ type_broad,
       data = filtered_calculated_aoas_typed)
##
## Welch Two Sample t-test
##
## data: helpfulness by type_broad
## t = 6.8864, df = 650.54, p-value = 1.351e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.2675637 0.4810183
## sample estimates:
## mean in group CDI mean in group non-CDI
## 3.463996 3.089705
# Welch t-test: ARC (semantic density), CDI vs. non-CDI words.
t.test(arc ~ type_broad,
       data = filtered_calculated_aoas_typed)
##
## Welch Two Sample t-test
##
## data: arc by type_broad
## t = -1.4592, df = 658.92, p-value = 0.145
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.028983681 0.004270751
## sample estimates:
## mean in group CDI mean in group non-CDI
## 0.5712578 0.5836143
# Welch t-test: adult-estimated (Kuperman) AoA, CDI vs. non-CDI words.
t.test(KupermanAoA ~ type_broad,
       data = filtered_calculated_aoas_typed)
##
## Welch Two Sample t-test
##
## data: KupermanAoA by type_broad
## t = -12.157, df = 518.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.3647365 -0.9850257
## sample estimates:
## mean in group CDI mean in group non-CDI
## 4.378858 5.553739
# Welch t-test: number of hypernyms, CDI vs. non-CDI words.
t.test(hypernyms ~ type_broad,
       data = filtered_calculated_aoas_typed)
##
## Welch Two Sample t-test
##
## data: hypernyms by type_broad
## t = 6.2654, df = 741.47, p-value = 6.303e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.001287 1.915097
## sample estimates:
## mean in group CDI mean in group non-CDI
## 7.327628 5.869436
# Welch t-test: number of hyponyms (raw counts), CDI vs. non-CDI words.
t.test(hyponyms ~ type_broad,
       data = filtered_calculated_aoas_typed)
##
## Welch Two Sample t-test
##
## data: hyponyms by type_broad
## t = 2.0145, df = 451.54, p-value = 0.04455
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.092282 88.233033
## sample estimates:
## mean in group CDI mean in group non-CDI
## 63.73193 19.06928
# Welch t-test: number of WordNet synsets, CDI vs. non-CDI words.
t.test(n_synsets ~ type_broad,
       data = filtered_calculated_aoas_typed)
##
## Welch Two Sample t-test
##
## data: n_synsets by type_broad
## t = -1.9109, df = 583.2, p-value = 0.05651
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.37151242 0.03253687
## sample estimates:
## mean in group CDI mean in group non-CDI
## 6.028986 7.198473