Why are adults wrong about what words kids know?

Variables

all_ages_says: on average, what proportion of children sampled say this word?
KupermanAoA: adult-estimated AoA, from Kuperman norms
hypernyms: how many words are superordinate to this word? (per wordnet)
hyponyms: how many words are subordinate to this word? (per wordnet)
concreteness: how concrete (i.e. tangible) is the word’s referent, scale of 1 (abstract) to 5 (concrete), from Brysbaert norms
frequency_subtlex: word frequency from SUBTLEX
neighbour_concentration: measure of semantic density (how concentrated is local word neighborhood), from Thompson (unpublished)
arc: measure of semantic density (average radius of co-occurrence), from HiDEx (Shaoul & Westbury)
ncount: measure of semantic density (neighbor count), from HiDEx (Shaoul & Westbury)
helpfulness: how helpful would it be for a preschooler to know this word, scale of 1 (not helpful) to 5 (very helpful), collected on mturk
babiness: how much is this word associated with babies, scale of 1 to 10, from Perry et al. (2015)
n_synsets: how many synsets are there for this word on wordnet?

Correlations among predictors

# Variables entering the correlation analysis. hyponyms is transformed with
# log(x + 1) before correlating.
corr_data <- all_words_all_features %>%
  mutate(log_hyponyms = log(hyponyms + 1)) %>%
  select(
    all_ages_says, all_ages_understands, KupermanAoA, hypernyms, log_hyponyms,
    concreteness, frequency_subtlex, childes_adult_log_freq,
    childes_kid_log_freq, n_synsets, neighbour_concentration, arc, ncount,
    helpfulness, babiness
  )

# Pairwise Pearson correlations; each pair uses the rows where both
# variables are observed.
corr_vars <- cor(corr_data, use = "pairwise.complete.obs", method = "pearson")

# The significance tests have to be run on the raw observations. Passing the
# correlation matrix itself (as was done previously) tests correlations
# between the *columns of the correlation matrix*, which is not the
# significance of the coefficients being plotted.
p.mat <- cor.mtest(corr_data)
pMatrix <- p.mat$p

# Lower-triangle heat map with coefficients printed; cells with p >= 0.05
# are left blank.
corrplot(
  corr_vars,
  method = "color", type = "lower", diag = TRUE,
  addCoef.col = "black", tl.col = "black",
  number.font = 2, number.cex = .6,
  p.mat = pMatrix, sig.level = 0.05, insig = "blank"
)

How does helpfulness correlate with Kuperman, hypernyms, and actual production?

All words

Kuperman

# Helpfulness against adult-estimated AoA, with a linear fit (all words).
all_words_all_features %>%
  ggplot(aes(x = KupermanAoA, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Hypernyms

# Helpfulness against number of hypernyms, with a linear fit (all words).
all_words_all_features %>%
  ggplot(aes(x = hypernyms, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Production

# Helpfulness against proportion of children producing the word, with a
# linear fit (all words).
all_words_all_features %>%
  ggplot(aes(x = all_ages_says, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Non-CDI words only

Kuperman

# Same plot as for all words, restricted to rows whose type is not "CDI".
all_words_all_features %>%
  filter(type != "CDI") %>%
  ggplot(aes(x = KupermanAoA, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Hypernyms

# Helpfulness against hypernyms for rows whose type is not "CDI".
all_words_all_features %>%
  filter(type != "CDI") %>%
  ggplot(aes(x = hypernyms, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Production

# Helpfulness against production for rows whose type is not "CDI".
all_words_all_features %>%
  filter(type != "CDI") %>%
  ggplot(aes(x = all_ages_says, y = helpfulness)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Models: all data (CDI + our survey)

Generate residuals for all_ages_says ~ Kuperman (all data)

# Regress production on adult-estimated AoA and keep the residuals as
# all_ages_says.Kuperman: the part of production not linearly explained by
# KupermanAoA.
#
# na.action = na.exclude makes resid() return one value per input row (NA
# where all_ages_says or KupermanAoA is missing). With the default na.omit,
# resid() is shorter than the data whenever a row is incomplete and the
# mutate() below fails on the length mismatch. Coefficients are unaffected.
base_mod <- lm(
  all_ages_says ~ KupermanAoA,
  data = all_words_all_features,
  na.action = na.exclude
)

all_words_kuperman_resid <- all_words_all_features %>%
  mutate(all_ages_says.Kuperman = resid(base_mod))

# Same residualisation on awaf_scaled (defined outside this section).
base_mod_scaled_data <- lm(
  all_ages_says ~ KupermanAoA,
  data = awaf_scaled,
  na.action = na.exclude
)
scaled_pos_resid <- awaf_scaled %>%
  ungroup() %>%
  mutate(all_ages_says.Kuperman = resid(base_mod_scaled_data))

Relation between residual and hypernyms, helpfulness?

Hypernyms

# Production residual (after KupermanAoA) against hypernyms, with a linear fit.
all_words_kuperman_resid %>%
  ggplot(aes(x = hypernyms, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Helpfulness

# Production residual (after KupermanAoA) against helpfulness, with a linear
# fit.
all_words_kuperman_resid %>%
  ggplot(aes(x = helpfulness, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Model all words/predictors (all data)

Base model

# Base model on all words: predict the production residual from the lexical
# predictors, keeping only words whose dominant PoS is verb, noun or
# adjective. hypernyms and log(hyponyms + 1) are z-scored inside the formula.
all_words_no_pos <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc,
    data = .
  )
summary(all_words_no_pos)

## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.46675 -0.14650 -0.03747  0.12603  0.51788 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.346615   0.235225  -1.474 0.141051    
## scale(hypernyms)          0.021040   0.009044   2.326 0.020280 *  
## scale(log(hyponyms + 1)) -0.003147   0.008591  -0.366 0.714211    
## frequency_subtlex        -0.038439   0.018365  -2.093 0.036704 *  
## concreteness              0.017649   0.012607   1.400 0.161975    
## neighbour_concentration   0.492623   0.601144   0.819 0.412793    
## arc                       0.444512   0.117260   3.791 0.000163 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2032 on 704 degrees of freedom
##   (42 observations deleted due to missingness)
## Multiple R-squared:  0.04397,    Adjusted R-squared:  0.03582 
## F-statistic: 5.396 on 6 and 704 DF,  p-value: 1.828e-05

Include PoS

# Base model plus part of speech. dom_pos_factor codes Noun = 1, Verb = 2 and
# everything else = 3 (only Adjective remains after the filter), so Noun is
# the reference level.
all_words_with_pos <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  mutate(
    dom_pos_factor = as.factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      TRUE ~ 3
    ))
  ) %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor,
    data = .
  )
summary(all_words_with_pos)

## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.47507 -0.14257 -0.02117  0.12831  0.56068 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)   
## (Intercept)              -0.313935   0.236507  -1.327  0.18481   
## scale(hypernyms)          0.007503   0.010098   0.743  0.45771   
## scale(log(hyponyms + 1)) -0.004623   0.008515  -0.543  0.58735   
## frequency_subtlex        -0.021529   0.018575  -1.159  0.24684   
## concreteness              0.015231   0.013101   1.163  0.24540   
## neighbour_concentration   0.413132   0.596873   0.692  0.48906   
## arc                       0.343181   0.118343   2.900  0.00385 **
## dom_pos_factor2          -0.066987   0.025082  -2.671  0.00775 **
## dom_pos_factor3           0.126002   0.043231   2.915  0.00367 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2008 on 702 degrees of freedom
##   (42 observations deleted due to missingness)
## Multiple R-squared:  0.06881,    Adjusted R-squared:  0.0582 
## F-statistic: 6.484 on 8 and 702 DF,  p-value: 3.641e-08

Include babiness, helpfulness, n_synsets

# PoS model plus babiness, helpfulness and n_synsets (all words).
# dom_pos_factor: Noun = 1, Verb = 2, otherwise 3.
all_words_with_pos_helpful <- all_words_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  mutate(
    dom_pos_factor = as.factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      TRUE ~ 3
    ))
  ) %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + babiness + helpfulness + n_synsets,
    data = .
  )
summary(all_words_with_pos_helpful)

## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor + babiness + helpfulness + n_synsets, 
##     data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.47450 -0.11795 -0.01713  0.09326  0.65740 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.5926252  0.3107823  -1.907  0.05726 .  
## scale(hypernyms)         -0.0145690  0.0127941  -1.139  0.25551    
## scale(log(hyponyms + 1)) -0.0160590  0.0100813  -1.593  0.11198    
## frequency_subtlex        -0.0693377  0.0250764  -2.765  0.00596 ** 
## concreteness              0.0246668  0.0214897   1.148  0.25173    
## neighbour_concentration   0.2278351  0.7258777   0.314  0.75378    
## arc                       0.4574819  0.1631046   2.805  0.00528 ** 
## dom_pos_factor2          -0.1140070  0.0384902  -2.962  0.00324 ** 
## babiness                 -0.0055373  0.0050773  -1.091  0.27612    
## helpfulness               0.1313714  0.0145795   9.011  < 2e-16 ***
## n_synsets                 0.0001001  0.0013634   0.073  0.94153    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1775 on 394 degrees of freedom
##   (348 observations deleted due to missingness)
## Multiple R-squared:  0.2262, Adjusted R-squared:  0.2065 
## F-statistic: 11.52 on 10 and 394 DF,  p-value: < 2.2e-16

This model is not great! So much variance unaccounted for… why?

# Collapse type into two categories: "CDI" versus everything else.
all_words_all_features_typed <- mutate(
  all_words_all_features,
  type_broad = ifelse(type == "CDI", "CDI", "non-CDI")
)

# Production against adult-estimated AoA, with a separate linear fit per
# broad type.
all_words_all_features_typed %>%
  ggplot(aes(x = KupermanAoA, y = all_ages_says, color = type_broad)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Seems like we should look at CDI and non-CDI words separately, as very different things are going on.

CDI words only

# CDI words only: residualise production on adult-estimated AoA.
cdi_words_all_features <- filter(all_words_all_features, type == "CDI")

# na.exclude pads resid() with NA for rows missing either variable, so the
# residual vector always has one entry per row of cdi_words_all_features and
# the mutate() below cannot fail on a length mismatch. Coefficients are the
# same as with the default na.omit.
base_mod_CDI <- lm(
  all_ages_says ~ KupermanAoA,
  data = cdi_words_all_features,
  na.action = na.exclude
)

CDI_kuperman_resid <- cdi_words_all_features %>%
  mutate(all_ages_says.Kuperman = resid(base_mod_CDI))

Relation between residual and hypernyms, helpfulness?

Hypernyms

# CDI words: production residual against hypernyms, with a linear fit.
CDI_kuperman_resid %>%
  ggplot(aes(x = hypernyms, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Helpfulness

# CDI words: production residual against helpfulness, with a linear fit.
CDI_kuperman_resid %>%
  ggplot(aes(x = helpfulness, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Models: CDI data

Base model

# Shared modelling data for the CDI models: verbs, nouns and adjectives only,
# with PoS coded as a factor (Noun = 1, Verb = 2, otherwise 3).
CDI_kuperman_resid_filtered <- CDI_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  mutate(
    dom_pos_factor = as.factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      TRUE ~ 3
    ))
  )

# Base model: lexical predictors only, with SUBTLEX frequency.
CDI_kuperman_resid_base <- CDI_kuperman_resid_filtered %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc,
    data = .
  )

summary(CDI_kuperman_resid_base)

## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.39342 -0.12055 -0.00696  0.11161  0.52158 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)  
## (Intercept)              -0.0340127  0.2766866  -0.123   0.9022  
## scale(hypernyms)          0.0266763  0.0115631   2.307   0.0216 *
## scale(log(hyponyms + 1))  0.0005019  0.0099173   0.051   0.9597  
## frequency_subtlex        -0.0367231  0.0231990  -1.583   0.1142  
## concreteness              0.0109750  0.0203843   0.538   0.5906  
## neighbour_concentration  -0.1096482  0.6555909  -0.167   0.8673  
## arc                       0.3310336  0.1422894   2.326   0.0205 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1699 on 389 degrees of freedom
##   (29 observations deleted due to missingness)
## Multiple R-squared:  0.05416,    Adjusted R-squared:  0.03957 
## F-statistic: 3.712 on 6 and 389 DF,  p-value: 0.001333

# Variant of the CDI base model that uses childes_adult_log_freq in place of
# frequency_subtlex; all other terms are the same.
CDI_kuperman_resid_childes <- CDI_kuperman_resid_filtered %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      childes_adult_log_freq + concreteness + neighbour_concentration + arc,
    data = .
  )

summary(CDI_kuperman_resid_childes)

## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + childes_adult_log_freq + concreteness + neighbour_concentration + 
##     arc, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.38351 -0.11435 -0.00293  0.11239  0.48183 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)   
## (Intercept)              -0.1209326  0.2718342  -0.445  0.65666   
## scale(hypernyms)          0.0301777  0.0113496   2.659  0.00816 **
## scale(log(hyponyms + 1)) -0.0004437  0.0100455  -0.044  0.96479   
## childes_adult_log_freq   -0.0054974  0.0083562  -0.658  0.51100   
## concreteness              0.0182683  0.0198145   0.922  0.35712   
## neighbour_concentration  -0.0980764  0.6577419  -0.149  0.88154   
## arc                       0.1959196  0.1073796   1.825  0.06884 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1703 on 389 degrees of freedom
##   (29 observations deleted due to missingness)
## Multiple R-squared:  0.04912,    Adjusted R-squared:  0.03445 
## F-statistic: 3.349 on 6 and 389 DF,  p-value: 0.003139

# These two models differ only in which frequency measure they include, so
# they are not nested and have the same residual df: anova() can list the RSS
# of each but cannot produce an F-test for the pair. AIC() is added as a
# comparison that does not require nesting.
# NOTE(review): AIC values are only comparable if both fits used the same
# rows -- confirm the two frequency columns are missing for the same words.
anova(CDI_kuperman_resid_base, CDI_kuperman_resid_childes)
AIC(CDI_kuperman_resid_base, CDI_kuperman_resid_childes)

## Analysis of Variance Table
## 
## Model 1: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc
## Model 2: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + childes_adult_log_freq + concreteness + neighbour_concentration + 
##     arc
##   Res.Df    RSS Df Sum of Sq F Pr(>F)
## 1    389 11.223                      
## 2    389 11.283  0  -0.05974

Include PoS

# CDI base model plus the part-of-speech factor.
CDI_kuperman_resid_pos <- CDI_kuperman_resid_filtered %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor,
    data = .
  )

summary(CDI_kuperman_resid_pos)

## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.38584 -0.11597 -0.00807  0.11151  0.52852 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)  
## (Intercept)              -0.084383   0.287071  -0.294   0.7690  
## scale(hypernyms)          0.031420   0.013966   2.250   0.0250 *
## scale(log(hyponyms + 1))  0.000832   0.009926   0.084   0.9332  
## frequency_subtlex        -0.038952   0.023358  -1.668   0.0962 .
## concreteness              0.017050   0.022106   0.771   0.4410  
## neighbour_concentration  -0.047967   0.659761  -0.073   0.9421  
## arc                       0.340952   0.142585   2.391   0.0173 *
## dom_pos_factor2           0.029159   0.036878   0.791   0.4296  
## dom_pos_factor3          -0.095277   0.101494  -0.939   0.3484  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1699 on 387 degrees of freedom
##   (29 observations deleted due to missingness)
## Multiple R-squared:  0.05867,    Adjusted R-squared:  0.03921 
## F-statistic: 3.015 on 8 and 387 DF,  p-value: 0.002694

Include helpfulness

# CDI PoS model plus helpfulness.
CDI_kuperman_resid_pos_helpful <- CDI_kuperman_resid_filtered %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + helpfulness,
    data = .
  )

summary(CDI_kuperman_resid_pos_helpful)

## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor + helpfulness, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.40919 -0.09689 -0.00054  0.08081  0.40882 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.587779   0.246332  -2.386 0.017508 *  
## scale(hypernyms)          0.038163   0.011833   3.225 0.001366 ** 
## scale(log(hyponyms + 1))  0.011926   0.008449   1.412 0.158878    
## frequency_subtlex        -0.066797   0.019897  -3.357 0.000866 ***
## concreteness              0.025943   0.018724   1.386 0.166692    
## neighbour_concentration   0.176188   0.558707   0.315 0.752666    
## arc                       0.367297   0.120701   3.043 0.002502 ** 
## dom_pos_factor2           0.006019   0.031269   0.193 0.847449    
## dom_pos_factor3          -0.040387   0.086017  -0.470 0.638961    
## helpfulness               0.142944   0.011511  12.419  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1438 on 386 degrees of freedom
##   (29 observations deleted due to missingness)
## Multiple R-squared:  0.3274, Adjusted R-squared:  0.3117 
## F-statistic: 20.88 on 9 and 386 DF,  p-value: < 2.2e-16

Include babiness and n_synsets

# CDI helpfulness model plus babiness and n_synsets.
CDI_kuperman_resid_pos_helpful_babiness_synsets <-
  CDI_kuperman_resid_filtered %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + helpfulness + babiness + n_synsets,
    data = .
  )

summary(CDI_kuperman_resid_pos_helpful_babiness_synsets)

## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor + helpfulness + babiness + n_synsets, 
##     data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.41283 -0.09745  0.00206  0.08341  0.39965 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.4509575  0.2681275  -1.682  0.09353 .  
## scale(hypernyms)          0.0338070  0.0131756   2.566  0.01073 *  
## scale(log(hyponyms + 1))  0.0140630  0.0090256   1.558  0.12015    
## frequency_subtlex        -0.0638006  0.0228710  -2.790  0.00558 ** 
## concreteness              0.0235046  0.0197273   1.191  0.23431    
## neighbour_concentration  -0.1601826  0.6217865  -0.258  0.79686    
## arc                       0.3537708  0.1448743   2.442  0.01513 *  
## dom_pos_factor2          -0.0015195  0.0356091  -0.043  0.96599    
## helpfulness               0.1422203  0.0127649  11.142  < 2e-16 ***
## babiness                 -0.0001798  0.0044717  -0.040  0.96796    
## n_synsets                -0.0007588  0.0013440  -0.565  0.57273    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1457 on 334 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.3061, Adjusted R-squared:  0.2853 
## F-statistic: 14.73 on 10 and 334 DF,  p-value: < 2.2e-16

Compare models

# Sequential comparison of the nested CDI models:
# base -> + dom_pos_factor -> + helpfulness.
anova(
  CDI_kuperman_resid_base,
  CDI_kuperman_resid_pos,
  CDI_kuperman_resid_pos_helpful
)

## Analysis of Variance Table
## 
## Model 1: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc
## Model 2: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor
## Model 3: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor + helpfulness
##   Res.Df     RSS Df Sum of Sq       F Pr(>F)    
## 1    389 11.2230                                
## 2    387 11.1694  2    0.0536   1.296 0.2748    
## 3    386  7.9808  1    3.1886 154.219 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

For CDI words, including PoS doesn’t improve model fit, but including helpfulness does

If we look just at non-CDI words, there seem to be two clusters: the adult AoA estimates overestimate some words and underestimate others.

# Mean split on concreteness: 0 for words below the mean, 1 otherwise.
all_words_all_features_typed <- mutate(
  all_words_all_features_typed,
  concreteness_meansplit = ifelse(
    concreteness >= mean(concreteness, na.rm = TRUE), 1, 0
  )
)

# Non-CDI words: production against adult-estimated AoA, points coloured by
# the concreteness split, with a single linear fit across all points.
all_words_all_features_typed %>%
  filter(type_broad == "non-CDI") %>%
  ggplot(aes(x = KupermanAoA, y = all_ages_says)) +
  geom_point(aes(color = as.factor(concreteness_meansplit))) +
  geom_smooth(method = "lm") +
  theme_classic()

Non-CDI words only: production

# Non-CDI words only: residualise production on adult-estimated AoA.
non_CDI_all_features <- filter(all_words_all_features, type != "CDI")

# na.exclude pads resid() with NA for rows missing either variable, so the
# residual vector always has one entry per row and the mutate() below cannot
# fail on a length mismatch. Coefficients are the same as with na.omit.
base_mod_nonCDI <- lm(
  all_ages_says ~ KupermanAoA,
  data = non_CDI_all_features,
  na.action = na.exclude
)

# concreteness_meansplit: 0 below the mean concreteness of these words,
# 1 otherwise.
nonCDI_kuperman_resid <- non_CDI_all_features %>%
  mutate(
    all_ages_says.Kuperman = resid(base_mod_nonCDI),
    concreteness_meansplit = ifelse(
      concreteness < mean(concreteness, na.rm = TRUE), 0, 1
    )
  )

Relation between residual and hypernyms, helpfulness?

Hypernyms

# Non-CDI words: production residual against hypernyms, with a linear fit.
nonCDI_kuperman_resid %>%
  ggplot(aes(x = hypernyms, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Helpfulness

# Non-CDI words: production residual against helpfulness, with a linear fit.
nonCDI_kuperman_resid %>%
  ggplot(aes(x = helpfulness, y = all_ages_says.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Models

Base model

# Non-CDI base model. Rows without a helpfulness rating are dropped up front
# even though helpfulness is not a predictor in this model.
base_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  filter(!is.na(helpfulness)) %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc,
    data = .
  )
summary(base_resid_nonCDI)

## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50481 -0.11560  0.00255  0.11031  0.74971 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.90099    0.29941  -3.009  0.00284 ** 
## scale(hypernyms)          0.06624    0.01039   6.374 6.91e-10 ***
## scale(log(hyponyms + 1))  0.03713    0.01057   3.514  0.00051 ***
## frequency_subtlex         0.01913    0.02222   0.861  0.38986    
## concreteness              0.10482    0.01403   7.474 8.61e-13 ***
## neighbour_concentration   0.90850    0.79180   1.147  0.25214    
## arc                       0.09575    0.14362   0.667  0.50549    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1672 on 300 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.3338, Adjusted R-squared:  0.3205 
## F-statistic: 25.06 on 6 and 300 DF,  p-value: < 2.2e-16

Including PoS

# Non-CDI base model plus part of speech (Noun = 1, Verb = 2, otherwise 3).
# Rows without a helpfulness rating are dropped up front.
pos_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  filter(!is.na(helpfulness)) %>%
  mutate(
    dom_pos_factor = as.factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      TRUE ~ 3
    ))
  ) %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor,
    data = .
  )

summary(pos_resid_nonCDI)

## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.48162 -0.09535  0.00603  0.09382  0.77676 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.78716    0.29089  -2.706  0.00720 ** 
## scale(hypernyms)          0.04594    0.01070   4.294 2.38e-05 ***
## scale(log(hyponyms + 1))  0.03009    0.01023   2.943  0.00351 ** 
## frequency_subtlex         0.04869    0.02190   2.223  0.02697 *  
## concreteness              0.09588    0.01392   6.887 3.39e-11 ***
## neighbour_concentration   0.70693    0.76143   0.928  0.35394    
## arc                      -0.09687    0.14165  -0.684  0.49459    
## dom_pos_factor2          -0.11425    0.02578  -4.431 1.32e-05 ***
## dom_pos_factor3           0.09587    0.03744   2.561  0.01094 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1598 on 298 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.3956, Adjusted R-squared:  0.3794 
## F-statistic: 24.38 on 8 and 298 DF,  p-value: < 2.2e-16

Adding helpfulness

# Non-CDI PoS model plus helpfulness.
#
# Rows without a helpfulness rating are removed *before* fitting, exactly as
# in base_resid_nonCDI and pos_resid_nonCDI. lm() would drop those rows
# anyway, but scale() inside the formula is evaluated on the data handed to
# lm(): without this filter the z-scored predictors are standardised over a
# different set of rows than in the two models this one is compared against
# with anova().
helpful_pos_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective"), !is.na(helpfulness)) %>%
  mutate(
    # Noun = 1 (reference), Verb = 2, otherwise 3.
    dom_pos_factor = as.factor(
      ifelse(dom_pos == "Noun", 1, ifelse(dom_pos == "Verb", 2, 3))
    )
  ) %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + helpfulness,
    data = .
  )

summary(helpful_pos_resid_nonCDI)

## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor + helpfulness, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43119 -0.09495  0.00396  0.08811  0.80900 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.83451    0.28630  -2.915 0.003830 ** 
## scale(hypernyms)          0.04539    0.01046   4.341 1.95e-05 ***
## scale(log(hyponyms + 1))  0.02305    0.01018   2.265 0.024258 *  
## frequency_subtlex         0.01491    0.02375   0.628 0.530655    
## concreteness              0.09210    0.01373   6.707 1.00e-10 ***
## neighbour_concentration   0.73950    0.74858   0.988 0.324018    
## arc                       0.00986    0.14280   0.069 0.944994    
## dom_pos_factor2          -0.12208    0.02545  -4.796 2.56e-06 ***
## dom_pos_factor3           0.07100    0.03754   1.891 0.059532 .  
## helpfulness               0.04692    0.01391   3.372 0.000845 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1571 on 297 degrees of freedom
##   (21 observations deleted due to missingness)
## Multiple R-squared:  0.4179, Adjusted R-squared:  0.4002 
## F-statistic: 23.69 on 9 and 297 DF,  p-value: < 2.2e-16

Adding n_synsets

# Non-CDI helpfulness model plus n_synsets.
#
# helpfulness is included in the formula: the object name and the cumulative
# order of the sections (base -> PoS -> helpfulness -> n_synsets) both
# indicate that this model extends the helpfulness model, but the term was
# missing, so n_synsets was being added to the PoS-only model instead.
# NOTE(review): any previously rendered output for this model predates this
# change and needs to be regenerated.
synsets_helpful_pos_resid_nonCDI <- nonCDI_kuperman_resid %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  mutate(
    # Noun = 1 (reference), Verb = 2, otherwise 3.
    dom_pos_factor = as.factor(
      ifelse(dom_pos == "Noun", 1, ifelse(dom_pos == "Verb", 2, 3))
    )
  ) %>%
  lm(
    all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 1)) +
      frequency_subtlex + concreteness + neighbour_concentration + arc +
      dom_pos_factor + helpfulness + n_synsets,
    data = .
  )

summary(synsets_helpful_pos_resid_nonCDI)

## 
## Call:
## lm(formula = all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor + n_synsets, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49041 -0.09457 -0.00371  0.09183  0.77780 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.863974   0.305278  -2.830  0.00501 ** 
## scale(hypernyms)          0.037150   0.011554   3.215  0.00147 ** 
## scale(log(hyponyms + 1))  0.019794   0.010645   1.860  0.06407 .  
## frequency_subtlex         0.072756   0.023443   3.104  0.00212 ** 
## concreteness              0.102250   0.014463   7.070 1.41e-11 ***
## neighbour_concentration   0.625596   0.800812   0.781  0.43539    
## arc                      -0.091018   0.143999  -0.632  0.52789    
## dom_pos_factor2          -0.121867   0.028915  -4.215 3.45e-05 ***
## n_synsets                -0.003418   0.001605  -2.130  0.03411 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1574 on 262 degrees of freedom
##   (57 observations deleted due to missingness)
## Multiple R-squared:  0.4223, Adjusted R-squared:  0.4047 
## F-statistic: 23.94 on 8 and 262 DF,  p-value: < 2.2e-16

Compare models

# Sequential comparison of the nested non-CDI models:
# base -> + dom_pos_factor -> + helpfulness.
anova(
  base_resid_nonCDI,
  pos_resid_nonCDI,
  helpful_pos_resid_nonCDI
)

## Analysis of Variance Table
## 
## Model 1: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc
## Model 2: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor
## Model 3: all_ages_says.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor + helpfulness
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    300 8.3860                                  
## 2    298 7.6087  2   0.77733 15.752 3.148e-07 ***
## 3    297 7.3281  1   0.28059 11.372 0.0008445 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Accounting for PoS and helpfulness also improves model fit for non-CDI words.

Non-CDI words only: understanding

# Non-CDI words with a comprehension score: residualise all_ages_understands
# on adult-estimated AoA.
non_CDI_all_features_understands <- filter(
  all_words_all_features,
  type != "CDI" & !is.na(all_ages_understands)
)

# The filter above only guarantees the outcome is present. na.exclude
# additionally pads resid() with NA for any row where KupermanAoA is missing,
# so the residual vector always matches the number of rows in the mutate()
# below. Coefficients are the same as with the default na.omit.
base_mod_nonCDI_understands <- lm(
  all_ages_understands ~ KupermanAoA,
  data = non_CDI_all_features_understands,
  na.action = na.exclude
)

# concreteness_meansplit: 0 below the mean concreteness of these words,
# 1 otherwise.
nonCDI_kuperman_resid_understands <- non_CDI_all_features_understands %>%
  mutate(
    all_ages_understands.Kuperman = resid(base_mod_nonCDI_understands),
    concreteness_meansplit = ifelse(
      concreteness < mean(concreteness, na.rm = TRUE), 0, 1
    )
  )

What is the relation between all_ages_understands and KupermanAoA?

# Non-CDI words: comprehension against adult-estimated AoA, points coloured
# by the concreteness mean split, with a single linear fit.
nonCDI_kuperman_resid_understands %>%
  ggplot(aes(x = KupermanAoA, y = all_ages_understands)) +
  geom_point(aes(color = as.factor(concreteness_meansplit))) +
  geom_smooth(method = "lm") +
  theme_classic()

Relation between residual and hypernyms, helpfulness?

Hypernyms

# Non-CDI words: comprehension residual against hypernyms, with a linear fit.
nonCDI_kuperman_resid_understands %>%
  ggplot(aes(x = hypernyms, y = all_ages_understands.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Helpfulness

# Non-CDI words: comprehension residual against helpfulness, with a linear
# fit.
nonCDI_kuperman_resid_understands %>%
  ggplot(aes(x = helpfulness, y = all_ages_understands.Kuperman)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic()

Models

Base model

# Non-CDI comprehension base model. Rows without a helpfulness rating are
# dropped up front even though helpfulness is not a predictor here.
base_resid_nonCDI_understands <- nonCDI_kuperman_resid_understands %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  filter(!is.na(helpfulness)) %>%
  lm(
    all_ages_understands.Kuperman ~ scale(hypernyms) +
      scale(log(hyponyms + 1)) + frequency_subtlex + concreteness +
      neighbour_concentration + arc,
    data = .
  )
summary(base_resid_nonCDI_understands)

## 
## Call:
## lm(formula = all_ages_understands.Kuperman ~ scale(hypernyms) + 
##     scale(log(hyponyms + 1)) + frequency_subtlex + concreteness + 
##     neighbour_concentration + arc, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57722 -0.12454 -0.00199  0.11143  0.72459 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.95695    0.31308  -3.057 0.002441 ** 
## scale(hypernyms)          0.05717    0.01090   5.245 2.96e-07 ***
## scale(log(hyponyms + 1))  0.03976    0.01109   3.585 0.000393 ***
## frequency_subtlex         0.02646    0.02329   1.136 0.256749    
## concreteness              0.10834    0.01468   7.383 1.55e-12 ***
## neighbour_concentration   1.20830    0.82806   1.459 0.145563    
## arc                      -0.05358    0.15018  -0.357 0.721514    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1748 on 299 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.3056, Adjusted R-squared:  0.2917 
## F-statistic: 21.93 on 6 and 299 DF,  p-value: < 2.2e-16

Including PoS

# Non-CDI comprehension base model plus part of speech
# (Noun = 1, Verb = 2, otherwise 3). Rows without a helpfulness rating are
# dropped up front.
pos_resid_nonCDI_understands <- nonCDI_kuperman_resid_understands %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  filter(!is.na(helpfulness)) %>%
  mutate(
    dom_pos_factor = as.factor(case_when(
      dom_pos == "Noun" ~ 1,
      dom_pos == "Verb" ~ 2,
      TRUE ~ 3
    ))
  ) %>%
  lm(
    all_ages_understands.Kuperman ~ scale(hypernyms) +
      scale(log(hyponyms + 1)) + frequency_subtlex + concreteness +
      neighbour_concentration + arc + dom_pos_factor,
    data = .
  )

summary(pos_resid_nonCDI_understands)

## 
## Call:
## lm(formula = all_ages_understands.Kuperman ~ scale(hypernyms) + 
##     scale(log(hyponyms + 1)) + frequency_subtlex + concreteness + 
##     neighbour_concentration + arc + dom_pos_factor, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.55766 -0.10936  0.01553  0.10139  0.74802 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.78821    0.30612  -2.575  0.01051 *  
## scale(hypernyms)          0.03563    0.01131   3.149  0.00181 ** 
## scale(log(hyponyms + 1))  0.03219    0.01079   2.983  0.00309 ** 
## frequency_subtlex         0.05415    0.02307   2.347  0.01960 *  
## concreteness              0.09574    0.01465   6.533 2.79e-10 ***
## neighbour_concentration   0.92560    0.80133   1.155  0.24899    
## arc                      -0.23916    0.14907  -1.604  0.10970    
## dom_pos_factor2          -0.12824    0.02722  -4.712 3.77e-06 ***
## dom_pos_factor3           0.04921    0.03940   1.249  0.21263    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1682 on 297 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.3619, Adjusted R-squared:  0.3447 
## F-statistic: 21.06 on 8 and 297 DF,  p-value: < 2.2e-16

Adding helpfulness

# PoS model plus rated helpfulness. There is no explicit NA filter on
# helpfulness here; lm() drops incomplete rows itself because helpfulness is
# a predictor. dom_pos coding: Noun = 1 (reference), Verb = 2, otherwise 3.
helpful_pos_resid_nonCDI_understands <- nonCDI_kuperman_resid_understands %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  mutate(
    dom_pos_factor = as.factor(
      case_when(
        dom_pos == "Noun" ~ 1,
        dom_pos == "Verb" ~ 2,
        TRUE ~ 3
      )
    )
  ) %>%
  lm(
    all_ages_understands.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      frequency_subtlex +
      concreteness +
      neighbour_concentration +
      arc +
      dom_pos_factor +
      helpfulness,
    data = .
  )

summary(helpful_pos_resid_nonCDI_understands)

## 
## Call:
## lm(formula = all_ages_understands.Kuperman ~ scale(hypernyms) + 
##     scale(log(hyponyms + 1)) + frequency_subtlex + concreteness + 
##     neighbour_concentration + arc + dom_pos_factor + helpfulness, 
##     data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49081 -0.11171 -0.00079  0.09561  0.79075 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.850866   0.297855  -2.857  0.00458 ** 
## scale(hypernyms)          0.035333   0.011003   3.211  0.00147 ** 
## scale(log(hyponyms + 1))  0.023011   0.010686   2.153  0.03211 *  
## frequency_subtlex         0.009406   0.024722   0.380  0.70385    
## concreteness              0.090714   0.014289   6.349 8.15e-10 ***
## neighbour_concentration   0.968333   0.778809   1.243  0.21472    
## arc                      -0.097511   0.148569  -0.656  0.51212    
## dom_pos_factor2          -0.138514   0.026557  -5.216 3.45e-07 ***
## dom_pos_factor3           0.016213   0.039053   0.415  0.67833    
## helpfulness               0.062231   0.014477   4.299 2.33e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1634 on 296 degrees of freedom
##   (14 observations deleted due to missingness)
## Multiple R-squared:  0.3994, Adjusted R-squared:  0.3811 
## F-statistic: 21.87 on 9 and 296 DF,  p-value: < 2.2e-16

Adding n_synsets (babiness is not available for these words)

# Helpfulness model plus the number of WordNet synsets (n_synsets).
# dom_pos coding: Noun = 1 (reference), Verb = 2, otherwise 3.
# NOTE(review): the printed summary for this model has no dom_pos_factor3
# coefficient and reports 57 rows dropped for missingness, so adjectives
# presumably lack n_synsets values -- confirm. The fitted sample therefore
# differs from the earlier models.
synsets_helpful_pos_resid_nonCDI_understands <-
  nonCDI_kuperman_resid_understands %>%
  filter(dom_pos %in% c("Verb", "Noun", "Adjective")) %>%
  mutate(
    dom_pos_factor = as.factor(
      case_when(
        dom_pos == "Noun" ~ 1,
        dom_pos == "Verb" ~ 2,
        TRUE ~ 3
      )
    )
  ) %>%
  lm(
    all_ages_understands.Kuperman ~
      scale(hypernyms) +
      scale(log(hyponyms + 1)) +
      frequency_subtlex +
      concreteness +
      neighbour_concentration +
      arc +
      dom_pos_factor +
      helpfulness +
      n_synsets,
    data = .
  )

summary(synsets_helpful_pos_resid_nonCDI_understands)

## 
## Call:
## lm(formula = all_ages_understands.Kuperman ~ scale(hypernyms) + 
##     scale(log(hyponyms + 1)) + frequency_subtlex + concreteness + 
##     neighbour_concentration + arc + dom_pos_factor + helpfulness + 
##     n_synsets, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50927 -0.11269 -0.00717  0.09274  0.78439 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.925822   0.318141  -2.910 0.003935 ** 
## scale(hypernyms)          0.025540   0.012200   2.094 0.037296 *  
## scale(log(hyponyms + 1))  0.013898   0.011312   1.229 0.220380    
## frequency_subtlex         0.030137   0.027383   1.101 0.272132    
## concreteness              0.098996   0.015146   6.536 3.46e-10 ***
## neighbour_concentration   0.942313   0.839741   1.122 0.262864    
## arc                      -0.089704   0.153575  -0.584 0.559670    
## dom_pos_factor2          -0.151957   0.030481  -4.985 1.15e-06 ***
## helpfulness               0.054514   0.016001   3.407 0.000764 ***
## n_synsets                -0.001969   0.001686  -1.168 0.244012    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1622 on 253 degrees of freedom
##   (57 observations deleted due to missingness)
## Multiple R-squared:  0.4304, Adjusted R-squared:  0.4102 
## F-statistic: 21.24 on 9 and 253 DF,  p-value: < 2.2e-16

Compare models

# Sequential F-tests over the nested models:
# base -> base + PoS -> base + PoS + helpfulness.
anova(
  base_resid_nonCDI_understands,
  pos_resid_nonCDI_understands,
  helpful_pos_resid_nonCDI_understands
)

## Analysis of Variance Table
## 
## Model 1: all_ages_understands.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc
## Model 2: all_ages_understands.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor
## Model 3: all_ages_understands.Kuperman ~ scale(hypernyms) + scale(log(hyponyms + 
##     1)) + frequency_subtlex + concreteness + neighbour_concentration + 
##     arc + dom_pos_factor + helpfulness
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    299 9.1385                                  
## 2    297 8.3978  2   0.74067 13.868 1.749e-06 ***
## 3    296 7.9043  1   0.49346 18.479 2.334e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Accounting for PoS and helpfulness also improves model fit for non-CDI words.

How do CDI and non-CDI words compare on these characteristics we’re looking at?

# Two-sample t-test (Welch by default): concreteness by word type.
t.test(concreteness ~ type_broad, data = all_words_all_features_typed)

## 
##  Welch Two Sample t-test
## 
## data:  concreteness by type_broad
## t = 10.264, df = 695.48, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.4323634 0.6369131
## sample estimates:
##     mean in group CDI mean in group non-CDI 
##              4.519722              3.985083

# Two-sample t-test (Welch by default): SUBTLEX frequency by word type.
t.test(frequency_subtlex ~ type_broad, data = all_words_all_features_typed)

## 
##  Welch Two Sample t-test
## 
## data:  frequency_subtlex by type_broad
## t = 0.35543, df = 747.14, p-value = 0.7224
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.09454249  0.13634481
## sample estimates:
##     mean in group CDI mean in group non-CDI 
##              4.488141              4.467240

# Two-sample t-test (Welch by default): rated helpfulness by word type.
t.test(helpfulness ~ type_broad, data = all_words_all_features_typed)

## 
##  Welch Two Sample t-test
## 
## data:  helpfulness by type_broad
## t = 7.4627, df = 685.51, p-value = 2.578e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.2953863 0.5063141
## sample estimates:
##     mean in group CDI mean in group non-CDI 
##              3.469448              3.068597

# Two-sample t-test (Welch by default): arc by word type.
t.test(arc ~ type_broad, data = all_words_all_features_typed)

## 
##  Welch Two Sample t-test
## 
## data:  arc by type_broad
## t = -1.3346, df = 709.91, p-value = 0.1824
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.027240481  0.005192645
## sample estimates:
##     mean in group CDI mean in group non-CDI 
##             0.5715478             0.5825717

# Two-sample t-test (Welch by default): adult-estimated AoA by word type.
t.test(KupermanAoA ~ type_broad, data = all_words_all_features_typed)

## 
##  Welch Two Sample t-test
## 
## data:  KupermanAoA by type_broad
## t = -12.583, df = 550.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.410939 -1.029901
## sample estimates:
##     mean in group CDI mean in group non-CDI 
##              4.376311              5.596731

# Two-sample t-test (Welch by default): hypernym count by word type.
t.test(hypernyms ~ type_broad, data = all_words_all_features_typed)

## 
##  Welch Two Sample t-test
## 
## data:  hypernyms by type_broad
## t = 6.7084, df = 770, p-value = 3.812e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.087927 1.988038
## sample estimates:
##     mean in group CDI mean in group non-CDI 
##              7.335766              5.797784

# Two-sample t-test (Welch by default): raw hyponym count by word type.
# Unlike the regression models, hyponyms is not log-transformed here.
t.test(hyponyms ~ type_broad, data = all_words_all_features_typed)

## 
##  Welch Two Sample t-test
## 
## data:  hyponyms by type_broad
## t = 2.0635, df = 450.87, p-value = 0.03963
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   2.165934 88.765034
## sample estimates:
##     mean in group CDI mean in group non-CDI 
##              63.61717              18.15169

# Two-sample t-test (Welch by default): number of synsets by word type.
t.test(n_synsets ~ type_broad, data = all_words_all_features_typed)

## 
##  Welch Two Sample t-test
## 
## data:  n_synsets by type_broad
## t = -1.945, df = 624.45, p-value = 0.05223
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.3553737  0.0113339
## sample estimates:
##     mean in group CDI mean in group non-CDI 
##              6.033654              7.205674