# Load the survey word list and the per-word coefficient table.
# The join key `Word` is converted to character in both tables: when the files
# are read as factors, the two `Word` columns have different level sets and
# left_join() would coerce them to character with a warning. Converting up
# front yields the same joined result without the warning.
survey_words <- read.csv("words_to_survey.csv") %>%
  select(-X) %>%  # drop column `X` (presumably a saved row index; TODO confirm)
  mutate(Word = as.character(Word))

coeffs <- read.csv("word_coeffs_log_mtld_diff_30_42_with_freq_resid.csv") %>%
  rename(Word = word) %>%  # align the key name with survey_words
  mutate(Word = as.character(Word))

# Keep every survey word (words without a match get NA coefficients),
# remove exact duplicate rows, and sort by `t`.
words_with_coeffs <- left_join(survey_words, coeffs, by = "Word") %>%
  distinct() %>%
  arrange(t)
# Standardize hypernym/hyponym counts separately within each part of speech.
# scale() returns a one-column matrix carrying "scaled:center"/"scaled:scale"
# attributes; as.numeric() keeps the same z-scores but stores them as plain
# numeric columns instead of matrix columns inside the data frame.
nouns <- filter(words_with_coeffs, pos == "n") %>%
  mutate(
    hyper_z = as.numeric(scale(num_hypernyms)),
    hypo_z = as.numeric(scale(num_hyponyms))
  )

verbs <- filter(words_with_coeffs, pos == "v") %>%
  mutate(
    hyper_z = as.numeric(scale(num_hypernyms)),
    hypo_z = as.numeric(scale(num_hyponyms))
  )

# Stack nouns on top of verbs; rows whose `pos` is neither "n" nor "v" are
# not carried forward.
words_with_coeffs_scaled <- bind_rows(nouns, verbs)

# Analysis table: only the columns used by the tests and models below,
# with exact duplicate rows removed.
t_test_df <- words_with_coeffs_scaled %>%
  select(
    SeedWord, Word, pos, kid_log_freq, adult_log_freq, type,
    aoa, logFreq_subtlex, Estimate, t, t_freq_resid, hyper_z, hypo_z
  ) %>%
  distinct()

# Seed words: all rows, and the subset with a non-missing `t`.
seeds_only <- t_test_df %>% filter(type == "seed")
seeds_no_na <- seeds_only %>% filter(!is.na(t))

# Control words: all rows, and the subset with a non-missing `t`.
controls_only <- t_test_df %>% filter(type == "control")
controls_no_na <- controls_only %>% filter(!is.na(t))

Visualize the relation between the standardized hypernym count (hyper_z) and t

All words by pos

# Scatter of t against standardized hypernym count, coloured by part of
# speech, with one linear fit per part of speech.
words_with_coeffs_scaled %>%
  ggplot(mapping = aes(x = hyper_z, y = t, colour = pos)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
## Warning: Removed 224 rows containing non-finite values (stat_smooth).
## Warning: Removed 224 rows containing missing values (geom_point).

Control words

# Same scatter and linear fit, restricted to control words.
words_with_coeffs_scaled %>%
  filter(type == "control") %>%
  ggplot(mapping = aes(x = hyper_z, y = t)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
## Warning: Removed 194 rows containing non-finite values (stat_smooth).
## Warning: Removed 194 rows containing missing values (geom_point).

Seed words

# Same scatter and linear fit, restricted to seed words.
words_with_coeffs_scaled %>%
  filter(type == "seed") %>%
  ggplot(mapping = aes(x = hyper_z, y = t)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
## Warning: Removed 30 rows containing non-finite values (stat_smooth).
## Warning: Removed 30 rows containing missing values (geom_point).

Compare mean t between seed and control words

t

# Two-sample t-test (pooled variance) of `t` between the levels of `type`.
# t.test() has no `na.rm` argument -- it was silently absorbed by `...`.
# Rows with missing values are dropped by the formula method's `na.action`,
# which is stated explicitly here.
t.test(t ~ type, data = t_test_df, na.action = na.omit, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  t by type
## t = 0.51069, df = 194, p-value = 0.6102
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1391671  0.2364186
## sample estimates:
## mean in group control    mean in group seed 
##            0.02306797           -0.02555780

t_freq_resid

# Two-sample t-test (pooled variance) of `t_freq_resid` between the levels of
# `type`. t.test() has no `na.rm` argument -- it was silently absorbed by
# `...`. Rows with missing values are dropped by the formula method's
# `na.action`, which is stated explicitly here.
t.test(t_freq_resid ~ type, data = t_test_df, na.action = na.omit,
       var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  t_freq_resid by type
## t = 0.38134, df = 194, p-value = 0.7034
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1473496  0.2179891
## sample estimates:
## mean in group control    mean in group seed 
##           -0.05285229           -0.08817204

Do hypernyms and hyponyms predict t? (The models below use t_freq_resid as the outcome.)

all words

# Linear model over all words: t_freq_resid regressed on the standardized
# hypernym and hyponym counts. Rows with missing values are dropped by lm().
t_all <- lm(
  formula = t_freq_resid ~ hyper_z + hypo_z,
  data = t_test_df
)
summary(t_all)
## 
## Call:
## lm(formula = t_freq_resid ~ hyper_z + hypo_z, data = t_test_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.39021 -0.42736  0.04601  0.48227  1.26614 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -0.065027   0.040798  -1.594    0.113  
## hyper_z     -0.076251   0.045991  -1.658    0.099 .
## hypo_z      -0.003386   0.030536  -0.111    0.912  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5546 on 191 degrees of freedom
##   (218 observations deleted due to missingness)
## Multiple R-squared:  0.01518,    Adjusted R-squared:  0.004867 
## F-statistic: 1.472 on 2 and 191 DF,  p-value: 0.2321

seed words only

# Same model, fitted to seed words only. Rows with missing values are
# dropped by lm().
t_seed <- lm(
  formula = t_freq_resid ~ hyper_z + hypo_z,
  data = seeds_only
)
summary(t_seed)
## 
## Call:
## lm(formula = t_freq_resid ~ hyper_z + hypo_z, data = seeds_only)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.41881 -0.27173  0.00826  0.44370  1.07363 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.20607    0.29760   0.692    0.492
## hyper_z      0.25037    0.25741   0.973    0.336
## hypo_z       0.02130    0.03252   0.655    0.516
## 
## Residual standard error: 0.5251 on 45 degrees of freedom
##   (32 observations deleted due to missingness)
## Multiple R-squared:  0.02718,    Adjusted R-squared:  -0.01606 
## F-statistic: 0.6286 on 2 and 45 DF,  p-value: 0.5379

control words only

# Same model, fitted to control words only. Rows with missing values are
# dropped by lm().
t_control <- lm(
  formula = t_freq_resid ~ hyper_z + hypo_z,
  data = controls_only
)
summary(t_control)
## 
## Call:
## lm(formula = t_freq_resid ~ hyper_z + hypo_z, data = controls_only)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.31777 -0.45845  0.09856  0.45669  1.21124 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -0.12024    0.11610  -1.036   0.3021  
## hyper_z     -0.12198    0.05539  -2.202   0.0293 *
## hypo_z      -0.33427    0.43937  -0.761   0.4480  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.562 on 143 degrees of freedom
##   (186 observations deleted due to missingness)
## Multiple R-squared:  0.03717,    Adjusted R-squared:  0.02371 
## F-statistic:  2.76 on 2 and 143 DF,  p-value: 0.06664