# Read the survey word list, dropping its `X` column
# (presumably a saved row index -- confirm against the CSV).
survey_words <- read.csv("words_to_survey.csv") %>%
  select(-X)

# Read the per-word coefficient table and rename its key column so it matches
# the `Word` column of the survey list.
coeffs <- read.csv("word_coeffs_log_mtld_diff_30_42_with_freq_resid.csv") %>%
  rename(Word = word)

# Attach coefficients to every survey word (left join keeps unmatched survey
# words), drop exact duplicate rows, and sort by `t`.
# The key is converted to character on both sides for the join only: joining
# on factor keys with different level sets made left_join() warn ("joining
# factors with different levels") and coerce the key to character anyway, so
# the joined result is the same, just without the warning.
words_with_coeffs <- left_join(
  mutate(survey_words, Word = as.character(Word)),
  mutate(coeffs, Word = as.character(Word)),
  by = "Word"
) %>%
  distinct() %>%
  arrange(t)
# Z-score the hypernym and hyponym counts separately within each part of
# speech, so a noun is standardised against nouns and a verb against verbs.
# scale() returns a one-column matrix carrying centring/scaling attributes;
# as.numeric() reduces it to a plain numeric vector so the data frames do not
# end up with matrix-valued columns.
nouns <- words_with_coeffs %>%
  filter(pos == "n") %>%
  mutate(
    hyper_z = as.numeric(scale(num_hypernyms)),
    hypo_z = as.numeric(scale(num_hyponyms))
  )

verbs <- words_with_coeffs %>%
  filter(pos == "v") %>%
  mutate(
    hyper_z = as.numeric(scale(num_hypernyms)),
    hypo_z = as.numeric(scale(num_hyponyms))
  )

# Stack nouns first, then verbs. Rows whose `pos` is neither "n" nor "v" are
# not carried forward.
words_with_coeffs_scaled <- bind_rows(nouns, verbs)
# Keep only the listed columns of the scaled table, then remove rows that are
# exact duplicates on those columns.
t_test_df <- words_with_coeffs_scaled %>%
  select(
    SeedWord, Word, pos, kid_log_freq, adult_log_freq, type,
    aoa, logFreq_subtlex, Estimate, t, t_freq_resid, hyper_z, hypo_z
  ) %>%
  distinct()
# Split the analysis table by word type; the *_no_na variants additionally
# drop rows where `t` is missing.
seeds_only <- t_test_df %>%
  filter(type == "seed")
seeds_no_na <- seeds_only %>%
  filter(!is.na(t))

controls_only <- t_test_df %>%
  filter(type == "control")
controls_no_na <- controls_only %>%
  filter(!is.na(t))
# All words: `t` against the hypernym z-score, coloured by part of speech,
# with a linear fit overlaid.
words_with_coeffs_scaled %>%
  ggplot(aes(x = hyper_z, y = t, colour = pos)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
## Warning: Removed 224 rows containing non-finite values (stat_smooth).
## Warning: Removed 224 rows containing missing values (geom_point).
# Control words only: `t` against the hypernym z-score with a linear fit.
words_with_coeffs_scaled %>%
  filter(type == "control") %>%
  ggplot(aes(x = hyper_z, y = t)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
## Warning: Removed 194 rows containing non-finite values (stat_smooth).
## Warning: Removed 194 rows containing missing values (geom_point).
# Seed words only: `t` against the hypernym z-score with a linear fit.
words_with_coeffs_scaled %>%
  filter(type == "seed") %>%
  ggplot(aes(x = hyper_z, y = t)) +
  geom_point() +
  geom_smooth(method = lm) +
  theme_classic()
## Warning: Removed 30 rows containing non-finite values (stat_smooth).
## Warning: Removed 30 rows containing missing values (geom_point).
# Pooled-variance two-sample t-test of `t` between word types.
# t.test() has no `na.rm` argument (it was silently absorbed by `...`); rows
# with missing values are already excluded by the test itself, so the
# argument is dropped.
t.test(t ~ type, data = t_test_df, var.equal = TRUE)
##
## Two Sample t-test
##
## data: t by type
## t = 0.51069, df = 194, p-value = 0.6102
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1391671 0.2364186
## sample estimates:
## mean in group control mean in group seed
## 0.02306797 -0.02555780
# Pooled-variance two-sample t-test of `t_freq_resid` between word types.
# t.test() has no `na.rm` argument (it was silently absorbed by `...`); rows
# with missing values are already excluded by the test itself, so the
# argument is dropped.
t.test(t_freq_resid ~ type, data = t_test_df, var.equal = TRUE)
##
## Two Sample t-test
##
## data: t_freq_resid by type
## t = 0.38134, df = 194, p-value = 0.7034
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1473496 0.2179891
## sample estimates:
## mean in group control mean in group seed
## -0.05285229 -0.08817204
# Linear model over all words: `t_freq_resid` regressed on the hypernym and
# hyponym z-scores. lm() drops rows with missing values in these variables.
t_all <- lm(
  formula = t_freq_resid ~ hyper_z + hypo_z,
  data = t_test_df
)
summary(t_all)
##
## Call:
## lm(formula = t_freq_resid ~ hyper_z + hypo_z, data = t_test_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.39021 -0.42736 0.04601 0.48227 1.26614
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.065027 0.040798 -1.594 0.113
## hyper_z -0.076251 0.045991 -1.658 0.099 .
## hypo_z -0.003386 0.030536 -0.111 0.912
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5546 on 191 degrees of freedom
## (218 observations deleted due to missingness)
## Multiple R-squared: 0.01518, Adjusted R-squared: 0.004867
## F-statistic: 1.472 on 2 and 191 DF, p-value: 0.2321
# Same model restricted to seed words. It is fitted on `seeds_only`, so rows
# with missing values are dropped by lm() itself.
t_seed <- lm(
  formula = t_freq_resid ~ hyper_z + hypo_z,
  data = seeds_only
)
summary(t_seed)
##
## Call:
## lm(formula = t_freq_resid ~ hyper_z + hypo_z, data = seeds_only)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.41881 -0.27173 0.00826 0.44370 1.07363
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.20607 0.29760 0.692 0.492
## hyper_z 0.25037 0.25741 0.973 0.336
## hypo_z 0.02130 0.03252 0.655 0.516
##
## Residual standard error: 0.5251 on 45 degrees of freedom
## (32 observations deleted due to missingness)
## Multiple R-squared: 0.02718, Adjusted R-squared: -0.01606
## F-statistic: 0.6286 on 2 and 45 DF, p-value: 0.5379
# Same model restricted to control words. It is fitted on `controls_only`, so
# rows with missing values are dropped by lm() itself.
t_control <- lm(
  formula = t_freq_resid ~ hyper_z + hypo_z,
  data = controls_only
)
summary(t_control)
##
## Call:
## lm(formula = t_freq_resid ~ hyper_z + hypo_z, data = controls_only)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.31777 -0.45845 0.09856 0.45669 1.21124
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.12024 0.11610 -1.036 0.3021
## hyper_z -0.12198 0.05539 -2.202 0.0293 *
## hypo_z -0.33427 0.43937 -0.761 0.4480
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.562 on 143 degrees of freedom
## (186 observations deleted due to missingness)
## Multiple R-squared: 0.03717, Adjusted R-squared: 0.02371
## F-statistic: 2.76 on 2 and 143 DF, p-value: 0.06664