# Path to the language effect-size table, resolved through here().
LANG_ES_PATH <- here("analyses/study1c/parameter_exploration/es_params.csv")

# Column names are supplied explicitly, so read_csv() treats every row of the
# file (including the first one) as data.
# NOTE(review): this presumes the CSV has no header row -- confirm.
# The `x` column is not used downstream and is dropped right after reading.
lang_es <- select(
  read_csv(
    LANG_ES_PATH,
    col_names = c(
      "domain",
      "x",
      "lang_effect_size",
      "model",
      "min_word_count",
      "vector_size",
      "ngrams",
      "window_size"
    )
  ),
  -x
)
# Put the effect sizes of the two models side by side: one row per
# domain x parameter combination, with the "coca" value in `coca_lang_es`
# and the "bnc" value in `bnc_lang_es`. Each model's effect-size column is
# given its final name before the join, so the join itself produces no
# suffixed duplicates. A full join keeps combinations that are present for
# only one of the two models (the other side is then NA).
tidy_lang_es <- full_join(
  lang_es %>%
    filter(model == "coca") %>%
    select(-model) %>%
    rename(coca_lang_es = lang_effect_size),
  lang_es %>%
    filter(model == "bnc") %>%
    select(-model) %>%
    rename(bnc_lang_es = lang_effect_size),
  by = c(
    "domain", "min_word_count",
    "vector_size", "ngrams", "window_size"
  )
) %>%
  # Difference between the two models' effect sizes (bnc minus coca).
  mutate(lang_diff = bnc_lang_es - coca_lang_es) %>%
  # Identifier / parameter columns first, then the effect-size columns
  # (coca, bnc, difference) in their existing order.
  select(
    domain, min_word_count, vector_size, ngrams, window_size,
    everything()
  )
# Path to the processed behavioral IAT data, resolved through here().
BEHAVIORAL_PATH <- here("data/study1c/processed/tidy_behavioral_iat_data.csv")

# Read the behavioral table and give the `uk` / `us` columns explicit names
# so they cannot be confused with the language-based effect sizes after the
# two tables are joined.
tidy_behavioral_es <- rename(
  read_csv(BEHAVIORAL_PATH),
  uk_behavior_es = uk,
  us_behavior_es = us
)
# Attach the behavioral measures to every language effect-size row.
# No `by` is given, so left_join() matches on all column names the two
# tables share and reports the columns it used in a message.
# NOTE(review): the shared key is presumably just `domain` -- confirm against
# the behavioral file.
full_df <- left_join(tidy_lang_es, tidy_behavioral_es)
# Run the five language/behavior correlation tests for one parameter setting.
#
# current_df: data frame with the columns coca_lang_es, bnc_lang_es,
#   us_behavior_es, uk_behavior_es, behavioral_resid_diff and lang_diff.
#
# Returns a length-one list holding a data frame with one row per test
# (labelled in `corr`) and the columns estimate, statistic, p.value,
# parameter, conf.low and conf.high. The list wrapper is part of the
# contract: the caller unnests the result twice.
get_correlations <- function(current_df) {
  # One cor.test(), tidied to a single row and tagged with its label.
  labelled_cor <- function(x, y, label) {
    mutate(tidy(cor.test(x, y)), corr = label)
  }

  coca_lang <- current_df$coca_lang_es
  bnc_lang <- current_df$bnc_lang_es
  us_behavior <- current_df$us_behavior_es
  uk_behavior <- current_df$uk_behavior_es

  corr_rows <- list(
    labelled_cor(coca_lang, us_behavior, "coca_us"),
    labelled_cor(coca_lang, uk_behavior, "coca_uk"),
    labelled_cor(bnc_lang, us_behavior, "bnc_us"),
    labelled_cor(bnc_lang, uk_behavior, "bnc_uk"),
    # Difference scores: behavioral residual difference vs. language difference.
    labelled_cor(
      current_df$behavioral_resid_diff,
      current_df$lang_diff,
      "diff_corr"
    )
  )

  corr_table <- select(
    bind_rows(corr_rows),
    corr, estimate, statistic, p.value,
    parameter, conf.low, conf.high
  )

  list(corr_table)
}
# Compute the correlation table for every combination of vector_size,
# ngrams and window_size.
all_corrs <- full_df %>%
  group_by(vector_size, ngrams, window_size) %>%
  # Keep only the first 31 rows of each group.
  # NOTE(review): 31 is presumably the number of domains, which would make
  # this keep a single min_word_count setting per group (min_word_count is
  # not a grouping variable) -- confirm.
  slice(seq_len(31)) %>%
  nest() %>%
  mutate(model = map(data, get_correlations)) %>%
  select(-data) %>%
  # get_correlations() returns its data frame wrapped in a list, so two
  # rounds of unnesting are needed: the first removes the wrapper, the
  # second expands the data frame into rows.
  unnest() %>%
  unnest()
In the figure below, the facet labels 200-500 (on the right) refer to vector size, and the labels 2-20 (along the top) refer to window size.
# Plot the correlation estimates with their confidence intervals: one panel
# per ngrams x vector_size (rows) and window_size (columns) combination.
all_corrs %>%
  # Put "coca_us" and "bnc_uk" first on the x axis; the remaining levels
  # keep their existing order.
  mutate(corr = fct_relevel(corr, "coca_us", "bnc_uk")) %>%
  # Drop the largest settings so the grid stays readable.
  filter(
    vector_size < 500,
    window_size < 20
  ) %>%
  ggplot(
    mapping = aes(
      x = corr,
      y = estimate,
      color = ngrams,
      group = interaction(ngrams, window_size, vector_size)
    )
  ) +
  labs(y = "correlation coefficient") +
  # Connect the estimates that belong to the same parameter setting.
  geom_line() +
  # Dashed reference line at zero correlation.
  geom_hline(mapping = aes(yintercept = 0), linetype = "dashed") +
  geom_pointrange(
    mapping = aes(ymin = conf.low, ymax = conf.high),
    size = 0.4
  ) +
  facet_grid(ngrams + vector_size ~ window_size) +
  theme_classic() +
  # Rotate the x labels so the correlation names do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))