# Directory holding the per-cluster correlation files for the ETS corpus.
ETS_CLUSTER_CORR_PATH <- here("analyses/02_concreteness_semantics/data/ets/mean_cluster_corrs/")

ets_cluster_files <- list.files(ETS_CLUSTER_CORR_PATH, full.names = TRUE)

# Fail early with a readable message: with no files, map_df() would return an
# empty table and the pipeline would only break later on a missing column.
if (length(ets_cluster_files) == 0) {
  stop(
    "No ETS cluster correlation files found in ", ETS_CLUSTER_CORR_PATH,
    call. = FALSE
  )
}

# Read every file and row-bind the results. Column names are supplied
# explicitly, so the first line of each file is treated as data.
# Rows without a correlation value are dropped.
cluster_ets <- map_df(
  ets_cluster_files,
  ~ read_csv(
    .,
    col_names = c("cluster", "same", "cor", "sd", "n", "lang1", "lang2", "nclusters")
  )
) %>%
  filter(!is.na(cor))
# Aggregate within language pairs: mean correlation for every
# (nclusters, lang1, lang2, same) combination in the ETS data, tagged with
# corpus = "ets".
# NOTE: summarize() peels off only the last grouping variable, so the result
# remains grouped by nclusters, lang1 and lang2.
language_pair_means_ets <- cluster_ets %>%
  group_by(nclusters, lang1, lang2, same) %>%
  summarize(cor = mean(cor, na.rm = TRUE)) %>%
  mutate(corpus = "ets")
# Directory holding the correlation files for the wiki corpus.
WIKI_CLUSTER_CORR_PATH <- here("analyses/02_concreteness_semantics/data/wiki/mean_cluster_corrs/")

wiki_cluster_files <- list.files(WIKI_CLUSTER_CORR_PATH, full.names = TRUE)

# Fail early with a readable message instead of producing an empty table that
# breaks further down the script.
if (length(wiki_cluster_files) == 0) {
  stop(
    "No wiki cluster correlation files found in ", WIKI_CLUSTER_CORR_PATH,
    call. = FALSE
  )
}

# Read every file and row-bind the results, tagging rows with
# corpus = "wiki". Column names are supplied explicitly, so the first line of
# each file is treated as data. Unlike the ETS data, these values are used
# directly with no further aggregation step in this script.
language_pair_means_wiki <- map_df(
  wiki_cluster_files,
  ~ read_csv(., col_names = c("nclusters", "same", "cor", "lang1", "lang2"))
) %>%
  mutate(corpus = "wiki")
# Stack the ETS and wiki language-pair means into one long table; the
# `corpus` column tells the two sources apart.
language_pair_means <- language_pair_means_ets %>%
  bind_rows(language_pair_means_wiki)
# Local-global difference: spread the levels of `same` into their own columns
# (the code below expects `local` and `global`), then subtract them for each
# corpus / nclusters / language pair.
language_pair_means_diff <- language_pair_means %>%
  select(corpus, nclusters, lang1, lang2, same, cor) %>%
  spread(key = same, value = cor) %>%
  mutate(dif = local - global)
# Summarise the local-global difference per corpus and cluster count using
# multi_boot_standard(); its `mean`, `ci_lower` and `ci_upper` columns are
# what the plots and model fits below consume.
# NOTE(review): multi_boot_standard() is presumably a resampling-based CI
# helper; no RNG seed is set in the visible code, so the interval bounds may
# vary between runs -- confirm whether that matters here.
mean_dif <- language_pair_means_diff %>%
  group_by(corpus, nclusters) %>%
  multi_boot_standard(col = "dif", na.rm = TRUE) %>%
  mutate(
    # 10000 for "ets"; every other corpus value (here only "wiki") gets 3530.
    # Presumably the number of words clustered per corpus -- TODO confirm.
    nwords = case_when(
      corpus == "ets" ~ 10000,
      TRUE ~ 3530
    ),
    nwords_per_cluster = nwords / nclusters
  )
# Local-global difference against the number of clusters: one colour per
# corpus, point ranges for the interval bounds, and a per-corpus linear fit
# on log(x).
ggplot(mean_dif, aes(x = nclusters, y = mean, color = corpus)) +
  # Upper bound is mapped with the canonical `ymax` aesthetic name.
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  geom_smooth(formula = y ~ log(x), method = "lm") +
  # Zero reference line: given as a fixed parameter rather than through aes(),
  # so a single neutral line is drawn instead of one overplotted line per data
  # row that inherits the corpus colour.
  geom_hline(yintercept = 0, linetype = 2) +
  xlab("Number of Clusters") +
  ylab("Local-Global Difference") +
  theme_classic(base_size = 14)

# Local-global difference against the number of words per cluster: one colour
# per corpus, point ranges for the interval bounds, and a per-corpus linear
# fit on log(x).
ggplot(mean_dif, aes(x = nwords_per_cluster, y = mean, color = corpus)) +
  # Upper bound is mapped with the canonical `ymax` aesthetic name.
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  # Disabled alternative layers, kept for reference:
  # geom_point(alpha = .6, size = 3) +
  # geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  # geom_line() +
  geom_smooth(formula = y ~ log(x), method = "lm") +
  # Zero reference line: given as a fixed parameter rather than through aes(),
  # so a single neutral line is drawn instead of one overplotted line per data
  # row that inherits the corpus colour.
  geom_hline(yintercept = 0, linetype = 2) +
  xlab("Number of Words per Cluster") +
  # Disabled R^2 annotation. NOTE(review): it refers to `rsquared`, which is
  # not defined in the visible code -- define it before re-enabling.
  # annotate(geom = "text", label=paste("italic(R^2)==",rsquared), x = 200, y = .04, parse = T, color = "red", size = 6) +
  ylab("Local-Global Difference") +
  # scale_x_log10() +
  theme_classic(base_size = 14)

# R^2 (rounded to two decimals) of a linear model predicting the mean
# local-global difference from log(nclusters).
#
# corpus_means: rows of `mean_dif` for a single corpus; must contain the
#   columns `mean` and `nclusters`.
# Returns a single numeric value.
#
# Within one corpus nwords is constant, so log(nwords_per_cluster) is just
# log(nwords) - log(nclusters); this R^2 therefore also describes the fit
# against log(nwords_per_cluster).
.log_nclusters_rsquared <- function(corpus_means) {
  log_fit <- lm(mean ~ log(nclusters), data = corpus_means)
  round(summary(log_fit)$r.squared, 2)
}

rsquared_ets <- .log_nclusters_rsquared(filter(mean_dif, corpus == "ets"))
rsquared_wiki <- .log_nclusters_rsquared(filter(mean_dif, corpus == "wiki"))