# Directory of cluster-level correlation CSVs for the ETS corpus.
ETS_CLUSTER_CORR_PATH <- here("analyses/02_concreteness_semantics/data/ets/mean_cluster_corrs/")

# Read every file in the directory and stack the rows into one data frame.
# Column names are supplied explicitly, so the first line of each file is
# read as data rather than as a header. Rows with a missing correlation are
# dropped.
cluster_ets <- map_df(
  list.files(ETS_CLUSTER_CORR_PATH, full.names = TRUE),
  ~ read_csv(., col_names = c("cluster", "same", "cor", "sd", "n", "lang1", "lang2", "nclusters"))
) %>%
  filter(!is.na(cor))

# aggregate within language pairs: one mean correlation per
# (nclusters, lang1, lang2, same) combination
language_pair_means_ets <- cluster_ets %>%
  group_by(nclusters, lang1, lang2, same) %>%
  summarize(cor = mean(cor, na.rm = TRUE)) %>%
  # summarize() only peels off the last grouping variable; drop the rest so
  # the grouping does not leak into bind_rows() and later steps.
  ungroup() %>%
  mutate(corpus = "ets")
# Directory of correlation CSVs for the Wikipedia corpus.
WIKI_CLUSTER_CORR_PATH <- here("analyses/02_concreteness_semantics/data/wiki/mean_cluster_corrs/")

# Read and stack every file in the directory. Unlike the ETS data, these rows
# are used as-is (no per-language-pair averaging is applied here), so the
# files are presumably already aggregated -- TODO confirm.
language_pair_means_wiki <- map_df(
  list.files(WIKI_CLUSTER_CORR_PATH, full.names = TRUE),
  ~ read_csv(., col_names = c("nclusters", "same", "cor", "lang1", "lang2"))
) %>%
  mutate(corpus = "wiki")

# Combine both corpora; the `corpus` column tells the rows apart.
language_pair_means <- bind_rows(
  language_pair_means_ets,
  language_pair_means_wiki
)

# Local-minus-global difference per corpus, cluster count and language pair:
# widen so each level of `same` becomes its own column, then subtract.
language_pair_means_diff <- language_pair_means %>%
  select(corpus, nclusters, lang1, lang2, same, cor) %>%
  spread(key = same, value = cor) %>%
  mutate(dif = local - global)

# Summarise the local-global difference for each corpus x cluster-count cell.
# multi_boot_standard() is defined outside this file; the plots below rely on
# it returning `mean`, `ci_lower` and `ci_upper` columns (presumably a
# bootstrapped mean and confidence interval -- TODO confirm).
mean_dif <- language_pair_means_diff %>%
  group_by(corpus, nclusters) %>%
  multi_boot_standard(col = "dif", na.rm = TRUE) %>%
  mutate(
    # Hard-coded vocabulary size per corpus: 10000 for "ets", 3530 otherwise.
    nwords = case_when(
      corpus == "ets" ~ 10000,
      TRUE ~ 3530
    ),
    nwords_per_cluster = nwords / nclusters
  )

# Local-global difference as a function of the number of clusters, one colour
# per corpus, with a log-linear fit and a dashed reference line at zero.
ggplot(mean_dif, aes(x = nclusters, y = mean, color = corpus)) +
  # `ymax` spelled out: the previous `max` only worked through ggplot2's
  # base-R aesthetic alias and was inconsistent with `ymin`.
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  geom_smooth(formula = y ~ log(x), method = "lm") +
  geom_hline(aes(yintercept = 0), linetype = 2) +
  xlab("Number of Clusters") +
  ylab("Local-Global Difference") +
  theme_classic(base_size = 14)

# Same summary as the previous plot, but with words per cluster on the x-axis.
ggplot(mean_dif, aes(x = nwords_per_cluster, y = mean, color = corpus)) +
  # `ymax` spelled out: the previous `max` only worked through ggplot2's
  # base-R aesthetic alias and was inconsistent with `ymin`.
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  # geom_point(alpha = .6, size = 3) +
  # geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  # geom_line() +
  geom_smooth(formula = y ~ log(x), method = "lm") +
  geom_hline(aes(yintercept = 0), linetype = 2) +
  xlab("Number of Words per Cluster") +
  # annotate(geom = "text", label=paste("italic(R^2)==",rsquared), x = 200, y = .04, parse = T, color = "red", size = 6) +
  ylab("Local-Global Difference") +
  # scale_x_log10() +
  theme_classic(base_size = 14)

# R-squared (rounded to 2 decimals) of the log-linear fit of the mean
# local-global difference on the number of clusters, computed per corpus.
rsquared_ets <- round(
  summary(
    lm(mean ~ log(nclusters), data = filter(mean_dif, corpus == "ets"))
  )$r.squared,
  2
)

rsquared_wiki <- round(
  summary(
    lm(mean ~ log(nclusters), data = filter(mean_dif, corpus == "wiki"))
  )$r.squared,
  2
)

# Ribbon plot

# Summary of the raw correlations for every comparison type x corpus x
# cluster-count cell. Despite the `ets_` prefix this covers every corpus in
# `language_pair_means`. multi_boot_standard() is defined outside this file;
# the code below relies on it returning `mean`, `ci_lower` and `ci_upper`.
ets_ribbon_data <- language_pair_means %>%
  group_by(same, corpus, nclusters) %>%
  multi_boot_standard(col = "cor", na.rm = TRUE)

# Attach to every row the "local" and "global" means of its corpus x nclusters
# cell (used as the ribbon's upper and lower edge), then relabel for display.
ets_ribbon_data_for_plotting <- ets_ribbon_data %>%
  group_by(corpus, nclusters) %>%
  nest() %>%
  # map_dbl() requires exactly one "local" and one "global" row per cell and
  # errors otherwise.
  mutate(
    mean_max = map_dbl(data, ~ filter(., same == "local") %>% pull(mean)),
    mean_min = map_dbl(data, ~ filter(., same == "global") %>% pull(mean))
  ) %>%
  # Name the list-column explicitly: a bare unnest() is deprecated in
  # tidyr >= 1.0, while unnest(data) works in both old and new tidyr.
  unnest(data) %>%
  ungroup() %>%
  mutate(
    corpus = case_when(
      corpus == "wiki" ~ "Wikipedia",
      corpus == "ets" ~ "TOEFL"
    ),
    # Reverse the level order of `same` for the legend and colour mapping.
    same = fct_rev(same)
  )

# Cross-linguistic correlation by number of clusters (log x-axis), one line per
# comparison type and one facet per corpus. The ribbon spans the gap between
# the global and local means.
ggplot(ets_ribbon_data_for_plotting, aes(x = nclusters, y = mean, color = same)) +
  # `x` is inherited from the plot-level mapping.
  geom_ribbon(aes(ymin = mean_min, ymax = mean_max), alpha = .1, color = "white") +
  # `ymax` spelled out: the previous `max` only worked through ggplot2's
  # base-R aesthetic alias and was inconsistent with `ymin`.
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  geom_line(size = 2) +
  xlab("Number of Clusters (log)") +
  # Only this y label was ever shown; an earlier ylab() call in the chain was
  # overridden by it and has been removed.
  ylab("Cross-linguistic\nWord Distance Correlation") +
  scale_x_log10() +
  # annotation_logticks() +
  scale_color_manual("Comparison", values = c("#E41A1C", "#377EB8")) +
  facet_wrap(~corpus) +
  theme_classic(base_size = 20) +
  theme(
    axis.line = element_line(size = 1.2),
    axis.ticks = element_line(size = 1),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 10),
    legend.background = element_rect(linetype = 1, size = 0.5, colour = 1)
  )