Predicting the local–global difference from concreteness: there is a trend for more concrete clusters to show a larger local–global difference.

# Input files for the 10-cluster analysis, resolved with here().

# Concreteness norms (read into `brysbaert_norms` below).
BRYSBAERT_PATH <- here("analyses/02_concreteness_semantics/data/brysbaert_corpus.csv")
# ETS corpus (labelled "TOEFL" below): cluster assignments for the target
# words, and by-cluster pairwise language correlations.
CLUSTER_ETS <- here("analyses/02_concreteness_semantics/data/ets/target_word_cluster_assignments_ets.csv")
ETS_CLUSTER_CORR_PATH <- here("analyses/02_concreteness_semantics/data/ets/lang_pairwise_semantics_correlations_ets_by_cluster.csv")

# Wikipedia corpus: the same two inputs.
CLUSTER_WIKI <- here("analyses/02_concreteness_semantics/data/wiki/target_word_cluster_assignments.csv")
WIKI_CLUSTER_CORR_PATH <- here("analyses/02_concreteness_semantics/data/wiki/lang_pairwise_semantics_correlations_wiki_by_cluster.csv")


# Concreteness norms, reduced to the word and its `conc_m` value (presumably
# the mean concreteness rating -- confirm against the norms file). Rows with a
# missing word are dropped so they cannot take part in later joins.
brysbaert_norms <- BRYSBAERT_PATH %>%
  read_csv() %>%
  clean_names() %>%
  filter(!is.na(word)) %>%
  select(word, conc_m)

# Cluster assignments for the 10-cluster solution, one file per corpus. Each
# table is tagged with its corpus label so the two can be stacked later.
cluster_assignments_raw_ets <- mutate(
  read_csv(CLUSTER_ETS),
  corpus = "TOEFL"
)
cluster_assignments_raw_wiki <- mutate(
  read_csv(CLUSTER_WIKI),
  corpus = "Wikipedia"
)

# Mean concreteness of the words in each cluster, per corpus. The `mean`,
# `ci_lower` and `ci_upper` columns used by the plot further down come from
# multi_boot_standard().
mean_conc_by_cluster <- bind_rows(
  cluster_assignments_raw_ets,
  cluster_assignments_raw_wiki
) %>%
  # Go through character first so a factor-coded cluster id is converted by
  # its label rather than by its internal integer code.
  mutate(cluster = as.numeric(as.character(cluster))) %>%
  # Spell out the join key instead of relying on whatever columns happen to be
  # shared between the two tables.
  # NOTE(review): unlike the by-nclusters analysis below, `word` is not
  # lower-cased before this join -- confirm the 10-cluster files are already
  # lower case.
  left_join(brysbaert_norms, by = "word") %>%
  group_by(cluster, corpus) %>%
  # NOTE(review): words without a norm carry NA in `conc_m` after the left
  # join -- confirm how multi_boot_standard() treats missing values.
  multi_boot_standard(col = "conc_m") %>%
  arrange(corpus, mean)

# By-cluster pairwise language correlations for each corpus. Column names are
# passed explicitly, so read_csv() treats the first line of each file as data.
cluster_ets <- mutate(
  read_csv(
    ETS_CLUSTER_CORR_PATH,
    col_names = c("cluster1", "cluster2", "cor", "lang1", "lang2")
  ),
  corpus = "TOEFL"
)

# Exact duplicate rows are removed for the Wikipedia file only.
cluster_wiki <- mutate(
  distinct(
    read_csv(
      WIKI_CLUSTER_CORR_PATH,
      col_names = c("cluster1", "cluster2", "cor", "lang1", "lang2")
    )
  ),
  corpus = "Wikipedia"
)

# Mean correlation for every (corpus, cluster1, cluster2) cell, averaging over
# language pairs and ignoring missing correlations.
cluster_pair_means <- cluster_ets %>%
  bind_rows(cluster_wiki) %>%
  group_by(corpus, cluster1, cluster2) %>% # aggregate across languages
  summarize(cor = mean(cor, na.rm = TRUE)) %>%
  # summarize() only peels off the last grouping level; drop the rest so the
  # leftover grouping does not leak into later steps.
  ungroup()

# Make the pair table symmetric: append a copy with cluster1 and cluster2
# swapped, so every cluster appears in the `cluster1` position for each of its
# pairs. Pairs of a cluster with itself are labelled "Local", all other pairs
# "Global". distinct() then removes the rows duplicated by the mirroring
# (e.g. the self-pairs).
full_cluster_pair_means <- bind_rows(
  cluster_pair_means,
  with(
    cluster_pair_means,
    data.frame(
      corpus = corpus,
      cluster1 = cluster2,
      cluster2 = cluster1,
      cor = cor
    )
  )
) %>%
  mutate(
    same = case_when(
      cluster1 == cluster2 ~ "Local",
      TRUE ~ "Global"
    )
  ) %>%
  distinct()

# Summarise `cor` for each corpus x cluster x Local/Global cell with
# multi_boot_standard().
df <- full_cluster_pair_means %>%
  group_by(corpus, cluster1, same) %>%
  multi_boot_standard(col = "cor")

# Drop the grouping and reverse the level order of `same`.
df <- mutate(ungroup(df), same = fct_rev(same))

# Local-minus-Global difference in mean correlation for each cluster, per
# corpus, sorted from smallest to largest difference within corpus.
df_segment <- df %>%
  select(corpus, cluster1, same, mean) %>%
  # One row per corpus x cluster, with a `Local` and a `Global` column.
  # pivot_wider() replaces the superseded spread() and matches the reshape
  # used in the by-nclusters analysis further down.
  pivot_wider(names_from = same, values_from = mean) %>%
  mutate(dif = Local - Global) %>%
  arrange(corpus, dif) %>%
  # Rename to `cluster` so the table lines up with mean_conc_by_cluster.
  select(corpus, cluster = cluster1, dif)

10-cluster solutions

These are the words in the TOEFL cluster with the smallest local–global difference (cluster 9):

# Interactive table of the rows assigned to TOEFL cluster 9.
cluster_assignments_raw_ets %>%
  filter(cluster == 9) %>%
  DT::datatable()

… they’re quite abstract!

Local–global difference as a function of mean cluster concreteness:

# Scatter of each cluster's local-global difference against its mean
# concreteness, one panel per corpus, with a linear fit. Horizontal error bars
# span the interval around the concreteness mean.
df_segment %>%
  # Join keys are stated explicitly rather than inferred from shared column
  # names.
  full_join(mean_conc_by_cluster, by = c("corpus", "cluster")) %>%
  ggplot(aes(x = mean, y = dif)) +
  geom_point() +
  geom_errorbarh(aes(xmin = ci_lower, xmax = ci_upper)) +
  geom_smooth(method = "lm") +
  facet_wrap(~corpus) +
  xlab("Mean concreteness") +
  ylab("Local-Global difference") +
  theme_classic()

ETS data by number of clusters

(We don’t have the raw data needed to do this for the Wikipedia corpus.)

# ETS inputs for the analysis across numbers of clusters.
# Cluster assignments with an `n_clusters` column (renamed to `nclusters`
# when the file is read below).
CLUSTER_ETS_ALL <- here("analyses/02_concreteness_semantics/data/ets/target_word_cluster_assignments_ets_all.csv")
# NOTE: this rebinds ETS_CLUSTER_CORR_PATH. Above it pointed at a single CSV;
# from here on it is the directory whose files are listed and read below.
ETS_CLUSTER_CORR_PATH <- here("analyses/02_concreteness_semantics/data/ets/mean_cluster_corrs/")


# ETS cluster assignments across cluster-solution sizes. Only solutions with
# fewer than 500 clusters are kept, and words are lower-cased before they are
# matched against the concreteness norms.
cluster_assignments_raw_ets_all <- CLUSTER_ETS_ALL %>%
  read_csv() %>%
  rename(nclusters = n_clusters) %>%
  filter(nclusters < 500) %>%
  mutate(word = tolower(word))

# Mean concreteness per cluster within each cluster-solution size. This
# overwrites the 10-cluster `mean_conc_by_cluster` computed earlier.
mean_conc_by_cluster <- cluster_assignments_raw_ets_all %>%
  # Join key spelled out instead of inferred from shared column names.
  left_join(brysbaert_norms, by = "word") %>%
  group_by(nclusters, cluster) %>%
  # NOTE(review): words without a norm carry NA in `conc_m` after the left
  # join -- confirm how multi_boot_standard() treats missing values.
  multi_boot_standard(col = "conc_m")


# Read every file in the correlation directory and stack the results. Column
# names are passed explicitly, so the first line of each file is treated as
# data. Rows without a correlation are dropped. This overwrites the
# 10-cluster `cluster_ets` read earlier.
cluster_ets <- map_df(
  # TRUE rather than T: `T` is an ordinary variable that can be reassigned.
  list.files(ETS_CLUSTER_CORR_PATH, full.names = TRUE),
  ~ read_csv(
    .x,
    col_names = c(
      "cluster", "same", "cor", "sd", "n", "lang1", "lang2", "nclusters"
    )
  )
) %>%
  filter(!is.na(cor))

# Local-minus-global correlation difference for each cluster and language
# pair, then averaged over language pairs within each solution size. This
# overwrites the 10-cluster `cluster_pair_means` computed earlier.
cluster_pair_means <- cluster_ets %>%
  select(nclusters, cluster, same, cor, lang1, lang2) %>%
  # The values of `same` become columns; `local` and `global` are the ones
  # used on the next line.
  pivot_wider(names_from = "same", values_from = "cor") %>%
  mutate(dif = local - global) %>%
  group_by(nclusters, cluster) %>%
  summarize(mean_dif = mean(dif, na.rm = TRUE)) %>%
  # summarize() leaves the table grouped by nclusters; drop that so the
  # grouping does not leak into later steps.
  ungroup()

# Per-cluster local-global difference alongside per-cluster concreteness.
# Join keys are stated explicitly rather than inferred from shared column
# names.
all_data <- full_join(
  cluster_pair_means,
  mean_conc_by_cluster,
  by = c("nclusters", "cluster")
)

# Difference against mean concreteness, one panel per solution size, with a
# linear fit and horizontal error bars around the concreteness mean.
all_data %>%
  ggplot(aes(x = mean, y = mean_dif)) +
  geom_point() +
  geom_errorbarh(aes(xmin = ci_lower, xmax = ci_upper)) +
  geom_smooth(method = "lm") +
  facet_wrap(~nclusters) +
  xlab("Mean concreteness") +
  ylab("Local-Global difference") +
  theme_classic()

# For each solution size, correlate cluster concreteness with the local-global
# difference across clusters and keep the tidied cor.test() output, plus a
# flag for p < .05.
# NOTE(review): cor.test() needs at least three complete pairs -- confirm that
# every `nclusters` value present has that many clusters.
corr_values <- all_data %>%
  filter(!is.na(mean_dif)) %>%
  group_by(nclusters) %>%
  nest() %>%
  mutate(cor = map(data, ~ cor.test(.x$mean, .x$mean_dif) %>% tidy())) %>%
  select(-data) %>%
  # Name the list column explicitly; unnest() without `cols` is deprecated.
  unnest(cols = c(cor)) %>%
  ungroup() %>%
  mutate(sig = case_when(p.value < .05 ~ "sig", TRUE ~ "not_sig"))

# Concreteness ~ local/global-difference correlation as a function of the
# number of clusters (log-scaled x axis). Point ranges show the confidence
# interval from cor.test(), coloured by significance; the dashed line marks
# zero.
corr_values %>%
  ggplot(aes(x = nclusters, y = estimate)) +
  geom_line() +
  # A constant intercept is passed as a parameter, not mapped through aes(),
  # so a single reference line is drawn instead of one per data row.
  geom_hline(yintercept = 0, linetype = 2) +
  geom_pointrange(aes(color = sig, ymin = conf.low, ymax = conf.high)) +
  scale_x_log10() +
  ylab("Local/Global difference ~ concreteness corr") +
  theme_classic()