Predicting the local-global difference from concreteness: there’s a trend for more concrete clusters to show a larger difference.
# Input files for this analysis, resolved relative to the project root.
BRYSBAERT_PATH <- here(
  "analyses/02_concreteness_semantics/data/brysbaert_corpus.csv"
)
CLUSTER_ETS <- here(
  "analyses/02_concreteness_semantics/data/ets/target_word_cluster_assignments_ets.csv"
)
ETS_CLUSTER_CORR_PATH <- here(
  "analyses/02_concreteness_semantics/data/ets/lang_pairwise_semantics_correlations_ets_by_cluster.csv"
)
CLUSTER_WIKI <- here(
  "analyses/02_concreteness_semantics/data/wiki/target_word_cluster_assignments.csv"
)
WIKI_CLUSTER_CORR_PATH <- here(
  "analyses/02_concreteness_semantics/data/wiki/lang_pairwise_semantics_correlations_wiki_by_cluster.csv"
)

# Concreteness norms: keep only the word and its conc_m value, and discard
# rows whose word is missing.
# NOTE(review): conc_m is presumably the mean concreteness rating -- confirm
# against the norms file.
brysbaert_norms <- BRYSBAERT_PATH %>%
  read_csv() %>%
  clean_names() %>%
  select(word, conc_m) %>%
  filter(!is.na(word))
# 10 clusters
# Word-to-cluster assignments for each corpus, tagged with a corpus label so
# the two tables can be stacked.
cluster_assignments_raw_ets <- read_csv(CLUSTER_ETS) %>%
  mutate(corpus = "TOEFL")
cluster_assignments_raw_wiki <- read_csv(CLUSTER_WIKI) %>%
  mutate(corpus = "Wikipedia")

# Summarise conc_m per cluster and corpus with multi_boot_standard(), then
# order the clusters by their mean within each corpus.
mean_conc_by_cluster <- bind_rows(
  cluster_assignments_raw_ets,
  cluster_assignments_raw_wiki
) %>%
  mutate(cluster = as.numeric(as.character(cluster))) %>%
  # Join key spelled out instead of relying on a natural join; the norms table
  # only carries `word` and `conc_m`, so `word` is the shared key.
  left_join(brysbaert_norms, by = "word") %>%
  group_by(cluster, corpus) %>%
  multi_boot_standard(col = "conc_m") %>%
  arrange(corpus, mean)
# Pairwise correlations per cluster pair and language pair. Column names are
# supplied explicitly to read_csv() for both files.
cluster_ets <- read_csv(
  ETS_CLUSTER_CORR_PATH,
  col_names = c("cluster1", "cluster2", "cor", "lang1", "lang2")
) %>%
  mutate(corpus = "TOEFL")

# NOTE(review): only the Wikipedia table is de-duplicated -- confirm that the
# TOEFL file cannot contain repeated rows.
cluster_wiki <- read_csv(
  WIKI_CLUSTER_CORR_PATH,
  col_names = c("cluster1", "cluster2", "cor", "lang1", "lang2")
) %>%
  distinct() %>%
  mutate(corpus = "Wikipedia")

# Mean correlation for every (corpus, cluster1, cluster2) combination.
cluster_pair_means <- cluster_ets %>%
  bind_rows(cluster_wiki) %>%
  group_by(corpus, cluster1, cluster2) %>% # aggregate across languages
  summarize(cor = mean(cor, na.rm = TRUE)) %>%
  # summarize() only peels off the last grouping level; drop the rest so no
  # stale grouping leaks into later steps.
  ungroup()
# Stack the pair means on top of a mirrored copy (cluster1 and cluster2
# swapped), label each row "Local" when both cluster ids match and "Global"
# otherwise, then drop exact duplicate rows.
full_cluster_pair_means <- bind_rows(
  cluster_pair_means,
  with(
    cluster_pair_means,
    data.frame(
      corpus = corpus,
      cluster2 = cluster1,
      cluster1 = cluster2,
      cor = cor
    )
  )
) %>%
  mutate(
    same = case_when(
      cluster1 == cluster2 ~ "Local",
      TRUE ~ "Global"
    )
  ) %>%
  distinct()
# Summarise `cor` with multi_boot_standard() for every corpus, cluster and
# pair type (Local vs. Global).
df <- full_cluster_pair_means %>%
  group_by(corpus, cluster1, same) %>%
  multi_boot_standard(col = "cor") %>%
  ungroup()

# Store the pair type as a factor with its level order reversed.
df$same <- fct_rev(df$same)
# Local-minus-Global difference in mean correlation for each cluster, sorted
# from smallest to largest difference within each corpus.
df_segment <- df %>%
  select(corpus, cluster = cluster1, same, mean) %>%
  # pivot_wider() replaces the superseded spread(): one column per pair type.
  pivot_wider(names_from = same, values_from = mean) %>%
  mutate(dif = Local - Global) %>%
  arrange(corpus, dif) %>%
  select(corpus, cluster, dif)
These are the words in the TOEFL cluster with the smallest local-global difference:
# Interactive table of the rows assigned to TOEFL cluster 9.
# NOTE(review): the cluster id is hard-coded; confirm it still matches the
# first TOEFL row of df_segment if the clustering is re-run.
cluster_assignments_raw_ets %>%
  filter(cluster == 9) %>%
  DT::datatable()
… they’re quite abstract!
Local-global difference as a function of cluster concreteness:
# Scatter of the local-global difference against mean cluster concreteness,
# one panel per corpus, with horizontal intervals on the concreteness
# estimate and a linear fit.
df_segment %>%
  # Keys made explicit: df_segment holds corpus, cluster and dif, so corpus
  # and cluster are the only columns it can share with the concreteness table.
  full_join(mean_conc_by_cluster, by = c("corpus", "cluster")) %>%
  ggplot(aes(x = mean, y = dif)) +
  xlab("Mean concreteness") +
  ylab("Local-Global difference") +
  geom_point() +
  geom_errorbarh(aes(xmin = ci_lower, xmax = ci_upper)) +
  geom_smooth(method = "lm") +
  facet_wrap(~corpus) +
  theme_classic()
(We don’t have the raw data to do this for Wikipedia.)
# Cluster assignments for the TOEFL corpus with an n_clusters column.
CLUSTER_ETS_ALL <- here(
  "analyses/02_concreteness_semantics/data/ets/target_word_cluster_assignments_ets_all.csv"
)
# Rebinds ETS_CLUSTER_CORR_PATH: from here on it names a directory whose files
# are listed and read one by one, not the single CSV used earlier.
ETS_CLUSTER_CORR_PATH <- here(
  "analyses/02_concreteness_semantics/data/ets/mean_cluster_corrs/"
)

# Lower-case the words, rename n_clusters to nclusters, and keep only rows
# with nclusters below 500.
cluster_assignments_raw_ets_all <- CLUSTER_ETS_ALL %>%
  read_csv() %>%
  mutate(word = tolower(word)) %>%
  rename(nclusters = n_clusters) %>%
  filter(nclusters < 500)
# Summarise conc_m with multi_boot_standard() for every cluster within every
# value of nclusters. This overwrites the earlier per-corpus
# mean_conc_by_cluster table.
mean_conc_by_cluster <- cluster_assignments_raw_ets_all %>%
  # Join key spelled out instead of relying on a natural join.
  left_join(brysbaert_norms, by = "word") %>%
  group_by(nclusters, cluster) %>%
  multi_boot_standard(col = "conc_m")
# Read every file in the correlation directory into one table (column names
# are supplied explicitly) and drop rows without a correlation value.
cluster_ets <- map_df(
  list.files(ETS_CLUSTER_CORR_PATH, full.names = TRUE),
  ~ read_csv(
    .x,
    col_names = c(
      "cluster", "same", "cor", "sd", "n", "lang1", "lang2", "nclusters"
    )
  )
) %>%
  filter(!is.na(cor))

# Local-minus-global difference per language pair, averaged over language
# pairs for each cluster and value of nclusters.
cluster_pair_means <- cluster_ets %>%
  select(nclusters, cluster, same, cor, lang1, lang2) %>%
  # One column per value of `same`; the next step expects `local` and `global`.
  pivot_wider(names_from = "same", values_from = "cor") %>%
  mutate(dif = local - global) %>%
  group_by(nclusters, cluster) %>%
  summarize(mean_dif = mean(dif, na.rm = TRUE)) %>%
  # summarize() only peels off the last grouping level; drop the rest so no
  # stale grouping leaks into later steps.
  ungroup()
# Combine the mean local-global difference with the concreteness summary for
# each cluster. Keys are explicit: cluster_pair_means only holds nclusters,
# cluster and mean_dif, so nclusters and cluster are the shared columns.
all_data <- full_join(
  cluster_pair_means,
  mean_conc_by_cluster,
  by = c("nclusters", "cluster")
)

# Local-global difference against mean concreteness, one panel per value of
# nclusters, with horizontal intervals on concreteness and a linear fit.
all_data %>%
  ggplot(aes(x = mean, y = mean_dif)) +
  xlab("Mean concreteness") +
  facet_wrap(~nclusters) +
  ylab("Local-Global difference") +
  geom_point() +
  geom_errorbarh(aes(xmin = ci_lower, xmax = ci_upper)) +
  geom_smooth(method = "lm") +
  theme_classic()
# For each value of nclusters, run cor.test() between mean concreteness and
# the mean local-global difference, and flag results with p < .05.
corr_values <- all_data %>%
  filter(!is.na(mean_dif)) %>%
  group_by(nclusters) %>%
  nest() %>%
  mutate(cor = map(data, ~ cor.test(.x$mean, .x$mean_dif) %>% tidy())) %>%
  select(-data) %>%
  # Name the list-column explicitly: calling unnest() without `cols` is
  # deprecated and emits a warning.
  unnest(cols = c(cor)) %>%
  ungroup() %>%
  mutate(sig = case_when(p.value < .05 ~ "sig", TRUE ~ "not_sig"))
# Correlation estimate (with its interval) against the number of clusters on
# a log10 x-axis; points are coloured by the `sig` flag and a dashed line
# marks zero.
ggplot(corr_values, aes(x = nclusters, y = estimate)) +
  geom_line() +
  geom_hline(aes(yintercept = 0), linetype = 2) +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high, color = sig)) +
  scale_x_log10() +
  labs(y = "Local/Global difference ~ concreteness corr") +
  theme_classic()