# Input locations for this analysis. Each path is built with here()
# (presumably here::here, i.e. resolved against the project root -- confirm).
#
# Directories whose files are all read with read_csv further down and tagged
# with a `type` label:
#   VPATH        -> type "observed"
#   VPATH_RANDOM -> type "random"
#   VPATH_SAME   -> type "same_doc2vec"
VPATH <- here("analyses/02_concreteness_semantics/data/ets_fasttext/vmeasures/observed/")
VPATH_RANDOM <- here("analyses/02_concreteness_semantics/data/ets_fasttext/vmeasures/random/")
VPATH_SAME <- here("analyses/02_concreteness_semantics/data/ets_fasttext/vmeasures/same_doc2vec/")
# Single CSV files of correlation estimates, read further down and tagged
# "ft_vs_doc2vec" (CORR_PATH) and "random" (CORR_PATH_RANDOM, the shuffled file).
CORR_PATH <- here("analyses/02_concreteness_semantics/data/ets_fasttext/pairwise_distances/correlations_in_ft_doc2vec_dists.csv")
CORR_PATH_RANDOM <- here("analyses/02_concreteness_semantics/data/ets_fasttext/pairwise_distances/correlations_in_ft_doc2vec_dists_shuffled.csv")
# ---- V-measure loading and bootstrapped summaries ---------------------------
# The same three steps (read -> reshape to long -> bootstrap the mean) are
# applied to each of the three conditions, so they live in small helpers.

# Read every file in `dir` with read_csv, row-bind the results, and add a
# `type` column holding `type_label`.
# Stops with an explicit message when `dir` contains no files; otherwise the
# failure would only surface later as a missing-column error in pivot_longer().
read_v_measures <- function(dir, type_label) {
  files <- list.files(dir, full.names = TRUE)
  if (length(files) == 0) {
    stop("No v-measure files found in: ", dir, call. = FALSE)
  }
  map_df(files, read_csv) %>%
    # `!!` injects the label's value so a pre-existing column can never mask it.
    mutate(type = !!type_label)
}

# Reshape the columns spanning `vmeasure` through `completeness` into long
# format (pivot_longer's default `name` / `value` columns).
# NOTE(review): assumes the files store these columns contiguously, in this
# order -- confirm against the CSV schema.
lengthen_v_measures <- function(v_measures) {
  v_measures %>%
    pivot_longer(cols = vmeasure:completeness)
}

# Bootstrap a summary of `value` for each (nclusts, name, type) cell via
# multi_boot_standard (presumably langcog::multi_boot_standard -- confirm).
# The plot that follows reads the `mean`, `ci_lower` and `ci_upper` columns
# of the result.
bootstrap_measure_means <- function(long_measures) {
  long_measures %>%
    group_by(nclusts, name, type) %>%
    multi_boot_standard(col = "value")
}

# Observed condition.
all_v_measures <- read_v_measures(VPATH, "observed")
all_measures <- lengthen_v_measures(all_v_measures)
all_measure_means <- bootstrap_measure_means(all_measures)

# Random condition.
all_v_measures_random <- read_v_measures(VPATH_RANDOM, "random")
all_measures_random <- lengthen_v_measures(all_v_measures_random)
all_measure_means_random <- bootstrap_measure_means(all_measures_random)

# Same-doc2vec condition.
all_v_measures_same <- read_v_measures(VPATH_SAME, "same_doc2vec")
all_measures_same <- lengthen_v_measures(all_v_measures_same)
all_measure_means_same <- bootstrap_measure_means(all_measures_same)
# Plot the bootstrapped mean (point) and its interval (range) of every measure
# against the number of clusters. Colour separates the measures; linetype and
# point shape separate the random / observed / same_doc2vec conditions.
bind_rows(
  all_measure_means_random,
  all_measure_means,
  all_measure_means_same
) %>%
  ggplot(
    aes(x = nclusts, y = mean, color = name, linetype = type, shape = type)
  ) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), size = .2) +
  geom_line() +
  labs(
    x = "number of clusters",
    y = "mean across 35 languages"
  ) +
  ylim(0, 1) +
  theme_classic()
# Note: the number of words (N) is roughly 2000.
# Load the two correlation tables and label each with its condition:
# the unshuffled file as "ft_vs_doc2vec", the shuffled file as "random".
all_corrs <- mutate(read_csv(CORR_PATH), type = "ft_vs_doc2vec")
corr_random <- mutate(read_csv(CORR_PATH_RANDOM), type = "random")
# Histogram of the `estimate` column for both conditions, filled by condition.
bind_rows(all_corrs, corr_random) %>%
  ggplot(aes(x = estimate, fill = type)) +
  geom_histogram() +
  labs(
    x = "Mean pearson's correlation",
    title = "Correlation between two model types for all word pairs (by language)"
  ) +
  theme_classic()