Read in tensor output
# Load the saved tensor factorization output.
TENSOR_OUTPUT_FILENAME <- "../B_tensor_factorization_matlab/tensor_data_as_mat/output/7_corr_shared_ktensor_output_WWL.mat"
tensor_output <- read.mat(TENSOR_OUTPUT_FILENAME)

# The first three entries of `k` are used below as two word matrices and one
# language matrix (presumably the factor matrices of the WWL tensor -- confirm).
words1 <- tensor_output$k[[1]]
words2 <- tensor_output$k[[2]]
langs <- tensor_output$k[[3]]

# Word metadata; its `w1` column supplies the word labels used later.
WORDS <- read_feather("../data/4_matlab_data/ets_models/word_to_is_shared.feather")

# Language codes, attached by position to the rows of `langs` further down.
LANGS <- c(
  "ARA", "BEN", "BUL", "CHI", "DUT", "ENG", "FAS", "FRE", "GER", "GRE",
  "GUJ", "HIN", "IBO", "IND", "ITA", "JPN", "KAN", "KOR", "MAL", "MAR",
  "NEP", "PAN", "POL", "POR", "RUM", "RUS", "SPA", "TAM", "TEL", "TGL",
  "THA", "TUR", "URD", "VIE", "YOR"
)
Get tsne of word coordinates
# Embed the rows of `words1` with t-SNE (Rtsne defaults), name the two
# embedding columns, and attach the word label for each row.
tsne_out <- Rtsne::Rtsne(words1)
tsne_dims <- bind_cols(
  rename(as.data.frame(tsne_out$Y), tsne_X = V1, tsne_Y = V2),
  word = WORDS$w1
)
Cluster for visualization
# Number of k-means clusters (also reused for the rank-space clustering below).
N_CLUSTERS <- 50

# Cluster the standardized t-SNE coordinates; these clusters are only used to
# colour the plot.
clusters <- kmeans(scale(tsne_dims[, c("tsne_X", "tsne_Y")]), N_CLUSTERS)
tsne_dims$cluster <- factor(clusters$cluster)

# Word-label scatter of the embedding. The word count in the title is taken
# from the data so it cannot go stale when the word list changes.
ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = cluster)) +
  geom_text(aes(label = word), size = 1.5) +
  theme_void() +
  ggtitle(paste(
    "t-sne projection of", nrow(tsne_dims),
    "words based on tensor factorization ranks"
  )) +
  theme(legend.position = "none")
Get ranks for average clusters
# Re-cluster the words, this time on the standardized columns of `words1`
# itself rather than on the t-SNE coordinates.
# NOTE(review): kmeans() starts from random centers and no seed is set here,
# so cluster ids can differ from run to run.
clusters <- kmeans(scale(words1), N_CLUSTERS)

# One row per word: its `words1` columns plus the assigned cluster id.
words_to_clusters <- bind_cols(
  as.data.frame(words1),
  cluster_id = clusters$cluster
)

# Column means within each cluster, one row per cluster, cluster_id first.
cluster_rank_means <- words_to_clusters %>%
  select(cluster_id, everything()) %>%
  split(.$cluster_id) %>%
  lapply(function(members) {
    # Column 1 is cluster_id; average everything else.
    member_means <- as.data.frame(t(colMeans(members[, -1])))
    mutate(member_means, cluster_id = members[1, 1])
  }) %>%
  bind_rows() %>%
  select(cluster_id, everything())
Get distance of each lang to each cluster
# Sizes are derived from the data instead of being hard-coded (35 / 50 / 85),
# so the block keeps working if the language list or N_CLUSTERS changes.
n_langs <- nrow(langs)
n_clusters <- nrow(cluster_rank_means)

# Language labels are attached by position below, so the counts must agree.
stopifnot(n_langs == length(LANGS))

# Stack language rows on top of cluster-mean rows (dropping the cluster_id
# column) and compute all pairwise cosines between rows. coop::cosine()
# works column-wise, hence the transpose; despite the variable name, the
# values are cosine similarities.
all_ranks <- rbind(langs, as.matrix(cluster_rank_means[-1]))
cosine_dists <- coop::cosine(t(all_ranks))

# Keep only the language-by-cluster part of the square matrix.
lang_rows <- seq_len(n_langs)
cluster_cols <- n_langs + seq_len(n_clusters)
cosine_dists <- data.frame(cosine_dists[lang_rows, cluster_cols, drop = FALSE])

# Name columns after the actual cluster ids, in the row order of
# cluster_rank_means.
cluster_names <- paste0("cluster_", cluster_rank_means$cluster_id)
cosine_dists <- setNames(cosine_dists, cluster_names)

# Long format: one row per (language, cluster) pair.
cosine_dists_tidy <- cosine_dists %>%
  mutate(langs = LANGS) %>%
  gather("cluster", "measure", -langs) %>%
  arrange(langs)
For each language pair, get the clusters for which the two languages differ most in similarity (absolute difference in cosine similarity)
# Language pairs to compare: the first two columns of a headerless CSV.
# With col_names = FALSE readr names the columns X1, X2, ...; FALSE is
# spelled out because `F` is an ordinary variable that can be reassigned.
lang_pairs <- read_csv("lang_pairwise_rank_correlations.csv",
                       col_names = FALSE) %>%
  select(1, 2) %>%
  rename(lang1 = X1, lang2 = X2)
# Difference between two languages' measures, cluster by cluster.
#
# this_lang1, this_lang2: values matched against `df$langs`.
# df: long table with columns `langs`, `cluster` and `measure`.
#
# Returns the rows of the first language joined to the second language's rows
# on `cluster`, reduced to the columns lang1, lang2, cluster and sim_diff
# (first language's measure minus the second's).
get_diff_in_sim_to_cluster <- function(this_lang1, this_lang2, df){
  first_rows <- df[df$langs == this_lang1, ]
  second_rows <- df[df$langs == this_lang2, ]

  # Both sides carry `measure`, so the join suffixes them as .x / .y.
  paired <- left_join(first_rows, second_rows, by = "cluster")
  paired <- mutate(
    paired,
    sim_diff = measure.x - measure.y,
    lang1 = this_lang1,
    lang2 = this_lang2
  )

  select(paired, lang1, lang2, cluster, sim_diff)
}
# Per-cluster similarity differences for every language pair, stacked into a
# single table, plus the magnitude of each difference.
pairwise_sim_diffs <- map2_df(
  lang_pairs$lang1,
  lang_pairs$lang2,
  get_diff_in_sim_to_cluster,
  cosine_dists_tidy
) %>%
  mutate(abs_sim_diff = abs(sim_diff))

# For each pair keep the single cluster with the largest absolute difference.
# Sorting first and then taking the first row of each group is equivalent to
# the grouped sort, because arrange() ignores grouping by default.
# "cluster_<id>" is then split so that only the id part remains (as character).
cluster_diff_arranged <- pairwise_sim_diffs %>%
  arrange(desc(abs_sim_diff)) %>%
  group_by(lang1, lang2) %>%
  slice(1) %>%
  ungroup() %>%
  separate(cluster, into = c("cluster", "cluster_id"), sep = "_") %>%
  select(-cluster)

# How often each cluster is the most-divergent one, most frequent first.
cluster_diff_counts <- cluster_diff_arranged %>%
  count(cluster_id) %>%
  data.frame() %>%
  arrange(desc(n))
#bind_cols(words = WORDS,
# cluster_id = clusters$cluster) %>% #arrange(cluster_id) %>%
# write_csv("words_to_clusters.csv")
# Word-to-cluster table with cluster glosses, read from disk and shown in full.
# NOTE(review): the commented-out code above suggests this file was written
# from an earlier kmeans run; confirm its cluster ids still match the ids
# produced by the current run.
raw_cluster_names <- read_csv("words_to_clusters.csv")
DT::datatable(raw_cluster_names)

# Keep only rows that carry a gloss and drop the word column. cluster_id is
# cast to character to match the character ids produced by separate() above.
cluster_names <- raw_cluster_names %>%
  select(-w1) %>%
  filter(!is.na(cluster_gloss)) %>%
  mutate(cluster_id = as.character(cluster_id))

# Counts per cluster with glosses attached. No `by` is given, so the join
# uses every column the two tables share.
DT::datatable(left_join(cluster_diff_counts, cluster_names))

# Most-divergent cluster per language pair with glosses, largest gap first.
cluster_diff_arranged %>%
  left_join(cluster_names) %>%
  arrange(desc(abs_sim_diff)) %>%
  DT::datatable()