Read in tensor output

# Location of the .mat file holding the tensor factorization output.
TENSOR_OUTPUT_FILENAME <- "../B_tensor_factorization_matlab/tensor_data_as_mat/output/7_corr_shared_ktensor_output_WWL.mat"
tensor_output <- read.mat(TENSOR_OUTPUT_FILENAME)

# The three entries of `k` are taken by position.
# NOTE(review): presumably these are the word, word and language factor
# matrices of a word x word x language ("WWL") tensor -- confirm upstream.
words1 <- tensor_output$k[[1]]
words2 <- tensor_output$k[[2]]
langs  <- tensor_output$k[[3]]

# Word table; its `w1` column is used below to label rows of `words1`.
WORDS <- read_feather("../data/4_matlab_data/ets_models/word_to_is_shared.feather")

# Language codes used to label the rows of `langs`.
# NOTE(review): assumed to be in the same order as the rows of `langs`.
LANGS <- c(
  "ARA", "BEN", "BUL", "CHI", "DUT", "ENG", "FAS", "FRE", "GER", "GRE",
  "GUJ", "HIN", "IBO", "IND", "ITA", "JPN", "KAN", "KOR", "MAL", "MAR",
  "NEP", "PAN", "POL", "POR", "RUM", "RUS", "SPA", "TAM", "TEL", "TGL",
  "THA", "TUR", "URD", "VIE", "YOR"
)

Tsne of words

Get tsne of word coordinates

# Project the rows of `words1` onto two dimensions with t-SNE and attach the
# word label for each row.

# Fail early with a clear message if the labels cannot line up with the rows.
stopifnot(nrow(words1) == length(WORDS$w1))

# t-SNE is stochastic: fix the RNG seed so the layout (and everything derived
# from it below) is the same on every run.
set.seed(1)
tsne_out <- Rtsne::Rtsne(words1)

# `Y` is the embedding matrix; as.data.frame() names its columns V1 and V2.
tsne_dims <- tsne_out$Y %>%
  as.data.frame() %>%
  rename(tsne_X = V1, tsne_Y = V2) %>%
  bind_cols(word = WORDS$w1)

Cluster for visualization

# Number of k-means clusters (reused for the word-factor clustering below).
N_CLUSTERS <- 50

# k-means starts from randomly chosen centers; seed the RNG so the cluster
# assignment, and therefore the colouring of the plot, is reproducible.
set.seed(1)
clusters <- kmeans(scale(tsne_dims[, c("tsne_X", "tsne_Y")]), N_CLUSTERS)
tsne_dims$cluster <- factor(clusters$cluster)

# Scatter of word labels in t-SNE space, coloured by cluster. The legend is
# hidden because it would have one entry per cluster.
ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = cluster)) +
  geom_text(aes(label = word), size = 1.5) +
  theme_void() +
  ggtitle("t-sne projection of 710 words based on tensor factorization ranks") +
  theme(legend.position = "none")

Get cluster differences by language pair

Get ranks for average clusters

# Cluster the words on their (column-scaled) factor loadings.
# k-means is randomly initialised, so seed it: cluster ids are arbitrary
# labels and change from run to run otherwise.
# NOTE(review): words_to_clusters.csv, read further down, stores cluster ids
# from an earlier run; those ids only match the ones computed here if that
# file was produced with the same seed -- regenerate it if in doubt.
set.seed(1)
clusters <- kmeans(scale(words1), N_CLUSTERS)

# One row per word: its factor loadings plus the cluster it was assigned to.
words_to_clusters <- words1 %>%
  as.data.frame() %>%
  bind_cols(cluster_id = clusters$cluster)

# Mean loading on every factor column within each cluster: one row per
# cluster, ordered by cluster_id, with cluster_id as the first column.
# Kept as a plain data.frame, as before.
cluster_rank_means <- words_to_clusters %>%
  group_by(cluster_id) %>%
  summarise_all(mean) %>%
  as.data.frame()

Get distance of each lang to each cluster

# Compare every language vector with every cluster-mean vector.
# Sizes are taken from the data rather than hard-coded, so the block keeps
# working if the number of languages or clusters changes.
n_lang_rows <- nrow(langs)
n_cluster_rows <- nrow(cluster_rank_means)

# The language labels are attached by position below, so they must line up.
stopifnot(n_lang_rows == length(LANGS))

# Stack languages on top of cluster means (first column of
# cluster_rank_means is cluster_id, so drop it).
all_ranks <- rbind(langs, as.matrix(cluster_rank_means[-1]))

# coop::cosine() works on columns, hence the transpose. Note that it returns
# cosine *similarity*; the `cosine_dists` name is kept for downstream code.
cosine_dists <- coop::cosine(t(all_ranks))

# Keep only the language (rows) x cluster (columns) part of the square matrix.
cosine_dists <- data.frame(
  cosine_dists[seq_len(n_lang_rows),
               n_lang_rows + seq_len(n_cluster_rows),
               drop = FALSE]
)

# Name each column after the cluster it actually represents.
cluster_names <- paste0("cluster_", cluster_rank_means$cluster_id)
cosine_dists <- setNames(cosine_dists, cluster_names)

# Long format: one row per (language, cluster) pair.
cosine_dists_tidy <- cosine_dists %>%
  mutate(langs = LANGS) %>%
  gather("cluster", "measure", -langs) %>%
  arrange(langs)

For each language pair, get the clusters for which the languages differ most in similarity (absolute difference of cosine distance)

# Language pairs to compare: the first two columns of the header-less
# pairwise-correlation file. `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
lang_pairs <- read_csv("lang_pairwise_rank_correlations.csv",
                       col_names = FALSE) %>%
  select(lang1 = X1, lang2 = X2)

#' Per-cluster difference in similarity between two languages
#'
#' @param this_lang1 Value matched against `df$langs` for the first language.
#' @param this_lang2 Value matched against `df$langs` for the second language.
#' @param df Data frame with columns `langs`, `cluster` and `measure`.
#' @return Data frame with columns `lang1`, `lang2`, `cluster` and `sim_diff`,
#'   where `sim_diff` is the first language's `measure` minus the second's
#'   for the same `cluster`.
get_diff_in_sim_to_cluster <- function(this_lang1, this_lang2, df){

  is_first <- df$langs == this_lang1
  is_second <- df$langs == this_lang2

  first_rows <- df[is_first, ]
  second_rows <- df[is_second, ]

  # Joining on cluster puts both measures side by side (suffixes .x / .y).
  paired <- left_join(first_rows, second_rows, by = "cluster")

  paired %>%
    mutate(sim_diff = measure.x - measure.y,
           lang1 = this_lang1,
           lang2 = this_lang2) %>%
    select(lang1, lang2, cluster, sim_diff)
}

# Signed and absolute similarity difference for every language pair and
# every cluster.
pairwise_sim_diffs <- map2_df(
  lang_pairs$lang1,
  lang_pairs$lang2,
  get_diff_in_sim_to_cluster,
  cosine_dists_tidy
) %>%
  mutate(abs_sim_diff = abs(sim_diff))

# For each language pair keep the single cluster with the largest absolute
# difference, then reduce the "cluster_<id>" label to just the id.
cluster_diff_arranged <- pairwise_sim_diffs %>%
  group_by(lang1, lang2) %>%
  arrange(desc(abs_sim_diff)) %>%
  slice(1) %>%
  ungroup() %>%
  separate(cluster, c("cluster", "cluster_id"), "_") %>%
  select(-cluster)

# How often each cluster is the most-different one, most frequent first.
cluster_diff_counts <- cluster_diff_arranged %>%
  count(cluster_id) %>%
  data.frame() %>%
  arrange(desc(n))

Clusters

# Disabled one-off export that wrote the word-to-cluster table to
# words_to_clusters.csv:
#bind_cols(words = WORDS, 
#          cluster_id = clusters$cluster) %>% #arrange(cluster_id) %>%
#  write_csv("words_to_clusters.csv")

# Stored word-to-cluster table.
# NOTE(review): the `cluster_gloss` column is presumably a hand-written label
# added to the exported file -- confirm.
raw_cluster_names <- read_csv("words_to_clusters.csv")

# Show the full table.
DT::datatable(raw_cluster_names)

# Lookup of glossed clusters: drop the word column, keep only rows that have
# a gloss, and store cluster_id as character so it can be joined to the ids
# parsed from the "cluster_<id>" labels.
cluster_names <- raw_cluster_names %>%
  select(-w1) %>%
  filter(!is.na(cluster_gloss)) %>%
  mutate(cluster_id = as.character(cluster_id))

Clusters that are most variable

# Attach glosses to the per-cluster counts and show them as a table.
# The join key is named explicitly so the join cannot silently change if
# either table gains another column with a shared name.
cluster_diff_counts %>%
  left_join(cluster_names, by = "cluster_id") %>%
  DT::datatable()

Most different cluster by language pair

# Attach glosses to each language pair's most-different cluster and show the
# pairs with the largest absolute difference first.
# The join key is named explicitly so the join cannot silently change if
# either table gains another column with a shared name.
cluster_diff_arranged %>%
  left_join(cluster_names, by = "cluster_id") %>%
  arrange(desc(abs_sim_diff)) %>%
  DT::datatable()