The number of clusters (`N_CLUSTERS`) is 50.

# Read the family-level tensor factorization output from the .mat file.
TENSOR_OUTPUT_FILENAME_FAM <- "../B_tensor_factorization_matlab/tensor_data_as_mat/output/4_corr_shared_ktensor_output_WWL_fam.mat"
tensor_outputF <- read.mat(TENSOR_OUTPUT_FILENAME_FAM)

# Pull the three entries of `k` out by position.
# NOTE(review): presumably the two word-mode factor matrices and the
# language-family-mode factor matrix -- confirm against the MATLAB export.
words1F <- tensor_outputF$k[[1L]]
words2F <- tensor_outputF$k[[2L]]
langsF <- tensor_outputF$k[[3L]]

# Word list that accompanies the factorization (read from feather).
WORDS_F <- read_feather("../data/2_matlab_data/word_to_is_shared_fam.feather")

# Language-family labels.
FAMS <- c(
  "Afro-Asiatic",
  "Indo-European",
  "Sino-Tibetan",
  "Atlantic-Congo",
  "Austronesian",
  "Japonic",
  "Dravidian",
  "Korean",
  "TaiKadai",
  "Turkic",
  "Austroasiatic"
)

Cluster words

Get the t-SNE projection of the word coordinates

# Get t-SNE coordinates: project the rows of `words1F` into two dimensions.
# NOTE: t-SNE is stochastic and no seed is set in this chunk, so the
# coordinates computed here can differ from run to run.
tsne_outF <- Rtsne::Rtsne(words1F)

# `tsne_outF$Y` holds the embedding; converting it to a data frame yields the
# default column names V1 and V2, which are renamed to readable ones before the
# word labels are attached.
tsne_dimsF <- tsne_outF$Y %>%
  as.data.frame() %>%
  rename(tsne_X = V1,
         tsne_Y = V2) %>%
  bind_cols(word = sort(WORDS_F$.)) # the words are in alphabetical order

Cluster

# Number of k-means clusters; it is also embedded in the cached CSV file name.
N_CLUSTERS <- 50
# NOTE(review): "familiy" looks like a typo, but the string is left untouched
# because it has to match the name of the cached file on disk -- confirm before
# renaming either one.
cluster_path <- paste0("tsne_familiy_", N_CLUSTERS, ".csv")

# Run k-means on the standardized (centered and scaled) t-SNE coordinates and
# store each word's cluster assignment as a factor column.
clustersF <- kmeans(scale(tsne_dimsF[, c("tsne_X", "tsne_Y")]), N_CLUSTERS)
tsne_dimsF$cluster <- factor(clustersF$cluster)

# The write is commented out, so the clustering computed above is not saved.
# The read below REPLACES `tsne_dimsF` with whatever is stored in the cached
# CSV, so everything downstream uses the cached coordinates and clusters.
#write_csv(tsne_dimsF %>% arrange(cluster), cluster_path)
tsne_dimsF <- read_csv(cluster_path)

# Draw every word as a text label at its t-SNE position, coloured by cluster.
# The legend is hidden and the void theme removes axes and background.
ggplot(
  tsne_dimsF,
  aes(tsne_X, tsne_Y, label = word, color = as.factor(cluster))
) +
  geom_text(size = 1.5) +
  theme_void() +
  labs(title = "t-sne projection of 710 words based on tensor factorization ranks") +
  theme(legend.position = "none")