Nclusters is 50.

TENSOR_OUTPUT_FILENAME_FAM <- "../B_tensor_factorization_matlab/tensor_data_as_mat/output/4_corr_shared_ktensor_output_WWL_fam.mat"
tensor_outputF <- read.mat(TENSOR_OUTPUT_FILENAME_FAM)

words1F <- tensor_outputF$k[[1]]
words2F <- tensor_outputF$k[[2]]
langsF <- tensor_outputF$k[[3]]

WORDS_F <- read_feather("../data/2_matlab_data/word_to_is_shared_fam.feather")

FAMS <- c("Afro-Asiatic","Indo-European","Sino-Tibetan","Atlantic-Congo","Austronesian", "Japonic","Dravidian","Korean","TaiKadai", "Turkic","Austroasiatic")

Cluster words

Get tsne of word coordinates

# get tsne coordinates
tsne_outF = Rtsne::Rtsne(words1F)
tsne_dimsF <- tsne_outF$Y %>%
  as.data.frame() %>%
  rename(tsne_X = V1,
         tsne_Y = V2)  %>%
  bind_cols(word = sort(WORDS_F$.)) # the words are in alphabetical order

Cluster

N_CLUSTERS <- 50
cluster_path <- paste0("tsne_familiy_", N_CLUSTERS, ".csv")

clustersF <- kmeans(scale(tsne_dimsF[,c("tsne_X", "tsne_Y")]), N_CLUSTERS)
tsne_dimsF$cluster = factor(clustersF$cluster)
#write_csv(tsne_dimsF %>% arrange(cluster), cluster_path)
tsne_dimsF <- read_csv(cluster_path)

ggplot(tsne_dimsF,
         aes(x = tsne_X, y = tsne_Y, color = as.factor(cluster))) +
  geom_text(aes(label = word), size = 1.5) +
  theme_void() +
  ggtitle("t-sne projection of 710 words based on tensor factorization ranks") +
  theme(legend.position = "none")

Get relationship between clusters and family

Language families

LANG_KEY <- "../../../data/processed/lang_names/ets_to_google_langcodes_complete.csv"

read_csv(LANG_KEY) %>%
  select(lang_name, aff) %>%
  arrange(aff) %>%
  kable()
lang_name aff
Arabic Afro-Asiatic
Igbo Atlantic-Congo
Yoruba Atlantic-Congo
Vietnamese Austroasiatic
Indonesian Austronesian
Tagalog Austronesian
Kannada Dravidian
Malayalam Dravidian
Tamil Dravidian
Telugu Dravidian
Bengali Indo-European
Bulgarian Indo-European
Dutch; Flemish Indo-European
English Indo-European
French Indo-European
German Indo-European
Greek, Modern (1453-) Indo-European
Gujarati Indo-European
Hindi Indo-European
Italian Indo-European
Marathi Indo-European
Nepali Indo-European
Panjabi; Punjabi Indo-European
Polish Indo-European
Portuguese Indo-European
Romanian; Moldavian; Moldovan Indo-European
Russian Indo-European
Spanish; Castilian Indo-European
Urdu Indo-European
Farsi; Persian Indo-European
Japanese Japonic
Korean Korean
Chinese Sino-Tibetan
Thai TaiKadai
Turkish Turkic

Get average ranks for each word cluster

words_to_clusters <- words1F %>%
  as.data.frame() %>%
  mutate(cluster_id = tsne_dimsF$cluster) %>%
  select(cluster_id, everything()) 

cluster_rank_means <- words_to_clusters %>%
  split(.$cluster_id) %>%
  map(function(x) {
    colMeans(x[,-1]) %>% 
        t() %>% 
        as.data.frame() %>% 
        mutate(cluster_id = x[1,1])}) %>%
  bind_rows() %>%
  select(cluster_id, everything())

Get distance of each family to each cluster

all_ranks <- rbind(langsF,  as.matrix(cluster_rank_means[,-1]))

rank_corrs <- cor(t(all_ranks))
rank_corrs_df <- data.frame(rank_corrs[1:11, 12:(11 + N_CLUSTERS)])
cluster_names <- map(list(1:N_CLUSTERS), ~paste0("cluster_",.)) %>% 
  unlist()
rank_corrs_df <- setNames(rank_corrs_df, cluster_names)

rank_ps <-corr.test(as.data.frame(t(all_ranks)), adjust = "none")$p
rank_ps_df <-data.frame(rank_ps[1:11, 12:(11 + N_CLUSTERS)])
cluster_names <- map(list(1:N_CLUSTERS), ~paste0("cluster_",.)) %>% 
  unlist()
rank_ps_df <- setNames(rank_ps_df, cluster_names)

rank_ps_tidy <- rank_ps_df %>%
  mutate(langs = FAMS) %>%
  gather("cluster", "p", -langs) %>%
  arrange(langs)

rank_corrs_tidy <- rank_corrs_df %>%
  mutate(langs = FAMS) %>%
  gather("cluster", "r", -langs) %>%
  arrange(langs) %>%
  left_join(rank_ps_tidy)

Mean correlations by family across clusters

rank_corrs_tidy %>%
  ggplot(aes(y = r, x = langs, fill = langs)) +
  geom_hline(aes(yintercept = 0), linetype = 2)+
  geom_boxplot() +
  theme_classic() +
  theme(legend.position = "none")

Specific cluster x family

Get families that are most negatively/positibely correlated with a cluster

extreme_cluster_fams <- rank_corrs_tidy %>%
  arrange(-abs(r)) %>%
  slice(1:50) %>%
  separate(cluster, c("x", "cluster"), "_") %>%
  select(-x)

top_50_cluster_words <- tsne_dimsF %>%
  filter(cluster %in% unique(extreme_cluster_fams$cluster)) %>%
  select(word, cluster) %>%
  arrange(cluster)

#write_csv(top_50_cluster_words, "cluster_words_top50_100clusts.csv")

Cluster labels:

clust_labels_words <- tsne_dimsF %>%
  mutate(cluster = as.character(cluster)) 

DT::datatable(clust_labels_words)
clust_labels <- clust_labels_words %>%
  filter(!is.na(label)) %>%
  select(label, cluster)

Most negative clusters:

extreme_cluster_fams %>%
  left_join(clust_labels)  %>% 
  kable()
langs cluster r p label
Indo-European 1 -0.1584482 0.0009510 evaluative_adjs
Indo-European 41 -0.1544323 0.0012826 large_conflict
Dravidian 41 -0.1457721 0.0023873 large_conflict
Indo-European 27 -0.1457384 0.0023929 leisure_activities
Indo-European 16 -0.1370298 0.0043264 verbs
Indo-European 6 -0.1342653 0.0051862 misc2
Turkic 48 -0.1341743 0.0052170 quality_advs
Dravidian 50 -0.1334147 0.0054802 argue
Dravidian 33 -0.1305999 0.0065627 commerce2
Indo-European 23 -0.1287410 0.0073790 positive_adjs2
Sino-Tibetan 6 -0.1284630 0.0075086 misc2
Sino-Tibetan 2 -0.1254523 0.0090478 cause_effect
Afro-Asiatic 27 -0.1218739 0.0112377 leisure_activities
Sino-Tibetan 38 -0.1211140 0.0117590 enivornment
Turkic 17 -0.1208550 0.0119416 bodily_functions
Dravidian 23 -0.1203783 0.0122841 positive_adjs2
TaiKadai 31 -0.1200513 0.0125240 time_money
Indo-European 43 -0.1197152 0.0127749 influence_change
Austroasiatic 16 -0.1195271 0.0129172 verbs
Korean 27 -0.1195089 0.0129310 leisure_activities
Indo-European 36 -0.1194114 0.0130054 technology_media
Indo-European 37 -0.1181880 0.0139715 adverbs
Afro-Asiatic 13 -0.1153281 0.0164798 quantity
Atlantic-Congo 10 -0.1132770 0.0185136 pronouns_students_family
TaiKadai 10 -0.1130774 0.0187228 pronouns_students_family
Indo-European 11 -0.1123374 0.0195163 community
Atlantic-Congo 29 -0.1110445 0.0209728 knowledge
Austronesian 2 -0.1108386 0.0212133 cause_effect
Indo-European 22 -0.1101063 0.0220881 scientific_method
Dravidian 29 -0.1082349 0.0244671 knowledge
Dravidian 10 -0.1064369 0.0269579 pronouns_students_family
Indo-European 17 -0.1063720 0.0270517 bodily_functions
Dravidian 6 -0.1061588 0.0273620 misc2
Turkic 47 -0.1058631 0.0277974 spending_time
Indo-European 40 -0.1057993 0.0278921 passions
Austronesian 32 0.1047876 0.0294316 commerce
Dravidian 37 -0.1042318 0.0303079 adverbs
Indo-European 9 -0.1034267 0.0316168 support_teach
Sino-Tibetan 16 -0.1020797 0.0339149 verbs
Indo-European 35 -0.1017913 0.0344248 emotion
Dravidian 22 -0.1011145 0.0356476 scientific_method
Dravidian 14 -0.1005402 0.0367141 neuter_pronouns
Sino-Tibetan 23 -0.1005400 0.0367144 positive_adjs2
Dravidian 38 -0.1003552 0.0370632 enivornment
Indo-European 14 -0.1000620 0.0376224 neuter_pronouns
Dravidian 27 -0.0996543 0.0384119 leisure_activities
Indo-European 25 -0.0996043 0.0385098 social_political_school
Sino-Tibetan 33 -0.0994101 0.0388915 commerce2
Sino-Tibetan 3 -0.0987432 0.0402272 evidence
Austroasiatic 15 -0.0982262 0.0412891 time

Most variable clusters:

rank_corrs_tidy %>%
  group_by(cluster) %>%
  summarize(sd_vals = sd(r)) %>%
  arrange(-abs(sd_vals)) %>%
  slice(1:25) %>% 
  separate(cluster, c("x", "cluster"), "_") %>%
  select(-x) %>%
  left_join(clust_labels) %>%
  kable()
cluster sd_vals label
33 0.0642818 commerce2
41 0.0629742 large_conflict
29 0.0571347 knowledge
38 0.0549977 enivornment
9 0.0531196 support_teach
48 0.0528757 quality_advs
6 0.0523577 misc2
17 0.0521986 bodily_functions
47 0.0520229 spending_time
1 0.0512362 evaluative_adjs
50 0.0493476 argue
13 0.0490917 quantity
14 0.0487416 neuter_pronouns
40 0.0486679 passions
15 0.0485182 time
32 0.0481226 commerce
10 0.0476278 pronouns_students_family
27 0.0468331 leisure_activities
2 0.0464333 cause_effect
16 0.0453629 verbs
23 0.0453624 positive_adjs2
19 0.0453190 normative_life
31 0.0445085 time_money
18 0.0445004 episitimic_verbs
4 0.0442333 such

Least variable clusters:

rank_corrs_tidy %>%
  group_by(cluster) %>%
  summarize(sd_vals = sd(r)) %>%
  arrange(abs(sd_vals)) %>%
  slice(1:25) %>% 
  separate(cluster, c("x", "cluster"), "_") %>%
  select(-x) %>%
  left_join(clust_labels) %>%
  kable()
cluster sd_vals label
24 0.0264137 forward_progress
46 0.0285585 connectives
5 0.0302817 travel
44 0.0309131 university
22 0.0324151 scientific_method
7 0.0341511 person_interaction_verbs
20 0.0342804 communication
36 0.0343011 technology_media
39 0.0343070 international
8 0.0355229 advs
45 0.0369672 places
35 0.0371251 emotion
11 0.0381141 community
37 0.0386323 adverbs
34 0.0389631 alone_vs_groups
3 0.0391821 evidence
21 0.0391867 misc3
12 0.0393987 future_aspirations
26 0.0400627 areas_of_study
49 0.0402915 prepositions
43 0.0415925 influence_change
28 0.0420455 modals
42 0.0423599 academic_specialization
30 0.0426380 misc4
25 0.0435252 social_political_school

Concreteness

By cluster

brysbaert <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/conceptviz/data/supplementary_data/brysbaert_corpus.csv") %>%
  select(Word, Conc.M, Conc.SD, SUBTLEX)

word_conc <- clust_labels_words %>%
  select(word, cluster) %>%
  left_join(brysbaert, by = c('word' = "Word"))

cluster_sd <- rank_corrs_tidy %>%
  group_by(cluster) %>%
  summarize(sd_vals = sd(r)) %>%
  separate(cluster, c("x", "cluster"), "_") %>%
  select(-x) 

m <- word_conc %>%
  group_by(cluster) %>%
  summarize(mean_conc = mean(Conc.M, na.rm = T)) %>%
  left_join(cluster_sd)

cor.test(m$mean_conc, m$sd_vals)
## 
##  Pearson's product-moment correlation
## 
## data:  m$mean_conc and m$sd_vals
## t = -0.48212, df = 48, p-value = 0.6319
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3411748  0.2130449
## sample estimates:
##         cor 
## -0.06941948

No corr.

By word

all_ranks_word <- rbind(langsF,  words1F)

rank_corrs_word <- cor(t(all_ranks_word))
rank_corrs_df_word <- data.frame(rank_corrs_word[1:11, 12:(11 +  dim(words1F)[1])])
rank_corrs_df_word <- setNames(rank_corrs_df_word, unlist(WORDS_F, use.names = F))

word_sd_conc <- rank_corrs_df_word %>%
  mutate(langs = FAMS) %>%
  select(langs, everything()) %>%
  gather("word", "corr", -langs) %>%
  group_by(word) %>%
  summarize(sd = sd(corr)) %>%
  arrange(sd) %>%
  left_join(word_conc)

cor.test(word_sd_conc$Conc.M, word_sd_conc$sd)
## 
##  Pearson's product-moment correlation
## 
## data:  word_sd_conc$Conc.M and word_sd_conc$sd
## t = 2.8732, df = 574, p-value = 0.004214
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.03774095 0.19883205
## sample estimates:
##     cor 
## 0.11907

Corr in opposite predicted direction: more concrete, more variable.