Nclusters is 50.

TENSOR_OUTPUT_FILENAME_FAM <- "../B_tensor_factorization_matlab/tensor_data_as_mat/output/4_corr_shared_ktensor_output_WWL_fam.mat"
tensor_outputF <- read.mat(TENSOR_OUTPUT_FILENAME_FAM)

words1F <- tensor_outputF$k[[1]]
words2F <- tensor_outputF$k[[2]]
langsF <- tensor_outputF$k[[3]]

WORDS_F <- read_feather("../data/2_matlab_data/word_to_is_shared_fam.feather")

FAMS <- c("Afro-Asiatic","Indo-European","Sino-Tibetan","Atlantic-Congo","Austronesian", "Japonic","Dravidian","Korean","TaiKadai", "Turkic","Austroasiatic")

Cluster words

Get tsne of word coordinates

# get tsne coordinates
tsne_outF = Rtsne::Rtsne(words1F)
tsne_dimsF <- tsne_outF$Y %>%
  as.data.frame() %>%
  rename(tsne_X = V1,
         tsne_Y = V2)  %>%
  bind_cols(word = sort(WORDS_F$.)) # the words are in alphabetical order

Cluster

N_CLUSTERS <- 50
cluster_path <- paste0("tsne_familiy_", N_CLUSTERS, ".csv")

clustersF <- kmeans(scale(tsne_dimsF[,c("tsne_X", "tsne_Y")]), N_CLUSTERS)
tsne_dimsF$cluster = factor(clustersF$cluster)
#write_csv(tsne_dimsF %>% arrange(cluster), cluster_path)
tsne_dimsF <- read_csv(cluster_path)

ggplot(tsne_dimsF,
         aes(x = tsne_X, y = tsne_Y, color = as.factor(cluster))) +
  geom_text(aes(label = word), size = 1.5) +
  theme_void() +
  ggtitle("t-sne projection of 710 words based on tensor factorization ranks") +
  theme(legend.position = "none")

Get relationship between clusters and family

Language families

LANG_KEY <- "../../../data/processed/lang_names/ets_to_google_langcodes_complete.csv"

lang_key <- read_csv(LANG_KEY) 

lang_key %>%
  select(lang_name, aff) %>%
  arrange(aff) %>%
  kable()
lang_name aff
Arabic Afro-Asiatic
Igbo Atlantic-Congo
Yoruba Atlantic-Congo
Vietnamese Austroasiatic
Indonesian Austronesian
Tagalog Austronesian
Kannada Dravidian
Malayalam Dravidian
Tamil Dravidian
Telugu Dravidian
Bengali Indo-European
Bulgarian Indo-European
Dutch; Flemish Indo-European
English Indo-European
French Indo-European
German Indo-European
Greek, Modern (1453-) Indo-European
Gujarati Indo-European
Hindi Indo-European
Italian Indo-European
Marathi Indo-European
Nepali Indo-European
Panjabi; Punjabi Indo-European
Polish Indo-European
Portuguese Indo-European
Romanian; Moldavian; Moldovan Indo-European
Russian Indo-European
Spanish; Castilian Indo-European
Urdu Indo-European
Farsi; Persian Indo-European
Japanese Japonic
Korean Korean
Chinese Sino-Tibetan
Thai TaiKadai
Turkish Turkic
lang_fam_counts <- count(lang_key, aff)

Get average ranks for each word cluster

words_to_clusters <- words1F %>%
  as.data.frame() %>%
  mutate(cluster_id = tsne_dimsF$cluster) %>%
  select(cluster_id, everything()) 

cluster_rank_means <- words_to_clusters %>%
  split(.$cluster_id) %>%
  map(function(x) {
    colMeans(x[,-1]) %>% 
        t() %>% 
        as.data.frame() %>% 
        mutate(cluster_id = x[1,1])}) %>%
  bind_rows() %>%
  select(cluster_id, everything())

Get distance of each family to each cluster

all_ranks <- rbind(langsF,  as.matrix(cluster_rank_means[,-1]))

rank_corrs <- cor(t(all_ranks))
rank_corrs_df <- data.frame(rank_corrs[1:11, 12:(11 + N_CLUSTERS)])
cluster_names <- map(list(1:N_CLUSTERS), ~paste0("cluster_",.)) %>% 
  unlist()
rank_corrs_df <- setNames(rank_corrs_df, cluster_names)

rank_ps <-corr.test(as.data.frame(t(all_ranks)), adjust = "none")$p
rank_ps_df <-data.frame(rank_ps[1:11, 12:(11 + N_CLUSTERS)])
cluster_names <- map(list(1:N_CLUSTERS), ~paste0("cluster_",.)) %>% 
  unlist()
rank_ps_df <- setNames(rank_ps_df, cluster_names)

rank_ps_tidy <- rank_ps_df %>%
  mutate(langs = FAMS) %>%
  gather("cluster", "p", -langs) %>%
  arrange(langs)

rank_corrs_tidy <- rank_corrs_df %>%
  mutate(langs = FAMS) %>%
  gather("cluster", "r", -langs) %>%
  arrange(langs) %>%
  left_join(rank_ps_tidy) %>%
  left_join(lang_fam_counts, by = c("langs" = "aff")) %>%
  mutate(r_norm = r/n,
         r_norm_log = r/log(n + 1))

Mean correlations by family across clusters

rank_corrs_tidy %>%
  select(langs, r, r_norm, r_norm_log) %>%
  gather("rmeasure", "rvalue", -langs) %>%
  group_by(langs, rmeasure) %>%
  langcog::multi_boot_standard(col  = "rvalue") %>%
  left_join(lang_fam_counts, by = c("langs" = "aff")) %>%
  ggplot(aes(y = mean, x = langs, fill = langs, color = langs)) +
  geom_hline(aes(yintercept = 0), linetype = 2)+
  ylab("Language Families") +
  xlab("mean r") +
  facet_wrap(~ rmeasure)   +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  theme_classic() +
  theme(legend.position = "none") +
  coord_flip()

Specific cluster x family

Get families that are most negatively/positively correlated with a cluster

extreme_cluster_fams <- rank_corrs_tidy %>%
  arrange(r_norm_log) %>%
  slice(1:50) %>%
  separate(cluster, c("x", "cluster"), "_") %>%
  select(-x)

top_50_cluster_words <- tsne_dimsF %>%
  filter(cluster %in% unique(extreme_cluster_fams$cluster)) %>%
  select(word, cluster) %>%
  arrange(cluster)

#write_csv(top_50_cluster_words, "cluster_words_top50_100clusts.csv")

Cluster labels:

clust_labels_words <- tsne_dimsF %>%
  mutate(cluster = as.character(cluster)) 

DT::datatable(clust_labels_words)
clust_labels <- clust_labels_words %>%
  filter(!is.na(label)) %>%
  select(label, cluster)

Most negative clusters:

extreme_cluster_fams %>%
  left_join(clust_labels)  %>% 
  kable()
langs cluster r p n r_norm r_norm_log label
Turkic 48 -0.1341743 0.0052170 1 -0.1341743 -0.1935726 quality_advs
Sino-Tibetan 6 -0.1284630 0.0075086 1 -0.1284630 -0.1853329 misc2
Sino-Tibetan 2 -0.1254523 0.0090478 1 -0.1254523 -0.1809894 cause_effect
Afro-Asiatic 27 -0.1218739 0.0112377 1 -0.1218739 -0.1758269 leisure_activities
Sino-Tibetan 38 -0.1211140 0.0117590 1 -0.1211140 -0.1747306 enivornment
Turkic 17 -0.1208550 0.0119416 1 -0.1208550 -0.1743569 bodily_functions
TaiKadai 31 -0.1200513 0.0125240 1 -0.1200513 -0.1731974 time_money
Austroasiatic 16 -0.1195271 0.0129172 1 -0.1195271 -0.1724412 verbs
Korean 27 -0.1195089 0.0129310 1 -0.1195089 -0.1724150 leisure_activities
Afro-Asiatic 13 -0.1153281 0.0164798 1 -0.1153281 -0.1663833 quantity
TaiKadai 10 -0.1130774 0.0187228 1 -0.1130774 -0.1631362 pronouns_students_family
Turkic 47 -0.1058631 0.0277974 1 -0.1058631 -0.1527282 spending_time
Sino-Tibetan 16 -0.1020797 0.0339149 1 -0.1020797 -0.1472698 verbs
Sino-Tibetan 23 -0.1005400 0.0367144 1 -0.1005400 -0.1450486 positive_adjs2
Sino-Tibetan 33 -0.0994101 0.0388915 1 -0.0994101 -0.1434185 commerce2
Sino-Tibetan 3 -0.0987432 0.0402272 1 -0.0987432 -0.1424563 evidence
Austroasiatic 15 -0.0982262 0.0412891 1 -0.0982262 -0.1417105 time
Afro-Asiatic 4 -0.0981376 0.0414734 1 -0.0981376 -0.1415827 such
Japonic 47 -0.0932949 0.0526590 1 -0.0932949 -0.1345960 spending_time
Sino-Tibetan 44 -0.0929744 0.0534804 1 -0.0929744 -0.1341337 university
Sino-Tibetan 35 -0.0928611 0.0537734 1 -0.0928611 -0.1339702 emotion
Afro-Asiatic 22 -0.0889323 0.0647867 1 -0.0889323 -0.1283021 scientific_method
Sino-Tibetan 9 -0.0885772 0.0658673 1 -0.0885772 -0.1277899 support_teach
Turkic 36 -0.0880849 0.0673902 1 -0.0880849 -0.1270797 technology_media
TaiKadai 47 -0.0843182 0.0800215 1 -0.0843182 -0.1216454 spending_time
Sino-Tibetan 22 -0.0833001 0.0837464 1 -0.0833001 -0.1201766 scientific_method
Sino-Tibetan 17 -0.0828689 0.0853658 1 -0.0828689 -0.1195545 bodily_functions
TaiKadai 34 -0.0814182 0.0909995 1 -0.0814182 -0.1174616 alone_vs_groups
Austroasiatic 9 -0.0770070 0.1099771 1 -0.0770070 -0.1110976 support_teach
Austroasiatic 28 -0.0767868 0.1110004 1 -0.0767868 -0.1107799 modals
Sino-Tibetan 31 -0.0760635 0.1144146 1 -0.0760635 -0.1097365 time_money
TaiKadai 35 -0.0746981 0.1210842 1 -0.0746981 -0.1077666 emotion
Japonic 48 -0.0746486 0.1213315 1 -0.0746486 -0.1076952 quality_advs
TaiKadai 37 -0.0719880 0.1352158 1 -0.0719880 -0.1038568 adverbs
TaiKadai 16 -0.0717716 0.1363971 1 -0.0717716 -0.1035445 verbs
Atlantic-Congo 10 -0.1132770 0.0185136 2 -0.0566385 -0.1031092 pronouns_students_family
Sino-Tibetan 41 -0.0710265 0.1405242 1 -0.0710265 -0.1024695 large_conflict
Turkic 38 -0.0705579 0.1431678 1 -0.0705579 -0.1017936 enivornment
Atlantic-Congo 29 -0.1110445 0.0209728 2 -0.0555223 -0.1010771 knowledge
Japonic 27 -0.0699868 0.1464419 1 -0.0699868 -0.1009696 leisure_activities
Austronesian 2 -0.1108386 0.0212133 2 -0.0554193 -0.1008897 cause_effect
Afro-Asiatic 38 -0.0696857 0.1481906 1 -0.0696857 -0.1005352 enivornment
Turkic 10 -0.0687947 0.1534588 1 -0.0687947 -0.0992498 pronouns_students_family
Sino-Tibetan 21 -0.0686965 0.1540479 1 -0.0686965 -0.0991082 misc3
Austroasiatic 47 -0.0682052 0.1570219 1 -0.0682052 -0.0983993 spending_time
Turkic 13 -0.0681511 0.1573521 1 -0.0681511 -0.0983212 quantity
Turkic 26 -0.0672240 0.1630898 1 -0.0672240 -0.0969838 areas_of_study
Turkic 31 -0.0671267 0.1637013 1 -0.0671267 -0.0968433 time_money
Sino-Tibetan 46 -0.0665176 0.1675666 1 -0.0665176 -0.0959646 connectives
Sino-Tibetan 34 -0.0655714 0.1737062 1 -0.0655714 -0.0945995 alone_vs_groups

Most variable clusters:

rank_corrs_tidy %>%
  group_by(cluster) %>%
  summarize(sd_vals = sd(r)) %>%
  arrange(-abs(sd_vals)) %>%
  slice(1:25) %>% 
  separate(cluster, c("x", "cluster"), "_") %>%
  select(-x) %>%
  left_join(clust_labels) %>%
  kable()
cluster sd_vals label
33 0.0642818 commerce2
41 0.0629742 large_conflict
29 0.0571347 knowledge
38 0.0549977 enivornment
9 0.0531196 support_teach
48 0.0528757 quality_advs
6 0.0523577 misc2
17 0.0521986 bodily_functions
47 0.0520229 spending_time
1 0.0512362 evaluative_adjs
50 0.0493476 argue
13 0.0490917 quantity
14 0.0487416 neuter_pronouns
40 0.0486679 passions
15 0.0485182 time
32 0.0481226 commerce
10 0.0476278 pronouns_students_family
27 0.0468331 leisure_activities
2 0.0464333 cause_effect
16 0.0453629 verbs
23 0.0453624 positive_adjs2
19 0.0453190 normative_life
31 0.0445085 time_money
18 0.0445004 episitimic_verbs
4 0.0442333 such

Least variable clusters:

rank_corrs_tidy %>%
  group_by(cluster) %>%
  summarize(sd_vals = sd(r)) %>%
  arrange(abs(sd_vals)) %>%
  slice(1:25) %>% 
  separate(cluster, c("x", "cluster"), "_") %>%
  select(-x) %>%
  left_join(clust_labels) %>%
  kable()
cluster sd_vals label
24 0.0264137 forward_progress
46 0.0285585 connectives
5 0.0302817 travel
44 0.0309131 university
22 0.0324151 scientific_method
7 0.0341511 person_interaction_verbs
20 0.0342804 communication
36 0.0343011 technology_media
39 0.0343070 international
8 0.0355229 advs
45 0.0369672 places
35 0.0371251 emotion
11 0.0381141 community
37 0.0386323 adverbs
34 0.0389631 alone_vs_groups
3 0.0391821 evidence
21 0.0391867 misc3
12 0.0393987 future_aspirations
26 0.0400627 areas_of_study
49 0.0402915 prepositions
43 0.0415925 influence_change
28 0.0420455 modals
42 0.0423599 academic_specialization
30 0.0426380 misc4
25 0.0435252 social_political_school

Concreteness

By cluster

brysbaert <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/conceptviz/data/supplementary_data/brysbaert_corpus.csv") %>%
  select(Word, Conc.M, Conc.SD, SUBTLEX)

word_conc <- clust_labels_words %>%
  select(word, cluster) %>%
  left_join(brysbaert, by = c('word' = "Word"))

cluster_sd <- rank_corrs_tidy %>%
  group_by(cluster) %>%
  summarize(sd_vals = sd(r)) %>%
  separate(cluster, c("x", "cluster"), "_") %>%
  select(-x) 

m <- word_conc %>%
  group_by(cluster) %>%
  summarize(mean_conc = mean(Conc.M, na.rm = T)) %>%
  left_join(cluster_sd)

cor.test(m$mean_conc, m$sd_vals)
## 
##  Pearson's product-moment correlation
## 
## data:  m$mean_conc and m$sd_vals
## t = -0.48212, df = 48, p-value = 0.6319
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3411748  0.2130449
## sample estimates:
##         cor 
## -0.06941948

No corr.

By word

all_ranks_word <- rbind(langsF,  words1F)

rank_corrs_word <- cor(t(all_ranks_word))
rank_corrs_df_word <- data.frame(rank_corrs_word[1:11, 12:(11 +  dim(words1F)[1])])
rank_corrs_df_word <- setNames(rank_corrs_df_word, unlist(WORDS_F, use.names = F))

word_sd_conc <- rank_corrs_df_word %>%
  mutate(langs = FAMS) %>%
  select(langs, everything()) %>%
  gather("word", "corr", -langs) %>%
  group_by(word) %>%
  summarize(sd = sd(corr)) %>%
  arrange(sd) %>%
  left_join(word_conc)

cor.test(word_sd_conc$Conc.M, word_sd_conc$sd)
## 
##  Pearson's product-moment correlation
## 
## data:  word_sd_conc$Conc.M and word_sd_conc$sd
## t = 2.8732, df = 574, p-value = 0.004214
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.03774095 0.19883205
## sample estimates:
##     cor 
## 0.11907

Corr in opposite predicted direction: more concrete, more variable.