Nclusters is 50.
TENSOR_OUTPUT_FILENAME_FAM <- "../B_tensor_factorization_matlab/tensor_data_as_mat/output/4_corr_shared_ktensor_output_WWL_fam.mat"
tensor_outputF <- read.mat(TENSOR_OUTPUT_FILENAME_FAM)
words1F <- tensor_outputF$k[[1]]
words2F <- tensor_outputF$k[[2]]
langsF <- tensor_outputF$k[[3]]
WORDS_F <- read_feather("../data/2_matlab_data/word_to_is_shared_fam.feather")
FAMS <- c("Afro-Asiatic","Indo-European","Sino-Tibetan","Atlantic-Congo","Austronesian", "Japonic","Dravidian","Korean","TaiKadai", "Turkic","Austroasiatic")
Get tsne of word coordinates
# get tsne coordinates
tsne_outF = Rtsne::Rtsne(words1F)
tsne_dimsF <- tsne_outF$Y %>%
as.data.frame() %>%
rename(tsne_X = V1,
tsne_Y = V2) %>%
bind_cols(word = sort(WORDS_F$.)) # the words are in alphabetical order
Cluster
N_CLUSTERS <- 50
cluster_path <- paste0("tsne_familiy_", N_CLUSTERS, ".csv")
clustersF <- kmeans(scale(tsne_dimsF[,c("tsne_X", "tsne_Y")]), N_CLUSTERS)
tsne_dimsF$cluster = factor(clustersF$cluster)
#write_csv(tsne_dimsF %>% arrange(cluster), cluster_path)
tsne_dimsF <- read_csv(cluster_path)
ggplot(tsne_dimsF,
aes(x = tsne_X, y = tsne_Y, color = as.factor(cluster))) +
geom_text(aes(label = word), size = 1.5) +
theme_void() +
ggtitle("t-sne projection of 710 words based on tensor factorization ranks") +
theme(legend.position = "none")
Language families
LANG_KEY <- "../../../data/processed/lang_names/ets_to_google_langcodes_complete.csv"
lang_key <- read_csv(LANG_KEY)
lang_key %>%
select(lang_name, aff) %>%
arrange(aff) %>%
kable()
lang_name | aff |
---|---|
Arabic | Afro-Asiatic |
Igbo | Atlantic-Congo |
Yoruba | Atlantic-Congo |
Vietnamese | Austroasiatic |
Indonesian | Austronesian |
Tagalog | Austronesian |
Kannada | Dravidian |
Malayalam | Dravidian |
Tamil | Dravidian |
Telugu | Dravidian |
Bengali | Indo-European |
Bulgarian | Indo-European |
Dutch; Flemish | Indo-European |
English | Indo-European |
French | Indo-European |
German | Indo-European |
Greek, Modern (1453-) | Indo-European |
Gujarati | Indo-European |
Hindi | Indo-European |
Italian | Indo-European |
Marathi | Indo-European |
Nepali | Indo-European |
Panjabi; Punjabi | Indo-European |
Polish | Indo-European |
Portuguese | Indo-European |
Romanian; Moldavian; Moldovan | Indo-European |
Russian | Indo-European |
Spanish; Castilian | Indo-European |
Urdu | Indo-European |
Farsi; Persian | Indo-European |
Japanese | Japonic |
Korean | Korean |
Chinese | Sino-Tibetan |
Thai | TaiKadai |
Turkish | Turkic |
lang_fam_counts <- count(lang_key, aff)
Get average ranks for each word cluster
words_to_clusters <- words1F %>%
as.data.frame() %>%
mutate(cluster_id = tsne_dimsF$cluster) %>%
select(cluster_id, everything())
cluster_rank_means <- words_to_clusters %>%
split(.$cluster_id) %>%
map(function(x) {
colMeans(x[,-1]) %>%
t() %>%
as.data.frame() %>%
mutate(cluster_id = x[1,1])}) %>%
bind_rows() %>%
select(cluster_id, everything())
Get distance of each family to each cluster
all_ranks <- rbind(langsF, as.matrix(cluster_rank_means[,-1]))
rank_corrs <- cor(t(all_ranks))
rank_corrs_df <- data.frame(rank_corrs[1:11, 12:(11 + N_CLUSTERS)])
cluster_names <- map(list(1:N_CLUSTERS), ~paste0("cluster_",.)) %>%
unlist()
rank_corrs_df <- setNames(rank_corrs_df, cluster_names)
rank_ps <-corr.test(as.data.frame(t(all_ranks)), adjust = "none")$p
rank_ps_df <-data.frame(rank_ps[1:11, 12:(11 + N_CLUSTERS)])
cluster_names <- map(list(1:N_CLUSTERS), ~paste0("cluster_",.)) %>%
unlist()
rank_ps_df <- setNames(rank_ps_df, cluster_names)
rank_ps_tidy <- rank_ps_df %>%
mutate(langs = FAMS) %>%
gather("cluster", "p", -langs) %>%
arrange(langs)
rank_corrs_tidy <- rank_corrs_df %>%
mutate(langs = FAMS) %>%
gather("cluster", "r", -langs) %>%
arrange(langs) %>%
left_join(rank_ps_tidy) %>%
left_join(lang_fam_counts, by = c("langs" = "aff")) %>%
mutate(r_norm = r/n,
r_norm_log = r/log(n + 1))
Mean correlations by family across clusters
rank_corrs_tidy %>%
select(langs, r, r_norm, r_norm_log) %>%
gather("rmeasure", "rvalue", -langs) %>%
group_by(langs, rmeasure) %>%
langcog::multi_boot_standard(col = "rvalue") %>%
left_join(lang_fam_counts, by = c("langs" = "aff")) %>%
ggplot(aes(y = mean, x = langs, fill = langs, color = langs)) +
geom_hline(aes(yintercept = 0), linetype = 2)+
ylab("Language Families") +
xlab("mean r") +
facet_wrap(~ rmeasure) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_classic() +
theme(legend.position = "none") +
coord_flip()
Get families that are most negatively/positively correlated with a cluster
extreme_cluster_fams <- rank_corrs_tidy %>%
arrange(r_norm_log) %>%
slice(1:50) %>%
separate(cluster, c("x", "cluster"), "_") %>%
select(-x)
top_50_cluster_words <- tsne_dimsF %>%
filter(cluster %in% unique(extreme_cluster_fams$cluster)) %>%
select(word, cluster) %>%
arrange(cluster)
#write_csv(top_50_cluster_words, "cluster_words_top50_100clusts.csv")
Cluster labels:
clust_labels_words <- tsne_dimsF %>%
mutate(cluster = as.character(cluster))
DT::datatable(clust_labels_words)
clust_labels <- clust_labels_words %>%
filter(!is.na(label)) %>%
select(label, cluster)
extreme_cluster_fams %>%
left_join(clust_labels) %>%
kable()
langs | cluster | r | p | n | r_norm | r_norm_log | label |
---|---|---|---|---|---|---|---|
Turkic | 48 | -0.1341743 | 0.0052170 | 1 | -0.1341743 | -0.1935726 | quality_advs |
Sino-Tibetan | 6 | -0.1284630 | 0.0075086 | 1 | -0.1284630 | -0.1853329 | misc2 |
Sino-Tibetan | 2 | -0.1254523 | 0.0090478 | 1 | -0.1254523 | -0.1809894 | cause_effect |
Afro-Asiatic | 27 | -0.1218739 | 0.0112377 | 1 | -0.1218739 | -0.1758269 | leisure_activities |
Sino-Tibetan | 38 | -0.1211140 | 0.0117590 | 1 | -0.1211140 | -0.1747306 | enivornment |
Turkic | 17 | -0.1208550 | 0.0119416 | 1 | -0.1208550 | -0.1743569 | bodily_functions |
TaiKadai | 31 | -0.1200513 | 0.0125240 | 1 | -0.1200513 | -0.1731974 | time_money |
Austroasiatic | 16 | -0.1195271 | 0.0129172 | 1 | -0.1195271 | -0.1724412 | verbs |
Korean | 27 | -0.1195089 | 0.0129310 | 1 | -0.1195089 | -0.1724150 | leisure_activities |
Afro-Asiatic | 13 | -0.1153281 | 0.0164798 | 1 | -0.1153281 | -0.1663833 | quantity |
TaiKadai | 10 | -0.1130774 | 0.0187228 | 1 | -0.1130774 | -0.1631362 | pronouns_students_family |
Turkic | 47 | -0.1058631 | 0.0277974 | 1 | -0.1058631 | -0.1527282 | spending_time |
Sino-Tibetan | 16 | -0.1020797 | 0.0339149 | 1 | -0.1020797 | -0.1472698 | verbs |
Sino-Tibetan | 23 | -0.1005400 | 0.0367144 | 1 | -0.1005400 | -0.1450486 | positive_adjs2 |
Sino-Tibetan | 33 | -0.0994101 | 0.0388915 | 1 | -0.0994101 | -0.1434185 | commerce2 |
Sino-Tibetan | 3 | -0.0987432 | 0.0402272 | 1 | -0.0987432 | -0.1424563 | evidence |
Austroasiatic | 15 | -0.0982262 | 0.0412891 | 1 | -0.0982262 | -0.1417105 | time |
Afro-Asiatic | 4 | -0.0981376 | 0.0414734 | 1 | -0.0981376 | -0.1415827 | such |
Japonic | 47 | -0.0932949 | 0.0526590 | 1 | -0.0932949 | -0.1345960 | spending_time |
Sino-Tibetan | 44 | -0.0929744 | 0.0534804 | 1 | -0.0929744 | -0.1341337 | university |
Sino-Tibetan | 35 | -0.0928611 | 0.0537734 | 1 | -0.0928611 | -0.1339702 | emotion |
Afro-Asiatic | 22 | -0.0889323 | 0.0647867 | 1 | -0.0889323 | -0.1283021 | scientific_method |
Sino-Tibetan | 9 | -0.0885772 | 0.0658673 | 1 | -0.0885772 | -0.1277899 | support_teach |
Turkic | 36 | -0.0880849 | 0.0673902 | 1 | -0.0880849 | -0.1270797 | technology_media |
TaiKadai | 47 | -0.0843182 | 0.0800215 | 1 | -0.0843182 | -0.1216454 | spending_time |
Sino-Tibetan | 22 | -0.0833001 | 0.0837464 | 1 | -0.0833001 | -0.1201766 | scientific_method |
Sino-Tibetan | 17 | -0.0828689 | 0.0853658 | 1 | -0.0828689 | -0.1195545 | bodily_functions |
TaiKadai | 34 | -0.0814182 | 0.0909995 | 1 | -0.0814182 | -0.1174616 | alone_vs_groups |
Austroasiatic | 9 | -0.0770070 | 0.1099771 | 1 | -0.0770070 | -0.1110976 | support_teach |
Austroasiatic | 28 | -0.0767868 | 0.1110004 | 1 | -0.0767868 | -0.1107799 | modals |
Sino-Tibetan | 31 | -0.0760635 | 0.1144146 | 1 | -0.0760635 | -0.1097365 | time_money |
TaiKadai | 35 | -0.0746981 | 0.1210842 | 1 | -0.0746981 | -0.1077666 | emotion |
Japonic | 48 | -0.0746486 | 0.1213315 | 1 | -0.0746486 | -0.1076952 | quality_advs |
TaiKadai | 37 | -0.0719880 | 0.1352158 | 1 | -0.0719880 | -0.1038568 | adverbs |
TaiKadai | 16 | -0.0717716 | 0.1363971 | 1 | -0.0717716 | -0.1035445 | verbs |
Atlantic-Congo | 10 | -0.1132770 | 0.0185136 | 2 | -0.0566385 | -0.1031092 | pronouns_students_family |
Sino-Tibetan | 41 | -0.0710265 | 0.1405242 | 1 | -0.0710265 | -0.1024695 | large_conflict |
Turkic | 38 | -0.0705579 | 0.1431678 | 1 | -0.0705579 | -0.1017936 | enivornment |
Atlantic-Congo | 29 | -0.1110445 | 0.0209728 | 2 | -0.0555223 | -0.1010771 | knowledge |
Japonic | 27 | -0.0699868 | 0.1464419 | 1 | -0.0699868 | -0.1009696 | leisure_activities |
Austronesian | 2 | -0.1108386 | 0.0212133 | 2 | -0.0554193 | -0.1008897 | cause_effect |
Afro-Asiatic | 38 | -0.0696857 | 0.1481906 | 1 | -0.0696857 | -0.1005352 | enivornment |
Turkic | 10 | -0.0687947 | 0.1534588 | 1 | -0.0687947 | -0.0992498 | pronouns_students_family |
Sino-Tibetan | 21 | -0.0686965 | 0.1540479 | 1 | -0.0686965 | -0.0991082 | misc3 |
Austroasiatic | 47 | -0.0682052 | 0.1570219 | 1 | -0.0682052 | -0.0983993 | spending_time |
Turkic | 13 | -0.0681511 | 0.1573521 | 1 | -0.0681511 | -0.0983212 | quantity |
Turkic | 26 | -0.0672240 | 0.1630898 | 1 | -0.0672240 | -0.0969838 | areas_of_study |
Turkic | 31 | -0.0671267 | 0.1637013 | 1 | -0.0671267 | -0.0968433 | time_money |
Sino-Tibetan | 46 | -0.0665176 | 0.1675666 | 1 | -0.0665176 | -0.0959646 | connectives |
Sino-Tibetan | 34 | -0.0655714 | 0.1737062 | 1 | -0.0655714 | -0.0945995 | alone_vs_groups |
rank_corrs_tidy %>%
group_by(cluster) %>%
summarize(sd_vals = sd(r)) %>%
arrange(-abs(sd_vals)) %>%
slice(1:25) %>%
separate(cluster, c("x", "cluster"), "_") %>%
select(-x) %>%
left_join(clust_labels) %>%
kable()
cluster | sd_vals | label |
---|---|---|
33 | 0.0642818 | commerce2 |
41 | 0.0629742 | large_conflict |
29 | 0.0571347 | knowledge |
38 | 0.0549977 | enivornment |
9 | 0.0531196 | support_teach |
48 | 0.0528757 | quality_advs |
6 | 0.0523577 | misc2 |
17 | 0.0521986 | bodily_functions |
47 | 0.0520229 | spending_time |
1 | 0.0512362 | evaluative_adjs |
50 | 0.0493476 | argue |
13 | 0.0490917 | quantity |
14 | 0.0487416 | neuter_pronouns |
40 | 0.0486679 | passions |
15 | 0.0485182 | time |
32 | 0.0481226 | commerce |
10 | 0.0476278 | pronouns_students_family |
27 | 0.0468331 | leisure_activities |
2 | 0.0464333 | cause_effect |
16 | 0.0453629 | verbs |
23 | 0.0453624 | positive_adjs2 |
19 | 0.0453190 | normative_life |
31 | 0.0445085 | time_money |
18 | 0.0445004 | episitimic_verbs |
4 | 0.0442333 | such |
rank_corrs_tidy %>%
group_by(cluster) %>%
summarize(sd_vals = sd(r)) %>%
arrange(abs(sd_vals)) %>%
slice(1:25) %>%
separate(cluster, c("x", "cluster"), "_") %>%
select(-x) %>%
left_join(clust_labels) %>%
kable()
cluster | sd_vals | label |
---|---|---|
24 | 0.0264137 | forward_progress |
46 | 0.0285585 | connectives |
5 | 0.0302817 | travel |
44 | 0.0309131 | university |
22 | 0.0324151 | scientific_method |
7 | 0.0341511 | person_interaction_verbs |
20 | 0.0342804 | communication |
36 | 0.0343011 | technology_media |
39 | 0.0343070 | international |
8 | 0.0355229 | advs |
45 | 0.0369672 | places |
35 | 0.0371251 | emotion |
11 | 0.0381141 | community |
37 | 0.0386323 | adverbs |
34 | 0.0389631 | alone_vs_groups |
3 | 0.0391821 | evidence |
21 | 0.0391867 | misc3 |
12 | 0.0393987 | future_aspirations |
26 | 0.0400627 | areas_of_study |
49 | 0.0402915 | prepositions |
43 | 0.0415925 | influence_change |
28 | 0.0420455 | modals |
42 | 0.0423599 | academic_specialization |
30 | 0.0426380 | misc4 |
25 | 0.0435252 | social_political_school |
brysbaert <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/conceptviz/data/supplementary_data/brysbaert_corpus.csv") %>%
select(Word, Conc.M, Conc.SD, SUBTLEX)
word_conc <- clust_labels_words %>%
select(word, cluster) %>%
left_join(brysbaert, by = c('word' = "Word"))
cluster_sd <- rank_corrs_tidy %>%
group_by(cluster) %>%
summarize(sd_vals = sd(r)) %>%
separate(cluster, c("x", "cluster"), "_") %>%
select(-x)
m <- word_conc %>%
group_by(cluster) %>%
summarize(mean_conc = mean(Conc.M, na.rm = T)) %>%
left_join(cluster_sd)
cor.test(m$mean_conc, m$sd_vals)
##
## Pearson's product-moment correlation
##
## data: m$mean_conc and m$sd_vals
## t = -0.48212, df = 48, p-value = 0.6319
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3411748 0.2130449
## sample estimates:
## cor
## -0.06941948
No corr.
all_ranks_word <- rbind(langsF, words1F)
rank_corrs_word <- cor(t(all_ranks_word))
rank_corrs_df_word <- data.frame(rank_corrs_word[1:11, 12:(11 + dim(words1F)[1])])
rank_corrs_df_word <- setNames(rank_corrs_df_word, unlist(WORDS_F, use.names = F))
word_sd_conc <- rank_corrs_df_word %>%
mutate(langs = FAMS) %>%
select(langs, everything()) %>%
gather("word", "corr", -langs) %>%
group_by(word) %>%
summarize(sd = sd(corr)) %>%
arrange(sd) %>%
left_join(word_conc)
cor.test(word_sd_conc$Conc.M, word_sd_conc$sd)
##
## Pearson's product-moment correlation
##
## data: word_sd_conc$Conc.M and word_sd_conc$sd
## t = 2.8732, df = 574, p-value = 0.004214
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03774095 0.19883205
## sample estimates:
## cor
## 0.11907
Corr in opposite predicted direction: more concrete, more variable.