Nclusters is 50.
TENSOR_OUTPUT_FILENAME_FAM <- "../B_tensor_factorization_matlab/tensor_data_as_mat/output/4_corr_shared_ktensor_output_WWL_fam.mat"
tensor_outputF <- read.mat(TENSOR_OUTPUT_FILENAME_FAM)
words1F <- tensor_outputF$k[[1]]
words2F <- tensor_outputF$k[[2]]
langsF <- tensor_outputF$k[[3]]
WORDS_F <- read_feather("../data/2_matlab_data/word_to_is_shared_fam.feather")
FAMS <- c("Afro-Asiatic","Indo-European","Sino-Tibetan","Atlantic-Congo","Austronesian", "Japonic","Dravidian","Korean","TaiKadai", "Turkic","Austroasiatic")
Get tsne of word coordinates
# get tsne coordinates
tsne_outF = Rtsne::Rtsne(words1F)
tsne_dimsF <- tsne_outF$Y %>%
as.data.frame() %>%
rename(tsne_X = V1,
tsne_Y = V2) %>%
bind_cols(word = sort(WORDS_F$.)) # the words are in alphabetical order
Cluster
N_CLUSTERS <- 50
cluster_path <- paste0("tsne_familiy_", N_CLUSTERS, ".csv")
clustersF <- kmeans(scale(tsne_dimsF[,c("tsne_X", "tsne_Y")]), N_CLUSTERS)
tsne_dimsF$cluster = factor(clustersF$cluster)
#write_csv(tsne_dimsF %>% arrange(cluster), cluster_path)
tsne_dimsF <- read_csv(cluster_path)
ggplot(tsne_dimsF,
aes(x = tsne_X, y = tsne_Y, color = as.factor(cluster))) +
geom_text(aes(label = word), size = 1.5) +
theme_void() +
ggtitle("t-sne projection of 710 words based on tensor factorization ranks") +
theme(legend.position = "none")
Language families
LANG_KEY <- "../../../data/processed/lang_names/ets_to_google_langcodes_complete.csv"
read_csv(LANG_KEY) %>%
select(lang_name, aff) %>%
arrange(aff) %>%
kable()
| lang_name | aff |
|---|---|
| Arabic | Afro-Asiatic |
| Igbo | Atlantic-Congo |
| Yoruba | Atlantic-Congo |
| Vietnamese | Austroasiatic |
| Indonesian | Austronesian |
| Tagalog | Austronesian |
| Kannada | Dravidian |
| Malayalam | Dravidian |
| Tamil | Dravidian |
| Telugu | Dravidian |
| Bengali | Indo-European |
| Bulgarian | Indo-European |
| Dutch; Flemish | Indo-European |
| English | Indo-European |
| French | Indo-European |
| German | Indo-European |
| Greek, Modern (1453-) | Indo-European |
| Gujarati | Indo-European |
| Hindi | Indo-European |
| Italian | Indo-European |
| Marathi | Indo-European |
| Nepali | Indo-European |
| Panjabi; Punjabi | Indo-European |
| Polish | Indo-European |
| Portuguese | Indo-European |
| Romanian; Moldavian; Moldovan | Indo-European |
| Russian | Indo-European |
| Spanish; Castilian | Indo-European |
| Urdu | Indo-European |
| Farsi; Persian | Indo-European |
| Japanese | Japonic |
| Korean | Korean |
| Chinese | Sino-Tibetan |
| Thai | TaiKadai |
| Turkish | Turkic |
Get average ranks for each word cluster
words_to_clusters <- words1F %>%
as.data.frame() %>%
mutate(cluster_id = tsne_dimsF$cluster) %>%
select(cluster_id, everything())
cluster_rank_means <- words_to_clusters %>%
split(.$cluster_id) %>%
map(function(x) {
colMeans(x[,-1]) %>%
t() %>%
as.data.frame() %>%
mutate(cluster_id = x[1,1])}) %>%
bind_rows() %>%
select(cluster_id, everything())
Get distance of each family to each cluster
all_ranks <- rbind(langsF, as.matrix(cluster_rank_means[,-1]))
rank_corrs <- cor(t(all_ranks))
rank_corrs_df <- data.frame(rank_corrs[1:11, 12:(11 + N_CLUSTERS)])
cluster_names <- map(list(1:N_CLUSTERS), ~paste0("cluster_",.)) %>%
unlist()
rank_corrs_df <- setNames(rank_corrs_df, cluster_names)
rank_ps <-corr.test(as.data.frame(t(all_ranks)), adjust = "none")$p
rank_ps_df <-data.frame(rank_ps[1:11, 12:(11 + N_CLUSTERS)])
cluster_names <- map(list(1:N_CLUSTERS), ~paste0("cluster_",.)) %>%
unlist()
rank_ps_df <- setNames(rank_ps_df, cluster_names)
rank_ps_tidy <- rank_ps_df %>%
mutate(langs = FAMS) %>%
gather("cluster", "p", -langs) %>%
arrange(langs)
rank_corrs_tidy <- rank_corrs_df %>%
mutate(langs = FAMS) %>%
gather("cluster", "r", -langs) %>%
arrange(langs) %>%
left_join(rank_ps_tidy)
Mean correlations by family across clusters
rank_corrs_tidy %>%
ggplot(aes(y = r, x = langs, fill = langs)) +
geom_hline(aes(yintercept = 0), linetype = 2)+
geom_boxplot() +
theme_classic() +
theme(legend.position = "none")
Get families that are most negatively/positibely correlated with a cluster
extreme_cluster_fams <- rank_corrs_tidy %>%
arrange(-abs(r)) %>%
slice(1:50) %>%
separate(cluster, c("x", "cluster"), "_") %>%
select(-x)
top_50_cluster_words <- tsne_dimsF %>%
filter(cluster %in% unique(extreme_cluster_fams$cluster)) %>%
select(word, cluster) %>%
arrange(cluster)
#write_csv(top_50_cluster_words, "cluster_words_top50_100clusts.csv")
Cluster labels:
clust_labels_words <- tsne_dimsF %>%
mutate(cluster = as.character(cluster))
DT::datatable(clust_labels_words)
clust_labels <- clust_labels_words %>%
filter(!is.na(label)) %>%
select(label, cluster)
extreme_cluster_fams %>%
left_join(clust_labels) %>%
kable()
| langs | cluster | r | p | label |
|---|---|---|---|---|
| Indo-European | 1 | -0.1584482 | 0.0009510 | evaluative_adjs |
| Indo-European | 41 | -0.1544323 | 0.0012826 | large_conflict |
| Dravidian | 41 | -0.1457721 | 0.0023873 | large_conflict |
| Indo-European | 27 | -0.1457384 | 0.0023929 | leisure_activities |
| Indo-European | 16 | -0.1370298 | 0.0043264 | verbs |
| Indo-European | 6 | -0.1342653 | 0.0051862 | misc2 |
| Turkic | 48 | -0.1341743 | 0.0052170 | quality_advs |
| Dravidian | 50 | -0.1334147 | 0.0054802 | argue |
| Dravidian | 33 | -0.1305999 | 0.0065627 | commerce2 |
| Indo-European | 23 | -0.1287410 | 0.0073790 | positive_adjs2 |
| Sino-Tibetan | 6 | -0.1284630 | 0.0075086 | misc2 |
| Sino-Tibetan | 2 | -0.1254523 | 0.0090478 | cause_effect |
| Afro-Asiatic | 27 | -0.1218739 | 0.0112377 | leisure_activities |
| Sino-Tibetan | 38 | -0.1211140 | 0.0117590 | enivornment |
| Turkic | 17 | -0.1208550 | 0.0119416 | bodily_functions |
| Dravidian | 23 | -0.1203783 | 0.0122841 | positive_adjs2 |
| TaiKadai | 31 | -0.1200513 | 0.0125240 | time_money |
| Indo-European | 43 | -0.1197152 | 0.0127749 | influence_change |
| Austroasiatic | 16 | -0.1195271 | 0.0129172 | verbs |
| Korean | 27 | -0.1195089 | 0.0129310 | leisure_activities |
| Indo-European | 36 | -0.1194114 | 0.0130054 | technology_media |
| Indo-European | 37 | -0.1181880 | 0.0139715 | adverbs |
| Afro-Asiatic | 13 | -0.1153281 | 0.0164798 | quantity |
| Atlantic-Congo | 10 | -0.1132770 | 0.0185136 | pronouns_students_family |
| TaiKadai | 10 | -0.1130774 | 0.0187228 | pronouns_students_family |
| Indo-European | 11 | -0.1123374 | 0.0195163 | community |
| Atlantic-Congo | 29 | -0.1110445 | 0.0209728 | knowledge |
| Austronesian | 2 | -0.1108386 | 0.0212133 | cause_effect |
| Indo-European | 22 | -0.1101063 | 0.0220881 | scientific_method |
| Dravidian | 29 | -0.1082349 | 0.0244671 | knowledge |
| Dravidian | 10 | -0.1064369 | 0.0269579 | pronouns_students_family |
| Indo-European | 17 | -0.1063720 | 0.0270517 | bodily_functions |
| Dravidian | 6 | -0.1061588 | 0.0273620 | misc2 |
| Turkic | 47 | -0.1058631 | 0.0277974 | spending_time |
| Indo-European | 40 | -0.1057993 | 0.0278921 | passions |
| Austronesian | 32 | 0.1047876 | 0.0294316 | commerce |
| Dravidian | 37 | -0.1042318 | 0.0303079 | adverbs |
| Indo-European | 9 | -0.1034267 | 0.0316168 | support_teach |
| Sino-Tibetan | 16 | -0.1020797 | 0.0339149 | verbs |
| Indo-European | 35 | -0.1017913 | 0.0344248 | emotion |
| Dravidian | 22 | -0.1011145 | 0.0356476 | scientific_method |
| Dravidian | 14 | -0.1005402 | 0.0367141 | neuter_pronouns |
| Sino-Tibetan | 23 | -0.1005400 | 0.0367144 | positive_adjs2 |
| Dravidian | 38 | -0.1003552 | 0.0370632 | enivornment |
| Indo-European | 14 | -0.1000620 | 0.0376224 | neuter_pronouns |
| Dravidian | 27 | -0.0996543 | 0.0384119 | leisure_activities |
| Indo-European | 25 | -0.0996043 | 0.0385098 | social_political_school |
| Sino-Tibetan | 33 | -0.0994101 | 0.0388915 | commerce2 |
| Sino-Tibetan | 3 | -0.0987432 | 0.0402272 | evidence |
| Austroasiatic | 15 | -0.0982262 | 0.0412891 | time |
rank_corrs_tidy %>%
group_by(cluster) %>%
summarize(sd_vals = sd(r)) %>%
arrange(-abs(sd_vals)) %>%
slice(1:25) %>%
separate(cluster, c("x", "cluster"), "_") %>%
select(-x) %>%
left_join(clust_labels) %>%
kable()
| cluster | sd_vals | label |
|---|---|---|
| 33 | 0.0642818 | commerce2 |
| 41 | 0.0629742 | large_conflict |
| 29 | 0.0571347 | knowledge |
| 38 | 0.0549977 | enivornment |
| 9 | 0.0531196 | support_teach |
| 48 | 0.0528757 | quality_advs |
| 6 | 0.0523577 | misc2 |
| 17 | 0.0521986 | bodily_functions |
| 47 | 0.0520229 | spending_time |
| 1 | 0.0512362 | evaluative_adjs |
| 50 | 0.0493476 | argue |
| 13 | 0.0490917 | quantity |
| 14 | 0.0487416 | neuter_pronouns |
| 40 | 0.0486679 | passions |
| 15 | 0.0485182 | time |
| 32 | 0.0481226 | commerce |
| 10 | 0.0476278 | pronouns_students_family |
| 27 | 0.0468331 | leisure_activities |
| 2 | 0.0464333 | cause_effect |
| 16 | 0.0453629 | verbs |
| 23 | 0.0453624 | positive_adjs2 |
| 19 | 0.0453190 | normative_life |
| 31 | 0.0445085 | time_money |
| 18 | 0.0445004 | episitimic_verbs |
| 4 | 0.0442333 | such |
rank_corrs_tidy %>%
group_by(cluster) %>%
summarize(sd_vals = sd(r)) %>%
arrange(abs(sd_vals)) %>%
slice(1:25) %>%
separate(cluster, c("x", "cluster"), "_") %>%
select(-x) %>%
left_join(clust_labels) %>%
kable()
| cluster | sd_vals | label |
|---|---|---|
| 24 | 0.0264137 | forward_progress |
| 46 | 0.0285585 | connectives |
| 5 | 0.0302817 | travel |
| 44 | 0.0309131 | university |
| 22 | 0.0324151 | scientific_method |
| 7 | 0.0341511 | person_interaction_verbs |
| 20 | 0.0342804 | communication |
| 36 | 0.0343011 | technology_media |
| 39 | 0.0343070 | international |
| 8 | 0.0355229 | advs |
| 45 | 0.0369672 | places |
| 35 | 0.0371251 | emotion |
| 11 | 0.0381141 | community |
| 37 | 0.0386323 | adverbs |
| 34 | 0.0389631 | alone_vs_groups |
| 3 | 0.0391821 | evidence |
| 21 | 0.0391867 | misc3 |
| 12 | 0.0393987 | future_aspirations |
| 26 | 0.0400627 | areas_of_study |
| 49 | 0.0402915 | prepositions |
| 43 | 0.0415925 | influence_change |
| 28 | 0.0420455 | modals |
| 42 | 0.0423599 | academic_specialization |
| 30 | 0.0426380 | misc4 |
| 25 | 0.0435252 | social_political_school |
brysbaert <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/conceptviz/data/supplementary_data/brysbaert_corpus.csv") %>%
select(Word, Conc.M, Conc.SD, SUBTLEX)
word_conc <- clust_labels_words %>%
select(word, cluster) %>%
left_join(brysbaert, by = c('word' = "Word"))
cluster_sd <- rank_corrs_tidy %>%
group_by(cluster) %>%
summarize(sd_vals = sd(r)) %>%
separate(cluster, c("x", "cluster"), "_") %>%
select(-x)
m <- word_conc %>%
group_by(cluster) %>%
summarize(mean_conc = mean(Conc.M, na.rm = T)) %>%
left_join(cluster_sd)
cor.test(m$mean_conc, m$sd_vals)
##
## Pearson's product-moment correlation
##
## data: m$mean_conc and m$sd_vals
## t = -0.48212, df = 48, p-value = 0.6319
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3411748 0.2130449
## sample estimates:
## cor
## -0.06941948
No corr.
all_ranks_word <- rbind(langsF, words1F)
rank_corrs_word <- cor(t(all_ranks_word))
rank_corrs_df_word <- data.frame(rank_corrs_word[1:11, 12:(11 + dim(words1F)[1])])
rank_corrs_df_word <- setNames(rank_corrs_df_word, unlist(WORDS_F, use.names = F))
word_sd_conc <- rank_corrs_df_word %>%
mutate(langs = FAMS) %>%
select(langs, everything()) %>%
gather("word", "corr", -langs) %>%
group_by(word) %>%
summarize(sd = sd(corr)) %>%
arrange(sd) %>%
left_join(word_conc)
cor.test(word_sd_conc$Conc.M, word_sd_conc$sd)
##
## Pearson's product-moment correlation
##
## data: word_sd_conc$Conc.M and word_sd_conc$sd
## t = 2.8732, df = 574, p-value = 0.004214
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03774095 0.19883205
## sample estimates:
## cor
## 0.11907
Corr in opposite predicted direction: more concrete, more variable.