Rank analysis of words x families

Cluster words
Get relationship between clusters and family
Specific cluster x family
Concreteness
- By cluster
- By word

Nclusters is 50.

TENSOR_OUTPUT_FILENAME_FAM <- "../B_tensor_factorization_matlab/tensor_data_as_mat/output/4_corr_shared_ktensor_output_WWL_fam.mat"
tensor_outputF <- read.mat(TENSOR_OUTPUT_FILENAME_FAM)

words1F <- tensor_outputF$k[[1]]
words2F <- tensor_outputF$k[[2]]
langsF <- tensor_outputF$k[[3]]

WORDS_F <- read_feather("../data/2_matlab_data/word_to_is_shared_fam.feather")

FAMS <- c("Afro-Asiatic","Indo-European","Sino-Tibetan","Atlantic-Congo","Austronesian", "Japonic","Dravidian","Korean","TaiKadai", "Turkic","Austroasiatic")

Cluster words

Get tsne of word coordinates

# get tsne coordinates
tsne_outF = Rtsne::Rtsne(words1F)
tsne_dimsF <- tsne_outF$Y %>%
  as.data.frame() %>%
  rename(tsne_X = V1,
         tsne_Y = V2)  %>%
  bind_cols(word = sort(WORDS_F$.)) # the words are in alphabetical order

Cluster

N_CLUSTERS <- 50
cluster_path <- paste0("tsne_familiy_", N_CLUSTERS, ".csv")

clustersF <- kmeans(scale(tsne_dimsF[,c("tsne_X", "tsne_Y")]), N_CLUSTERS)
tsne_dimsF$cluster = factor(clustersF$cluster)
#write_csv(tsne_dimsF %>% arrange(cluster), cluster_path)

tsne_dimsF <- read_csv(cluster_path)

ggplot(tsne_dimsF,
         aes(x = tsne_X, y = tsne_Y, color = as.factor(cluster))) +
  geom_text(aes(label = word), size = 1.5) +
  theme_void() +
  ggtitle("t-sne projection of 710 words based on tensor factorization ranks") +
  theme(legend.position = "none")

Get relationship between clusters and family

Language families

LANG_KEY <- "../../../data/processed/lang_names/ets_to_google_langcodes_complete.csv"

lang_key <- read_csv(LANG_KEY) 

lang_key %>%
  select(lang_name, aff) %>%
  arrange(aff) %>%
  kable()

lang_name	aff
Arabic	Afro-Asiatic
Igbo	Atlantic-Congo
Yoruba	Atlantic-Congo
Vietnamese	Austroasiatic
Indonesian	Austronesian
Tagalog	Austronesian
Kannada	Dravidian
Malayalam	Dravidian
Tamil	Dravidian
Telugu	Dravidian
Bengali	Indo-European
Bulgarian	Indo-European
Dutch; Flemish	Indo-European
English	Indo-European
French	Indo-European
German	Indo-European
Greek, Modern (1453-)	Indo-European
Gujarati	Indo-European
Hindi	Indo-European
Italian	Indo-European
Marathi	Indo-European
Nepali	Indo-European
Panjabi; Punjabi	Indo-European
Polish	Indo-European
Portuguese	Indo-European
Romanian; Moldavian; Moldovan	Indo-European
Russian	Indo-European
Spanish; Castilian	Indo-European
Urdu	Indo-European
Farsi; Persian	Indo-European
Japanese	Japonic
Korean	Korean
Chinese	Sino-Tibetan
Thai	TaiKadai
Turkish	Turkic

lang_fam_counts <- count(lang_key, aff)

Get average ranks for each word cluster

words_to_clusters <- words1F %>%
  as.data.frame() %>%
  mutate(cluster_id = tsne_dimsF$cluster) %>%
  select(cluster_id, everything()) 

cluster_rank_means <- words_to_clusters %>%
  split(.$cluster_id) %>%
  map(function(x) {
    colMeans(x[,-1]) %>% 
        t() %>% 
        as.data.frame() %>% 
        mutate(cluster_id = x[1,1])}) %>%
  bind_rows() %>%
  select(cluster_id, everything())

Get distance of each family to each cluster

all_ranks <- rbind(langsF,  as.matrix(cluster_rank_means[,-1]))

rank_corrs <- cor(t(all_ranks))
rank_corrs_df <- data.frame(rank_corrs[1:11, 12:(11 + N_CLUSTERS)])
cluster_names <- map(list(1:N_CLUSTERS), ~paste0("cluster_",.)) %>% 
  unlist()
rank_corrs_df <- setNames(rank_corrs_df, cluster_names)

rank_ps <-corr.test(as.data.frame(t(all_ranks)), adjust = "none")$p
rank_ps_df <-data.frame(rank_ps[1:11, 12:(11 + N_CLUSTERS)])
cluster_names <- map(list(1:N_CLUSTERS), ~paste0("cluster_",.)) %>% 
  unlist()
rank_ps_df <- setNames(rank_ps_df, cluster_names)

rank_ps_tidy <- rank_ps_df %>%
  mutate(langs = FAMS) %>%
  gather("cluster", "p", -langs) %>%
  arrange(langs)

rank_corrs_tidy <- rank_corrs_df %>%
  mutate(langs = FAMS) %>%
  gather("cluster", "r", -langs) %>%
  arrange(langs) %>%
  left_join(rank_ps_tidy) %>%
  left_join(lang_fam_counts, by = c("langs" = "aff")) %>%
  mutate(r_norm = r/n,
         r_norm_log = r/log(n + 1))

Mean correlations by family across clusters

rank_corrs_tidy %>%
  select(langs, r, r_norm, r_norm_log) %>%
  gather("rmeasure", "rvalue", -langs) %>%
  group_by(langs, rmeasure) %>%
  langcog::multi_boot_standard(col  = "rvalue") %>%
  left_join(lang_fam_counts, by = c("langs" = "aff")) %>%
  ggplot(aes(y = mean, x = langs, fill = langs, color = langs)) +
  geom_hline(aes(yintercept = 0), linetype = 2)+
  ylab("Language Families") +
  xlab("mean r") +
  facet_wrap(~ rmeasure)   +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  theme_classic() +
  theme(legend.position = "none") +
  coord_flip()

Specific cluster x family

Get families that are most negatively/positively correlated with a cluster

extreme_cluster_fams <- rank_corrs_tidy %>%
  arrange(r_norm_log) %>%
  slice(1:50) %>%
  separate(cluster, c("x", "cluster"), "_") %>%
  select(-x)

top_50_cluster_words <- tsne_dimsF %>%
  filter(cluster %in% unique(extreme_cluster_fams$cluster)) %>%
  select(word, cluster) %>%
  arrange(cluster)

#write_csv(top_50_cluster_words, "cluster_words_top50_100clusts.csv")

Cluster labels:

clust_labels_words <- tsne_dimsF %>%
  mutate(cluster = as.character(cluster)) 

DT::datatable(clust_labels_words)

clust_labels <- clust_labels_words %>%
  filter(!is.na(label)) %>%
  select(label, cluster)

Most negative clusters:

extreme_cluster_fams %>%
  left_join(clust_labels)  %>% 
  kable()

langs	cluster	r	p	n	r_norm	r_norm_log	label
Turkic	48	-0.1341743	0.0052170	1	-0.1341743	-0.1935726	quality_advs
Sino-Tibetan	6	-0.1284630	0.0075086	1	-0.1284630	-0.1853329	misc2
Sino-Tibetan	2	-0.1254523	0.0090478	1	-0.1254523	-0.1809894	cause_effect
Afro-Asiatic	27	-0.1218739	0.0112377	1	-0.1218739	-0.1758269	leisure_activities
Sino-Tibetan	38	-0.1211140	0.0117590	1	-0.1211140	-0.1747306	enivornment
Turkic	17	-0.1208550	0.0119416	1	-0.1208550	-0.1743569	bodily_functions
TaiKadai	31	-0.1200513	0.0125240	1	-0.1200513	-0.1731974	time_money
Austroasiatic	16	-0.1195271	0.0129172	1	-0.1195271	-0.1724412	verbs
Korean	27	-0.1195089	0.0129310	1	-0.1195089	-0.1724150	leisure_activities
Afro-Asiatic	13	-0.1153281	0.0164798	1	-0.1153281	-0.1663833	quantity
TaiKadai	10	-0.1130774	0.0187228	1	-0.1130774	-0.1631362	pronouns_students_family
Turkic	47	-0.1058631	0.0277974	1	-0.1058631	-0.1527282	spending_time
Sino-Tibetan	16	-0.1020797	0.0339149	1	-0.1020797	-0.1472698	verbs
Sino-Tibetan	23	-0.1005400	0.0367144	1	-0.1005400	-0.1450486	positive_adjs2
Sino-Tibetan	33	-0.0994101	0.0388915	1	-0.0994101	-0.1434185	commerce2
Sino-Tibetan	3	-0.0987432	0.0402272	1	-0.0987432	-0.1424563	evidence
Austroasiatic	15	-0.0982262	0.0412891	1	-0.0982262	-0.1417105	time
Afro-Asiatic	4	-0.0981376	0.0414734	1	-0.0981376	-0.1415827	such
Japonic	47	-0.0932949	0.0526590	1	-0.0932949	-0.1345960	spending_time
Sino-Tibetan	44	-0.0929744	0.0534804	1	-0.0929744	-0.1341337	university
Sino-Tibetan	35	-0.0928611	0.0537734	1	-0.0928611	-0.1339702	emotion
Afro-Asiatic	22	-0.0889323	0.0647867	1	-0.0889323	-0.1283021	scientific_method
Sino-Tibetan	9	-0.0885772	0.0658673	1	-0.0885772	-0.1277899	support_teach
Turkic	36	-0.0880849	0.0673902	1	-0.0880849	-0.1270797	technology_media
TaiKadai	47	-0.0843182	0.0800215	1	-0.0843182	-0.1216454	spending_time
Sino-Tibetan	22	-0.0833001	0.0837464	1	-0.0833001	-0.1201766	scientific_method
Sino-Tibetan	17	-0.0828689	0.0853658	1	-0.0828689	-0.1195545	bodily_functions
TaiKadai	34	-0.0814182	0.0909995	1	-0.0814182	-0.1174616	alone_vs_groups
Austroasiatic	9	-0.0770070	0.1099771	1	-0.0770070	-0.1110976	support_teach
Austroasiatic	28	-0.0767868	0.1110004	1	-0.0767868	-0.1107799	modals
Sino-Tibetan	31	-0.0760635	0.1144146	1	-0.0760635	-0.1097365	time_money
TaiKadai	35	-0.0746981	0.1210842	1	-0.0746981	-0.1077666	emotion
Japonic	48	-0.0746486	0.1213315	1	-0.0746486	-0.1076952	quality_advs
TaiKadai	37	-0.0719880	0.1352158	1	-0.0719880	-0.1038568	adverbs
TaiKadai	16	-0.0717716	0.1363971	1	-0.0717716	-0.1035445	verbs
Atlantic-Congo	10	-0.1132770	0.0185136	2	-0.0566385	-0.1031092	pronouns_students_family
Sino-Tibetan	41	-0.0710265	0.1405242	1	-0.0710265	-0.1024695	large_conflict
Turkic	38	-0.0705579	0.1431678	1	-0.0705579	-0.1017936	enivornment
Atlantic-Congo	29	-0.1110445	0.0209728	2	-0.0555223	-0.1010771	knowledge
Japonic	27	-0.0699868	0.1464419	1	-0.0699868	-0.1009696	leisure_activities
Austronesian	2	-0.1108386	0.0212133	2	-0.0554193	-0.1008897	cause_effect
Afro-Asiatic	38	-0.0696857	0.1481906	1	-0.0696857	-0.1005352	enivornment
Turkic	10	-0.0687947	0.1534588	1	-0.0687947	-0.0992498	pronouns_students_family
Sino-Tibetan	21	-0.0686965	0.1540479	1	-0.0686965	-0.0991082	misc3
Austroasiatic	47	-0.0682052	0.1570219	1	-0.0682052	-0.0983993	spending_time
Turkic	13	-0.0681511	0.1573521	1	-0.0681511	-0.0983212	quantity
Turkic	26	-0.0672240	0.1630898	1	-0.0672240	-0.0969838	areas_of_study
Turkic	31	-0.0671267	0.1637013	1	-0.0671267	-0.0968433	time_money
Sino-Tibetan	46	-0.0665176	0.1675666	1	-0.0665176	-0.0959646	connectives
Sino-Tibetan	34	-0.0655714	0.1737062	1	-0.0655714	-0.0945995	alone_vs_groups

Most variable clusters:

rank_corrs_tidy %>%
  group_by(cluster) %>%
  summarize(sd_vals = sd(r)) %>%
  arrange(-abs(sd_vals)) %>%
  slice(1:25) %>% 
  separate(cluster, c("x", "cluster"), "_") %>%
  select(-x) %>%
  left_join(clust_labels) %>%
  kable()

cluster	sd_vals	label
33	0.0642818	commerce2
41	0.0629742	large_conflict
29	0.0571347	knowledge
38	0.0549977	enivornment
9	0.0531196	support_teach
48	0.0528757	quality_advs
6	0.0523577	misc2
17	0.0521986	bodily_functions
47	0.0520229	spending_time
1	0.0512362	evaluative_adjs
50	0.0493476	argue
13	0.0490917	quantity
14	0.0487416	neuter_pronouns
40	0.0486679	passions
15	0.0485182	time
32	0.0481226	commerce
10	0.0476278	pronouns_students_family
27	0.0468331	leisure_activities
2	0.0464333	cause_effect
16	0.0453629	verbs
23	0.0453624	positive_adjs2
19	0.0453190	normative_life
31	0.0445085	time_money
18	0.0445004	episitimic_verbs
4	0.0442333	such

Least variable clusters:

rank_corrs_tidy %>%
  group_by(cluster) %>%
  summarize(sd_vals = sd(r)) %>%
  arrange(abs(sd_vals)) %>%
  slice(1:25) %>% 
  separate(cluster, c("x", "cluster"), "_") %>%
  select(-x) %>%
  left_join(clust_labels) %>%
  kable()

cluster	sd_vals	label
24	0.0264137	forward_progress
46	0.0285585	connectives
5	0.0302817	travel
44	0.0309131	university
22	0.0324151	scientific_method
7	0.0341511	person_interaction_verbs
20	0.0342804	communication
36	0.0343011	technology_media
39	0.0343070	international
8	0.0355229	advs
45	0.0369672	places
35	0.0371251	emotion
11	0.0381141	community
37	0.0386323	adverbs
34	0.0389631	alone_vs_groups
3	0.0391821	evidence
21	0.0391867	misc3
12	0.0393987	future_aspirations
26	0.0400627	areas_of_study
49	0.0402915	prepositions
43	0.0415925	influence_change
28	0.0420455	modals
42	0.0423599	academic_specialization
30	0.0426380	misc4
25	0.0435252	social_political_school

Concreteness

By cluster

brysbaert <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/conceptviz/data/supplementary_data/brysbaert_corpus.csv") %>%
  select(Word, Conc.M, Conc.SD, SUBTLEX)

word_conc <- clust_labels_words %>%
  select(word, cluster) %>%
  left_join(brysbaert, by = c('word' = "Word"))

cluster_sd <- rank_corrs_tidy %>%
  group_by(cluster) %>%
  summarize(sd_vals = sd(r)) %>%
  separate(cluster, c("x", "cluster"), "_") %>%
  select(-x) 

m <- word_conc %>%
  group_by(cluster) %>%
  summarize(mean_conc = mean(Conc.M, na.rm = T)) %>%
  left_join(cluster_sd)

cor.test(m$mean_conc, m$sd_vals)

## 
##  Pearson's product-moment correlation
## 
## data:  m$mean_conc and m$sd_vals
## t = -0.48212, df = 48, p-value = 0.6319
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3411748  0.2130449
## sample estimates:
##         cor 
## -0.06941948

No corr.

By word

all_ranks_word <- rbind(langsF,  words1F)

rank_corrs_word <- cor(t(all_ranks_word))
rank_corrs_df_word <- data.frame(rank_corrs_word[1:11, 12:(11 +  dim(words1F)[1])])
rank_corrs_df_word <- setNames(rank_corrs_df_word, unlist(WORDS_F, use.names = F))

word_sd_conc <- rank_corrs_df_word %>%
  mutate(langs = FAMS) %>%
  select(langs, everything()) %>%
  gather("word", "corr", -langs) %>%
  group_by(word) %>%
  summarize(sd = sd(corr)) %>%
  arrange(sd) %>%
  left_join(word_conc)

cor.test(word_sd_conc$Conc.M, word_sd_conc$sd)

## 
##  Pearson's product-moment correlation
## 
## data:  word_sd_conc$Conc.M and word_sd_conc$sd
## t = 2.8732, df = 574, p-value = 0.004214
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.03774095 0.19883205
## sample estimates:
##     cor 
## 0.11907

Corr in opposite predicted direction: more concrete, more variable.