Rank analysis of words x families

Cluster words
Get relationship between clusters and family
Specific cluster x family
Concreteness
- By cluster
- By word

Nclusters is 50.

TENSOR_OUTPUT_FILENAME_FAM <- "../B_tensor_factorization_matlab/tensor_data_as_mat/output/4_corr_shared_ktensor_output_WWL_fam.mat"
tensor_outputF <- read.mat(TENSOR_OUTPUT_FILENAME_FAM)

words1F <- tensor_outputF$k[[1]]
words2F <- tensor_outputF$k[[2]]
langsF <- tensor_outputF$k[[3]]

WORDS_F <- read_feather("../data/2_matlab_data/word_to_is_shared_fam.feather")

FAMS <- c("Afro-Asiatic","Indo-European","Sino-Tibetan","Atlantic-Congo","Austronesian", "Japonic","Dravidian","Korean","TaiKadai", "Turkic","Austroasiatic")

Cluster words

Get tsne of word coordinates

# get tsne coordinates
tsne_outF = Rtsne::Rtsne(words1F)
tsne_dimsF <- tsne_outF$Y %>%
  as.data.frame() %>%
  rename(tsne_X = V1,
         tsne_Y = V2)  %>%
  bind_cols(word = sort(WORDS_F$.)) # the words are in alphabetical order

Cluster

N_CLUSTERS <- 50
cluster_path <- paste0("tsne_familiy_", N_CLUSTERS, ".csv")

clustersF <- kmeans(scale(tsne_dimsF[,c("tsne_X", "tsne_Y")]), N_CLUSTERS)
tsne_dimsF$cluster = factor(clustersF$cluster)
#write_csv(tsne_dimsF %>% arrange(cluster), cluster_path)

tsne_dimsF <- read_csv(cluster_path)

ggplot(tsne_dimsF,
         aes(x = tsne_X, y = tsne_Y, color = as.factor(cluster))) +
  geom_text(aes(label = word), size = 1.5) +
  theme_void() +
  ggtitle("t-sne projection of 710 words based on tensor factorization ranks") +
  theme(legend.position = "none")

Get relationship between clusters and family

Language families

LANG_KEY <- "../../../data/processed/lang_names/ets_to_google_langcodes_complete.csv"

read_csv(LANG_KEY) %>%
  select(lang_name, aff) %>%
  arrange(aff) %>%
  kable()

lang_name	aff
Arabic	Afro-Asiatic
Igbo	Atlantic-Congo
Yoruba	Atlantic-Congo
Vietnamese	Austroasiatic
Indonesian	Austronesian
Tagalog	Austronesian
Kannada	Dravidian
Malayalam	Dravidian
Tamil	Dravidian
Telugu	Dravidian
Bengali	Indo-European
Bulgarian	Indo-European
Dutch; Flemish	Indo-European
English	Indo-European
French	Indo-European
German	Indo-European
Greek, Modern (1453-)	Indo-European
Gujarati	Indo-European
Hindi	Indo-European
Italian	Indo-European
Marathi	Indo-European
Nepali	Indo-European
Panjabi; Punjabi	Indo-European
Polish	Indo-European
Portuguese	Indo-European
Romanian; Moldavian; Moldovan	Indo-European
Russian	Indo-European
Spanish; Castilian	Indo-European
Urdu	Indo-European
Farsi; Persian	Indo-European
Japanese	Japonic
Korean	Korean
Chinese	Sino-Tibetan
Thai	TaiKadai
Turkish	Turkic

Get average ranks for each word cluster

words_to_clusters <- words1F %>%
  as.data.frame() %>%
  mutate(cluster_id = tsne_dimsF$cluster) %>%
  select(cluster_id, everything()) 

cluster_rank_means <- words_to_clusters %>%
  split(.$cluster_id) %>%
  map(function(x) {
    colMeans(x[,-1]) %>% 
        t() %>% 
        as.data.frame() %>% 
        mutate(cluster_id = x[1,1])}) %>%
  bind_rows() %>%
  select(cluster_id, everything())

Get distance of each family to each cluster

all_ranks <- rbind(langsF,  as.matrix(cluster_rank_means[,-1]))

rank_corrs <- cor(t(all_ranks))
rank_corrs_df <- data.frame(rank_corrs[1:11, 12:(11 + N_CLUSTERS)])
cluster_names <- map(list(1:N_CLUSTERS), ~paste0("cluster_",.)) %>% 
  unlist()
rank_corrs_df <- setNames(rank_corrs_df, cluster_names)

rank_ps <-corr.test(as.data.frame(t(all_ranks)), adjust = "none")$p
rank_ps_df <-data.frame(rank_ps[1:11, 12:(11 + N_CLUSTERS)])
cluster_names <- map(list(1:N_CLUSTERS), ~paste0("cluster_",.)) %>% 
  unlist()
rank_ps_df <- setNames(rank_ps_df, cluster_names)

rank_ps_tidy <- rank_ps_df %>%
  mutate(langs = FAMS) %>%
  gather("cluster", "p", -langs) %>%
  arrange(langs)

rank_corrs_tidy <- rank_corrs_df %>%
  mutate(langs = FAMS) %>%
  gather("cluster", "r", -langs) %>%
  arrange(langs) %>%
  left_join(rank_ps_tidy)

Mean correlations by family across clusters

rank_corrs_tidy %>%
  ggplot(aes(y = r, x = langs, fill = langs)) +
  geom_hline(aes(yintercept = 0), linetype = 2)+
  geom_boxplot() +
  theme_classic() +
  theme(legend.position = "none")

Specific cluster x family

Get families that are most negatively/positibely correlated with a cluster

extreme_cluster_fams <- rank_corrs_tidy %>%
  arrange(-abs(r)) %>%
  slice(1:50) %>%
  separate(cluster, c("x", "cluster"), "_") %>%
  select(-x)

top_50_cluster_words <- tsne_dimsF %>%
  filter(cluster %in% unique(extreme_cluster_fams$cluster)) %>%
  select(word, cluster) %>%
  arrange(cluster)

#write_csv(top_50_cluster_words, "cluster_words_top50_100clusts.csv")

Cluster labels:

clust_labels_words <- tsne_dimsF %>%
  mutate(cluster = as.character(cluster)) 

DT::datatable(clust_labels_words)

clust_labels <- clust_labels_words %>%
  filter(!is.na(label)) %>%
  select(label, cluster)

Most negative clusters:

extreme_cluster_fams %>%
  left_join(clust_labels)  %>% 
  kable()

langs	cluster	r	p	label
Indo-European	1	-0.1584482	0.0009510	evaluative_adjs
Indo-European	41	-0.1544323	0.0012826	large_conflict
Dravidian	41	-0.1457721	0.0023873	large_conflict
Indo-European	27	-0.1457384	0.0023929	leisure_activities
Indo-European	16	-0.1370298	0.0043264	verbs
Indo-European	6	-0.1342653	0.0051862	misc2
Turkic	48	-0.1341743	0.0052170	quality_advs
Dravidian	50	-0.1334147	0.0054802	argue
Dravidian	33	-0.1305999	0.0065627	commerce2
Indo-European	23	-0.1287410	0.0073790	positive_adjs2
Sino-Tibetan	6	-0.1284630	0.0075086	misc2
Sino-Tibetan	2	-0.1254523	0.0090478	cause_effect
Afro-Asiatic	27	-0.1218739	0.0112377	leisure_activities
Sino-Tibetan	38	-0.1211140	0.0117590	enivornment
Turkic	17	-0.1208550	0.0119416	bodily_functions
Dravidian	23	-0.1203783	0.0122841	positive_adjs2
TaiKadai	31	-0.1200513	0.0125240	time_money
Indo-European	43	-0.1197152	0.0127749	influence_change
Austroasiatic	16	-0.1195271	0.0129172	verbs
Korean	27	-0.1195089	0.0129310	leisure_activities
Indo-European	36	-0.1194114	0.0130054	technology_media
Indo-European	37	-0.1181880	0.0139715	adverbs
Afro-Asiatic	13	-0.1153281	0.0164798	quantity
Atlantic-Congo	10	-0.1132770	0.0185136	pronouns_students_family
TaiKadai	10	-0.1130774	0.0187228	pronouns_students_family
Indo-European	11	-0.1123374	0.0195163	community
Atlantic-Congo	29	-0.1110445	0.0209728	knowledge
Austronesian	2	-0.1108386	0.0212133	cause_effect
Indo-European	22	-0.1101063	0.0220881	scientific_method
Dravidian	29	-0.1082349	0.0244671	knowledge
Dravidian	10	-0.1064369	0.0269579	pronouns_students_family
Indo-European	17	-0.1063720	0.0270517	bodily_functions
Dravidian	6	-0.1061588	0.0273620	misc2
Turkic	47	-0.1058631	0.0277974	spending_time
Indo-European	40	-0.1057993	0.0278921	passions
Austronesian	32	0.1047876	0.0294316	commerce
Dravidian	37	-0.1042318	0.0303079	adverbs
Indo-European	9	-0.1034267	0.0316168	support_teach
Sino-Tibetan	16	-0.1020797	0.0339149	verbs
Indo-European	35	-0.1017913	0.0344248	emotion
Dravidian	22	-0.1011145	0.0356476	scientific_method
Dravidian	14	-0.1005402	0.0367141	neuter_pronouns
Sino-Tibetan	23	-0.1005400	0.0367144	positive_adjs2
Dravidian	38	-0.1003552	0.0370632	enivornment
Indo-European	14	-0.1000620	0.0376224	neuter_pronouns
Dravidian	27	-0.0996543	0.0384119	leisure_activities
Indo-European	25	-0.0996043	0.0385098	social_political_school
Sino-Tibetan	33	-0.0994101	0.0388915	commerce2
Sino-Tibetan	3	-0.0987432	0.0402272	evidence
Austroasiatic	15	-0.0982262	0.0412891	time

Most variable clusters:

rank_corrs_tidy %>%
  group_by(cluster) %>%
  summarize(sd_vals = sd(r)) %>%
  arrange(-abs(sd_vals)) %>%
  slice(1:25) %>% 
  separate(cluster, c("x", "cluster"), "_") %>%
  select(-x) %>%
  left_join(clust_labels) %>%
  kable()

cluster	sd_vals	label
33	0.0642818	commerce2
41	0.0629742	large_conflict
29	0.0571347	knowledge
38	0.0549977	enivornment
9	0.0531196	support_teach
48	0.0528757	quality_advs
6	0.0523577	misc2
17	0.0521986	bodily_functions
47	0.0520229	spending_time
1	0.0512362	evaluative_adjs
50	0.0493476	argue
13	0.0490917	quantity
14	0.0487416	neuter_pronouns
40	0.0486679	passions
15	0.0485182	time
32	0.0481226	commerce
10	0.0476278	pronouns_students_family
27	0.0468331	leisure_activities
2	0.0464333	cause_effect
16	0.0453629	verbs
23	0.0453624	positive_adjs2
19	0.0453190	normative_life
31	0.0445085	time_money
18	0.0445004	episitimic_verbs
4	0.0442333	such

Least variable clusters:

rank_corrs_tidy %>%
  group_by(cluster) %>%
  summarize(sd_vals = sd(r)) %>%
  arrange(abs(sd_vals)) %>%
  slice(1:25) %>% 
  separate(cluster, c("x", "cluster"), "_") %>%
  select(-x) %>%
  left_join(clust_labels) %>%
  kable()

cluster	sd_vals	label
24	0.0264137	forward_progress
46	0.0285585	connectives
5	0.0302817	travel
44	0.0309131	university
22	0.0324151	scientific_method
7	0.0341511	person_interaction_verbs
20	0.0342804	communication
36	0.0343011	technology_media
39	0.0343070	international
8	0.0355229	advs
45	0.0369672	places
35	0.0371251	emotion
11	0.0381141	community
37	0.0386323	adverbs
34	0.0389631	alone_vs_groups
3	0.0391821	evidence
21	0.0391867	misc3
12	0.0393987	future_aspirations
26	0.0400627	areas_of_study
49	0.0402915	prepositions
43	0.0415925	influence_change
28	0.0420455	modals
42	0.0423599	academic_specialization
30	0.0426380	misc4
25	0.0435252	social_political_school

Concreteness

By cluster

brysbaert <- read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/conceptviz/data/supplementary_data/brysbaert_corpus.csv") %>%
  select(Word, Conc.M, Conc.SD, SUBTLEX)

word_conc <- clust_labels_words %>%
  select(word, cluster) %>%
  left_join(brysbaert, by = c('word' = "Word"))

cluster_sd <- rank_corrs_tidy %>%
  group_by(cluster) %>%
  summarize(sd_vals = sd(r)) %>%
  separate(cluster, c("x", "cluster"), "_") %>%
  select(-x) 

m <- word_conc %>%
  group_by(cluster) %>%
  summarize(mean_conc = mean(Conc.M, na.rm = T)) %>%
  left_join(cluster_sd)

cor.test(m$mean_conc, m$sd_vals)

## 
##  Pearson's product-moment correlation
## 
## data:  m$mean_conc and m$sd_vals
## t = -0.48212, df = 48, p-value = 0.6319
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3411748  0.2130449
## sample estimates:
##         cor 
## -0.06941948

No corr.

By word

all_ranks_word <- rbind(langsF,  words1F)

rank_corrs_word <- cor(t(all_ranks_word))
rank_corrs_df_word <- data.frame(rank_corrs_word[1:11, 12:(11 +  dim(words1F)[1])])
rank_corrs_df_word <- setNames(rank_corrs_df_word, unlist(WORDS_F, use.names = F))

word_sd_conc <- rank_corrs_df_word %>%
  mutate(langs = FAMS) %>%
  select(langs, everything()) %>%
  gather("word", "corr", -langs) %>%
  group_by(word) %>%
  summarize(sd = sd(corr)) %>%
  arrange(sd) %>%
  left_join(word_conc)

cor.test(word_sd_conc$Conc.M, word_sd_conc$sd)

## 
##  Pearson's product-moment correlation
## 
## data:  word_sd_conc$Conc.M and word_sd_conc$sd
## t = 2.8732, df = 574, p-value = 0.004214
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.03774095 0.19883205
## sample estimates:
##     cor 
## 0.11907

Corr in opposite predicted direction: more concrete, more variable.