TENSOR_OUTPUT_FILENAME <- "../tensor_factorization_matlab_stuff/tensor_data_as_mat/output/7_corr_shared_ktensor_output.mat"
tensor_output <- read.mat(TENSOR_OUTPUT_FILENAME)
lang1 <- tensor_output$k[[1]]
lang2 <- tensor_output$k[[2]]
words <- tensor_output$k[[3]]
We start with a 35 languages x 35 languages x 710 words matrix M. The words are the set of words that are common across all 35 languages (when all words are used including missing values (N= ~4000), I run into memory issues). Value M[ i, j, k] is given by:
We decompose this matrix using the Bayesian approach (code). The rank of the decomposed tensor is 53 (though this is an estimate and varies slightly by run). The output of this decomposition is three matrices: language by rank (35 x 53); language by rank (35 x 53); word by rank (710 x 53).
The two language matrices are identical. Here we look at the language matrix and then the word matrix
Here’s what the language matrix looks like:
LANGS <- c('ARA', 'BEN', 'BUL', 'CHI', 'DUT', 'ENG', 'FAS', 'FRE', 'GER', 'GRE', 'GUJ','HIN', 'IBO', 'IND', 'ITA', 'JPN','KAN', 'KOR', 'MAL', 'MAR', 'NEP', 'PAN', 'POL','POR', 'RUM', 'RUS', 'SPA', 'TAM', 'TEL', 'TGL', 'THA', 'TUR', 'URD', 'VIE', 'YOR')
lang_id_df = data.frame(lang = LANGS ,
lang_id = as.factor(1:35))
long_lang1 = lang1 %>%
tbl_df() %>%
rownames_to_column('lang_id') %>%
gather(rank_num, value, -lang_id) %>%
mutate(
lang_id = factor(lang_id, levels=1:53),
rank_num = factor(gsub("V", "", rank_num),
levels=1:53)) %>%
left_join(lang_id_df)
ggplot(long_lang1, aes(x = rank_num, y = lang,
fill = value) ) +
geom_tile() +
scale_fill_gradient2()
Most variable ranks:
long_lang1 %>%
group_by(rank_num) %>%
summarize(var = var(value)) %>%
arrange(-var) %>%
slice(1:10) %>%
kable()
| rank_num | var |
|---|---|
| 45 | 0.0294117 |
| 39 | 0.0294104 |
| 48 | 0.0292729 |
| 44 | 0.0292628 |
| 43 | 0.0292528 |
| 42 | 0.0292450 |
| 51 | 0.0292368 |
| 46 | 0.0290085 |
| 47 | 0.0288276 |
| 49 | 0.0280370 |
And, the dendogram, which looks very sensible.
all_corrs_mat <- as.matrix(lang1)
rownames(all_corrs_mat) <- LANGS
dist_matrix <- dist(all_corrs_mat)
ggdendro::ggdendrogram(hclust(dist_matrix))
Dendogram with last 50% of ranks only:
all_corrs_mat_last <- as.matrix(lang1)[,26:53]
rownames(all_corrs_mat_last) <- LANGS
dist_matrix_last <- dist(all_corrs_mat_last)
ggdendro::ggdendrogram(hclust(dist_matrix_last))
wals_dists <- read_csv("../../../../study2/data/processed/ling_dists/wals_distances.csv")
asjp_dists <- read_csv("../../../../study2/data/processed/ling_dists/asjp_distances.csv")
dplace_dists <- read_csv("../../../../study2/data/processed/ling_dists/dplace_distances.csv") %>%
mutate(lang1 = tolower(lang1),
lang2 = tolower(lang2))
eco_dists <- read_csv("../../../../study2/data/processed/ling_dists/eco_distances.csv")
semantic_raw_dists_all <- read_csv("../../../../study2/data/processed/pairwise_lang_distances/HD_centroid_distances_all.csv") %>%
as.matrix() %>%
as.data.frame() %>%
mutate(lang1 = tolower(colnames(.))) %>%
gather(lang2, essay_centroid_distance, -36) %>%
mutate(lang2 = tolower(lang2))
tensor_dists <- dist_matrix %>%
as.matrix() %>%
as.data.frame() %>%
rownames_to_column("lang1") %>%
gather(lang2, tensor_dist, -1) %>%
mutate(lang1 = tolower(lang1),
lang2 = tolower(lang2))
tensor_dists_last <- dist_matrix_last %>%
as.matrix() %>%
as.data.frame() %>%
rownames_to_column("lang1") %>%
gather(lang2, tensor_dist_last, -1) %>%
mutate(lang1 = tolower(lang1),
lang2 = tolower(lang2))
all_dists <- wals_dists %>%
left_join(asjp_dists) %>%
left_join(dplace_dists) %>%
left_join(eco_dists) %>%
left_join(semantic_raw_dists_all) %>%
left_join(tensor_dists) %>%
left_join(tensor_dists_last)
corr_mat <- cor(all_dists[,c(-1,-2)],
use = "pairwise.complete.obs")
p.mat <- corrplot::cor.mtest(all_dists[,c(-1,-2)],
use = "pairwise.complete.obs")$p
cols = rev(colorRampPalette(c("red", "white", "blue"))(100))
corrplot::corrplot(corr_mat, method = "color", col = cols,
type = "upper", order = "original", number.cex = .7,
addCoef.col = "black",
p.mat = p.mat, insig = "blank",
tl.col = "black", tl.srt = 90,
diag = FALSE)
# Let's reduce the dimensionality of the languages
clusts <- hclust(dist_matrix)
lang_tree_groups <- cutree(hclust(dist_matrix), k = 5) %>%
data.frame() %>%
rownames_to_column("lang") %>%
rename(lang_clust_id = ".")
Here’s what the word matrix looks like:
WORDS <- read_feather("../data/4_matlab_data/ets_models/word_to_is_shared.feather")
word_id_df = data.frame(word = WORDS$w1 ,
word_id = as.factor(1:710))
long_words = words %>%
tbl_df() %>%
rownames_to_column('word_id') %>%
gather(rank_num, value, -word_id) %>%
mutate(
word_id = factor(word_id, levels=1:710),
rank_num = factor(gsub("V", "", rank_num),
levels=1:53)) %>%
left_join(word_id_df)
ggplot(long_words, aes(x = rank_num, y = word,
fill = value) ) +
geom_tile() +
scale_fill_gradient2() +
theme(axis.text.y = element_text(size = 2))
As a sanity check, let’s look at the distances between ranks for a few words. They look relatively sensible.
TEST_WORDS <- c("father", "mother", "children", "parents", "young",
"book", "books", "easier", "easily", "three")
word_mat <- long_words %>%
filter(word %in% TEST_WORDS)%>%
select(word, rank_num, value) %>%
spread(word, value) %>%
select(-rank_num) %>%
as.matrix() %>%
t()
dist_matrix_word <- dist(word_mat)
ggdendro::ggdendrogram(hclust(dist_matrix_word))
Most variable ranks:
var_ranks <- long_words %>%
group_by(rank_num) %>%
summarize(var = var(value)) %>%
top_n(10, var)
kable(var_ranks)
| rank_num | var |
|---|---|
| 33 | 0.0013744 |
| 34 | 0.0013946 |
| 41 | 0.0012964 |
| 43 | 0.0012529 |
| 46 | 0.0013253 |
| 47 | 0.0013475 |
| 49 | 0.0013690 |
| 51 | 0.0013432 |
| 52 | 0.0012926 |
| 53 | 0.0013944 |
Top and bottom N words in 10 most variable ranks
TOP:
N_words <- 20
long_words %>%
filter(rank_num %in% c(var_ranks$rank_num)) %>%
group_by(rank_num) %>%
top_n(N_words, value) %>%
arrange(value) %>%
select(rank_num, word) %>%
mutate(n = 1:n()) %>%
spread(rank_num, word) %>%
select(-n) %>%
kable()
| 33 | 34 | 41 | 43 | 46 | 47 | 49 | 51 | 52 | 53 |
|---|---|---|---|---|---|---|---|---|---|
| else | activities | childhood | play | thing | having | may | useful | again | close |
| travelling | ability | effect | those | risk | just | great | again | etc | decision |
| examples | give | her | purpose | coming | actually | affect | afford | competition | ones |
| everybody | skills | whether | effective | thinking | others | questions | away | form | education |
| right | bad | travelling | possible | needed | given | down | thought | finding | situation |
| coming | home | teachers | states | childhood | am | interested | jobs | needed | decisions |
| professional | idea | advertising | move | always | therefore | big | importance | called | change |
| single | places | traveling | cause | single | creativity | strong | certain | type | financial |
| related | project | specialize | problem | moreover | now | still | works | therefore | friend |
| fact | gain | almost | since | instance | development | lead | thus | up | again |
| being | useful | came | big | write | takes | small | furthermore | into | stop |
| shows | told | thing | question | away | schools | here | problems | ever | contact |
| provided | classes | act | usually | self | means | activities | bad | bring | results |
| older | end | made | five | move | though | hear | questions | step | attention |
| stress | understanding | experience | between | difficult | step | later | difference | plays | family |
| common | respect | given | studies | wrong | huge | step | daily | through | friends |
| respect | childhood | studies | right | possible | thinks | effective | whether | food | classes |
| large | advertisement | depend | points | giving | stay | giving | attention | quality | giving |
| given | helps | interest | definitely | usually | goal | aware | purpose | provided | keep |
| parts | follow | major | means | position | education | childhood | related | cases | old |
BOTTOM:
long_words %>%
filter(rank_num %in% c(var_ranks$rank_num)) %>%
group_by(rank_num) %>%
top_n(-N_words, value) %>%
arrange(value) %>%
select(rank_num, word) %>%
mutate(n = 1:n()) %>%
spread(rank_num, word) %>%
select(-n) %>%
kable()
| 33 | 34 | 41 | 43 | 46 | 47 | 49 | 51 | 52 | 53 |
|---|---|---|---|---|---|---|---|---|---|
| found | everyday | says | certain | major | older | course | together | made | thinking |
| company | task | attention | health | according | studies | food | working | companies | finding |
| state | big | stay | instead | studies | lead | second | alone | coming | together |
| home | difference | afford | according | related | attention | outside | project | necessary | takes |
| did | likes | looking | trip | those | values | given | whole | areas | thus |
| went | third | city | next | communicate | instead | answer | everyday | difference | means |
| saw | state | look | coming | decisions | communicate | thinking | decision | wants | everyday |
| factors | takes | results | gives | problems | interest | since | group | period | chance |
| enjoyment | down | free | experienced | second | dangerous | free | question | choice | affect |
| contact | giving | common | giving | step | compare | care | here | needs | secondly |
| came | furthermore | saying | thinks | brings | depend | act | three | common | knowing |
| transportation | points | both | still | therefore | experienced | easy | asked | individual | being |
| share | following | giving | imagine | almost | grow | period | brings | third | getting |
| view | pay | everybody | keep | similar | educated | like | single | particular | stress |
| turn | disagree | due | too | saying | mistake | advertisement | satisfied | lack | strong |
| became | issues | happens | daily | gives | usually | both | professional | wanted | form |
| food | topics | love | others | third | child | its | everyone | strong | major |
| back | city | effort | because | down | situation | says | work | stress | effective |
| year | need | speak | quality | same | looking | getting | remember | probably | aware |
| effect | agree | opportunity | makes | love | works | let | kid | conclude | its |
least variable ranks:
low_var_ranks <- long_words %>%
group_by(rank_num) %>%
summarize(var = var(value)) %>%
top_n(-10, var)
kable(low_var_ranks)
| rank_num | var |
|---|---|
| 1 | 0.0000831 |
| 6 | 0.0001830 |
| 9 | 0.0001623 |
| 12 | 0.0001715 |
| 14 | 0.0001826 |
| 15 | 0.0001658 |
| 18 | 0.0001867 |
| 27 | 0.0001687 |
| 28 | 0.0001836 |
| 29 | 0.0001825 |
Top and bottom N words in 10 lease variable ranks
TOP:
N_words <- 20
long_words %>%
filter(rank_num %in% c(low_var_ranks$rank_num)) %>%
group_by(rank_num) %>%
top_n(N_words, value) %>%
arrange(value) %>%
select(rank_num, word) %>%
mutate(n = 1:n()) %>%
spread(rank_num, word) %>%
select(-n) %>%
kable()
| 1 | 6 | 9 | 12 | 14 | 15 | 18 | 27 | 28 | 29 |
|---|---|---|---|---|---|---|---|---|---|
| interesting | these | information | types | when | needed | nowadays | ask | development | cars |
| energy | today | tell | transportation | last | furthermore | news | did | using | technology |
| persons | less | she | children | society | contact | go | ways | effect | see |
| contact | internet | teach | their | father | secondly | using | types | technology | world |
| words | number | of | get | future | act | ways | television | do | teacher |
| actually | changing | then | student | the | persons | years | or | never | books |
| experienced | information | her | cars | saw | coming | companies | use | changing | students |
| compare | then | his | university | wanted | outside | family | cars | of | hours |
| real | such | economy | information | friend | body | teach | technology | these | went |
| act | technology | transportation | companies | went | no | never | advantages | points | factors |
| effort | places | internet | alone | success | finally | was | enjoy | car | day |
| rest | news | media | made | lack | far | friend | doctor | internet | city |
| aware | books | reasons | them | manage | single | me | go | place | human |
| thinking | changes | or | news | she | shows | society | news | modern | she |
| thus | various | development | students | go | task | television | know | various | go |
| needed | living | he | advertisement | he | tend | media | want | television | developing |
| finally | modern | valuable | time | was | takes | he | computers | transportation | developed |
| educated | source | technology | study | year | aware | modern | public | city | never |
| interested | of | modern | internet | or | creativity | she | tv | adult | food |
| depend | transportation | provides | various | using | either | or | internet | teach | did |
BOTTOM:
long_words %>%
filter(rank_num %in% c(low_var_ranks$rank_num)) %>%
group_by(rank_num) %>%
top_n(-N_words, value) %>%
arrange(value) %>%
select(rank_num, word) %>%
mutate(n = 1:n()) %>%
spread(rank_num, word) %>%
select(-n) %>%
kable()
| 1 | 6 | 9 | 12 | 14 | 15 | 18 | 27 | 28 | 29 |
|---|---|---|---|---|---|---|---|---|---|
| internet | conclusion | especially | grow | ever | etc | major | large | off | same |
| he | members | cause | human | therefore | modern | cost | cannot | needed | major |
| of | quality | keep | opportunity | related | had | left | professional | tend | teenager |
| modern | depend | health | turn | difficult | did | thinking | provided | importance | probably |
| today | position | fast | build | rich | types | themselves | values | little | present |
| technology | though | brings | words | cost | problems | deal | whether | times | looking |
| do | between | express | choice | keep | go | free | although | position | impact |
| was | human | difficult | show | energy | city | financially | older | looking | issues |
| she | energy | actually | likes | questions | internet | often | tend | kid | interest |
| go | older | hand | doesn | forget | media | positive | teenager | seen | given |
| cars | interested | cases | thought | means | before | language | third | act | aspect |
| world | questions | values | leave | listen | was | interested | apply | conclusion | persons |
| children | often | based | attitude | cannot | tv | called | follow | poor | finally |
| media | too | between | during | persons | teacher | communicate | quality | build | particular |
| want | increase | step | mean | goal | system | short | finding | else | ever |
| television | look | since | since | definitely | computers | known | actually | difference | grow |
| him | choice | quality | once | effort | public | perfect | often | teaching | educated |
| reasons | myself | experienced | works | books | living | others | interesting | here | words |
| students | thought | depends | energy | result | he | concept | write | though | clear |
| had | here | depend | away | takes | think | second | therefore | brings | called |
What makes sense as the best way to look at the cross of languages and words? I’ve tried some clustering approaches, but it’s not that interpretable. Would be helpful to talk about this.