TENSOR_OUTPUT_FILENAME <- "../tensor_factorization_matlab_stuff/tensor_data_as_mat/output/7_corr_shared_ktensor_output.mat"
tensor_output <- read.mat(TENSOR_OUTPUT_FILENAME)

# the three factor matrices of the decomposition: language, language, word
lang1 <- tensor_output$k[[1]]
lang2 <- tensor_output$k[[2]]
words <- tensor_output$k[[3]]

We start with a 35 languages x 35 languages x 710 words tensor M. The words are the set of words that are common across all 35 languages (when all words are used, including those with missing values, N ≈ 4000, I run into memory issues). The value M[i, j, k] is given by:

We decompose this tensor using the Bayesian approach (code). The rank of the decomposition is 53 (though this is an estimate and varies slightly by run). The output of the decomposition is three factor matrices: language by rank (35 x 53), language by rank (35 x 53), and word by rank (710 x 53).
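To make the structure of the decomposition concrete, here is a minimal sketch of how a single entry of M is approximated from the three factor matrices. This assumes the ktensor's component weights (lambda) have been absorbed into the factors; if they are stored separately in tensor_output, each term would also be scaled by the corresponding weight.

# Sketch: CP reconstruction of one tensor entry, assuming weights are absorbed
L1 <- as.matrix(lang1)
L2 <- as.matrix(lang2)
W  <- as.matrix(words)
approx_entry <- function(i, j, k) sum(L1[i, ] * L2[j, ] * W[k, ])
approx_entry(1, 2, 3)  # the model's approximation of M[1, 2, 3]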

The two language matrices are identical. Here we look at the language matrix first, and then the word matrix.
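A quick sanity check of the first claim (a sketch, not part of the pipeline above):

# Confirm the two language factor matrices are numerically identical
all.equal(as.matrix(lang1), as.matrix(lang2))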

0.1 Language matrix

Here’s what the language matrix looks like:

LANGS <- c('ARA', 'BEN', 'BUL', 'CHI', 'DUT', 'ENG', 'FAS', 'FRE', 'GER', 'GRE', 'GUJ','HIN', 'IBO', 'IND', 'ITA', 'JPN','KAN', 'KOR', 'MAL', 'MAR', 'NEP', 'PAN', 'POL','POR', 'RUM', 'RUS', 'SPA', 'TAM', 'TEL', 'TGL', 'THA', 'TUR', 'URD', 'VIE', 'YOR')

lang_id_df = data.frame(lang = LANGS ,
           lang_id = as.factor(1:35))

long_lang1 = lang1 %>%
    tbl_df() %>%
    rownames_to_column('lang_id') %>%
    gather(rank_num, value, -lang_id) %>%
    mutate(
        lang_id = factor(lang_id, levels = 1:35),
        rank_num = factor(gsub("V", "", rank_num),
                         levels=1:53)) %>%
   left_join(lang_id_df)

ggplot(long_lang1, aes(x = rank_num, y = lang, 
              fill = value) ) +
  geom_tile() +
  scale_fill_gradient2()

Most variable ranks:

long_lang1 %>%
  group_by(rank_num) %>%
  summarize(var = var(value)) %>%
  arrange(-var) %>%
  slice(1:10) %>%
  kable()
rank_num var
45 0.0294117
39 0.0294104
48 0.0292729
44 0.0292628
43 0.0292528
42 0.0292450
51 0.0292368
46 0.0290085
47 0.0288276
49 0.0280370

And here is the dendrogram, which looks very sensible.

all_corrs_mat <- as.matrix(lang1)
rownames(all_corrs_mat) <- LANGS
dist_matrix <- dist(all_corrs_mat)
ggdendro::ggdendrogram(hclust(dist_matrix))

Dendrogram with the last 50% of ranks only:

all_corrs_mat_last <- as.matrix(lang1)[,26:53]
rownames(all_corrs_mat_last) <- LANGS
dist_matrix_last <- dist(all_corrs_mat_last)
ggdendro::ggdendrogram(hclust(dist_matrix_last))

wals_dists <- read_csv("../../../../study2/data/processed/ling_dists/wals_distances.csv")
asjp_dists <- read_csv("../../../../study2/data/processed/ling_dists/asjp_distances.csv")
dplace_dists <- read_csv("../../../../study2/data/processed/ling_dists/dplace_distances.csv") %>%
  mutate(lang1 = tolower(lang1),
         lang2 = tolower(lang2))
eco_dists <- read_csv("../../../../study2/data/processed/ling_dists/eco_distances.csv")


semantic_raw_dists_all <- read_csv("../../../../study2/data/processed/pairwise_lang_distances/HD_centroid_distances_all.csv")  %>%
  as.matrix() %>%
  as.data.frame() %>%
  mutate(lang1 = tolower(colnames(.))) %>%
  # column 36 is the lang1 column just created; gather the 35 language columns
  gather(lang2, essay_centroid_distance, -36) %>%
  mutate(lang2 = tolower(lang2))


tensor_dists <- dist_matrix %>%
  as.matrix() %>%
  as.data.frame() %>%
  rownames_to_column("lang1") %>%
  gather(lang2, tensor_dist, -1) %>%
  mutate(lang1 = tolower(lang1),
         lang2 = tolower(lang2))

tensor_dists_last <- dist_matrix_last %>%
  as.matrix() %>%
  as.data.frame() %>%
  rownames_to_column("lang1") %>%
  gather(lang2, tensor_dist_last, -1) %>%
  mutate(lang1 = tolower(lang1),
         lang2 = tolower(lang2))


# all joins below are by the shared lang1 / lang2 columns
all_dists <- wals_dists %>%
  left_join(asjp_dists) %>%
  left_join(dplace_dists) %>% 
  left_join(eco_dists) %>%
  left_join(semantic_raw_dists_all) %>%
  left_join(tensor_dists) %>%
  left_join(tensor_dists_last)

corr_mat <- cor(all_dists[,c(-1,-2)], 
                use = "pairwise.complete.obs")

p.mat <- corrplot::cor.mtest(all_dists[,c(-1,-2)], 
                  use = "pairwise.complete.obs")$p

cols = rev(colorRampPalette(c("red", "white", "blue"))(100))

corrplot::corrplot(corr_mat, method = "color",  col = cols,
         type = "upper", order = "original", number.cex = .7,
         addCoef.col = "black", 
         p.mat = p.mat, insig = "blank", 
         tl.col = "black", tl.srt = 90,
         diag = FALSE)

# Let's reduce the dimensionality of the languages
clusts <- hclust(dist_matrix)

lang_tree_groups <- cutree(clusts, k = 5)  %>%
  data.frame() %>% 
  rownames_to_column("lang")  %>%
  rename(lang_clust_id = ".")

0.2 Word matrix

Here’s what the word matrix looks like:

WORDS <- read_feather("../data/4_matlab_data/ets_models/word_to_is_shared.feather")

word_id_df = data.frame(word = WORDS$w1 ,
                        word_id = as.factor(1:710))

long_words = words %>%
    tbl_df() %>%
    rownames_to_column('word_id') %>%
    gather(rank_num, value, -word_id) %>%
    mutate(
        word_id = factor(word_id, levels=1:710),
        rank_num = factor(gsub("V", "", rank_num),
                         levels=1:53)) %>%
   left_join(word_id_df)

ggplot(long_words, aes(x = rank_num, y = word, 
              fill = value) ) +
  geom_tile() +
  scale_fill_gradient2() +
  theme(axis.text.y = element_text(size = 2))

As a sanity check, let’s look at the distances between a few words in rank space. They look relatively sensible.

TEST_WORDS <- c("father", "mother", "children", "parents", "young",
                     "book", "books", "easier", "easily", "three")
word_mat <- long_words %>%
  filter(word %in% TEST_WORDS)%>%
  select(word, rank_num, value) %>%
  spread(word, value) %>%
  select(-rank_num) %>%
  as.matrix() %>%
  t()
dist_matrix_word <- dist(word_mat)

ggdendro::ggdendrogram(hclust(dist_matrix_word))

Most variable ranks:

var_ranks <- long_words %>%
  group_by(rank_num) %>%
  summarize(var = var(value)) %>%
  top_n(10, var)

kable(var_ranks)
rank_num var
33 0.0013744
34 0.0013946
41 0.0012964
43 0.0012529
46 0.0013253
47 0.0013475
49 0.0013690
51 0.0013432
52 0.0012926
53 0.0013944

Top and bottom N words in 10 most variable ranks

TOP:

N_words <- 20

long_words %>%
  filter(rank_num %in% c(var_ranks$rank_num)) %>%
  group_by(rank_num) %>%
  top_n(N_words, value) %>%
  arrange(value) %>%
  select(rank_num, word) %>%
  mutate(n = 1:n()) %>%
  spread(rank_num, word) %>%
  select(-n) %>%
  kable()
33 34 41 43 46 47 49 51 52 53
else activities childhood play thing having may useful again close
travelling ability effect those risk just great again etc decision
examples give her purpose coming actually affect afford competition ones
everybody skills whether effective thinking others questions away form education
right bad travelling possible needed given down thought finding situation
coming home teachers states childhood am interested jobs needed decisions
professional idea advertising move always therefore big importance called change
single places traveling cause single creativity strong certain type financial
related project specialize problem moreover now still works therefore friend
fact gain almost since instance development lead thus up again
being useful came big write takes small furthermore into stop
shows told thing question away schools here problems ever contact
provided classes act usually self means activities bad bring results
older end made five move though hear questions step attention
stress understanding experience between difficult step later difference plays family
common respect given studies wrong huge step daily through friends
respect childhood studies right possible thinks effective whether food classes
large advertisement depend points giving stay giving attention quality giving
given helps interest definitely usually goal aware purpose provided keep
parts follow major means position education childhood related cases old

BOTTOM:

long_words %>%
  filter(rank_num %in% c(var_ranks$rank_num)) %>%
  group_by(rank_num) %>%
  top_n(-N_words, value) %>%
  arrange(value) %>%
  select(rank_num, word) %>%
  mutate(n = 1:n()) %>%
  spread(rank_num, word) %>%
  select(-n) %>%
  kable()
33 34 41 43 46 47 49 51 52 53
found everyday says certain major older course together made thinking
company task attention health according studies food working companies finding
state big stay instead studies lead second alone coming together
home difference afford according related attention outside project necessary takes
did likes looking trip those values given whole areas thus
went third city next communicate instead answer everyday difference means
saw state look coming decisions communicate thinking decision wants everyday
factors takes results gives problems interest since group period chance
enjoyment down free experienced second dangerous free question choice affect
contact giving common giving step compare care here needs secondly
came furthermore saying thinks brings depend act three common knowing
transportation points both still therefore experienced easy asked individual being
share following giving imagine almost grow period brings third getting
view pay everybody keep similar educated like single particular stress
turn disagree due too saying mistake advertisement satisfied lack strong
became issues happens daily gives usually both professional wanted form
food topics love others third child its everyone strong major
back city effort because down situation says work stress effective
year need speak quality same looking getting remember probably aware
effect agree opportunity makes love works let kid conclude its

Least variable ranks:

low_var_ranks <- long_words %>%
  group_by(rank_num) %>%
  summarize(var = var(value)) %>%
  top_n(-10, var)

kable(low_var_ranks)
rank_num var
1 0.0000831
6 0.0001830
9 0.0001623
12 0.0001715
14 0.0001826
15 0.0001658
18 0.0001867
27 0.0001687
28 0.0001836
29 0.0001825

Top and bottom N words in 10 least variable ranks

TOP:

N_words <- 20

long_words %>%
  filter(rank_num %in% c(low_var_ranks$rank_num)) %>%
  group_by(rank_num) %>%
  top_n(N_words, value) %>%
  arrange(value) %>%
  select(rank_num, word) %>%
  mutate(n = 1:n()) %>%
  spread(rank_num, word) %>%
  select(-n) %>%
  kable()
1 6 9 12 14 15 18 27 28 29
interesting these information types when needed nowadays ask development cars
energy today tell transportation last furthermore news did using technology
persons less she children society contact go ways effect see
contact internet teach their father secondly using types technology world
words number of get future act ways television do teacher
actually changing then student the persons years or never books
experienced information her cars saw coming companies use changing students
compare then his university wanted outside family cars of hours
real such economy information friend body teach technology these went
act technology transportation companies went no never advantages points factors
effort places internet alone success finally was enjoy car day
rest news media made lack far friend doctor internet city
aware books reasons them manage single me go place human
thinking changes or news she shows society news modern she
thus various development students go task television know various go
needed living he advertisement he tend media want television developing
finally modern valuable time was takes he computers transportation developed
educated source technology study year aware modern public city never
interested of modern internet or creativity she tv adult food
depend transportation provides various using either or internet teach did

BOTTOM:

long_words %>%
  filter(rank_num %in% c(low_var_ranks$rank_num)) %>%
  group_by(rank_num) %>%
  top_n(-N_words, value) %>%
  arrange(value) %>%
  select(rank_num, word) %>%
  mutate(n = 1:n()) %>%
  spread(rank_num, word) %>%
  select(-n) %>%
  kable()
1 6 9 12 14 15 18 27 28 29
internet conclusion especially grow ever etc major large off same
he members cause human therefore modern cost cannot needed major
of quality keep opportunity related had left professional tend teenager
modern depend health turn difficult did thinking provided importance probably
today position fast build rich types themselves values little present
technology though brings words cost problems deal whether times looking
do between express choice keep go free although position impact
was human difficult show energy city financially older looking issues
she energy actually likes questions internet often tend kid interest
go older hand doesn forget media positive teenager seen given
cars interested cases thought means before language third act aspect
world questions values leave listen was interested apply conclusion persons
children often based attitude cannot tv called follow poor finally
media too between during persons teacher communicate quality build particular
want increase step mean goal system short finding else ever
television look since since definitely computers known actually difference grow
him choice quality once effort public perfect often teaching educated
reasons myself experienced works books living others interesting here words
students thought depends energy result he concept write though clear
had here depend away takes think second therefore brings called

0.3 Languages and words

What makes sense as the best way to look at the cross of languages and words? I’ve tried some clustering approaches, but the results are not very interpretable. It would be helpful to talk about this.
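One possibility, sketched below and not part of the analysis above: under the CP structure, a single component’s contribution to the language x word cross is the outer product of its language and word loadings, so one can pick a rank and look at which languages and words load most strongly on it together. The rank chosen here is arbitrary and purely illustrative.

# Sketch: language x word structure contributed by a single component r
r <- 45  # arbitrary example rank
lang_loadings <- setNames(as.matrix(lang1)[, r], LANGS)
word_loadings <- setNames(as.matrix(words)[, r], WORDS$w1)
rank_cross <- outer(lang_loadings, word_loadings)  # 35 x 710 contribution matrix

# languages and words that dominate this component
head(sort(lang_loadings, decreasing = TRUE), 5)
head(sort(word_loadings, decreasing = TRUE), 10)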