TENSOR_OUTPUT_FILENAME <- "../tensor_factorization_matlab_stuff/tensor_data_as_mat/output/7_corr_shared_ktensor_output.mat"
tensor_output <- read.mat(TENSOR_OUTPUT_FILENAME)

# the three factor matrices of the decomposition: language, language, word
lang1 <- tensor_output$k[[1]]
lang2 <- tensor_output$k[[2]]
words <- tensor_output$k[[3]]

We start with a 35 languages x 35 languages x 710 words tensor M. The words are the set of words that are common across all 35 languages (when all words are used, including those with missing values, N ≈ 4000, I run into memory issues). The value M[i, j, k] is given by:

We decompose this tensor using the Bayesian approach (code). The rank of the decomposition is 53 (though this is an estimate and varies slightly by run). The output of the decomposition is three factor matrices: language by rank (35 x 53), language by rank (35 x 53), and word by rank (710 x 53).
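To make the structure of the decomposition concrete, here is a minimal sketch of how a single entry of M is approximated from the three factor matrices. This assumes the ktensor's component weights (lambda) have been absorbed into the factors; if they are stored separately in tensor_output, each term would also be scaled by the corresponding weight.

# Sketch: CP reconstruction of one tensor entry, assuming weights are absorbed
L1 <- as.matrix(lang1)
L2 <- as.matrix(lang2)
W  <- as.matrix(words)
approx_entry <- function(i, j, k) sum(L1[i, ] * L2[j, ] * W[k, ])
approx_entry(1, 2, 3)  # the model's approximation of M[1, 2, 3]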

The two language matrices are identical. Here we look at the language matrix first, and then the word matrix.
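A quick sanity check of the first claim (a sketch, not part of the pipeline above):

# Confirm the two language factor matrices are numerically identical
all.equal(as.matrix(lang1), as.matrix(lang2))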

0.1 Language matrix

Here’s what the language matrix looks like:

LANGS <- c('ARA', 'BEN', 'BUL', 'CHI', 'DUT', 'ENG', 'FAS', 'FRE', 'GER', 'GRE', 'GUJ','HIN', 'IBO', 'IND', 'ITA', 'JPN','KAN', 'KOR', 'MAL', 'MAR', 'NEP', 'PAN', 'POL','POR', 'RUM', 'RUS', 'SPA', 'TAM', 'TEL', 'TGL', 'THA', 'TUR', 'URD', 'VIE', 'YOR')

lang_id_df = data.frame(lang = LANGS ,
           lang_id = as.factor(1:35))

long_lang1 = lang1 %>%
    tbl_df() %>%
    rownames_to_column('lang_id') %>%
    gather(rank_num, value, -lang_id) %>%
    mutate(
        lang_id = factor(lang_id, levels = 1:35),
        rank_num = factor(gsub("V", "", rank_num),
                         levels=1:53)) %>%
   left_join(lang_id_df)

ggplot(long_lang1, aes(x = rank_num, y = lang, 
              fill = value) ) +
  geom_tile() +
  scale_fill_gradient2()

Most variable ranks:

long_lang1 %>%
  group_by(rank_num) %>%
  summarize(var = var(value)) %>%
  arrange(-var) %>%
  slice(1:10) %>%
  kable()
rank_num var
45 0.0294117
39 0.0294104
48 0.0292729
44 0.0292628
43 0.0292528
42 0.0292450
51 0.0292368
46 0.0290085
47 0.0288276
49 0.0280370

And here is the dendrogram, which looks very sensible.

all_corrs_mat <- as.matrix(lang1)
rownames(all_corrs_mat) <- LANGS
dist_matrix <- dist(all_corrs_mat)
ggdendro::ggdendrogram(hclust(dist_matrix))

Dendrogram with the last 50% of ranks only:

all_corrs_mat_last <- as.matrix(lang1)[,26:53]
rownames(all_corrs_mat_last) <- LANGS
dist_matrix_last <- dist(all_corrs_mat_last)
ggdendro::ggdendrogram(hclust(dist_matrix_last))

wals_dists <- read_csv("../../../../study2/data/processed/ling_dists/wals_distances.csv")
asjp_dists <- read_csv("../../../../study2/data/processed/ling_dists/asjp_distances.csv")
dplace_dists <- read_csv("../../../../study2/data/processed/ling_dists/dplace_distances.csv") %>%
  mutate(lang1 = tolower(lang1),
         lang2 = tolower(lang2))
eco_dists <- read_csv("../../../../study2/data/processed/ling_dists/eco_distances.csv")


semantic_raw_dists_all <- read_csv("../../../../study2/data/processed/pairwise_lang_distances/HD_centroid_distances_all.csv")  %>%
  as.matrix() %>%
  as.data.frame() %>%
  mutate(lang1 = tolower(colnames(.))) %>%
  # column 36 is the lang1 column just created; gather the 35 language columns
  gather(lang2, essay_centroid_distance, -36) %>%
  mutate(lang2 = tolower(lang2))


tensor_dists <- dist_matrix %>%
  as.matrix() %>%
  as.data.frame() %>%
  rownames_to_column("lang1") %>%
  gather(lang2, tensor_dist, -1) %>%
  mutate(lang1 = tolower(lang1),
         lang2 = tolower(lang2))

tensor_dists_last <- dist_matrix_last %>%
  as.matrix() %>%
  as.data.frame() %>%
  rownames_to_column("lang1") %>%
  gather(lang2, tensor_dist_last, -1) %>%
  mutate(lang1 = tolower(lang1),
         lang2 = tolower(lang2))


# all joins below are by the shared lang1 / lang2 columns
all_dists <- wals_dists %>%
  left_join(asjp_dists) %>%
  left_join(dplace_dists) %>% 
  left_join(eco_dists) %>%
  left_join(semantic_raw_dists_all) %>%
  left_join(tensor_dists) %>%
  left_join(tensor_dists_last)

corr_mat <- cor(all_dists[,c(-1,-2)], 
                use = "pairwise.complete.obs")

p.mat <- corrplot::cor.mtest(all_dists[,c(-1,-2)], 
                  use = "pairwise.complete.obs")$p

cols = rev(colorRampPalette(c("red", "white", "blue"))(100))

corrplot::corrplot(corr_mat, method = "color",  col = cols,
         type = "upper", order = "original", number.cex = .7,
         addCoef.col = "black", 
         p.mat = p.mat, insig = "blank", 
         tl.col = "black", tl.srt = 90,
         diag = FALSE)

# Let's reduce the dimensionality of the languages
clusts <- hclust(dist_matrix)

lang_tree_groups <- cutree(clusts, k = 5)  %>%
  data.frame() %>% 
  rownames_to_column("lang")  %>%
  rename(lang_clust_id = ".")

0.2 Word matrix

Here’s what the word matrix looks like:

WORDS <- read_feather("../data/4_matlab_data/ets_models/word_to_is_shared.feather")

word_id_df = data.frame(word = WORDS$w1 ,
                        word_id = as.factor(1:710))

long_words = words %>%
    tbl_df() %>%
    rownames_to_column('word_id') %>%
    gather(rank_num, value, -word_id) %>%
    mutate(
        word_id = factor(word_id, levels=1:710),
        rank_num = factor(gsub("V", "", rank_num),
                         levels=1:53)) %>%
   left_join(word_id_df)

ggplot(long_words, aes(x = rank_num, y = word, 
              fill = value) ) +
  geom_tile() +
  scale_fill_gradient2() +
  theme(axis.text.y = element_text(size = 2))

As a sanity check, let’s look at the distances between a few words in rank space. They look relatively sensible.

TEST_WORDS <- c("father", "mother", "children", "parents", "young",
                     "book", "books", "easier", "easily", "three")
word_mat <- long_words %>%
  filter(word %in% TEST_WORDS)%>%
  select(word, rank_num, value) %>%
  spread(word, value) %>%
  select(-rank_num) %>%
  as.matrix() %>%
  t()
dist_matrix_word <- dist(word_mat)

ggdendro::ggdendrogram(hclust(dist_matrix_word))

Most variable ranks:

var_ranks <- long_words %>%
  group_by(rank_num) %>%
  summarize(var = var(value)) %>%
  top_n(10, var)

kable(var_ranks)
rank_num var
33 0.0013744
34 0.0013946
41 0.0012964
43 0.0012529
46 0.0013253
47 0.0013475
49 0.0013690
51 0.0013432
52 0.0012926
53 0.0013944

Top and bottom N words in 10 most variable ranks

TOP:

N_words <- 20

long_words %>%
  filter(rank_num %in% c(var_ranks$rank_num)) %>%
  group_by(rank_num) %>%
  top_n(N_words, value) %>%
  arrange(value) %>%
  select(rank_num, word) %>%
  mutate(n = 1:n()) %>%
  spread(rank_num, word) %>%
  select(-n) %>%
  kable()
33 34 41 43 46 47 49 51 52 53
else activities childhood play thing having may useful again close
travelling ability effect those risk just great again etc decision
examples give her purpose coming actually affect afford competition ones
everybody skills whether effective thinking others questions away form education
right bad travelling possible needed given down thought finding situation
coming home teachers states childhood am interested jobs needed decisions
professional idea advertising move always therefore big importance called change
single places traveling cause single creativity strong certain type financial
related project specialize problem moreover now still works therefore friend
fact gain almost since instance development lead thus up again
being useful came big write takes small furthermore into stop
shows told thing question away schools here problems ever contact
provided classes act usually self means activities bad bring results
older end made five move though hear questions step attention
stress understanding experience between difficult step later difference plays family
common respect given studies wrong huge step daily through friends
respect childhood studies right possible thinks effective whether food classes
large advertisement depend points giving stay giving attention quality giving
given helps interest definitely usually goal aware purpose provided keep
parts follow major means position education childhood related cases old

BOTTOM:

long_words %>%
  filter(rank_num %in% c(var_ranks$rank_num)) %>%
  group_by(rank_num) %>%
  top_n(-N_words, value) %>%
  arrange(value) %>%
  select(rank_num, word) %>%
  mutate(n = 1:n()) %>%
  spread(rank_num, word) %>%
  select(-n) %>%
  kable()
33 34 41 43 46 47 49 51 52 53
found everyday says certain major older course together made thinking
company task attention health according studies food working companies finding
state big stay instead studies lead second alone coming together
home difference afford according related attention outside project necessary takes
did likes looking trip those values given whole areas thus
went third city next communicate instead answer everyday difference means
saw state look coming decisions communicate thinking decision wants everyday
factors takes results gives problems interest since group period chance
enjoyment down free experienced second dangerous free question choice affect
contact giving common giving step compare care here needs secondly
came furthermore saying thinks brings depend act three common knowing
transportation points both still therefore experienced easy asked individual being
share following giving imagine almost grow period brings third getting
view pay everybody keep similar educated like single particular stress
turn disagree due too saying mistake advertisement satisfied lack strong
became issues happens daily gives usually both professional wanted form
food topics love others third child its everyone strong major
back city effort because down situation says work stress effective
year need speak quality same looking getting remember probably aware
effect agree opportunity makes love works let kid conclude its

Least variable ranks:

low_var_ranks <- long_words %>%
  group_by(rank_num) %>%
  summarize(var = var(value)) %>%
  top_n(-10, var)

kable(low_var_ranks)
rank_num var
1 0.0000831
6 0.0001830
9 0.0001623
12 0.0001715
14 0.0001826
15 0.0001658
18 0.0001867
27 0.0001687
28 0.0001836
29 0.0001825

Top and bottom N words in 10 least variable ranks

TOP:

N_words <- 20

long_words %>%
  filter(rank_num %in% c(low_var_ranks$rank_num)) %>%
  group_by(rank_num) %>%
  top_n(N_words, value) %>%
  arrange(value) %>%
  select(rank_num, word) %>%
  mutate(n = 1:n()) %>%
  spread(rank_num, word) %>%
  select(-n) %>%
  kable()
1 6 9 12 14 15 18 27 28 29
interesting these information types when needed nowadays ask development cars
energy today tell transportation last furthermore news did using technology
persons less she children society contact go ways effect see
contact internet teach their father secondly using types technology world
words number of get future act ways television do teacher
actually changing then student the persons years or never books
experienced information her cars saw coming companies use changing students
compare then his university wanted outside family cars of hours
real such economy information friend body teach technology these went
act technology transportation companies went no never advantages points factors
effort places internet alone success finally was enjoy car day
rest news media made lack far friend doctor internet city
aware books reasons them manage single me go place human
thinking changes or news she shows society news modern she
thus various development students go task television know various go
needed living he advertisement he tend media want television developing
finally modern valuable time was takes he computers transportation developed
educated source technology study year aware modern public city never
interested of modern internet or creativity she tv adult food
depend transportation provides various using either or internet teach did

BOTTOM:

long_words %>%
  filter(rank_num %in% c(low_var_ranks$rank_num)) %>%
  group_by(rank_num) %>%
  top_n(-N_words, value) %>%
  arrange(value) %>%
  select(rank_num, word) %>%
  mutate(n = 1:n()) %>%
  spread(rank_num, word) %>%
  select(-n) %>%
  kable()
1 6 9 12 14 15 18 27 28 29
internet conclusion especially grow ever etc major large off same
he members cause human therefore modern cost cannot needed major
of quality keep opportunity related had left professional tend teenager
modern depend health turn difficult did thinking provided importance probably
today position fast build rich types themselves values little present
technology though brings words cost problems deal whether times looking
do between express choice keep go free although position impact
was human difficult show energy city financially older looking issues
she energy actually likes questions internet often tend kid interest
go older hand doesn forget media positive teenager seen given
cars interested cases thought means before language third act aspect
world questions values leave listen was interested apply conclusion persons
children often based attitude cannot tv called follow poor finally
media too between during persons teacher communicate quality build particular
want increase step mean goal system short finding else ever
television look since since definitely computers known actually difference grow
him choice quality once effort public perfect often teaching educated
reasons myself experienced works books living others interesting here words
students thought depends energy result he concept write though clear
had here depend away takes think second therefore brings called

0.3 Languages and words

What makes sense as the best way to look at the cross of languages and words? I’ve tried some clustering approaches, but the results are not very interpretable. It would be helpful to talk about this.
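One possibility, sketched below and not part of the analysis above: under the CP structure, a single component’s contribution to the language x word cross is the outer product of its language and word loadings, so one can pick a rank and look at which languages and words load most strongly on it together. The rank chosen here is arbitrary and purely illustrative.

# Sketch: language x word structure contributed by a single component r
r <- 45  # arbitrary example rank
lang_loadings <- setNames(as.matrix(lang1)[, r], LANGS)
word_loadings <- setNames(as.matrix(words)[, r], WORDS$w1)
rank_cross <- outer(lang_loadings, word_loadings)  # 35 x 710 contribution matrix

# languages and words that dominate this component
head(sort(lang_loadings, decreasing = TRUE), 5)
head(sort(word_loadings, decreasing = TRUE), 10)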