TENSOR_OUTPUT_FILENAME <- "../tensor_factorization_matlab_stuff/tensor_data_as_mat/output/7_corr_shared_ktensor_output_WWL.mat"
tensor_output <- read.mat(TENSOR_OUTPUT_FILENAME)
words1 <- tensor_output$k[[1]]
words2 <- tensor_output$k[[2]]
langs <- tensor_output$k[[3]]
The tensorization solution has 615 factors.
Here’s what the language matrix looks like:
LANGS <- c('ARA', 'BEN', 'BUL', 'CHI', 'DUT', 'ENG', 'FAS', 'FRE', 'GER', 'GRE', 'GUJ','HIN', 'IBO', 'IND', 'ITA', 'JPN','KAN', 'KOR', 'MAL', 'MAR', 'NEP', 'PAN', 'POL','POR', 'RUM', 'RUS', 'SPA', 'TAM', 'TEL', 'TGL', 'THA', 'TUR', 'URD', 'VIE', 'YOR')
lang_id_df = data.frame(lang = LANGS ,
lang_id = as.factor(1:35))
long_lang = langs %>%
tbl_df() %>%
rownames_to_column('lang_id') %>%
gather(rank_num, value, -lang_id) %>%
mutate(
lang_id = factor(lang_id, levels = 1:615),
rank_num = factor(gsub("V", "", rank_num),
levels=1:615)) %>%
left_join(lang_id_df)
ggplot(long_lang, aes(x = rank_num, y = lang,
fill = value) ) +
geom_tile() +
scale_fill_gradient2() +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank())
And, the dendogram, which looks relatively sensible.
all_corrs_mat <- as.matrix(langs)
rownames(all_corrs_mat) <- LANGS
dist_matrix <- dist(all_corrs_mat)
ggdendro::ggdendrogram(hclust(dist_matrix))
wals_dists <- read_csv("../../../../study2/data/processed/ling_dists/wals_distances.csv")
asjp_dists <- read_csv("../../../../study2/data/processed/ling_dists/asjp_distances.csv")
dplace_dists <- read_csv("../../../../study2/data/processed/ling_dists/dplace_distances.csv") %>%
mutate(lang1 = tolower(lang1),
lang2 = tolower(lang2))
eco_dists <- read_csv("../../../../study2/data/processed/ling_dists/eco_distances.csv")
semantic_raw_dists_all <- read_csv("../../../../study2/data/processed/pairwise_lang_distances/HD_centroid_distances_all.csv") %>%
as.matrix() %>%
as.data.frame() %>%
mutate(lang1 = tolower(colnames(.))) %>%
gather(lang2, essay_centroid_distance, -36) %>%
mutate(lang2 = tolower(lang2))
tensor_dists <- dist_matrix %>%
as.matrix() %>%
as.data.frame() %>%
rownames_to_column("lang1") %>%
gather(lang2, tensor_dist, -1) %>%
mutate(lang1 = tolower(lang1),
lang2 = tolower(lang2))
all_dists <- wals_dists %>%
left_join(asjp_dists) %>%
left_join(dplace_dists) %>%
left_join(eco_dists) %>%
left_join(semantic_raw_dists_all) %>%
left_join(tensor_dists)
corr_mat <- cor(all_dists[,c(-1,-2)],
use = "pairwise.complete.obs")
p.mat <- corrplot::cor.mtest(all_dists[,c(-1,-2)],
use = "pairwise.complete.obs")$p
cols = rev(colorRampPalette(c("red", "white", "blue"))(100))
corrplot::corrplot(corr_mat, method = "color", col = cols,
type = "upper", order = "original", number.cex = .7,
addCoef.col = "black",
p.mat = p.mat, insig = "blank",
tl.col = "black", tl.srt = 90,
diag = FALSE)
Here’s what the word matrix looks like:
WORDS <- read_feather("../data/4_matlab_data/ets_models/word_to_is_shared.feather")
word_id_df = data.frame(word = WORDS$w1 ,
word_id = as.factor(1:710))
long_words = words1 %>%
tbl_df() %>%
rownames_to_column('word_id') %>%
gather(rank_num, value, -word_id) %>%
mutate(
word_id = factor(word_id, levels=1:710),
rank_num = factor(gsub("V", "", rank_num),
levels=1:615)) %>%
left_join(word_id_df)
ggplot(long_words, aes(x = rank_num, y = word,
fill = value) ) +
geom_tile() +
scale_fill_gradient2() +
theme(axis.text.y = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank())
As a sanity check, let’s look at the distances between ranks for a few words. They look relatively sensible.
TEST_WORDS <- c("father", "mother", "children", "parents", "young",
"book", "books", "easier", "easily", "three")
word_mat <- long_words %>%
filter(word %in% TEST_WORDS)%>%
select(word, rank_num, value) %>%
spread(word, value) %>%
select(-rank_num) %>%
as.matrix() %>%
t()
dist_matrix_word <- dist(word_mat)
ggdendro::ggdendrogram(hclust(dist_matrix_word))
Full word diagram - see “word_diagram.pdf”
I hand labeled each factors based on the first 6 words in each. I only labeled ones where there was a relatively sensible label.
top_words <- long_words %>%
select(-word_id) %>%
group_by(rank_num) %>%
arrange(-value) %>%
slice(1:6) %>%
mutate(nth_word = 1:n()) %>%
select(-value) %>%
spread(nth_word, word)
# write_csv(top_words, "WWL_words.csv")
labeled_ranks <- read_csv( "WWL_words_labeled.csv") %>%
filter(!is.na(molly_label)) %>%
select(rank_num, molly_label) %>%
group_by(molly_label) %>%
mutate(n = 1:n(),
lab = paste0(molly_label, n))
Here’s what the word matrix looks like with labeled factors:
long_lang = langs %>%
tbl_df() %>%
rownames_to_column('lang_id') %>%
gather(rank_num, value, -lang_id) %>%
mutate(
lang_id = factor(lang_id, levels = 1:615),
rank_num = factor(gsub("V", "", rank_num),
levels=1:615),
rank_num = as.factor(rank_num)) %>%
left_join(lang_id_df) %>%
right_join(labeled_ranks %>% mutate(rank_num = as.factor(rank_num)))
ggplot(long_lang, aes(x = lab, y = lang,
fill = value) ) +
geom_tile() +
xlab("Molly-labeled factor") +
scale_fill_gradient2() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0 , size = 5),
axis.ticks.x = element_blank())