TENSOR_OUTPUT_FILENAME <- "../tensor_factorization_matlab_stuff/tensor_data_as_mat/output/7_corr_shared_ktensor_output_WWL.mat"
tensor_output <- read.mat(TENSOR_OUTPUT_FILENAME)

words1 <- tensor_output$k[[1]]
words2 <- tensor_output$k[[2]]
langs <- tensor_output$k[[3]]

The tensorization solution has 615 factors.

0.1 Language matrix

Here’s what the language matrix looks like:

LANGS <- c('ARA', 'BEN', 'BUL', 'CHI', 'DUT', 'ENG', 'FAS', 'FRE', 'GER', 'GRE', 'GUJ','HIN', 'IBO', 'IND', 'ITA', 'JPN','KAN', 'KOR', 'MAL', 'MAR', 'NEP', 'PAN', 'POL','POR', 'RUM', 'RUS', 'SPA', 'TAM', 'TEL', 'TGL', 'THA', 'TUR', 'URD', 'VIE', 'YOR')

lang_id_df = data.frame(lang = LANGS ,
           lang_id = as.factor(1:35))

long_lang = langs %>%
    tbl_df() %>%
    rownames_to_column('lang_id') %>%
    gather(rank_num, value, -lang_id) %>%
    mutate(
        lang_id = factor(lang_id, levels = 1:615),
        rank_num = factor(gsub("V", "", rank_num),
                         levels=1:615)) %>%
   left_join(lang_id_df)

ggplot(long_lang, aes(x = rank_num, y = lang, 
              fill = value) ) +
  geom_tile() +
  scale_fill_gradient2() +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

And, the dendogram, which looks relatively sensible.

all_corrs_mat <- as.matrix(langs)
rownames(all_corrs_mat) <- LANGS
dist_matrix <- dist(all_corrs_mat)
ggdendro::ggdendrogram(hclust(dist_matrix))

0.1.1 Predicting language distances with independent measures

wals_dists <- read_csv("../../../../study2/data/processed/ling_dists/wals_distances.csv")
asjp_dists <- read_csv("../../../../study2/data/processed/ling_dists/asjp_distances.csv")
dplace_dists <- read_csv("../../../../study2/data/processed/ling_dists/dplace_distances.csv") %>%
  mutate(lang1 = tolower(lang1),
         lang2 = tolower(lang2))
eco_dists <- read_csv("../../../../study2/data/processed/ling_dists/eco_distances.csv")


semantic_raw_dists_all <- read_csv("../../../../study2/data/processed/pairwise_lang_distances/HD_centroid_distances_all.csv")  %>%
  as.matrix() %>%
  as.data.frame() %>%
  mutate(lang1 = tolower(colnames(.))) %>%
  gather(lang2, essay_centroid_distance, -36) %>%
    mutate(lang2 = tolower(lang2))


tensor_dists <- dist_matrix %>%
  as.matrix() %>%
  as.data.frame() %>%
  rownames_to_column("lang1") %>%
  gather(lang2, tensor_dist, -1) %>%
  mutate(lang1 = tolower(lang1),
         lang2 = tolower(lang2))

all_dists <- wals_dists %>%
  left_join(asjp_dists) %>%
  left_join(dplace_dists) %>% 
  left_join(eco_dists) %>%
  left_join(semantic_raw_dists_all) %>%
  left_join(tensor_dists)

corr_mat <- cor(all_dists[,c(-1,-2)], 
                use = "pairwise.complete.obs")

p.mat <- corrplot::cor.mtest(all_dists[,c(-1,-2)], 
                  use = "pairwise.complete.obs")$p

cols = rev(colorRampPalette(c("red", "white", "blue"))(100))

corrplot::corrplot(corr_mat, method = "color",  col = cols,
         type = "upper", order = "original", number.cex = .7,
         addCoef.col = "black", 
         p.mat = p.mat, insig = "blank", 
         tl.col = "black", tl.srt = 90,
         diag = FALSE)

0.2 Word matrix

Here’s what the word matrix looks like:

WORDS <- read_feather("../data/4_matlab_data/ets_models/word_to_is_shared.feather")

word_id_df = data.frame(word = WORDS$w1 ,
                        word_id = as.factor(1:710))

long_words = words1 %>%
    tbl_df() %>%
    rownames_to_column('word_id') %>%
    gather(rank_num, value, -word_id) %>%
    mutate(
        word_id = factor(word_id, levels=1:710),
        rank_num = factor(gsub("V", "", rank_num),
                         levels=1:615)) %>%
   left_join(word_id_df)

ggplot(long_words, aes(x = rank_num, y = word, 
              fill = value) ) +
  geom_tile() +
  scale_fill_gradient2() +
  theme(axis.text.y = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank())

As a sanity check, let’s look at the distances between ranks for a few words. They look relatively sensible.

TEST_WORDS <- c("father", "mother", "children", "parents", "young",
                     "book", "books", "easier", "easily", "three")
word_mat <- long_words %>%
  filter(word %in% TEST_WORDS)%>%
  select(word, rank_num, value) %>%
  spread(word, value) %>%
  select(-rank_num) %>%
  as.matrix() %>%
  t()
dist_matrix_word <- dist(word_mat)

ggdendro::ggdendrogram(hclust(dist_matrix_word))

Full word diagram - see “word_diagram.pdf”

0.2.1 Labeled word factors

I hand labeled each factors based on the first 6 words in each. I only labeled ones where there was a relatively sensible label.

top_words <- long_words %>%
  select(-word_id) %>%
  group_by(rank_num) %>%
  arrange(-value) %>%
  slice(1:6) %>%
  mutate(nth_word = 1:n()) %>%
  select(-value) %>%
  spread(nth_word, word)

# write_csv(top_words, "WWL_words.csv")
labeled_ranks <- read_csv( "WWL_words_labeled.csv") %>%
  filter(!is.na(molly_label)) %>%
  select(rank_num, molly_label) %>%
  group_by(molly_label) %>%
  mutate(n = 1:n(),
         lab = paste0(molly_label, n))

Here’s what the word matrix looks like with labeled factors:

long_lang = langs %>%
    tbl_df() %>%
    rownames_to_column('lang_id') %>%
    gather(rank_num, value, -lang_id) %>%
    mutate(
        lang_id = factor(lang_id, levels = 1:615),
        rank_num = factor(gsub("V", "", rank_num),
                         levels=1:615),
        rank_num = as.factor(rank_num)) %>%
   left_join(lang_id_df) %>%
   right_join(labeled_ranks %>% mutate(rank_num = as.factor(rank_num)))

ggplot(long_lang, aes(x = lab, y = lang, 
              fill = value) ) +
  geom_tile() +
  xlab("Molly-labeled factor") +
  scale_fill_gradient2() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0 , size = 5),
        axis.ticks.x = element_blank())