0.0.1 Lang-Lang-Words correlations

Read in lang-pairs word correlations

# Word-level correlations for each language pair. Column names are supplied
# explicitly, so readr treats every line of the file as data.
FILENAME <- "lang_pairwise_word_rank_correlations.csv"
corr_df <- read_csv(
  file = FILENAME,
  col_names = c("word", "statistic", "p_value", "estimate", "lang1", "lang2")
) %>%
  # expose the correlation estimate under a more descriptive name
  rename(r_coefficient = estimate) %>%
  # character columns (word and the two language codes) become factors
  mutate_if(is.character, as.factor) %>%
  # language pair and word first, remaining columns in their existing order
  select(lang1, lang2, word, everything())

Look at distribution of correlations across words and languages.

Its mean is slightly – but reliably – above zero.

# Histogram of the word-level correlation coefficients with a red reference
# line at zero.
ggplot(corr_df, aes(x = r_coefficient)) +
  geom_histogram() +
  # Pass the intercept as a fixed parameter: a constant mapped inside aes()
  # is recycled over every row of corr_df, drawing one line per row.
  geom_vline(xintercept = 0, color = "red") +
  theme_classic()

# One-sample t-test of the correlation coefficients; with no further
# arguments t.test() runs a two-sided test against a true mean of 0.
t.test(corr_df$r_coefficient)
## 
##  One Sample t-test
## 
## data:  corr_df$r_coefficient
## t = 76.061, df = 447300, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.004515396 0.004754259
## sample estimates:
##   mean of x 
## 0.004634827

All language pairs have at least some words that are correlated.

Here I’m keeping only those correlations that are reliable at the alpha = .05 level.

# Interactive table of how many words per language pair have a correlation
# with p < .05, sorted so the pairs with the fewest such words come first.
DT::datatable(
  corr_df %>%
    filter(p_value < .05) %>%
    count(lang1, lang2) %>%
    arrange(n)
)

0.0.2 Lang-Lang correlations

# Language-level pairwise correlations. Column names are supplied explicitly,
# so readr treats every line of the file as data.
LANG_LANG_OUTPUT_FILENAME <- "lang_pairwise_rank_correlations.csv"
lang_lang_corrs <- read_csv(
  file = LANG_LANG_OUTPUT_FILENAME,
  col_names = c(
    "lang1", "lang2",
    "r_coefficient", "statistic", "p.value",
    "conf_low", "conf_high"
  )
)

# Histogram of the language-pair correlation coefficients with a red
# reference line at zero.
ggplot(lang_lang_corrs, aes(x = r_coefficient)) +
  geom_histogram() +
  # Fixed parameter rather than aes(): a constant aesthetic would be recycled
  # over every row of the data and drawn once per row.
  geom_vline(xintercept = 0, color = "red") +
  theme_classic()

# Histogram of the language-pair p-values with red reference lines at 0 and
# at the .05 threshold.
ggplot(lang_lang_corrs, aes(x = p.value)) +
  geom_histogram() +
  # Both intercepts as one fixed parameter vector: constants mapped through
  # aes() would be recycled over every row of the data and drawn once per row.
  geom_vline(xintercept = c(0, .05), color = "red") +
  theme_classic()

Top N similar and dissimilar word table

# Lookup table from ETS language code to language name.
lang_names <- select(
  read_csv("../../../data/processed/lang_names/ETS_abbrev_to_names.csv"),
  ETS_code, ETS_language_name
)

# Vertical offset added to / subtracted from a row position when placing the
# two word labels of a language pair.
VOFFSET <- .12

# Most similar: for every pair of distinct languages, keep up to two words
# with the largest positive correlation that is reliable at p < .05.
top_critical_words <- corr_df %>%
  filter(p_value < .05,
         r_coefficient > 0,
         lang1 != lang2) %>%
  group_by(lang1, lang2) %>%
  arrange(desc(r_coefficient)) %>%
  slice(1:2) %>% # the two highest correlations within each pair
  mutate(n = row_number(), # rank within the pair; unlike 1:n(), safe when empty
         lang1_numeric = as.numeric(lang1), # factor level index
         lang2_numeric = as.numeric(lang2),
         # place the two labels just above and just below the row position
         word_height = case_when(
           n == 1 ~ lang2_numeric + VOFFSET,
           n == 2 ~ lang2_numeric - VOFFSET),
         type = "similiar") %>%
  # drop the grouping so it does not carry over into later steps
  ungroup()

# Most dissimilar: for every pair of distinct languages, keep up to two words
# with the most negative correlation that is reliable at p < .05. The two
# languages (and their numeric indices) are then swapped relative to
# top_critical_words.
bottom_critical_words <- corr_df %>%
  filter(p_value < .05,
         r_coefficient < 0,
         lang1 != lang2) %>%
  group_by(lang1, lang2) %>%
  arrange(r_coefficient) %>%
  slice(1:2) %>% # first two after the ascending sort, i.e. the most negative
  mutate(n = row_number(), # rank within the pair; unlike 1:n(), safe when empty
         lang1_numeric = as.numeric(lang2), # indices are deliberately crossed
         lang2_numeric = as.numeric(lang1),
         temp1 = lang1,
         temp2 = lang2) %>%
  ungroup() %>%
  # remove the originals first so the temp columns can take over their names
  select(-lang1, -lang2) %>%
  rename(lang1 = temp2,
         lang2 = temp1) %>%
  # place the two labels just above and just below the row position
  mutate(word_height = case_when(
           n == 1 ~ lang2_numeric + VOFFSET,
           n == 2 ~ lang2_numeric - VOFFSET),
         type = "dissimilar")
  
# Stack the similar and dissimilar word sets, then attach the language name
# for each side of the pair. The lookup table is renamed before each join so
# the joined column arrives with its final name.
critical_words <- bind_rows(top_critical_words, bottom_critical_words) %>%
  left_join(rename(lang_names, ETS_language_name1 = ETS_language_name),
            by = c("lang1" = "ETS_code")) %>%
  left_join(rename(lang_names, ETS_language_name2 = ETS_language_name),
            by = c("lang2" = "ETS_code"))

# Raster coordinates for one orientation of each language pair.
# NOTE(review): the "+ 1" on y presumably compensates for lang2 having a
# different set of factor levels than lang1 - confirm against the input file.
top_lang_lang_corrs_clean <- lang_lang_corrs %>%
  mutate(lang1_numeric = as.numeric(as.factor(lang1)),
         lang2_numeric = as.numeric(as.factor(lang2))) %>%
  mutate(x = lang1_numeric,
         y = lang2_numeric + 1) %>%
  select(lang1, lang2, r_coefficient,
         lang1_numeric, lang2_numeric,
         x, y)

# Transposed copy of top_lang_lang_corrs_clean: x and y are built from the
# opposite language index and the two numeric index columns are swapped.
bottom_lang_lang_corrs_clean <- top_lang_lang_corrs_clean %>%
  mutate(x = lang2_numeric + 1,
         y = lang1_numeric,
         # Swap through a single scratch column. Renaming temp columns onto
         # lang1_numeric / lang2_numeric while those columns still exist is
         # rejected by dplyr as duplicate names.
         swap_tmp = lang1_numeric,
         lang1_numeric = lang2_numeric,
         lang2_numeric = swap_tmp) %>%
  select(-swap_tmp)

# Both orientations of every language pair in a single table for the raster.
lang_lang_corrs_clean <- bind_rows(
  top_lang_lang_corrs_clean,
  bottom_lang_lang_corrs_clean
)

#pdf("word_table.pdf", width = 13, height = 8.5)   
# Heat map of the language-pair correlations with the selected words drawn as
# text on top; both axes take their labels from the language names in
# critical_words.
ggplot() +
  geom_raster(data = lang_lang_corrs_clean,
              mapping = aes(x = x, y = y, fill = r_coefficient)) +
  geom_text(data = critical_words,
            mapping = aes(x = lang1_numeric, y = word_height, label = word),
            size = 1.5) +
  scale_x_discrete(name = "",
                   limits = unique(critical_words$ETS_language_name1)) +
  scale_y_discrete(name = "",
                   limits = unique(critical_words$ETS_language_name1)) +
  scale_fill_gradient(name = "Rank Correlation \n(Pearson's r)",
                      low = "white", high = "red") +
  theme_classic() +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 45, hjust = 1),
        axis.ticks = element_blank())

#dev.off()