library(knitr)

# Global knitr chunk defaults for this document: echo the code, hide messages
# and warnings, and turn off the error, cache and tidy options.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned, whereas TRUE and FALSE are reserved words.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)

library(tidyverse)
library(corrplot)
library(feather)
library(broom)

# Make theme_classic with base_size = 10 the session-wide default ggplot2
# theme; plots that add their own theme layer override it.
theme_set(theme_classic(base_size = 10))

Comparison of pairwise distance measures computed with Wikipedia fastText models on Bible and Europarl words. Each language contributes a sample of 5000 Bible words and 100 Europarl words (the exact number varies by language).

Europarl vs. Bible pairwise distances

# Europarl distance summaries. Column names are supplied explicitly, and only
# the language code and the empirical mean are kept, the latter renamed to
# europarl_mean so it can sit next to the Bible mean after joining.
euro_dists <- read_csv(
  "europarl_distance_summaries.csv",
  col_names = c(
    "n_words", "empirical_mean", "ci_lower", "boot_mean", "ci_upper",
    "median", "sd", "cv", "wiki_lang_code"
  )
) %>%
  select(wiki_lang_code, europarl_mean = empirical_mean)


# Bible distance summaries (column names supplied explicitly), with the
# Europarl mean attached per language. Languages without a Europarl entry keep
# their row and get NA in europarl_mean.
# The join key is spelled out rather than left to dplyr's natural-join
# guess, so an extra shared column name in either table cannot silently
# change how rows are matched.
bible_dists <- read_csv(
  "bible_distance_5000_summaries.csv",
  col_names = c("empirical_mean", "median", "sd", "wiki_lang_code", "cv")
) %>%
  left_join(euro_dists, by = "wiki_lang_code")

# Scatter plot of Europarl mean distance against Bible mean distance, with a
# linear fit (method = "lm"), restricted to languages that have a Europarl
# value.
bible_dists %>%
  filter(!is.na(europarl_mean)) %>%
  ggplot(mapping = aes(x = empirical_mean, y = europarl_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "Bible mean distance", y = "Europarl mean distance")

Pairwise distances and rhsw

Bible

All langs

# Join the information-density table with the Bible distance summaries (its
# `639-1` column is matched against wiki_lang_code), keep a subset of columns,
# and drop rows that have no Bible mean distance.
# NOTE(review): select() picks columns by position (74, 63, 77:81), so the
# result depends on the column order of the CSV and of bible_dists. The plots
# further down read `639-1`, rhsw, empirical_mean and cv from df, so those
# must be among the selected columns -- presumably 74/63 are `639-1`/rhsw and
# 77:81 are the columns added by the join; TODO confirm and select by name.
df <- read_csv("../../data/information_density_from_pete.csv") %>%
  left_join(bible_dists, by = c(`639-1` = "wiki_lang_code")) %>%
  select(74, 63, 77:81) %>%
  filter(!is.na(empirical_mean))


# Bible mean pairwise distance against rhsw, each point drawn as a label
# carrying the `639-1` language code, with a linear fit (method = "lm").
# theme_classic() is applied explicitly with its default settings.
df %>%
  ggplot(mapping = aes(x = rhsw, y = empirical_mean)) +
  theme_classic() +
  geom_label(mapping = aes(label = `639-1`)) +
  geom_smooth(method = "lm")

# The cv column of the Bible distance summaries against rhsw, each point
# drawn as a label carrying the `639-1` language code, with a linear fit
# (method = "lm").
df %>%
  ggplot(mapping = aes(x = rhsw, y = cv)) +
  theme_classic() +
  geom_label(mapping = aes(label = `639-1`)) +
  geom_smooth(method = "lm")

Excluding outliers

# Same rhsw vs. Bible mean distance plot, leaving out the four language codes
# treated as outliers in this section (ja, zh, mr, yo).
df %>%
  filter(!(`639-1` %in% c("ja", "zh", "mr", "yo"))) %>%
  ggplot(mapping = aes(x = rhsw, y = empirical_mean)) +
  theme_classic() +
  geom_label(mapping = aes(label = `639-1`)) +
  geom_smooth(method = "lm")