Read in lang-pairs word correlations
# Load the word-level pairwise correlations. Column names are supplied through
# `col_names` rather than taken from the file.
FILENAME <- "lang_pairwise_word_rank_correlations.csv"
corr_df <- FILENAME %>%
  read_csv(col_names = c("word", "statistic", "p_value",
                         "estimate", "lang1", "lang2")) %>%
  # Character columns are converted to factors.
  mutate_if(is.character, as.factor) %>%
  rename(r_coefficient = estimate) %>%
  # Put the language pair and the word first; keep every other column after.
  select(lang1, lang2, word, everything())
Look at distribution of correlations across words and languages.
Its mean is slightly - but reliably - above zero.
# Histogram of the word-level correlation coefficients with a red reference
# line at zero.
ggplot(corr_df, aes(x = r_coefficient)) +
  # bins = 30 is the geom_histogram() default, written out explicitly.
  geom_histogram(bins = 30) +
  # The constant intercept is passed as a parameter instead of through aes():
  # inside aes() the layer inherits corr_df and draws one line per data row.
  geom_vline(xintercept = 0, color = "red") +
  theme_classic()
# One-sample, two-sided t-test of the mean correlation against zero
# (these are the t.test() defaults, spelled out).
t.test(corr_df$r_coefficient, mu = 0, alternative = "two.sided")
##
## One Sample t-test
##
## data: corr_df$r_coefficient
## t = 76.061, df = 447300, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 0.004515396 0.004754259
## sample estimates:
## mean of x
## 0.004634827
All language pairs have at least some words that are correlated.
Here I’m filtering by only those correlations that are reliable at the alpha = .05 level.
# Per language pair, count the words whose correlation has p < .05 and show
# the counts in an interactive table, smallest count first.
corr_df %>%
  filter(p_value < 0.05) %>%
  group_by(lang1, lang2) %>%
  tally() %>%
  ungroup() %>%
  arrange(n) %>%
  DT::datatable()
# Load the language-level pairwise correlations. Column names are supplied
# through `col_names` rather than taken from the file.
LANG_LANG_OUTPUT_FILENAME <- "lang_pairwise_rank_correlations.csv"
lang_lang_corrs <- read_csv(
  file = LANG_LANG_OUTPUT_FILENAME,
  col_names = c(
    "lang1", "lang2",
    "r_coefficient", "statistic", "p.value",
    "conf_low", "conf_high"
  )
)
# Histogram of the language-level correlation coefficients with a red
# reference line at zero.
ggplot(lang_lang_corrs, aes(x = r_coefficient)) +
  # bins = 30 is the geom_histogram() default, written out explicitly.
  geom_histogram(bins = 30) +
  # The constant intercept is a parameter, not an aesthetic: inside aes() the
  # layer inherits lang_lang_corrs and draws one line per data row.
  geom_vline(xintercept = 0, color = "red") +
  theme_classic()
# Histogram of the language-level p-values with red reference lines at 0 and
# at the .05 threshold.
ggplot(lang_lang_corrs, aes(x = p.value)) +
  # bins = 30 is the geom_histogram() default, written out explicitly.
  geom_histogram(bins = 30) +
  # Constant intercepts are parameters, not aesthetics: inside aes() each
  # layer inherits lang_lang_corrs and draws one line per data row.
  geom_vline(xintercept = 0, color = "red") +
  geom_vline(xintercept = .05, color = "red") +
  theme_classic()
Top N similar and dissimilar word table
# Lookup table from ETS language code to language name.
lang_names <- "../../../data/processed/lang_names/ETS_abbrev_to_names.csv" %>%
  read_csv() %>%
  select(ETS_code, ETS_language_name)

# Vertical offset added to / subtracted from a cell's y position so that the
# two words of a language pair do not overlap.
VOFFSET <- 0.12
# Most similar words: for every pair of distinct languages, the (up to) two
# words with the largest positive correlation among those with p < .05.
top_critical_words <- corr_df %>%
  filter(p_value < .05,
         r_coefficient > 0,
         lang1 != lang2) %>%
  group_by(lang1, lang2) %>%
  arrange(desc(r_coefficient)) %>%
  slice(1:2) %>%                  # first two rows of each language pair
  mutate(n = row_number()) %>%    # within-pair rank (1 or 2)
  # Drop the grouping so the result does not stay grouped downstream.
  ungroup() %>%
  mutate(lang1_numeric = as.numeric(lang1),   # factor codes used as positions
         lang2_numeric = as.numeric(lang2),
         # First word sits slightly above the cell centre, second slightly below.
         word_height = case_when(
           n == 1 ~ lang2_numeric + VOFFSET,
           n == 2 ~ lang2_numeric - VOFFSET),
         # Spelling of this data value is kept unchanged on purpose.
         type = "similiar")
# Most dissimilar words: for every pair of distinct languages, the (up to) two
# words with the most negative correlation among those with p < .05.
# lang1/lang2 (and their numeric codes) are swapped relative to
# top_critical_words, so these words are positioned at the transposed
# coordinates.
bottom_critical_words <- corr_df %>%
  filter(p_value < .05,
         r_coefficient < 0,
         lang1 != lang2) %>%
  group_by(lang1, lang2) %>%
  arrange(r_coefficient) %>%
  slice(1:2) %>%                  # two most negative rows of each pair
  mutate(n = row_number()) %>%    # within-pair rank (1 or 2)
  ungroup() %>%
  # mutate() evaluates its arguments in order, so the numeric codes are taken
  # from the original columns before lang1 and lang2 are exchanged.
  mutate(lang1_numeric = as.numeric(lang2),
         lang2_numeric = as.numeric(lang1),
         original_lang1 = lang1,
         lang1 = lang2,
         lang2 = original_lang1) %>%
  select(-original_lang1) %>%
  # First word sits slightly above the cell centre, second slightly below.
  mutate(word_height = case_when(
           n == 1 ~ lang2_numeric + VOFFSET,
           n == 2 ~ lang2_numeric - VOFFSET),
         type = "dissimilar")
# Stack the similar and dissimilar word tables and attach the language name
# for each side of the pair (suffix 1 for lang1, suffix 2 for lang2).
critical_words <- bind_rows(top_critical_words, bottom_critical_words) %>%
  left_join(rename(lang_names, ETS_language_name1 = ETS_language_name),
            by = c("lang1" = "ETS_code")) %>%
  left_join(rename(lang_names, ETS_language_name2 = ETS_language_name),
            by = c("lang2" = "ETS_code"))
# Tile coordinates for one triangle of the heat map.
# lang1 and lang2 are converted to factor codes independently of each other;
# y is shifted up by one relative to the lang2 code.
# NOTE(review): presumably the +1 compensates for lang2 lacking the first
# language of lang1's level set — confirm against the CSV.
top_lang_lang_corrs_clean <- lang_lang_corrs %>%
  transmute(lang1,
            lang2,
            r_coefficient,
            lang1_numeric = as.numeric(as.factor(lang1)),
            lang2_numeric = as.numeric(as.factor(lang2)),
            x = lang1_numeric,
            y = 1 + lang2_numeric)
# Mirror image of top_lang_lang_corrs_clean: x and y are transposed and the
# two numeric codes are exchanged, giving the tiles of the opposite triangle.
# The exchange is done inside mutate() with a scratch column; renaming the
# temporaries onto lang1_numeric / lang2_numeric while those columns still
# exist would produce duplicate column names.
bottom_lang_lang_corrs_clean <- top_lang_lang_corrs_clean %>%
  # mutate() evaluates in order: x and y are computed from the original codes
  # before the codes themselves are swapped.
  mutate(x = lang2_numeric + 1,
         y = lang1_numeric,
         swap_tmp = lang1_numeric,
         lang1_numeric = lang2_numeric,
         lang2_numeric = swap_tmp) %>%
  select(-swap_tmp)
# Stack both triangles into a single table of heat-map tiles.
lang_lang_corrs_clean <- bind_rows(top_lang_lang_corrs_clean,
                                   bottom_lang_lang_corrs_clean)
# Heat map of language-pair correlations with the selected words printed on
# top of the tiles. Un-comment the pdf()/dev.off() lines to write it to a file.
#pdf("word_table.pdf", width = 13, height = 8.5)
ggplot() +
  # Layer 1: one tile per language pair, coloured by its correlation.
  geom_raster(data = lang_lang_corrs_clean,
              mapping = aes(x = x, y = y, fill = r_coefficient)) +
  # Layer 2: the selected words, drawn over the tiles.
  geom_text(data = critical_words,
            mapping = aes(x = lang1_numeric, y = word_height, label = word),
            size = 1.5) +
  # NOTE(review): the label combines "Rank Correlation" with "Pearson's r" —
  # confirm which statistic the input file holds.
  scale_fill_gradient(name = "Rank Correlation \n(Pearson's r)",
                      low = "white", high = "red") +
  # Both axes are labelled with language names although the plotted positions
  # are numeric codes.
  # NOTE(review): this assumes unique() returns the names in the same order as
  # the numeric codes — confirm.
  scale_y_discrete(name = "",
                   limits = unique(critical_words$ETS_language_name1)) +
  scale_x_discrete(name = "",
                   limits = unique(critical_words$ETS_language_name1)) +
  theme_classic() +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 45, hjust = 1),
        axis.ticks = element_blank())
#dev.off()