# WordSim-353 human similarity norms.
# The path is resolved with here(), like SIMLEX_DATA (which lives in the same
# other_data directory), instead of an absolute path tied to one machine.
# NOTE(review): assumes here() resolves to the IATLANG project root -- confirm.
WORDSIM_DATA <- here("exploratory_studies/13_europarl/other_data/wordsim353.csv")
wordsim <- read_csv(WORDSIM_DATA) %>%
  janitor::clean_names() %>%
  # Use the same pair-column names as the SimLex table so the two norm sets
  # can be joined on (word1, word2).
  rename(
    word1 = word_1,
    word2 = word_2,
    wordsim353 = human_mean
  )

# SimLex-999 human similarity norms, merged with WordSim-353.
# The full join keeps every word pair that appears in either norm set; the
# rating column of the set a pair is missing from is NA.
SIMLEX_DATA <- here("exploratory_studies/13_europarl/other_data/SimLex-999.txt")
simlex <- read_tsv(SIMLEX_DATA) %>%
  # Rename while selecting; sim_lex999 is the name the rest of the script uses.
  select(word1, word2, sim_lex999 = SimLex999) %>%
  # Explicit keys: the join is on the word pair only, never on whatever other
  # columns the two tables might happen to share.
  full_join(wordsim, by = c("word1", "word2"))

# Embedding models to compare ----
UNTRANS_MODEL <- "/Users/mollylewis/Downloads/europarl_models/untrans_europarl_5ep_en.txt.vec"
TRANS_MODEL <- "/Users/mollylewis/Downloads/europarl_models/trans_f_europarl_5ep_en.txt.vec"
WIKI_MODEL <- "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/exploratory_analyses/0_exploration/wiki.en.vec"
COCA_MODEL <- "/Users/mollylewis/Desktop/coca_all.vec"

# Every word that occurs in either position of a benchmark pair; only these
# rows of each model are needed.
benchmark_words <- unique(c(simlex$word1, simlex$word2))

# Read one embedding file and return the cosine for every ordered word pair.
#
# This replaces four copy-pasted versions of the same pipeline. The scratch
# objects those copies left in the global environment (current_model,
# word_word_dists, wide_word_word_dists) are now local to this function.
#
# model_path: path to a whitespace-delimited text file with one word per row
#             followed by its vector components (first line is skipped --
#             presumably the fastText-style "n_words n_dims" header; confirm).
# vocab:      character vector of words to keep.
# n_dims:     number of vector components per word (300 for all models here).
#
# Returns a data.frame with columns word1, word2 and cos_dist, containing all
# n x n ordered pairs of the kept words (self-pairs included).
# NOTE: cos_dist holds the value returned by coop::cosine, i.e. a cosine
# similarity, not a distance; the column name is kept for downstream code.
load_pairwise_cosines <- function(model_path, vocab, n_dims = 300L) {
  embeddings <- fread(
    model_path,
    header = FALSE,
    skip = 1,
    quote = "",
    encoding = "UTF-8",
    data.table = TRUE,
    # "word", then V2 ... V(n_dims + 1)
    col.names = c("word", paste0("V", seq_len(n_dims) + 1L))
  ) %>%
    filter(word %in% vocab)

  words <- embeddings$word
  n_words <- length(words)

  # coop::cosine compares columns, so transpose to compare words (rows).
  cosines <- coop::cosine(t(as.matrix(embeddings[, -1])))

  # Flatten the n x n matrix column by column: word1 cycles fastest, word2
  # changes once per column. This is the same row order the previous
  # gather()-based reshaping produced.
  data.frame(
    word1 = rep(words, times = n_words),
    word2 = rep(words, each = n_words),
    cos_dist = as.vector(cosines),
    stringsAsFactors = FALSE
  )
}

# untranslated
long_word_word_dists_untranslated <-
  load_pairwise_cosines(UNTRANS_MODEL, benchmark_words)

# translated
long_word_word_dists_translated <-
  load_pairwise_cosines(TRANS_MODEL, benchmark_words)

# wiki
long_word_word_dists_wiki <-
  load_pairwise_cosines(WIKI_MODEL, benchmark_words)

# coca
long_word_word_dists_coca <-
  load_pairwise_cosines(COCA_MODEL, benchmark_words)

# Long table of human norms plus model cosine: the full set of benchmark pairs
# is repeated once per embedding model, tagged in the `model` column. Pairs a
# model has no vectors for keep an NA cos_dist (left join from the norms).
simlex_distances <- list(
  untranslated = long_word_word_dists_untranslated,
  translated = long_word_word_dists_translated,
  wiki = long_word_word_dists_wiki,
  coca = long_word_word_dists_coca
) %>%
  imap(function(pair_dists, model_name) {
    simlex %>%
      left_join(pair_dists, by = c("word1", "word2")) %>%
      mutate(model = model_name)
  }) %>%
  # Stacked in list order: untranslated, translated, wiki, coca.
  bind_rows()

Simlex 999

# SimLex-999 rating against model cosine, one panel per embedding model, with
# a linear fit drawn over the points in each panel.
simlex_distances %>%
  ggplot(aes(x = cos_dist, y = sim_lex999)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm") +
  facet_wrap(~ model) +
  theme_classic()

# Spearman correlation between SimLex-999 ratings and model cosines, computed
# separately for each embedding model and printed as one table row per model.
simlex_distances %>%
  group_by(model) %>%
  nest() %>%
  mutate(temp = map(data, function(model_rows) {
    tidy(cor.test(model_rows$sim_lex999, model_rows$cos_dist,
                  method = "spearman"))
  })) %>%
  select(-data) %>%
  # Name the list-column explicitly; a bare unnest() is deprecated in
  # tidyr >= 1.0 and emits a warning.
  unnest(temp) %>%
  kable()
model estimate statistic p.value method alternative
untranslated 0.2529978 60225292 0 Spearman’s rank correlation rho two.sided
translated 0.3257702 77363878 0 Spearman’s rank correlation rho two.sided
wiki 0.3803332 102968179 0 Spearman’s rank correlation rho two.sided
coca 0.3476024 108406950 0 Spearman’s rank correlation rho two.sided

Word Sim 353

# WordSim-353 rating against model cosine, one panel per embedding model, with
# a linear fit drawn over the points in each panel.
simlex_distances %>%
  ggplot(aes(x = cos_dist, y = wordsim353)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm") +
  facet_wrap(~ model) +
  theme_classic()

# Spearman correlation between WordSim-353 ratings and model cosines, computed
# separately for each embedding model and printed as one table row per model.
simlex_distances %>%
  group_by(model) %>%
  nest() %>%
  mutate(temp = map(data, function(model_rows) {
    tidy(cor.test(model_rows$wordsim353, model_rows$cos_dist,
                  method = "spearman"))
  })) %>%
  select(-data) %>%
  # Name the list-column explicitly; a bare unnest() is deprecated in
  # tidyr >= 1.0 and emits a warning.
  unnest(temp) %>%
  kable()
model estimate statistic p.value method alternative
untranslated 0.4618351 1613101 0 Spearman’s rank correlation rho two.sided
translated 0.4662047 2037854 0 Spearman’s rank correlation rho two.sided
wiki 0.7322051 1677960 0 Spearman’s rank correlation rho two.sided
coca 0.6652850 2097270 0 Spearman’s rank correlation rho two.sided

Language-wise correlations

Simlex999

# Wide table: one row per benchmark word pair, with one cosine column per
# embedding model (cos_dist_trans / _untrans / _wiki / _coca).
model_dists <- long_word_word_dists_translated %>%
  # Right join keeps every benchmark pair, including those the translated
  # model has no vectors for. Keys are explicit, matching the joins below.
  right_join(simlex, by = c("word1", "word2")) %>%
  rename(cos_dist_trans = cos_dist) %>%
  left_join(
    rename(long_word_word_dists_untranslated, cos_dist_untrans = cos_dist),
    by = c("word1", "word2")
  ) %>%
  left_join(
    rename(long_word_word_dists_wiki, cos_dist_wiki = cos_dist),
    by = c("word1", "word2")
  ) %>%
  left_join(
    rename(long_word_word_dists_coca, cos_dist_coca = cos_dist),
    by = c("word1", "word2")
  ) %>%
  # Label each pair by norm set. A pair rated in both sets is labelled
  # "sim_lex999", because that condition is checked first.
  mutate(human_norm = if_else(is.na(sim_lex999), "wordsim353", "sim_lex999"))

# Pairwise Pearson correlations between the models' cosines, restricted to
# word pairs that have a SimLex-999 rating.
# Columns are referenced directly (data masking) rather than through `.$`,
# and `use` is spelled out instead of relying on partial matching. Each
# correlation drops only the pairs that are NA in one of its own two columns.
model_dists %>%
  filter(!is.na(sim_lex999)) %>%
  summarize(
    trans_untrans = cor(cos_dist_untrans, cos_dist_trans, use = "complete.obs"),
    trans_coca = cor(cos_dist_coca, cos_dist_trans, use = "complete.obs"),
    trans_wiki = cor(cos_dist_wiki, cos_dist_trans, use = "complete.obs"),
    untrans_coca = cor(cos_dist_untrans, cos_dist_coca, use = "complete.obs"),
    untrans_wiki = cor(cos_dist_untrans, cos_dist_wiki, use = "complete.obs"),
    coca_wiki = cor(cos_dist_coca, cos_dist_wiki, use = "complete.obs")
  ) %>%
  kable()
trans_untrans trans_coca trans_wiki untrans_coca untrans_wiki coca_wiki
0.8137307 0.5938353 0.6160205 0.5408185 0.5729829 0.8183628

wordsim353

# Pairwise Pearson correlations between the models' cosines, restricted to
# word pairs that have a WordSim-353 rating.
# Columns are referenced directly (data masking) rather than through `.$`,
# and `use` is spelled out instead of relying on partial matching. Each
# correlation drops only the pairs that are NA in one of its own two columns.
model_dists %>%
  filter(!is.na(wordsim353)) %>%
  summarize(
    trans_untrans = cor(cos_dist_untrans, cos_dist_trans, use = "complete.obs"),
    trans_coca = cor(cos_dist_coca, cos_dist_trans, use = "complete.obs"),
    trans_wiki = cor(cos_dist_wiki, cos_dist_trans, use = "complete.obs"),
    untrans_coca = cor(cos_dist_untrans, cos_dist_coca, use = "complete.obs"),
    untrans_wiki = cor(cos_dist_untrans, cos_dist_wiki, use = "complete.obs"),
    coca_wiki = cor(cos_dist_coca, cos_dist_wiki, use = "complete.obs")
  ) %>%
  kable()
trans_untrans trans_coca trans_wiki untrans_coca untrans_wiki coca_wiki
0.8727768 0.6285733 0.6340114 0.60604 0.6311732 0.8510588