Read in word2vec data

#dold <- read_csv("../data/ALL_LANGUAGES_PAIRWISE.csv", na = "<NA>")
d1 <- read_csv("../data/ALL_LANGUAGES_PAIRWISE_WITH_IMPUTED_DOMAINS.csv", na = "<NA>")
#d2 <- read_csv("../data/ALL_SIMILARITIES_FACEBOOK.csv")
#d3 <- read_csv("../data/COMPARATOR_LOG.csv")
#d4 <- read_csv("../data/NORTHEURALEX_WITH_DOMAINS_AND_VECTORS_FACEBOOK.csv")

all_w2v_raw <- d1 %>%
  filter(l1 %in% c("nl", "en"), l2 %in% c("nl", "en")) %>%
  select(-l1, -l2, -p) %>%
  rename(wordform_dutch = wordform_l1,
         wordform_eng = wordform_l2) %>%
  mutate(wordform_dutch=replace(wordform_dutch, 
                                wordform_eng == "finish", "beeindigen")) %>% # this is an encoding issue
  mutate(wordform_dutch = tolower(wordform_dutch)) 

all_w2v  <- all_w2v_raw %>%
  group_by(wordform_eng) %>%
  summarize(rho = mean(rho)) %>% # take the mean of multiple w2vec entries (e.g. bark vs. bark)
  left_join(all_w2v_raw %>% select(wordform_eng, wordform_dutch, semantic_domain2), 
            by = "wordform_eng") %>%
  group_by(wordform_eng) %>%
  slice(1)

Read in association data

en_assoc <- read_csv("../data/correlation_full_en.csv") %>%
  select(-1)

nl_assoc <- read_csv("../data/correlation_full_nl.csv") %>%
  select(-1)

Get translated version of assocations using w2v traslations

dict <- all_w2v %>%
  select(wordform_dutch, wordform_eng) 

nl_assoc_clean <- nl_assoc %>%
    mutate(cue = tolower(cue),
           target = tolower(target)) %>%
  left_join(dict, by = c("cue" = "wordform_dutch")) %>%
  rename(cue_eng_trans = wordform_eng) %>%
  left_join(dict, by = c("target" = "wordform_dutch")) %>%
  rename(target_eng_trans = wordform_eng) %>%
  filter(!is.na(cue_eng_trans), !is.na(target_eng_trans)) %>%
  select(language, cue_eng_trans, target_eng_trans, swow) %>%
  group_by(cue_eng_trans, target_eng_trans, language) %>%
  summarize(swow = mean(swow)) %>%# some dutch words map to the same english word
  ungroup()
  
eng_assoc_clean <- en_assoc %>%
  mutate(cue = tolower(cue),
         target = tolower(target)) %>%
  filter(cue %in% unique(all_w2v$wordform_eng),
         target %in% unique(all_w2v$wordform_eng)) %>%
  rename(cue_eng_trans = cue,
         target_eng_trans = target) %>%
  select(language, cue_eng_trans, target_eng_trans, swow) %>%
  group_by(cue_eng_trans, target_eng_trans, language) %>%
  summarize(swow = mean(swow)) %>% # there are some entries twice in en_assoc with diff captitalization (e.g. down vs. Down)
  ungroup()

Get overlapping common targets up to n

get_overlapping_targets <- function(cue, eng, nl, n){
  targets_english <- eng %>%
    filter(cue_eng_trans == cue) %>%
    arrange(-swow) %>%
    select(target_eng_trans)
  
  targets_nl <- nl %>%
      filter(cue_eng_trans == cue) %>%
      arrange(-swow) %>%
      select(target_eng_trans)
  
  common_targets <- intersect(targets_english, targets_nl)$target_eng_trans
  
  if (length(common_targets) > n){
    common_targets = common_targets[1:n]
  } else if (length(common_targets) == 0){
    common_targets = NA
  }
  
  data.frame(cue = cue, 
             common_targets = common_targets)
}

common_cues <- unique(intersect(eng_assoc_clean$cue_eng_trans, 
                                nl_assoc_clean$cue_eng_trans))

all_common_targets <- map_df(common_cues, 
                      get_overlapping_targets, 
                      eng_assoc_clean, 
                      nl_assoc_clean, 
                      40)

Get rhos

get_rhos <- function(current_cue, all_common_targets, eng, nl){
  
  current_targets <- all_common_targets %>%
    filter(cue == current_cue)
  
  targets_eng_swow <- eng %>%
      filter(cue_eng_trans == current_cue) %>%
      filter(target_eng_trans %in% current_targets$common_targets) 
    
  targets_nl_swow <- nl %>%
      filter(cue_eng_trans == current_cue) %>%
      filter(target_eng_trans %in% current_targets$common_targets) 
    
  rho <- cor(targets_eng_swow$swow,
      targets_nl_swow$swow, method = "spearman")
  
  data.frame(cue = current_cue,
             rho_swow = rho)
  
}


rhos_swow <- map_df(common_cues, 
                      get_rhos, 
                      all_common_targets,
                      eng_assoc_clean, 
                      nl_assoc_clean)

rhos_w2v vs. rhos_swow

rhos_w2v <- all_w2v %>%
  select(wordform_eng, semantic_domain2, rho) %>%
  rename(rho_w2v = rho)

all_rhos <- rhos_swow %>%
  left_join(rhos_w2v, by = c("cue" = "wordform_eng")) %>%
  rename(wordform_eng = cue)

ggplot(all_rhos, aes(x = rho_swow, y = rho_w2v)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("rhos_w2v vs. rhos_swow") +
  theme_minimal()

all_rhos %>%
  do(tidy(cor.test(.$rho_swow, .$rho_w2v))) %>%
  select(-parameter, -method, -alternative) %>%
  kable()
estimate statistic p.value conf.low conf.high
0.1667811 4.693727 3.2e-06 0.0973662 0.2345811

By domain

sorted_levels <- all_rhos %>%
  group_by(semantic_domain2) %>%
  filter(semantic_domain2 != "Law" ) %>%
  do(tidy(cor.test(.$rho_swow, .$rho_w2v))) %>%
  select(-parameter, -method, -alternative) %>%
  arrange(estimate) 

all_rhos %>%
  mutate(semantic_domain2 = fct_relevel(semantic_domain2, sorted_levels$semantic_domain2)) %>%
ggplot(aes(x = rho_swow, y = rho_w2v)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~ semantic_domain2) +
  ggtitle("rhos_w2v vs. rhos_swow") +
  theme_minimal()

kable(sorted_levels)
semantic_domain2 estimate statistic p.value conf.low conf.high
Possession -0.5977821 -4.0156180 0.0003834 -0.7856972 -0.3088652
The house -0.4987747 -2.2287708 0.0415460 -0.7900242 -0.0238469
Social and political relations -0.0963192 -0.4538874 0.6543558 -0.4810257 0.3194913
Food and drink -0.0688961 -0.3906635 0.6986347 -0.3977939 0.2756928
Sense perception 0.0562131 0.3691977 0.7137923 -0.2413024 0.3440697
The body 0.0788450 0.7291852 0.4678929 -0.1340293 0.2847638
The physical world 0.0909341 0.6771911 0.5011233 -0.1737505 0.3433659
Clothing and grooming 0.1066017 0.5252322 0.6042408 -0.2928424 0.4743659
Agriculture and vegetation 0.1324796 0.6547865 0.5188320 -0.2686589 0.4944578
Basic actions and technology 0.1490925 1.0767685 0.2866509 -0.1262906 0.4031403
Spatial relations 0.1571075 1.2626826 0.2113570 -0.0902498 0.3862077
Motion 0.2071187 1.5266588 0.1329060 -0.0642024 0.4499271
Miscellaneous function words 0.2456363 0.8404325 0.4185593 -0.3531443 0.7016578
Animals 0.2657843 1.6310664 0.1118436 -0.0637224 0.5430374
Emotions and values 0.2951654 1.7746643 0.0851796 -0.0422355 0.5721351
Speech and language 0.2958891 1.4194957 0.1704290 -0.1324700 0.6311164
Cognition 0.3475028 1.9957365 0.0554311 -0.0077974 0.6248968
Quantity 0.4172250 2.2490860 0.0339545 0.0356316 0.6926377
Time 0.4222825 3.0548270 0.0038579 0.1469656 0.6368733
Kinship 0.4410533 2.3568382 0.0273196 0.0556145 0.7120862
Warfare and hunting 0.7263466 2.3630040 0.0645101 -0.0589634 0.9563172
Modern world 0.7360155 2.4311176 0.0592989 -0.0382346 0.9580580
Religion and belief 0.8866689 1.9175160 0.3060267 NA NA
all_rhos %>%
  group_by(wordform_eng) %>% 
  summarize(n=n()) %>% 
  filter(n>1) %>% 
  as.data.frame()
## [1] wordform_eng n           
## <0 rows> (or 0-length row.names)