xlangsemvar analyses

Read in word2vec data

#dold <- read_csv("../data/ALL_LANGUAGES_PAIRWISE.csv", na = "<NA>")
d1 <- read_csv("../data/ALL_LANGUAGES_PAIRWISE_WITH_IMPUTED_DOMAINS.csv", na = "<NA>")
#d2 <- read_csv("../data/ALL_SIMILARITIES_FACEBOOK.csv")
#d3 <- read_csv("../data/COMPARATOR_LOG.csv")
#d4 <- read_csv("../data/NORTHEURALEX_WITH_DOMAINS_AND_VECTORS_FACEBOOK.csv")

all_w2v_raw <- d1 %>%
  filter(l1 %in% c("nl", "en"), l2 %in% c("nl", "en")) %>%
  select(-l1, -l2, -p) %>%
  rename(wordform_dutch = wordform_l1,
         wordform_eng = wordform_l2) %>%
  mutate(wordform_dutch=replace(wordform_dutch, 
                                wordform_eng == "finish", "beeindigen")) %>% # this is an encoding issue
  mutate(wordform_dutch = tolower(wordform_dutch)) 

all_w2v  <- all_w2v_raw %>%
  group_by(wordform_eng) %>%
  summarize(rho = mean(rho)) %>% # take the mean of multiple w2vec entries (e.g. bark vs. bark)
  left_join(all_w2v_raw %>% select(wordform_eng, wordform_dutch, semantic_domain2), 
            by = "wordform_eng") %>%
  group_by(wordform_eng) %>%
  slice(1)

Read in association data

en_assoc <- read_csv("../data/correlation_full_en.csv") %>%
  select(-1)

nl_assoc <- read_csv("../data/correlation_full_nl.csv") %>%
  select(-1)

Get translated version of assocations using w2v traslations

dict <- all_w2v %>%
  select(wordform_dutch, wordform_eng) 

nl_assoc_clean <- nl_assoc %>%
    mutate(cue = tolower(cue),
           target = tolower(target)) %>%
  left_join(dict, by = c("cue" = "wordform_dutch")) %>%
  rename(cue_eng_trans = wordform_eng) %>%
  left_join(dict, by = c("target" = "wordform_dutch")) %>%
  rename(target_eng_trans = wordform_eng) %>%
  filter(!is.na(cue_eng_trans), !is.na(target_eng_trans)) %>%
  select(language, cue_eng_trans, target_eng_trans, swow) %>%
  group_by(cue_eng_trans, target_eng_trans, language) %>%
  summarize(swow = mean(swow)) %>%# some dutch words map to the same english word
  ungroup()
  
eng_assoc_clean <- en_assoc %>%
  mutate(cue = tolower(cue),
         target = tolower(target)) %>%
  filter(cue %in% unique(all_w2v$wordform_eng),
         target %in% unique(all_w2v$wordform_eng)) %>%
  rename(cue_eng_trans = cue,
         target_eng_trans = target) %>%
  select(language, cue_eng_trans, target_eng_trans, swow) %>%
  group_by(cue_eng_trans, target_eng_trans, language) %>%
  summarize(swow = mean(swow)) %>% # there are some entries twice in en_assoc with diff captitalization (e.g. down vs. Down)
  ungroup()

Get overlapping common targets up to n

get_overlapping_targets <- function(cue, eng, nl, n){
  targets_english <- eng %>%
    filter(cue_eng_trans == cue) %>%
    arrange(-swow) %>%
    select(target_eng_trans)
  
  targets_nl <- nl %>%
      filter(cue_eng_trans == cue) %>%
      arrange(-swow) %>%
      select(target_eng_trans)
  
  common_targets <- intersect(targets_english, targets_nl)$target_eng_trans
  
  if (length(common_targets) > n){
    common_targets = common_targets[1:n]
  } else if (length(common_targets) == 0){
    common_targets = NA
  }
  
  data.frame(cue = cue, 
             common_targets = common_targets)
}

common_cues <- unique(intersect(eng_assoc_clean$cue_eng_trans, 
                                nl_assoc_clean$cue_eng_trans))

all_common_targets <- map_df(common_cues, 
                      get_overlapping_targets, 
                      eng_assoc_clean, 
                      nl_assoc_clean, 
                      40)

Get rhos

get_rhos <- function(current_cue, all_common_targets, eng, nl){
  
  current_targets <- all_common_targets %>%
    filter(cue == current_cue)
  
  targets_eng_swow <- eng %>%
      filter(cue_eng_trans == current_cue) %>%
      filter(target_eng_trans %in% current_targets$common_targets) 
    
  targets_nl_swow <- nl %>%
      filter(cue_eng_trans == current_cue) %>%
      filter(target_eng_trans %in% current_targets$common_targets) 
    
  rho <- cor(targets_eng_swow$swow,
      targets_nl_swow$swow, method = "spearman")
  
  data.frame(cue = current_cue,
             rho_swow = rho)
  
}


rhos_swow <- map_df(common_cues, 
                      get_rhos, 
                      all_common_targets,
                      eng_assoc_clean, 
                      nl_assoc_clean)

rhos_w2v vs. rhos_swow

rhos_w2v <- all_w2v %>%
  select(wordform_eng, semantic_domain2, rho) %>%
  rename(rho_w2v = rho)

all_rhos <- rhos_swow %>%
  left_join(rhos_w2v, by = c("cue" = "wordform_eng")) %>%
  rename(wordform_eng = cue)

ggplot(all_rhos, aes(x = rho_swow, y = rho_w2v)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("rhos_w2v vs. rhos_swow") +
  theme_minimal()

all_rhos %>%
  do(tidy(cor.test(.$rho_swow, .$rho_w2v))) %>%
  select(-parameter, -method, -alternative) %>%
  kable()

estimate	statistic	p.value	conf.low	conf.high
0.1667811	4.693727	3.2e-06	0.0973662	0.2345811

By domain

sorted_levels <- all_rhos %>%
  group_by(semantic_domain2) %>%
  filter(semantic_domain2 != "Law" ) %>%
  do(tidy(cor.test(.$rho_swow, .$rho_w2v))) %>%
  select(-parameter, -method, -alternative) %>%
  arrange(estimate) 

all_rhos %>%
  mutate(semantic_domain2 = fct_relevel(semantic_domain2, sorted_levels$semantic_domain2)) %>%
ggplot(aes(x = rho_swow, y = rho_w2v)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~ semantic_domain2) +
  ggtitle("rhos_w2v vs. rhos_swow") +
  theme_minimal()

kable(sorted_levels)

semantic_domain2	estimate	statistic	p.value	conf.low	conf.high
Possession	-0.5977821	-4.0156180	0.0003834	-0.7856972	-0.3088652
The house	-0.4987747	-2.2287708	0.0415460	-0.7900242	-0.0238469
Social and political relations	-0.0963192	-0.4538874	0.6543558	-0.4810257	0.3194913
Food and drink	-0.0688961	-0.3906635	0.6986347	-0.3977939	0.2756928
Sense perception	0.0562131	0.3691977	0.7137923	-0.2413024	0.3440697
The body	0.0788450	0.7291852	0.4678929	-0.1340293	0.2847638
The physical world	0.0909341	0.6771911	0.5011233	-0.1737505	0.3433659
Clothing and grooming	0.1066017	0.5252322	0.6042408	-0.2928424	0.4743659
Agriculture and vegetation	0.1324796	0.6547865	0.5188320	-0.2686589	0.4944578
Basic actions and technology	0.1490925	1.0767685	0.2866509	-0.1262906	0.4031403
Spatial relations	0.1571075	1.2626826	0.2113570	-0.0902498	0.3862077
Motion	0.2071187	1.5266588	0.1329060	-0.0642024	0.4499271
Miscellaneous function words	0.2456363	0.8404325	0.4185593	-0.3531443	0.7016578
Animals	0.2657843	1.6310664	0.1118436	-0.0637224	0.5430374
Emotions and values	0.2951654	1.7746643	0.0851796	-0.0422355	0.5721351
Speech and language	0.2958891	1.4194957	0.1704290	-0.1324700	0.6311164
Cognition	0.3475028	1.9957365	0.0554311	-0.0077974	0.6248968
Quantity	0.4172250	2.2490860	0.0339545	0.0356316	0.6926377
Time	0.4222825	3.0548270	0.0038579	0.1469656	0.6368733
Kinship	0.4410533	2.3568382	0.0273196	0.0556145	0.7120862
Warfare and hunting	0.7263466	2.3630040	0.0645101	-0.0589634	0.9563172
Modern world	0.7360155	2.4311176	0.0592989	-0.0382346	0.9580580
Religion and belief	0.8866689	1.9175160	0.3060267	NA	NA

all_rhos %>%
  group_by(wordform_eng) %>% 
  summarize(n=n()) %>% 
  filter(n>1) %>% 
  as.data.frame()

## [1] wordform_eng n           
## <0 rows> (or 0-length row.names)

xlangsemvar analyses

Molly Lewis

2017-09-10

Read in word2vec data

Read in association data

Get translated version of assocations using w2v traslations

Get overlapping common targets up to n

Get rhos

rhos_w2v vs. rhos_swow

By domain