Read in word2vec data
#dold <- read_csv("../data/ALL_LANGUAGES_PAIRWISE.csv", na = "<NA>")
d1 <- read_csv("../data/ALL_LANGUAGES_PAIRWISE_WITH_IMPUTED_DOMAINS.csv", na = "<NA>")
#d2 <- read_csv("../data/ALL_SIMILARITIES_FACEBOOK.csv")
#d3 <- read_csv("../data/COMPARATOR_LOG.csv")
#d4 <- read_csv("../data/NORTHEURALEX_WITH_DOMAINS_AND_VECTORS_FACEBOOK.csv")
all_w2v_raw <- d1 %>%
filter(l1 %in% c("nl", "en"), l2 %in% c("nl", "en")) %>%
select(-l1, -l2, -p) %>%
rename(wordform_dutch = wordform_l1,
wordform_eng = wordform_l2) %>%
mutate(wordform_dutch=replace(wordform_dutch,
wordform_eng == "finish", "beeindigen")) %>% # this is an encoding issue
mutate(wordform_dutch = tolower(wordform_dutch))
all_w2v <- all_w2v_raw %>%
group_by(wordform_eng) %>%
summarize(rho = mean(rho)) %>% # take the mean of multiple w2vec entries (e.g. bark vs. bark)
left_join(all_w2v_raw %>% select(wordform_eng, wordform_dutch, semantic_domain2),
by = "wordform_eng") %>%
group_by(wordform_eng) %>%
slice(1)
Read in association data
en_assoc <- read_csv("../data/correlation_full_en.csv") %>%
select(-1)
nl_assoc <- read_csv("../data/correlation_full_nl.csv") %>%
select(-1)
Get translated version of assocations using w2v traslations
dict <- all_w2v %>%
select(wordform_dutch, wordform_eng)
nl_assoc_clean <- nl_assoc %>%
mutate(cue = tolower(cue),
target = tolower(target)) %>%
left_join(dict, by = c("cue" = "wordform_dutch")) %>%
rename(cue_eng_trans = wordform_eng) %>%
left_join(dict, by = c("target" = "wordform_dutch")) %>%
rename(target_eng_trans = wordform_eng) %>%
filter(!is.na(cue_eng_trans), !is.na(target_eng_trans)) %>%
select(language, cue_eng_trans, target_eng_trans, swow) %>%
group_by(cue_eng_trans, target_eng_trans, language) %>%
summarize(swow = mean(swow)) %>%# some dutch words map to the same english word
ungroup()
eng_assoc_clean <- en_assoc %>%
mutate(cue = tolower(cue),
target = tolower(target)) %>%
filter(cue %in% unique(all_w2v$wordform_eng),
target %in% unique(all_w2v$wordform_eng)) %>%
rename(cue_eng_trans = cue,
target_eng_trans = target) %>%
select(language, cue_eng_trans, target_eng_trans, swow) %>%
group_by(cue_eng_trans, target_eng_trans, language) %>%
summarize(swow = mean(swow)) %>% # there are some entries twice in en_assoc with diff captitalization (e.g. down vs. Down)
ungroup()
Get overlapping common targets up to n
get_overlapping_targets <- function(cue, eng, nl, n){
targets_english <- eng %>%
filter(cue_eng_trans == cue) %>%
arrange(-swow) %>%
select(target_eng_trans)
targets_nl <- nl %>%
filter(cue_eng_trans == cue) %>%
arrange(-swow) %>%
select(target_eng_trans)
common_targets <- intersect(targets_english, targets_nl)$target_eng_trans
if (length(common_targets) > n){
common_targets = common_targets[1:n]
} else if (length(common_targets) == 0){
common_targets = NA
}
data.frame(cue = cue,
common_targets = common_targets)
}
common_cues <- unique(intersect(eng_assoc_clean$cue_eng_trans,
nl_assoc_clean$cue_eng_trans))
all_common_targets <- map_df(common_cues,
get_overlapping_targets,
eng_assoc_clean,
nl_assoc_clean,
40)
Get rhos
get_rhos <- function(current_cue, all_common_targets, eng, nl){
current_targets <- all_common_targets %>%
filter(cue == current_cue)
targets_eng_swow <- eng %>%
filter(cue_eng_trans == current_cue) %>%
filter(target_eng_trans %in% current_targets$common_targets)
targets_nl_swow <- nl %>%
filter(cue_eng_trans == current_cue) %>%
filter(target_eng_trans %in% current_targets$common_targets)
rho <- cor(targets_eng_swow$swow,
targets_nl_swow$swow, method = "spearman")
data.frame(cue = current_cue,
rho_swow = rho)
}
rhos_swow <- map_df(common_cues,
get_rhos,
all_common_targets,
eng_assoc_clean,
nl_assoc_clean)
rhos_w2v vs. rhos_swow
rhos_w2v <- all_w2v %>%
select(wordform_eng, semantic_domain2, rho) %>%
rename(rho_w2v = rho)
all_rhos <- rhos_swow %>%
left_join(rhos_w2v, by = c("cue" = "wordform_eng")) %>%
rename(wordform_eng = cue)
ggplot(all_rhos, aes(x = rho_swow, y = rho_w2v)) +
geom_point() +
geom_smooth(method = "lm") +
ggtitle("rhos_w2v vs. rhos_swow") +
theme_minimal()

all_rhos %>%
do(tidy(cor.test(.$rho_swow, .$rho_w2v))) %>%
select(-parameter, -method, -alternative) %>%
kable()
| 0.1667811 |
4.693727 |
3.2e-06 |
0.0973662 |
0.2345811 |
By domain
sorted_levels <- all_rhos %>%
group_by(semantic_domain2) %>%
filter(semantic_domain2 != "Law" ) %>%
do(tidy(cor.test(.$rho_swow, .$rho_w2v))) %>%
select(-parameter, -method, -alternative) %>%
arrange(estimate)
all_rhos %>%
mutate(semantic_domain2 = fct_relevel(semantic_domain2, sorted_levels$semantic_domain2)) %>%
ggplot(aes(x = rho_swow, y = rho_w2v)) +
geom_point() +
geom_smooth(method = "lm") +
facet_wrap(~ semantic_domain2) +
ggtitle("rhos_w2v vs. rhos_swow") +
theme_minimal()

kable(sorted_levels)
| Possession |
-0.5977821 |
-4.0156180 |
0.0003834 |
-0.7856972 |
-0.3088652 |
| The house |
-0.4987747 |
-2.2287708 |
0.0415460 |
-0.7900242 |
-0.0238469 |
| Social and political relations |
-0.0963192 |
-0.4538874 |
0.6543558 |
-0.4810257 |
0.3194913 |
| Food and drink |
-0.0688961 |
-0.3906635 |
0.6986347 |
-0.3977939 |
0.2756928 |
| Sense perception |
0.0562131 |
0.3691977 |
0.7137923 |
-0.2413024 |
0.3440697 |
| The body |
0.0788450 |
0.7291852 |
0.4678929 |
-0.1340293 |
0.2847638 |
| The physical world |
0.0909341 |
0.6771911 |
0.5011233 |
-0.1737505 |
0.3433659 |
| Clothing and grooming |
0.1066017 |
0.5252322 |
0.6042408 |
-0.2928424 |
0.4743659 |
| Agriculture and vegetation |
0.1324796 |
0.6547865 |
0.5188320 |
-0.2686589 |
0.4944578 |
| Basic actions and technology |
0.1490925 |
1.0767685 |
0.2866509 |
-0.1262906 |
0.4031403 |
| Spatial relations |
0.1571075 |
1.2626826 |
0.2113570 |
-0.0902498 |
0.3862077 |
| Motion |
0.2071187 |
1.5266588 |
0.1329060 |
-0.0642024 |
0.4499271 |
| Miscellaneous function words |
0.2456363 |
0.8404325 |
0.4185593 |
-0.3531443 |
0.7016578 |
| Animals |
0.2657843 |
1.6310664 |
0.1118436 |
-0.0637224 |
0.5430374 |
| Emotions and values |
0.2951654 |
1.7746643 |
0.0851796 |
-0.0422355 |
0.5721351 |
| Speech and language |
0.2958891 |
1.4194957 |
0.1704290 |
-0.1324700 |
0.6311164 |
| Cognition |
0.3475028 |
1.9957365 |
0.0554311 |
-0.0077974 |
0.6248968 |
| Quantity |
0.4172250 |
2.2490860 |
0.0339545 |
0.0356316 |
0.6926377 |
| Time |
0.4222825 |
3.0548270 |
0.0038579 |
0.1469656 |
0.6368733 |
| Kinship |
0.4410533 |
2.3568382 |
0.0273196 |
0.0556145 |
0.7120862 |
| Warfare and hunting |
0.7263466 |
2.3630040 |
0.0645101 |
-0.0589634 |
0.9563172 |
| Modern world |
0.7360155 |
2.4311176 |
0.0592989 |
-0.0382346 |
0.9580580 |
| Religion and belief |
0.8866689 |
1.9175160 |
0.3060267 |
NA |
NA |
all_rhos %>%
group_by(wordform_eng) %>%
summarize(n=n()) %>%
filter(n>1) %>%
as.data.frame()
## [1] wordform_eng n
## <0 rows> (or 0-length row.names)